├── .gitignore ├── CherubNLP.Console ├── CherubNLP.Console.csproj └── Program.cs ├── CherubNLP.UnitTest ├── CherubNLP.UnitTest.csproj ├── Classification │ ├── SVMClassifierTest.cs │ └── WordCnnTest.cs ├── DefaultTaggerTest.cs ├── Featuring │ └── CountFeatureExtractorTest.cs ├── Kaggle │ └── SpookyAuthorIdentification │ │ ├── ClassificationTest.cs │ │ ├── README.md │ │ ├── sample_submission.zip │ │ ├── test.zip │ │ └── train.zip ├── NGramTaggerTest.cs ├── NaiveBayesClassifierTest.cs ├── RegexStemmerTest.cs ├── TestEssential.cs ├── Tokenize │ ├── RegexTokenizerTest.cs │ └── TreebankTokenizerTest.cs └── Vector │ ├── FastTextTest.cs │ ├── OneHotEncodingTest.cs │ └── Word2VecTest.cs ├── CherubNLP.sln ├── CherubNLP ├── CherubNLP.csproj ├── Classify │ ├── ClassifierFactory.cs │ ├── ClassifyOptions.cs │ ├── IClassifier.cs │ ├── IEstimator.cs │ ├── ITextFeatureExtractor.cs │ ├── NaiveBayesClassifier.cs │ ├── SVMClassifier.cs │ ├── SentenceFeatureExtractor.cs │ └── WordFeatureExtractor.cs ├── Corpus │ ├── ConllReader.cs │ ├── FasttextDataReader.cs │ ├── KaggleTextDataReader.cs │ ├── LabeledPerFileNameReader.cs │ └── ReaderOptions.cs ├── Featuring │ ├── CountFeatureExtractor.cs │ ├── IFeatureExtractor.cs │ ├── TfIdfFeatureExtractor.cs │ └── Word2VecFeatureExtractor.cs ├── Jieba.NET │ ├── Common │ │ ├── Counter.cs │ │ ├── Extensions.cs │ │ ├── FileExtension.cs │ │ └── Trie.cs │ ├── ConfigManager.cs │ ├── Constants.cs │ ├── DefaultDictionary.cs │ ├── FinalSeg │ │ ├── IFinalSeg.cs │ │ └── Viterbi.cs │ ├── JiebaSegmenter.cs │ ├── JiebaTagger.cs │ ├── JiebaTokenizer.cs │ ├── Node.cs │ ├── Pair.cs │ ├── PosSeg │ │ ├── Pair.cs │ │ ├── PosSegmenter.cs │ │ └── Viterbi.cs │ ├── README.rst │ ├── Spelling │ │ └── SpellChecker.cs │ ├── Token.cs │ └── WordDictionary.cs ├── Models │ ├── Entropy │ │ ├── AbstractDataIndexer.cs │ │ ├── BasicContextGenerator.cs │ │ ├── BasicEventReader.cs │ │ ├── ComparableEvent.cs │ │ ├── GisModel.cs │ │ ├── GisTrainer.cs │ │ ├── IContextGenerator.cs │ │ ├── 
IMaximumEntropyModel.cs │ │ ├── IO │ │ │ ├── BinaryGisModelReader.cs │ │ │ ├── BinaryGisModelWriter.cs │ │ │ ├── GisModelReader.cs │ │ │ ├── GisModelWriter.cs │ │ │ ├── IGisModelReader.cs │ │ │ ├── JavaBinaryGisModelReader.cs │ │ │ ├── JavaBinaryGisModelWriter.cs │ │ │ ├── PlainTextGisModelReader.cs │ │ │ └── PlainTextGisModelWriter.cs │ │ ├── ITrainingDataIndexer.cs │ │ ├── ITrainingDataReader.cs │ │ ├── ITrainingEventReader.cs │ │ ├── OnePassDataIndexer.cs │ │ ├── PatternedPredicate.cs │ │ ├── PlainTextByLineDataReader.cs │ │ ├── TrainingEvent.cs │ │ └── TwoPassDataIndexer.cs │ └── WordNet │ │ ├── DataFileEngine.cs │ │ ├── IndexWord.cs │ │ ├── Morph │ │ ├── AbstractDelegatingOperation.cs │ │ ├── DetachSuffixesOperation.cs │ │ ├── IOperation.cs │ │ ├── LookupExceptionsOperation.cs │ │ ├── LookupIndexWordOperation.cs │ │ ├── TokenizerOperation.cs │ │ └── Util.cs │ │ ├── Relation.cs │ │ ├── RelationType.cs │ │ ├── Synset.cs │ │ ├── Tokenizer.cs │ │ └── WordNetEngine.cs ├── NER │ └── README.md ├── Sentence.cs ├── Similarity │ └── Similarity.cs ├── Stem │ ├── IStemmer.cs │ ├── RegexStemmer.cs │ ├── StemOptions.cs │ └── StemmerFactory.cs ├── SupportedLanguage.cs ├── Tag │ ├── DefaultTagger.cs │ ├── ITagger.cs │ ├── NGramTagger.cs │ ├── TagOptions.cs │ └── TaggerFactory.cs ├── Tokenize │ ├── ITokenizer.cs │ ├── README.rst │ ├── RegexTokenizer.cs │ ├── Token.cs │ ├── TokenizationOptions.cs │ ├── TokenizerBase.cs │ ├── TokenizerFactory.cs │ └── TreebankTokenizer.cs └── Txt2Vec │ ├── Decoder.cs │ ├── Encoder.cs │ ├── Model.cs │ ├── OneHotEncoder.cs │ ├── Shrink.cs │ └── VectorGenerator.cs ├── LICENSE ├── README.md ├── Settings └── app.json ├── data └── dbpedia.ftz └── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat /CherubNLP.Console/CherubNLP.Console.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Exe 5 | net6.0 6 | AnyCPU;x64 7 | 8 | 9 | 10 | DEBUG;TRACE 11 | 12 | 13 | 14 | DEBUG;TRACE 15 | 16 | 17 | 18 | 
19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /CherubNLP.Console/Program.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.UnitTest.Kaggle; 2 | using FastText.NetWrapper; 3 | using System; 4 | using System.IO; 5 | 6 | namespace CherubNLP.Console 7 | { 8 | class Program 9 | { 10 | static void Main(string[] args) 11 | { 12 | var model = Path.Combine(@"D:\SciSharp\CherubNLP\data", "dbpedia.bin"); 13 | using (var fastText = new FastTextWrapper()) 14 | { 15 | fastText.LoadModel(model); 16 | var vector1 = fastText.GetSentenceVector("Hello"); 17 | } 18 | 19 | var similarities = Similarity.Cosine("Power Outage -Fifth & Park - JPMC150713", new[] 20 | { 21 | "Cosine Similarity algorithm function sample.", 22 | "Power Restored -Fifth & Park - JPMC150713", 23 | "Compute the similarity of two hardcoded lists.", 24 | "We can compute the similarity of two hardcoded lists.", 25 | "Coronavirus app could trace your contacts without sacrificing your privacy" 26 | }, model); 27 | 28 | // var test = new KaggleTest(); 29 | // test.SpookyAuthorIdentification(); 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/CherubNLP.UnitTest.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | net6.0 5 | 6 | false 7 | 8 | AnyCPU;x64 9 | 10 | 11 | 12 | DEBUG;TRACE 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Classification/SVMClassifierTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Classify; 2 | using CherubNLP.Corpus; 3 | using CherubNLP.Tokenize; 4 | using Microsoft.Extensions.Configuration; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | using System; 
7 | using System.Collections.Generic; 8 | using System.IO; 9 | using System.Text; 10 | using Txt2Vec; 11 | 12 | namespace CherubNLP.UnitTest 13 | { 14 | [TestClass] 15 | public class SVMClassifierTest : TestEssential 16 | { 17 | [TestMethod] 18 | public void TFIDFTest() 19 | { 20 | string[] documents = 21 | { 22 | "Hello, how are you!", 23 | "Hi Bolo!", 24 | "Hey Haiping!", 25 | "Hello Haiping", 26 | "hi, how do you do?", 27 | "goodbye Haiping", 28 | "see you Bolo", 29 | "byebye Haiping" 30 | }; 31 | /*TFIDFGenerator tfidfGenerator = new TFIDFGenerator(); 32 | List> weights = tfidfGenerator.TFIDFWeightVectorsForSentences(documents);*/ 33 | } 34 | 35 | [TestMethod] 36 | public void Doc2VectorTest() 37 | { 38 | List sentences = new List(); 39 | sentences.Add("The sun in the sky is bright."); 40 | sentences.Add("We can see the shining sun, the bright sun."); 41 | Args args = new Args(); 42 | args.ModelFile = "CherubNLP\\App_Data\\wordvec_enu.bin"; 43 | VectorGenerator vg = new VectorGenerator(args); 44 | var list = vg.Sentence2Vec(sentences); 45 | } 46 | 47 | [TestMethod] 48 | public void similarityTest() 49 | { 50 | List sentences = new List(); 51 | sentences.Add("How's it going"); 52 | sentences.Add("How's your day"); 53 | sentences.Add("How's everything"); 54 | sentences.Add("Good morning"); 55 | sentences.Add("Good afternoon"); 56 | sentences.Add("Good evening"); 57 | sentences.Add("I appreciate it"); 58 | sentences.Add("Thanks a lot"); 59 | sentences.Add("Thank you"); 60 | 61 | 62 | Args args = new Args(); 63 | args.ModelFile = "CherubNLP\\CherubNLP.UnitTest\\wordvec_enu.bin"; 64 | VectorGenerator vg = new VectorGenerator(args); 65 | var list = vg.Sentence2Vec(sentences); 66 | Vec vec1 = vg.SingleSentence2Vec("Good morning"); 67 | Vec vec2 = vg.SingleSentence2Vec("How's it going"); 68 | double score = vg.Similarity(vec1, vec2); 69 | Console.WriteLine("Similarity score: {0}", score); 70 | 71 | vec1 = vg.SingleSentence2Vec("Good morning"); 72 | vec2 = 
vg.SingleSentence2Vec("How's your day"); 73 | double score1 = vg.Similarity(vec1, vec2); 74 | Console.WriteLine("Similarity score: {0}", score1); 75 | 76 | vec1 = vg.SingleSentence2Vec("Good morning"); 77 | vec2 = vg.SingleSentence2Vec("How's everything"); 78 | double score2 = vg.Similarity(vec1, vec2); 79 | Console.WriteLine("Similarity score: {0}", score2); 80 | 81 | 82 | vec1 = vg.SingleSentence2Vec("Good morning"); 83 | vec2 = vg.SingleSentence2Vec("Good afternoon"); 84 | double score3 = vg.Similarity(vec1, vec2); 85 | Console.WriteLine("Similarity score: {0}", score3); 86 | 87 | vec1 = vg.SingleSentence2Vec("Good morning"); 88 | vec2 = vg.SingleSentence2Vec("I appreciate"); 89 | double score4 = vg.Similarity(vec1, vec2); 90 | Console.WriteLine("Similarity score: {0}", score4); 91 | 92 | vec1 = vg.SingleSentence2Vec("Good morning"); 93 | vec2 = vg.SingleSentence2Vec("Thanks a lot"); 94 | double score5 = vg.Similarity(vec1, vec2); 95 | Console.WriteLine("Similarity score: {0}", score5); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Classification/WordCnnTest.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.VisualStudio.TestTools.UnitTesting; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace CherubNLP.UnitTest 7 | { 8 | [TestClass] 9 | public class WordCnnTest : TestEssential 10 | { 11 | [TestMethod] 12 | public void TFIDFTest() 13 | { 14 | 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/DefaultTaggerTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Corpus; 2 | using CherubNLP.Tag; 3 | using CherubNLP.Tokenize; 4 | using Microsoft.VisualStudio.TestTools.UnitTesting; 5 | using System; 6 | using System.Collections.Generic; 7 | using System.Text; 8 
| 9 | namespace CherubNLP.UnitTest 10 | { 11 | [TestClass] 12 | public class DefaultTaggerTest 13 | { 14 | [TestMethod] 15 | public void TagInCoNLL2000() 16 | { 17 | var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); 18 | tokenizer.GetTokenizer(); 19 | 20 | var tokens = tokenizer.Tokenize("How are you doing?"); 21 | 22 | var tagger = new TaggerFactory(new TagOptions 23 | { 24 | Tag = "NN" 25 | }, SupportedLanguage.English); 26 | 27 | tagger.GetTagger(); 28 | 29 | tagger.Tag(new Sentence { Words = tokens }); 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Featuring/CountFeatureExtractorTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Featuring; 2 | using CherubNLP.Tokenize; 3 | using Microsoft.VisualStudio.TestTools.UnitTesting; 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Text; 7 | 8 | namespace CherubNLP.UnitTest.Featuring 9 | { 10 | [TestClass] 11 | public class CountFeatureExtractorTest : TestEssential 12 | { 13 | [TestMethod] 14 | public void TestVectorizer() 15 | { 16 | var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); 17 | tokenizer.GetTokenizer(); 18 | 19 | var extractor = new CountFeatureExtractor(); 20 | extractor.Sentences = tokenizer.Tokenize(Corpus()); 21 | extractor.Vectorize(new List()); 22 | 23 | var vectors = Vectors(); 24 | 25 | for (int i = 0; i < extractor.Sentences.Count; i++) 26 | { 27 | var sentence = extractor.Sentences[i]; 28 | 29 | for(int j = 0; j < extractor.Features.Count; j++) 30 | { 31 | var word = sentence.Words.Find(w => w.Lemma == extractor.Features[j]); 32 | 33 | if(word != null) 34 | { 35 | Assert.IsTrue(word.Vector == vectors[i][j]); 36 | } 37 | } 38 | } 39 | } 40 | 41 | public List Corpus() 42 | { 43 | return new List 44 | { 45 | "This is the first document.", 46 | "This document is 
the second document.", 47 | "And this is the third one.", 48 | "Is this the first document?" 49 | }; 50 | } 51 | 52 | public int[][] Vectors() 53 | { 54 | return new int[4][] 55 | { 56 | new int []{ 0, 1, 1, 1, 0, 0, 1, 0, 1 }, 57 | new int []{ 0, 2, 0, 1, 0, 1, 1, 0, 1 }, 58 | new int []{ 1, 0, 0, 1, 1, 0, 1, 1, 1 }, 59 | new int []{ 0, 1, 1, 1, 0, 0, 1, 0, 1 } 60 | }; 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/ClassificationTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Classify; 2 | using CherubNLP.Corpus; 3 | using CherubNLP.Tokenize; 4 | using Microsoft.Extensions.Configuration; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | using System; 7 | using System.Collections.Generic; 8 | using System.IO; 9 | using System.Linq; 10 | using System.Text; 11 | using Bigtree.Algorithm.Extensions; 12 | 13 | namespace CherubNLP.UnitTest.Kaggle 14 | { 15 | [TestClass] 16 | public partial class KaggleTest : TestEssential 17 | { 18 | 19 | [TestMethod] 20 | public void SpookyAuthorIdentification() 21 | { 22 | var reader = new KaggleTextDataReader(); 23 | var sentences = reader.Read(new ReaderOptions { FileName = "train.csv" }); 24 | 25 | var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); 26 | tokenizer.GetTokenizer(); 27 | 28 | var newSentences = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList()); 29 | for (int i = 0; i < newSentences.Count; i++) 30 | { 31 | newSentences[i].Id = sentences[i].Id; 32 | newSentences[i].Label = sentences[i].Label; 33 | } 34 | sentences = newSentences.ToList(); 35 | 36 | sentences.Shuffle(); 37 | var dataset = sentences.Take(2000).ToList().Split(0.7M); 38 | 39 | var options = new ClassifyOptions 40 | { 41 | ModelDir = AppContext.BaseDirectory, 42 | ModelFilePath = Path.Combine(AppContext.BaseDirectory, 
"nb.model"), 43 | Dimension = 300 44 | }; 45 | var classifier = new ClassifierFactory(options, SupportedLanguage.English); 46 | classifier.GetClassifer("NaiveBayesClassifier"); 47 | classifier.Train(dataset.Item1); 48 | 49 | int correct = 0; 50 | int total = 0; 51 | dataset.Item2.ForEach(td => 52 | { 53 | var classes = classifier.Classify(td); 54 | if (td.Label == classes[0].Item1) 55 | { 56 | correct++; 57 | } 58 | total++; 59 | }); 60 | 61 | var accuracy = (float)correct / total; 62 | 63 | Assert.IsTrue(accuracy > 0.5); 64 | } 65 | 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/README.md: -------------------------------------------------------------------------------- 1 | # Spooky Author Identification 2 | Share code and discuss insights to identify horror authors from their writings 3 | 4 | ### Data Description 5 | The competition dataset contains text from works of fiction written by spooky authors of the public domain: Edgar Allan Poe, HP Lovecraft and Mary Shelley. The data was prepared by chunking larger texts into sentences using CoreNLP's MaxEnt sentence tokenizer, so you may notice the odd non-sentence here and there. Your objective is to accurately identify the author of the sentences in the test set. 6 | 7 | ### Evaluation 8 | Submissions are evaluated using multi-class logarithmic loss. Each id has one true class. For each id, you must submit a predicted probability for each author. The formula is then: 9 | $$\text{logloss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{M} y_{ij} \log(p_{ij}),$$ 10 | where $N$ is the number of observations in the test set, $M$ is the number of class labels (3 classes), $\log$ is the natural logarithm, $y_{ij}$ is 1 if observation $i$ belongs to class $j$ and 0 otherwise, and $p_{ij}$ is the predicted probability that observation $i$ belongs to class $j$. 
11 | 12 | The submitted probabilities for a given sentence are not required to sum to one because they are rescaled prior to being scored (each row is divided by the row sum). In order to avoid the extremes of the log function, predicted probabilities are replaced with $\max(\min(p, 1 - 10^{-15}), 10^{-15})$. 13 | 14 | [Kaggle Link](https://www.kaggle.com/c/spooky-author-identification) -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/sample_submission.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/sample_submission.zip -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/test.zip -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/train.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/train.zip -------------------------------------------------------------------------------- /CherubNLP.UnitTest/NGramTaggerTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Corpus; 2 | using CherubNLP.Tag; 3 | using CherubNLP.Tokenize; 4 | using Microsoft.Extensions.Configuration; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | using System; 7 | using System.Collections.Generic; 8 | 
using System.Diagnostics; 9 | using System.IO; 10 | using System.Text; 11 | 12 | namespace CherubNLP.UnitTest 13 | { 14 | [TestClass] 15 | public class NGramTaggerTest : TestEssential 16 | { 17 | [TestMethod] 18 | public void UniGramInCoNLL2000() 19 | { 20 | // tokenization 21 | var tokenizer = new TokenizerFactory(new TokenizationOptions 22 | { 23 | Pattern = RegexTokenizer.WORD_PUNC 24 | }, SupportedLanguage.English); 25 | tokenizer.GetTokenizer(); 26 | 27 | var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment"); 28 | 29 | // test tag 30 | var tagger = new TaggerFactory(new TagOptions 31 | { 32 | CorpusDir = Configuration.GetValue("CherubNLP:dataDir"), 33 | NGram = 1, 34 | Tag = "NN" 35 | }, SupportedLanguage.English); 36 | 37 | tagger.GetTagger(); 38 | 39 | var watch = Stopwatch.StartNew(); 40 | tagger.Tag(new Sentence { Words = tokens }); 41 | watch.Stop(); 42 | var elapsedMs1 = watch.ElapsedMilliseconds; 43 | 44 | Assert.IsTrue(tokens[0].Pos == "NNP"); 45 | Assert.IsTrue(tokens[1].Pos == "IN"); 46 | Assert.IsTrue(tokens[2].Pos == "DT"); 47 | Assert.IsTrue(tokens[3].Pos == "NNP"); 48 | 49 | // test if model is loaded repeatly. 
50 | watch = Stopwatch.StartNew(); 51 | tagger.Tag(new Sentence { Words = tokens }); 52 | watch.Stop(); 53 | var elapsedMs2 = watch.ElapsedMilliseconds; 54 | 55 | Assert.IsTrue(elapsedMs1 > elapsedMs2 * 100); 56 | } 57 | 58 | [TestMethod] 59 | public void BiGramInCoNLL2000() 60 | { 61 | // tokenization 62 | var tokenizer = new TokenizerFactory(new TokenizationOptions 63 | { 64 | Pattern = RegexTokenizer.WORD_PUNC 65 | }, SupportedLanguage.English); 66 | tokenizer.GetTokenizer(); 67 | 68 | var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment"); 69 | 70 | // test tag 71 | var tagger = new TaggerFactory(new TagOptions 72 | { 73 | CorpusDir = Configuration.GetValue("CherubNLP:dataDir"), 74 | NGram = 2, 75 | Tag = "NN" 76 | }, SupportedLanguage.English); 77 | 78 | tagger.GetTagger(); 79 | 80 | tagger.Tag(new Sentence { Words = tokens }); 81 | 82 | Assert.IsTrue(tokens[0].Pos == "NNP"); 83 | Assert.IsTrue(tokens[1].Pos == "IN"); 84 | Assert.IsTrue(tokens[2].Pos == "DT"); 85 | Assert.IsTrue(tokens[3].Pos == "NNP"); 86 | } 87 | 88 | [TestMethod] 89 | public void TriGramInCoNLL2000() 90 | { 91 | // tokenization 92 | var tokenizer = new TokenizerFactory(new TokenizationOptions 93 | { 94 | Pattern = RegexTokenizer.WORD_PUNC 95 | }, SupportedLanguage.English); 96 | tokenizer.GetTokenizer(); 97 | 98 | var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment"); 99 | 100 | // test tag 101 | var tagger = new TaggerFactory(new TagOptions 102 | { 103 | CorpusDir = Configuration.GetValue("CherubNLP:dataDir"), 104 | NGram = 3, 105 | Tag = "NN" 106 | }, SupportedLanguage.English); 107 | 108 | tagger.GetTagger(); 109 | 110 | tagger.Tag(new Sentence { Words = tokens }); 111 | 112 | Assert.IsTrue(tokens[0].Pos == "NNP"); 113 | Assert.IsTrue(tokens[1].Pos == "IN"); 114 | Assert.IsTrue(tokens[2].Pos == "DT"); 115 | Assert.IsTrue(tokens[3].Pos == "NNP"); 116 | } 117 | } 118 | } 119 | 
-------------------------------------------------------------------------------- /CherubNLP.UnitTest/RegexStemmerTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Stem; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Text; 6 | 7 | namespace CherubNLP.UnitTest 8 | { 9 | [TestClass] 10 | public class RegexStemmerTest 11 | { 12 | [TestMethod] 13 | public void StemInDefault() 14 | { 15 | var stemmer = new StemmerFactory(new StemOptions 16 | { 17 | Pattern = RegexStemmer.PATTERN 18 | }, SupportedLanguage.English); 19 | 20 | var stem = stemmer.Stem("doing"); 21 | Assert.IsTrue(stem == "do"); 22 | 23 | stem = stemmer.Stem("ponies"); 24 | Assert.IsTrue(stem == "poni"); 25 | 26 | stem = stemmer.Stem("caresses"); 27 | Assert.IsTrue(stem == "caress"); 28 | 29 | stem = stemmer.Stem("cats"); 30 | Assert.IsTrue(stem == "cat"); 31 | 32 | stem = stemmer.Stem("am"); 33 | Assert.IsTrue(stem == "be"); 34 | 35 | stem = stemmer.Stem("are"); 36 | Assert.IsTrue(stem == "be"); 37 | 38 | stem = stemmer.Stem("is"); 39 | Assert.IsTrue(stem == "be"); 40 | 41 | stem = stemmer.Stem("were"); 42 | Assert.IsTrue(stem == "be"); 43 | 44 | stem = stemmer.Stem("running"); 45 | Assert.IsTrue(stem == "run"); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/TestEssential.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Extensions.Configuration; 2 | using System; 3 | using System.IO; 4 | using System.Linq; 5 | 6 | namespace CherubNLP.UnitTest 7 | { 8 | public abstract class TestEssential 9 | { 10 | protected IConfiguration Configuration { get; } 11 | protected string rootDir; 12 | protected string dataDir; 13 | protected string settingsDir; 14 | 15 | public TestEssential() 16 | { 17 | rootDir = 
Path.GetFullPath($"{Directory.GetCurrentDirectory()}{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}"); 18 | settingsDir = Path.Combine(rootDir, "Settings"); 19 | dataDir = Path.Combine(rootDir, "data"); 20 | 21 | // x64 22 | if (!Directory.Exists(settingsDir)) 23 | { 24 | rootDir = Path.GetFullPath($"{rootDir}{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}"); 25 | settingsDir = Path.Combine(rootDir, "Settings"); 26 | dataDir = Path.Combine(rootDir, "data"); 27 | } 28 | 29 | ConfigurationBuilder configurationBuilder = new ConfigurationBuilder(); 30 | var settings = Directory.GetFiles(settingsDir, "*.json"); 31 | settings.ToList().ForEach(setting => 32 | { 33 | configurationBuilder.AddJsonFile(setting, optional: false, reloadOnChange: true); 34 | }); 35 | Configuration = configurationBuilder.Build(); 36 | } 37 | } 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Tokenize/RegexTokenizerTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Tokenize; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using System.Collections.Generic; 4 | 5 | namespace CherubNLP.UnitTest.Tokenize 6 | { 7 | [TestClass] 8 | public class RegexTokenizerTest 9 | { 10 | [TestMethod] 11 | public void TokenizeInWhiteSpace() 12 | { 13 | var tokenizer = new TokenizerFactory(new TokenizationOptions 14 | { 15 | Pattern = RegexTokenizer.WHITE_SPACE 16 | }, SupportedLanguage.English); 17 | tokenizer.GetTokenizer(); 18 | 19 | var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?"); 20 | 21 | Assert.IsTrue(tokens[0].Start == 0); 22 | Assert.IsTrue(tokens[0].Text == "Chop"); 23 | 24 | Assert.IsTrue(tokens[1].Start == 5); 25 | Assert.IsTrue(tokens[1].Text == "into"); 26 | 27 | Assert.IsTrue(tokens[2].Start == 10); 28 | 
Assert.IsTrue(tokens[2].Text == "pieces,"); 29 | 30 | Assert.IsTrue(tokens[3].Start == 18); 31 | Assert.IsTrue(tokens[3].Text == "isn't"); 32 | 33 | Assert.IsTrue(tokens[4].Start == 24); 34 | Assert.IsTrue(tokens[4].Text == "it?"); 35 | } 36 | 37 | [TestMethod] 38 | public void TokenizeInWordPunctuation() 39 | { 40 | var tokenizer = new TokenizerFactory(new TokenizationOptions 41 | { 42 | Pattern = RegexTokenizer.WORD_PUNC, 43 | SpecialWords = new List { "n't" } 44 | }, SupportedLanguage.English); 45 | tokenizer.GetTokenizer(); 46 | 47 | var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?"); 48 | 49 | Assert.IsTrue(tokens[0].Start == 0); 50 | Assert.IsTrue(tokens[0].Text == "Chop"); 51 | 52 | Assert.IsTrue(tokens[1].Start == 5); 53 | Assert.IsTrue(tokens[1].Text == "into"); 54 | 55 | Assert.IsTrue(tokens[2].Start == 10); 56 | Assert.IsTrue(tokens[2].Text == "pieces"); 57 | 58 | Assert.IsTrue(tokens[3].Start == 16); 59 | Assert.IsTrue(tokens[3].Text == ","); 60 | 61 | Assert.IsTrue(tokens[4].Start == 18); 62 | Assert.IsTrue(tokens[4].Text == "is"); 63 | 64 | Assert.IsTrue(tokens[5].Start == 20); 65 | Assert.IsTrue(tokens[5].Text == "n't"); 66 | 67 | Assert.IsTrue(tokens[6].Start == 24); 68 | Assert.IsTrue(tokens[6].Text == "it"); 69 | 70 | Assert.IsTrue(tokens[7].Start == 26); 71 | Assert.IsTrue(tokens[7].Text == "?"); 72 | } 73 | 74 | [TestMethod] 75 | public void TokenizeInBlankLine() 76 | { 77 | var tokenizer = new TokenizerFactory(new TokenizationOptions 78 | { 79 | Pattern = RegexTokenizer.BLANK_LINE 80 | }, SupportedLanguage.English); 81 | tokenizer.GetTokenizer(); 82 | 83 | var tokens = tokenizer.Tokenize(@"Chop into pieces, 84 | 85 | isn't 86 | 87 | it?"); 88 | 89 | Assert.IsTrue(tokens[0].Start == 0); 90 | Assert.IsTrue(tokens[0].Text == "Chop into pieces,"); 91 | 92 | Assert.IsTrue(tokens[1].Start == 18); 93 | Assert.IsTrue(tokens[1].Text == "isn't"); 94 | 95 | Assert.IsTrue(tokens[2].Start == 28); 96 | Assert.IsTrue(tokens[2].Text == "it?"); 97 | 
} 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Tokenize/TreebankTokenizerTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Tokenize; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Text; 6 | 7 | namespace CherubNLP.UnitTest.Tokenize 8 | { 9 | [TestClass] 10 | public class TreebankTokenizerTest 11 | { 12 | [TestMethod] 13 | public void ReplaceStartingQuoting() 14 | { 15 | var tokenizer = new TokenizerFactory(new TokenizationOptions 16 | { 17 | }, SupportedLanguage.English); 18 | tokenizer.GetTokenizer(); 19 | 20 | var tokens = tokenizer.Tokenize("«Hello!"); 21 | 22 | Assert.IsTrue(tokens[0].Text == "«"); 23 | Assert.IsTrue(tokens[0].Start == 0); 24 | 25 | Assert.IsTrue(tokens[1].Text == "Hello"); 26 | Assert.IsTrue(tokens[1].Start == 1); 27 | 28 | Assert.IsTrue(tokens[2].Text == "!"); 29 | Assert.IsTrue(tokens[2].Start == 6); 30 | } 31 | 32 | [TestMethod] 33 | public void ReplaceEndingQuoting() 34 | { 35 | var tokenizer = new TokenizerFactory(new TokenizationOptions 36 | { 37 | }, SupportedLanguage.English); 38 | tokenizer.GetTokenizer(); 39 | 40 | var tokens = tokenizer.Tokenize("Aren't you"); 41 | 42 | Assert.IsTrue(tokens[0].Text == "Are"); 43 | Assert.IsTrue(tokens[0].Start == 0); 44 | 45 | Assert.IsTrue(tokens[1].Text == "n't"); 46 | Assert.IsTrue(tokens[1].Start == 3); 47 | 48 | Assert.IsTrue(tokens[2].Text == "you"); 49 | Assert.IsTrue(tokens[2].Start == 7); 50 | } 51 | 52 | [TestMethod] 53 | public void ReplacePunctuation() 54 | { 55 | var tokenizer = new TokenizerFactory(new TokenizationOptions 56 | { 57 | }, SupportedLanguage.English); 58 | tokenizer.GetTokenizer(); 59 | 60 | var tokens = tokenizer.Tokenize("Hello World..."); 61 | 62 | Assert.IsTrue(tokens[0].Text == "Hello"); 63 | Assert.IsTrue(tokens[0].Start == 0); 64 | 65 | 
Assert.IsTrue(tokens[1].Text == "World"); 66 | Assert.IsTrue(tokens[1].Start == 6); 67 | 68 | Assert.IsTrue(tokens[2].Text == "..."); 69 | Assert.IsTrue(tokens[2].Start == 11); 70 | } 71 | 72 | [TestMethod] 73 | public void ReplaceBrackets() 74 | { 75 | var tokenizer = new TokenizerFactory(new TokenizationOptions 76 | { 77 | }, SupportedLanguage.English); 78 | tokenizer.GetTokenizer(); 79 | 80 | var tokens = tokenizer.Tokenize(""); 81 | 82 | Assert.IsTrue(tokens[0].Text == "<"); 83 | Assert.IsTrue(tokens[0].Start == 0); 84 | 85 | Assert.IsTrue(tokens[1].Text == "Hello"); 86 | Assert.IsTrue(tokens[1].Start == 1); 87 | 88 | Assert.IsTrue(tokens[2].Text == "."); 89 | Assert.IsTrue(tokens[2].Start == 6); 90 | 91 | Assert.IsTrue(tokens[3].Text == ">"); 92 | Assert.IsTrue(tokens[3].Start == 7); 93 | } 94 | 95 | [TestMethod] 96 | public void ReplaceConventions() 97 | { 98 | var tokenizer = new TokenizerFactory(new TokenizationOptions 99 | { 100 | }, SupportedLanguage.English); 101 | tokenizer.GetTokenizer(); 102 | 103 | var tokens = tokenizer.Tokenize("I cannot jump."); 104 | 105 | Assert.IsTrue(tokens[0].Text == "I"); 106 | Assert.IsTrue(tokens[0].Start == 0); 107 | 108 | Assert.IsTrue(tokens[1].Text == "can"); 109 | Assert.IsTrue(tokens[1].Start == 2); 110 | 111 | Assert.IsTrue(tokens[2].Text == "not"); 112 | Assert.IsTrue(tokens[2].Start == 5); 113 | 114 | Assert.IsTrue(tokens[3].Text == "jump"); 115 | Assert.IsTrue(tokens[3].Start == 9); 116 | 117 | Assert.IsTrue(tokens[4].Text == "."); 118 | Assert.IsTrue(tokens[4].Start == 13); 119 | } 120 | 121 | [TestMethod] 122 | public void ReplaceConventionsIncludeMultipleSymbol() 123 | { 124 | var tokenizer = new TokenizerFactory(new TokenizationOptions 125 | { 126 | }, SupportedLanguage.English); 127 | tokenizer.GetTokenizer(); 128 | 129 | var tokens = tokenizer.Tokenize("I jump. 
And you?"); 130 | 131 | Assert.IsTrue(tokens[0].Text == "I"); 132 | Assert.IsTrue(tokens[0].Start == 0); 133 | 134 | Assert.IsTrue(tokens[1].Text == "jump"); 135 | Assert.IsTrue(tokens[1].Start == 2); 136 | 137 | Assert.IsTrue(tokens[2].Text == "."); 138 | Assert.IsTrue(tokens[2].Start == 6); 139 | 140 | Assert.IsTrue(tokens[3].Text == "And"); 141 | Assert.IsTrue(tokens[3].Start == 8); 142 | 143 | Assert.IsTrue(tokens[4].Text == "you"); 144 | Assert.IsTrue(tokens[4].Start == 12); 145 | 146 | Assert.IsTrue(tokens[5].Text == "?"); 147 | Assert.IsTrue(tokens[5].Start == 15); 148 | } 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Vector/FastTextTest.cs: -------------------------------------------------------------------------------- 1 | using FastText.NetWrapper; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.IO; 6 | using System.Linq; 7 | using System.Text; 8 | using Tensorflow.NumPy; 9 | 10 | namespace CherubNLP.UnitTest.Vector 11 | { 12 | [TestClass] 13 | public class FastTextTest : TestEssential 14 | { 15 | [TestMethod] 16 | public void Word2Vec() 17 | { 18 | using (var fastText = new FastTextWrapper()) 19 | { 20 | fastText.LoadModel(Path.Combine(dataDir, "dbpedia.ftz")); 21 | var vector = fastText.GetSentenceVector("Can I use a larger crockpot than the recipe calls for?"); 22 | } 23 | } 24 | 25 | [TestMethod] 26 | public void CosineSimilarity() 27 | { 28 | var similarities = Similarity.Cosine("We can use Cosine to compute the similarity of two hardcoded lists.", new[] 29 | { 30 | "Cosine Similarity algorithm function sample.", 31 | "The Cosine Similarity function computes the similarity of two lists of numbers.", 32 | "Compute the similarity of two hardcoded lists.", 33 | "We can compute the similarity of two hardcoded lists.", 34 | "Coronavirus app could trace your contacts without sacrificing your privacy", 
35 | "We can use Cosine to compute the similarity of two lists." 36 | }, Path.Combine(dataDir, "dbpedia.ftz")); 37 | 38 | Assert.AreEqual(new[] { 5, 2, 3, 1, 4, 0 }, np.argsort(similarities)); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Vector/OneHotEncodingTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Corpus; 2 | using CherubNLP.Tokenize; 3 | using CherubNLP.Txt2Vec; 4 | using Microsoft.Extensions.Configuration; 5 | using Microsoft.VisualStudio.TestTools.UnitTesting; 6 | using System; 7 | using System.Collections.Generic; 8 | using System.IO; 9 | using System.Linq; 10 | using System.Text; 11 | 12 | namespace CherubNLP.UnitTest.Vector 13 | { 14 | [TestClass] 15 | public class OneHotEncodingTest : TestEssential 16 | { 17 | [TestMethod] 18 | public void OneHotTest() 19 | { 20 | var reader = new FasttextDataReader(); 21 | var sentences = reader.Read(new ReaderOptions 22 | { 23 | DataDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"), 24 | FileName = "cooking.stackexchange.txt" 25 | }); 26 | 27 | var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); 28 | tokenizer.GetTokenizer(); 29 | 30 | var newSentences = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList()); 31 | for (int i = 0; i < newSentences.Count; i++) 32 | { 33 | newSentences[i].Label = sentences[i].Label; 34 | } 35 | sentences = newSentences.ToList(); 36 | 37 | var encoder = new OneHotEncoder(); 38 | encoder.Sentences = sentences; 39 | encoder.EncodeAll(); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /CherubNLP.UnitTest/Vector/Word2VecTest.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Txt2Vec; 2 | using 
Microsoft.Extensions.Configuration; 3 | using Microsoft.VisualStudio.TestTools.UnitTesting; 4 | using System; 5 | using System.Collections.Generic; 6 | using System.IO; 7 | using System.Linq; 8 | using System.Text; 9 | using Txt2Vec; 10 | 11 | namespace CherubNLP.UnitTest.Vector 12 | { 13 | [TestClass] 14 | public class Word2VecTest : TestEssential 15 | { 16 | [TestMethod] 17 | public void Word2Vec() 18 | { 19 | string sentence = "stop this song"; 20 | List words = sentence.Split(' ').ToList(); 21 | Args args = new Args(); 22 | args.ModelFile = @"C:\Users\bpeng\Desktop\BoloReborn\Txt2VecDemo\wordvec_enu.bin"; 23 | VectorGenerator vg = new VectorGenerator(args); 24 | 25 | vg.Distance(words); 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /CherubNLP.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.4.33213.308 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CherubNLP", "CherubNLP\CherubNLP.csproj", "{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CherubNLP.UnitTest", "CherubNLP.UnitTest\CherubNLP.UnitTest.csproj", "{958AC705-B9D7-4071-B135-048DE1EEE87A}" 9 | EndProject 10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CherubNLP.Console", "CherubNLP.Console\CherubNLP.Console.csproj", "{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Any CPU = Debug|Any CPU 15 | Debug|x64 = Debug|x64 16 | GPU|Any CPU = GPU|Any CPU 17 | GPU|x64 = GPU|x64 18 | Release|Any CPU = Release|Any CPU 19 | Release|x64 = Release|x64 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | 
{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|x64.ActiveCfg = Debug|x64 25 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|x64.Build.0 = Debug|x64 26 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|Any CPU.ActiveCfg = Release|Any CPU 27 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|Any CPU.Build.0 = Release|Any CPU 28 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|x64.ActiveCfg = Release|x64 29 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|x64.Build.0 = Release|x64 30 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|Any CPU.ActiveCfg = Release|Any CPU 31 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|Any CPU.Build.0 = Release|Any CPU 32 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|x64.ActiveCfg = Release|Any CPU 33 | {5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|x64.Build.0 = Release|Any CPU 34 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 35 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|Any CPU.Build.0 = Debug|Any CPU 36 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|x64.ActiveCfg = Debug|x64 37 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|x64.Build.0 = Debug|x64 38 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|Any CPU.ActiveCfg = Release|Any CPU 39 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|Any CPU.Build.0 = Release|Any CPU 40 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|x64.ActiveCfg = Release|x64 41 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|x64.Build.0 = Release|x64 42 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|Any CPU.ActiveCfg = Release|Any CPU 43 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|Any CPU.Build.0 = Release|Any CPU 44 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|x64.ActiveCfg = Release|Any CPU 45 | {958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|x64.Build.0 = Release|Any CPU 46 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|Any 
CPU.ActiveCfg = Debug|Any CPU 47 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|Any CPU.Build.0 = Debug|Any CPU 48 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|x64.ActiveCfg = Debug|x64 49 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|x64.Build.0 = Debug|x64 50 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|Any CPU.ActiveCfg = Release|Any CPU 51 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|Any CPU.Build.0 = Release|Any CPU 52 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|x64.ActiveCfg = Release|x64 53 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|x64.Build.0 = Release|x64 54 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|Any CPU.ActiveCfg = Release|Any CPU 55 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|Any CPU.Build.0 = Release|Any CPU 56 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|x64.ActiveCfg = Release|Any CPU 57 | {98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|x64.Build.0 = Release|Any CPU 58 | EndGlobalSection 59 | GlobalSection(SolutionProperties) = preSolution 60 | HideSolutionNode = FALSE 61 | EndGlobalSection 62 | GlobalSection(ExtensibilityGlobals) = postSolution 63 | SolutionGuid = {26DCDD72-01C4-45FA-85B7-2BE26A7D153C} 64 | EndGlobalSection 65 | EndGlobal 66 | -------------------------------------------------------------------------------- /CherubNLP/CherubNLP.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | netstandard2.0 5 | 0.5.0 6 | Haiping Chen 7 | https://github.com/SciSharp/CherubNLP 8 | git 9 | NLP 10 | 11 | Apache 2.0 12 | https://github.com/SciSharp/CherubNLP 13 | true 14 | false 15 | Add more stemming regex. 16 | Add Cosine Similarity algorithm. 17 | Upgrade FastText wrapper to v1.2.3. 18 | 0.5.0.0 19 | 0.5.0.0 20 | https://github.com/SciSharp 21 | .NET text analysis tool. 22 | Tokenize, Stemming and Lemmatization. 
/// <summary>
/// Resolves a classifier implementation by type name and drives training and
/// prediction with a single set of <see cref="ClassifyOptions"/>.
/// </summary>
/// <typeparam name="IFeatureExtractor">
/// Feature extractor type; it is instantiated once in the constructor.
/// </typeparam>
public class ClassifierFactory<IFeatureExtractor>
    where IFeatureExtractor : ITextFeatureExtractor, new()
{
    private SupportedLanguage _lang;

    private IClassifier _classifier;

    private ClassifyOptions _options;

    private IFeatureExtractor featureExtractor;

    public ClassifierFactory(ClassifyOptions options, SupportedLanguage lang)
    {
        _lang = lang;
        _options = options;
        featureExtractor = new IFeatureExtractor();
    }

    /// <summary>
    /// Creates the classifier whose type name equals <paramref name="name"/> and
    /// remembers it for later <see cref="Train"/> / <see cref="Classify"/> calls.
    /// </summary>
    /// <param name="name">Simple (non-namespaced) type name of a concrete classifier in the CherubNLP assembly.</param>
    /// <returns>The newly created classifier instance.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="name"/> is empty or no matching concrete <see cref="IClassifier"/> type exists.
    /// </exception>
    public IClassifier GetClassifer(string name)
    {
        if (String.IsNullOrEmpty(name))
        {
            throw new ArgumentException("Classifier name must not be empty.", nameof(name));
        }

        // Only concrete IClassifier implementations are candidates; this avoids picking up an
        // unrelated type that merely shares the name and failing later on the cast.
        Type type = Assembly.Load(new AssemblyName("CherubNLP"))
            .GetTypes()
            .Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType"))
            .FirstOrDefault(x => x.Name == name && typeof(IClassifier).IsAssignableFrom(x));

        if (type == null)
        {
            // Previously a null Type was handed to Activator.CreateInstance, which failed
            // with an unhelpful ArgumentNullException.
            throw new ArgumentException($"No classifier type named '{name}' was found in the CherubNLP assembly.", nameof(name));
        }

        _classifier = (IClassifier)Activator.CreateInstance(type);
        return _classifier;
    }

    /// <summary>
    /// Trains the current classifier on <paramref name="sentences"/> and saves the model.
    /// </summary>
    /// <exception cref="InvalidOperationException">No classifier has been selected yet.</exception>
    public void Train(List<Sentence> sentences)
    {
        EnsureClassifier();

        _classifier.Train(sentences, _options);
        _classifier.SaveModel(_options);
    }

    /// <summary>
    /// Loads the model described by the factory options and classifies <paramref name="sentence"/>.
    /// </summary>
    /// <returns>Classifier output ordered by descending second tuple item.</returns>
    /// <exception cref="InvalidOperationException">No classifier has been selected yet.</exception>
    public List<Tuple<string, double>> Classify(Sentence sentence)
    {
        EnsureClassifier();

        // A fresh options object carrying only the model location is passed to the classifier.
        var options = new ClassifyOptions
        {
            ModelFilePath = _options.ModelFilePath,
            ModelDir = _options.ModelDir,
            ModelName = _options.ModelName
        };

        _classifier.LoadModel(options);

        var classes = _classifier.Classify(sentence, options);

        return classes.OrderByDescending(x => x.Item2).ToList();
    }

    // Fails fast with a clear message instead of a NullReferenceException.
    private void EnsureClassifier()
    {
        if (_classifier == null)
        {
            throw new InvalidOperationException("Call GetClassifer before training or classifying.");
        }
    }
}
/// <summary>
/// Produces one "contains &lt;word&gt;" feature per distinct alphabetic token.
/// </summary>
public class SentenceFeatureExtractor : ITextFeatureExtractor
{
    /// <summary>
    /// Builds the feature list for a tokenized sentence.
    /// </summary>
    /// <param name="words">Tokens of the sentence.</param>
    /// <returns>A feature named "contains " plus the lower-cased token text, with value "True", per kept token.</returns>
    public List<Feature> GetFeatures(List<Token> words)
    {
        var result = new List<Feature>();

        // Keep alphabetic tokens only, de-duplicated with the tokens' default equality.
        var alphabetic = words.Where(token => token.IsAlpha).Distinct().ToList();

        foreach (var token in alphabetic)
        {
            string featureName = $"contains {token.Text.ToLower()}";
            result.Add(new Feature(featureName, "True"));
        }

        return result;
    }
}
/// <summary>
/// Extracts simple character-level features from the first token of a list.
/// </summary>
public class WordFeatureExtractor : ITextFeatureExtractor
{
    /// <summary>
    /// Builds features from the text of <c>words[0]</c>.
    /// </summary>
    /// <param name="words">Token list; only the first token is inspected.</param>
    /// <returns>
    /// Always an "alwayson" feature; plus "startswith" and "endswith" features holding the
    /// lower-cased first and last character when the first token has non-empty text.
    /// </returns>
    public List<Feature> GetFeatures(List<Token> words)
    {
        var features = new List<Feature>
        {
            new Feature("alwayson", "True")
        };

        // Previously an empty list or empty token text raised an out-of-range exception here.
        string text = (words != null && words.Count > 0) ? words[0]?.Text : null;
        if (String.IsNullOrEmpty(text))
        {
            return features;
        }

        features.Add(new Feature("startswith", text[0].ToString().ToLower()));
        features.Add(new Feature("endswith", text[text.Length - 1].ToString().ToLower()));

        return features;
    }
}
/// <summary>
/// A corpus reader for CoNLL-style files. These files consist of a series of
/// sentences separated by blank lines. Each line describes one word; this reader
/// takes the first space-separated column as the word text and the second as its tag.
/// </summary>
public class CoNLLReader
{
    /// <summary>
    /// Reads the file <c>options.DataDir/options.FileName</c> into sentences.
    /// </summary>
    /// <param name="options">Supplies the directory and file name to read.</param>
    /// <returns>One <see cref="Sentence"/> per blank-line-separated group of word lines.</returns>
    public List<Sentence> Read(ReaderOptions options)
    {
        var sentences = new List<Sentence>();

        using (var reader = new StreamReader(Path.Combine(options.DataDir, options.FileName)))
        {
            var sentence = new Sentence { Words = new List<Token>() };

            // Read until ReadLine returns null. The previous read-ahead loop tested
            // EndOfStream after fetching a line, so the final line was never processed.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (String.IsNullOrEmpty(line))
                {
                    // Consecutive blank lines no longer produce empty sentences.
                    if (sentence.Words.Count > 0)
                    {
                        sentences.Add(sentence);
                        sentence = new Sentence { Words = new List<Token>() };
                    }

                    continue;
                }

                var columns = line.Split(' ');

                sentence.Words.Add(new Token
                {
                    Text = columns[0],
                    // A line with a single column leaves the tag unset instead of throwing.
                    Pos = columns.Length > 1 ? columns[1] : null
                });
            }

            // Files that do not end with a blank line still yield their last sentence.
            if (sentence.Words.Count > 0)
            {
                sentences.Add(sentence);
            }
        }

        return sentences;
    }
}
/// <summary>
/// Reads a Kaggle-style labeled text file that has one header line followed by one
/// record per line. Fields are cut at fixed character offsets.
/// </summary>
/// <remarks>
/// NOTE(review): the offsets presumably match rows shaped like
/// <c>"id12345","some text","ABC"</c> (7-character id, 3-character label) — confirm against the data set.
/// </remarks>
public class KaggleTextDataReader
{
    // Shortest line for which all three Substring calls below stay in range.
    private const int MinimumRecordLength = 18;

    /// <summary>
    /// Reads <c>options.DataDir/options.FileName</c>; when <c>DataDir</c> is empty it is
    /// set to the application base directory.
    /// </summary>
    /// <param name="options">Supplies the directory and file name to read.</param>
    /// <returns>One <see cref="Sentence"/> (id, text, label) per non-empty record line.</returns>
    /// <exception cref="FormatException">A non-empty record line is too short to hold the fixed-offset fields.</exception>
    public List<Sentence> Read(ReaderOptions options)
    {
        if (String.IsNullOrEmpty(options.DataDir))
        {
            options.DataDir = AppContext.BaseDirectory;
        }

        var sentences = new List<Sentence>();

        using (var reader = new StreamReader(Path.Combine(options.DataDir, options.FileName)))
        {
            // skip header
            reader.ReadLine();

            // Loop until ReadLine returns null. The previous read-ahead loop tested EndOfStream
            // after fetching a line, which dropped the last record and left a null line for
            // header-only files.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length == 0)
                {
                    continue;
                }

                if (line.Length < MinimumRecordLength)
                {
                    throw new FormatException($"Record is too short to parse: '{line}'");
                }

                sentences.Add(new Sentence
                {
                    Id = line.Substring(1, 7),
                    Text = line.Substring(11, line.Length - 18),
                    Label = line.Substring(line.Length - 4, 3)
                });
            }
        }

        return sentences;
    }
}
/// <summary>
/// Convert a collection of text documents to a matrix of token counts
/// </summary>
public class CountFeatureExtractor : IFeatureExtractor
{
    public int Dimension { get; set; }
    public List<Sentence> Sentences { get; set; }

    public List<Tuple<string, int>> Dictionary { get; set; }
    public List<string> Features { get; set; }
    public string ModelFile { get; set; }

    /// <summary>
    /// Fills <c>Vector</c> on every sentence with per-feature lemma counts, and sets
    /// <c>Vector</c> on every word whose lemma is in <see cref="Dictionary"/> to the number
    /// of times that lemma occurs in the same sentence.
    /// </summary>
    /// <param name="features">Not used by this extractor; the feature list is derived from <see cref="Sentences"/>.</param>
    public void Vectorize(List<string> features)
    {
        CalculateDictionary();

        // Membership test for dictionary words, built once instead of a linear Find per word.
        var dictionaryWords = new HashSet<string>(Dictionary.Select(x => x.Item1));

        foreach (var s in Sentences)
        {
            // Words grouped by lemma; a lookup tolerates null keys and returns an empty
            // sequence for lemmas that do not occur in this sentence.
            var wordsByLemma = s.Words.ToLookup(w => w.Lemma);

            s.Vector = new double[Features.Count];
            for (int i = 0; i < Features.Count; i++)
            {
                s.Vector[i] = wordsByLemma[Features[i]].Count();
            }

            foreach (var word in s.Words)
            {
                if (dictionaryWords.Contains(word.Lemma))
                {
                    word.Vector = wordsByLemma[word.Lemma].Count();
                }
            }
        }
    }

    /// <summary>
    /// Builds <see cref="Features"/> (sorted distinct lemmas of alphabetic words) and
    /// <see cref="Dictionary"/> (sorted distinct lemmas of all words with their corpus
    /// frequency). Does nothing when a dictionary is already present.
    /// </summary>
    private void CalculateDictionary()
    {
        if (Dictionary != null)
        {
            return;
        }

        var allWords = Sentences.SelectMany(s => s.Words).ToList();

        Features = allWords.Where(w => w.IsAlpha)
            .Select(x => x.Lemma)
            .Distinct()
            .OrderBy(x => x)
            .ToList();

        // One grouping pass replaces a full corpus scan per distinct lemma.
        Dictionary = allWords.GroupBy(x => x.Lemma)
            .OrderBy(g => g.Key)
            .Select(g => new Tuple<string, int>(g.Key, g.Count()))
            .ToList();
    }
}
/// <summary>
/// A multiset-like tally that maps keys to integer counts.
/// </summary>
public interface ICounter<T>
{
    int Count { get; }
    int Total { get; }
    int this[T key] { get; set; }
    IEnumerable<KeyValuePair<T, int>> Elements { get; }

    /// <summary>
    /// Lists the n most common elements from the most common to the least.
    /// </summary>
    /// <param name="n">Number of elements, list all elements if n is less than 0.</param>
    /// <returns>Key/count pairs with a positive count, ordered by descending count.</returns>
    IEnumerable<KeyValuePair<T, int>> MostCommon(int n = -1);

    /// <summary>
    /// Subtracts items from a counter.
    /// </summary>
    /// <param name="items">Items whose counts are each decreased by one.</param>
    void Subtract(IEnumerable<T> items);

    /// <summary>
    /// Subtracts counts from another counter.
    /// </summary>
    /// <param name="other">Counter whose counts are subtracted.</param>
    void Subtract(ICounter<T> other);

    /// <summary>
    /// Adds items to a counter.
    /// </summary>
    /// <param name="items">Items whose counts are each increased by one.</param>
    void Add(IEnumerable<T> items);

    /// <summary>
    /// Adds another counter.
    /// </summary>
    /// <param name="other">Counter whose counts are added.</param>
    void Add(ICounter<T> other);

    /// <summary>
    /// Union is the maximum of value in either of the input <see cref="ICounter{T}"/>.
    /// </summary>
    /// <param name="other">The other counter.</param>
    ICounter<T> Union(ICounter<T> other);

    void Remove(T key);
    void Clear();
    bool Contains(T key);
}

/// <summary>
/// Dictionary-backed <see cref="ICounter{T}"/>. Reading a key that was never counted yields 0.
/// </summary>
public class Counter<T> : ICounter<T>
{
    private Dictionary<T, int> data = new Dictionary<T, int>();

    public Counter() { }

    /// <summary>Creates a counter and tallies <paramref name="items"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="items"/> is null.</exception>
    public Counter(IEnumerable<T> items)
    {
        Add(items);
    }

    public int Count => data.Count;
    public int Total => data.Values.Sum();
    public IEnumerable<KeyValuePair<T, int>> Elements => data;

    public int this[T key]
    {
        // Single lookup; absent keys read as 0 without being inserted.
        get => data.TryGetValue(key, out var count) ? count : 0;
        set => data[key] = value;
    }

    public IEnumerable<KeyValuePair<T, int>> MostCommon(int n = -1)
    {
        var pairs = data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value);
        return n < 0 ? pairs : pairs.Take(n);
    }

    public void Subtract(IEnumerable<T> items)
    {
        if (items == null)
        {
            throw new ArgumentNullException(nameof(items));
        }

        foreach (var item in items)
        {
            this[item] = this[item] - 1;
        }
    }

    public void Subtract(ICounter<T> other)
    {
        if (other == null)
        {
            throw new ArgumentNullException(nameof(other));
        }

        // Snapshot first: 'other' may be this very counter, and writing to the
        // dictionary while enumerating it is not safe.
        foreach (var pair in other.Elements.ToList())
        {
            this[pair.Key] -= pair.Value;
        }
    }

    public void Add(IEnumerable<T> items)
    {
        if (items == null)
        {
            throw new ArgumentNullException(nameof(items));
        }

        foreach (var item in items)
        {
            this[item] = this[item] + 1;
        }
    }

    public void Add(ICounter<T> other)
    {
        if (other == null)
        {
            throw new ArgumentNullException(nameof(other));
        }

        // Snapshot for the same reason as in Subtract(ICounter<T>).
        foreach (var pair in other.Elements.ToList())
        {
            this[pair.Key] += pair.Value;
        }
    }

    public ICounter<T> Union(ICounter<T> other)
    {
        if (other == null)
        {
            throw new ArgumentNullException(nameof(other));
        }

        var result = new Counter<T>();

        // Keys of this counter: keep the larger of the two counts (missing in 'other' reads as 0).
        foreach (var pair in data)
        {
            result[pair.Key] = Math.Max(pair.Value, other[pair.Key]);
        }

        // Keys that exist only in 'other' are copied unchanged.
        foreach (var pair in other.Elements)
        {
            if (!Contains(pair.Key))
            {
                result[pair.Key] = pair.Value;
            }
        }

        return result;
    }

    public void Remove(T key)
    {
        // Dictionary.Remove is already a no-op for absent keys.
        data.Remove(key);
    }

    public void Clear()
    {
        data.Clear();
    }

    public bool Contains(T key)
    {
        return data.ContainsKey(key);
    }
}
| 20 | public static bool IsNotNull(this object obj) 21 | { 22 | return obj != null; 23 | } 24 | 25 | #endregion 26 | 27 | 28 | #region Enumerable 29 | 30 | public static bool IsEmpty(this IEnumerable enumerable) 31 | { 32 | return (enumerable == null) || !enumerable.Any(); 33 | } 34 | 35 | public static bool IsNotEmpty(this IEnumerable enumerable) 36 | { 37 | return (enumerable != null) && enumerable.Any(); 38 | } 39 | 40 | public static TValue GetValueOrDefault(this IDictionary d, TKey key) 41 | { 42 | return d.ContainsKey(key) ? d[key] : default(TValue); 43 | } 44 | 45 | public static TValue GetDefault(this IDictionary dict, TKey key, TValue defaultValue) 46 | { 47 | if (dict.ContainsKey(key)) 48 | { 49 | return dict[key]; 50 | } 51 | return defaultValue; 52 | } 53 | 54 | public static void Update(this IDictionary dict, IDictionary other) 55 | { 56 | foreach (var key in other.Keys) 57 | { 58 | dict[key] = other[key]; 59 | } 60 | } 61 | 62 | #endregion 63 | 64 | #region String & Text 65 | 66 | public static string Left(this string s, int endIndex) 67 | { 68 | if (string.IsNullOrEmpty(s)) 69 | { 70 | return s; 71 | } 72 | 73 | return s.Substring(0, endIndex); 74 | } 75 | 76 | public static string Right(this string s, int startIndex) 77 | { 78 | if (string.IsNullOrEmpty(s)) 79 | { 80 | return s; 81 | } 82 | 83 | 84 | return s.Substring(startIndex); 85 | } 86 | 87 | public static string Sub(this string s, int startIndex, int endIndex) 88 | { 89 | return s.Substring(startIndex, endIndex - startIndex); 90 | } 91 | 92 | public static bool IsInt32(this string s) 93 | { 94 | return RegexDigits.IsMatch(s); 95 | } 96 | 97 | public static string[] SplitLines(this string s) 98 | { 99 | return RegexNewline.Split(s); 100 | } 101 | 102 | public static string Join(this IEnumerable inputs, string separator = ", ") 103 | { 104 | return string.Join(separator, inputs); 105 | } 106 | 107 | public static IEnumerable SubGroupValues(this GroupCollection groups) 108 | { 109 | var result 
/// <summary>
/// Helpers that read a whole text file from a path, either as one string or line by line.
/// </summary>
public static class FileExtension
{
    /// <summary>Reads the whole file at <paramref name="path"/> as UTF-8 text.</summary>
    public static string ReadEmbeddedAllLine(string path)
    {
        return ReadEmbeddedAllLine(path, Encoding.UTF8);
    }

    /// <summary>
    /// Reads the whole file at <paramref name="path"/> using <paramref name="encoding"/>.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <param name="encoding">Text encoding; UTF-8 is used when null.</param>
    /// <returns>The complete file content.</returns>
    public static string ReadEmbeddedAllLine(string path, Encoding encoding)
    {
        // The encoding argument used to be ignored, so non-UTF-8 files were decoded incorrectly.
        using (var sr = new StreamReader(path, encoding ?? Encoding.UTF8))
        {
            return sr.ReadToEnd();
        }
    }

    /// <summary>
    /// Reads all lines of the file at <paramref name="path"/> using <paramref name="encoding"/>.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <param name="encoding">Text encoding; UTF-8 is used when null.</param>
    /// <returns>The lines of the file, in order, without line terminators.</returns>
    public static List<string> ReadEmbeddedAllLines(string path, Encoding encoding)
    {
        var list = new List<string>();
        using (var sr = new StreamReader(path, encoding ?? Encoding.UTF8))
        {
            string item;
            while ((item = sr.ReadLine()) != null)
            {
                list.Add(item);
            }
        }
        return list;
    }

    /// <summary>Reads all lines of the file at <paramref name="path"/> as UTF-8 text.</summary>
    public static List<string> ReadEmbeddedAllLines(string path)
    {
        return ReadEmbeddedAllLines(path, Encoding.UTF8);
    }
}
public int Frequency { get; set; } 12 | public Dictionary Children { get; set; } 13 | 14 | public TrieNode(char ch) 15 | { 16 | Char = ch; 17 | Frequency = 0; 18 | 19 | // TODO: or an empty dict? 20 | //Children = null; 21 | } 22 | 23 | public int Insert(string s, int pos, int freq = 1) 24 | { 25 | if (string.IsNullOrEmpty(s) || pos >= s.Length) 26 | { 27 | return 0; 28 | } 29 | 30 | if (Children == null) 31 | { 32 | Children = new Dictionary(); 33 | } 34 | 35 | var c = s[pos]; 36 | if (!Children.ContainsKey(c)) 37 | { 38 | Children[c] = new TrieNode(c); 39 | } 40 | 41 | var curNode = Children[c]; 42 | if (pos == s.Length - 1) 43 | { 44 | curNode.Frequency += freq; 45 | return curNode.Frequency; 46 | } 47 | 48 | return curNode.Insert(s, pos + 1, freq); 49 | } 50 | 51 | public TrieNode Search(string s, int pos) 52 | { 53 | if (string.IsNullOrEmpty(s)) 54 | { 55 | return null; 56 | } 57 | 58 | // if out of range or without any child nodes 59 | if (pos >= s.Length || Children == null) 60 | { 61 | return null; 62 | } 63 | // if reaches the last char of s, it's time to make the decision. 64 | if (pos == s.Length - 1) 65 | { 66 | return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null; 67 | } 68 | // continue if necessary. 69 | return Children.ContainsKey(s[pos]) ? 
Children[s[pos]].Search(s, pos + 1) : null; 70 | } 71 | } 72 | 73 | public interface ITrie 74 | { 75 | //string BestMatch(string word, long maxTime); 76 | bool Contains(string word); 77 | int Frequency(string word); 78 | int Insert(string word, int freq = 1); 79 | //bool Remove(string word); 80 | int Count { get; } 81 | int TotalFrequency { get; } 82 | } 83 | 84 | public class Trie : ITrie 85 | { 86 | private static readonly char RootChar = '\0'; 87 | 88 | internal TrieNode Root; 89 | 90 | public int Count { get; private set; } 91 | public int TotalFrequency { get; private set; } 92 | 93 | public Trie() 94 | { 95 | Root = new TrieNode(RootChar); 96 | Count = 0; 97 | } 98 | 99 | public bool Contains(string word) 100 | { 101 | CheckWord(word); 102 | 103 | var node = Root.Search(word.Trim(), 0); 104 | return node.IsNotNull() && node.Frequency > 0; 105 | } 106 | 107 | public bool ContainsPrefix(string word) 108 | { 109 | CheckWord(word); 110 | 111 | var node = Root.Search(word.Trim(), 0); 112 | return node.IsNotNull(); 113 | } 114 | 115 | public int Frequency(string word) 116 | { 117 | CheckWord(word); 118 | 119 | var node = Root.Search(word.Trim(), 0); 120 | return node.IsNull() ? 0 : node.Frequency; 121 | } 122 | 123 | public int Insert(string word, int freq = 1) 124 | { 125 | CheckWord(word); 126 | 127 | var i = Root.Insert(word.Trim(), 0, freq); 128 | if (i > 0) 129 | { 130 | TotalFrequency += freq; 131 | Count++; 132 | } 133 | 134 | return i; 135 | } 136 | 137 | public IEnumerable ChildChars(string prefix) 138 | { 139 | var node = Root.Search(prefix.Trim(), 0); 140 | return node.IsNull() || node.Children.IsNull() ? 
null : node.Children.Select(p => p.Key); 141 | } 142 | 143 | private void CheckWord(string word) 144 | { 145 | if (string.IsNullOrWhiteSpace(word)) 146 | { 147 | throw new ArgumentException("word must not be null or whitespace"); 148 | } 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /CherubNLP/Jieba.NET/ConfigManager.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | 4 | namespace JiebaNet.Segmenter 5 | { 6 | public class ConfigManager 7 | { 8 | public static string ConfigFileBaseDir 9 | { 10 | get 11 | { 12 | string path = String.Empty; 13 | 14 | var dir = AppDomain.CurrentDomain.GetData("JiebaConfigFileDir"); 15 | if (dir == null) 16 | { 17 | path = "Resources"; 18 | } 19 | else 20 | { 21 | path = Path.Combine(dir.ToString(), "Resources"); 22 | } 23 | 24 | return path; 25 | } 26 | } 27 | 28 | public static string MainDictFile 29 | { 30 | get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); } 31 | } 32 | 33 | public static string ProbTransFile 34 | { 35 | get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); } 36 | } 37 | 38 | public static string ProbEmitFile 39 | { 40 | get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); } 41 | } 42 | 43 | public static string PosProbStartFile 44 | { 45 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); } 46 | } 47 | 48 | public static string PosProbTransFile 49 | { 50 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); } 51 | } 52 | 53 | public static string PosProbEmitFile 54 | { 55 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); } 56 | } 57 | 58 | public static string CharStateTabFile 59 | { 60 | get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); } 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- 
/CherubNLP/Jieba.NET/Constants.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;

namespace JiebaNet.Segmenter
{
    /// <summary>Shared constants: the minimum log-probability and part-of-speech tag groups.</summary>
    public class Constants
    {
        public static readonly double MinProb = -3.14e100;

        public static readonly List<string> NounPos = new List<string>() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" };
        public static readonly List<string> VerbPos = new List<string>() { "v", "vd", "vg", "vi", "vn", "vq" };
        // Must stay below NounPos and VerbPos: static initializers run in textual order.
        public static readonly List<string> NounAndVerbPos = NounPos.Union(VerbPos).ToList();
        public static readonly List<string> IdiomPos = new List<string>() { "i" };
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/DefaultDictionary.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace JiebaNet.Segmenter
{
    /// <summary>
    /// Dictionary whose indexer never throws for a missing key: reading an absent
    /// key first stores <c>default(TValue)</c> under it and then returns that value.
    /// </summary>
    /// <remarks>
    /// The indexer hides the base one with <c>new</c>, so the defaulting behaviour
    /// applies only when the variable is typed as <see cref="DefaultDictionary{TKey, TValue}"/>.
    /// </remarks>
    public class DefaultDictionary<TKey, TValue> : Dictionary<TKey, TValue>
    {
        public new TValue this[TKey key]
        {
            get
            {
                TValue value;
                if (!TryGetValue(key, out value))
                {
                    value = default(TValue);
                    Add(key, value);
                }
                return value;
            }
            set { base[key] = value; }
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/FinalSeg/IFinalSeg.cs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP/Jieba.NET/FinalSeg/IFinalSeg.cs
--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/JiebaTagger.cs:
--------------------------------------------------------------------------------
using CherubNLP;
using CherubNLP.Tag;
using JiebaNet.Segmenter.PosSeg;
using System;
using
System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace BotSharp.Core.Engines.Jieba.NET
{
    /// <summary>
    /// Part-of-speech tagger that copies the flags produced by the Jieba
    /// <see cref="PosSegmenter"/> onto the words of a sentence.
    /// </summary>
    public class JiebaTagger : ITagger
    {
        private PosSegmenter posSeg;

        /// <summary>
        /// Tags each word of <paramref name="sentence"/> with the flag of the
        /// segment at the same position.
        /// </summary>
        public void Tag(Sentence sentence, TagOptions options)
        {
            Init();

            var tokens = posSeg.Cut(sentence.Text).ToList();

            // The segmenter may return fewer segments than the sentence has words;
            // only positions present in both lists are tagged.
            int count = Math.Min(sentence.Words.Count, tokens.Count);
            for (int i = 0; i < count; i++)
            {
                sentence.Words[i].Pos = tokens[i].Flag;
                sentence.Words[i].Tag = tokens[i].Flag;
            }
        }

        /// <summary>No-op: this tagger has nothing to train.</summary>
        public void Train(List<Sentence> sentences, TagOptions options)
        {

        }

        // Creates the segmenter on first use, pointing Jieba at the "DataPath" directory.
        private void Init()
        {
            if (posSeg == null)
            {
                var dataPath = AppDomain.CurrentDomain.GetData("DataPath");
                if (dataPath == null)
                {
                    throw new InvalidOperationException("AppDomain data \"DataPath\" must be set before using JiebaTagger.");
                }

                string contentDir = dataPath.ToString();
                AppDomain.CurrentDomain.SetData("JiebaConfigFileDir", contentDir);

                posSeg = new PosSegmenter();
            }
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/JiebaTokenizer.cs:
--------------------------------------------------------------------------------
using CherubNLP.Tokenize;
using JiebaNet.Segmenter;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Token = CherubNLP.Tokenize.Token;

namespace BotSharp.Core.Engines.Jieba.NET
{
    /// <summary>
    /// Tokenizer backed by <see cref="JiebaSegmenter"/>.
    /// </summary>
    public class JiebaTokenizer : TokenizerBase, ITokenizer
    {
        private JiebaSegmenter segmenter;

        /// <summary>
        /// Cuts <paramref name="sentence"/> into tokens and lets the base class
        /// correct their positions against the original text.
        /// </summary>
        public List<Token> Tokenize(string sentence, TokenizationOptions options)
        {
            Init();

            var tokens = segmenter.Cut(sentence)
                .Select(x => new Token
                {
                    Text = x
                }).ToList();

            CorrectTokenPosition(sentence, tokens);

            return tokens;
        }

        // Creates the segmenter on first use and loads userdict.txt from the "DataPath" directory.
        private void Init()
        {
            if (segmenter == null)
            {
                var dataPath = AppDomain.CurrentDomain.GetData("DataPath");
                if (dataPath == null)
                {
                    throw new InvalidOperationException("AppDomain data \"DataPath\" must be set before using JiebaTokenizer.");
                }

                string contentDir = dataPath.ToString();
                AppDomain.CurrentDomain.SetData("JiebaConfigFileDir", contentDir);

                segmenter = new JiebaSegmenter();
                segmenter.LoadUserDict(Path.Combine(contentDir, "userdict.txt"));
            }
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Node.cs:
--------------------------------------------------------------------------------
namespace JiebaNet.Segmenter
{
    /// <summary>Immutable character node that links back to its parent node.</summary>
    public class Node
    {
        public char Value { get; private set; }
        public Node Parent { get; private set; }

        public Node(char value, Node parent)
        {
            Value = value;
            Parent = parent;
        }
    }
}
--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Pair.cs:
--------------------------------------------------------------------------------
namespace JiebaNet.Segmenter
{
    /// <summary>A key paired with a frequency value.</summary>
    public class Pair<TKey>
    {
        public TKey Key { get; set; }
        public double Freq { get; set; }

        public Pair(TKey key, double freq)
        {
            Key = key;
            Freq = freq;
        }

        public override string ToString()
        {
            return "Candidate [Key=" + Key + ", Freq=" + Freq + "]";
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/PosSeg/Pair.cs:
--------------------------------------------------------------------------------
namespace JiebaNet.Segmenter.PosSeg
{
    /// <summary>A word together with its part-of-speech flag; prints as "word/flag".</summary>
    public class Pair
    {
        public string Word { get; set; }
        public string Flag { get; set; }

        public Pair(string word, string flag)
        {
            Word = word;
            Flag = flag;
        }

        public override string ToString()
        {
            return string.Format("{0}/{1}", Word, Flag);
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/README.rst:
--------------------------------------------------------------------------------
BotSharp uses the jieba.NetCore to do tokenization. (https://github.com/1483523635/jieba.NetCore)

Please follow the install instruction (https://github.com/anderscui/jieba.NET/)

--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Token.cs:
--------------------------------------------------------------------------------
namespace JiebaNet.Segmenter
{
    /// <summary>A word together with its start and end index.</summary>
    public class Token
    {
        public string Word { get; set; }
        public int StartIndex { get; set; }
        public int EndIndex { get; set; }

        public Token(string word, int startIndex, int endIndex)
        {
            Word = word;
            StartIndex = startIndex;
            EndIndex = endIndex;
        }

        public override string ToString()
        {
            return string.Format("[{0}, ({1}, {2})]", Word, StartIndex, EndIndex);
        }
    }
}
--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/WordDictionary.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using JiebaNet.Segmenter.Common;
using System.Reflection;

namespace JiebaNet.Segmenter
{
    /// <summary>
    /// Lazily created singleton holding the main dictionary: every word maps to its
    /// frequency, and every proper prefix of a word is present with frequency 0
    /// unless it is a word itself.
    /// </summary>
    public class WordDictionary
    {
        private static readonly Lazy<WordDictionary> lazy = new Lazy<WordDictionary>(() => new WordDictionary());
        private static readonly string MainDict = ConfigManager.MainDictFile;

        internal IDictionary<string, int> Trie = new Dictionary<string, int>();

        /// <summary>
        /// total occurrence of all words.
        /// </summary>
        public double Total { get; set; }

        private WordDictionary()
        {
            LoadDict();

            Debug.WriteLine("{0} words (and their prefixes)", Trie.Count);
            Debug.WriteLine("total freq: {0}", Total);
        }

        public static WordDictionary Instance
        {
            get { return lazy.Value; }
        }

        /// <summary>
        /// Loads "word freq [tag]" lines from the main dictionary file. Lines that
        /// are malformed are reported through <see cref="Debug.Fail(string)"/> and skipped.
        /// </summary>
        private void LoadDict()
        {
            try
            {
                var stopWatch = Stopwatch.StartNew();

                using (var sr = new StreamReader(ConfigManager.MainDictFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        var tokens = line.Split(' ');
                        int freq;

                        // A bad frequency now skips only this line instead of aborting the whole load.
                        if (tokens.Length < 2 ||
                            !int.TryParse(tokens[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out freq))
                        {
                            Debug.Fail(string.Format("Invalid line: {0}", line));
                            continue;
                        }

                        // AddWord also registers the prefixes and keeps Total right for repeated words.
                        AddWord(tokens[0], freq);
                    }
                }

                stopWatch.Stop();
                Debug.WriteLine("main dict load finished, time elapsed {0} ms", stopWatch.ElapsedMilliseconds);
            }
            catch (IOException e)
            {
                Debug.Fail(string.Format("{0} load failure, reason: {1}", MainDict, e.Message));
            }
        }

        /// <summary>True when <paramref name="word"/> is stored with a positive frequency.</summary>
        public bool ContainsWord(string word)
        {
            int freq;
            return Trie.TryGetValue(word, out freq) && freq > 0;
        }

        /// <summary>Frequency of <paramref name="key"/>, or 1 when it is not a stored word.</summary>
        public int GetFreqOrDefault(string key)
        {
            int freq;
            return Trie.TryGetValue(key, out freq) && freq > 0 ? freq : 1;
        }

        /// <summary>
        /// Sets the frequency of <paramref name="word"/> to <paramref name="freq"/>,
        /// adjusts <see cref="Total"/>, and makes sure all of its prefixes exist.
        /// </summary>
        /// <param name="word">Word to add or update.</param>
        /// <param name="freq">New frequency of the word.</param>
        /// <param name="tag">Not used by this method.</param>
        public void AddWord(string word, int freq, string tag = null)
        {
            int previous;
            if (Trie.TryGetValue(word, out previous) && previous > 0)
            {
                // Replace, do not accumulate: remove the old contribution first.
                Total -= previous;
            }

            Trie[word] = freq;
            Total += freq;

            for (var i = 0; i < word.Length; i++)
            {
                var wfrag = word.Substring(0, i + 1);
                if (!Trie.ContainsKey(wfrag))
                {
                    Trie[wfrag] = 0;
                }
            }
        }

        /// <summary>Marks <paramref name="word"/> as deleted by setting its frequency to 0.</summary>
        public void DeleteWord(string word)
        {
            AddWord(word, 0);
        }

        /// <summary>
        /// Suggests a frequency for <paramref name="word"/>: the larger of its current
        /// frequency and one more than the product of the segment probabilities scaled by <see cref="Total"/>.
        /// </summary>
        internal int SuggestFreq(string word, IEnumerable<string> segments)
        {
            double freq = 1;
            foreach (var seg in segments)
            {
                freq *= GetFreqOrDefault(seg) / Total;
            }

            return Math.Max((int)(freq * Total) + 1, GetFreqOrDefault(word));
        }
    }
}
--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/BasicContextGenerator.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the BasicContextGenerator.java source file found in the
//original java implementation of MaxEnt.
That source file contains the following header: 19 | 20 | // Copyright (C) 2001 Jason Baldridge 21 | // 22 | // This library is free software; you can redistribute it and/or 23 | // modify it under the terms of the GNU Lesser General Public 24 | // License as published by the Free Software Foundation; either 25 | // version 2.1 of the License, or (at your option) any later version. 26 | // 27 | // This library is distributed in the hope that it will be useful, 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | // GNU Lesser General Public License for more details. 31 | // 32 | // You should have received a copy of the GNU Lesser General Public 33 | // License along with this program; if not, write to the Free Software 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 35 | 36 | using System; 37 | 38 | namespace CherubNLP.Models 39 | { 40 | /// 41 | /// Generate contexts for maxent decisions, assuming that the input 42 | /// given to the GetContext() method is a string containing contextual 43 | /// predicates separated by spaces, e.g: 44 | ///

45 | /// cp_1 cp_2 ... cp_n 46 | ///

47 | ///
/// <author>
/// Jason Baldridge
/// </author>
/// <author>
/// Richard J. Northedge
/// </author>
/// <version>based on BasicContextGenerator.java, $Revision: 1.2 $, $Date: 2002/04/30 08:48:35 $
/// </version>
public class BasicContextGenerator : IContextGenerator<string>
{
    /// <summary>
    /// Builds up the list of contextual predicates given a string.
    /// </summary>
    /// <param name="input">
    /// string with contextual predicates separated by spaces.
    /// </param>
    /// <returns>string array of contextual predicates.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public virtual string[] GetContext(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        return input.Split(' ');
    }
}
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/BasicEventReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the BasicEventStream.java source file found in the
//original java implementation of MaxEnt.
That source file contains the following header: 19 | 20 | // Copyright (C) 2001 Jason Baldridge 21 | // 22 | // This library is free software; you can redistribute it and/or 23 | // modify it under the terms of the GNU Lesser General Public 24 | // License as published by the Free Software Foundation; either 25 | // version 2.1 of the License, or (at your option) any later version. 26 | // 27 | // This library is distributed in the hope that it will be useful, 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | // GNU Lesser General Public License for more details. 31 | // 32 | // You should have received a copy of the GNU Lesser General Public 33 | // License along with this program; if not, write to the Free Software 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 35 | 36 | using System; 37 | 38 | namespace CherubNLP.Models 39 | { 40 | /// 41 | /// An object which can deliver a stream of training events assuming 42 | /// that each event is represented as a space separated list containing 43 | /// all the contextual predicates, with the last item being the 44 | /// outcome, e.g.: 45 | /// 46 | ///

cp_1 cp_2 ... cp_n outcome

47 | ///
public class BasicEventReader : ITrainingEventReader
{
    private readonly IContextGenerator<string> mContext;
    private readonly ITrainingDataReader mDataReader;

    // Event read ahead of the caller; null when none has been read or the last line was invalid.
    private TrainingEvent mNextEvent;

    /// <summary>
    /// Constructor sets up the training event reader based on a stream of training data.
    /// </summary>
    /// <param name="dataReader">
    /// Stream of training data.
    /// </param>
    public BasicEventReader(ITrainingDataReader dataReader)
    {
        mContext = new BasicContextGenerator();

        mDataReader = dataReader;
        if (mDataReader.HasNext())
        {
            mNextEvent = CreateEvent(mDataReader.NextToken());
        }
    }

    /// <summary>
    /// Returns the next Event object held in this EventReader. Each call to ReadNextEvent advances the EventReader.
    /// </summary>
    /// <returns>
    /// the Event object which is next in this EventReader, or null when no valid event remains
    /// </returns>
    public virtual TrainingEvent ReadNextEvent()
    {
        SkipToValidEvent();

        TrainingEvent currentEvent = mNextEvent;

        // Read one observation ahead. It may be invalid (null); the next call skips past it.
        mNextEvent = mDataReader.HasNext()
            ? CreateEvent(mDataReader.NextToken())
            : null;

        return currentEvent;
    }

    /// <summary>
    /// Test whether there are any Events remaining in this EventReader.
    /// </summary>
    /// <returns>
    /// true if this EventReader has more Events
    /// </returns>
    public virtual bool HasNext()
    {
        SkipToValidEvent();
        return mNextEvent != null;
    }

    // Consumes observations until one yields an event or the data runs out.
    private void SkipToValidEvent()
    {
        while (mNextEvent == null && mDataReader.HasNext())
        {
            mNextEvent = CreateEvent(mDataReader.NextToken());
        }
    }

    // Splits "cp_1 ... cp_n outcome" at the last space; returns null when the line has no space.
    private TrainingEvent CreateEvent(string observation)
    {
        int lastSpace = observation.LastIndexOf(' ');
        if (lastSpace == -1)
        {
            return null;
        }

        string outcome = observation.Substring(lastSpace + 1);
        string[] context = mContext.GetContext(observation.Substring(0, lastSpace));
        return new TrainingEvent(outcome, context);
    }
}
}


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IContextGenerator.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the ContextGenerator.java source file found in the
//original java implementation of MaxEnt.
That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// Generate contexts for maximum entropy decisions.
    /// </summary>
    /// <author>
    /// Jason Baldridge
    /// </author>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    /// <version>
    /// based on ContextGenerator.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </version>
    public interface IContextGenerator
    {
        /// <summary>
        /// Builds up the list of contextual predicates given an object.
        /// </summary>
        /// <param name="input">Object to derive the contextual predicates from.</param>
        /// <returns>The contextual predicates as a string array.</returns>
        string[] GetContext(object input);
    }

    /// <summary>
    /// Generate contexts for maximum entropy decisions.
    /// </summary>
    /// <typeparam name="T">Type of the input the contexts are generated from.</typeparam>
    public interface IContextGenerator<T>
    {
        /// <summary>
        /// Builds up the list of contextual predicates given an object of type T.
        /// </summary>
        /// <param name="input">Object to derive the contextual predicates from.</param>
        /// <returns>The contextual predicates as a string array.</returns>
        string[] GetContext(T input);
    }

}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IMaximumEntropyModel.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J.
Northedge 2 | // 3 | // This library is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU Lesser General Public 5 | // License as published by the Free Software Foundation; either 6 | // version 2.1 of the License, or (at your option) any later version. 7 | // 8 | // This library is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU Lesser General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU Lesser General Public 14 | // License along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 | 17 | //This file is based on the MaxentModel.java source file found in the 18 | //original java implementation of MaxEnt. That source file contains the following header: 19 | 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner 21 | // 22 | // This library is free software; you can redistribute it and/or 23 | // modify it under the terms of the GNU Lesser General Public 24 | // License as published by the Free Software Foundation; either 25 | // version 2.1 of the License, or (at your option) any later version. 26 | // 27 | // This library is distributed in the hope that it will be useful, 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | // GNU General Public License for more details. 31 | // 32 | // You should have received a copy of the GNU Lesser General Public 33 | // License along with this program; if not, write to the Free Software 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 35 | 36 | using System; 37 | 38 | namespace CherubNLP.Models 39 | { 40 | /// 41 | /// Interface for maximum entropy models. 
///
/// <author>
/// Jason Baldridge
/// </author>
/// <author>
/// Richard J. Northedge
/// </author>
/// <version>
/// based on MaxentModel.java, $Revision: 1.4 $, $Date: 2003/12/09 23:13:53 $
/// </version>
public interface IMaximumEntropyModel
{
    /// <summary>
    /// Returns the number of outcomes for this model.
    /// </summary>
    /// <returns>
    /// The number of outcomes.
    /// </returns>
    int OutcomeCount
    {
        get;
    }

    /// <summary>
    /// Evaluates a context.
    /// </summary>
    /// <param name="context">
    /// A list of string names of the contextual predicates
    /// which are to be evaluated together.
    /// </param>
    /// <returns>
    /// An array of the probabilities for each of the different
    /// outcomes, all of which sum to 1.
    /// </returns>
    double[] Evaluate(string[] context);

    /// <summary>
    /// Evaluates a context.
    /// </summary>
    /// <param name="context">
    /// A list of string names of the contextual predicates
    /// which are to be evaluated together.
    /// </param>
    /// <param name="probabilities">
    /// An array which is populated with the probabilities for each of the different
    /// outcomes, all of which sum to 1.
    /// </param>
    /// <returns>
    /// an array of the probabilities for each of the different
    /// outcomes, all of which sum to 1. The probabilities array is returned if it is appropriately sized.
    /// </returns>
    double[] Evaluate(string[] context, double[] probabilities);

    /// <summary>
    /// Simple function to return the outcome associated with the index
    /// containing the highest probability in the double[].
    /// </summary>
    /// <param name="outcomes">
    /// A <code>double[]</code> as returned by the
    /// <code>Evaluate(string[] context)</code>
    /// method.
    /// </param>
    /// <returns>
    /// the string name of the best outcome
    /// </returns>
    string GetBestOutcome(double[] outcomes);

    /// <summary>
    /// Return a string matching all the outcome names with all the
    /// probabilities produced by the <code>Evaluate(string[] context)</code>
    /// method.
    /// </summary>
    /// <param name="outcomes">
    /// A <code>double[]</code> as returned by the
    /// <code>Evaluate(string[] context)</code>
    /// method.
    /// </param>
    /// <returns>
    /// string containing outcome names paired with the normalized
    /// probability (contained in the <code>double[] outcomes</code>)
    /// for each one.
    /// </returns>
    string GetAllOutcomes(double[] outcomes);

    /// <summary>
    /// Gets the string name of the outcome associated with the supplied index
    /// </summary>
    /// <param name="index">
    /// the index for which the name of the associated outcome is desired.
    /// </param>
    /// <returns>
    /// the string name of the outcome
    /// </returns>
    string GetOutcomeName(int index);

    /// <summary>
    /// Gets the index associated with the string name of the given
    /// outcome.
    /// </summary>
    /// <param name="outcome">
    /// the string name of the outcome for which the
    /// index is desired
    /// </param>
    /// <returns>
    /// the index if the given outcome label exists for this
    /// model, -1 if it does not.
    /// </returns>
    int GetOutcomeIndex(string outcome);
}
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/IGisModelReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file has no equivalent in the java MaxEnt library, because the link
//between GISModel and GISModelReader is implemented differently there. This
//interface is designed so that GIS model reader classes can hold some or all of
//their data in persistent storage rather than in memory.

using System;
using System.Collections.Generic;

namespace CherubNLP.Models.IO
{
    /// <summary>
    /// Interface for readers of GIS models.
    /// </summary>
    public interface IGisModelReader
    {
        /// <summary>
        /// Returns the value of the model's correction constant. This property should
        /// usually only be accessed by GIS model writer classes via the GisModel class.
        /// </summary>
        int CorrectionConstant
        {
            get;
        }

        /// <summary>
        /// Returns the value of the model's correction constant parameter. This property should
        /// usually only be accessed by GIS model writer classes via the GisModel class.
        /// </summary>
        double CorrectionParameter
        {
            get;
        }

        /// <summary>
        /// Returns the model's outcome labels as a string array. This method should
        /// usually only be accessed by GIS model writer classes via the GisModel class.
        /// </summary>
        string[] GetOutcomeLabels();

        /// <summary>
        /// Returns the model's outcome patterns. This method should
        /// usually only be accessed by GIS model writer classes via the GisModel class.
        /// </summary>
        int[][] GetOutcomePatterns();

        /// <summary>
        /// Returns the model's predicates. This method should
        /// usually only be accessed by GIS model writer classes via the GisModel class.
        /// </summary>
        Dictionary<string, PatternedPredicate> GetPredicates();

        /// <summary>
        /// Returns model information for a predicate, given the predicate label.
        /// </summary>
        /// <param name="predicateLabel">
        /// The predicate label to fetch information for.
        /// </param>
        /// <param name="featureCounts">
        /// Array to be passed in to the method; it should have a length equal to the number of outcomes
        /// in the model. The method increments the count of each outcome that is active in the specified
        /// predicate.
        /// </param>
        /// <param name="outcomeSums">
        /// Array to be passed in to the method; it should have a length equal to the number of outcomes
        /// in the model. The method adds the parameter values for each of the active outcomes in the
        /// predicate.
        /// </param>
        void GetPredicateData(string predicateLabel, int[] featureCounts, double[] outcomeSums);

    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/JavaBinaryGisModelReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//This file is based on the BinaryGISModelReader.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;
using System.IO;

namespace CherubNLP.Models.IO
{
    /// <summary>
    /// A reader for GIS models stored in the binary format produced by the java version
    /// of MaxEnt. This binary format stores data using big-endian values, which means
    /// that the C# version must reverse the byte order of each value in turn, making it
    /// less efficient. Use only for compatibility with the java MaxEnt library.
    /// </summary>
    /// <author>
    /// Jason Baldridge
    /// </author>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    /// <version>
    /// based on BinaryGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </version>
    public class JavaBinaryGisModelReader : GisModelReader
    {
        // Size of the reusable scratch buffer; strings longer than this get a temporary array.
        private const int ScratchBufferSize = 256;

        private readonly Stream _input;
        private readonly byte[] _buffer;
        private readonly System.Text.Encoding _encoding = System.Text.Encoding.UTF8;

        /// <summary>
        /// Constructor which directly instantiates the Stream containing
        /// the model contents. The whole model is read immediately and the
        /// stream is disposed before the constructor returns.
        /// </summary>
        /// <param name="dataInputStream">
        /// The Stream containing the model information.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="dataInputStream"/> is null.
        /// </exception>
        public JavaBinaryGisModelReader(Stream dataInputStream)
        {
            if (dataInputStream == null)
            {
                throw new ArgumentNullException(nameof(dataInputStream));
            }

            using (_input = dataInputStream)
            {
                _buffer = new byte[ScratchBufferSize];
                base.ReadModel();
            }
        }

        /// <summary>
        /// Constructor which takes a filename and creates a reader for it.
        /// The file is opened read-only, read in full, and closed again.
        /// </summary>
        /// <param name="fileName">
        /// The full path and name of the file in which the model is stored.
        /// </param>
        public JavaBinaryGisModelReader(string fileName)
        {
            using (_input = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            {
                _buffer = new byte[ScratchBufferSize];
                base.ReadModel();
            }
        }

        /// <summary>
        /// Reads a 32-bit signed integer from the model file.
        /// </summary>
        /// <exception cref="EndOfStreamException">
        /// The stream ended before four bytes could be read.
        /// </exception>
        protected override int ReadInt32()
        {
            FillBuffer(_buffer, sizeof(int));
            ToHostOrder(_buffer, sizeof(int));
            return BitConverter.ToInt32(_buffer, 0);
        }

        /// <summary>
        /// Reads a double-precision floating point number from the model file.
        /// </summary>
        /// <exception cref="EndOfStreamException">
        /// The stream ended before eight bytes could be read.
        /// </exception>
        protected override double ReadDouble()
        {
            FillBuffer(_buffer, sizeof(double));
            ToHostOrder(_buffer, sizeof(double));
            return BitConverter.ToDouble(_buffer, 0);
        }

        /// <summary>
        /// Reads a UTF-8 encoded string from the model file. The string is stored as a
        /// two-byte big-endian byte count followed by that many encoded bytes.
        /// </summary>
        /// <exception cref="EndOfStreamException">
        /// The stream ended inside the length prefix or the string data.
        /// </exception>
        // NOTE(review): if the java side writes these strings with DataOutputStream.writeUTF,
        // the payload is "modified UTF-8"; plain UTF-8 decoding is presumed sufficient here - confirm.
        protected override string ReadString()
        {
            int highByte = _input.ReadByte();
            int lowByte = _input.ReadByte();
            if (highByte < 0 || lowByte < 0)
            {
                // ReadByte returns -1 at end of stream; without this check the length would go negative.
                throw new EndOfStreamException("Unexpected end of stream while reading a string length from the GIS model.");
            }

            int byteCount = (highByte << 8) | lowByte;

            // The prefix allows up to 65535 bytes, which does not fit the shared scratch buffer.
            byte[] bytes = byteCount <= _buffer.Length ? _buffer : new byte[byteCount];
            FillBuffer(bytes, byteCount);
            return _encoding.GetString(bytes, 0, byteCount);
        }

        /// <summary>
        /// Reads exactly <paramref name="count"/> bytes into the start of <paramref name="target"/>,
        /// looping because a single Stream.Read call may return fewer bytes than requested.
        /// </summary>
        private void FillBuffer(byte[] target, int count)
        {
            int filled = 0;
            while (filled < count)
            {
                int read = _input.Read(target, filled, count - filled);
                if (read <= 0)
                {
                    throw new EndOfStreamException("Unexpected end of stream while reading the GIS model.");
                }
                filled += read;
            }
        }

        /// <summary>
        /// Converts the first <paramref name="count"/> bytes of a big-endian value to the
        /// byte order BitConverter expects on this machine.
        /// </summary>
        private static void ToHostOrder(byte[] bytes, int count)
        {
            if (BitConverter.IsLittleEndian)
            {
                Array.Reverse(bytes, 0, count);
            }
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/JavaBinaryGisModelWriter.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the BinaryGISModelWriter.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;
using System.IO;

namespace CherubNLP.Models.IO
{
    /// <summary>
    /// A writer for GIS models that saves models in the binary format used by the java
    /// version of MaxEnt. This binary format stores data using big-endian values, which means
    /// that the C# version must reverse the byte order of each value in turn, making it
    /// less efficient. Use only for compatibility with the java MaxEnt library.
    /// </summary>
    /// <author>
    /// Jason Baldridge
    /// </author>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    /// <version>
    /// based on BinaryGISModelWriter.java $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </version>
    public class JavaBinaryGisModelWriter : GisModelWriter
    {
        private Stream mOutput;
        private System.Text.Encoding mEncoding = System.Text.Encoding.UTF8;

        /// <summary>
        /// Default constructor.
        /// </summary>
        public JavaBinaryGisModelWriter()
        {
        }

        /// <summary>
        /// Takes a GisModel and a file name and writes the model to that file.
        /// The file is created (or overwritten) and closed when writing finishes.
        /// </summary>
        /// <param name="model">
        /// The GisModel which is to be persisted.
        /// </param>
        /// <param name="fileName">
        /// The name of the file in which the model is to be persisted.
        /// </param>
        public void Persist(GisModel model, string fileName)
        {
            using (mOutput = new FileStream(fileName, FileMode.Create))
            {
                base.Persist(model);
            }
        }

        /// <summary>
        /// Takes a GisModel and a Stream and writes the model to that stream.
        /// The stream is disposed once the model has been written.
        /// </summary>
        /// <param name="model">
        /// The GIS model which is to be persisted.
        /// </param>
        /// <param name="dataOutputStream">
        /// The Stream which will be used to persist the model.
        /// </param>
        public void Persist(GisModel model, Stream dataOutputStream)
        {
            using (mOutput = dataOutputStream)
            {
                base.Persist(model);
            }
        }

        /// <summary>
        /// Writes a UTF-8 encoded string to the model file as a two-byte big-endian
        /// byte count followed by the encoded bytes.
        /// </summary>
        /// <param name="data">
        /// The string data to be persisted.
        /// </param>
        /// <exception cref="ArgumentException">
        /// The encoded string is longer than 65535 bytes and cannot be described by the
        /// two-byte length prefix.
        /// </exception>
        protected override void WriteString(string data)
        {
            // Encode once; the byte count and the payload must describe the same bytes.
            byte[] encoded = mEncoding.GetBytes(data);
            if (encoded.Length > ushort.MaxValue)
            {
                // Writing a wrapped-around length would produce a file that cannot be read back.
                throw new ArgumentException("String is too long for the binary GIS model format (maximum 65535 encoded bytes).", nameof(data));
            }

            mOutput.WriteByte((byte)(encoded.Length >> 8));
            mOutput.WriteByte((byte)(encoded.Length & 0xFF));
            mOutput.Write(encoded, 0, encoded.Length);
        }

        /// <summary>
        /// Writes a 32-bit signed integer to the model file in big-endian order.
        /// </summary>
        /// <param name="data">
        /// The integer data to be persisted.
        /// </param>
        protected override void WriteInt32(int data)
        {
            WriteBigEndian(BitConverter.GetBytes(data));
        }

        /// <summary>
        /// Writes a double-precision floating point number to the model file in big-endian order.
        /// </summary>
        /// <param name="data">
        /// The floating point data to be persisted.
        /// </param>
        protected override void WriteDouble(double data)
        {
            WriteBigEndian(BitConverter.GetBytes(data));
        }

        /// <summary>
        /// Writes a value produced by BitConverter, reversing it first when the host
        /// byte order is little-endian so the file always holds big-endian data.
        /// </summary>
        private void WriteBigEndian(byte[] hostOrderBytes)
        {
            if (BitConverter.IsLittleEndian)
            {
                Array.Reverse(hostOrderBytes);
            }
            mOutput.Write(hostOrderBytes, 0, hostOrderBytes.Length);
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/PlainTextGisModelReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J.
Northedge 2 | // 3 | // This library is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU Lesser General Public 5 | // License as published by the Free Software Foundation; either 6 | // version 2.1 of the License, or (at your option) any later version. 7 | // 8 | // This library is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU Lesser General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU Lesser General Public 14 | // License along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 | 17 | //This file is based on the PlainTextGISModelReader.java source file found in the 18 | //original java implementation of MaxEnt. That source file contains the following header: 19 | 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner 21 | // 22 | // This library is free software; you can redistribute it and/or 23 | // modify it under the terms of the GNU Lesser General Public 24 | // License as published by the Free Software Foundation; either 25 | // version 2.1 of the License, or (at your option) any later version. 26 | // 27 | // This library is distributed in the hope that it will be useful, 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | // GNU General Public License for more details. 31 | // 32 | // You should have received a copy of the GNU Lesser General Public 33 | // License along with this program; if not, write to the Free Software 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 

using System;
using System.IO;

namespace CherubNLP.Models.IO
{
    /// <summary>
    /// A reader for GIS models stored in plain text format, one value per line.
    /// </summary>
    /// <author>
    /// Jason Baldridge
    /// </author>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    /// <version>
    /// based on PlainTextGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </version>
    public class PlainTextGisModelReader : GisModelReader
    {
        private StreamReader mInput;

        /// <summary>
        /// Constructor which directly instantiates the StreamReader containing
        /// the model contents. The model is read immediately and the reader is
        /// disposed before the constructor returns.
        /// </summary>
        /// <param name="reader">
        /// The StreamReader containing the model information.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="reader"/> is null.
        /// </exception>
        public PlainTextGisModelReader(StreamReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException(nameof(reader));
            }

            using (mInput = reader)
            {
                base.ReadModel();
            }
        }

        /// <summary>
        /// Constructor which takes a file and creates a reader for it.
        /// </summary>
        /// <param name="fileName">
        /// The full path and file name in which the model is stored.
        /// </param>
        public PlainTextGisModelReader(string fileName)
        {
            // NOTE(review): Encoding.UTF7 is marked obsolete on current .NET; it is kept because
            // PlainTextGisModelWriter writes files with the same encoding. Change both together.
            using (mInput = new StreamReader(fileName, System.Text.Encoding.UTF7))
            {
                base.ReadModel();
            }
        }

        /// <summary>
        /// Reads a 32-bit signed integer from the model file.
        /// </summary>
        /// <exception cref="EndOfStreamException">
        /// The model data ended where a value was expected.
        /// </exception>
        protected override int ReadInt32()
        {
            return int.Parse(ReadValueLine(), System.Globalization.CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Reads a double-precision floating point number from the model file.
        /// </summary>
        /// <exception cref="EndOfStreamException">
        /// The model data ended where a value was expected.
        /// </exception>
        protected override double ReadDouble()
        {
            return double.Parse(ReadValueLine(), System.Globalization.CultureInfo.InvariantCulture);
        }

        /// <summary>
        /// Reads a string from the model file.
        /// </summary>
        /// <returns>
        /// The next line of the model file, or null when the end of the input has been reached.
        /// </returns>
        protected override string ReadString()
        {
            return mInput.ReadLine();
        }

        /// <summary>
        /// Reads the next line for a numeric value, turning an unexpected end of input into
        /// a descriptive error instead of a null passed to the parser.
        /// </summary>
        private string ReadValueLine()
        {
            string line = mInput.ReadLine();
            if (line == null)
            {
                throw new EndOfStreamException("Unexpected end of GIS model data while reading a numeric value.");
            }
            return line;
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/PlainTextGisModelWriter.cs:
--------------------------------------------------------------------------------
// Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the PlainTextGISModelReader.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;
using System.IO;

namespace CherubNLP.Models.IO
{
    /// <summary>
    /// Model writer that saves models in plain text format, one value per line.
    /// </summary>
    /// <author>
    /// Jason Baldridge
    /// </author>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    /// <version>
    /// based on PlainTextGISModelWriter.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </version>
    public class PlainTextGisModelWriter : GisModelWriter
    {
        private StreamWriter mOutput;

        /// <summary>
        /// Default constructor.
        /// </summary>
        public PlainTextGisModelWriter()
        {
        }

        /// <summary>
        /// Takes a GIS model and a file and writes the model to that file.
        /// An existing file is overwritten.
        /// </summary>
        /// <param name="model">
        /// The GisModel which is to be persisted.
        /// </param>
        /// <param name="fileName">
        /// The name of the file in which the model is to be persisted.
        /// </param>
        public void Persist(GisModel model, string fileName)
        {
            // NOTE(review): Encoding.UTF7 is marked obsolete on current .NET; it must stay in
            // step with the encoding PlainTextGisModelReader uses to read these files.
            StreamWriter fileWriter = new StreamWriter(fileName, false, System.Text.Encoding.UTF7);

            // The StreamWriter overload writes the model and disposes the writer.
            Persist(model, fileWriter);
        }

        /// <summary>
        /// Takes a GisModel and a stream and writes the model to that stream.
        /// The writer is disposed once the model has been written.
        /// </summary>
        /// <param name="model">
        /// The GisModel which is to be persisted.
        /// </param>
        /// <param name="writer">
        /// The StreamWriter which will be used to persist the model.
        /// </param>
        public void Persist(GisModel model, StreamWriter writer)
        {
            using (mOutput = writer)
            {
                base.Persist(model);
            }
        }

        /// <summary>
        /// Writes a string to the model file, followed by a line terminator.
        /// </summary>
        /// <param name="data">
        /// The string data to be persisted.
        /// </param>
        protected override void WriteString(string data)
        {
            mOutput.WriteLine(data);
        }

        /// <summary>
        /// Writes a 32-bit signed integer to the model file, formatted with the invariant culture.
        /// </summary>
        /// <param name="data">
        /// The integer data to be persisted.
        /// </param>
        protected override void WriteInt32(int data)
        {
            string text = data.ToString(System.Globalization.CultureInfo.InvariantCulture);
            mOutput.WriteLine(text);
        }

        /// <summary>
        /// Writes a double-precision floating point number to the model file, formatted with
        /// the invariant culture.
        /// </summary>
        /// <param name="data">
        /// The floating point data to be persisted.
        /// </param>
        protected override void WriteDouble(double data)
        {
            string text = data.ToString(System.Globalization.CultureInfo.InvariantCulture);
            mOutput.WriteLine(text);
        }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/ITrainingDataIndexer.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//This file is based on the DataIndexer.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

//Copyright (C) 2003 Thomas Morton
//
//This library is free software; you can redistribute it and/or
//modify it under the terms of the GNU Lesser General Public
//License as published by the Free Software Foundation; either
//version 2.1 of the License, or (at your option) any later version.
//
//This library is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU Lesser General Public
//License along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// Object that compresses events in memory and performs feature selection.
    /// </summary>
    public interface ITrainingDataIndexer
    {
        /// <summary>
        /// Gets an array of context data calculated from the training data.
        /// </summary>
        /// <returns>
        /// Array of integer arrays, each containing the context data for an event.
        /// </returns>
        int[][] GetContexts();

        /// <summary>
        /// Gets an array indicating how many times each event is seen.
        /// </summary>
        /// <returns>
        /// Integer array with event frequencies.
        /// </returns>
        int[] GetNumTimesEventsSeen();

        /// <summary>
        /// Gets an outcome list.
        /// </summary>
        /// <returns>
        /// Integer array of outcomes.
        /// </returns>
        int[] GetOutcomeList();

        /// <summary>
        /// Gets an array of predicate labels.
        /// </summary>
        /// <returns>
        /// Array of predicate labels.
        /// </returns>
        string[] GetPredicateLabels();

        /// <summary>
        /// Gets an array of outcome labels.
        /// </summary>
        /// <returns>
        /// Array of outcome labels.
        /// </returns>
        string[] GetOutcomeLabels();
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/ITrainingDataReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the DataStream.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// A interface for objects which can deliver a stream of training data to be
    /// supplied to an ITrainingEventReader. It is not necessary to use a ITrainingDataReader in a
    /// SharpEntropy application, but it can be used to support a wider variety of formats
    /// in which your training data can be held.
    /// </summary>
    /// <typeparam name="T">
    /// The type of each slice of training data returned by <see cref="NextToken"/>.
    /// </typeparam>
    /// <author>
    /// Jason Baldridge
    /// </author>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    /// <version>
    /// based on DataStream.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </version>
    // The type parameter is declared here because NextToken returns T; it was missing
    // from this copy of the file.
    public interface ITrainingDataReader<T>
    {
        /// <summary>
        /// Returns the next slice of data held in this ITrainingDataReader.
        /// </summary>
        /// <returns>
        /// the object representing the data which is next in this
        /// ITrainingDataReader
        /// </returns>
        T NextToken();

        /// <summary>
        /// Test whether there are any training data items remaining in this ITrainingDataReader.
        /// </summary>
        /// <returns>
        /// true if this ITrainingDataReader has more data tokens
        /// </returns>
        bool HasNext();
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/ITrainingEventReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the EventStream.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// An object which can deliver a stream of training events for the GIS
    /// procedure (or others such as IIS if and when they are implemented).
    /// TrainingEventReaders don't need to use SharpEntropy.ITrainingDataReader, but doing so
    /// would provide greater flexibility for producing events from data stored in
    /// different formats.
    /// </summary>
    public interface ITrainingEventReader
    {
        /// <summary>
        /// Returns the next TrainingEvent object held in this TrainingEventReader.
        /// </summary>
        /// <returns>
        /// the TrainingEvent object which is next in this TrainingEventReader
        /// </returns>
        TrainingEvent ReadNextEvent();

        /// <summary>
        /// Test whether there are any TrainingEvents remaining in this TrainingEventReader.
        /// </summary>
        /// <returns>
        /// true if this TrainingEventReader has more TrainingEvents
        /// </returns>
        bool HasNext();
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/PatternedPredicate.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// Object containing predicate data, where the parameters are matched to
    /// the outcomes in an outcome pattern.
    /// </summary>
    /// <author>
    /// Richard J. Northedge
    /// </author>
    public class PatternedPredicate
    {
        // Parameter values supplied at construction; the array is stored as given, not copied.
        private readonly double[] _parameters;

        /// <summary>
        /// Creates a PatternedPredicate object identified by its outcome pattern.
        /// </summary>
        /// <param name="outcomePattern">
        /// Index into the outcome pattern array, specifying which outcome pattern relates to
        /// this predicate.
        /// </param>
        /// <param name="parameters">
        /// Array of parameters for this predicate.
        /// </param>
        protected internal PatternedPredicate(int outcomePattern, double[] parameters)
        {
            _parameters = parameters;
            OutcomePattern = outcomePattern;
        }

        /// <summary>
        /// Creates a PatternedPredicate object identified by name.
        /// </summary>
        /// <param name="name">
        /// The predicate name.
        /// </param>
        /// <param name="parameters">
        /// Array of parameters for this predicate.
        /// </param>
        protected internal PatternedPredicate(string name, double[] parameters)
        {
            _parameters = parameters;
            Name = name;
        }

        /// <summary>
        /// Index into array of outcome patterns. Settable so the trainer can assign it
        /// after construction.
        /// </summary>
        public int OutcomePattern { get; set; }

        /// <summary>
        /// Gets the value of a parameter from this predicate.
        /// </summary>
        /// <param name="index">
        /// index into the parameter array.
        /// </param>
        /// <returns>
        /// The parameter stored at <paramref name="index"/>.
        /// </returns>
        public double GetParameter(int index) => _parameters[index];

        /// <summary>
        /// Number of parameters associated with this predicate.
        /// </summary>
        public int ParameterCount => _parameters.Length;

        /// <summary>
        /// Name of the predicate.
        /// </summary>
        public string Name { get; set; }
    }
}

--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/PlainTextByLineDataReader.cs:
--------------------------------------------------------------------------------
//Copyright (C) 2005 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the PlainTextByLineDataStream.java source file found in the
//original java implementation of MaxEnt. That source file contains the following header:

// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;
using System.IO;

namespace CherubNLP.Models
{
    /// <summary>
    /// This ITrainingDataReader implementation takes care of reading a plain text source
    /// and returning the strings between each new line character, which is what
    /// many SharpEntropy applications need in order to create ITrainingEventReaders.
    /// </summary>
    /// <remarks>
    /// Authors: Jason Baldridge, Richard J. Northedge.
    /// Based on PlainTextByLineDataStream.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
    /// </remarks>
    public class PlainTextByLineDataReader : ITrainingDataReader
    {
        private readonly StreamReader _dataReader;

        // One line of look-ahead; null once ReadLine has reported the end of the source.
        private string _nextLine;

        /// <summary>
        /// Creates a training data reader for reading text lines from a file or other text stream.
        /// The first line is read immediately so that <see cref="HasNext"/> is valid straight away.
        /// </summary>
        /// <param name="dataSource">StreamReader containing the source of the training data.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dataSource"/> is null.</exception>
        public PlainTextByLineDataReader(StreamReader dataSource)
        {
            if (dataSource == null)
            {
                throw new ArgumentNullException(nameof(dataSource));
            }

            _dataReader = dataSource;
            _nextLine = _dataReader.ReadLine();
        }

        /// <summary>Gets the next text line from the training data.</summary>
        /// <returns>
        /// Next text line from the training data, or null when the source was already exhausted.
        /// </returns>
        public virtual string NextToken()
        {
            string currentLine = _nextLine;
            // Refill the look-ahead before handing the current line out.
            _nextLine = _dataReader.ReadLine();
            return currentLine;
        }

        /// <summary>Checks if there is any more training data.</summary>
        /// <returns>true if there is more training data to be read.</returns>
        public virtual bool HasNext()
        {
            return _nextLine != null;
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/Entropy/TrainingEvent.cs:
-------------------------------------------------------------------------------- 1 | //Copyright (C) 2005 Richard J. Northedge 2 | // 3 | // This library is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU Lesser General Public 5 | // License as published by the Free Software Foundation; either 6 | // version 2.1 of the License, or (at your option) any later version. 7 | // 8 | // This library is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU Lesser General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU Lesser General Public 14 | // License along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 | 17 | //This file is based on the Event.java source file found in the 18 | //original java implementation of MaxEnt. That source file contains the following header: 19 | 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner 21 | // 22 | // This library is free software; you can redistribute it and/or 23 | // modify it under the terms of the GNU Lesser General Public 24 | // License as published by the Free Software Foundation; either 25 | // version 2.1 of the License, or (at your option) any later version. 26 | // 27 | // This library is distributed in the hope that it will be useful, 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | // GNU General Public License for more details. 31 | // 32 | // You should have received a copy of the GNU Lesser General Public 33 | // License along with this program; if not, write to the Free Software 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// The context of a decision point during training. This includes
    /// contextual predicates and an outcome.
    /// </summary>
    /// <remarks>
    /// Authors: Jason Baldridge, Richard J. Northedge.
    /// Based on Event.java, $Revision: 1.3 $, $Date: 2003/12/09 23:13:08 $
    /// </remarks>
    public class TrainingEvent
    {
        /// <summary>
        /// The outcome label for this training event.
        /// </summary>
        public string Outcome { get; private set; }

        /// <summary>
        /// The context for this training event.
        /// </summary>
        /// <value>
        /// A string array of context values for this training event.
        /// </value>
        public string[] Context { get; private set; }

        /// <summary>
        /// Constructor for a training event.
        /// </summary>
        /// <param name="outcome">the outcome label</param>
        /// <param name="context">array containing context values</param>
        public TrainingEvent(string outcome, string[] context)
        {
            Outcome = outcome;
            Context = context;
        }

        /// <summary>
        /// Override providing text summary of the training event.
        /// </summary>
        /// <returns>
        /// The outcome, a space, then the context values joined with ", ".
        /// A null context is rendered as if it were empty.
        /// </returns>
        public override string ToString()
        {
            // string.Join rejects a null array; ToString should never throw for a
            // value the constructor accepted, so fall back to an empty context.
            string[] context = Context ?? new string[0];
            return Outcome + " " + string.Join(", ", context);
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/IndexWord.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;
using System.Linq;

namespace CherubNLP.Models
{
    /// <summary>
    /// Immutable record of one index entry: a lemma, its part of speech, the
    /// relation types and synset offsets listed for it, and its tag sense count.
    /// </summary>
    public class IndexWord
    {
        // Properties ------------------------

        /// <summary>Part of speech given to the constructor.</summary>
        public string PartOfSpeech { get; private set; }

        /// <summary>Synset offsets given to the constructor; may be null.</summary>
        public int[] SynsetOffsets { get; private set; }

        /// <summary>Lemma given to the constructor.</summary>
        public string Lemma { get; private set; }

        /// <summary>
        /// Number of entries in <see cref="SynsetOffsets"/>, or 0 when that array is null.
        /// </summary>
        public int SenseCount
        {
            // Array.Length is the direct count; no LINQ enumeration is needed.
            get { return this.SynsetOffsets != null ? this.SynsetOffsets.Length : 0; }
        }

        /// <summary>Tag sense count given to the constructor.</summary>
        public int TagSenseCount { get; private set; }

        /// <summary>Relation types given to the constructor.</summary>
        public string[] RelationTypes { get; private set; }


        // Constructors --------------------

        /// <summary>Creates an index word from already-parsed values.</summary>
        /// <param name="lemma">the lemma</param>
        /// <param name="partOfSpeech">part of speech of the lemma</param>
        /// <param name="relationTypes">relation types listed for the lemma</param>
        /// <param name="synsetOffsets">offsets of the synsets containing the lemma</param>
        /// <param name="tagSenseCount">tag sense count of the entry</param>
        public IndexWord(string lemma, string partOfSpeech, string[] relationTypes, int[] synsetOffsets, int tagSenseCount)
        {
            this.Lemma = lemma;
            this.PartOfSpeech = partOfSpeech;
            this.RelationTypes = relationTypes;
            this.SynsetOffsets = synsetOffsets;
            this.TagSenseCount = tagSenseCount;
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Morph/AbstractDelegatingOperation.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J.
// Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the AbstractDelegatingOperation.java source file found in
//the Java WordNet Library (JWNL). That source file is licensed under BSD.

using System;
using System.Collections.Generic;
using System.Text;

namespace CherubNLP.Models.Morph
{
    /// <summary>
    /// Base class for operations that forward work to named sets of other
    /// operations ("delegates") registered through <see cref="AddDelegate"/>.
    /// </summary>
    public abstract class AbstractDelegatingOperation : IOperation
    {
        // Delegate sets keyed by the name they were registered under.
        private readonly Dictionary<string, IOperation[]> mOperationSets;

        /// <summary>
        /// Registers <paramref name="operations"/> under <paramref name="key"/>,
        /// replacing any set previously registered under the same key.
        /// </summary>
        /// <param name="key">name of the delegate set</param>
        /// <param name="operations">operations to run for that key</param>
        public virtual void AddDelegate(string key, IOperation[] operations)
        {
            // The indexer setter adds or overwrites in a single lookup.
            mOperationSets[key] = operations;
        }

        /// <summary>Creates the operation with no delegate sets registered.</summary>
        protected internal AbstractDelegatingOperation()
        {
            mOperationSets = new Dictionary<string, IOperation[]>();
        }

        //protected internal abstract AbstractDelegatingOperation getInstance(System.Collections.IDictionary params_Renamed);

        /// <summary>Tells whether a delegate set is registered under <paramref name="key"/>.</summary>
        /// <param name="key">name of the delegate set</param>
        /// <returns>true if a set was registered under that key.</returns>
        protected internal virtual bool HasDelegate(string key)
        {
            return mOperationSets.ContainsKey(key);
        }

        /// <summary>
        /// Runs every operation registered under <paramref name="key"/>.
        /// </summary>
        /// <param name="lemma">input lemma to look up</param>
        /// <param name="partOfSpeech">part of speech of the lemma to look up</param>
        /// <param name="baseForms">list handed to each operation to receive base forms</param>
        /// <param name="key">name of the delegate set to run</param>
        /// <returns>true if at least one of the operations returned true.</returns>
        /// <exception cref="KeyNotFoundException">
        /// No delegate set is registered under <paramref name="key"/>.
        /// </exception>
        protected internal virtual bool ExecuteDelegate(string lemma, string partOfSpeech, List<string> baseForms, string key)
        {
            IOperation[] operations = mOperationSets[key];
            bool result = false;
            foreach (IOperation operation in operations)
            {
                // Deliberately no short-circuit: every operation runs even after one succeeded.
                if (operation.Execute(lemma, partOfSpeech, baseForms))
                {
                    result = true;
                }
            }
            return result;
        }

        #region IOperation Members

        /// <summary>Executes the operation.</summary>
        /// <param name="lemma">input lemma to look up</param>
        /// <param name="partOfSpeech">part of speech of the lemma to look up</param>
        /// <param name="baseForms">list to which discovered base forms are added</param>
        /// <returns>true if at least one base form was added.</returns>
        public abstract bool Execute(string lemma, string partOfSpeech, List<string> baseForms);

        #endregion
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Morph/DetachSuffixesOperation.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the DetachSuffixesOperation.java source file found in
//the Java WordNet Library (JWNL). That source file is licensed under BSD.

namespace CherubNLP.Models.Morph
{
    using System;
    using System.Collections.Generic;
    using System.Text;

    /// <summary>
    /// Remove all applicable suffixes from the word(s) and do a look-up.
    /// </summary>
    public class DetachSuffixesOperation : AbstractDelegatingOperation
    {
        /// <summary>Key of the delegate set that is run on every candidate stem.</summary>
        public const string Operations = "operations";

        // Per part of speech: an array of { suffix, replacement } pairs.
        private Dictionary<string, string[][]> mSuffixMap;

        /// <summary>Creates the operation over a suffix table.</summary>
        /// <param name="suffixMap">
        /// Maps a part of speech to its suffix rules; element 0 of each rule is the
        /// suffix to detach and element 1 is the text appended in its place.
        /// </param>
        public DetachSuffixesOperation(Dictionary<string, string[][]> suffixMap)
        {
            mSuffixMap = suffixMap;
        }

        #region IOperation Members

        /// <summary>
        /// For every suffix rule of <paramref name="partOfSpeech"/> that matches the end of
        /// <paramref name="lemma"/>, builds the stem and runs the <see cref="Operations"/>
        /// delegate set on it.
        /// </summary>
        /// <param name="lemma">input lemma to look up</param>
        /// <param name="partOfSpeech">part of speech of the lemma to look up</param>
        /// <param name="baseForms">list handed to the delegates to receive base forms</param>
        /// <returns>
        /// true if a delegate run returned true for at least one stem; false when the part
        /// of speech has no suffix rules or nothing matched.
        /// </returns>
        public override bool Execute(string lemma, string partOfSpeech, List<string> baseForms)
        {
            string[][] suffixArray;
            if (!mSuffixMap.TryGetValue(partOfSpeech, out suffixArray))
            {
                return false;
            }

            bool addedBaseForm = false;
            foreach (string[] rule in suffixArray)
            {
                string suffix = rule[0];
                // Ordinal comparison: the stem is cut by suffix.Length characters, so the
                // match must be character-for-character, not culture-sensitive.
                if (lemma.EndsWith(suffix, StringComparison.Ordinal))
                {
                    string stem = lemma.Substring(0, lemma.Length - suffix.Length) + rule[1];
                    if (ExecuteDelegate(stem, partOfSpeech, baseForms, Operations))
                    {
                        addedBaseForm = true;
                    }
                }
            }
            return addedBaseForm;
        }

        #endregion
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Morph/IOperation.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the Operation.java source file found in
//the Java WordNet Library (JWNL). That source file is licensed under BSD.

using System;
using System.Collections.Generic;
using System.Text;

namespace CherubNLP.Models.Morph
{
    /// <summary>
    /// A single step that tries to discover base forms of a lemma.
    /// </summary>
    public interface IOperation
    {
        /// <summary>
        /// Execute the operation.
        /// </summary>
        /// <param name="lemma">
        /// input lemma to look up
        /// </param>
        /// <param name="partOfSpeech">
        /// part of speech of the lemma to look up
        /// </param>
        /// <param name="baseForms">
        /// List to which all discovered base forms should be added.
        /// </param>
        /// <returns>
        /// True if at least one base form was discovered by the operation and
        /// added to baseForms.
        /// </returns>
        bool Execute(string lemma, string partOfSpeech, List<string> baseForms);
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Morph/LookupExceptionsOperation.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the LookupExceptionsOperation.java source file found in
//the Java WordNet Library (JWNL). That source file is licensed under BSD.

using System;
using System.Collections.Generic;
using System.Text;

namespace CherubNLP.Models.Morph
{
    /// <summary>Lookup the word in the exceptions file of the given part-of-speech.</summary>
    public class LookupExceptionsOperation : IOperation
    {
        private WordNetEngine mEngine;

        /// <summary>Creates the operation over the engine that supplies exception forms.</summary>
        /// <param name="engine">engine queried through GetExceptionForms</param>
        public LookupExceptionsOperation(WordNetEngine engine)
        {
            mEngine = engine;
        }

        #region IOperation Members

        /// <summary>
        /// Adds to <paramref name="baseForms"/> every exception form the engine returns for
        /// the lemma that is not already in the list.
        /// </summary>
        /// <param name="lemma">input lemma to look up</param>
        /// <param name="partOfSpeech">part of speech of the lemma to look up</param>
        /// <param name="baseForms">list to which new exception forms are appended</param>
        /// <returns>true if at least one form was appended.</returns>
        public bool Execute(string lemma, string partOfSpeech, List<string> baseForms)
        {
            bool addedBaseForm = false;
            // NOTE(review): assumes GetExceptionForms never returns null - confirm in WordNetEngine.
            string[] exceptionForms = mEngine.GetExceptionForms(lemma, partOfSpeech);

            foreach (string exceptionForm in exceptionForms)
            {
                // Keep the list free of duplicates; only a genuinely new entry counts as a hit.
                if (!baseForms.Contains(exceptionForm))
                {
                    baseForms.Add(exceptionForm);
                    addedBaseForm = true;
                }
            }

            return addedBaseForm;
        }

        #endregion
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Morph/LookupIndexWordOperation.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the LookupIndexWordOperation.java source file found in
//the Java WordNet Library (JWNL). That source file is licensed under BSD.

using System;
using System.Collections.Generic;
using System.Text;

namespace CherubNLP.Models.Morph
{
    /// <summary>
    /// Accepts the lemma itself as a base form when the engine has an index word for it.
    /// </summary>
    public class LookupIndexWordOperation : IOperation
    {
        private WordNetEngine mEngine;

        /// <summary>Creates the operation over the engine used for index-word lookups.</summary>
        /// <param name="engine">engine queried through GetIndexWord</param>
        public LookupIndexWordOperation(WordNetEngine engine)
        {
            mEngine = engine;
        }

        #region IOperation Members

        /// <summary>
        /// Appends <paramref name="lemma"/> to <paramref name="baseForms"/> when it is not
        /// already listed and the engine returns a non-null index word for it.
        /// </summary>
        /// <param name="lemma">input lemma to look up</param>
        /// <param name="partOfSpeech">part of speech of the lemma to look up</param>
        /// <param name="baseForms">list to which the lemma may be appended</param>
        /// <returns>true if the lemma was appended.</returns>
        public bool Execute(string lemma, string partOfSpeech, List<string> baseForms)
        {
            // The duplicate check comes first so the engine is not queried for a known form.
            if (!baseForms.Contains(lemma) && mEngine.GetIndexWord(lemma, partOfSpeech) != null)
            {
                baseForms.Add(lemma);
                return true;
            }
            return false;
        }

        #endregion
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Morph/Util.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

//This file is based on the Util.java source file found in
//the Java WordNet Library (JWNL). That source file is licensed under BSD.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;

namespace CherubNLP.Models.Morph
{
    /// <summary>
    /// Static helpers for splitting text into tokens and re-joining tokens
    /// under a bit mask. The class cannot be instantiated.
    /// </summary>
    public class Util
    {
        /// <summary>
        /// Concatenates <paramref name="tokens"/>, inserting <paramref name="delimiter"/>
        /// between token i-1 and token i unless bit i-1 of <paramref name="bits"/> is set.
        /// </summary>
        /// <param name="tokens">tokens to join, in order</param>
        /// <param name="bits">one bit per gap between tokens; a set bit suppresses the delimiter</param>
        /// <param name="delimiter">text placed in every gap whose bit is clear</param>
        /// <returns>the joined string.</returns>
        public static string GetLemma(string[] tokens, BitArray bits, string delimiter)
        {
            StringBuilder buf = new StringBuilder();
            for (int i = 0; i < tokens.Length; i++)
            {
                if (i != 0 && !bits.Get(i - 1))
                {
                    buf.Append(delimiter);
                }
                buf.Append(tokens[i]);
            }
            return buf.ToString();
        }

        /// <summary>
        /// Advances the first <paramref name="size"/> bits as a binary counter whose
        /// lowest-order digit is at index <c>size - 1</c>.
        /// </summary>
        /// <param name="bits">bit array updated in place</param>
        /// <param name="size">number of leading bits that take part in the count</param>
        /// <returns>
        /// true if a bit was set; false when all <paramref name="size"/> bits were already
        /// set, in which case they have all been cleared.
        /// </returns>
        public static bool Increment(BitArray bits, int size)
        {
            int i = size - 1;
            // Carry: clear trailing set bits until a clear bit (or the start) is reached.
            while (i >= 0 && bits.Get(i))
            {
                bits.Set(i--, false);
            }
            if (i < 0)
            {
                return false;
            }
            bits.Set(i, true);
            return true;
        }

        /// <summary>
        /// Splits <paramref name="str"/> into maximal runs of the characters 'a'-'z' and
        /// the apostrophe; every other character (including upper-case letters and
        /// digits) acts as a separator and is dropped.
        /// </summary>
        /// <param name="str">text to split</param>
        /// <returns>the runs in order of appearance; empty when there are none.</returns>
        public static string[] Split(string str)
        {
            List<string> tokens = new List<string>();
            StringBuilder buf = new StringBuilder();
            foreach (char c in str)
            {
                if ((c >= 'a' && c <= 'z') || c == '\'')
                {
                    buf.Append(c);
                }
                else if (buf.Length > 0)
                {
                    // A separator ends the current run.
                    tokens.Add(buf.ToString());
                    buf.Clear();
                }
            }
            if (buf.Length > 0)
            {
                tokens.Add(buf.ToString());
            }
            return tokens.ToArray();
        }

        private Util()
        {
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Relation.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

namespace CherubNLP.Models
{
    using System;

    /// <summary>
    /// A typed link to a target synset, identified by the target's part of
    /// speech and offset. The target synset is created on first access.
    /// </summary>
    public class Relation
    {
        private WordNetEngine mWordNetEngine;

        private RelationType mRelationType;

        private int mTargetSynsetOffset;
        private string mTargetSynsetPartOfSpeech;

        // Cache for TargetSynset; null until the property is first read.
        private Synset mTargetSynset;

        // Stored by the six-argument constructor; not read anywhere in this class.
        private int miSourceWord;
        private int miTargetWord;

        /// <summary>Type of this relation.</summary>
        public RelationType SynsetRelationType
        {
            get
            {
                return mRelationType;
            }
        }

        /// <summary>Offset of the target synset.</summary>
        public int TargetSynsetOffset
        {
            get
            {
                return mTargetSynsetOffset;
            }
        }

        /// <summary>
        /// The target synset, obtained from the engine's CreateSynset on first access
        /// and cached afterwards.
        /// </summary>
        public Synset TargetSynset
        {
            get
            {
                if (mTargetSynset == null)
                {
                    mTargetSynset = mWordNetEngine.CreateSynset(mTargetSynsetPartOfSpeech, mTargetSynsetOffset);
                }
                return mTargetSynset;
            }
        }

        private Relation()
        {
        }

        /// <summary>Creates a relation to a target synset.</summary>
        /// <param name="wordNetEngine">engine used later to create the target synset</param>
        /// <param name="relationType">type of the relation</param>
        /// <param name="targetSynsetOffset">offset of the target synset</param>
        /// <param name="targetSynsetPartOfSpeech">part of speech of the target synset</param>
        protected internal Relation(WordNetEngine wordNetEngine, RelationType relationType, int targetSynsetOffset, string targetSynsetPartOfSpeech)
        {
            mWordNetEngine = wordNetEngine;
            mRelationType = relationType;

            mTargetSynsetOffset = targetSynsetOffset;
            mTargetSynsetPartOfSpeech = targetSynsetPartOfSpeech;
        }

        /// <summary>Creates a relation that also records source and target word numbers.</summary>
        /// <param name="wordNetEngine">engine used later to create the target synset</param>
        /// <param name="relationType">type of the relation</param>
        /// <param name="targetSynsetOffset">offset of the target synset</param>
        /// <param name="targetSynsetPartOfSpeech">part of speech of the target synset</param>
        /// <param name="sourceWord">source word number</param>
        /// <param name="targetWord">target word number</param>
        protected internal Relation(WordNetEngine wordNetEngine, RelationType relationType, int targetSynsetOffset, string targetSynsetPartOfSpeech, int sourceWord, int targetWord) : this(wordNetEngine, relationType, targetSynsetOffset, targetSynsetPartOfSpeech)
        {
            miSourceWord = sourceWord;
            miTargetWord = targetWord;
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/RelationType.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J.
// Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// A named kind of relation, the parts of speech it is listed for, and
    /// optionally the relation type that is its opposite.
    /// </summary>
    public class RelationType
    {
        private string mName;
        private RelationType mOpposite;
        private string[] mPartsOfSpeech;

        /// <summary>Name given to the constructor.</summary>
        public string Name => mName;

        /// <summary>Opposite relation type, or null when none was supplied.</summary>
        public RelationType Opposite => mOpposite;

        /// <summary>Number of parts of speech listed for this relation type.</summary>
        public int PartsOfSpeechCount => mPartsOfSpeech.Length;

        /// <summary>Returns the part of speech stored at <paramref name="index"/>.</summary>
        /// <param name="index">zero-based position in the parts-of-speech array</param>
        /// <returns>the part of speech at that position.</returns>
        public string GetPartOfSpeech(int index)
        {
            string partOfSpeech = mPartsOfSpeech[index];
            return partOfSpeech;
        }

        /// <summary>Creates a relation type that has no opposite.</summary>
        /// <param name="name">name of the relation type</param>
        /// <param name="partsOfSpeech">parts of speech listed for it</param>
        protected internal RelationType(string name, string[] partsOfSpeech)
            : this(name, null, partsOfSpeech)
        {
        }

        /// <summary>Creates a relation type with an opposite.</summary>
        /// <param name="name">name of the relation type</param>
        /// <param name="opposite">the opposite relation type</param>
        /// <param name="partsOfSpeech">parts of speech listed for it</param>
        protected internal RelationType(string name, RelationType opposite, string[] partsOfSpeech)
        {
            mPartsOfSpeech = partsOfSpeech;
            mOpposite = opposite;
            mName = name;
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Synset.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J. Northedge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

using System;

namespace CherubNLP.Models
{
    /// <summary>
    /// A synset: its offset, gloss, member words, lexicographer file and
    /// outgoing relations, all fixed at construction.
    /// </summary>
    public class Synset
    {
        private int mOffset;
        private string mGloss;
        private string[] mWordList;
        private string mLexicographerFile;
        private Relation[] mRelations;

        private Synset()
        {
        }

        /// <summary>Creates a synset from already-parsed values.</summary>
        /// <param name="offset">offset of the synset</param>
        /// <param name="gloss">gloss text</param>
        /// <param name="wordList">words belonging to the synset</param>
        /// <param name="lexicographerFile">lexicographer file name</param>
        /// <param name="relations">relations starting at this synset</param>
        internal Synset(int offset, string gloss, string[] wordList, string lexicographerFile, Relation[] relations)
        {
            mRelations = relations;
            mLexicographerFile = lexicographerFile;
            mWordList = wordList;
            mGloss = gloss;
            mOffset = offset;
        }

        /// <summary>Offset given to the constructor.</summary>
        public int Offset => mOffset;

        /// <summary>Gloss given to the constructor.</summary>
        public string Gloss => mGloss;

        /// <summary>Returns the word stored at <paramref name="wordIndex"/>.</summary>
        /// <param name="wordIndex">zero-based position in the word list</param>
        public string GetWord(int wordIndex)
        {
            return mWordList[wordIndex];
        }

        /// <summary>Number of words in the synset.</summary>
        public int WordCount => mWordList.Length;

        /// <summary>Lexicographer file given to the constructor.</summary>
        public string LexicographerFile => mLexicographerFile;

        /// <summary>Returns the relation stored at <paramref name="relationIndex"/>.</summary>
        /// <param name="relationIndex">zero-based position in the relation array</param>
        public Relation GetRelation(int relationIndex)
        {
            return mRelations[relationIndex];
        }

        /// <summary>Number of relations of the synset.</summary>
        public int RelationCount => mRelations.Length;

        /// <summary>
        /// Renders the synset as its words separated by ", ", followed by " -- " and the gloss.
        /// </summary>
        /// <returns>the text summary of the synset.</returns>
        public override string ToString()
        {
            var text = new System.Text.StringBuilder();

            int wordCount = mWordList.Length;
            for (int position = 0; position < wordCount; position++)
            {
                // Separator goes in front of every word except the first.
                if (position > 0)
                {
                    text.Append(", ");
                }
                text.Append(mWordList[position]);
            }

            text.Append(" -- ");
            text.Append(mGloss);

            return text.ToString();
        }
    }
}

// --------------------------------------------------------------------------------
// /CherubNLP/Models/WordNet/Tokenizer.cs:
// --------------------------------------------------------------------------------
//Copyright (C) 2006 Richard J.
Northedge 2 | // 3 | // This library is free software; you can redistribute it and/or 4 | // modify it under the terms of the GNU Lesser General Public 5 | // License as published by the Free Software Foundation; either 6 | // version 2.1 of the License, or (at your option) any later version. 7 | // 8 | // This library is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU Lesser General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU Lesser General Public 14 | // License along with this program; if not, write to the Free Software 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 | 17 | using System; 18 | 19 | namespace CherubNLP.Models 20 | { 21 | /// 22 | /// Summary description for Tokenizer. 23 | /// 24 | public class Tokenizer 25 | { 26 | private readonly string[] _tokens; 27 | int _position; 28 | 29 | public Tokenizer(string input, params char[] separators) 30 | { 31 | _tokens = input.Split(separators); 32 | _position = 0; 33 | } 34 | 35 | public string NextToken() 36 | { 37 | while (_position < _tokens.Length) 38 | { 39 | if ((_tokens[_position].Length > 0)) 40 | { 41 | return _tokens[_position++]; 42 | } 43 | _position++; 44 | } 45 | return null; 46 | } 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /CherubNLP/NER/README.md: -------------------------------------------------------------------------------- 1 | IOB tagging 2 | 3 | B-{CHUNK_TYPE} – for the word in the Beginning chunk 4 | I-{CHUNK_TYPE} – for words Inside the chunk 5 | O – Outside any chunk -------------------------------------------------------------------------------- /CherubNLP/Sentence.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Tokenize; 2 | using 
System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace CherubNLP 7 | { 8 | public class Sentence 9 | { 10 | // User defined id 11 | public string Id { get; set; } 12 | 13 | public List Words { get; set; } 14 | 15 | public String Label { get; set; } 16 | 17 | public String Text { get; set; } 18 | 19 | public double[] Vector { get; set; } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /CherubNLP/Similarity/Similarity.cs: -------------------------------------------------------------------------------- 1 | using FastText.NetWrapper; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Linq; 6 | using System.Text; 7 | using Tensorflow; 8 | using Tensorflow.NumPy; 9 | using static Tensorflow.Binding; 10 | using static Tensorflow.KerasApi; 11 | 12 | namespace CherubNLP 13 | { 14 | public class Similarity 15 | { 16 | public static double[] Cosine(string src, string[] dst, string model) 17 | { 18 | using (var fastText = new FastTextWrapper()) 19 | { 20 | fastText.LoadModel(model); 21 | var vector = fastText.GetSentenceVector(src.ToLower()); 22 | return dst.Select(x => CalCosine(vector, fastText.GetSentenceVector(x.ToLower()))).ToArray(); 23 | } 24 | } 25 | 26 | public static double CalCosine(NDArray vector1, NDArray vector2) 27 | { 28 | var cosine_loss = keras.losses.CosineSimilarity(axis: 0); 29 | return cosine_loss.Call(vector1, vector2).numpy(); 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /CherubNLP/Stem/IStemmer.cs: -------------------------------------------------------------------------------- 1 | using CherubNLP.Tokenize; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace CherubNLP.Stem 7 | { 8 | /// 9 | /// Stemmer is used to remove morphological affixes from words, leaving only the word stem. 
/// <summary>
/// Stemmer is used to remove morphological affixes from words, leaving only the word stem.
/// Stemming algorithms aim to remove those affixes leaving only the stem of the word.
/// IStemmer defines a standard interface for stemmers.
/// </summary>
public interface IStemmer
{
    /// <summary>
    /// Strip affixes from the token and return the stem.
    /// </summary>
    /// <param name="word">The word to stem.</param>
    /// <param name="options">Stemmer settings; how they are used is up to the implementation.</param>
    /// <returns>The stem of <paramref name="word"/>.</returns>
    string Stem(string word, StemOptions options);
}
/// <summary>
/// A stemmer that uses regular expressions to identify morphological affixes.
/// The first suffix from the built-in table that matches at the end of the word is replaced
/// by its mapped value.
/// </summary>
public class RegexStemmer : IStemmer
{
    // Suffix -> replacement. Static initializers run in textual order, so this table must stay
    // above _pattern and _regex, which are derived from it.
    private static readonly Dictionary<string, string> replacements = new Dictionary<string, string>
    {
        ["nning"] = "n", // running
        ["pping"] = "p", // skipping
        ["tting"] = "t", // putting
        ["able"] = "",
        ["were"] = "be",
        ["sses"] = "ss",
        ["ies"] = "i",
        ["are"] = "be",
        ["ing"] = "",
        ["am"] = "be",
        ["es"] = "",
        ["is"] = "be",
        ["s"] = ""
    };

    // "nning$|pping$|...|s$": every suffix anchored at the end of the input.
    private static readonly string _pattern = string.Join("$|", replacements.Keys) + "$";

    // Built eagerly by the type initializer, so Stem no longer depends on PATTERN having been
    // read first, and concurrent first use cannot observe a half-built table.
    private static readonly Regex _regex = new Regex(_pattern);

    /// <summary>
    /// The regular expression used by this stemmer.
    /// </summary>
    public static string PATTERN => _pattern;

    /// <summary>
    /// Replaces the first matching suffix of <paramref name="word"/> with its mapped value.
    /// </summary>
    /// <param name="word">The word to stem.</param>
    /// <param name="options">Not consulted by this implementation; the built-in suffix table is always used.</param>
    /// <returns>The stemmed word, or <paramref name="word"/> unchanged when no suffix matches.</returns>
    public string Stem(string word, StemOptions options)
    {
        Match match = _regex.Match(word);

        return match.Success
            ? word.Substring(0, match.Index) + replacements[match.Value]
            : word;
    }
}
/// <summary>
/// Closed set of languages, each identified by a language tag such as "en" or "zh-CN".
/// Instances can only be obtained through the static fields or <see cref="FromLanguageTag"/>.
/// </summary>
public class SupportedLanguage
{
    public static readonly SupportedLanguage English = new SupportedLanguage("en");
    public static readonly SupportedLanguage Russian = new SupportedLanguage("ru");
    public static readonly SupportedLanguage German = new SupportedLanguage("de");
    public static readonly SupportedLanguage Portuguese = new SupportedLanguage("pt");
    public static readonly SupportedLanguage PortugueseBrazil = new SupportedLanguage("pt-BR");
    public static readonly SupportedLanguage Spanish = new SupportedLanguage("es");
    public static readonly SupportedLanguage French = new SupportedLanguage("fr");
    public static readonly SupportedLanguage Italian = new SupportedLanguage("it");
    public static readonly SupportedLanguage Dutch = new SupportedLanguage("nl");
    public static readonly SupportedLanguage Japanese = new SupportedLanguage("ja");
    public static readonly SupportedLanguage ChineseChina = new SupportedLanguage("zh-CN");
    public static readonly SupportedLanguage ChineseHongKong = new SupportedLanguage("zh-HK");
    public static readonly SupportedLanguage ChineseTaiwan = new SupportedLanguage("zh-TW");

    // Declared after the language fields: static initializers run top to bottom, so every
    // entry is already constructed when this array is built.
    private static readonly SupportedLanguage[] AllLangs =
    {
        English,
        Russian,
        German,
        Portuguese,
        PortugueseBrazil,
        Spanish,
        French,
        Italian,
        Dutch,
        Japanese,
        ChineseChina,
        ChineseHongKong,
        ChineseTaiwan
    };

    /// <summary>
    /// The language tag of this instance.
    /// </summary>
    public readonly string code;

    private SupportedLanguage(string code)
    {
        this.code = code;
    }

    /// <summary>
    /// Looks up a language by tag, ignoring case.
    /// </summary>
    /// <param name="languageTag">Tag to look up, e.g. "pt-br".</param>
    /// <returns>The matching language, or <see cref="English"/> when no tag matches.</returns>
    public static SupportedLanguage FromLanguageTag(string languageTag)
    {
        SupportedLanguage found = Array.Find(
            AllLangs,
            lang => string.Equals(lang.code, languageTag, StringComparison.OrdinalIgnoreCase));

        return found ?? English;
    }
}
/// <summary>
/// The simplest possible tagger assigns the same tag to each token.
/// This may seem to be a rather banal step, but it establishes an important baseline for tagger performance.
/// In order to get the best result, we tag each word with the most likely tag.
/// </summary>
public class DefaultTagger : ITagger
{
    /// <summary>
    /// Sets <c>Pos</c> of every word in <paramref name="sentence"/> to <c>options.Tag</c>.
    /// </summary>
    /// <param name="sentence">Sentence whose words are tagged in place.</param>
    /// <param name="options">Supplies the tag to assign.</param>
    public void Tag(Sentence sentence, TagOptions options)
    {
        // The previous implementation was an empty no-op that tolerated any input;
        // stay tolerant rather than start throwing on missing data.
        if (sentence?.Words == null || options == null)
        {
            return;
        }

        foreach (var word in sentence.Words)
        {
            word.Pos = options.Tag;
        }
    }

    /// <summary>
    /// Nothing to learn: the tag comes from <see cref="TagOptions"/>, not from a corpus.
    /// </summary>
    public void Train(List<Sentence> sentences, TagOptions options)
    {
    }
}
/// <summary>
/// N-Gram taggers are based on a simple statistical algorithm:
/// for each token, assign the tag that is most likely for that particular token
/// given the tags of the preceding (NGram - 1) tokens.
/// </summary>
public class NGramTagger : ITagger
{
    // Context string -> most frequent tag seen for it in training. Null until trained.
    private Dictionary<string, string> _contextMapping;

    /// <summary>
    /// Tags every word of <paramref name="sentence"/> in place (sets <c>Pos</c>).
    /// When no model has been trained yet, the CoNLL-2000 training file under
    /// <c>options.CorpusDir</c>/CoNLL is read and used for training first.
    /// </summary>
    /// <param name="sentence">Sentence to tag; its word list is padded temporarily and restored before returning.</param>
    /// <param name="options">N-gram size, default tag and corpus directory.</param>
    public void Tag(Sentence sentence, TagOptions options)
    {
        // need training to generate model
        if (_contextMapping == null)
        {
            var corpus = new CoNLLReader().Read(new ReaderOptions
            {
                DataDir = Path.Combine(options.CorpusDir, "CoNLL"),
                FileName = "conll2000_chunking_train.txt"
            });

            Train(corpus, options);
        }

        Fill(sentence, options);
        try
        {
            for (int pos = options.NGram - 1; pos < sentence.Words.Count; pos++)
            {
                // Build the context once per token (it used to be rebuilt for every mapping entry).
                string context = GetContext(pos, sentence.Words, options);

                string tag = null;
                if (context != null)
                {
                    _contextMapping.TryGetValue(context, out tag);
                }

                // Unknown context: fall back to the default tag.
                sentence.Words[pos].Pos = tag ?? options.Tag;
            }
        }
        finally
        {
            Unfill(sentence, options);
        }
    }

    /// <summary>
    /// Builds the context-to-tag model from a tagged corpus.
    /// </summary>
    /// <param name="sentences">Tagged sentences; each word's <c>Pos</c> is the gold tag. The sentences are left unchanged.</param>
    /// <param name="options">N-gram size and the tag used for padding tokens.</param>
    public void Train(List<Sentence> sentences, TagOptions options)
    {
        var cache = new List<NGramFreq>();

        foreach (var sent in sentences)
        {
            Fill(sent, options);
            try
            {
                for (int pos = options.NGram - 1; pos < sent.Words.Count; pos++)
                {
                    cache.Add(new NGramFreq
                    {
                        Context = GetContext(pos, sent.Words, options),
                        Tag = sent.Words[pos].Pos,
                        Count = 1
                    });
                }
            }
            finally
            {
                // Remove the padding again: it used to stay in the caller's corpus, so training
                // twice on the same sentences learned contexts for the NIL tokens themselves.
                Unfill(sent, options);
            }
        }

        // Most frequent (context, tag) pairs first. The sort is stable, so ties keep
        // first-seen order, exactly as the previous FirstOrDefault lookup resolved them.
        var ranked = cache
            .GroupBy(c => new { c.Context, c.Tag })
            .Select(g => new NGramFreq
            {
                Context = g.Key.Context,
                Tag = g.Key.Tag,
                Count = g.Count()
            })
            .OrderByDescending(x => x.Count);

        var mapping = new Dictionary<string, string>();
        foreach (var entry in ranked)
        {
            // First entry per context wins, i.e. the most frequent tag for that context.
            if (entry.Context != null && !mapping.ContainsKey(entry.Context))
            {
                mapping[entry.Context] = entry.Tag;
            }
        }

        _contextMapping = mapping;
    }

    // Context = tags of the (NGram - 1) preceding words followed by the text of the current word.
    private string GetContext(int pos, List<Token> words, TagOptions options)
    {
        string context = words[pos].Text;
        for (int ngram = options.NGram - 1; ngram > 0; ngram--)
        {
            context = words[pos - ngram].Pos + " " + context;
        }

        return context;
    }

    // Prepends (NGram - 1) padding tokens so the first real word has a full context.
    private void Fill(Sentence sent, TagOptions options)
    {
        for (int ngram = 1; ngram < options.NGram; ngram++)
        {
            sent.Words.Insert(0, new Token { Text = "NIL", Pos = options.Tag, Start = (ngram - 1) * 3 });
        }
    }

    // Removes the padding tokens added by Fill.
    private void Unfill(Sentence sent, TagOptions options)
    {
        int padding = Math.Min(Math.Max(0, options.NGram - 1), sent.Words.Count);
        sent.Words.RemoveRange(0, padding);
    }

    private class NGramFreq
    {
        /// <summary>
        /// Current token tag
        /// </summary>
        public string Tag { get; set; }

        /// <summary>
        /// Occurrence frequency
        /// </summary>
        public int Count { get; set; }

        public string Context { get; set; }
    }
}
/// <summary>
/// Creates an <see cref="ITagger"/> and applies it with a fixed set of <see cref="TagOptions"/>.
/// Call one of the <c>GetTagger</c> overloads before <see cref="Tag"/>.
/// </summary>
public class TaggerFactory
{
    private SupportedLanguage _lang;

    private ITagger _tagger;

    private TagOptions _options;

    /// <param name="options">Options passed to the tagger on every <see cref="Tag"/> call.</param>
    /// <param name="lang">Language this factory was created for (stored only).</param>
    public TaggerFactory(TagOptions options, SupportedLanguage lang)
    {
        _lang = lang;
        _options = options;
    }

    /// <summary>
    /// Creates a tagger of type <typeparamref name="ITag"/> and makes it the active tagger.
    /// </summary>
    public ITagger GetTagger<ITag>() where ITag : ITagger, new()
    {
        return _tagger = new ITag();
    }

    /// <summary>
    /// Creates the tagger whose class name is <paramref name="name"/> and makes it the active tagger.
    /// </summary>
    /// <param name="name">Simple class name of a concrete <see cref="ITagger"/> in this assembly, e.g. "NGramTagger".</param>
    /// <exception cref="ArgumentException">No concrete tagger with that name exists.</exception>
    public ITagger GetTagger(string name)
    {
        // Only concrete ITagger implementations are candidates, so an unrelated class that
        // happens to share the name can no longer be picked and fail the cast.
        Type type = typeof(ITagger).Assembly
            .GetTypes()
            .FirstOrDefault(x => !x.IsAbstract && typeof(ITagger).IsAssignableFrom(x) && x.Name == name);

        if (type == null)
        {
            throw new ArgumentException($"No concrete ITagger named '{name}' was found.", nameof(name));
        }

        return _tagger = (ITagger)Activator.CreateInstance(type);
    }

    /// <summary>
    /// Tags <paramref name="sentence"/> in place with the active tagger.
    /// </summary>
    /// <exception cref="InvalidOperationException">No tagger has been selected yet.</exception>
    public void Tag(Sentence sentence)
    {
        if (_tagger == null)
        {
            throw new InvalidOperationException("No tagger selected; call GetTagger before Tag.");
        }

        _tagger.Tag(sentence, _options);
    }
}
/// <summary>
/// A tokenizer is a component used for dividing text into tokens.
/// A tokenizer is language specific and takes into account the peculiarities of the language, e.g. don’t in English is tokenized as two tokens.
/// </summary>
public interface ITokenizer
{
    /// <summary>
    /// Tokenize
    /// </summary>
    /// <param name="sentence">input sentence</param>
    /// <param name="options">Options such as: regex expression</param>
    /// <returns>The tokens found in <paramref name="sentence"/>.</returns>
    List<Token> Tokenize(string sentence, TokenizationOptions options);
}
/// <summary>
/// Regular-Expression Tokenizers
/// </summary>
public class RegexTokenizer : ITokenizer
{
    /// <summary>
    /// Tokenize a text into a sequence of alphabetic and non-alphabetic characters
    /// </summary>
    public const string WORD_PUNC = @"[^\w\s]+|\w+";

    /// <summary>
    /// Tokenize a string, treating any sequence of blank lines as a delimiter.
    /// Blank lines are defined as lines containing no characters, except for space or tab characters.
    /// options.IsGap = true
    /// </summary>
    public const string BLANK_LINE = @"\s*\n\s*\n\s*";

    /// <summary>
    /// Tokenize a string on whitespace (space, tab, newline).
    /// In general, users should use the string ``split()`` method instead.
    /// options.IsGap = true
    /// </summary>
    public const string WHITE_SPACE = @"\s+";

    /// <summary>
    /// Splits <paramref name="sentence"/> into tokens using <c>options.Pattern</c>.
    /// </summary>
    /// <param name="sentence">Text to tokenize.</param>
    /// <param name="options">
    /// Pattern and special words. <c>IsGap</c> is overwritten by this call: it is set to true
    /// exactly when the effective pattern is <see cref="WHITE_SPACE"/> or <see cref="BLANK_LINE"/>.
    /// <c>IgnoreEmpty</c> drops empty tokens in gap mode.
    /// </param>
    /// <returns>Tokens with <c>Text</c> and <c>Start</c> filled in.</returns>
    public List<Token> Tokenize(string sentence, TokenizationOptions options)
    {
        string pattern = options.Pattern;
        if (options.SpecialWords != null)
        {
            // NOTE(review): special words are used as raw regex fragments here but compared as
            // literal text below; a word containing regex metacharacters will not behave as a literal.
            foreach (var word in options.SpecialWords)
            {
                sentence = Regex.Replace(sentence, word, " " + word);
            }

            pattern = String.Join("|", options.SpecialWords) + "|" + pattern;
        }

        // The static API reuses the framework's regex cache and keeps no per-instance state,
        // so one tokenizer can be shared by parallel callers.
        var matches = Regex.Matches(sentence, pattern).Cast<Match>().ToArray();

        options.IsGap = pattern == WHITE_SPACE || pattern == BLANK_LINE;

        if (options.IsGap)
        {
            return SplitOnGaps(sentence, matches, options.IgnoreEmpty);
        }

        var tokens = matches.Select(x => new Token
        {
            Text = x.Value,
            Start = x.Index
        }).ToList();

        if (options.SpecialWords != null)
        {
            // One space was inserted in front of every special word above; shift offsets back
            // so Start refers to the caller's original text.
            int offset = 0;
            foreach (var token in tokens)
            {
                if (options.SpecialWords.Contains(token.Text))
                {
                    offset++;
                }

                token.Start = token.Start - offset;
            }
        }

        return tokens;
    }

    // Gap mode: the matches are separators; tokens are the text between them.
    private static List<Token> SplitOnGaps(string sentence, Match[] gaps, bool ignoreEmpty)
    {
        var tokens = new List<Token>(gaps.Length + 1);
        int pos = 0;

        for (int span = 0; span <= gaps.Length; span++)
        {
            string raw = (span == gaps.Length)
                ? sentence.Substring(pos)
                : sentence.Substring(pos, gaps[span].Index - pos);

            string leftTrimmed = raw.TrimStart();
            string text = leftTrimmed.TrimEnd();

            if (!(ignoreEmpty && text.Length == 0))
            {
                tokens.Add(new Token
                {
                    Text = text,
                    // Point Start at the first kept character, not at trimmed leading whitespace.
                    Start = text.Length == 0 ? pos : pos + (raw.Length - leftTrimmed.Length)
                });
            }

            if (span < gaps.Length)
            {
                // Resume after the whole separator. Advancing by one only worked for
                // single-character gaps and left Start inside longer ones.
                pos = gaps[span].Index + gaps[span].Length;
            }
        }

        return tokens;
    }
}
/// <summary>
/// Is the token made up entirely of ASCII letters, or entirely of CJK ideographs (U+4E00 to U+9FA5)?
/// </summary>
public bool IsAlpha
{
    get
    {
        // The alternation must be grouped. Ungrouped, the pattern reads as
        // "starts with letters" OR "ends with ideographs", which accepted tokens such as "abc123".
        return Regex.IsMatch(Text, @"^(?:[a-zA-Z]+|[\u4e00-\u9fa5]+)$");
    }
}
/// <summary>
/// Shared helpers for tokenizers.
/// </summary>
public abstract class TokenizerBase
{
    /// <summary>
    /// Recomputes <c>Start</c> of every token by locating its text in <paramref name="sentence"/>,
    /// scanning left to right so repeated words map to successive occurrences.
    /// </summary>
    /// <param name="sentence">The text the tokens were produced from.</param>
    /// <param name="tokens">Tokens in sentence order; updated in place. A token whose text does not
    /// occur at or after the current scan position gets <c>Start = -1</c>.</param>
    protected void CorrectTokenPosition(string sentence, List<Token> tokens)
    {
        int startPos = 0;

        for (int i = 0; i < tokens.Count; i++)
        {
            var token = tokens[i];

            // Ordinal: offsets must be exact character positions, independent of culture rules.
            int found = sentence.IndexOf(token.Text, startPos, StringComparison.Ordinal);
            token.Start = found;

            // Advance only on a hit. On a miss the cursor used to jump to (text length - 1),
            // which rewound the scan and could exceed the sentence length, making the next
            // IndexOf throw ArgumentOutOfRangeException.
            if (found >= 0)
            {
                startPos = token.End;
            }
        }
    }
}
/// <summary>
/// CherubNLP Tokenizer Factory
/// Tokenizers divide strings into lists of substrings.
/// Select a tokenizer with one of the <c>GetTokenizer</c> overloads, then call <c>Tokenize</c>.
/// CherubNLP also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation.
/// </summary>
public class TokenizerFactory
{
    private SupportedLanguage _lang;

    private ITokenizer _tokenizer;

    private TokenizationOptions _options;

    /// <param name="options">Options passed to the tokenizer on every call.</param>
    /// <param name="lang">Language this factory was created for (stored only).</param>
    public TokenizerFactory(TokenizationOptions options, SupportedLanguage lang)
    {
        _lang = lang;
        _options = options;
    }

    /// <summary>
    /// Creates a tokenizer of type <typeparamref name="ITokenize"/> and makes it the active tokenizer.
    /// </summary>
    public ITokenizer GetTokenizer<ITokenize>() where ITokenize : ITokenizer, new()
    {
        return _tokenizer = new ITokenize();
    }

    /// <summary>
    /// Creates the tokenizer whose class name is <paramref name="name"/> and makes it the active tokenizer.
    /// </summary>
    /// <param name="name">Simple class name of a concrete <see cref="ITokenizer"/> in this assembly, e.g. "RegexTokenizer".</param>
    /// <exception cref="ArgumentException">No concrete tokenizer with that name exists.</exception>
    public ITokenizer GetTokenizer(string name)
    {
        // Only concrete ITokenizer implementations are candidates, so an unrelated class that
        // happens to share the name can no longer be picked and fail the cast.
        Type type = typeof(ITokenizer).Assembly
            .GetTypes()
            .FirstOrDefault(x => !x.IsAbstract && typeof(ITokenizer).IsAssignableFrom(x) && x.Name == name);

        if (type == null)
        {
            throw new ArgumentException($"No concrete ITokenizer named '{name}' was found.", nameof(name));
        }

        return _tokenizer = (ITokenizer)Activator.CreateInstance(type);
    }

    /// <summary>
    /// Tokenizes one sentence and fills each token's <c>Lemma</c> with its lower-cased text.
    /// </summary>
    /// <exception cref="InvalidOperationException">No tokenizer has been selected yet.</exception>
    public List<Token> Tokenize(string sentence)
    {
        if (_tokenizer == null)
        {
            throw new InvalidOperationException("No tokenizer selected; call GetTokenizer before Tokenize.");
        }

        var tokens = _tokenizer.Tokenize(sentence, _options);

        // Invariant casing: lemmas are used as vocabulary keys and must not change with the
        // machine's current culture.
        tokens.ForEach(x => x.Lemma = x.Text.ToLowerInvariant());
        return tokens;
    }

    /// <summary>
    /// Tokenizes many sentences in parallel.
    /// </summary>
    /// <param name="sentences">Raw sentence texts.</param>
    /// <returns>One <see cref="Sentence"/> per input, in input order, with <c>Text</c> and <c>Words</c> set.</returns>
    public List<Sentence> Tokenize(List<string> sentences)
    {
        var sents = sentences.Select(s => new Sentence { Text = s }).ToList();

        // The single-sentence overload already assigns lemmas, so no second pass is needed.
        // The active tokenizer is shared by all workers.
        Parallel.ForEach(sents, sentence =>
        {
            sentence.Words = Tokenize(sentence.Text);
        });

        return sents;
    }
}
/// <summary>
/// A one hot encoding is a representation of categorical variables as binary vectors.
/// Each sentence becomes a vector with one slot per vocabulary word; a slot is 1 when the
/// sentence contains that word (by lemma) and 0 otherwise.
/// </summary>
public class OneHotEncoder
{
    /// <summary>
    /// Corpus used to build the vocabulary and encoded by <see cref="EncodeAll"/>.
    /// </summary>
    public List<Sentence> Sentences { get; set; }

    /// <summary>
    /// The vocabulary. Built from <see cref="Sentences"/> on first use when left null.
    /// </summary>
    public List<string> Words { get; set; }

    /// <summary>
    /// Computes the vector for <paramref name="sentence"/> and stores it in <c>sentence.Vector</c>.
    /// Words that are not in the vocabulary are ignored.
    /// </summary>
    public void Encode(Sentence sentence)
    {
        InitDictionary();

        var vector = new double[Words.Count];

        foreach (var word in sentence.Words)
        {
            int index = Words.IndexOf(word.Lemma);

            // IndexOf returns -1 for "not found"; 0 is a valid slot. The previous "> 0" test
            // meant the first vocabulary word was never encoded.
            if (index >= 0)
            {
                vector[index] = 1;
            }
        }

        sentence.Vector = vector;
    }

    /// <summary>
    /// Encodes every sentence in <see cref="Sentences"/>.
    /// </summary>
    /// <returns>The vocabulary; its order defines the meaning of each vector slot.</returns>
    public List<string> EncodeAll()
    {
        InitDictionary();

        Sentences.ForEach(sent => Encode(sent));

        return Words;
    }

    // Builds the vocabulary once: distinct lemmas of all alphabetic tokens, sorted.
    private List<string> InitDictionary()
    {
        if (Words == null)
        {
            Words = Sentences
                .SelectMany(x => x.Words.Where(w => w.IsAlpha).Select(w => w.Lemma))
                .Distinct()
                .OrderBy(x => x)
                .ToList();
        }

        return Words;
    }
}
/// <summary>
/// Shrinks a binary word-vector model to the words listed in a dictionary file.
/// </summary>
public class Shrink
{
    /// <summary>
    /// Copies from <paramref name="strModelFileName"/> to <paramref name="strNewModelFileName"/>
    /// only those words that appear in <paramref name="strDictFileName"/>.
    /// </summary>
    /// <remarks>
    /// Model layout as read and written here: int32 word count, int32 vector size, int32 VQ size,
    /// then per word a length-prefixed string followed by <c>size</c> float32 values.
    /// Vector-quantized models (VQ size != 0) are not supported: the method returns without
    /// writing an output file.
    /// </remarks>
    /// <param name="strModelFileName">Path of the model to read.</param>
    /// <param name="strNewModelFileName">Path of the model to write; overwritten if it exists.</param>
    /// <param name="strDictFileName">Text file with one entry per line; the word is the text before the first tab.</param>
    public void Run(string strModelFileName, string strNewModelFileName, string strDictFileName)
    {
        //Load lexical dictionary
        var setTerm = new HashSet<string>();
        using (var dictReader = new StreamReader(strDictFileName))
        {
            string strLine;
            while ((strLine = dictReader.ReadLine()) != null)
            {
                setTerm.Add(strLine.Split('\t')[0]);
            }
        }

        //Load raw model
        int size;
        int vqSize;
        var termList = new List<string>();
        var vectors = new List<float[]>();

        // "using" releases the file on every path, including the early return below,
        // which previously left the model file open.
        using (var br = new BinaryReader(File.OpenRead(strModelFileName)))
        {
            int words = br.ReadInt32();
            size = br.ReadInt32();
            vqSize = br.ReadInt32();

            if (vqSize != 0)
            {
                // Shrinking vector-quantized models is not supported.
                return;
            }

            for (int b = 0; b < words; b++)
            {
                string strTerm = br.ReadString();
                bool keep = setTerm.Contains(strTerm);

                // Only kept vectors are stored, so memory scales with the shrunk vocabulary and
                // there is no words * size multiplication that could overflow an int.
                float[] vector = keep ? new float[size] : null;

                for (int a = 0; a < size; a++)
                {
                    float value = br.ReadSingle();
                    if (keep)
                    {
                        vector[a] = value;
                    }
                }

                if (keep)
                {
                    termList.Add(strTerm);
                    vectors.Add(vector);
                }
            }
        }

        //Save the shrunk model
        using (var bw = new BinaryWriter(File.Create(strNewModelFileName)))
        {
            bw.Write(termList.Count);
            bw.Write(size);
            bw.Write(vqSize);

            for (int i = 0; i < termList.Count; i++)
            {
                bw.Write(termList[i]);
                foreach (float value in vectors[i])
                {
                    bw.Write(value);
                }
            }
        }
    }
}
# CherubNLP 2 | Natural Language Processing in .NET Standard. 3 | 4 | ```powershell 5 | PM > Install-Package CherubNLP 6 | ``` 7 | 8 | 9 | 10 | 11 | #### Word Vector 12 | #### Text Vector 13 | 14 | #### Text Similarity 15 | 16 | ```csharp 17 | using NumSharp; 18 | 19 | var similarities = Similarity.Cosine("We can use Cosine to compute the similarity of two hardcoded lists.", new[] 20 | { 21 | "Cosine Similarity algorithm function sample.", 22 | "The Cosine Similarity function computes the similarity of two lists of numbers.", 23 | "Compute the similarity of two hardcoded lists.", 24 | "We can compute the similarity of two hardcoded lists.", 25 | "Coronavirus app could trace your contacts without sacrificing your privacy" 26 | }, "dbpedia.ftz"); 27 | 28 | Assert.AreEqual(new[] { 0, 4, 1, 3, 2 }, np.argsort(similarities)); 29 | ``` 30 | 31 | -------------------------------------------------------------------------------- /Settings/app.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } -------------------------------------------------------------------------------- /data/dbpedia.ftz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/data/dbpedia.ftz -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = CherubNLP 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # CherubNLP documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Oct 14 08:24:22 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.mathjax'] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 
40 | # You can specify multiple suffixes as a list of strings: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = ['.rst', '.md'] 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'CherubNLP' 50 | copyright = '2018, Haiping Chen' 51 | author = 'Haiping Chen' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '0.1' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '0.1.0' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # These patterns also affect html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation.
91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | # Custom sidebar templates, must be a dictionary that maps document names 100 | # to template names. 101 | # 102 | # This is required for the alabaster theme 103 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 104 | html_sidebars = { 105 | '**': [ 106 | 'relations.html', # needs 'show_related': True theme option to display 107 | 'searchbox.html', 108 | ] 109 | } 110 | 111 | 112 | # -- Options for HTMLHelp output ------------------------------------------ 113 | 114 | # Output file base name for HTML help builder. 115 | htmlhelp_basename = 'CherubNLPdoc' 116 | 117 | 118 | # -- Options for LaTeX output --------------------------------------------- 119 | 120 | latex_elements = { 121 | # The paper size ('letterpaper' or 'a4paper'). 122 | # 123 | # 'papersize': 'letterpaper', 124 | 125 | # The font size ('10pt', '11pt' or '12pt'). 126 | # 127 | # 'pointsize': '10pt', 128 | 129 | # Additional stuff for the LaTeX preamble. 130 | # 131 | # 'preamble': '', 132 | 133 | # Latex figure (float) alignment 134 | # 135 | # 'figure_align': 'htbp', 136 | } 137 | 138 | # Grouping the document tree into LaTeX files. List of tuples 139 | # (source start file, target name, title, 140 | # author, documentclass [howto, manual, or own class]). 141 | latex_documents = [ 142 | (master_doc, 'CherubNLP.tex', 'CherubNLP Documentation', 143 | 'Haiping Chen', 'manual'), 144 | ] 145 | 146 | 147 | # -- Options for manual page output --------------------------------------- 148 | 149 | # One entry per manual page. List of tuples 150 | # (source start file, name, description, authors, manual section). 
151 | man_pages = [ 152 | (master_doc, 'cherubnlp', 'CherubNLP Documentation', 153 | [author], 1) 154 | ] 155 | 156 | 157 | # -- Options for Texinfo output ------------------------------------------- 158 | 159 | # Grouping the document tree into Texinfo files. List of tuples 160 | # (source start file, target name, title, author, 161 | # dir menu entry, description, category) 162 | texinfo_documents = [ 163 | (master_doc, 'CherubNLP', 'CherubNLP Documentation', 164 | author, 'CherubNLP', 'One line description of project.', 165 | 'Miscellaneous'), 166 | ] 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. CherubNLP documentation master file, created by 2 | sphinx-quickstart on Sun Oct 14 08:24:22 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to CherubNLP's documentation! 7 | ===================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=CherubNLP 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | --------------------------------------------------------------------------------