├── .gitignore
├── CherubNLP.Console
    ├── CherubNLP.Console.csproj
    └── Program.cs
├── CherubNLP.UnitTest
    ├── CherubNLP.UnitTest.csproj
    ├── Classification
    │   ├── SVMClassifierTest.cs
    │   └── WordCnnTest.cs
    ├── DefaultTaggerTest.cs
    ├── Featuring
    │   └── CountFeatureExtractorTest.cs
    ├── Kaggle
    │   └── SpookyAuthorIdentification
    │   │   ├── ClassificationTest.cs
    │   │   ├── README.md
    │   │   ├── sample_submission.zip
    │   │   ├── test.zip
    │   │   └── train.zip
    ├── NGramTaggerTest.cs
    ├── NaiveBayesClassifierTest.cs
    ├── RegexStemmerTest.cs
    ├── TestEssential.cs
    ├── Tokenize
    │   ├── RegexTokenizerTest.cs
    │   └── TreebankTokenizerTest.cs
    └── Vector
    │   ├── FastTextTest.cs
    │   ├── OneHotEncodingTest.cs
    │   └── Word2VecTest.cs
├── CherubNLP.sln
├── CherubNLP
    ├── CherubNLP.csproj
    ├── Classify
    │   ├── ClassifierFactory.cs
    │   ├── ClassifyOptions.cs
    │   ├── IClassifier.cs
    │   ├── IEstimator.cs
    │   ├── ITextFeatureExtractor.cs
    │   ├── NaiveBayesClassifier.cs
    │   ├── SVMClassifier.cs
    │   ├── SentenceFeatureExtractor.cs
    │   └── WordFeatureExtractor.cs
    ├── Corpus
    │   ├── ConllReader.cs
    │   ├── FasttextDataReader.cs
    │   ├── KaggleTextDataReader.cs
    │   ├── LabeledPerFileNameReader.cs
    │   └── ReaderOptions.cs
    ├── Featuring
    │   ├── CountFeatureExtractor.cs
    │   ├── IFeatureExtractor.cs
    │   ├── TfIdfFeatureExtractor.cs
    │   └── Word2VecFeatureExtractor.cs
    ├── Jieba.NET
    │   ├── Common
    │   │   ├── Counter.cs
    │   │   ├── Extensions.cs
    │   │   ├── FileExtension.cs
    │   │   └── Trie.cs
    │   ├── ConfigManager.cs
    │   ├── Constants.cs
    │   ├── DefaultDictionary.cs
    │   ├── FinalSeg
    │   │   ├── IFinalSeg.cs
    │   │   └── Viterbi.cs
    │   ├── JiebaSegmenter.cs
    │   ├── JiebaTagger.cs
    │   ├── JiebaTokenizer.cs
    │   ├── Node.cs
    │   ├── Pair.cs
    │   ├── PosSeg
    │   │   ├── Pair.cs
    │   │   ├── PosSegmenter.cs
    │   │   └── Viterbi.cs
    │   ├── README.rst
    │   ├── Spelling
    │   │   └── SpellChecker.cs
    │   ├── Token.cs
    │   └── WordDictionary.cs
    ├── Models
    │   ├── Entropy
    │   │   ├── AbstractDataIndexer.cs
    │   │   ├── BasicContextGenerator.cs
    │   │   ├── BasicEventReader.cs
    │   │   ├── ComparableEvent.cs
    │   │   ├── GisModel.cs
    │   │   ├── GisTrainer.cs
    │   │   ├── IContextGenerator.cs
    │   │   ├── IMaximumEntropyModel.cs
    │   │   ├── IO
    │   │   │   ├── BinaryGisModelReader.cs
    │   │   │   ├── BinaryGisModelWriter.cs
    │   │   │   ├── GisModelReader.cs
    │   │   │   ├── GisModelWriter.cs
    │   │   │   ├── IGisModelReader.cs
    │   │   │   ├── JavaBinaryGisModelReader.cs
    │   │   │   ├── JavaBinaryGisModelWriter.cs
    │   │   │   ├── PlainTextGisModelReader.cs
    │   │   │   └── PlainTextGisModelWriter.cs
    │   │   ├── ITrainingDataIndexer.cs
    │   │   ├── ITrainingDataReader.cs
    │   │   ├── ITrainingEventReader.cs
    │   │   ├── OnePassDataIndexer.cs
    │   │   ├── PatternedPredicate.cs
    │   │   ├── PlainTextByLineDataReader.cs
    │   │   ├── TrainingEvent.cs
    │   │   └── TwoPassDataIndexer.cs
    │   └── WordNet
    │   │   ├── DataFileEngine.cs
    │   │   ├── IndexWord.cs
    │   │   ├── Morph
    │   │       ├── AbstractDelegatingOperation.cs
    │   │       ├── DetachSuffixesOperation.cs
    │   │       ├── IOperation.cs
    │   │       ├── LookupExceptionsOperation.cs
    │   │       ├── LookupIndexWordOperation.cs
    │   │       ├── TokenizerOperation.cs
    │   │       └── Util.cs
    │   │   ├── Relation.cs
    │   │   ├── RelationType.cs
    │   │   ├── Synset.cs
    │   │   ├── Tokenizer.cs
    │   │   └── WordNetEngine.cs
    ├── NER
    │   └── README.md
    ├── Sentence.cs
    ├── Similarity
    │   └── Similarity.cs
    ├── Stem
    │   ├── IStemmer.cs
    │   ├── RegexStemmer.cs
    │   ├── StemOptions.cs
    │   └── StemmerFactory.cs
    ├── SupportedLanguage.cs
    ├── Tag
    │   ├── DefaultTagger.cs
    │   ├── ITagger.cs
    │   ├── NGramTagger.cs
    │   ├── TagOptions.cs
    │   └── TaggerFactory.cs
    ├── Tokenize
    │   ├── ITokenizer.cs
    │   ├── README.rst
    │   ├── RegexTokenizer.cs
    │   ├── Token.cs
    │   ├── TokenizationOptions.cs
    │   ├── TokenizerBase.cs
    │   ├── TokenizerFactory.cs
    │   └── TreebankTokenizer.cs
    └── Txt2Vec
    │   ├── Decoder.cs
    │   ├── Encoder.cs
    │   ├── Model.cs
    │   ├── OneHotEncoder.cs
    │   ├── Shrink.cs
    │   └── VectorGenerator.cs
├── LICENSE
├── README.md
├── Settings
    └── app.json
├── data
    └── dbpedia.ftz
└── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    └── make.bat


/CherubNLP.Console/CherubNLP.Console.csproj:
--------------------------------------------------------------------------------
 1 | ﻿<Project Sdk="Microsoft.NET.Sdk">
 2 | 
 3 |   <PropertyGroup>
 4 |     <OutputType>Exe</OutputType>
 5 |     <TargetFramework>net6.0</TargetFramework>
 6 |     <Platforms>AnyCPU;x64</Platforms>
 7 |   </PropertyGroup>
 8 | 
 9 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
10 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
11 |   </PropertyGroup>
12 | 
13 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
14 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
15 |   </PropertyGroup>
16 | 
17 |   <ItemGroup>
18 |     <ProjectReference Include="..\CherubNLP.UnitTest\CherubNLP.UnitTest.csproj" />
19 |   </ItemGroup>
20 | 
21 | </Project>
22 | 


--------------------------------------------------------------------------------
/CherubNLP.Console/Program.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.UnitTest.Kaggle;
 2 | using FastText.NetWrapper;
 3 | using System;
 4 | using System.IO;
 5 | 
 6 | namespace CherubNLP.Console
 7 | {
 8 |     /// <summary>
 9 |     /// Manual smoke test: loads a fastText model, requests one sentence vector and
10 |     /// then calls <c>Similarity.Cosine</c> on a handful of sample sentences.
11 |     /// </summary>
12 |     class Program
13 |     {
14 |         /// <summary>
15 |         /// Model location used when no path is supplied on the command line
16 |         /// (the previously hard-coded developer-machine path).
17 |         /// </summary>
18 |         private static readonly string DefaultModelPath =
19 |             Path.Combine(@"D:\SciSharp\CherubNLP\data", "dbpedia.bin");
20 | 
21 |         /// <summary>
22 |         /// Entry point.
23 |         /// </summary>
24 |         /// <param name="args">
25 |         /// Optional command-line arguments; when present, <c>args[0]</c> is used as the
26 |         /// path of the model file instead of <see cref="DefaultModelPath"/>.
27 |         /// </param>
28 |         static void Main(string[] args)
29 |         {
30 |             var model = args != null && args.Length > 0 ? args[0] : DefaultModelPath;
31 | 
32 |             // Report a missing model file up front with a readable message.
33 |             if (!File.Exists(model))
34 |             {
35 |                 // Inside this namespace the bare name "Console" resolves to the
36 |                 // CherubNLP.Console namespace, so System.Console must be fully qualified.
37 |                 System.Console.Error.WriteLine($"Model file not found: {model}");
38 |                 Environment.ExitCode = 1;
39 |                 return;
40 |             }
41 | 
42 |             using (var fastText = new FastTextWrapper())
43 |             {
44 |                 fastText.LoadModel(model);
45 |                 var vector1 = fastText.GetSentenceVector("Hello");
46 |             }
47 | 
48 |             var similarities = Similarity.Cosine("Power Outage -Fifth & Park - JPMC150713", new[]
49 |             {
50 |                 "Cosine Similarity algorithm function sample.",
51 |                 "Power Restored -Fifth & Park - JPMC150713",
52 |                 "Compute the similarity of two hardcoded lists.",
53 |                 "We can compute the similarity of two hardcoded lists.",
54 |                 "Coronavirus app could trace your contacts without sacrificing your privacy"
55 |             }, model);
56 | 
57 |             // var test = new KaggleTest();
58 |             // test.SpookyAuthorIdentification();
59 |         }
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/CherubNLP.UnitTest.csproj:
--------------------------------------------------------------------------------
 1 | ﻿<Project Sdk="Microsoft.NET.Sdk">
 2 | 
 3 |   <PropertyGroup>
 4 |     <TargetFramework>net6.0</TargetFramework>
 5 | 
 6 |     <IsPackable>false</IsPackable>
 7 | 
 8 |     <Platforms>AnyCPU;x64</Platforms>
 9 |   </PropertyGroup>
10 | 
11 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
12 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
13 |   </PropertyGroup>
14 | 
15 |   <ItemGroup>
16 |     <PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="7.0.2" />
17 |     <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="7.0.0" />
18 |     <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.4.1" />
19 |     <PackageReference Include="MSTest.TestAdapter" Version="3.0.2" />
20 |     <PackageReference Include="MSTest.TestFramework" Version="3.0.2" />
21 |     <PackageReference Include="SciSharp.TensorFlow.Redist" Version="2.10.0" />
22 |   </ItemGroup>
23 | 
24 |   <ItemGroup>
25 |     <ProjectReference Include="..\CherubNLP\CherubNLP.csproj" />
26 |   </ItemGroup>
27 | 
28 | </Project>
29 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Classification/SVMClassifierTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Classify;
 2 | using CherubNLP.Corpus;
 3 | using CherubNLP.Tokenize;
 4 | using Microsoft.Extensions.Configuration;
 5 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 6 | using System;
 7 | using System.Collections.Generic;
 8 | using System.IO;
 9 | using System.Text;
10 | using Txt2Vec;
11 | 
12 | namespace CherubNLP.UnitTest
13 | {
14 |     /// <summary>
15 |     /// Exploratory tests around sentence vectors built with <c>VectorGenerator</c>.
16 |     /// None of the methods assert on a result; they only exercise the calls.
17 |     /// </summary>
18 |     [TestClass]
19 |     public class SVMClassifierTest : TestEssential
20 |     {
21 |         /// <summary>Declares a small sample corpus; the TF-IDF call that would consume it is commented out.</summary>
22 |         [TestMethod]
23 |         public void TFIDFTest()
24 |         {
25 |             var documents = new[]
26 |             {
27 |                 "Hello, how are you!",
28 |                 "Hi Bolo!",
29 |                 "Hey Haiping!",
30 |                 "Hello Haiping",
31 |                 "hi, how do you do?",
32 |                 "goodbye Haiping",
33 |                 "see you Bolo",
34 |                 "byebye Haiping"
35 |             };
36 |             /*TFIDFGenerator tfidfGenerator = new TFIDFGenerator();
37 |             List<List<double>> weights = tfidfGenerator.TFIDFWeightVectorsForSentences(documents);*/
38 |         }
39 | 
40 |         /// <summary>Builds a <c>VectorGenerator</c> from a model file path and converts two sentences to vectors.</summary>
41 |         [TestMethod]
42 |         public void Doc2VectorTest()
43 |         {
44 |             var sentences = new List<string>
45 |             {
46 |                 "The sun in the sky is bright.",
47 |                 "We can see the shining sun, the bright sun."
48 |             };
49 |             var args = new Args { ModelFile = "CherubNLP\\App_Data\\wordvec_enu.bin" };
50 |             var vg = new VectorGenerator(args);
51 |             var list = vg.Sentence2Vec(sentences);
52 |         }
53 | 
54 |         /// <summary>Prints the similarity score between "Good morning" and each of several other phrases.</summary>
55 |         [TestMethod]
56 |         public void similarityTest()
57 |         {
58 |             var sentences = new List<string>
59 |             {
60 |                 "How's it going",
61 |                 "How's your day",
62 |                 "How's everything",
63 |                 "Good morning",
64 |                 "Good afternoon",
65 |                 "Good evening",
66 |                 "I appreciate it",
67 |                 "Thanks a lot",
68 |                 "Thank you"
69 |             };
70 | 
71 |             var args = new Args { ModelFile = "CherubNLP\\CherubNLP.UnitTest\\wordvec_enu.bin" };
72 |             var vg = new VectorGenerator(args);
73 |             var list = vg.Sentence2Vec(sentences);
74 | 
75 |             // Phrases compared against "Good morning", in the order the scores are printed.
76 |             string[] candidates =
77 |             {
78 |                 "How's it going",
79 |                 "How's your day",
80 |                 "How's everything",
81 |                 "Good afternoon",
82 |                 "I appreciate",
83 |                 "Thanks a lot"
84 |             };
85 | 
86 |             foreach (var candidate in candidates)
87 |             {
88 |                 // The reference vector is rebuilt for every comparison.
89 |                 Vec reference = vg.SingleSentence2Vec("Good morning");
90 |                 Vec other = vg.SingleSentence2Vec(candidate);
91 |                 double score = vg.Similarity(reference, other);
92 |                 Console.WriteLine("Similarity score: {0}", score);
93 |             }
94 |         }
95 |     }
96 | }
97 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Classification/WordCnnTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using Microsoft.VisualStudio.TestTools.UnitTesting;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.UnitTest
 7 | {
 8 |     [TestClass]
 9 |     public class WordCnnTest : TestEssential
10 |     {
11 |         [TestMethod]
12 |         public void TFIDFTest()
13 |         {
14 |             // Placeholder: no test logic has been written yet, so this test always passes.
15 |         }
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/DefaultTaggerTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Corpus;
 2 | using CherubNLP.Tag;
 3 | using CherubNLP.Tokenize;
 4 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 5 | using System;
 6 | using System.Collections.Generic;
 7 | using System.Text;
 8 | 
 9 | namespace CherubNLP.UnitTest
10 | {
11 |     /// <summary>
12 |     /// Smoke test for <c>DefaultTagger</c>: it only checks that tokenizing and tagging
13 |     /// a sentence completes; no tag values are asserted.
14 |     /// </summary>
15 |     [TestClass]
16 |     public class DefaultTaggerTest
17 |     {
18 |         /// <summary>Tokenizes "How are you doing?" and tags it with the tag option set to "NN".</summary>
19 |         [TestMethod]
20 |         public void TagInCoNLL2000()
21 |         {
22 |             var tokenizationOptions = new TokenizationOptions();
23 |             var tokenizerFactory = new TokenizerFactory(tokenizationOptions, SupportedLanguage.English);
24 |             tokenizerFactory.GetTokenizer<RegexTokenizer>();
25 | 
26 |             var tokens = tokenizerFactory.Tokenize("How are you doing?");
27 | 
28 |             var tagOptions = new TagOptions { Tag = "NN" };
29 |             var taggerFactory = new TaggerFactory(tagOptions, SupportedLanguage.English);
30 |             taggerFactory.GetTagger<DefaultTagger>();
31 | 
32 |             var sentence = new Sentence { Words = tokens };
33 |             taggerFactory.Tag(sentence);
34 |         }
35 |     }
36 | }


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Featuring/CountFeatureExtractorTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Featuring;
 2 | using CherubNLP.Tokenize;
 3 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 4 | using System;
 5 | using System.Collections.Generic;
 6 | using System.Text;
 7 | 
 8 | namespace CherubNLP.UnitTest.Featuring
 9 | {
10 |     /// <summary>
11 |     /// Compares the values <c>CountFeatureExtractor</c> stores on each word with a fixed expected matrix.
12 |     /// </summary>
13 |     [TestClass]
14 |     public class CountFeatureExtractorTest : TestEssential
15 |     {
16 |         /// <summary>Vectorizes the sample corpus and checks every word whose lemma matches a feature.</summary>
17 |         [TestMethod]
18 |         public void TestVectorizer()
19 |         {
20 |             var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
21 |             tokenizerFactory.GetTokenizer<TreebankTokenizer>();
22 | 
23 |             var extractor = new CountFeatureExtractor();
24 |             extractor.Sentences = tokenizerFactory.Tokenize(Corpus());
25 |             extractor.Vectorize(new List<string>());
26 | 
27 |             int[][] expected = Vectors();
28 | 
29 |             for (int row = 0; row < extractor.Sentences.Count; row++)
30 |             {
31 |                 var sentence = extractor.Sentences[row];
32 | 
33 |                 for (int col = 0; col < extractor.Features.Count; col++)
34 |                 {
35 |                     var feature = extractor.Features[col];
36 |                     var match = sentence.Words.Find(w => w.Lemma == feature);
37 | 
38 |                     // Features with no matching word in this sentence are skipped, not asserted.
39 |                     if (match == null)
40 |                     {
41 |                         continue;
42 |                     }
43 | 
44 |                     Assert.IsTrue(match.Vector == expected[row][col]);
45 |                 }
46 |             }
47 |         }
48 | 
49 |         /// <summary>Returns the four sample documents fed to the tokenizer.</summary>
50 |         public List<String> Corpus() => new List<string>
51 |         {
52 |             "This is the first document.",
53 |             "This document is the second document.",
54 |             "And this is the third one.",
55 |             "Is this the first document?"
56 |         };
57 | 
58 |         /// <summary>Returns the expected values, indexed as [sentence][feature].</summary>
59 |         public int[][] Vectors() => new[]
60 |         {
61 |             new[] { 0, 1, 1, 1, 0, 0, 1, 0, 1 },
62 |             new[] { 0, 2, 0, 1, 0, 1, 1, 0, 1 },
63 |             new[] { 1, 0, 0, 1, 1, 0, 1, 1, 1 },
64 |             new[] { 0, 1, 1, 1, 0, 0, 1, 0, 1 }
65 |         };
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/ClassificationTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Classify;
 2 | using CherubNLP.Corpus;
 3 | using CherubNLP.Tokenize;
 4 | using Microsoft.Extensions.Configuration;
 5 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 6 | using System;
 7 | using System.Collections.Generic;
 8 | using System.IO;
 9 | using System.Linq;
10 | using System.Text;
11 | using Bigtree.Algorithm.Extensions;
12 | 
13 | namespace CherubNLP.UnitTest.Kaggle
14 | {
15 |     /// <summary>
16 |     /// Classification test driven by the Kaggle "Spooky Author Identification" training file.
17 |     /// </summary>
18 |     [TestClass]
19 |     public partial class KaggleTest : TestEssential
20 |     {
21 |         /// <summary>
22 |         /// Reads "train.csv", tokenizes it, shuffles, takes 2000 sentences and splits them with
23 |         /// <c>Split(0.7M)</c>; trains "NaiveBayesClassifier" on <c>Item1</c> and requires the
24 |         /// top prediction to match the label for more than half of <c>Item2</c>.
25 |         /// </summary>
26 |         [TestMethod]
27 |         public void SpookyAuthorIdentification()
28 |         {
29 |             var reader = new KaggleTextDataReader();
30 |             var sentences = reader.Read(new ReaderOptions { FileName = "train.csv" });
31 | 
32 |             var tokenizerFactory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
33 |             tokenizerFactory.GetTokenizer<TreebankTokenizer>();
34 | 
35 |             var rawTexts = sentences.Select(x => x.Text).ToList();
36 |             var tokenized = tokenizerFactory.Tokenize(rawTexts);
37 | 
38 |             // Tokenizing yields new sentence objects, so id and label are copied over by position.
39 |             for (int index = 0; index < tokenized.Count; index++)
40 |             {
41 |                 tokenized[index].Id = sentences[index].Id;
42 |                 tokenized[index].Label = sentences[index].Label;
43 |             }
44 |             sentences = tokenized.ToList();
45 | 
46 |             sentences.Shuffle();
47 |             var dataset = sentences.Take(2000).ToList().Split(0.7M);
48 | 
49 |             var options = new ClassifyOptions
50 |             {
51 |                 ModelDir = AppContext.BaseDirectory,
52 |                 ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
53 |                 Dimension = 300
54 |             };
55 |             var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);
56 |             classifier.GetClassifer("NaiveBayesClassifier");
57 |             classifier.Train(dataset.Item1);
58 | 
59 |             int correct = 0;
60 |             int total = 0;
61 |             foreach (var sample in dataset.Item2)
62 |             {
63 |                 var classes = classifier.Classify(sample);
64 |                 if (sample.Label == classes[0].Item1)
65 |                 {
66 |                     correct++;
67 |                 }
68 |                 total++;
69 |             }
70 | 
71 |             var accuracy = (float)correct / total;
72 | 
73 |             Assert.IsTrue(accuracy > 0.5);
74 |         }
75 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/README.md:
--------------------------------------------------------------------------------
 1 | ﻿# Spooky Author Identification
 2 | Share code and discuss insights to identify horror authors from their writings
 3 | 
 4 | ### Data Description
 5 | The competition dataset contains text from works of fiction written by spooky authors of the public domain: Edgar Allan Poe, HP Lovecraft and Mary Shelley. The data was prepared by chunking larger texts into sentences using CoreNLP's MaxEnt sentence tokenizer, so you may notice the odd non-sentence here and there. Your objective is to accurately identify the author of the sentences in the test set.
 6 | 
 7 | ### Evaluation
 8 | Submissions are evaluated using multi-class logarithmic loss. Each id has one true class. For each id, you must submit a predicted probability for each author. The formula is then:
 9 | $$\text{logloss} = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M} y_{ij}\log(p_{ij}),$$
10 | where $N$ is the number of observations in the test set, $M$ is the number of class labels (3 classes), $\log$ is the natural logarithm, $y_{ij}$ is 1 if observation $i$ belongs to class $j$ and 0 otherwise, and $p_{ij}$ is the predicted probability that observation $i$ belongs to class $j$.
11 | 
12 | The submitted probabilities for a given sentence are not required to sum to one because they are rescaled prior to being scored (each row is divided by the row sum). In order to avoid the extremes of the log function, predicted probabilities are replaced with $\max(\min(p, 1 - 10^{-15}), 10^{-15})$.
13 | 
14 | [Kaggle Link](https://www.kaggle.com/c/spooky-author-identification)


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/sample_submission.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/sample_submission.zip


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/test.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/test.zip


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/train.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP.UnitTest/Kaggle/SpookyAuthorIdentification/train.zip


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/NGramTaggerTest.cs:
--------------------------------------------------------------------------------
  1 | ﻿using CherubNLP.Corpus;
  2 | using CherubNLP.Tag;
  3 | using CherubNLP.Tokenize;
  4 | using Microsoft.Extensions.Configuration;
  5 | using Microsoft.VisualStudio.TestTools.UnitTesting;
  6 | using System;
  7 | using System.Collections.Generic;
  8 | using System.Diagnostics;
  9 | using System.IO;
 10 | using System.Text;
 11 | 
 12 | namespace CherubNLP.UnitTest
 13 | {
 14 |     /// <summary>
 15 |     /// Runs <c>NGramTagger</c> with n = 1, 2 and 3 on one sample phrase and checks the
 16 |     /// part-of-speech tags assigned to its first four tokens.
 17 |     /// </summary>
 18 |     [TestClass]
 19 |     public class NGramTaggerTest : TestEssential
 20 |     {
 21 |         private const string SamplePhrase = "Chancellor of the Exchequer Nigel Lawson's restated commitment";
 22 | 
 23 |         // Tags expected for the first four tokens of SamplePhrase.
 24 |         private static readonly string[] ExpectedLeadingTags = { "NNP", "IN", "DT", "NNP" };
 25 | 
 26 |         /// <summary>Unigram tagging; also requires the first Tag call to take over 100x as long as the second.</summary>
 27 |         [TestMethod]
 28 |         public void UniGramInCoNLL2000()
 29 |         {
 30 |             var tokens = CreateTokenizer().Tokenize(SamplePhrase);
 31 |             var tagger = CreateTagger(1);
 32 | 
 33 |             var firstRun = Stopwatch.StartNew();
 34 |             tagger.Tag(new Sentence { Words = tokens });
 35 |             firstRun.Stop();
 36 |             var elapsedMs1 = firstRun.ElapsedMilliseconds;
 37 | 
 38 |             for (int i = 0; i < ExpectedLeadingTags.Length; i++)
 39 |             {
 40 |                 Assert.IsTrue(tokens[i].Pos == ExpectedLeadingTags[i]);
 41 |             }
 42 | 
 43 |             // Tag again on the same factory to check the model is not loaded repeatedly.
 44 |             var secondRun = Stopwatch.StartNew();
 45 |             tagger.Tag(new Sentence { Words = tokens });
 46 |             secondRun.Stop();
 47 |             var elapsedMs2 = secondRun.ElapsedMilliseconds;
 48 | 
 49 |             Assert.IsTrue(elapsedMs1 > elapsedMs2 * 100);
 50 |         }
 51 | 
 52 |         /// <summary>Bigram tagging of the sample phrase.</summary>
 53 |         [TestMethod]
 54 |         public void BiGramInCoNLL2000()
 55 |         {
 56 |             var tokens = CreateTokenizer().Tokenize(SamplePhrase);
 57 |             var tagger = CreateTagger(2);
 58 | 
 59 |             tagger.Tag(new Sentence { Words = tokens });
 60 | 
 61 |             for (int i = 0; i < ExpectedLeadingTags.Length; i++)
 62 |             {
 63 |                 Assert.IsTrue(tokens[i].Pos == ExpectedLeadingTags[i]);
 64 |             }
 65 |         }
 66 | 
 67 |         /// <summary>Trigram tagging of the sample phrase.</summary>
 68 |         [TestMethod]
 69 |         public void TriGramInCoNLL2000()
 70 |         {
 71 |             var tokens = CreateTokenizer().Tokenize(SamplePhrase);
 72 |             var tagger = CreateTagger(3);
 73 | 
 74 |             tagger.Tag(new Sentence { Words = tokens });
 75 | 
 76 |             for (int i = 0; i < ExpectedLeadingTags.Length; i++)
 77 |             {
 78 |                 Assert.IsTrue(tokens[i].Pos == ExpectedLeadingTags[i]);
 79 |             }
 80 |         }
 81 | 
 82 |         /// <summary>Creates a tokenizer factory using the WORD_PUNC pattern with <c>RegexTokenizer</c> selected.</summary>
 83 |         private static TokenizerFactory CreateTokenizer()
 84 |         {
 85 |             var options = new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC };
 86 |             var factory = new TokenizerFactory(options, SupportedLanguage.English);
 87 |             factory.GetTokenizer<RegexTokenizer>();
 88 |             return factory;
 89 |         }
 90 | 
 91 |         /// <summary>Creates a tagger factory for the given n-gram size with <c>NGramTagger</c> selected.</summary>
 92 |         private TaggerFactory CreateTagger(int nGram)
 93 |         {
 94 |             var options = new TagOptions
 95 |             {
 96 |                 CorpusDir = Configuration.GetValue<String>("CherubNLP:dataDir"),
 97 |                 NGram = nGram,
 98 |                 Tag = "NN"
 99 |             };
100 |             var factory = new TaggerFactory(options, SupportedLanguage.English);
101 |             factory.GetTagger<NGramTagger>();
102 |             return factory;
103 |         }
104 |     }
105 | }
106 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/RegexStemmerTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Stem;
 2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 3 | using System;
 4 | using System.Collections.Generic;
 5 | using System.Text;
 6 | 
 7 | namespace CherubNLP.UnitTest
 8 | {
 9 |     /// <summary>
10 |     /// Checks <c>RegexStemmer</c> against a fixed list of word/stem pairs.
11 |     /// </summary>
12 |     [TestClass]
13 |     public class RegexStemmerTest
14 |     {
15 |         /// <summary>Stems each sample word using <c>RegexStemmer.PATTERN</c> and compares with the expected stem.</summary>
16 |         [TestMethod]
17 |         public void StemInDefault()
18 |         {
19 |             var options = new StemOptions { Pattern = RegexStemmer.PATTERN };
20 |             var stemmer = new StemmerFactory<RegexStemmer>(options, SupportedLanguage.English);
21 | 
22 |             // (word, expected stem) pairs, checked in order.
23 |             var cases = new[]
24 |             {
25 |                 ("doing", "do"),
26 |                 ("ponies", "poni"),
27 |                 ("caresses", "caress"),
28 |                 ("cats", "cat"),
29 |                 ("am", "be"),
30 |                 ("are", "be"),
31 |                 ("is", "be"),
32 |                 ("were", "be"),
33 |                 ("running", "run")
34 |             };
35 | 
36 |             foreach (var (word, expected) in cases)
37 |             {
38 |                 var stem = stemmer.Stem(word);
39 |                 Assert.IsTrue(stem == expected);
40 |             }
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/TestEssential.cs:
--------------------------------------------------------------------------------
 1 | ﻿using Microsoft.Extensions.Configuration;
 2 | using System;
 3 | using System.IO;
 4 | using System.Linq;
 5 | 
 6 | namespace CherubNLP.UnitTest
 7 | {
 8 |     /// <summary>
 9 |     /// Base class for tests that need the JSON settings and the root, "Settings" and
10 |     /// "data" directory locations, all resolved relative to the current directory.
11 |     /// </summary>
12 |     public abstract class TestEssential
13 |     {
14 |         /// <summary>Configuration built from every *.json file found in <see cref="settingsDir"/>.</summary>
15 |         protected IConfiguration Configuration { get; }
16 | 
17 |         /// <summary>Directory four (or, as a fallback, five) levels above the current directory.</summary>
18 |         protected string rootDir;
19 | 
20 |         /// <summary>The "data" directory under <see cref="rootDir"/>.</summary>
21 |         protected string dataDir;
22 | 
23 |         /// <summary>The "Settings" directory under <see cref="rootDir"/>.</summary>
24 |         protected string settingsDir;
25 | 
26 |         /// <summary>
27 |         /// Locates the directories and loads all JSON settings files.
28 |         /// </summary>
29 |         /// <exception cref="DirectoryNotFoundException">
30 |         /// No "Settings" directory exists four or five levels above the current directory.
31 |         /// </exception>
32 |         public TestEssential()
33 |         {
34 |             rootDir = Ascend(Directory.GetCurrentDirectory(), 4);
35 |             settingsDir = Path.Combine(rootDir, "Settings");
36 |             dataDir = Path.Combine(rootDir, "data");
37 | 
38 |             // x64: retry one directory level higher.
39 |             if (!Directory.Exists(settingsDir))
40 |             {
41 |                 rootDir = Ascend(rootDir, 1);
42 |                 settingsDir = Path.Combine(rootDir, "Settings");
43 |                 dataDir = Path.Combine(rootDir, "data");
44 |             }
45 | 
46 |             if (!Directory.Exists(settingsDir))
47 |             {
48 |                 throw new DirectoryNotFoundException($"Settings directory not found: {settingsDir}");
49 |             }
50 | 
51 |             var builder = new ConfigurationBuilder();
52 |             foreach (var setting in Directory.GetFiles(settingsDir, "*.json"))
53 |             {
54 |                 // reloadOnChange stays off: the built configuration is never disposed here,
55 |                 // so change monitoring registered per file would outlive each instance.
56 |                 builder.AddJsonFile(setting, optional: false, reloadOnChange: false);
57 |             }
58 |             Configuration = builder.Build();
59 |         }
60 | 
61 |         /// <summary>
62 |         /// Returns the full path <paramref name="levels"/> directories above
63 |         /// <paramref name="directory"/>, keeping a trailing directory separator.
64 |         /// </summary>
65 |         private static string Ascend(string directory, int levels)
66 |         {
67 |             var path = directory;
68 |             for (int i = 0; i < levels; i++)
69 |             {
70 |                 path = Path.Combine(path, "..");
71 |             }
72 |             return Path.GetFullPath(path + Path.DirectorySeparatorChar);
73 |         }
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Tokenize/RegexTokenizerTest.cs:
--------------------------------------------------------------------------------
  1 | using CherubNLP.Tokenize;
  2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
  3 | using System.Collections.Generic;
  4 | 
  5 | namespace CherubNLP.UnitTest.Tokenize
  6 | {
  7 |     [TestClass]
  8 |     public class RegexTokenizerTest
  9 |     {
 10 |         /// <summary>With the WHITE_SPACE pattern, checks start offset and text of the first five tokens.</summary>
 11 |         [TestMethod]
 12 |         public void TokenizeInWhiteSpace()
 13 |         {
 14 |             var options = new TokenizationOptions { Pattern = RegexTokenizer.WHITE_SPACE };
 15 |             var factory = new TokenizerFactory(options, SupportedLanguage.English);
 16 |             factory.GetTokenizer<RegexTokenizer>();
 17 | 
 18 |             var tokens = factory.Tokenize("Chop into pieces, isn't it?");
 19 | 
 20 |             // Expected (start offset, text) of each token, in order.
 21 |             var expected = new[]
 22 |             {
 23 |                 (Start: 0, Text: "Chop"),
 24 |                 (Start: 5, Text: "into"),
 25 |                 (Start: 10, Text: "pieces,"),
 26 |                 (Start: 18, Text: "isn't"),
 27 |                 (Start: 24, Text: "it?")
 28 |             };
 29 | 
 30 |             for (int i = 0; i < expected.Length; i++)
 31 |             {
 32 |                 Assert.IsTrue(tokens[i].Start == expected[i].Start);
 33 |                 Assert.IsTrue(tokens[i].Text == expected[i].Text);
 34 |             }
 35 |         }
 36 | 
 37 |         [TestMethod]
 38 |         public void TokenizeInWordPunctuation()
 39 |         {
 40 |             var tokenizer = new TokenizerFactory(new TokenizationOptions
 41 |             {
 42 |                 Pattern = RegexTokenizer.WORD_PUNC,
 43 |                 SpecialWords = new List<string> { "n't" }
 44 |             }, SupportedLanguage.English);
 45 |             tokenizer.GetTokenizer<RegexTokenizer>();
 46 | 
 47 |             var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");
 48 | 
 49 |             Assert.IsTrue(tokens[0].Start == 0);
 50 |             Assert.IsTrue(tokens[0].Text == "Chop");
 51 | 
 52 |             Assert.IsTrue(tokens[1].Start == 5);
 53 |             Assert.IsTrue(tokens[1].Text == "into");
 54 | 
 55 |             Assert.IsTrue(tokens[2].Start == 10);
 56 |             Assert.IsTrue(tokens[2].Text == "pieces");
 57 | 
 58 |             Assert.IsTrue(tokens[3].Start == 16);
 59 |             Assert.IsTrue(tokens[3].Text == ",");
 60 | 
 61 |             Assert.IsTrue(tokens[4].Start == 18);
 62 |             Assert.IsTrue(tokens[4].Text == "is");
 63 | 
 64 |             Assert.IsTrue(tokens[5].Start == 20);
 65 |             Assert.IsTrue(tokens[5].Text == "n't");
 66 | 
 67 |             Assert.IsTrue(tokens[6].Start == 24);
 68 |             Assert.IsTrue(tokens[6].Text == "it");
 69 | 
 70 |             Assert.IsTrue(tokens[7].Start == 26);
 71 |             Assert.IsTrue(tokens[7].Text == "?");
 72 |         }
 73 | 
 74 |         [TestMethod]
 75 |         public void TokenizeInBlankLine()
 76 |         {
 77 |             var tokenizer = new TokenizerFactory(new TokenizationOptions
 78 |             {
 79 |                 Pattern = RegexTokenizer.BLANK_LINE
 80 |             }, SupportedLanguage.English);
 81 |             tokenizer.GetTokenizer<RegexTokenizer>();
 82 | 
 83 |             var tokens = tokenizer.Tokenize(@"Chop into pieces, 
 84 | 
 85 | isn't
 86 | 
 87 | it?");
 88 | 
 89 |             Assert.IsTrue(tokens[0].Start == 0);
 90 |             Assert.IsTrue(tokens[0].Text == "Chop into pieces,");
 91 | 
 92 |             Assert.IsTrue(tokens[1].Start == 18);
 93 |             Assert.IsTrue(tokens[1].Text == "isn't");
 94 | 
 95 |             Assert.IsTrue(tokens[2].Start == 28);
 96 |             Assert.IsTrue(tokens[2].Text == "it?");
 97 |         }
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Tokenize/TreebankTokenizerTest.cs:
--------------------------------------------------------------------------------
  1 | ﻿using CherubNLP.Tokenize;
  2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
  3 | using System;
  4 | using System.Collections.Generic;
  5 | using System.Text;
  6 | 
  7 | namespace CherubNLP.UnitTest.Tokenize
  8 | {
  9 |     /// <summary>
 10 |     /// Checks the text and start offset of the tokens produced by <see cref="TreebankTokenizer"/>.
 11 |     /// </summary>
 12 |     [TestClass]
 13 |     public class TreebankTokenizerTest
 14 |     {
 15 |         /// <summary>
 16 |         /// Creates an English <see cref="TokenizerFactory"/> with default options and calls
 17 |         /// <c>GetTokenizer&lt;TreebankTokenizer&gt;()</c> on it, as every test here needs.
 18 |         /// </summary>
 19 |         /// <returns>The configured factory.</returns>
 20 |         private static TokenizerFactory CreateTokenizer()
 21 |         {
 22 |             var tokenizer = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
 23 |             tokenizer.GetTokenizer<TreebankTokenizer>();
 24 |             return tokenizer;
 25 |         }
 26 | 
 27 |         /// <summary>An opening quote mark is split from the word that follows it.</summary>
 28 |         [TestMethod]
 29 |         public void ReplaceStartingQuoting()
 30 |         {
 31 |             var tokens = CreateTokenizer().Tokenize("«Hello!");
 32 | 
 33 |             // Assert.AreEqual reports the expected and actual values when a check fails.
 34 |             Assert.AreEqual("«", tokens[0].Text);
 35 |             Assert.AreEqual(0, tokens[0].Start);
 36 | 
 37 |             Assert.AreEqual("Hello", tokens[1].Text);
 38 |             Assert.AreEqual(1, tokens[1].Start);
 39 | 
 40 |             Assert.AreEqual("!", tokens[2].Text);
 41 |             Assert.AreEqual(6, tokens[2].Start);
 42 |         }
 43 | 
 44 |         /// <summary>The contraction "n't" becomes its own token.</summary>
 45 |         [TestMethod]
 46 |         public void ReplaceEndingQuoting()
 47 |         {
 48 |             var tokens = CreateTokenizer().Tokenize("Aren't you");
 49 | 
 50 |             Assert.AreEqual("Are", tokens[0].Text);
 51 |             Assert.AreEqual(0, tokens[0].Start);
 52 | 
 53 |             Assert.AreEqual("n't", tokens[1].Text);
 54 |             Assert.AreEqual(3, tokens[1].Start);
 55 | 
 56 |             Assert.AreEqual("you", tokens[2].Text);
 57 |             Assert.AreEqual(7, tokens[2].Start);
 58 |         }
 59 | 
 60 |         /// <summary>An ellipsis is kept together as a single token.</summary>
 61 |         [TestMethod]
 62 |         public void ReplacePunctuation()
 63 |         {
 64 |             var tokens = CreateTokenizer().Tokenize("Hello World...");
 65 | 
 66 |             Assert.AreEqual("Hello", tokens[0].Text);
 67 |             Assert.AreEqual(0, tokens[0].Start);
 68 | 
 69 |             Assert.AreEqual("World", tokens[1].Text);
 70 |             Assert.AreEqual(6, tokens[1].Start);
 71 | 
 72 |             Assert.AreEqual("...", tokens[2].Text);
 73 |             Assert.AreEqual(11, tokens[2].Start);
 74 |         }
 75 | 
 76 |         /// <summary>Angle brackets and the final period are split from the enclosed word.</summary>
 77 |         [TestMethod]
 78 |         public void ReplaceBrackets()
 79 |         {
 80 |             var tokens = CreateTokenizer().Tokenize("<Hello.>");
 81 | 
 82 |             Assert.AreEqual("<", tokens[0].Text);
 83 |             Assert.AreEqual(0, tokens[0].Start);
 84 | 
 85 |             Assert.AreEqual("Hello", tokens[1].Text);
 86 |             Assert.AreEqual(1, tokens[1].Start);
 87 | 
 88 |             Assert.AreEqual(".", tokens[2].Text);
 89 |             Assert.AreEqual(6, tokens[2].Start);
 90 | 
 91 |             Assert.AreEqual(">", tokens[3].Text);
 92 |             Assert.AreEqual(7, tokens[3].Start);
 93 |         }
 94 | 
 95 |         /// <summary>"cannot" is split into "can" and "not".</summary>
 96 |         [TestMethod]
 97 |         public void ReplaceConventions()
 98 |         {
 99 |             var tokens = CreateTokenizer().Tokenize("I cannot jump.");
100 | 
101 |             Assert.AreEqual("I", tokens[0].Text);
102 |             Assert.AreEqual(0, tokens[0].Start);
103 | 
104 |             Assert.AreEqual("can", tokens[1].Text);
105 |             Assert.AreEqual(2, tokens[1].Start);
106 | 
107 |             Assert.AreEqual("not", tokens[2].Text);
108 |             Assert.AreEqual(5, tokens[2].Start);
109 | 
110 |             Assert.AreEqual("jump", tokens[3].Text);
111 |             Assert.AreEqual(9, tokens[3].Start);
112 | 
113 |             Assert.AreEqual(".", tokens[4].Text);
114 |             Assert.AreEqual(13, tokens[4].Start);
115 |         }
116 | 
117 |         /// <summary>Sentence-ending punctuation is split off in each of two sentences.</summary>
118 |         [TestMethod]
119 |         public void ReplaceConventionsIncludeMultipleSymbol()
120 |         {
121 |             var tokens = CreateTokenizer().Tokenize("I jump. And you?");
122 | 
123 |             Assert.AreEqual("I", tokens[0].Text);
124 |             Assert.AreEqual(0, tokens[0].Start);
125 | 
126 |             Assert.AreEqual("jump", tokens[1].Text);
127 |             Assert.AreEqual(2, tokens[1].Start);
128 | 
129 |             Assert.AreEqual(".", tokens[2].Text);
130 |             Assert.AreEqual(6, tokens[2].Start);
131 | 
132 |             Assert.AreEqual("And", tokens[3].Text);
133 |             Assert.AreEqual(8, tokens[3].Start);
134 | 
135 |             Assert.AreEqual("you", tokens[4].Text);
136 |             Assert.AreEqual(12, tokens[4].Start);
137 | 
138 |             Assert.AreEqual("?", tokens[5].Text);
139 |             Assert.AreEqual(15, tokens[5].Start);
140 |         }
141 |     }
142 | }
143 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Vector/FastTextTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using FastText.NetWrapper;
 2 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 3 | using System;
 4 | using System.Collections.Generic;
 5 | using System.IO;
 6 | using System.Linq;
 7 | using System.Text;
 8 | using Tensorflow.NumPy;
 9 | 
10 | namespace CherubNLP.UnitTest.Vector
11 | {
12 |     /// <summary>
13 |     /// Tests that use the model file "dbpedia.ftz" located in <c>dataDir</c>.
14 |     /// </summary>
15 |     [TestClass]
16 |     public class FastTextTest : TestEssential
17 |     {
18 |         /// <summary>
19 |         /// Loads the model and requests a sentence vector. The vector is not asserted on,
20 |         /// so this only checks that both calls complete without throwing.
21 |         /// </summary>
22 |         [TestMethod]
23 |         public void Word2Vec()
24 |         {
25 |             using (var fastText = new FastTextWrapper())
26 |             {
27 |                 fastText.LoadModel(Path.Combine(dataDir, "dbpedia.ftz"));
28 |                 var vector = fastText.GetSentenceVector("Can I use a larger crockpot than the recipe calls for?");
29 |             }
30 |         }
31 | 
32 |         /// <summary>
33 |         /// Scores six candidate sentences against one query with <c>Similarity.Cosine</c> and
34 |         /// compares the <c>np.argsort</c> ordering of the scores with a fixed index order.
35 |         /// </summary>
36 |         [TestMethod]
37 |         public void CosineSimilarity()
38 |         {
39 |             var similarities = Similarity.Cosine("We can use Cosine to compute the similarity of two hardcoded lists.", new[]
40 |             {
41 |                 "Cosine Similarity algorithm function sample.",
42 |                 "The Cosine Similarity function computes the similarity of two lists of numbers.",
43 |                 "Compute the similarity of two hardcoded lists.",
44 |                 "We can compute the similarity of two hardcoded lists.",
45 |                 "Coronavirus app could trace your contacts without sacrificing your privacy",
46 |                 "We can use Cosine to compute the similarity of two lists."
47 |             }, Path.Combine(dataDir, "dbpedia.ftz"));
48 | 
49 |             // NOTE(review): Assert.AreEqual is not an element-wise comparison for arrays; the
50 |             // expected int[] and the np.argsort result are different objects, so confirm this
51 |             // assertion can pass (CollectionAssert.AreEqual may be what was intended).
52 |             Assert.AreEqual(new[] { 5, 2, 3, 1, 4, 0 }, np.argsort(similarities));
53 |         }
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Vector/OneHotEncodingTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Corpus;
 2 | using CherubNLP.Tokenize;
 3 | using CherubNLP.Txt2Vec;
 4 | using Microsoft.Extensions.Configuration;
 5 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 6 | using System;
 7 | using System.Collections.Generic;
 8 | using System.IO;
 9 | using System.Linq;
10 | using System.Text;
11 | 
12 | namespace CherubNLP.UnitTest.Vector
13 | {
14 |     /// <summary>
15 |     /// Runs <see cref="OneHotEncoder"/> over the tokenized cooking.stackexchange corpus.
16 |     /// </summary>
17 |     [TestClass]
18 |     public class OneHotEncodingTest : TestEssential
19 |     {
20 |         /// <summary>
21 |         /// Reads the labeled corpus, tokenizes every sentence text, copies the labels onto
22 |         /// the tokenized sentences by position and encodes them. Nothing is asserted on
23 |         /// the encoder output.
24 |         /// </summary>
25 |         [TestMethod]
26 |         public void OneHotTest()
27 |         {
28 |             var corpusReader = new FasttextDataReader();
29 |             var corpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange");
30 |             var labeled = corpusReader.Read(new ReaderOptions
31 |             {
32 |                 DataDir = corpusDir,
33 |                 FileName = "cooking.stackexchange.txt"
34 |             });
35 | 
36 |             var factory = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
37 |             factory.GetTokenizer<TreebankTokenizer>();
38 | 
39 |             var texts = labeled.Select(s => s.Text).ToList();
40 |             var tokenized = factory.Tokenize(texts);
41 | 
42 |             // Carry each label over to the tokenized sentence at the same position.
43 |             for (int position = 0; position < tokenized.Count; position++)
44 |             {
45 |                 tokenized[position].Label = labeled[position].Label;
46 |             }
47 |             labeled = tokenized.ToList();
48 | 
49 |             var encoder = new OneHotEncoder
50 |             {
51 |                 Sentences = labeled
52 |             };
53 |             encoder.EncodeAll();
54 |         }
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/CherubNLP.UnitTest/Vector/Word2VecTest.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Txt2Vec;
 2 | using Microsoft.Extensions.Configuration;
 3 | using Microsoft.VisualStudio.TestTools.UnitTesting;
 4 | using System;
 5 | using System.Collections.Generic;
 6 | using System.IO;
 7 | using System.Linq;
 8 | using System.Text;
 9 | using Txt2Vec;
10 | 
11 | namespace CherubNLP.UnitTest.Vector
12 | {
13 |     /// <summary>
14 |     /// Exercises <see cref="VectorGenerator"/> with the model file "wordvec_enu.bin".
15 |     /// </summary>
16 |     [TestClass]
17 |     public class Word2VecTest : TestEssential
18 |     {
19 |         /// <summary>
20 |         /// Loads "wordvec_enu.bin" from the shared test data directory and calls
21 |         /// <c>Distance</c> for the words of a short sentence. Nothing is asserted on the
22 |         /// result, so the test only checks that the calls complete without throwing.
23 |         /// </summary>
24 |         [TestMethod]
25 |         public void Word2Vec()
26 |         {
27 |             string sentence = "stop this song";
28 |             List<string> words = sentence.Split(' ').ToList();
29 | 
30 |             // Resolve the model through dataDir (as FastTextTest does) instead of an absolute
31 |             // path inside one developer's user profile, which exists on no other machine.
32 |             Args args = new Args();
33 |             args.ModelFile = Path.Combine(dataDir, "wordvec_enu.bin");
34 |             VectorGenerator vg = new VectorGenerator(args);
35 | 
36 |             vg.Distance(words);
37 |         }
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/CherubNLP.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Version 17
 4 | VisualStudioVersion = 17.4.33213.308
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CherubNLP", "CherubNLP\CherubNLP.csproj", "{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}"
 7 | EndProject
 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CherubNLP.UnitTest", "CherubNLP.UnitTest\CherubNLP.UnitTest.csproj", "{958AC705-B9D7-4071-B135-048DE1EEE87A}"
 9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CherubNLP.Console", "CherubNLP.Console\CherubNLP.Console.csproj", "{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}"
11 | EndProject
12 | Global
13 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
14 | 		Debug|Any CPU = Debug|Any CPU
15 | 		Debug|x64 = Debug|x64
16 | 		GPU|Any CPU = GPU|Any CPU
17 | 		GPU|x64 = GPU|x64
18 | 		Release|Any CPU = Release|Any CPU
19 | 		Release|x64 = Release|x64
20 | 	EndGlobalSection
21 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
23 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|Any CPU.Build.0 = Debug|Any CPU
24 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|x64.ActiveCfg = Debug|x64
25 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Debug|x64.Build.0 = Debug|x64
26 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|Any CPU.ActiveCfg = Release|Any CPU
27 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|Any CPU.Build.0 = Release|Any CPU
28 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|x64.ActiveCfg = Release|x64
29 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.GPU|x64.Build.0 = Release|x64
30 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|Any CPU.ActiveCfg = Release|Any CPU
31 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|Any CPU.Build.0 = Release|Any CPU
32 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|x64.ActiveCfg = Release|Any CPU
33 | 		{5E7C15B9-FE16-474F-B8C4-C5DC41E656AA}.Release|x64.Build.0 = Release|Any CPU
34 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
35 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|Any CPU.Build.0 = Debug|Any CPU
36 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|x64.ActiveCfg = Debug|x64
37 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Debug|x64.Build.0 = Debug|x64
38 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|Any CPU.ActiveCfg = Release|Any CPU
39 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|Any CPU.Build.0 = Release|Any CPU
40 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|x64.ActiveCfg = Release|x64
41 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.GPU|x64.Build.0 = Release|x64
42 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|Any CPU.ActiveCfg = Release|Any CPU
43 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|Any CPU.Build.0 = Release|Any CPU
44 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|x64.ActiveCfg = Release|Any CPU
45 | 		{958AC705-B9D7-4071-B135-048DE1EEE87A}.Release|x64.Build.0 = Release|Any CPU
46 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
47 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|Any CPU.Build.0 = Debug|Any CPU
48 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|x64.ActiveCfg = Debug|x64
49 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Debug|x64.Build.0 = Debug|x64
50 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|Any CPU.ActiveCfg = Release|Any CPU
51 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|Any CPU.Build.0 = Release|Any CPU
52 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|x64.ActiveCfg = Release|x64
53 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.GPU|x64.Build.0 = Release|x64
54 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|Any CPU.ActiveCfg = Release|Any CPU
55 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|Any CPU.Build.0 = Release|Any CPU
56 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|x64.ActiveCfg = Release|Any CPU
57 | 		{98AB0B44-E5E2-4C7E-B541-9E4226B24E63}.Release|x64.Build.0 = Release|Any CPU
58 | 	EndGlobalSection
59 | 	GlobalSection(SolutionProperties) = preSolution
60 | 		HideSolutionNode = FALSE
61 | 	EndGlobalSection
62 | 	GlobalSection(ExtensibilityGlobals) = postSolution
63 | 		SolutionGuid = {26DCDD72-01C4-45FA-85B7-2BE26A7D153C}
64 | 	EndGlobalSection
65 | EndGlobal
66 | 


--------------------------------------------------------------------------------
/CherubNLP/CherubNLP.csproj:
--------------------------------------------------------------------------------
 1 | ﻿<Project Sdk="Microsoft.NET.Sdk">
 2 | 
 3 |   <PropertyGroup>
 4 |     <TargetFrameworks>netstandard2.0</TargetFrameworks>
 5 |     <Version>0.5.0</Version>
 6 |     <Authors>Haiping Chen</Authors>
 7 |     <RepositoryUrl>https://github.com/SciSharp/CherubNLP</RepositoryUrl>
 8 |     <RepositoryType>git</RepositoryType>
 9 |     <PackageTags>NLP</PackageTags>
10 |     <PackageLicenseUrl></PackageLicenseUrl>
11 |     <Copyright>Apache 2.0</Copyright>
12 |     <PackageProjectUrl>https://github.com/SciSharp/CherubNLP</PackageProjectUrl>
13 |     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
14 |     <PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
15 |     <PackageReleaseNotes>Add more stemming regex.
16 | Add Cosine Similarity algorithm.
17 | Upgrade FastText wrapper to v1.2.3.</PackageReleaseNotes>
18 |     <AssemblyVersion>0.5.0.0</AssemblyVersion>
19 |     <FileVersion>0.5.0.0</FileVersion>
20 |     <Company>https://github.com/SciSharp</Company>
21 |     <Description>.NET text analysis tool.
22 | Tokenize, Stemming  and Lemmatization.</Description>
23 |     <Platforms>AnyCPU;x64</Platforms>
24 |   </PropertyGroup>
25 | 
26 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
27 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
28 |   </PropertyGroup>
29 | 
30 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
31 |     <DefineConstants>DEBUG;TRACE</DefineConstants>
32 |   </PropertyGroup>
33 | 
34 |   <ItemGroup>
35 |     <PackageReference Include="Bigtree.Algorithm" Version="0.4.0" />
36 |     <PackageReference Include="FastText.NetWrapper" Version="1.3.0" />
37 |     <PackageReference Include="Newtonsoft.Json" Version="13.0.2" />
38 |     <PackageReference Include="TensorFlow.Keras" Version="0.10.2" />
39 |   </ItemGroup>
40 | 
41 | </Project>
42 | 


--------------------------------------------------------------------------------
/CherubNLP/Classify/ClassifierFactory.cs:
--------------------------------------------------------------------------------
 1 | ﻿using Bigtree.Algorithm.Features;
 2 | using CherubNLP.Tokenize;
 3 | using System;
 4 | using System.Collections.Generic;
 5 | using System.Linq;
 6 | using System.Reflection;
 7 | using System.Text;
 8 | 
 9 | namespace CherubNLP.Classify
10 | {
11 |     /// <summary>
12 |     /// Creates an <see cref="IClassifier"/> by type name and runs training and
13 |     /// classification against it with the options given to the constructor.
14 |     /// </summary>
15 |     /// <typeparam name="IFeatureExtractor">Feature extractor type; the constructor creates one instance.</typeparam>
16 |     public class ClassifierFactory<IFeatureExtractor>
17 |         where IFeatureExtractor : ITextFeatureExtractor, new()
18 |     {
19 |         private SupportedLanguage _lang;
20 |         private IClassifier _classifier;
21 |         private ClassifyOptions _options;
22 |         private IFeatureExtractor featureExtractor;
23 | 
24 |         /// <summary>Stores the options and language and creates the feature extractor.</summary>
25 |         public ClassifierFactory(ClassifyOptions options, SupportedLanguage lang)
26 |         {
27 |             _lang = lang;
28 |             _options = options;
29 |             featureExtractor = new IFeatureExtractor();
30 |         }
31 | 
32 |         /// <summary>
33 |         /// Instantiates the concrete <see cref="IClassifier"/> in the CherubNLP assembly whose
34 |         /// simple type name equals <paramref name="name"/> and makes it the current classifier.
35 |         /// </summary>
36 |         /// <param name="name">Simple type name of the classifier to create.</param>
37 |         /// <returns>The new classifier instance.</returns>
38 |         /// <exception cref="ArgumentException">The name is null or empty, or matches no classifier type.</exception>
39 |         public IClassifier GetClassifer(string name)
40 |         {
41 |             if (String.IsNullOrEmpty(name))
42 |             {
43 |                 throw new ArgumentException("A classifier type name is required.", nameof(name));
44 |             }
45 | 
46 |             // Only types implementing IClassifier are candidates, so an unrelated type that
47 |             // happens to share the name cannot cause an InvalidCastException below.
48 |             Type type = Assembly.Load(new AssemblyName("CherubNLP"))
49 |                 .GetTypes()
50 |                 .FirstOrDefault(x => !x.IsAbstract && x.Name == name && typeof(IClassifier).IsAssignableFrom(x));
51 | 
52 |             if (type == null)
53 |             {
54 |                 throw new ArgumentException($"No classifier named '{name}' was found in the CherubNLP assembly.", nameof(name));
55 |             }
56 | 
57 |             _classifier = (IClassifier)Activator.CreateInstance(type);
58 |             return _classifier;
59 |         }
60 | 
61 |         /// <summary>Trains the current classifier with the constructor options, then saves its model.</summary>
62 |         /// <exception cref="InvalidOperationException">No classifier has been created yet.</exception>
63 |         public void Train(List<Sentence> sentences)
64 |         {
65 |             EnsureClassifier();
66 |             _classifier.Train(sentences, _options);
67 |             _classifier.SaveModel(_options);
68 |         }
69 | 
70 |         /// <summary>
71 |         /// Loads the model and classifies the sentence. Only the model path, directory and
72 |         /// name are copied from the constructor options for this call.
73 |         /// </summary>
74 |         /// <returns>The classifier's results ordered by descending <c>Item2</c>.</returns>
75 |         /// <exception cref="InvalidOperationException">No classifier has been created yet.</exception>
76 |         public List<Tuple<string, double>> Classify(Sentence sentence)
77 |         {
78 |             EnsureClassifier();
79 | 
80 |             var options = new ClassifyOptions
81 |             {
82 |                 ModelFilePath = _options.ModelFilePath,
83 |                 ModelDir = _options.ModelDir,
84 |                 ModelName = _options.ModelName
85 |             };
86 | 
87 |             _classifier.LoadModel(options);
88 | 
89 |             return _classifier.Classify(sentence, options)
90 |                 .OrderByDescending(x => x.Item2)
91 |                 .ToList();
92 |         }
93 | 
94 |         /// <summary>Throws a descriptive exception when <see cref="GetClassifer"/> has not run yet.</summary>
95 |         private void EnsureClassifier()
96 |         {
97 |             if (_classifier == null)
98 |             {
99 |                 throw new InvalidOperationException("Call GetClassifer before Train or Classify.");
100 |             }
101 |         }
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/CherubNLP/Classify/ClassifyOptions.cs:
--------------------------------------------------------------------------------
 1 | ﻿using Bigtree.Algorithm.SVM;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Classify
 7 | {
 8 |     /// <summary>
 9 |     /// Options passed to the <see cref="IClassifier"/> methods Train, Classify,
10 |     /// SaveModel and LoadModel.
11 |     /// </summary>
12 |     public class ClassifyOptions
13 |     {
14 |         public string TrainingCorpusDir { get; set; }
15 |         // ModelFilePath, ModelDir and ModelName are the three values that
16 |         // ClassifierFactory.Classify copies into the options it hands to the classifier.
17 |         public string ModelFilePath { get; set; }
18 |         public string ModelDir { get; set; }
19 |         public string ModelName { get; set; }
20 |         public string Word2VecFilePath { get; set; }
21 | 
22 |         public string FeaturesFileName { get; set; }
23 |         public string FeaturesInTfIdfFileName { get; set; }
24 |         public string DictionaryFileName { get; set; }
25 |         public string CategoriesFileName { get; set; }
26 | 
27 |         // NOTE(review): "Predice" looks like a misspelling of "Predict"; the property is
28 |         // public, so renaming it would break callers.
29 |         public string PrediceOutputFile { get; set; }
30 |         public string TransformFilePath { get; set; }
31 |         public RangeTransform Transform { get; set; }
32 | 
33 |         /// <summary>
34 |         /// Feature dimension
35 |         /// </summary>
36 |         public int Dimension { get; set; }
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/CherubNLP/Classify/IClassifier.cs:
--------------------------------------------------------------------------------
 1 | ﻿using Bigtree.Algorithm.Features;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Classify
 7 | {
 8 |     /// <summary>
 9 |     /// Contract implemented by text classifiers: train on sentences, classify one
10 |     /// sentence, and save or load a model, each call taking a <see cref="ClassifyOptions"/>.
11 |     /// </summary>
12 |     public interface IClassifier
13 |     {
14 |         /// <summary>
15 |         /// Training by feature vector
16 |         /// </summary>
17 |         /// <param name="sentences">Training sentences.</param>
18 |         /// <param name="options">Options for this call.</param>
19 |         void Train(List<Sentence> sentences, ClassifyOptions options);
20 | 
21 |         /// <summary>
22 |         /// Predict by feature vector
23 |         /// </summary>
24 |         /// <param name="sentence">Sentence to classify.</param>
25 |         /// <param name="options">Options for this call.</param>
26 |         /// <returns>String/double pairs; ClassifierFactory.Classify sorts them by the double, descending.</returns>
27 |         List<Tuple<string, double>> Classify(Sentence sentence, ClassifyOptions options);
28 | 
29 |         /// <summary>
30 |         /// Saves the model. ClassifierFactory.Train calls this right after Train with the same options.
31 |         /// </summary>
32 |         /// <returns>NOTE(review): presumably the path of the saved model; confirm against implementations.</returns>
33 |         String SaveModel(ClassifyOptions options);
34 | 
35 |         /// <summary>
36 |         /// Loads the model. ClassifierFactory.Classify calls this before Classify with the same options.
37 |         /// </summary>
38 |         /// <returns>NOTE(review): presumably the loaded model object; confirm against implementations.</returns>
39 |         Object LoadModel(ClassifyOptions options);
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/CherubNLP/Classify/IEstimator.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Classify
 6 | {
 7 |     /// <summary>
 8 |     /// Interface that declares no members.
 9 |     /// NOTE(review): presumably a marker or placeholder for estimator types; confirm intended use.
10 |     /// </summary>
11 |     public interface IEstimator
12 |     {
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/CherubNLP/Classify/ITextFeatureExtractor.cs:
--------------------------------------------------------------------------------
 1 | ﻿using Bigtree.Algorithm.Features;
 2 | using CherubNLP.Tokenize;
 3 | using System;
 4 | using System.Collections.Generic;
 5 | using System.Text;
 6 | 
 7 | namespace CherubNLP.Classify
 8 | {
 9 |     /// <summary>
10 |     /// Featuring text
11 |     /// </summary>
12 |     public interface ITextFeatureExtractor
13 |     {
14 |         /// <summary>
15 |         /// Builds the features that describe the given tokens.
16 |         /// </summary>
17 |         /// <param name="words">Tokens to extract features from.</param>
18 |         /// <returns>The extracted features.</returns>
19 |         List<Feature> GetFeatures(List<Token> words);
20 | 
21 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/CherubNLP/Classify/SentenceFeatureExtractor.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Linq;
 4 | using System.Text;
 5 | using Bigtree.Algorithm.Features;
 6 | using CherubNLP.Tokenize;
 7 | 
 8 | namespace CherubNLP.Classify
 9 | {
10 |     /// <summary>
11 |     /// Produces one "contains ..." feature per distinct alphabetic token.
12 |     /// </summary>
13 |     public class SentenceFeatureExtractor : ITextFeatureExtractor
14 |     {
15 |         /// <summary>
16 |         /// For every token whose <c>IsAlpha</c> is true, adds a Feature built from
17 |         /// "contains " plus the lower-cased token text and the string "True".
18 |         /// Duplicates are removed with Distinct(), i.e. by the equality that
19 |         /// <see cref="Token"/> itself defines.
20 |         /// </summary>
21 |         /// <param name="words">Tokens of the sentence.</param>
22 |         /// <returns>The features, in token order.</returns>
23 |         public List<Feature> GetFeatures(List<Token> words)
24 |         {
25 |             var result = new List<Feature>();
26 | 
27 |             foreach (var token in words.Where(t => t.IsAlpha).Distinct())
28 |             {
29 |                 result.Add(new Feature($"contains {token.Text.ToLower()}", "True"));
30 |             }
31 | 
32 |             return result;
33 |         }
34 |     }
35 | }
36 | 

--------------------------------------------------------------------------------
/CherubNLP/Classify/WordFeatureExtractor.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | using Bigtree.Algorithm.Features;
 5 | using CherubNLP.Tokenize;
 6 | 
 7 | namespace CherubNLP.Classify
 8 | {
 9 |     /// <summary>
10 |     /// Describes a word by its first and last character.
11 |     /// </summary>
12 |     public class WordFeatureExtractor : ITextFeatureExtractor
13 |     {
14 |         /// <summary>
15 |         /// Builds three features from the text of the first token only: a constant
16 |         /// "alwayson", plus "startswith" and "endswith" holding the lower-cased first
17 |         /// and last character.
18 |         /// </summary>
19 |         /// <param name="words">Tokens; only element 0 is read.</param>
20 |         /// <returns>The three features, in the order listed above.</returns>
21 |         public List<Feature> GetFeatures(List<Token> words)
22 |         {
23 |             string word = words[0].Text;
24 | 
25 |             return new List<Feature>
26 |             {
27 |                 new Feature("alwayson", "True"),
28 |                 new Feature("startswith", word[0].ToString().ToLower()),
29 |                 new Feature("endswith", word[word.Length - 1].ToString().ToLower())
30 |             };
31 |         }
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/CherubNLP/Corpus/ConllReader.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.IO;
 5 | using System.Text;
 6 | 
 7 | namespace CherubNLP.Corpus
 8 | {
 9 |     /// <summary>
10 |     /// A corpus reader for CoNLL-style files.  These files consist of a
11 |     /// series of sentences, separated by blank lines. Each sentence is
12 |     /// encoded using a table (or "grid") of values, where each line
13 |     /// corresponds to a single word, and each column corresponds to an
14 |     /// annotation type. The set of columns used by CoNLL-style files can
15 |     /// vary from corpus to corpus.
16 |     /// </summary>
17 |     public class CoNLLReader
18 |     {
19 |         /// <summary>
20 |         /// Reads the file named by <c>options.FileName</c> inside <c>options.DataDir</c>.
21 |         /// A blank line ends the current sentence; any other line is split on spaces and
22 |         /// its first two columns become the token's <c>Text</c> and <c>Pos</c>.
23 |         /// </summary>
24 |         /// <param name="options">Supplies the directory and file name to read.</param>
25 |         /// <returns>The sentences in file order.</returns>
26 |         public List<Sentence> Read(ReaderOptions options)
27 |         {
28 |             var sentences = new List<Sentence>();
29 |             using (var reader = new StreamReader(Path.Combine(options.DataDir, options.FileName)))
30 |             {
31 |                 var sentence = new Sentence { Words = new List<Token>() };
32 | 
33 |                 // Loop on ReadLine() itself: testing EndOfStream after a read-ahead skipped
34 |                 // the final line of the file.
35 |                 string line;
36 |                 while ((line = reader.ReadLine()) != null)
37 |                 {
38 |                     if (String.IsNullOrEmpty(line))
39 |                     {
40 |                         sentences.Add(sentence);
41 |                         sentence = new Sentence { Words = new List<Token>() };
42 |                         continue;
43 |                     }
44 | 
45 |                     var columns = line.Split(' ');
46 |                     sentence.Words.Add(new Token
47 |                     {
48 |                         Text = columns[0],
49 |                         Pos = columns[1]
50 |                     });
51 |                 }
52 | 
53 |                 // A file that does not end with a blank line still has a sentence pending.
54 |                 if (sentence.Words.Count > 0)
55 |                 {
56 |                     sentences.Add(sentence);
57 |                 }
58 |             }
59 | 
60 |             return sentences;
61 |         }
62 |     }
63 | }
64 | 


--------------------------------------------------------------------------------
/CherubNLP/Corpus/FasttextDataReader.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.IO;
 4 | using System.Linq;
 5 | using System.Text;
 6 | using System.Text.RegularExpressions;
 7 | 
 8 | namespace CherubNLP.Corpus
 9 | {
10 |     /// <summary>
11 |     /// Fasttext labeled data reader
12 |     /// </summary>
13 |     public class FasttextDataReader
14 |     {
15 |         /// <summary>
16 |         /// Reads a file in which each line holds one or more labels
17 |         /// (<c>options.LabelPrefix</c> followed by non-space characters) and then the text.
18 |         /// One <see cref="Sentence"/> is produced per label, all sharing the line's text.
19 |         /// Empty lines and lines without any label are skipped.
20 |         /// </summary>
21 |         /// <param name="options">
22 |         /// Supplies directory, file name and label prefix; an empty <c>LabelPrefix</c> is
23 |         /// set to "__label__" on this object.
24 |         /// </param>
25 |         /// <returns>The labeled sentences in file order.</returns>
26 |         public List<Sentence> Read(ReaderOptions options)
27 |         {
28 |             if (String.IsNullOrEmpty(options.LabelPrefix))
29 |             {
30 |                 options.LabelPrefix = "__label__";
31 |             }
32 | 
33 |             // The prefix is literal text (its length is used to strip it below), so escape
34 |             // it before embedding it in the pattern.
35 |             var labelPattern = new Regex(Regex.Escape(options.LabelPrefix) + @"\S+");
36 | 
37 |             var sentences = new List<Sentence>();
38 |             using (var reader = new StreamReader(Path.Combine(options.DataDir, options.FileName)))
39 |             {
40 |                 string line;
41 |                 while ((line = reader.ReadLine()) != null)
42 |                 {
43 |                     if (String.IsNullOrEmpty(line))
44 |                     {
45 |                         continue;
46 |                     }
47 | 
48 |                     var labels = labelPattern.Matches(line).Cast<Match>().ToList();
49 |                     if (labels.Count == 0)
50 |                     {
51 |                         // No label to attach the text to; Last() used to throw here.
52 |                         continue;
53 |                     }
54 | 
55 |                     // The text starts one separator character after the last label. A line that
56 |                     // ends right after its label has no text rather than an out-of-range start.
57 |                     var lastLabel = labels[labels.Count - 1];
58 |                     int textStart = lastLabel.Index + lastLabel.Length + 1;
59 |                     string text = textStart < line.Length ? line.Substring(textStart) : String.Empty;
60 | 
61 |                     foreach (var label in labels)
62 |                     {
63 |                         sentences.Add(new Sentence
64 |                         {
65 |                             Label = label.Value.Substring(options.LabelPrefix.Length),
66 |                             Text = text
67 |                         });
68 |                     }
69 |                 }
70 |             }
71 | 
72 |             return sentences;
73 |         }
74 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/CherubNLP/Corpus/KaggleTextDataReader.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.IO;
 5 | using System.Linq;
 6 | using System.Text;
 7 | using System.Text.RegularExpressions;
 8 | 
 9 | namespace CherubNLP.Corpus
10 | {
11 |     /// <summary>
12 |     /// A corpus reader for Kaggle-style files.  These files consist of a
13 |     /// series of sentences, separated by blank lines.Each sentence is
14 |     /// encoded using a table(or "grid") of values, where each line
15 |     /// corresponds to a single word, and each column corresponds to an
16 |     /// annotation type.The set of columns used by Kaggle-style files can
17 |     /// vary from corpus to corpus;
18 |     /// </summary>
19 |     public class KaggleTextDataReader
20 |     {
21 |         public List<Sentence> Read(ReaderOptions options)
22 |         {
23 |             if (String.IsNullOrEmpty(options.DataDir))
24 |             {
25 |                 options.DataDir = AppContext.BaseDirectory;
26 |             }
27 | 
28 |             var sentences = new List<Sentence>();
29 |             using(StreamReader reader = new StreamReader(Path.Combine(options.DataDir, options.FileName)))
30 |             {
31 |                 // skip header
32 |                 string line = reader.ReadLine();
33 |                 line = reader.ReadLine();
34 | 
35 |                 while (!reader.EndOfStream)
36 |                 {
37 |                     var id = line.Substring(1, 7);
38 |                     var label = line.Substring(line.Length - 4, 3);
39 |                     var text = line.Substring(11, line.Length - 18);
40 | 
41 |                     sentences.Add(new Sentence
42 |                     {
43 |                         Id = id,
44 |                         Text = text,
45 |                         Label = label
46 |                     });
47 | 
48 |                     line = reader.ReadLine();
49 |                 }
50 |                 
51 |             }
52 | 
53 |             return sentences;
54 |         }
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/CherubNLP/Corpus/LabeledPerFileNameReader.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.IO;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Corpus
 7 | {
 8 |     /// <summary>
 9 |     /// It used to read labeled data which is seperated by file.
10 |     /// The same category data is in one file.
11 |     /// File name is the label.
12 |     /// </summary>
13 |     public class LabeledPerFileNameReader
14 |     {
15 |         public List<Sentence> Read(ReaderOptions options)
16 |         {
17 |             string label = options.FileName.Split('.')[0];
18 | 
19 |             var sentences = new List<Sentence>();
20 |             using (StreamReader reader = new StreamReader(Path.Combine(options.DataDir, options.FileName)))
21 |             {
22 |                 while (!reader.EndOfStream)
23 |                 {
24 |                     string line = reader.ReadLine();
25 |                     if (!String.IsNullOrEmpty(line))
26 |                     {
27 |                         sentences.Add(new Sentence
28 |                         {
29 |                             Label = label,
30 |                             Text = line
31 |                         });
32 |                     }
33 |                 }
34 |             }
35 | 
36 |             return sentences;
37 |         }
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/CherubNLP/Corpus/ReaderOptions.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Corpus
 6 | {
 7 |     public class ReaderOptions
 8 |     {
 9 |         public string DataDir { get; set; }
10 | 
11 |         public string FileName { get; set; }
12 | 
13 |         public string LabelPrefix { get; set; }
14 |     }
15 | }
16 | 


--------------------------------------------------------------------------------
/CherubNLP/Featuring/CountFeatureExtractor.cs:
--------------------------------------------------------------------------------
 1 | ﻿/*
 2 |  * CherubNLP Library
 3 |  * Copyright (C) 2018 Haiping Chen
 4 |  * 
 5 |  * This program is free software: you can redistribute it and/or modify
 6 |  * it under the terms of the GNU General Public License as published by
 7 |  * the Free Software Foundation, either version 3 of the License, or
 8 |  * (at your option) any later version.
 9 |  * 
10 |  * This program is distributed in the hope that it will be useful,
11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |  * GNU General Public License for more details.
14 |  * 
15 |  * You should have received a copy of the GNU General Public License
16 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 |  */
18 | 
19 | //using Bigtree.Algorithm.Matrix;
20 | using CherubNLP.Tokenize;
21 | using System;
22 | using System.Collections.Generic;
23 | using System.Linq;
24 | using System.Text;
25 | 
26 | namespace CherubNLP.Featuring
27 | {
28 |     /// <summary>
29 |     /// Convert a collection of text documents to a matrix of token counts
30 |     /// </summary>
31 |     public class CountFeatureExtractor : IFeatureExtractor
32 |     {
33 |         public int Dimension { get; set; }
34 |         public List<Sentence> Sentences { get; set; }
35 | 
36 |         public List<Tuple<string, int>> Dictionary { get; set; }
37 |         public List<string> Features { get; set; }
38 |         public string ModelFile { get; set; }
39 | 
40 |         public void Vectorize(List<string> features)
41 |         {
42 |             CalculateDictionary();
43 | 
44 |             int[][] vec = new int[Sentences.Count][];
45 | 
46 |             Sentences.ForEach(s =>
47 |             {
48 |                 s.Vector = new double[Features.Count];
49 |                 for (int i = 0; i < Features.Count; i++)
50 |                 {
51 |                     s.Vector[i] = s.Words.Count(w => w.Lemma == Features[i]);
52 |                 }
53 | 
54 |                 for (int i = 0; i < s.Words.Count; i++)
55 |                 {
56 |                     var dic = Dictionary.Find(x => x.Item1 == s.Words[i].Lemma);
57 |                     if(dic != null)
58 |                     {
59 |                         s.Words[i].Vector = s.Words.Count(w => w.Lemma == dic.Item1);
60 |                     }
61 |                 }
62 |             });
63 |         }
64 | 
65 |         private void CalculateDictionary()
66 |         {
67 |             if (Dictionary == null)
68 |             {
69 |                 List<Token> allWords = new List<Token>();
70 | 
71 |                 Sentences.ForEach(s =>
72 |                 {
73 |                     allWords.AddRange(s.Words);
74 |                 });
75 | 
76 |                 Features = allWords.Where(w => w.IsAlpha).Select(x => x.Lemma).Distinct().OrderBy(x => x).ToList();
77 | 
78 |                 Dictionary = new List<Tuple<string, int>>();
79 | 
80 |                 allWords.Select(x => x.Lemma)
81 |                     .Distinct()
82 |                     .OrderBy(x => x)
83 |                     .ToList()
84 |                     .ForEach(word =>
85 |                     {
86 |                         Dictionary.Add(new Tuple<string, int>(word, allWords.Count(x => x.Lemma == word)));
87 |                     });
88 |             }
89 |         }
90 |     }
91 | }
92 | 


--------------------------------------------------------------------------------
/CherubNLP/Featuring/IFeatureExtractor.cs:
--------------------------------------------------------------------------------
 1 | ﻿//using Bigtree.Algorithm.Matrix;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Featuring
 7 | {
 8 |     public interface IFeatureExtractor
 9 |     {
10 |         /// <summary>
11 |         /// Feature dimension size
12 |         /// </summary>
13 |         int Dimension { get; set; }
14 | 
15 |         /// <summary>
16 |         /// The whole corpus
17 |         /// </summary>
18 |         List<Sentence> Sentences { get; set; }
19 | 
20 |         /// <summary>
21 |         /// Feature names
22 |         /// </summary>
23 |         List<String> Features { get; set; }
24 | 
25 |         /// <summary>
26 |         /// All words and frequency
27 |         /// </summary>
28 |         List<Tuple<String, int>> Dictionary { get; set; }
29 | 
30 |         /// <summary>
31 |         /// Vectorize sentence
32 |         /// </summary>
33 |         void Vectorize(List<string> features);
34 | 
35 |         /// <summary>
36 |         /// Array shape
37 |         /// </summary>
38 |         //Shape Shape { get; set; }
39 | 
40 |         /// <summary>
41 |         /// Pre-trained model file path
42 |         /// </summary>
43 |         string ModelFile { get; set; }
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/CherubNLP/Featuring/Word2VecFeatureExtractor.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | //using Bigtree.Algorithm.Matrix;
 5 | using Txt2Vec;
 6 | 
 7 | namespace CherubNLP.Featuring
 8 | {
 9 |     public class Word2VecFeatureExtractor : IFeatureExtractor
10 |     {
11 |         public int Dimension { get; set; }
12 |         public List<Sentence> Sentences { get; set; }
13 |         public List<Tuple<string, int>> Dictionary { get; set; }
14 |         public List<string> Features { get; set; }
15 |         //public Shape Shape { get; set; }
16 |         public VectorGenerator Vg { get; set; }
17 |         public int SentenceVectorSize { get; set; }
18 |         public string ModelFile { get; set; }
19 | 
20 |         public void Vectorize(List<string> features)
21 |         {
22 |             Init();
23 | 
24 |             Sentences.ForEach(s => {
25 |                 List<string> wordLemmas = new List<string>();
26 |                 s.Words.ForEach(word => {
27 |                     if (features.Contains(word.Lemma))
28 |                     {
29 |                         wordLemmas.Add(word.Lemma);
30 |                     }
31 |                 });
32 |                 Vec sentenceVec = Vg.Sent2Vec(wordLemmas);
33 | 
34 |                 s.Vector = sentenceVec.VecNodes.ToArray();
35 |             });
36 | 
37 | 
38 |         }
39 | 
40 |         private void Init()
41 |         {
42 |             if(Vg == null)
43 |             {
44 |                 Args args = new Args();
45 |                 args.ModelFile = ModelFile;
46 |                 Vg = new VectorGenerator(args);
47 |                 SentenceVectorSize = this.Vg.Model.VectorSize;
48 |                 Features = new List<string>();
49 |                 for (int i = 0; i < SentenceVectorSize; i++)
50 |                 {
51 |                     Features.Add($"f-{i}");
52 |                 }
53 |             }
54 |         }
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Common/Counter.cs:
--------------------------------------------------------------------------------
  1 | ﻿using System;
  2 | using System.Collections.Generic;
  3 | using System.Linq;
  4 | 
  5 | namespace JiebaNet.Segmenter.Common
  6 | {
  7 |     public interface ICounter<T>
  8 |     {
  9 |         int Count { get; }
 10 |         int Total { get; }
 11 |         int this[T key] { get; set; }
 12 |         IEnumerable<KeyValuePair<T, int>> Elements { get; }
 13 | 
 14 |         /// <summary>
 15 |         /// Lists the n most common elements from the most common to the least.
 16 |         /// </summary>
 17 |         /// <param name="n">Number of elements, list all elements if n is less than 0.</param>
 18 |         /// <returns></returns>
 19 |         IEnumerable<KeyValuePair<T, int>> MostCommon(int n = -1);
 20 | 
 21 |         /// <summary>
 22 |         /// Subtracts items from a counter.
 23 |         /// </summary>
 24 |         /// <param name="items"></param>
 25 |         void Subtract(IEnumerable<T> items);
 26 | 
 27 |         /// <summary>
 28 |         /// Subtracts counts from another counter.
 29 |         /// </summary>
 30 |         /// <param name="other"></param>
 31 |         void Subtract(ICounter<T> other);
 32 | 
 33 |         /// <summary>
 34 |         /// Adds items to a counter.
 35 |         /// </summary>
 36 |         /// <param name="items"></param>
 37 |         void Add(IEnumerable<T> items);
 38 | 
 39 |         /// <summary>
 40 |         /// Adds another counter.
 41 |         /// </summary>
 42 |         /// <param name="other"></param>
 43 |         void Add(ICounter<T> other);
 44 | 
 45 |         /// <summary>
 46 |         /// Union is the maximum of value in either of the input <see cref="ICounter{T}"/>.
 47 |         /// </summary>
 48 |         /// <param name="other">The other counter.</param>
 49 |         ICounter<T> Union(ICounter<T> other);
 50 | 
 51 |         void Remove(T key);
 52 |         void Clear();
 53 |         bool Contains(T key);
 54 |     }
 55 | 
 56 |     public class Counter<T>: ICounter<T>
 57 |     {
 58 |         private Dictionary<T, int> data = new Dictionary<T, int>();
 59 | 
 60 |         public Counter() {}
 61 | 
 62 |         public Counter(IEnumerable<T> items)
 63 |         {
 64 |             CountItems(items);
 65 |         }
 66 | 
 67 |         public int Count => data.Count;
 68 |         public int Total => data.Values.Sum();
 69 |         public IEnumerable<KeyValuePair<T, int>> Elements => data;
 70 | 
 71 |         public int this[T key]
 72 |         {
 73 |             get => data.ContainsKey(key) ? data[key] : 0;
 74 |             set => data[key] = value;
 75 |         }
 76 | 
 77 |         public IEnumerable<KeyValuePair<T, int>> MostCommon(int n = -1)
 78 |         {
 79 |             var pairs = data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value);
 80 |             return n < 0 ? pairs : pairs.Take(n);
 81 |         }
 82 | 
 83 |         public void Subtract(IEnumerable<T> items)
 84 |         {
 85 |             SubtractItems(items);
 86 |         }
 87 | 
 88 |         public void Subtract(ICounter<T> other)
 89 |         {
 90 |             SubtractPairs(other.Elements);
 91 |         }
 92 | 
 93 |         public void Add(IEnumerable<T> items)
 94 |         {
 95 |             CountItems(items);
 96 |         }
 97 | 
 98 |         public void Add(ICounter<T> other)
 99 |         {
100 |             CountPairs(other.Elements);
101 |         }
102 | 
103 |         public ICounter<T> Union(ICounter<T> other)
104 |         {
105 |             var result = new Counter<T>();
106 |             foreach (var pair in data)
107 |             {
108 |                 var count = pair.Value;
109 |                 var otherCount = other[pair.Key];
110 |                 var newCount = count < otherCount ? otherCount : count;
111 |                 result[pair.Key] = newCount;
112 |             }
113 | 
114 |             foreach (var pair in other.Elements)
115 |             {
116 |                 if (!Contains(pair.Key))
117 |                 {
118 |                     result[pair.Key] = pair.Value;
119 |                 }
120 |             }
121 |             return result;
122 |         }
123 | 
124 |         public void Remove(T key)
125 |         {
126 |             if (data.ContainsKey(key))
127 |             {
128 |                 data.Remove(key);
129 |             }
130 |         }
131 | 
132 |         public void Clear()
133 |         {
134 |             data.Clear();
135 |         }
136 | 
137 |         public bool Contains(T key)
138 |         {
139 |             return data.ContainsKey(key);
140 |         }
141 | 
142 |         #region Private Methods
143 | 
144 |         private void CountItems(IEnumerable<T> items)
145 |         {
146 |             foreach (var item in items)
147 |             {
148 |                 data[item] = data.GetDefault(item, 0) + 1;
149 |             }
150 |         }
151 | 
152 |         private void CountPairs(IEnumerable<KeyValuePair<T, int>> pairs)
153 |         {
154 |             foreach (var pair in pairs)
155 |             {
156 |                 this[pair.Key] += pair.Value;
157 |             }
158 |         }
159 | 
160 |         private void SubtractItems(IEnumerable<T> items)
161 |         {
162 |             foreach (var item in items)
163 |             {
164 |                 data[item] = data.GetDefault(item, 0) - 1;
165 |             }
166 |         }
167 | 
168 |         private void SubtractPairs(IEnumerable<KeyValuePair<T, int>> pairs)
169 |         {
170 |             foreach (var pair in pairs)
171 |             {
172 |                 this[pair.Key] -= pair.Value;
173 |             }
174 |         }
175 | 
176 |         #endregion
177 |     }
178 | }
179 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Common/Extensions.cs:
--------------------------------------------------------------------------------
  1 | ﻿using System;
  2 | using System.Collections.Generic;
  3 | using System.Linq;
  4 | using System.Text.RegularExpressions;
  5 | 
  6 | namespace JiebaNet.Segmenter.Common
  7 | {
  8 |     public static class Extensions
  9 |     {
 10 |         private static readonly Regex RegexDigits = new Regex(@"\d+", RegexOptions.Compiled);
 11 |         private static readonly Regex RegexNewline = new Regex("(\r\n|\n|\r)", RegexOptions.Compiled);
 12 | 
 13 |         #region Objects
 14 | 
 15 |         public static bool IsNull(this object obj)
 16 |         {
 17 |             return obj == null;
 18 |         }
 19 | 
 20 |         public static bool IsNotNull(this object obj)
 21 |         {
 22 |             return obj != null;
 23 |         }
 24 | 
 25 |         #endregion
 26 | 
 27 | 
 28 |         #region Enumerable
 29 | 
 30 |         public static bool IsEmpty<T>(this IEnumerable<T> enumerable)
 31 |         {
 32 |             return (enumerable == null) || !enumerable.Any();
 33 |         }
 34 | 
 35 |         public static bool IsNotEmpty<T>(this IEnumerable<T> enumerable)
 36 |         {
 37 |             return (enumerable != null) && enumerable.Any();
 38 |         }
 39 | 
 40 |         public static TValue GetValueOrDefault<TKey, TValue>(this IDictionary<TKey, TValue> d, TKey key)
 41 |         {
 42 |             return d.ContainsKey(key) ? d[key] : default(TValue);
 43 |         }
 44 | 
 45 |         public static TValue GetDefault<TKey, TValue>(this IDictionary<TKey, TValue> dict, TKey key, TValue defaultValue)
 46 |         {
 47 |             if (dict.ContainsKey(key))
 48 |             {
 49 |                 return dict[key];
 50 |             }
 51 |             return defaultValue;
 52 |         }
 53 | 
 54 |         public static void Update<TKey, TValue>(this IDictionary<TKey, TValue> dict, IDictionary<TKey, TValue> other)
 55 |         {
 56 |             foreach (var key in other.Keys)
 57 |             {
 58 |                 dict[key] = other[key];
 59 |             }
 60 |         }
 61 | 
 62 |         #endregion
 63 | 
 64 |         #region String & Text
 65 | 
 66 |         public static string Left(this string s, int endIndex)
 67 |         {
 68 |             if (string.IsNullOrEmpty(s))
 69 |             {
 70 |                 return s;
 71 |             }
 72 | 
 73 |             return s.Substring(0, endIndex);
 74 |         }
 75 | 
 76 |         public static string Right(this string s, int startIndex)
 77 |         {
 78 |             if (string.IsNullOrEmpty(s))
 79 |             {
 80 |                 return s;
 81 |             }
 82 | 
 83 | 
 84 |             return s.Substring(startIndex);
 85 |         }
 86 | 
 87 |         public static string Sub(this string s, int startIndex, int endIndex)
 88 |         {
 89 |             return s.Substring(startIndex, endIndex - startIndex);
 90 |         }
 91 | 
 92 |         public static bool IsInt32(this string s)
 93 |         {
 94 |             return RegexDigits.IsMatch(s);
 95 |         }
 96 |         
 97 |         public static string[] SplitLines(this string s)
 98 |         {
 99 |             return RegexNewline.Split(s);
100 |         }
101 | 
102 |         public static string Join(this IEnumerable<string> inputs, string separator = ", ")
103 |         {
104 |             return string.Join(separator, inputs);
105 |         }
106 | 
107 |         public static IEnumerable<string> SubGroupValues(this GroupCollection groups)
108 |         {
109 |             var result = from Group g in groups
110 |                          select g.Value;
111 |             return result.Skip(1);
112 |         }
113 | 
114 |         #endregion
115 | 
116 |         #region Conversion
117 | 
118 |         public static int ToInt32(this char ch)
119 |         {
120 |             return ch;
121 |         }
122 | 
123 |         public static char ToChar(this int i)
124 |         {
125 |             return (char)i;
126 |         }
127 | 
128 |         #endregion
129 |     }
130 | }


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Common/FileExtension.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.IO;
 4 | using System.Reflection;
 5 | using System.Text;
 6 | 
 7 | namespace JiebaNet.Segmenter.Common
 8 | {
 9 |     public static class FileExtension
10 |     {
11 |         public static string ReadEmbeddedAllLine(string path)
12 |         {
13 |             return ReadEmbeddedAllLine(path, Encoding.UTF8);
14 |         }
15 | 
16 |         public static string ReadEmbeddedAllLine(string path,Encoding encoding)
17 |         {
18 |             using (var sr = new StreamReader(path))
19 |             {
20 |                 return sr.ReadToEnd();
21 |             }
22 |         }
23 | 
24 |         public static List<string> ReadEmbeddedAllLines(string path, Encoding encoding)
25 |         {
26 |             List<string> list = new List<string>();
27 |             using (var sr = new StreamReader(path))
28 |             {
29 |                 string item;
30 |                 while ((item = sr.ReadLine()) != null)
31 |                 {
32 |                     list.Add(item);
33 |                 }
34 |             }
35 |             return list;
36 |         }
37 | 
38 |         public static List<string> ReadEmbeddedAllLines(string path)
39 |         {
40 |             return ReadEmbeddedAllLines(path, Encoding.UTF8);
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Common/Trie.cs:
--------------------------------------------------------------------------------
  1 | ﻿using System;
  2 | using System.Collections.Generic;
  3 | using System.Linq;
  4 | 
  5 | namespace JiebaNet.Segmenter.Common
  6 | {
  7 |     // Refer to: https://github.com/brianfromoregon/trie
  8 |     public class TrieNode
  9 |     {
 10 |         public char Char { get; set; }
 11 |         public int Frequency { get; set; }
 12 |         public Dictionary<char, TrieNode> Children { get; set; }
 13 | 
 14 |         public TrieNode(char ch)
 15 |         {
 16 |             Char = ch;
 17 |             Frequency = 0;
 18 |             
 19 |             // TODO: or an empty dict?
 20 |             //Children = null;
 21 |         }
 22 | 
 23 |         public int Insert(string s, int pos, int freq = 1)
 24 |         {
 25 |             if (string.IsNullOrEmpty(s) || pos >= s.Length)
 26 |             {
 27 |                 return 0;
 28 |             }
 29 | 
 30 |             if (Children == null)
 31 |             {
 32 |                 Children = new Dictionary<char, TrieNode>();
 33 |             }
 34 | 
 35 |             var c = s[pos];
 36 |             if (!Children.ContainsKey(c))
 37 |             {
 38 |                 Children[c] = new TrieNode(c);
 39 |             }
 40 | 
 41 |             var curNode = Children[c];
 42 |             if (pos == s.Length - 1)
 43 |             {
 44 |                 curNode.Frequency += freq;
 45 |                 return curNode.Frequency;
 46 |             }
 47 | 
 48 |             return curNode.Insert(s, pos + 1, freq);
 49 |         }
 50 | 
 51 |         public TrieNode Search(string s, int pos)
 52 |         {
 53 |             if (string.IsNullOrEmpty(s))
 54 |             {
 55 |                 return null;
 56 |             }
 57 | 
 58 |             // if out of range or without any child nodes
 59 |             if (pos >= s.Length || Children == null)
 60 |             {
 61 |                 return null;
 62 |             }
 63 |             // if reaches the last char of s, it's time to make the decision.
 64 |             if (pos == s.Length - 1)
 65 |             {
 66 |                 return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null;
 67 |             }
 68 |             // continue if necessary.
 69 |             return Children.ContainsKey(s[pos]) ? Children[s[pos]].Search(s, pos + 1) : null;
 70 |         }
 71 |     }
 72 | 
 73 |     public interface ITrie
 74 |     {
 75 |         //string BestMatch(string word, long maxTime);
 76 |         bool Contains(string word);
 77 |         int Frequency(string word);
 78 |         int Insert(string word, int freq = 1);
 79 |         //bool Remove(string word);
 80 |         int Count { get; }
 81 |         int TotalFrequency { get; }
 82 |     }
 83 | 
 84 |     public class Trie : ITrie
 85 |     {
 86 |         private static readonly char RootChar = '\0';
 87 | 
 88 |         internal TrieNode Root;
 89 | 
 90 |         public int Count { get; private set; }
 91 |         public int TotalFrequency { get; private set; }
 92 | 
 93 |         public Trie()
 94 |         {
 95 |             Root = new TrieNode(RootChar);
 96 |             Count = 0;
 97 |         }
 98 | 
 99 |         public bool Contains(string word)
100 |         {
101 |             CheckWord(word);
102 | 
103 |             var node = Root.Search(word.Trim(), 0);
104 |             return node.IsNotNull() && node.Frequency > 0;
105 |         }
106 | 
107 |         public bool ContainsPrefix(string word)
108 |         {
109 |             CheckWord(word);
110 | 
111 |             var node = Root.Search(word.Trim(), 0);
112 |             return node.IsNotNull();
113 |         }
114 | 
115 |         public int Frequency(string word)
116 |         {
117 |             CheckWord(word);
118 | 
119 |             var node = Root.Search(word.Trim(), 0);
120 |             return node.IsNull() ? 0 : node.Frequency;
121 |         }
122 | 
123 |         public int Insert(string word, int freq = 1)
124 |         {
125 |             CheckWord(word);
126 | 
127 |             var i = Root.Insert(word.Trim(), 0, freq);
128 |             if (i > 0)
129 |             {
130 |                 TotalFrequency += freq;
131 |                 Count++;
132 |             }
133 | 
134 |             return i;
135 |         }
136 | 
137 |         public IEnumerable<char> ChildChars(string prefix)
138 |         {
139 |             var node = Root.Search(prefix.Trim(), 0);
140 |             return node.IsNull() || node.Children.IsNull() ? null : node.Children.Select(p => p.Key);
141 |         }
142 | 
143 |         private void CheckWord(string word)
144 |         {
145 |             if (string.IsNullOrWhiteSpace(word))
146 |             {
147 |                 throw new ArgumentException("word must not be null or whitespace");
148 |             }
149 |         }
150 |     }
151 | }
152 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/ConfigManager.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.IO;
 3 | 
 4 | namespace JiebaNet.Segmenter
 5 | {
 6 |     public class ConfigManager
 7 |     {
 8 |         public static string ConfigFileBaseDir
 9 |         {
10 |             get
11 |             {
12 |                 string path = String.Empty;
13 | 
14 |                 var dir = AppDomain.CurrentDomain.GetData("JiebaConfigFileDir");
15 |                 if (dir == null)
16 |                 {
17 |                     path = "Resources";
18 |                 }
19 |                 else
20 |                 {
21 |                     path = Path.Combine(dir.ToString(), "Resources");
22 |                 }
23 | 
24 |                 return path;
25 |             }
26 |         }
27 | 
28 |         public static string MainDictFile
29 |         {
30 |             get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); }
31 |         }
32 | 
33 |         public static string ProbTransFile
34 |         {
35 |             get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); }
36 |         }
37 | 
38 |         public static string ProbEmitFile
39 |         {
40 |             get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); }
41 |         }
42 | 
43 |         public static string PosProbStartFile
44 |         {
45 |             get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); }
46 |         }
47 | 
48 |         public static string PosProbTransFile
49 |         {
50 |             get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); }
51 |         }
52 | 
53 |         public static string PosProbEmitFile
54 |         {
55 |             get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); }
56 |         }
57 | 
58 |         public static string CharStateTabFile
59 |         {
60 |             get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); }
61 |         }
62 |     }
63 | }


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Constants.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System.Collections.Generic;
 2 | using System.Linq;
 3 | 
 4 | namespace JiebaNet.Segmenter
 5 | {
 6 |     public class Constants
 7 |     {
 8 |         public static readonly double MinProb = -3.14e100;
 9 | 
10 |         public static readonly List<string> NounPos = new List<string>() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" };
11 |         public static readonly List<string> VerbPos = new List<string>() { "v", "vd", "vg", "vi", "vn", "vq" };
12 |         public static readonly List<string> NounAndVerbPos = NounPos.Union(VerbPos).ToList();
13 |         public static readonly List<string> IdiomPos = new List<string>() { "i" };
14 |     }
15 | }
16 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/DefaultDictionary.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Linq;
 4 | using System.Text;
 5 | using System.Threading.Tasks;
 6 | 
 7 | namespace JiebaNet.Segmenter
 8 | {
 9 |     public class DefaultDictionary<TKey, TValue> : Dictionary<TKey, TValue>
10 |     {
11 |         public new TValue this[TKey key]
12 |         {
13 |             get
14 |             {
15 |                 if (!ContainsKey(key))
16 |                 {
17 |                     Add(key, default(TValue));
18 |                 }
19 |                 return base[key];
20 |             }
21 |             set { base[key] = value; }
22 |         }
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/FinalSeg/IFinalSeg.cs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/CherubNLP/Jieba.NET/FinalSeg/IFinalSeg.cs


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/JiebaTagger.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP;
 2 | using CherubNLP.Tag;
 3 | using JiebaNet.Segmenter.PosSeg;
 4 | using System;
 5 | using System.Collections.Generic;
 6 | using System.IO;
 7 | using System.Linq;
 8 | using System.Text;
 9 | 
10 | namespace BotSharp.Core.Engines.Jieba.NET
11 | {
12 |     public class JiebaTagger : ITagger
13 |     {
14 |         private PosSegmenter posSeg;
15 | 
16 |         public void Tag(Sentence sentence, TagOptions options)
17 |         {
18 |             Init();
19 | 
20 |             var tokens = posSeg.Cut(sentence.Text).ToList();
21 | 
22 |             for(int i = 0; i < sentence.Words.Count; i++)
23 |             {
24 |                 sentence.Words[i].Pos = tokens[i].Flag;
25 |                 sentence.Words[i].Tag = tokens[i].Flag;
26 |             }
27 |         }
28 | 
29 |         public void Train(List<Sentence> sentences, TagOptions options)
30 |         {
31 |             
32 |         }
33 | 
34 |         private void Init()
35 |         {
36 |             if (posSeg == null)
37 |             {
38 |                 string contentDir = AppDomain.CurrentDomain.GetData("DataPath").ToString();
39 |                 AppDomain.CurrentDomain.SetData("JiebaConfigFileDir", contentDir);
40 | 
41 |                 posSeg = new PosSegmenter();
42 |             }
43 |         }
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/JiebaTokenizer.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using JiebaNet.Segmenter;
 3 | using System;
 4 | using System.Collections.Generic;
 5 | using System.IO;
 6 | using System.Linq;
 7 | using System.Text;
 8 | using Token = CherubNLP.Tokenize.Token;
 9 | 
10 | namespace BotSharp.Core.Engines.Jieba.NET
11 | {
12 |     public class JiebaTokenizer : TokenizerBase, ITokenizer
13 |     {
14 |         private JiebaSegmenter segmenter;
15 | 
16 |         public List<Token> Tokenize(string sentence, TokenizationOptions options)
17 |         {
18 |             Init();
19 | 
20 |             var tokens = segmenter.Cut(sentence)
21 |                 .Select(x => new Token
22 |                 {
23 |                     Text = x
24 |                 }).ToList();
25 | 
26 |             CorrectTokenPosition(sentence, tokens);
27 | 
28 |             return tokens;
29 |         }
30 | 
31 |         private void Init()
32 |         {
33 |             if (segmenter == null)
34 |             {
35 |                 string contentDir = AppDomain.CurrentDomain.GetData("DataPath").ToString();
36 |                 AppDomain.CurrentDomain.SetData("JiebaConfigFileDir", contentDir);
37 | 
38 |                 segmenter = new JiebaSegmenter();
39 |                 segmenter.LoadUserDict(Path.Combine(contentDir, "userdict.txt"));
40 |             }
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Node.cs:
--------------------------------------------------------------------------------
 1 | ﻿namespace JiebaNet.Segmenter
 2 | {
 3 |     public class Node
 4 |     {
 5 |         public char Value { get; private set; }
 6 |         public Node Parent { get; private set; }
 7 | 
 8 |         public Node(char value, Node parent)
 9 |         {
10 |             Value = value;
11 |             Parent = parent;
12 |         }
13 |     }
14 | }


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Pair.cs:
--------------------------------------------------------------------------------
 1 | ﻿namespace JiebaNet.Segmenter
 2 | {
 3 |     public class Pair<TKey>
 4 |     {
 5 |         public TKey Key { get;set; }
 6 |         public double Freq { get; set; }
 7 | 
 8 |         public Pair(TKey key, double freq)
 9 |         {
10 |             Key = key;
11 |             Freq = freq;
12 |         }
13 | 
14 |         public override string ToString()
15 |         {
16 |             return "Candidate [Key=" + Key + ", Freq=" + Freq + "]";
17 |         }
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/PosSeg/Pair.cs:
--------------------------------------------------------------------------------
 1 | ﻿namespace JiebaNet.Segmenter.PosSeg
 2 | {
 3 |     public class Pair
 4 |     {
 5 |         public string Word { get; set; }
 6 |         public string Flag { get; set; }
 7 |         public Pair(string word, string flag)
 8 |         {
 9 |             Word = word;
10 |             Flag = flag;
11 |         }
12 | 
13 |         public override string ToString()
14 |         {
15 |             return string.Format("{0}/{1}", Word, Flag);
16 |         }
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/README.rst:
--------------------------------------------------------------------------------
1 | ﻿BotSharp uses the jieba.NetCore to do tokenization. (https://github.com/1483523635/jieba.NetCore)
2 | 
Please follow the installation instructions (https://github.com/anderscui/jieba.NET/).
4 | 


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/Token.cs:
--------------------------------------------------------------------------------
 1 | ﻿namespace JiebaNet.Segmenter
 2 | {
 3 |     public class Token
 4 |     {
 5 |         public string Word { get; set; }
 6 |         public int StartIndex { get; set; }
 7 |         public int EndIndex { get; set; }
 8 | 
 9 |         public Token(string word, int startIndex, int endIndex)
10 |         {
11 |             Word = word;
12 |             StartIndex = startIndex;
13 |             EndIndex = endIndex;
14 |         }
15 | 
16 |         public override string ToString()
17 |         {
18 |             return string.Format("[{0}, ({1}, {2})]", Word, StartIndex, EndIndex);
19 |         }
20 |     }
21 | }


--------------------------------------------------------------------------------
/CherubNLP/Jieba.NET/WordDictionary.cs:
--------------------------------------------------------------------------------
  1 | using System;
  2 | using System.Collections.Generic;
  3 | using System.Diagnostics;
  4 | using System.IO;
  5 | using System.Linq;
  6 | using System.Text;
  7 | using JiebaNet.Segmenter.Common;
  8 | using System.Reflection;
  9 | 
 10 | namespace JiebaNet.Segmenter
 11 | {
 12 |     public class WordDictionary
 13 |     {
 14 |         private static readonly Lazy<WordDictionary> lazy = new Lazy<WordDictionary>(() => new WordDictionary());
 15 |         private static readonly string MainDict = ConfigManager.MainDictFile;
 16 | 
 17 |         internal IDictionary<string, int> Trie = new Dictionary<string, int>();
 18 | 
 19 |         /// <summary>
 20 |         /// total occurrence of all words.
 21 |         /// </summary>
 22 |         public double Total { get; set; }
 23 | 
 24 |         private WordDictionary()
 25 |         {
 26 |             LoadDict();
 27 | 
 28 |             Debug.WriteLine("{0} words (and their prefixes)", Trie.Count);
 29 |             Debug.WriteLine("total freq: {0}", Total);
 30 |         }
 31 | 
 32 |         public static WordDictionary Instance
 33 |         {
 34 |             get { return lazy.Value; }
 35 |         }
 36 | 
 37 |         private void LoadDict()
 38 |         {
 39 |             try
 40 |             {
 41 |                 var stopWatch = new Stopwatch();
 42 |                 stopWatch.Start();
 43 |                 var filePath = ConfigManager.MainDictFile;
 44 | 
 45 |                 using (var sr = new StreamReader(filePath))
 46 |                 {
 47 |                     string line = null;
 48 |                     while ((line = sr.ReadLine()) != null)
 49 |                     {
 50 |                         var tokens = line.Split(' ');
 51 |                         if (tokens.Length < 2)
 52 |                         {
 53 |                             Debug.Fail(string.Format("Invalid line: {0}", line));
 54 |                             continue;
 55 |                         }
 56 | 
 57 |                         var word = tokens[0];
 58 |                         var freq = int.Parse(tokens[1]);
 59 | 
 60 |                         Trie[word] = freq;
 61 |                         Total += freq;
 62 | 
 63 |                         foreach (var ch in Enumerable.Range(0, word.Length))
 64 |                         {
 65 |                             var wfrag = word.Sub(0, ch + 1);
 66 |                             if (!Trie.ContainsKey(wfrag))
 67 |                             {
 68 |                                 Trie[wfrag] = 0;
 69 |                             }
 70 |                         }
 71 |                     }
 72 |                 }
 73 | 
 74 |                 stopWatch.Stop();
 75 |                 Debug.WriteLine("main dict load finished, time elapsed {0} ms", stopWatch.ElapsedMilliseconds);
 76 |             }
 77 |             catch (IOException e)
 78 |             {
 79 |                 Debug.Fail(string.Format("{0} load failure, reason: {1}", MainDict, e.Message));
 80 |             }
 81 |             catch (FormatException fe)
 82 |             {
 83 |                 Debug.Fail(fe.Message);
 84 |             }
 85 |         }
 86 | 
 87 |         public bool ContainsWord(string word)
 88 |         {
 89 |             return Trie.ContainsKey(word) && Trie[word] > 0;
 90 |         }
 91 | 
 92 |         public int GetFreqOrDefault(string key)
 93 |         {
 94 |             if (ContainsWord(key))
 95 |                 return Trie[key];
 96 |             else
 97 |                 return 1;
 98 |         }
 99 | 
100 |         public void AddWord(string word, int freq, string tag = null)
101 |         {
102 |             if (ContainsWord(word))
103 |             {
104 |                 Total -= Trie[word];
105 |             }
106 | 
107 |             Trie[word] = freq;
108 |             Total += freq;
109 |             for (var i = 0; i < word.Length; i++)
110 |             {
111 |                 var wfrag = word.Substring(0, i + 1);
112 |                 if (!Trie.ContainsKey(wfrag))
113 |                 {
114 |                     Trie[wfrag] = 0;
115 |                 }
116 |             }
117 |         }
118 | 
119 |         public void DeleteWord(string word)
120 |         {
121 |             AddWord(word, 0);
122 |         }
123 | 
124 |         internal int SuggestFreq(string word, IEnumerable<string> segments)
125 |         {
126 |             double freq = 1;
127 |             foreach (var seg in segments)
128 |             {
129 |                 freq *= GetFreqOrDefault(seg) / Total;
130 |             }
131 | 
132 |             return Math.Max((int)(freq * Total) + 1, GetFreqOrDefault(word));
133 |         }
134 |     }
135 | }


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/BasicContextGenerator.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the BasicContextGenerator.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | // Copyright (C) 2001 Jason Baldridge
21 | //
22 | // This library is free software; you can redistribute it and/or
23 | // modify it under the terms of the GNU Lesser General Public
24 | // License as published by the Free Software Foundation; either
25 | // version 2.1 of the License, or (at your option) any later version.
26 | //
27 | // This library is distributed in the hope that it will be useful,
28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | // GNU Lesser General Public License for more details.
31 | //
32 | // You should have received a copy of the GNU Lesser General Public
33 | // License along with this program; if not, write to the Free Software
34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
35 | 
36 | using System;
37 | 
38 | namespace CherubNLP.Models
39 | {
40 | 	/// <summary>
41 | 	/// Generate contexts for maxent decisions, assuming that the input
42 | 	/// given to the GetContext() method is a string containing contextual
43 | 	/// predicates separated by spaces, e.g:
44 | 	/// <p>
45 | 	/// cp_1 cp_2 ... cp_n
46 | 	/// </p>
47 | 	/// </summary>
48 | 	/// <author>
49 | 	/// Jason Baldridge
50 | 	/// </author>
51 | 	/// <author>
52 | 	/// Richard J. Northedge
53 | 	/// </author>
54 | 	/// <version>based on BasicContextGenerator.java, $Revision: 1.2 $, $Date: 2002/04/30 08:48:35 $
55 | 	/// </version>
56 | 	public class BasicContextGenerator : IContextGenerator<string>
57 | 	{
58 | 		/// <summary>
59 | 		/// Builds up the list of contextual predicates given a string.
60 | 		/// </summary>
61 | 		/// <param name="input">
62 | 		/// string with contextual predicates separated by spaces.
63 | 		/// </param>
64 | 		/// <returns>string array of contextual predicates.</returns>
65 | 		public virtual string[] GetContext(string input)
66 | 		{
67 | 			return input.Split(' ');
68 | 		}
69 | 	}
70 | }
71 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/BasicEventReader.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | //This file is based on the BasicEventStream.java source file found in the
 18 | //original java implementation of MaxEnt.  That source file contains the following header:
 19 | 
 20 | // Copyright (C) 2001 Jason Baldridge
 21 | //
 22 | // This library is free software; you can redistribute it and/or
 23 | // modify it under the terms of the GNU Lesser General Public
 24 | // License as published by the Free Software Foundation; either
 25 | // version 2.1 of the License, or (at your option) any later version.
 26 | //
 27 | // This library is distributed in the hope that it will be useful,
 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 30 | // GNU Lesser General Public License for more details.
 31 | //
 32 | // You should have received a copy of the GNU Lesser General Public
 33 | // License along with this program; if not, write to the Free Software
 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 35 | 
 36 | using System;
 37 | 
 38 | namespace CherubNLP.Models
 39 | {
 40 | 	/// <summary>
 41 | 	/// An object which can deliver a stream of training events assuming
 42 | 	/// that each event is represented as a space separated list containing
 43 | 	/// all the contextual predicates, with the last item being the
 44 | 	/// outcome, e.g.: 
 45 | 	/// 
 46 | 	/// <p> cp_1 cp_2 ... cp_n outcome</p>
 47 | 	/// </summary>
	public class BasicEventReader : ITrainingEventReader
	{
		private IContextGenerator<string> mContext;
		private ITrainingDataReader<string> mDataReader;
		// Event that the next call to ReadNextEvent will hand out; null when none is buffered.
		private TrainingEvent mNextEvent;

		/// <summary>
		/// Creates the reader over a stream of training data and buffers the first
		/// event if the stream has a token.
		/// </summary>
		/// <param name="dataReader">
		/// Stream of training data.
		/// </param>
		public BasicEventReader(ITrainingDataReader<string> dataReader)
		{
			mContext = new BasicContextGenerator();
			mDataReader = dataReader;

			if (mDataReader.HasNext())
			{
				mNextEvent = CreateEvent(mDataReader.NextToken());
			}
		}

		/// <summary>
		/// Returns the buffered event and advances the reader by one token.
		/// </summary>
		/// <returns>
		/// The next event, or null when no further event could be built.
		/// </returns>
		public virtual TrainingEvent ReadNextEvent()
		{
			SkipToBufferedEvent();

			TrainingEvent result = mNextEvent;
			mNextEvent = mDataReader.HasNext()
				? CreateEvent(mDataReader.NextToken())
				: null;
			return result;
		}

		/// <summary>
		/// Tests whether another event is available, consuming tokens that do not
		/// yield an event.
		/// </summary>
		/// <returns>
		/// true if an event is buffered after skipping.
		/// </returns>
		public virtual bool HasNext()
		{
			SkipToBufferedEvent();
			return mNextEvent != null;
		}

		// Reads tokens until one produces an event or the data reader is exhausted.
		private void SkipToBufferedEvent()
		{
			while (mNextEvent == null && mDataReader.HasNext())
			{
				mNextEvent = CreateEvent(mDataReader.NextToken());
			}
		}

		// Builds an event from "cp_1 ... cp_n outcome": the text after the last space is the
		// outcome, the text before it is the context. Returns null when there is no space.
		private TrainingEvent CreateEvent(string observation)
		{
			int separatorIndex = observation.LastIndexOf(' ');
			if (separatorIndex == -1)
			{
				return null;
			}

			string outcome = observation.Substring(separatorIndex + 1);
			string[] context = mContext.GetContext(observation.Substring(0, separatorIndex));
			return new TrainingEvent(outcome, context);
		}
	}
124 | }
125 | 
126 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IContextGenerator.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the ContextGenerator.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
21 | //
22 | // This library is free software; you can redistribute it and/or
23 | // modify it under the terms of the GNU Lesser General Public
24 | // License as published by the Free Software Foundation; either
25 | // version 2.1 of the License, or (at your option) any later version.
26 | //
27 | // This library is distributed in the hope that it will be useful,
28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | // GNU General Public License for more details.
31 | //
32 | // You should have received a copy of the GNU Lesser General Public
33 | // License along with this program; if not, write to the Free Software
34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
35 |  
36 | using System;
37 | 
38 | namespace CherubNLP.Models
39 | {
40 | 	/// <summary> 
41 | 	/// Generate contexts for maximum entropy decisions.
42 | 	/// </summary>
43 | 	/// <author>
44 | 	/// Jason Baldridge
45 | 	/// </author>
46 | 	/// <author>
47 | 	/// Richard J. Northedge
48 | 	/// </author>
49 | 	/// <version>
50 | 	/// based on ContextGenerator.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
51 | 	/// </version>
	public interface IContextGenerator
	{
		/// <summary>
		/// Builds up the list of contextual predicates given an object.
		/// </summary>
		/// <param name="input">
		/// The object to build contextual predicates from.
		/// </param>
		/// <returns>string array of contextual predicates.</returns>
		string[] GetContext(object input);
	}
59 | 
60 |     /// <summary> 
61 |     /// Generate contexts for maximum entropy decisions.
62 |     /// </summary>
    public interface IContextGenerator<T>
    {
        /// <summary>
        /// Builds up the list of contextual predicates given an object of type T.
        /// </summary>
        /// <param name="input">
        /// The object of type T to build contextual predicates from.
        /// </param>
        /// <returns>string array of contextual predicates.</returns>
        string[] GetContext(T input);
    }
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IMaximumEntropyModel.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | //This file is based on the MaxentModel.java source file found in the
 18 | //original java implementation of MaxEnt.  That source file contains the following header:
 19 | 
 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
 21 | //
 22 | // This library is free software; you can redistribute it and/or
 23 | // modify it under the terms of the GNU Lesser General Public
 24 | // License as published by the Free Software Foundation; either
 25 | // version 2.1 of the License, or (at your option) any later version.
 26 | //
 27 | // This library is distributed in the hope that it will be useful,
 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 30 | // GNU General Public License for more details.
 31 | //
 32 | // You should have received a copy of the GNU Lesser General Public
 33 | // License along with this program; if not, write to the Free Software
 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 35 | 
 36 | using System;
 37 | 
 38 | namespace CherubNLP.Models
 39 | {
 40 | 	/// <summary>
 41 | 	/// Interface for maximum entropy models.
 42 | 	/// </summary>
 43 | 	/// <author>
 44 | 	/// Jason Baldridge
 45 | 	/// </author>
 46 | 	/// <author>
 47 | 	/// Richard J. Northedge
 48 | 	/// </author>
 49 | 	/// <version>    
 50 | 	/// based on MaxentModel.java, $Revision: 1.4 $, $Date: 2003/12/09 23:13:53 $
 51 | 	/// </version>
	public interface IMaximumEntropyModel
	{
		/// <summary>
		/// Returns the number of outcomes for this model.
		/// </summary>
		/// <value>
		/// The number of outcomes.
		/// </value>
		int OutcomeCount
		{
			get;
		}

		/// <summary>
		/// Evaluates a context.
		/// </summary>
		/// <param name="context">
		/// A list of string names of the contextual predicates
		/// which are to be evaluated together.
		/// </param>
		/// <returns>
		/// An array of the probabilities for each of the different
		/// outcomes, all of which sum to 1.
		/// </returns>
		double[] Evaluate(string[] context);

		/// <summary>
		/// Evaluates a context.
		/// </summary>
		/// <param name="context">
		/// A list of string names of the contextual predicates
		/// which are to be evaluated together.
		/// </param>
		/// <param name="probabilities">
		/// An array which is populated with the probabilities for each of the different
		/// outcomes, all of which sum to 1.
		/// </param>
		/// <returns>
		/// An array of the probabilities for each of the different
		/// outcomes, all of which sum to 1.  The <code>probabilities</code> array is returned if it is appropriately sized.
		/// </returns>
		double[] Evaluate(string[] context, double[] probabilities);

		/// <summary>
		/// Simple function to return the outcome associated with the index
		/// containing the highest probability in the double[].
		/// </summary>
		/// <param name="outcomes">
		/// A <code>double[]</code> as returned by the
		/// <code>Evaluate(string[] context)</code>
		/// method.
		/// </param>
		/// <returns>
		/// The string name of the best outcome.
		/// </returns>
		string GetBestOutcome(double[] outcomes);

		/// <summary>
		/// Return a string matching all the outcome names with all the
		/// probabilities produced by the <code>Evaluate(string[]
		/// context)</code> method.
		/// </summary>
		/// <param name="outcomes">
		/// A <code>double[]</code> as returned by the
		/// <code>Evaluate(string[] context)</code>
		/// method.
		/// </param>
		/// <returns>
		/// String containing outcome names paired with the normalized
		/// probability (contained in the <code>outcomes</code> array)
		/// for each one.
		/// </returns>
		string GetAllOutcomes(double[] outcomes);

		/// <summary>
		/// Gets the string name of the outcome associated with the supplied index.
		/// </summary>
		/// <param name="index">
		/// The index for which the name of the associated outcome is desired.
		/// </param>
		/// <returns>
		/// The string name of the outcome.
		/// </returns>
		string GetOutcomeName(int index);

		/// <summary>
		/// Gets the index associated with the string name of the given
		/// outcome.
		/// </summary>
		/// <param name="outcome">
		/// The string name of the outcome for which the
		/// index is desired.
		/// </param>
		/// <returns>
		/// The index if the given outcome label exists for this
		/// model, -1 if it does not.
		/// </returns>
		int GetOutcomeIndex(string outcome);
	}
151 | }
152 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/IGisModelReader.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file has no equivalent in the java MaxEnt library, because the link
18 | //between GISModel and GISModelReader is implemented differently there.  This
19 | //interface is designed so that GIS model reader classes can hold some or all of 
20 | //their data in persistent storage rather than in memory.
21 | 
22 | using System;
23 | using System.Collections.Generic;
24 | 
25 | namespace CherubNLP.Models.IO
26 | {
27 | 	/// <summary> 
28 | 	/// Interface for readers of GIS models.
29 | 	/// </summary>
	public interface IGisModelReader
	{
		/// <summary>
		/// Returns the value of the model's correction constant.  This property should
		/// usually only be accessed by GIS model writer classes via the GisModel class.
		/// </summary>
		/// <value>The correction constant, as an integer.</value>
		int CorrectionConstant
		{
			get;
		}

		/// <summary>
		/// Returns the value of the model's correction constant parameter.  This property should
		/// usually only be accessed by GIS model writer classes via the GisModel class.
		/// </summary>
		/// <value>The correction constant parameter, as a double.</value>
		double CorrectionParameter
		{
			get;
		}

		/// <summary>
		/// Returns the model's outcome labels as a string array.  This method should
		/// usually only be accessed by GIS model writer classes via the GisModel class.
		/// </summary>
		/// <returns>String array of outcome labels.</returns>
		string[] GetOutcomeLabels();

		/// <summary>
		/// Returns the model's outcome patterns.  This method should
		/// usually only be accessed by GIS model writer classes via the GisModel class.
		/// </summary>
		/// <returns>The outcome patterns, as a jagged integer array.</returns>
		int[][] GetOutcomePatterns();

		/// <summary>
		/// Returns the model's predicates.  This method should
		/// usually only be accessed by GIS model writer classes via the GisModel class.
		/// </summary>
		/// <returns>Dictionary of predicates keyed by string.</returns>
		Dictionary<string, PatternedPredicate> GetPredicates();

		/// <summary>
		/// Returns model information for a predicate, given the predicate label.
		/// The results are written into the two arrays supplied by the caller.
		/// </summary>
		/// <param name="predicateLabel">
		/// The predicate label to fetch information for.
		/// </param>
		/// <param name="featureCounts">
		/// Array to be passed in to the method; it should have a length equal to the number of outcomes
		/// in the model.  The method increments the count of each outcome that is active in the specified
		/// predicate.
		/// </param>
		/// <param name="outcomeSums">
		/// Array to be passed in to the method; it should have a length equal to the number of outcomes
		/// in the model.  The method adds the parameter values for each of the active outcomes in the
		/// predicate.
		/// </param>
		void GetPredicateData(string predicateLabel, int[] featureCounts, double[] outcomeSums);

	}
87 | }
88 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/JavaBinaryGisModelReader.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | //This file is based on the BinaryGISModelReader.java source file found in the
 18 | //original java implementation of MaxEnt.  That source file contains the following header:
 19 | 
 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
 21 | //
 22 | // This library is free software; you can redistribute it and/or
 23 | // modify it under the terms of the GNU Lesser General Public
 24 | // License as published by the Free Software Foundation; either
 25 | // version 2.1 of the License, or (at your option) any later version.
 26 | //
 27 | // This library is distributed in the hope that it will be useful,
 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 30 | // GNU General Public License for more details.
 31 | //
 32 | // You should have received a copy of the GNU Lesser General Public
 33 | // License along with this program; if not, write to the Free Software
 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 35 |  
 36 | using System;
 37 | using System.IO;
 38 | 
 39 | namespace CherubNLP.Models.IO
 40 | {
	/// <summary>
	/// A reader for GIS models stored in the binary format produced by the java version
	/// of MaxEnt.  This binary format stores data using big-endian values, which means
	/// that on a little-endian host the C# version must reverse the byte order of each
	/// value in turn, making it less efficient. Use only for compatibility with the java
	/// MaxEnt library.
	/// </summary>
	/// <author> 
	/// Jason Baldridge
	/// </author>
	/// <author>
	/// Richard J. Northedge
	/// </author>
	/// <version>
	/// based on BinaryGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
	/// </version>
	public class JavaBinaryGisModelReader : GisModelReader
	{
		private readonly Stream _input;

		// Scratch space for numeric values and for short strings; longer strings get
		// a buffer of their own (see ReadString).
		private readonly byte[] _buffer = new byte[256];

		// NOTE(review): if the model file was produced with Java's DataOutputStream.writeUTF,
		// the payload is "modified UTF-8", which differs from standard UTF-8 for U+0000 and
		// for characters outside the BMP - confirm against the models in use.
		private readonly System.Text.Encoding _encoding = System.Text.Encoding.UTF8;

		/// <summary>
		/// Constructor which reads the whole model from the supplied Stream.
		/// The stream is disposed once the model has been read.
		/// </summary>
		/// <param name="dataInputStream">The Stream containing the model information.
		/// </param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="dataInputStream"/> is null.
		/// </exception>
		public JavaBinaryGisModelReader(Stream dataInputStream)
		{
			if (dataInputStream == null)
			{
				throw new ArgumentNullException(nameof(dataInputStream));
			}

			using (_input = dataInputStream)
			{
				base.ReadModel();
			}
		}
		
		/// <summary>
		/// Constructor which takes a filename, opens the file for reading and reads the
		/// whole model from it.  The file is closed once the model has been read.
		/// </summary>
		/// <param name="fileName">The full path and name of the file in which the model is stored.
		/// </param>
		public JavaBinaryGisModelReader(string fileName)
		{
			using (_input = new FileStream(fileName, FileMode.Open, FileAccess.Read))
			{
				base.ReadModel();
			}
		}

		/// <summary>
		/// Reads a big-endian 32-bit signed integer from the model file.
		/// </summary>
		/// <exception cref="EndOfStreamException">
		/// The stream ended before four bytes could be read.
		/// </exception>
		protected override int ReadInt32()
		{
			ReadBigEndianValue(4);
			return BitConverter.ToInt32(_buffer, 0);
		}
		
		/// <summary>
		/// Reads a big-endian double-precision floating point number from the model file.
		/// </summary>
		/// <exception cref="EndOfStreamException">
		/// The stream ended before eight bytes could be read.
		/// </exception>
		protected override double ReadDouble()
		{
			ReadBigEndianValue(8);
			return BitConverter.ToDouble(_buffer, 0);
		}
		
		/// <summary>
		/// Reads a UTF-8 encoded string from the model file.  The string is stored as a
		/// two-byte big-endian byte count followed by that many bytes of encoded text.
		/// </summary>
		/// <exception cref="EndOfStreamException">
		/// The stream ended inside the length prefix or inside the string data.
		/// </exception>
		protected override string ReadString()
		{
			int highByte = _input.ReadByte();
			int lowByte = _input.ReadByte();
			if (highByte < 0 || lowByte < 0)
			{
				// ReadByte returns -1 at end of stream; do not fold it into the length.
				throw new EndOfStreamException("Unexpected end of model data while reading a string length.");
			}

			int byteCount = (highByte * 256) + lowByte;

			// The length prefix allows up to 65535 bytes, which does not fit in the
			// shared scratch buffer, so long strings are read into a dedicated array.
			byte[] target = byteCount <= _buffer.Length ? _buffer : new byte[byteCount];
			ReadExactly(target, byteCount);
			return _encoding.GetString(target, 0, byteCount);
		}

		/// <summary>
		/// Fills the start of the scratch buffer with <paramref name="size"/> bytes from
		/// the stream and converts them from big-endian to the byte order of the host.
		/// </summary>
		private void ReadBigEndianValue(int size)
		{
			ReadExactly(_buffer, size);
			if (BitConverter.IsLittleEndian)
			{
				Array.Reverse(_buffer, 0, size);
			}
		}

		/// <summary>
		/// Reads exactly <paramref name="count"/> bytes into <paramref name="target"/>.
		/// Stream.Read may legitimately return fewer bytes than requested, so the call
		/// is repeated until the request is satisfied or the stream ends.
		/// </summary>
		private void ReadExactly(byte[] target, int count)
		{
			int offset = 0;
			while (offset < count)
			{
				int bytesRead = _input.Read(target, offset, count - offset);
				if (bytesRead <= 0)
				{
					throw new EndOfStreamException("Unexpected end of model data.");
				}
				offset += bytesRead;
			}
		}
	}
123 | }
124 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/JavaBinaryGisModelWriter.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | //This file is based on the BinaryGISModelWriter.java source file found in the
 18 | //original java implementation of MaxEnt.  That source file contains the following header:
 19 | 
 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
 21 | //
 22 | // This library is free software; you can redistribute it and/or
 23 | // modify it under the terms of the GNU Lesser General Public
 24 | // License as published by the Free Software Foundation; either
 25 | // version 2.1 of the License, or (at your option) any later version.
 26 | //
 27 | // This library is distributed in the hope that it will be useful,
 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 30 | // GNU General Public License for more details.
 31 | //
 32 | // You should have received a copy of the GNU Lesser General Public
 33 | // License along with this program; if not, write to the Free Software
 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 35 | 
 36 | using System;
 37 | using System.IO;
 38 | 
 39 | namespace CherubNLP.Models.IO
 40 | {
	/// <summary>
	/// A writer for GIS models that saves models in the binary format used by the java 
	/// version of MaxEnt.  This binary format stores data using big-endian values, which means
	/// that on a little-endian host the C# version must reverse the byte order of each value
	/// in turn, making it less efficient.  Use only for compatibility with the java MaxEnt library.
	/// </summary>
	/// <author> 
	/// Jason Baldridge
	/// </author>
	/// <author>
	/// Richard J. Northedge
	/// </author>
	/// <version>
	/// based on BinaryGISModelWriter.java $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
	/// </version>
	public class JavaBinaryGisModelWriter : GisModelWriter
	{
		// Largest byte count that fits in the two-byte length prefix written by WriteString.
		private const int MaxStringByteCount = 65535;

		private Stream _output;
		private readonly System.Text.Encoding _encoding = System.Text.Encoding.UTF8;
		
		/// <summary>
		/// Default constructor.
		/// </summary>
		public JavaBinaryGisModelWriter()
		{
		}
			
		/// <summary>
		/// Takes a GisModel and a file name and writes the model to that file.
		/// The file is created, or overwritten if it already exists, and is closed
		/// when writing finishes.
		/// </summary>
		/// <param name="model">The GisModel which is to be persisted.
		/// </param>
		/// <param name="fileName">The name of the file in which the model is to be persisted.
		/// </param>
		public void Persist(GisModel model, string fileName)
		{
			using (_output = new FileStream(fileName, FileMode.Create))
			{
				base.Persist(model);
			}
		}

		/// <summary>
		/// Takes a GisModel and a Stream and writes the model to that stream.
		/// The stream is disposed when writing finishes.
		/// </summary>
		/// <param name="model">
		/// The GIS model which is to be persisted.
		/// </param>
		/// <param name="dataOutputStream">
		/// The Stream which will be used to persist the model.
		/// </param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="dataOutputStream"/> is null.
		/// </exception>
		public void Persist(GisModel model, Stream dataOutputStream)
		{
			if (dataOutputStream == null)
			{
				throw new ArgumentNullException(nameof(dataOutputStream));
			}

			using (_output = dataOutputStream)
			{
				base.Persist(model);
			}
		}

		/// <summary>
		/// Writes a UTF-8 encoded string to the model file as a two-byte big-endian
		/// byte count followed by the encoded bytes.
		/// </summary>
		/// <param name="data">
		/// The string data to be persisted.
		/// </param>
		/// <exception cref="ArgumentException">
		/// The encoded string is longer than 65535 bytes and so cannot be described by
		/// the two-byte length prefix.
		/// </exception>
		protected override void WriteString(string data)
		{
			// Encode once; the byte array gives both the length prefix and the payload.
			byte[] encoded = _encoding.GetBytes(data);
			if (encoded.Length > MaxStringByteCount)
			{
				// Writing a truncated length would silently corrupt the model file.
				throw new ArgumentException(
					"String is too long to be stored with a two-byte length prefix.",
					nameof(data));
			}

			_output.WriteByte((byte)(encoded.Length / 256));
			_output.WriteByte((byte)(encoded.Length % 256));
			_output.Write(encoded, 0, encoded.Length);
		}
		
		/// <summary>
		/// Writes a 32-bit signed integer to the model file in big-endian byte order.
		/// </summary>
		/// <param name="data">
		/// The integer data to be persisted.
		/// </param>
		protected override void WriteInt32(int data)
		{
			WriteBigEndian(BitConverter.GetBytes(data));
		}
		
		/// <summary>
		/// Writes a double-precision floating point number to the model file in
		/// big-endian byte order.
		/// </summary>
		/// <param name="data">
		/// The floating point data to be persisted.
		/// </param>
		protected override void WriteDouble(double data)
		{
			WriteBigEndian(BitConverter.GetBytes(data));
		}

		/// <summary>
		/// Writes the bytes of a value produced by BitConverter in big-endian order.
		/// BitConverter uses the byte order of the host, so the bytes are reversed
		/// only when the host is little-endian.
		/// </summary>
		private void WriteBigEndian(byte[] valueBytes)
		{
			if (BitConverter.IsLittleEndian)
			{
				Array.Reverse(valueBytes);
			}
			_output.Write(valueBytes, 0, valueBytes.Length);
		}
	}
140 | }
141 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/PlainTextGisModelReader.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | //This file is based on the PlainTextGISModelReader.java source file found in the
 18 | //original java implementation of MaxEnt.  That source file contains the following header:
 19 | 
 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
 21 | //
 22 | // This library is free software; you can redistribute it and/or
 23 | // modify it under the terms of the GNU Lesser General Public
 24 | // License as published by the Free Software Foundation; either
 25 | // version 2.1 of the License, or (at your option) any later version.
 26 | //
 27 | // This library is distributed in the hope that it will be useful,
 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 30 | // GNU General Public License for more details.
 31 | //
 32 | // You should have received a copy of the GNU Lesser General Public
 33 | // License along with this program; if not, write to the Free Software
 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 35 | 
 36 | using System;
 37 | using System.IO;
 38 | 
 39 | namespace CherubNLP.Models.IO
 40 | {
	/// <summary>
	/// Reads GIS models that are stored as plain text, one value per line.
	/// </summary>
	/// <author>
	/// Jason Baldridge
	/// </author>
	/// <author>
	/// Richard J. Northedge
	/// </author>
	/// <version>
	/// based on PlainTextGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
	/// </version>
	public class PlainTextGisModelReader : GisModelReader
	{
		private StreamReader _lineSource;
		
		/// <summary>
		/// Builds the reader from an already opened StreamReader and reads the model
		/// from it straight away.  The StreamReader is disposed once reading finishes.
		/// </summary>
		/// <param name="reader">
		/// The StreamReader containing the model information.
		/// </param>
		public PlainTextGisModelReader(StreamReader reader)
		{
			_lineSource = reader;
			using (_lineSource)
			{
				base.ReadModel();
			}
		}
		
		/// <summary>
		/// Builds the reader from a file name: the file is opened, the model is read
		/// from it straight away, and the file is closed again.
		/// </summary>
		/// <param name="fileName">
		/// The full path and file name in which the model is stored.
		/// </param>
		public PlainTextGisModelReader(string fileName)
		{
			// NOTE(review): Encoding.UTF7 is flagged obsolete (SYSLIB0001) on .NET 5 and later.
			// It is kept here because PlainTextGisModelWriter writes its files with the same encoding.
			_lineSource = new StreamReader(fileName, System.Text.Encoding.UTF7);
			using (_lineSource)
			{
				base.ReadModel();
			}
		}

		/// <summary>
		/// Reads the next line of the model file and parses it, culture-invariantly,
		/// as a 32-bit signed integer.
		/// </summary>
		protected override int ReadInt32()
		{
			string text = _lineSource.ReadLine();
			return int.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
		}
		
		/// <summary>
		/// Reads the next line of the model file and parses it, culture-invariantly,
		/// as a double-precision floating point number.
		/// </summary>
		protected override double ReadDouble()
		{
			string text = _lineSource.ReadLine();
			return double.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
		}
		
		/// <summary>
		/// Reads the next line of the model file and returns it unchanged.
		/// </summary>
		protected override string ReadString()
		{
			string text = _lineSource.ReadLine();
			return text;
		}

	}
111 | }
112 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/IO/PlainTextGisModelWriter.cs:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | //This file is based on the PlainTextGISModelWriter.java source file found in the
 18 | //original java implementation of MaxEnt.  That source file contains the following header:
 19 | 
 20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
 21 | //
 22 | // This library is free software; you can redistribute it and/or
 23 | // modify it under the terms of the GNU Lesser General Public
 24 | // License as published by the Free Software Foundation; either
 25 | // version 2.1 of the License, or (at your option) any later version.
 26 | //
 27 | // This library is distributed in the hope that it will be useful,
 28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 30 | // GNU General Public License for more details.
 31 | //
 32 | // You should have received a copy of the GNU Lesser General Public
 33 | // License along with this program; if not, write to the Free Software
 34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 35 | 
 36 | using System;
 37 | using System.IO;
 38 | 
 39 | namespace CherubNLP.Models.IO
 40 | {
	/// <summary> 
	/// Saves GIS models as plain text, one value per line.
	/// </summary>
	/// <author>
	/// Jason Baldridge
	/// </author>
	/// <author>
	/// Richard J. Northedge
	/// </author>
	/// <version>
	/// based on PlainTextGISModelWriter.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
	/// </version>
	public class PlainTextGisModelWriter : GisModelWriter
	{
		private StreamWriter _lineSink;
		
		/// <summary>
		/// Default constructor.
		/// </summary>
		public PlainTextGisModelWriter()
		{
		}
			
		/// <summary>
		/// Writes a GIS model to the named file.  An existing file is overwritten
		/// rather than appended to, and the file is closed once writing finishes.
		/// </summary>
		/// <param name="model">
		/// The GisModel which is to be persisted.
		/// </param>
		/// <param name="fileName">
		/// The name of the file in which the model is to be persisted.
		/// </param>
		public void Persist(GisModel model, string fileName)
		{
			_lineSink = new StreamWriter(fileName, false, System.Text.Encoding.UTF7);
			using (_lineSink)
			{
				base.Persist(model);
			}
		}

		/// <summary>
		/// Writes a GIS model through the supplied StreamWriter.  The writer is
		/// disposed once writing finishes.
		/// </summary>
		/// <param name="model">
		/// The GisModel which is to be persisted.
		/// </param>
		/// <param name="writer">
		/// The StreamWriter which will be used to persist the model.
		/// </param>
		public void Persist(GisModel model, StreamWriter writer)
		{
			_lineSink = writer;
			using (_lineSink)
			{
				base.Persist(model);
			}
		}
	
		/// <summary>
		/// Writes a string to the model file on a line of its own.
		/// </summary>
		/// <param name="data">
		/// The string data to be persisted.
		/// </param>
		protected override void WriteString(string data)
		{
			EmitLine(data);
		}
		
		/// <summary>
		/// Writes a 32-bit signed integer, formatted culture-invariantly, to the
		/// model file on a line of its own.
		/// </summary>
		/// <param name="data">
		/// The integer data to be persisted.
		/// </param>
		protected override void WriteInt32(int data)
		{
			EmitLine(data.ToString(System.Globalization.CultureInfo.InvariantCulture));
		}
		
		/// <summary>
		/// Writes a double-precision floating point number, formatted
		/// culture-invariantly, to the model file on a line of its own.
		/// </summary>
		/// <param name="data">
		/// The floating point data to be persisted.
		/// </param>
		protected override void WriteDouble(double data)
		{
			EmitLine(data.ToString(System.Globalization.CultureInfo.InvariantCulture));
		}

		/// <summary>
		/// Sends the text to the output followed by a line terminator.
		/// </summary>
		private void EmitLine(string text)
		{
			_lineSink.WriteLine(text);
		}
	}
134 | }
135 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/ITrainingDataIndexer.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the DataIndexer.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | //Copyright (C) 2003 Thomas Morton
21 | //
22 | //This library is free software; you can redistribute it and/or
23 | //modify it under the terms of the GNU Lesser General Public
24 | //License as published by the Free Software Foundation; either
25 | //version 2.1 of the License, or (at your option) any later version.
26 | //
27 | //This library is distributed in the hope that it will be useful,
28 | //but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | //GNU General Public License for more details.
31 | //
32 | //You should have received a copy of the GNU Lesser General Public
33 | //License along with this program; if not, write to the Free Software
34 | //Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
35 | 
36 | using System;
37 | 
38 | namespace CherubNLP.Models
39 | {
40 | 	/// <summary>
41 | 	/// Object that compresses events in memory and performs feature selection.
42 | 	/// </summary>
43 | 	public interface ITrainingDataIndexer
44 | 	{
45 | 
46 | 		/// <summary>
47 | 		/// Gets an array of context data calculated from the training data.
48 | 		/// </summary>
49 | 		/// <returns>
50 | 		/// Array of integer arrays, each containing the context data for an event.
51 | 		/// </returns>
52 | 		int[][] GetContexts();
53 | 		
54 | 		/// <summary>
55 | 		/// Gets an array indicating how many times each event is seen.
56 | 		/// </summary>
57 | 		/// <returns>
58 | 		/// Integer array with event frequencies.
59 | 		/// </returns>
60 | 		int[] GetNumTimesEventsSeen();
61 | 		
62 | 		/// <summary>
63 | 		/// Gets an outcome list.
64 | 		/// </summary>
65 | 		/// <returns>
66 | 		/// Integer array of outcomes.
67 | 		/// </returns>
68 | 		int[] GetOutcomeList();
69 | 		
70 | 		/// <summary>
71 | 		/// Gets an array of predicate labels.
72 | 		/// </summary>
73 | 		/// <returns>
74 | 		/// Array of predicate labels.
75 | 		/// </returns>
76 | 		string[] GetPredicateLabels();
77 | 		
78 | 		/// <summary>
79 | 		/// Gets an array of outcome labels.
80 | 		/// </summary>
81 | 		/// <returns>
82 | 		/// Array of outcome labels.
83 | 		/// </returns>
84 | 		string[] GetOutcomeLabels();
85 | 	}
86 | }
87 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/ITrainingDataReader.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the DataStream.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
21 | //
22 | // This library is free software; you can redistribute it and/or
23 | // modify it under the terms of the GNU Lesser General Public
24 | // License as published by the Free Software Foundation; either
25 | // version 2.1 of the License, or (at your option) any later version.
26 | //
27 | // This library is distributed in the hope that it will be useful,
28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | // GNU General Public License for more details.
31 | //
32 | // You should have received a copy of the GNU Lesser General Public
33 | // License along with this program; if not, write to the Free Software
34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
35 |  
36 | using System;
37 | 
38 | namespace CherubNLP.Models
39 | {
40 | 	/// <summary>
41 | 	/// A interface for objects which can deliver a stream of training data to be
42 | 	/// supplied to an ITrainingEventReader. It is not necessary to use a ITrainingDataReader in a
43 | 	/// SharpEntropy application, but it can be used to support a wider variety of formats
44 | 	/// in which your training data can be held.
45 | 	/// </summary>
46 | 	/// <author>
47 | 	/// Jason Baldridge
48 | 	/// </author>
49 | 	/// <author>
50 | 	/// Richard J. Northedge
51 | 	/// </author>
52 | 	/// <version>
53 | 	/// based on DataStream.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
54 | 	/// </version>
55 | 	public interface ITrainingDataReader<T>
56 | 	{
57 | 		/// <summary> 
58 | 		/// Returns the next slice of data held in this ITrainingDataReader.
59 | 		/// </summary>
60 | 		/// <returns>
61 | 		/// the object representing the data which is next in this
62 | 		/// ITrainingDataReader
63 | 		/// </returns>
64 | 		T NextToken();
65 | 			
66 | 		/// <summary> 
67 | 		/// Test whether there are any training data items remaining in this ITrainingDataReader.
68 | 		/// </summary>
69 | 		/// <returns>
70 | 		/// true if this ITrainingDataReader has more data tokens
71 | 		/// </returns>
72 | 		bool HasNext();
73 | 	}
74 | }
75 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/ITrainingEventReader.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the EventStream.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
21 | //
22 | // This library is free software; you can redistribute it and/or
23 | // modify it under the terms of the GNU Lesser General Public
24 | // License as published by the Free Software Foundation; either
25 | // version 2.1 of the License, or (at your option) any later version.
26 | //
27 | // This library is distributed in the hope that it will be useful,
28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | // GNU General Public License for more details.
31 | //
32 | // You should have received a copy of the GNU Lesser General Public
33 | // License along with this program; if not, write to the Free Software
34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
35 | 
36 | using System;
37 | 
38 | namespace CherubNLP.Models
39 | {
40 | 	/// <summary>
41 | 	/// Source of training events for the GIS training procedure (or for other
42 | 	/// procedures, such as IIS, if and when they are implemented).
43 | 	/// An implementation is free to obtain its raw data however it likes; building
44 | 	/// it on top of SharpEntropy.ITrainingDataReader is optional, but makes it easier
45 | 	/// to produce events from data stored in different formats.
46 | 	/// </summary>
47 | 	public interface ITrainingEventReader
48 | 	{
49 | 		/// <summary>
50 | 		/// Tests whether this reader still holds TrainingEvents that have not been read.
51 | 		/// </summary>
52 | 		/// <returns>
53 | 		/// true if at least one more TrainingEvent is available; otherwise false.
54 | 		/// </returns>
55 | 		bool HasNext();
56 | 
57 | 		/// <summary>
58 | 		/// Reads the next TrainingEvent held by this reader. What happens when no
59 | 		/// event is left is not specified by this interface; check HasNext() first.
60 | 		/// </summary>
61 | 		/// <returns>
62 | 		/// The TrainingEvent that is next in this reader.
63 | 		/// </returns>
64 | 		TrainingEvent ReadNextEvent();
65 | 	}
66 | }
67 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/PatternedPredicate.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2005 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | using System;
 18 | 
 19 | namespace CherubNLP.Models
 20 | {
 21 | 	/// <summary>
 22 | 	/// Predicate data whose parameters correspond to the outcomes of a single
 23 | 	/// outcome pattern.
 24 | 	/// </summary>
 25 | 	/// <author>
 26 | 	/// Richard J. Northedge
 27 | 	/// </author>
 28 | 	public class PatternedPredicate
 29 | 	{
 30 | 		// Parameter values for this predicate; assigned once at construction.
 31 | 		private readonly double[] mParameters;
 32 | 
 33 | 		/// <summary>
 34 | 		/// Initializes a predicate from an outcome pattern index and its parameters.
 35 | 		/// The predicate name is left unset (null) by this constructor.
 36 | 		/// </summary>
 37 | 		/// <param name="outcomePattern">
 38 | 		/// Index into the outcome pattern array, identifying the outcome pattern that
 39 | 		/// this predicate relates to.
 40 | 		/// </param>
 41 | 		/// <param name="parameters">
 42 | 		/// The parameter values for this predicate. The array is stored as-is, not copied.
 43 | 		/// </param>
 44 | 		protected internal PatternedPredicate(int outcomePattern, double[] parameters)
 45 | 		{
 46 | 			mParameters = parameters;
 47 | 			OutcomePattern = outcomePattern;
 48 | 		}
 49 | 
 50 | 		/// <summary>
 51 | 		/// Initializes a predicate from its name and its parameters.
 52 | 		/// The outcome pattern index is left at its default of zero.
 53 | 		/// </summary>
 54 | 		/// <param name="name">
 55 | 		/// The predicate name.
 56 | 		/// </param>
 57 | 		/// <param name="parameters">
 58 | 		/// The parameter values for this predicate. The array is stored as-is, not copied.
 59 | 		/// </param>
 60 | 		protected internal PatternedPredicate(string name, double[] parameters)
 61 | 		{
 62 | 			mParameters = parameters;
 63 | 			Name = name;
 64 | 		}
 65 | 
 66 | 		/// <summary>
 67 | 		/// Gets or sets the index into the array of outcome patterns.
 68 | 		/// The setter exists for the trainer.
 69 | 		/// </summary>
 70 | 		public int OutcomePattern { get; set; }
 71 | 
 72 | 		/// <summary>
 73 | 		/// Gets or sets the name of the predicate.
 74 | 		/// </summary>
 75 | 		public string Name { get; set; }
 76 | 
 77 | 		/// <summary>
 78 | 		/// Gets the number of parameters associated with this predicate.
 79 | 		/// </summary>
 80 | 		public int ParameterCount
 81 | 		{
 82 | 			get { return mParameters.Length; }
 83 | 		}
 84 | 
 85 | 		/// <summary>
 86 | 		/// Looks up a single parameter value of this predicate.
 87 | 		/// </summary>
 88 | 		/// <param name="index">
 89 | 		/// Zero-based position in the parameter array.
 90 | 		/// </param>
 91 | 		/// <returns>
 92 | 		/// The parameter value stored at <paramref name="index"/>.
 93 | 		/// </returns>
 94 | 		public double GetParameter(int index)
 95 | 		{
 96 | 			return mParameters[index];
 97 | 		}
 98 | 	}
118 | }
119 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/PlainTextByLineDataReader.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the PlainTextByLineDataStream.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
21 | //
22 | // This library is free software; you can redistribute it and/or
23 | // modify it under the terms of the GNU Lesser General Public
24 | // License as published by the Free Software Foundation; either
25 | // version 2.1 of the License, or (at your option) any later version.
26 | //
27 | // This library is distributed in the hope that it will be useful,
28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | // GNU General Public License for more details.
31 | //
32 | // You should have received a copy of the GNU Lesser General Public
33 | // License along with this program; if not, write to the Free Software
34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
35 | 
36 | using System;
37 | using System.IO;
38 | 
39 | namespace CherubNLP.Models
40 | {
41 | 	/// <summary>
42 | 	/// ITrainingDataReader implementation that reads a plain text source and
43 | 	/// returns it one line at a time, which is what many SharpEntropy applications
44 | 	/// need in order to create ITrainingEventReaders.
45 | 	/// The reader always holds one line read ahead, so HasNext() can answer without
46 | 	/// touching the underlying stream. This class never closes or disposes the
47 | 	/// StreamReader it is given.
48 | 	/// </summary>
49 | 	/// <author>
50 | 	/// Jason Baldridge
51 | 	/// </author>
52 | 	/// <author>
53 | 	/// Richard J. Northedge
54 | 	/// </author>
55 | 	/// <version>
56 | 	/// based on PlainTextByLineDataStream.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $
57 | 	/// </version>
58 | 	public class PlainTextByLineDataReader : ITrainingDataReader<string>
59 | 	{
60 | 		private readonly StreamReader _dataReader;
61 | 
62 | 		// The line the next NextToken() call will return; null once the source is exhausted.
63 | 		private string _nextLine;
64 | 
65 | 		/// <summary>
66 | 		/// Creates a training data reader for reading text lines from a file or other
67 | 		/// text stream, and immediately reads the first line from it.
68 | 		/// </summary>
69 | 		/// <param name="dataSource">StreamReader containing the source of the training data</param>
70 | 		/// <exception cref="ArgumentNullException">
71 | 		/// <paramref name="dataSource"/> is null.
72 | 		/// </exception>
73 | 		public PlainTextByLineDataReader(StreamReader dataSource)
74 | 		{
75 | 			// Fail with a descriptive exception instead of a NullReferenceException from ReadLine().
76 | 			if (dataSource == null)
77 | 			{
78 | 				throw new ArgumentNullException(nameof(dataSource));
79 | 			}
80 | 
81 | 			_dataReader = dataSource;
82 | 			_nextLine = _dataReader.ReadLine();
83 | 		}
84 | 
85 | 		/// <summary>
86 | 		/// Gets the next text line from the training data and reads ahead by one line.
87 | 		/// </summary>
88 | 		/// <returns>
89 | 		/// Next text line from the training data, or null when the read-ahead line was
90 | 		/// already null (that is, when HasNext() was false).
91 | 		/// </returns>
92 | 		public virtual string NextToken()
93 | 		{
94 | 			string currentLine = _nextLine;
95 | 			_nextLine = _dataReader.ReadLine();
96 | 			return currentLine;
97 | 		}
98 | 
99 | 		/// <summary>Checks if there is any more training data</summary>
100 | 		/// <returns>true if a read-ahead line is available, i.e. there is more training data to be read</returns>
101 | 		public virtual bool HasNext()
102 | 		{
103 | 			return _nextLine != null;
104 | 		}
105 | 	}
86 | }
87 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/Entropy/TrainingEvent.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2005 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the Event.java source file found in the
18 | //original java implementation of MaxEnt.  That source file contains the following header:
19 | 
20 | // Copyright (C) 2001 Jason Baldridge and Gann Bierner
21 | //
22 | // This library is free software; you can redistribute it and/or
23 | // modify it under the terms of the GNU Lesser General Public
24 | // License as published by the Free Software Foundation; either
25 | // version 2.1 of the License, or (at your option) any later version.
26 | //
27 | // This library is distributed in the hope that it will be useful,
28 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
30 | // GNU General Public License for more details.
31 | //
32 | // You should have received a copy of the GNU Lesser General Public
33 | // License along with this program; if not, write to the Free Software
34 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
35 | 
36 | using System;
37 | 
38 | namespace CherubNLP.Models
39 | {
40 | 	/// <summary>
41 | 	/// The context of a decision point during training.  This includes
42 | 	/// contextual predicates and an outcome.
43 | 	/// </summary>
44 | 	/// <author>
45 | 	/// Jason Baldridge
46 | 	/// </author>
47 | 	/// <author>
48 | 	/// Richard J. Northedge
49 | 	/// </author>
50 | 	/// <version>
51 | 	/// based on Event.java, $Revision: 1.3 $, $Date: 2003/12/09 23:13:08 $
52 | 	/// </version>
53 | 	public class TrainingEvent
54 | 	{
55 | 		/// <summary>
56 | 		/// The outcome label for this training event.
57 | 		/// </summary>
58 | 		public string Outcome { get; private set; }
59 | 
60 | 		/// <summary>
61 | 		/// The context values for this training event, exactly as passed to the
62 | 		/// constructor (the array is not copied and may be null).
63 | 		/// </summary>
64 | 		public string[] Context { get; private set; }
65 | 
66 | 		/// <summary>
67 | 		/// Constructor for a training event.
68 | 		/// </summary>
69 | 		/// <param name="outcome">
70 | 		/// the outcome label
71 | 		/// </param>
72 | 		/// <param name="context">
73 | 		/// array containing context values
74 | 		/// </param>
75 | 		public TrainingEvent(string outcome, string[] context)
76 | 		{
77 | 			Outcome = outcome;
78 | 			Context = context;
79 | 		}
80 | 
81 | 		/// <summary>
82 | 		/// Override providing text summary of the training event.
83 | 		/// </summary>
84 | 		/// <returns>
85 | 		/// The outcome, a single space, then the context values joined by ", ".
86 | 		/// A null context is rendered like an empty one, so this method does not throw.
87 | 		/// </returns>
88 | 		public override string ToString()
89 | 		{
90 | 			// string.Join throws ArgumentNullException for a null array, and ToString()
91 | 			// should never throw, so a missing context is treated as having no values.
92 | 			string joinedContext = Context == null ? string.Empty : string.Join(", ", Context);
93 | 			return Outcome + " " + joinedContext;
94 | 		}
95 | 	}
94 | }
95 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/IndexWord.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | using System;
18 | using System.Linq;
19 | 
20 | namespace CherubNLP.Models
21 | {
22 | 	/// <summary>
23 | 	/// Immutable record describing one indexed word: its lemma, its part of speech,
24 | 	/// the relation types and synset offsets listed for it, and its tag sense count.
25 | 	/// All values are stored exactly as passed to the constructor.
26 | 	/// </summary>
27 | 	public class IndexWord
28 | 	{
29 | 		// Properties ------------------------
30 | 
31 | 		/// <summary>Part of speech supplied for this word.</summary>
32 | 		public string PartOfSpeech { get; private set; }
33 | 
34 | 		/// <summary>
35 | 		/// Synset offsets supplied for this word; may be null.
36 | 		/// NOTE(review): presumably one offset per sense of the lemma - confirm against the caller.
37 | 		/// </summary>
38 | 		public int[] SynsetOffsets { get; private set; }
39 | 
40 | 		/// <summary>Lemma supplied for this word.</summary>
41 | 		public string Lemma { get; private set; }
42 | 
43 | 		/// <summary>
44 | 		/// Number of entries in <see cref="SynsetOffsets"/>, or 0 when that array is null.
45 | 		/// </summary>
46 | 		public int SenseCount
47 | 		{
48 | 			// An array exposes its size directly; no LINQ Count() call is needed.
49 | 			get { return this.SynsetOffsets != null ? this.SynsetOffsets.Length : 0; }
50 | 		}
51 | 
52 | 		/// <summary>Tag sense count supplied for this word.</summary>
53 | 		public int TagSenseCount { get; private set; }
54 | 
55 | 		/// <summary>Relation types supplied for this word; may be null.</summary>
56 | 		public string[] RelationTypes { get; private set; }
57 | 
58 | 
59 | 		// Constructors --------------------
60 | 
61 | 		/// <summary>
62 | 		/// Creates an IndexWord from its component values. The arrays are stored
63 | 		/// by reference, not copied.
64 | 		/// </summary>
65 | 		/// <param name="lemma">The lemma.</param>
66 | 		/// <param name="partOfSpeech">The part of speech.</param>
67 | 		/// <param name="relationTypes">The relation types listed for the word.</param>
68 | 		/// <param name="synsetOffsets">The synset offsets listed for the word.</param>
69 | 		/// <param name="tagSenseCount">The tag sense count.</param>
70 | 		public IndexWord(string lemma, string partOfSpeech, string[] relationTypes, int[] synsetOffsets, int tagSenseCount)
71 | 		{
72 | 			this.Lemma = lemma;
73 | 			this.PartOfSpeech = partOfSpeech;
74 | 			this.RelationTypes = relationTypes;
75 | 			this.SynsetOffsets = synsetOffsets;
76 | 			this.TagSenseCount = tagSenseCount;
77 | 		}
78 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Morph/AbstractDelegatingOperation.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the AbstractDelegatingOperation.java source file found in
18 | //the Java WordNet Library (JWNL).  That source file is licensed under BSD.
19 | 
20 | using System;
21 | using System.Collections.Generic;
22 | using System.Text;
23 | 
24 | namespace CherubNLP.Models.Morph
25 | {
26 |     /// <summary>
27 |     /// Base class for operations that pass work on to named sets of other
28 |     /// operations ("delegates"), registered under a string key.
29 |     /// </summary>
30 |     public abstract class AbstractDelegatingOperation : IOperation
31 |     {
32 |         // Registered delegate sets, keyed by the name given to AddDelegate.
33 |         private readonly Dictionary<string, IOperation[]> mOperationSets = new Dictionary<string, IOperation[]>();
34 | 
35 |         /// <summary>
36 |         /// Creates the operation with no delegate sets registered.
37 |         /// </summary>
38 |         protected internal AbstractDelegatingOperation()
39 |         {
40 |         }
41 | 
42 |         /// <summary>
43 |         /// Registers a set of operations under a key, replacing any set
44 |         /// previously registered under the same key.
45 |         /// </summary>
46 |         /// <param name="key">Name of the delegate set.</param>
47 |         /// <param name="operations">Operations making up the set.</param>
48 |         public virtual void AddDelegate(string key, IOperation[] operations)
49 |         {
50 |             // The indexer adds a missing key and overwrites an existing one.
51 |             mOperationSets[key] = operations;
52 |         }
53 | 
54 |         /// <summary>
55 |         /// Tells whether a delegate set is registered under the key.
56 |         /// </summary>
57 |         /// <param name="key">Name of the delegate set.</param>
58 |         /// <returns>True if a set has been registered under <paramref name="key"/>.</returns>
59 |         protected internal virtual bool HasDelegate(string key)
60 |         {
61 |             return mOperationSets.ContainsKey(key);
62 |         }
63 | 
64 |         /// <summary>
65 |         /// Runs every operation of the delegate set registered under the key, in
66 |         /// array order. The dictionary lookup throws if no set is registered.
67 |         /// </summary>
68 |         /// <param name="lemma">Lemma handed to each operation.</param>
69 |         /// <param name="partOfSpeech">Part of speech handed to each operation.</param>
70 |         /// <param name="baseForms">List handed to each operation.</param>
71 |         /// <param name="key">Name of the delegate set to run.</param>
72 |         /// <returns>True if at least one of the operations returned true.</returns>
73 |         protected internal virtual bool ExecuteDelegate(string lemma, string partOfSpeech, List<string> baseForms, string key)
74 |         {
75 |             bool anySucceeded = false;
76 |             foreach (IOperation operation in mOperationSets[key])
77 |             {
78 |                 // "|=" does not short-circuit: every operation runs even after one has succeeded.
79 |                 anySucceeded |= operation.Execute(lemma, partOfSpeech, baseForms);
80 |             }
81 |             return anySucceeded;
82 |         }
83 | 
84 |         #region IOperation Members
85 | 
86 |         /// <summary>
87 |         /// Executes this operation; supplied by each concrete subclass.
88 |         /// </summary>
89 |         public abstract bool Execute(string lemma, string partOfSpeech, List<string> baseForms);
90 | 
91 |         #endregion
92 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Morph/DetachSuffixesOperation.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the DetachSuffixesOperation.java source file found in
18 | //the Java WordNet Library (JWNL).  That source file is licensed under BSD.
19 | 
20 | using System;
21 | using System.Collections.Generic;
22 | using System.Text;
23 | 
24 | namespace CherubNLP.Models.Morph
25 | {
26 |     /// <summary>
27 |     /// Remove all applicable suffixes from the word(s) and do a look-up.
28 |     /// For every suffix rule of the given part of speech whose suffix ends the
29 |     /// lemma, the suffix is replaced by the rule's replacement text and the result
30 |     /// is passed to the delegate set registered under <see cref="Operations"/>.
31 |     /// </summary>
32 |     public class DetachSuffixesOperation : AbstractDelegatingOperation
33 |     {
34 |         /// <summary>Key of the delegate set that is run on each candidate stem.</summary>
35 |         public const string Operations = "operations";
36 | 
37 |         // Part of speech -> suffix rules; each rule is read as { suffix, replacement }.
38 |         private readonly Dictionary<string, string[][]> mSuffixMap;
39 | 
40 |         /// <summary>
41 |         /// Creates the operation from a table of suffix rules.
42 |         /// </summary>
43 |         /// <param name="suffixMap">
44 |         /// Suffix rules keyed by part of speech. Element 0 of a rule is the suffix to
45 |         /// detach and element 1 is the text appended in its place.
46 |         /// </param>
47 |         public DetachSuffixesOperation(Dictionary<string, string[][]> suffixMap)
48 |         {
49 |             mSuffixMap = suffixMap;
50 |         }
51 | 
52 |         #region IOperation Members
53 | 
54 |         /// <summary>
55 |         /// Tries every suffix rule for the part of speech against the lemma.
56 |         /// </summary>
57 |         /// <param name="lemma">Word whose suffixes should be detached.</param>
58 |         /// <param name="partOfSpeech">Part of speech selecting the suffix rules.</param>
59 |         /// <param name="baseForms">List handed on to the delegate operations.</param>
60 |         /// <returns>
61 |         /// True if the delegate set returned true for at least one candidate stem;
62 |         /// false if it never did or if no rules exist for the part of speech.
63 |         /// </returns>
64 |         public override bool Execute(string lemma, string partOfSpeech, List<string> baseForms)
65 |         {
66 |             string[][] suffixRules;
67 |             if (!mSuffixMap.TryGetValue(partOfSpeech, out suffixRules))
68 |             {
69 |                 return false;
70 |             }
71 | 
72 |             bool addedBaseForm = false;
73 |             foreach (string[] rule in suffixRules)
74 |             {
75 |                 string suffix = rule[0];
76 | 
77 |                 // Ordinal comparison guarantees that a match covers exactly suffix.Length
78 |                 // characters, which the Substring below relies on. A culture-sensitive
79 |                 // match can span a different number of characters.
80 |                 if (!lemma.EndsWith(suffix, StringComparison.Ordinal))
81 |                 {
82 |                     continue;
83 |                 }
84 | 
85 |                 string stem = lemma.Substring(0, lemma.Length - suffix.Length) + rule[1];
86 |                 if (ExecuteDelegate(stem, partOfSpeech, baseForms, Operations))
87 |                 {
88 |                     addedBaseForm = true;
89 |                 }
90 |             }
91 |             return addedBaseForm;
92 |         }
93 | 
94 |         #endregion
95 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Morph/IOperation.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the Operation.java source file found in
18 | //the Java WordNet Library (JWNL).  That source file is licensed under BSD.
19 | 
20 | using System;
21 | using System.Collections.Generic;
22 | using System.Text;
23 | 
24 | namespace CherubNLP.Models.Morph
25 | {
26 |     /// <summary>
27 |     /// An operation that looks up a lemma and adds the base forms it discovers
28 |     /// to a list supplied by the caller.
29 |     /// </summary>
30 |     public interface IOperation
31 |     {
32 |         /// <summary>
33 |         /// Runs the operation for one lemma.
34 |         /// </summary>
35 |         /// <param name="lemma">
36 |         /// The input lemma to look up.
37 |         /// </param>
38 |         /// <param name="partOfSpeech">
39 |         /// The part of speech of the lemma to look up.
40 |         /// </param>
41 |         /// <param name="baseForms">
42 |         /// The list that every discovered base form should be added to.
43 |         /// </param>
44 |         /// <returns>
45 |         /// True if the operation discovered at least one base form and added it
46 |         /// to <paramref name="baseForms"/>.
47 |         /// </returns>
48 |         bool Execute(string lemma, string partOfSpeech, List<string> baseForms);
49 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Morph/LookupExceptionsOperation.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the LookupExceptionsOperation.java source file found in
18 | //the Java WordNet Library (JWNL).  That source file is licensed under BSD.
19 | 
20 | using System;
21 | using System.Collections.Generic;
22 | using System.Text;
23 | 
24 | namespace CherubNLP.Models.Morph
25 | {
26 |     /// <summary>
27 |     /// Looks the word up in the exception forms that the engine reports for the
28 |     /// given part of speech.
29 |     /// </summary>
30 |     public class LookupExceptionsOperation : IOperation
31 |     {
32 |         private readonly WordNetEngine mEngine;
33 | 
34 |         /// <summary>
35 |         /// Creates the operation.
36 |         /// </summary>
37 |         /// <param name="engine">Engine that is asked for the exception forms.</param>
38 |         public LookupExceptionsOperation(WordNetEngine engine)
39 |         {
40 |             mEngine = engine;
41 |         }
42 | 
43 |         #region IOperation Members
44 | 
45 |         /// <summary>
46 |         /// Adds each exception form of the lemma that is not yet in
47 |         /// <paramref name="baseForms"/> to that list.
48 |         /// </summary>
49 |         /// <param name="lemma">Lemma to look up.</param>
50 |         /// <param name="partOfSpeech">Part of speech of the lemma.</param>
51 |         /// <param name="baseForms">List that receives the new forms.</param>
52 |         /// <returns>True if at least one form was added to the list.</returns>
53 |         public bool Execute(string lemma, string partOfSpeech, List<string> baseForms)
54 |         {
55 |             string[] candidates = mEngine.GetExceptionForms(lemma, partOfSpeech);
56 |             bool foundNewForm = false;
57 | 
58 |             for (int i = 0; i < candidates.Length; i++)
59 |             {
60 |                 string candidate = candidates[i];
61 |                 if (baseForms.Contains(candidate))
62 |                 {
63 |                     // Already known; keep the list free of duplicates.
64 |                     continue;
65 |                 }
66 | 
67 |                 baseForms.Add(candidate);
68 |                 foundNewForm = true;
69 |             }
70 | 
71 |             return foundNewForm;
72 |         }
73 | 
74 |         #endregion
75 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Morph/LookupIndexWordOperation.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the LookupIndexWordOperation.java source file found in
18 | //the Java WordNet Library (JWNL).  That source file is licensed under BSD.
19 | 
20 | using System;
21 | using System.Collections.Generic;
22 | using System.Text;
23 | 
24 | namespace CherubNLP.Models.Morph
25 | {
26 |     /// <summary>
27 |     /// Accepts the lemma itself as a base form when the engine returns an index
28 |     /// word for it.
29 |     /// </summary>
30 |     public class LookupIndexWordOperation : IOperation
31 |     {
32 |         private readonly WordNetEngine mEngine;
33 | 
34 |         /// <summary>
35 |         /// Creates the operation.
36 |         /// </summary>
37 |         /// <param name="engine">Engine that is asked for the index word.</param>
38 |         public LookupIndexWordOperation(WordNetEngine engine)
39 |         {
40 |             mEngine = engine;
41 |         }
42 | 
43 |         #region IOperation Members
44 | 
45 |         /// <summary>
46 |         /// Adds the lemma to <paramref name="baseForms"/> if it is not already there
47 |         /// and the engine's GetIndexWord returns a non-null result for it.
48 |         /// </summary>
49 |         /// <param name="lemma">Lemma to look up.</param>
50 |         /// <param name="partOfSpeech">Part of speech of the lemma.</param>
51 |         /// <param name="baseForms">List that receives the lemma.</param>
52 |         /// <returns>True if the lemma was added to the list; otherwise false.</returns>
53 |         public bool Execute(string lemma, string partOfSpeech, List<string> baseForms)
54 |         {
55 |             // Checked first so the engine is not queried for a lemma that is already listed.
56 |             if (baseForms.Contains(lemma))
57 |             {
58 |                 return false;
59 |             }
60 | 
61 |             if (mEngine.GetIndexWord(lemma, partOfSpeech) == null)
62 |             {
63 |                 return false;
64 |             }
65 | 
66 |             baseForms.Add(lemma);
67 |             return true;
68 |         }
69 | 
70 |         #endregion
71 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Morph/Util.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | //This file is based on the Util.java source file found in
18 | //the Java WordNet Library (JWNL).  That source file is licensed under BSD.
19 | 
20 | using System;
21 | using System.Collections;
22 | using System.Collections.Generic;
23 | using System.Text;
24 | 
25 | namespace CherubNLP.Models.Morph
26 | {
27 |     /// <summary>
28 |     /// Static helpers for splitting text into tokens and re-joining tokens
29 |     /// under the control of a bit pattern. The class cannot be instantiated.
30 |     /// </summary>
31 |     public class Util
32 |     {
33 |         /// <summary>
34 |         /// Concatenates the tokens. Between token i-1 and token i the delimiter is
35 |         /// inserted when bit i-1 is false; when that bit is true the two tokens are
36 |         /// joined directly.
37 |         /// </summary>
38 |         /// <param name="tokens">Tokens to join, in order.</param>
39 |         /// <param name="bits">Bits read at positions 0 .. tokens.Length - 2.</param>
40 |         /// <param name="delimiter">Text inserted where a bit is false.</param>
41 |         /// <returns>The joined string.</returns>
42 |         public static string GetLemma(string[] tokens, BitArray bits, string delimiter)
43 |         {
44 |             StringBuilder lemma = new StringBuilder();
45 |             int position = 0;
46 |             foreach (string token in tokens)
47 |             {
48 |                 // The first token never gets a leading delimiter, and its bit is never read.
49 |                 bool joinedDirectly = position == 0 || bits.Get(position - 1);
50 |                 if (!joinedDirectly)
51 |                 {
52 |                     lemma.Append(delimiter);
53 |                 }
54 |                 lemma.Append(token);
55 |                 position++;
56 |             }
57 |             return lemma.ToString();
58 |         }
59 | 
60 |         /// <summary>
61 |         /// Advances the first <paramref name="size"/> bits like a counter: scanning
62 |         /// from index size - 1 downwards, set bits are cleared until a clear bit is
63 |         /// found, and that bit is then set.
64 |         /// </summary>
65 |         /// <param name="bits">Bit array to modify in place.</param>
66 |         /// <param name="size">Number of leading bits that take part.</param>
67 |         /// <returns>
68 |         /// True if a bit was set; false if all participating bits were already set,
69 |         /// in which case they have all been cleared.
70 |         /// </returns>
71 |         public static bool Increment(BitArray bits, int size)
72 |         {
73 |             for (int position = size - 1; position >= 0; position--)
74 |             {
75 |                 if (!bits.Get(position))
76 |                 {
77 |                     bits.Set(position, true);
78 |                     return true;
79 |                 }
80 |                 bits.Set(position, false);
81 |             }
82 |             return false;
83 |         }
84 | 
85 |         /// <summary>
86 |         /// Splits a string into tokens made only of the characters 'a'..'z' and the
87 |         /// apostrophe. Every other character (uppercase letters included) ends the
88 |         /// current token and is dropped.
89 |         /// </summary>
90 |         /// <param name="str">Text to split.</param>
91 |         /// <returns>The non-empty tokens in order of appearance.</returns>
92 |         public static string[] Split(string str)
93 |         {
94 |             List<string> tokens = new List<string>();
95 |             StringBuilder current = new StringBuilder();
96 |             foreach (char c in str)
97 |             {
98 |                 bool isTokenChar = c == '\'' || (c >= 'a' && c <= 'z');
99 |                 if (isTokenChar)
100 |                 {
101 |                     current.Append(c);
102 |                     continue;
103 |                 }
104 | 
105 |                 if (current.Length > 0)
106 |                 {
107 |                     tokens.Add(current.ToString());
108 |                     current.Length = 0;
109 |                 }
110 |             }
111 | 
112 |             // Flush a token that runs up to the end of the string.
113 |             if (current.Length > 0)
114 |             {
115 |                 tokens.Add(current.ToString());
116 |             }
117 |             return tokens.ToArray();
118 |         }
119 | 
120 |         // Private constructor: the class only offers static members.
121 |         private Util()
122 |         {
123 |         }
124 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Relation.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | using System;
18 | 
19 | namespace CherubNLP.Models
20 | {
21 | 	/// <summary>
22 | 	/// A typed link to a target synset. The target is identified by its part of
23 | 	/// speech and offset and is only created, through the <see cref="WordNetEngine"/>,
24 | 	/// the first time <see cref="TargetSynset"/> is read.
25 | 	/// </summary>
26 | 	public class Relation
27 | 	{
28 | 		private readonly WordNetEngine _engine;
29 | 		private readonly RelationType _type;
30 | 		private readonly int _targetOffset;
31 | 		private readonly string _targetPartOfSpeech;
32 | 
33 | 		// Stored by the six-argument constructor; never read inside this class.
34 | 		private readonly int _sourceWord;
35 | 		private readonly int _targetWord;
36 | 
37 | 		// Cache behind TargetSynset; null until the engine has returned a synset.
38 | 		private Synset _target;
39 | 
40 | 		/// <summary>Gets the type of this relation.</summary>
41 | 		public RelationType SynsetRelationType => _type;
42 | 
43 | 		/// <summary>Gets the offset of the target synset.</summary>
44 | 		public int TargetSynsetOffset => _targetOffset;
45 | 
46 | 		/// <summary>
47 | 		/// Gets the target synset, asking the engine to create it on first access
48 | 		/// and returning the cached instance afterwards.
49 | 		/// </summary>
50 | 		public Synset TargetSynset
51 | 		{
52 | 			get
53 | 			{
54 | 				if (_target != null)
55 | 				{
56 | 					return _target;
57 | 				}
58 | 				_target = _engine.CreateSynset(_targetPartOfSpeech, _targetOffset);
59 | 				return _target;
60 | 			}
61 | 		}
62 | 
63 | 		private Relation()
64 | 		{
65 | 		}
66 | 
67 | 		/// <summary>
68 | 		/// Creates a relation of the given type to the synset at the given offset.
69 | 		/// </summary>
70 | 		protected internal Relation(WordNetEngine wordNetEngine, RelationType relationType, int targetSynsetOffset, string targetSynsetPartOfSpeech)
71 | 		{
72 | 			_engine = wordNetEngine;
73 | 			_type = relationType;
74 | 			_targetOffset = targetSynsetOffset;
75 | 			_targetPartOfSpeech = targetSynsetPartOfSpeech;
76 | 		}
77 | 
78 | 		/// <summary>As above, additionally storing the source and target word values.</summary>
79 | 		protected internal Relation(WordNetEngine wordNetEngine, RelationType relationType, int targetSynsetOffset, string targetSynsetPartOfSpeech, int sourceWord, int targetWord) : this(wordNetEngine, relationType, targetSynsetOffset, targetSynsetPartOfSpeech)
80 | 		{
81 | 			_sourceWord = sourceWord;
82 | 			_targetWord = targetWord;
83 | 		}
84 | 	}
85 | }
86 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/RelationType.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | using System;
18 | 
19 | namespace CherubNLP.Models
20 | {
21 | 	/// <summary>
22 | 	/// Names a kind of relation and lists the parts of speech it was declared
23 | 	/// with; may also reference the relation type acting as its opposite.
24 | 	/// </summary>
25 | 	public class RelationType
26 | 	{
27 | 		private readonly string _name;
28 | 		private readonly RelationType _opposite;
29 | 		private readonly string[] _partsOfSpeech;
30 | 
31 | 		/// <summary>
32 | 		/// Gets the name given at construction.
33 | 		/// </summary>
34 | 		public string Name => _name;
35 | 
36 | 		/// <summary>
37 | 		/// Gets the opposite relation type, or null when none was supplied.
38 | 		/// </summary>
39 | 		public RelationType Opposite => _opposite;
40 | 
41 | 		/// <summary>
42 | 		/// Gets the number of parts of speech supplied at construction.
43 | 		/// </summary>
44 | 		public int PartsOfSpeechCount => _partsOfSpeech.Length;
45 | 
46 | 		/// <summary>
47 | 		/// Returns the part of speech stored at <paramref name="index"/>.
48 | 		/// </summary>
49 | 		/// <param name="index">Zero-based position in the parts-of-speech array.</param>
50 | 		public string GetPartOfSpeech(int index)
51 | 		{
52 | 			string partOfSpeech = _partsOfSpeech[index];
53 | 			return partOfSpeech;
54 | 		}
55 | 
56 | 		/// <summary>Creates a relation type that has no opposite.</summary>
57 | 		protected internal RelationType(string name, string[] partsOfSpeech)
58 | 			: this(name, null, partsOfSpeech)
59 | 		{
60 | 		}
61 | 
62 | 		/// <summary>
63 | 		/// Creates a relation type together with its opposite.
64 | 		/// </summary>
65 | 		protected internal RelationType(string name, RelationType opposite, string[] partsOfSpeech)
66 | 		{
67 | 			_name = name;
68 | 			_opposite = opposite;
69 | 			_partsOfSpeech = partsOfSpeech;
70 | 		}
71 | 	}
72 | }
73 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Synset.cs:
--------------------------------------------------------------------------------
  1 | //Copyright (C) 2006 Richard J. Northedge
  2 | //
  3 | // This library is free software; you can redistribute it and/or
  4 | // modify it under the terms of the GNU Lesser General Public
  5 | // License as published by the Free Software Foundation; either
  6 | // version 2.1 of the License, or (at your option) any later version.
  7 | //
  8 | // This library is distributed in the hope that it will be useful,
  9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | // GNU Lesser General Public License for more details.
 12 | //
 13 | // You should have received a copy of the GNU Lesser General Public
 14 | // License along with this program; if not, write to the Free Software
 15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 16 | 
 17 | using System;
 18 | 
 19 | namespace CherubNLP.Models
 20 | {
 21 | 	/// <summary>
 22 | 	/// A synset: a list of words sharing one gloss, together with the offset,
 23 | 	/// lexicographer file name and relations supplied at construction.
 24 | 	/// </summary>
 25 | 	public class Synset
 26 | 	{
 27 | 		private readonly int _offset;
 28 | 		private readonly string _gloss;
 29 | 		private readonly string[] _words;
 30 | 		private readonly string _lexicographerFile;
 31 | 		private readonly Relation[] _relations;
 32 | 
 33 | 		private Synset()
 34 | 		{
 35 | 		}
 36 | 
 37 | 		/// <summary>
 38 | 		/// Creates a synset from the given values; the arrays are stored as passed in, not copied.
 39 | 		/// </summary>
 40 | 		internal Synset(int offset, string gloss, string[] wordList, string lexicographerFile, Relation[] relations)
 41 | 		{
 42 | 			_offset = offset;
 43 | 			_gloss = gloss;
 44 | 			_words = wordList;
 45 | 			_lexicographerFile = lexicographerFile;
 46 | 			_relations = relations;
 47 | 		}
 48 | 
 49 | 		/// <summary>
 50 | 		/// Gets the offset supplied at construction.
 51 | 		/// </summary>
 52 | 		public int Offset => _offset;
 53 | 
 54 | 		/// <summary>
 55 | 		/// Gets the gloss text.
 56 | 		/// </summary>
 57 | 		public string Gloss => _gloss;
 58 | 
 59 | 		/// <summary>
 60 | 		/// Gets the lexicographer file value supplied at construction.
 61 | 		/// </summary>
 62 | 		public string LexicographerFile => _lexicographerFile;
 63 | 
 64 | 		/// <summary>
 65 | 		/// Gets the number of words in the synset.
 66 | 		/// </summary>
 67 | 		public int WordCount => _words.Length;
 68 | 
 69 | 		/// <summary>
 70 | 		/// Gets the number of relations held by the synset.
 71 | 		/// </summary>
 72 | 		public int RelationCount => _relations.Length;
 73 | 
 74 | 		/// <summary>
 75 | 		/// Returns the word at <paramref name="wordIndex"/>.
 76 | 		/// </summary>
 77 | 		/// <param name="wordIndex">Zero-based position in the word list.</param>
 78 | 		public string GetWord(int wordIndex)
 79 | 		{
 80 | 			return _words[wordIndex];
 81 | 		}
 82 | 
 83 | 		/// <summary>
 84 | 		/// Returns the relation at <paramref name="relationIndex"/>.
 85 | 		/// </summary>
 86 | 		/// <param name="relationIndex">Zero-based position in the relation list.</param>
 87 | 		public Relation GetRelation(int relationIndex)
 88 | 		{
 89 | 			return _relations[relationIndex];
 90 | 		}
 91 | 
 92 | 		/// <summary>
 93 | 		/// Formats the synset as its words separated by ", ", followed by "  --  " and the gloss.
 94 | 		/// </summary>
 95 | 		public override string ToString()
 96 | 		{
 97 | 			var text = new System.Text.StringBuilder();
 98 | 			for (int position = 0; position < _words.Length; position++)
 99 | 			{
100 | 				// A separator goes in front of every word except the first.
101 | 				if (position > 0)
102 | 				{
103 | 					text.Append(", ");
104 | 				}
105 | 				text.Append(_words[position]);
106 | 			}
107 | 
108 | 			text.Append("  --  ");
109 | 			text.Append(_gloss);
110 | 			return text.ToString();
111 | 		}
112 | 	}
113 | }
114 | 


--------------------------------------------------------------------------------
/CherubNLP/Models/WordNet/Tokenizer.cs:
--------------------------------------------------------------------------------
 1 | //Copyright (C) 2006 Richard J. Northedge
 2 | //
 3 | // This library is free software; you can redistribute it and/or
 4 | // modify it under the terms of the GNU Lesser General Public
 5 | // License as published by the Free Software Foundation; either
 6 | // version 2.1 of the License, or (at your option) any later version.
 7 | //
 8 | // This library is distributed in the hope that it will be useful,
 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11 | // GNU Lesser General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU Lesser General Public
14 | // License along with this program; if not, write to the Free Software
15 | // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16 | 
17 | using System;
18 | 
19 | namespace CherubNLP.Models
20 | {
21 | 	/// <summary>
22 | 	/// Forward-only reader over the non-empty pieces obtained by splitting a
23 | 	/// string on a set of separator characters.
24 | 	/// </summary>
25 | 	public class Tokenizer
26 | 	{
27 | 		private readonly string[] _tokens;
28 | 		private int _position;
29 | 
30 | 		/// <summary>Splits <paramref name="input"/> once, up front.</summary>
31 | 		/// <param name="input">Text to split.</param>
32 | 		/// <param name="separators">Characters that delimit tokens.</param>
33 | 		public Tokenizer(string input, params char[] separators)
34 | 		{
35 | 			// Empty pieces are never handed out, so drop them while splitting.
36 | 			_tokens = input.Split(separators, StringSplitOptions.RemoveEmptyEntries);
37 | 		}
38 | 
39 | 		/// <summary>Returns the next token, or null once all have been consumed.</summary>
40 | 		public string NextToken()
41 | 		{
42 | 			if (_position >= _tokens.Length)
43 | 			{
44 | 				return null;
45 | 			}
46 | 			return _tokens[_position++];
47 | 		}
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/CherubNLP/NER/README.md:
--------------------------------------------------------------------------------
1 | ﻿IOB tagging
2 | 
3 | B-{CHUNK_TYPE} – for the first word of a chunk (Beginning)
4 | I-{CHUNK_TYPE} – for words Inside the chunk
5 | O – Outside any chunk


--------------------------------------------------------------------------------
/CherubNLP/Sentence.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP
 7 | {
 8 |     /// <summary>A piece of text together with its tokens, label and vector.</summary>
 9 |     public class Sentence
10 |     {
11 |         /// <summary>User defined id.</summary>
12 |         public string Id { get; set; }
13 |         /// <summary>Tokens of the sentence.</summary>
14 |         public List<Token> Words { get; set; }
15 |         /// <summary>Label attached to the sentence.</summary>
16 |         public string Label { get; set; }
17 |         /// <summary>Text of the sentence.</summary>
18 |         public string Text { get; set; }
19 |         public double[] Vector { get; set; }
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/CherubNLP/Similarity/Similarity.cs:
--------------------------------------------------------------------------------
 1 | ﻿using FastText.NetWrapper;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.IO;
 5 | using System.Linq;
 6 | using System.Text;
 7 | using Tensorflow;
 8 | using Tensorflow.NumPy;
 9 | using static Tensorflow.Binding;
10 | using static Tensorflow.KerasApi;
11 | 
12 | namespace CherubNLP
13 | {
14 |     /// <summary>
15 |     /// Sentence similarity helpers built on fastText sentence vectors and the
16 |     /// Keras cosine-similarity loss.
17 |     /// </summary>
18 |     public class Similarity
19 |     {
20 |         /// <summary>
21 |         /// Scores <paramref name="src"/> against every entry of <paramref name="dst"/>.
22 |         /// Both sides are lower-cased before being embedded. The fastText model is
23 |         /// loaded on each call and disposed before returning.
24 |         /// </summary>
25 |         /// <param name="src">Reference sentence.</param>
26 |         /// <param name="dst">Sentences to compare with <paramref name="src"/>.</param>
27 |         /// <param name="model">Value passed to <c>LoadModel</c> (presumably the model file path).</param>
28 |         /// <returns>One score per entry of <paramref name="dst"/>, in the same order.</returns>
29 |         public static double[] Cosine(string src, string[] dst, string model)
30 |         {
31 |             using (var fastText = new FastTextWrapper())
32 |             {
33 |                 fastText.LoadModel(model);
34 |                 // Invariant lower-casing keeps the embedded text independent of the
35 |                 // current culture (e.g. the Turkish dotless i).
36 |                 var vector = fastText.GetSentenceVector(src.ToLowerInvariant());
37 |                 return dst.Select(x => CalCosine(vector, fastText.GetSentenceVector(x.ToLowerInvariant()))).ToArray();
38 |             }
39 |         }
40 | 
41 |         /// <summary>
42 |         /// Applies the Keras <c>CosineSimilarity</c> loss (axis 0) to two vectors.
43 |         /// </summary>
44 |         /// <remarks>
45 |         /// NOTE(review): Keras defines this loss as the negative of the cosine
46 |         /// similarity; confirm the sign convention callers expect.
47 |         /// </remarks>
48 |         public static double CalCosine(NDArray vector1, NDArray vector2)
49 |         {
50 |             var cosine_loss = keras.losses.CosineSimilarity(axis: 0);
51 |             return cosine_loss.Call(vector1, vector2).numpy();
52 |         }
53 |     }
32 | }
33 | 


--------------------------------------------------------------------------------
/CherubNLP/Stem/IStemmer.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Stem
 7 | {
 8 |     /// <summary>
 9 |     /// A stemmer removes morphological affixes from words, leaving only the word stem.
10 |     /// For example, a stemmer might reduce "running" to "run".
11 |     /// IStemmer defines a standard interface for stemmers.
12 |     /// </summary>
13 |     public interface IStemmer
14 |     {
15 |         /// <summary>
16 |         /// Strip affixes from the token and return the stem.
17 |         /// </summary>
18 |         /// <param name="word">The word to stem.</param>
19 |         /// <param name="options">Stemming options supplied by the caller.</param>
20 |         /// <returns>The stem of <paramref name="word"/>.</returns>
21 |         string Stem(string word, StemOptions options);
22 |     }
23 | }
24 | 


--------------------------------------------------------------------------------
/CherubNLP/Stem/RegexStemmer.cs:
--------------------------------------------------------------------------------
 1 | ﻿/*
 2 |  * CherubNLP Library
 3 |  * Copyright (C) 2018 Haiping Chen
 4 |  * 
 5 |  * This program is free software: you can redistribute it and/or modify
 6 |  * it under the terms of the GNU General Public License as published by
 7 |  * the Free Software Foundation, either version 3 of the License, or
 8 |  * (at your option) any later version.
 9 |  * 
10 |  * This program is distributed in the hope that it will be useful,
11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 |  * GNU General Public License for more details.
14 |  * 
15 |  * You should have received a copy of the GNU General Public License
16 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 |  */
18 | 
19 | using System;
20 | using System.Collections.Generic;
21 | using System.Linq;
22 | using System.Text;
23 | using System.Text.RegularExpressions;
24 | using CherubNLP.Tokenize;
25 | 
26 | namespace CherubNLP.Stem
27 | {
28 |     /// <summary>
29 |     /// A stemmer that uses regular expressions to identify morphological affixes.
30 |     /// The known suffix found at the end of a word is replaced by its configured
31 |     /// substitute (often the empty string).
32 |     /// </summary>
33 |     public class RegexStemmer : IStemmer
34 |     {
35 |         // Suffix -> replacement. Built by a static initializer so that Stem works
36 |         // even when PATTERN has never been read (the table, pattern and regex used
37 |         // to be created lazily inside the PATTERN getter only).
38 |         static readonly Dictionary<string, string> replacements = new Dictionary<string, string>
39 |         {
40 |             ["nning"] = "n", // running
41 |             ["pping"] = "p", // skipping
42 |             ["tting"] = "t", // putting
43 |             ["able"] = "",
44 |             ["were"] = "be",
45 |             ["sses"] = "ss",
46 |             ["ies"] = "i",
47 |             ["are"] = "be",
48 |             ["ing"] = "",
49 |             ["am"] = "be",
50 |             ["es"] = "",
51 |             ["is"] = "be",
52 |             ["s"] = ""
53 |         };
54 | 
55 |         // Every suffix anchored at the end of the input, joined with '|'.
56 |         // Must stay below 'replacements': static initializers run in textual order.
57 |         static readonly string _pattern = string.Join("$|", replacements.Keys) + "$";
58 | 
59 |         static readonly Regex _regex = new Regex(_pattern);
60 | 
61 |         /// <summary>
62 |         /// The regular expression used by <see cref="Stem"/>.
63 |         /// </summary>
64 |         public static string PATTERN => _pattern;
65 | 
66 |         /// <summary>
67 |         /// Replaces the suffix matched by <see cref="PATTERN"/>, if any.
68 |         /// </summary>
69 |         /// <param name="word">Word to stem.</param>
70 |         /// <param name="options">Not read by this implementation; the built-in pattern is always used.</param>
71 |         /// <returns>The stemmed word, or <paramref name="word"/> unchanged when no suffix matches.</returns>
72 |         public string Stem(string word, StemOptions options)
73 |         {
74 |             Match match = _regex.Match(word);
75 |             if (!match.Success)
76 |             {
77 |                 return word;
78 |             }
79 | 
80 |             return word.Substring(0, match.Index) + replacements[match.Value];
81 |         }
82 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/CherubNLP/Stem/StemOptions.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Stem
 6 | {
 7 |     /// <summary>Options handed to a stemmer together with each word.</summary>
 8 |     public class StemOptions
 9 |     {
10 |         /// <summary>Regex pattern supplied by the caller.</summary>
11 |         /// <value>Null until assigned.</value>
12 |         public string Pattern { get; set; }
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/CherubNLP/Stem/StemmerFactory.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Stem
 7 | {
 8 |     /// <summary>
 9 |     /// CherubNLP Stemmer Factory.
10 |     /// In linguistic morphology and information retrieval, stemming is the process of reducing
11 |     /// inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form.
12 |     /// The factory creates one <typeparamref name="IStem"/> instance and forwards every call to it.
13 |     /// </summary>
14 |     /// <typeparam name="IStem">Stemmer implementation to instantiate.</typeparam>
15 |     public class StemmerFactory<IStem> where IStem : IStemmer, new()
16 |     {
17 |         private readonly SupportedLanguage _lang; // stored only; not read in this class
18 |         private readonly IStem _stemmer;
19 |         private readonly StemOptions _options;
20 | 
21 |         /// <summary>Creates the factory together with its stemmer instance.</summary>
22 |         /// <param name="options">Options handed to the stemmer on every call.</param>
23 |         /// <param name="lang">Language the factory is created for.</param>
24 |         public StemmerFactory(StemOptions options, SupportedLanguage lang)
25 |         {
26 |             _lang = lang;
27 |             _options = options;
28 |             _stemmer = new IStem();
29 |         }
30 | 
31 |         /// <summary>Stems <paramref name="word"/> with the options given at construction.</summary>
32 |         /// <returns>Whatever the underlying stemmer returns for the word.</returns>
33 |         public string Stem(string word) => _stemmer.Stem(word, _options);
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/CherubNLP/SupportedLanguage.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP
 6 | {
 7 |     /// <summary>
 8 |     /// Closed set of languages known to the library, each identified by a
 9 |     /// language tag such as "en" or "zh-CN".
10 |     /// </summary>
11 |     public class SupportedLanguage
12 |     {
13 |         public static readonly SupportedLanguage English = new SupportedLanguage("en");
14 |         public static readonly SupportedLanguage Russian = new SupportedLanguage("ru");
15 |         public static readonly SupportedLanguage German = new SupportedLanguage("de");
16 |         public static readonly SupportedLanguage Portuguese = new SupportedLanguage("pt");
17 |         public static readonly SupportedLanguage PortugueseBrazil = new SupportedLanguage("pt-BR");
18 |         public static readonly SupportedLanguage Spanish = new SupportedLanguage("es");
19 |         public static readonly SupportedLanguage French = new SupportedLanguage("fr");
20 |         public static readonly SupportedLanguage Italian = new SupportedLanguage("it");
21 |         public static readonly SupportedLanguage Dutch = new SupportedLanguage("nl");
22 |         public static readonly SupportedLanguage Japanese = new SupportedLanguage("ja");
23 |         public static readonly SupportedLanguage ChineseChina = new SupportedLanguage("zh-CN");
24 |         public static readonly SupportedLanguage ChineseHongKong = new SupportedLanguage("zh-HK");
25 |         public static readonly SupportedLanguage ChineseTaiwan = new SupportedLanguage("zh-TW");
26 | 
27 |         // Lookup table for FromLanguageTag. It must be declared after the fields
28 |         // above: static initializers run in textual order.
29 |         private static readonly SupportedLanguage[] AllLangs =
30 |         {
31 |             English, Russian, German, Portuguese, PortugueseBrazil,
32 |             Spanish, French, Italian, Dutch, Japanese,
33 |             ChineseChina, ChineseHongKong, ChineseTaiwan
34 |         };
35 | 
36 |         /// <summary>The language tag of this instance.</summary>
37 |         public readonly string code;
38 | 
39 |         private SupportedLanguage(string code)
40 |         {
41 |             this.code = code;
42 |         }
43 | 
44 |         /// <summary>
45 |         /// Finds the language whose tag equals <paramref name="languageTag"/>,
46 |         /// ignoring case.
47 |         /// </summary>
48 |         /// <param name="languageTag">Tag to look up, e.g. "pt-BR".</param>
49 |         /// <returns>The matching language, or <see cref="English"/> when no tag matches.</returns>
50 |         public static SupportedLanguage FromLanguageTag(string languageTag)
51 |         {
52 |             SupportedLanguage found = Array.Find(
53 |                 AllLangs,
54 |                 lang => string.Equals(lang.code, languageTag, StringComparison.OrdinalIgnoreCase));
55 | 
56 |             // Unknown tags fall back to English rather than failing.
57 |             return found ?? English;
58 |         }
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/CherubNLP/Tag/DefaultTagger.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | using CherubNLP.Tokenize;
 5 | 
 6 | namespace CherubNLP.Tag
 7 | {
 8 |     /// <summary>
 9 |     /// The simplest possible tagger assigns the same tag to each token. 
10 |     /// This may seem to be a rather banal step, but it establishes an important baseline for tagger performance. 
11 |     /// In order to get the best result, we tag each word with the most likely tag. 
12 |     /// </summary>
13 |     public class DefaultTagger : ITagger
14 |     {
15 |         /// <summary>
16 |         /// Assigns <see cref="TagOptions.Tag"/> to every word of the sentence.
17 |         /// </summary>
18 |         /// <param name="sentence">Sentence whose words are tagged in place; nothing happens when it or its word list is null.</param>
19 |         /// <param name="options">Supplies the tag to assign; nothing happens when null.</param>
20 |         public void Tag(Sentence sentence, TagOptions options)
21 |         {
22 |             if (sentence?.Words == null || options == null)
23 |             {
24 |                 return;
25 |             }
26 | 
27 |             foreach (var word in sentence.Words)
28 |             {
29 |                 word.Pos = options.Tag;
30 |             }
31 |         }
32 | 
33 |         /// <summary>
34 |         /// Does nothing: the tag comes from the options, so there is nothing to learn.
35 |         /// </summary>
36 |         public void Train(List<Sentence> sentences, TagOptions options)
37 |         {
38 |         }
39 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/CherubNLP/Tag/ITagger.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Text;
 5 | 
 6 | namespace CherubNLP.Tag
 7 | {
 8 |     /// <summary>
 9 |     /// Part-Of-Speech tagging (or POS tagging, for short) is one of the main components of almost any NLP analysis. 
10 |     /// The task of POS-tagging simply implies labelling words with their appropriate Part-Of-Speech (Noun, Verb, Adjective, Adverb, Pronoun, …).
11 |     /// </summary>
12 |     public interface ITagger
13 |     {
14 |         /// <summary>
15 |         /// Trains the tagger on a tagged corpus;
16 |         /// nothing is returned to the caller.
17 |         /// </summary>
18 |         /// <param name="sentences">A tagged corpus. Each item should be a list of tokens.</param>
19 |         /// <param name="options">Tagging options used during training.</param>
20 |         void Train(List<Sentence> sentences, TagOptions options);
21 |         /// <summary>Assigns part-of-speech tags to the words of <paramref name="sentence"/>.</summary>
22 |         void Tag(Sentence sentence, TagOptions options);
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/CherubNLP/Tag/NGramTagger.cs:
--------------------------------------------------------------------------------
  1 | ﻿/*
  2 |  * CherubNLP Library
  3 |  * Copyright (C) 2018 Haiping Chen
  4 |  * 
  5 |  * This program is free software: you can redistribute it and/or modify
  6 |  * it under the terms of the GNU General Public License as published by
  7 |  * the Free Software Foundation, either version 3 of the License, or
  8 |  * (at your option) any later version.
  9 |  * 
 10 |  * This program is distributed in the hope that it will be useful,
 11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  * GNU General Public License for more details.
 14 |  * 
 15 |  * You should have received a copy of the GNU General Public License
 16 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 17 |  */
 18 | 
 19 | using System;
 20 | using System.Collections.Generic;
 21 | using System.IO;
 22 | using System.Linq;
 23 | using System.Text;
 24 | using CherubNLP.Corpus;
 25 | using CherubNLP.Tokenize;
 26 | 
 27 | namespace CherubNLP.Tag
 28 | {
 29 |     /// <summary>
 30 |     /// N-Gram taggers are based on a simple statistical algorithm:
 31 |     /// for each token, assign the tag that was seen most often in training for the
 32 |     /// token's context (the tags of the preceding NGram - 1 tokens plus the token text).
 33 |     /// </summary>
 34 |     public class NGramTagger : ITagger
 35 |     {
 36 |         // Tags seen for each context during training, most frequent first.
 37 |         // Null until Train has run.
 38 |         private ILookup<string, string> _tagsByContext;
 39 | 
 40 |         /// <summary>
 41 |         /// Tags every word of the sentence in place. When no model has been trained
 42 |         /// yet, one is first trained from "conll2000_chunking_train.txt" in the
 43 |         /// "CoNLL" folder under <c>options.CorpusDir</c>.
 44 |         /// </summary>
 45 |         /// <param name="sentence">Sentence to tag.</param>
 46 |         /// <param name="options">N-gram order, fallback tag and corpus location.</param>
 47 |         public void Tag(Sentence sentence, TagOptions options)
 48 |         {
 49 |             // need training to generate model
 50 |             if (_tagsByContext == null)
 51 |             {
 52 |                 var corpus = new CoNLLReader().Read(new ReaderOptions
 53 |                 {
 54 |                     DataDir = Path.Combine(options.CorpusDir, "CoNLL"),
 55 |                     FileName = "conll2000_chunking_train.txt"
 56 |                 });
 57 | 
 58 |                 Train(corpus, options);
 59 |             }
 60 | 
 61 |             Fill(sentence, options);
 62 |             try
 63 |             {
 64 |                 for (int pos = options.NGram - 1; pos < sentence.Words.Count; pos++)
 65 |                 {
 66 |                     // Build the context once per token rather than once per model entry.
 67 |                     string context = GetContext(pos, sentence.Words, options);
 68 | 
 69 |                     // Most frequent tag for this context, or the default tag when unseen.
 70 |                     sentence.Words[pos].Pos = _tagsByContext[context].FirstOrDefault() ?? options.Tag;
 71 |                 }
 72 |             }
 73 |             finally
 74 |             {
 75 |                 Strip(sentence, options);
 76 |             }
 77 |         }
 78 | 
 79 |         /// <summary>
 80 |         /// Counts how often each tag occurs with each context in the corpus and keeps,
 81 |         /// per context, the tags ordered by descending frequency. Any previously
 82 |         /// trained model is replaced.
 83 |         /// </summary>
 84 |         /// <param name="sentences">A tagged corpus. Padding added while counting is removed again before returning.</param>
 85 |         /// <param name="options">Supplies the n-gram order and the tag used for padding tokens.</param>
 86 |         public void Train(List<Sentence> sentences, TagOptions options)
 87 |         {
 88 |             var cache = new List<NGramFreq>();
 89 | 
 90 |             foreach (var sent in sentences)
 91 |             {
 92 |                 Fill(sent, options);
 93 |                 try
 94 |                 {
 95 |                     for (int pos = options.NGram - 1; pos < sent.Words.Count; pos++)
 96 |                     {
 97 |                         cache.Add(new NGramFreq
 98 |                         {
 99 |                             Context = GetContext(pos, sent.Words, options),
100 |                             Tag = sent.Words[pos].Pos,
101 |                             Count = 1
102 |                         });
103 |                     }
104 |                 }
105 |                 finally
106 |                 {
107 |                     // Take the padding out again so the caller's corpus is left as it
108 |                     // was and a second Train call does not pad the sentences twice.
109 |                     Strip(sent, options);
110 |                 }
111 |             }
112 | 
113 |             // ToLookup keeps the source order inside each key, so the first tag of
114 |             // a context is the one with the highest count.
115 |             _tagsByContext = (from c in cache
116 |                               group c by new { c.Context, c.Tag } into g
117 |                               select new NGramFreq
118 |                               {
119 |                                   Context = g.Key.Context,
120 |                                   Tag = g.Key.Tag,
121 |                                   Count = g.Count()
122 |                               }).OrderByDescending(x => x.Count)
123 |                               .ToLookup(x => x.Context, x => x.Tag);
124 |         }
125 | 
126 |         // Builds the lookup key for the word at 'pos': the tags of the NGram - 1
127 |         // preceding words, each followed by a space, in front of the word's text.
128 |         private string GetContext(int pos, List<Token> words, TagOptions options)
129 |         {
130 |             string context = words[pos].Text;
131 |             for (int ngram = options.NGram - 1; ngram > 0; ngram--)
132 |             {
133 |                 context = words[pos - ngram].Pos + " " + context;
134 |             }
135 | 
136 |             return context;
137 |         }
138 | 
139 |         // Prepends NGram - 1 "NIL" tokens carrying the default tag so that the first
140 |         // real word has a full context.
141 |         private void Fill(Sentence sent, TagOptions options)
142 |         {
143 |             for (int ngram = 1; ngram < options.NGram; ngram++)
144 |             {
145 |                 sent.Words.Insert(0, new Token { Text = "NIL", Pos = options.Tag, Start = (ngram - 1) * 3 });
146 |             }
147 |         }
148 | 
149 |         // Removes the tokens that Fill prepended.
150 |         private void Strip(Sentence sent, TagOptions options)
151 |         {
152 |             int padding = options.NGram - 1;
153 |             if (padding > 0)
154 |             {
155 |                 sent.Words.RemoveRange(0, padding);
156 |             }
157 |         }
158 | 
159 |         private class NGramFreq
160 |         {
161 |             /// <summary>
162 |             /// Current token tag
163 |             /// </summary>
164 |             public string Tag { get; set; }
165 | 
166 |             /// <summary>
167 |             /// Occurrence frequency
168 |             /// </summary>
169 |             public int Count { get; set; }
170 | 
171 |             /// <summary>
172 |             /// Lookup key produced by GetContext
173 |             /// </summary>
174 |             public string Context { get; set; }
175 |         }
176 |     }
138 | }
139 | 


--------------------------------------------------------------------------------
/CherubNLP/Tag/TagOptions.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Tag
 6 | {
 7 |     /// <summary>
 8 |     /// Settings shared by the taggers.
 9 |     /// </summary>
10 |     public class TagOptions
11 |     {
12 |         /// <summary>
13 |         /// Display some stats, if requested.
14 |         /// </summary>
15 |         public bool Verbose { get; set; }
16 | 
17 |         /// <summary>
18 |         /// Default tag ("NN" unless changed). NGramTagger falls back to it for
19 |         /// unseen contexts and gives it to its padding tokens.
20 |         /// </summary>
21 |         public string Tag { get; set; } = "NN";
22 | 
23 |         /// <summary>
24 |         /// N-Gram number; defaults to 1.
25 |         /// </summary>
26 |         public int NGram { get; set; } = 1;
27 | 
28 |         /// <summary>
29 |         /// Corpus root directory; NGramTagger reads its training file from the "CoNLL" sub-folder.
30 |         /// </summary>
31 |         public string CorpusDir { get; set; }
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/CherubNLP/Tag/TaggerFactory.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Linq;
 4 | using System.Reflection;
 5 | using System.Text;
 6 | 
 7 | namespace CherubNLP.Tag
 8 | {
 9 |     /// <summary>
10 |     /// Creates taggers, either from a type argument or by class name, remembers the
11 |     /// most recently created one and runs it with the options given at construction.
12 |     /// </summary>
13 |     public class TaggerFactory
14 |     {
15 |         private SupportedLanguage _lang;
16 | 
17 |         private ITagger _tagger;
18 | 
19 |         private TagOptions _options;
20 | 
21 |         /// <summary>Stores the options and language; no tagger is created yet.</summary>
22 |         public TaggerFactory(TagOptions options, SupportedLanguage lang)
23 |         {
24 |             _lang = lang;
25 |             _options = options;
26 |         }
27 | 
28 |         /// <summary>Creates a tagger of the given type and makes it the current one.</summary>
29 |         public ITagger GetTagger<ITag>() where ITag : ITagger, new()
30 |         {
31 |             return _tagger = new ITag();
32 |         }
33 | 
34 |         /// <summary>
35 |         /// Creates the tagger whose class name equals <paramref name="name"/> and
36 |         /// makes it the current one.
37 |         /// </summary>
38 |         /// <param name="name">Simple class name of a non-abstract <see cref="ITagger"/> in this library's assembly.</param>
39 |         /// <returns>The newly created tagger.</returns>
40 |         /// <exception cref="ArgumentException">No matching tagger type exists.</exception>
41 |         public ITagger GetTagger(string name)
42 |         {
43 |             // Only concrete ITagger implementations are candidates, so an unrelated
44 |             // type that merely shares the name can no longer be picked.
45 |             Type type = typeof(ITagger).Assembly
46 |                 .GetTypes()
47 |                 .FirstOrDefault(x => !x.IsAbstract && typeof(ITagger).IsAssignableFrom(x) && x.Name == name);
48 | 
49 |             if (type == null)
50 |             {
51 |                 throw new ArgumentException($"No tagger named '{name}' was found.", nameof(name));
52 |             }
53 | 
54 |             return _tagger = (ITagger)Activator.CreateInstance(type);
55 |         }
56 | 
57 |         /// <summary>Tags the sentence with the current tagger.</summary>
58 |         /// <exception cref="InvalidOperationException">No tagger has been created with GetTagger yet.</exception>
59 |         public void Tag(Sentence sentence)
60 |         {
61 |             if (_tagger == null)
62 |             {
63 |                 throw new InvalidOperationException("No tagger selected; call GetTagger before Tag.");
64 |             }
65 | 
66 |             _tagger.Tag(sentence, _options);
67 |         }
68 |     }
47 | 


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/ITokenizer.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Tokenize
 6 | {
 7 |     /// <summary>
 8 |     /// A tokenizer is a component used for dividing text into tokens. 
 9 |     /// A tokenizer is language specific and takes into account the peculiarities of the language, e.g. don’t in English is tokenized as two tokens.
10 |     /// </summary>
11 |     public interface ITokenizer
12 |     {
13 |         /// <summary>
14 |         /// Splits a sentence into tokens.
15 |         /// </summary>
16 |         /// <param name="sentence">input sentence</param>
17 |         /// <param name="options">Options such as: regex expression</param>
18 |         /// <returns>The tokens found in <paramref name="sentence"/>.</returns>
19 |         List<Token> Tokenize(string sentence, TokenizationOptions options);
20 |     }
22 | 


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/README.rst:
--------------------------------------------------------------------------------
1 | ﻿


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/RegexTokenizer.cs:
--------------------------------------------------------------------------------
  1 | ﻿/*
  2 |  * CherubNLP Library
  3 |  * Copyright (C) 2018 Haiping Chen
  4 |  * 
  5 |  * This program is free software: you can redistribute it and/or modify
  6 |  * it under the terms of the GNU General Public License as published by
  7 |  * the Free Software Foundation, either version 3 of the License, or
  8 |  * (at your option) any later version.
  9 |  * 
 10 |  * This program is distributed in the hope that it will be useful,
 11 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  * GNU General Public License for more details.
 14 |  * 
 15 |  * You should have received a copy of the GNU General Public License
 16 |  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 17 |  */
 18 | 
 19 | using System;
 20 | using System.Collections.Generic;
 21 | using System.Linq;
 22 | using System.Text;
 23 | using System.Text.RegularExpressions;
 24 | 
 25 | namespace CherubNLP.Tokenize
 26 | {
 27 |     /// <summary>
 28 |     /// Regular-expression tokenizer.
 29 |     /// The pattern either describes the tokens themselves or, for <see cref="WHITE_SPACE"/> and
 30 |     /// <see cref="BLANK_LINE"/>, the gaps that separate tokens.
 31 |     /// </summary>
 32 |     public class RegexTokenizer : ITokenizer
 33 |     {
 34 |         /// <summary>
 35 |         /// Tokenize a text into a sequence of alphabetic and non-alphabetic characters
 36 |         /// </summary>
 37 |         public const string WORD_PUNC = @"[^\w\s]+|\w+";
 38 | 
 39 |         /// <summary>
 40 |         /// Tokenize a string, treating any sequence of blank lines as a delimiter.
 41 |         /// Blank lines are defined as lines containing no characters, except for space or tab characters.
 42 |         /// options.IsGap = true
 43 |         /// </summary>
 44 |         public const string BLANK_LINE = @"\s*\n\s*\n\s*";
 45 | 
 46 |         /// <summary>
 47 |         /// Tokenize a string on whitespace (space, tab, newline).
 48 |         /// In general, users should use the string ``split()`` method instead.
 49 |         /// options.IsGap = true
 50 |         /// </summary>
 51 |         public const string WHITE_SPACE = @"\s+";
 52 | 
 53 |         // Patterns whose matches are separators between tokens rather than tokens.
 54 |         private static readonly string[] GapPatterns = { WHITE_SPACE, BLANK_LINE };
 55 | 
 56 |         /// <summary>
 57 |         /// Tokenizes <paramref name="sentence"/> with the pattern in <paramref name="options"/>.
 58 |         /// </summary>
 59 |         /// <remarks>
 60 |         /// When <c>options.SpecialWords</c> is set, a space is inserted before every occurrence of each
 61 |         /// special word and the words are prepended to the pattern as alternatives; the reported
 62 |         /// <c>Start</c> values are then shifted back to compensate for the inserted spaces.
 63 |         /// <c>options.IsGap</c> is overwritten by this method: it becomes true only when the effective
 64 |         /// pattern is exactly <see cref="WHITE_SPACE"/> or <see cref="BLANK_LINE"/>.
 65 |         /// In gap mode, empty tokens are dropped when <c>options.IgnoreEmpty</c> is true.
 66 |         /// </remarks>
 67 |         /// <param name="sentence">Text to tokenize.</param>
 68 |         /// <param name="options">Pattern, special words and gap settings.</param>
 69 |         /// <returns>Tokens with <c>Text</c> and <c>Start</c> filled in.</returns>
 70 |         /// <exception cref="ArgumentNullException"><paramref name="sentence"/> or <paramref name="options"/> is null.</exception>
 71 |         public List<Token> Tokenize(string sentence, TokenizationOptions options)
 72 |         {
 73 |             if (sentence == null)
 74 |             {
 75 |                 throw new ArgumentNullException(nameof(sentence));
 76 |             }
 77 | 
 78 |             if (options == null)
 79 |             {
 80 |                 throw new ArgumentNullException(nameof(options));
 81 |             }
 82 | 
 83 |             string pattern = options.Pattern;
 84 |             if (options.SpecialWords != null)
 85 |             {
 86 |                 foreach (string word in options.SpecialWords)
 87 |                 {
 88 |                     sentence = Regex.Replace(sentence, word, " " + word);
 89 |                 }
 90 | 
 91 |                 pattern = String.Join("|", options.SpecialWords) + "|" + pattern;
 92 |             }
 93 | 
 94 |             // The regex is kept in a local: the same tokenizer instance is called from
 95 |             // several threads by TokenizerFactory, so per-call state must not live in a field.
 96 |             var regex = new Regex(pattern);
 97 |             Match[] matches = regex.Matches(sentence).Cast<Match>().ToArray();
 98 | 
 99 |             bool isGap = GapPatterns.Contains(pattern);
100 |             options.IsGap = isGap;
101 | 
102 |             return isGap
103 |                 ? SplitOnGaps(sentence, matches, options.IgnoreEmpty)
104 |                 : CollectMatches(matches, options.SpecialWords);
105 |         }
106 | 
107 |         // Builds the tokens lying between the separator matches.
108 |         private static List<Token> SplitOnGaps(string sentence, Match[] gaps, bool ignoreEmpty)
109 |         {
110 |             var tokens = new List<Token>(gaps.Length + 1);
111 |             int pos = 0;
112 | 
113 |             for (int i = 0; i <= gaps.Length; i++)
114 |             {
115 |                 int end = i < gaps.Length ? gaps[i].Index : sentence.Length;
116 |                 string raw = sentence.Substring(pos, end - pos);
117 |                 string text = raw.Trim();
118 | 
119 |                 // Trim() may drop leading whitespace; move Start forward by the same amount
120 |                 // so that it points at the first character of Text.
121 |                 int start = text.Length == 0 ? pos : pos + (raw.Length - raw.TrimStart().Length);
122 | 
123 |                 if (!ignoreEmpty || text.Length > 0)
124 |                 {
125 |                     tokens.Add(new Token { Text = text, Start = start });
126 |                 }
127 | 
128 |                 if (i < gaps.Length)
129 |                 {
130 |                     // Skip the whole separator, not just its first character.
131 |                     pos = gaps[i].Index + gaps[i].Length;
132 |                 }
133 |             }
134 | 
135 |             return tokens;
136 |         }
137 | 
138 |         // Turns every match into a token and undoes the position shift caused by the spaces
139 |         // inserted before special words.
140 |         private static List<Token> CollectMatches(Match[] matches, List<string> specialWords)
141 |         {
142 |             List<Token> tokens = matches
143 |                 .Select(x => new Token { Text = x.Value, Start = x.Index })
144 |                 .ToList();
145 | 
146 |             if (specialWords != null)
147 |             {
148 |                 // Assumes exactly one space was inserted per special-word token seen so far.
149 |                 int offset = 0;
150 |                 foreach (Token token in tokens)
151 |                 {
152 |                     if (specialWords.Contains(token.Text))
153 |                     {
154 |                         offset++;
155 |                     }
156 | 
157 |                     token.Start = token.Start - offset;
158 |                 }
159 |             }
160 | 
161 |             return tokens;
162 |         }
163 |     }
124 | }
125 | 


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/Token.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | using System.Text.RegularExpressions;
 5 | 
 6 | namespace CherubNLP.Tokenize
 7 | {
 8 |     /// <summary>
 9 |     /// A single token of a sentence together with its position and annotations.
10 |     /// </summary>
11 |     public class Token
12 |     {
13 |         // Whole-string match: ASCII letters only, or characters in U+4E00..U+9FA5 only.
14 |         // The alternation is grouped so that both anchors apply to both branches; without
15 |         // the group, "^[a-zA-Z]+" alone accepted any text that merely starts with a letter.
16 |         private static readonly Regex AlphaPattern = new Regex(@"^(?:[a-zA-Z]+|[\u4e00-\u9fa5]+)$");
17 | 
18 |         /// <summary>
19 |         /// The original word text.
20 |         /// </summary>
21 |         public string Text { get; set; }
22 | 
23 |         /// <summary>
24 |         /// The offset of word
25 |         /// </summary>
26 |         public int Start { get; set; }
27 | 
28 |         /// <summary>
29 |         /// The simple part-of-speech tag.
30 |         /// Not widely used, Tag is more general.
31 |         /// </summary>
32 |         public string Pos { get; set; }
33 | 
34 |         /// <summary>
35 |         /// The detailed part-of-speech tag.
36 |         /// https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
37 |         /// </summary>
38 |         public string Tag { get; set; }
39 | 
40 |         /// <summary>
41 |         /// The base form of the word.
42 |         /// </summary>
43 |         public string Lemma { get; set; }
44 | 
45 |         /// <summary>
46 |         /// The word shape – capitalisation, punctuation, digits.
47 |         /// </summary>
48 |         public string Shape { get; set; }
49 | 
50 |         /// <summary>
51 |         /// True when <see cref="Text"/> consists entirely of ASCII letters or entirely of
52 |         /// characters in the range U+4E00..U+9FA5. False for a null <see cref="Text"/>.
53 |         /// </summary>
54 |         public bool IsAlpha
55 |         {
56 |             get
57 |             {
58 |                 return Text != null && AlphaPattern.IsMatch(Text);
59 |             }
60 |         }
61 | 
62 |         /// <summary>
63 |         /// Is the token part of a stop list, i.e. the most common words of the language?
64 |         /// </summary>
65 |         public bool IsStop { get; set; }
66 | 
67 |         /// <summary>
68 |         /// Offset just past the last character of the token: <see cref="Start"/> plus the text length
69 |         /// (a null <see cref="Text"/> counts as length 0).
70 |         /// </summary>
71 |         public int End
72 |         {
73 |             get
74 |             {
75 |                 return Start + (Text == null ? 0 : Text.Length);
76 |             }
77 |         }
78 | 
79 |         /// <summary>
80 |         /// Formats the token as "Text Start Pos".
81 |         /// </summary>
82 |         public override string ToString()
83 |         {
84 |             return $"{Text} {Start} {Pos}";
85 |         }
86 | 
87 |         public double Vector { get; set; }
88 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/TokenizationOptions.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Tokenize
 6 | {
 7 |     public class TokenizationOptions
 8 |     {
 9 |         /// <summary>
10 |         /// Regular-expression pattern used by the tokenizer.
11 |         /// </summary>
12 |         public string Pattern { get; set; }
13 | 
14 |         /// <summary>
15 |         /// True if this tokenizer's pattern should be used to find separators between tokens;
16 |         /// False if this tokenizer's pattern should be used to find the tokens themselves. RegexTokenizer overwrites this value based on the pattern.
17 |         /// </summary>
18 |         public bool IsGap { get; set; }
19 | 
20 |         /// <summary>
21 |         /// True if any empty tokens generated by the tokenizer should be discarded.
22 |         /// Empty tokens can only be generated if <c>IsGap</c> is true.
23 |         /// </summary>
24 |         public bool IgnoreEmpty { get; set; }
25 | 
26 |         /// <summary>
27 |         /// Words to split off as their own tokens, e.g. "n't" so that "isn't" becomes "is", "n't".
28 |         /// </summary>
29 |         public List<string> SpecialWords { get; set; }
30 | 
31 |         /// <summary>
32 |         /// Convert bracket-like characters to avoid confusion with parse brackets.
33 |         /// </summary>
34 |         public bool ConvertParentheses { get; set; }
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/TokenizerBase.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | 
 5 | namespace CherubNLP.Tokenize
 6 | {
 7 |     /// <summary>
 8 |     /// Base class with helpers shared by tokenizer implementations.
 9 |     /// </summary>
10 |     public abstract class TokenizerBase
11 |     {
12 |         /// <summary>
13 |         /// Recomputes <c>Start</c> for every token by locating the token text in
14 |         /// <paramref name="sentence"/>, searching onwards from the end of the previously located token.
15 |         /// </summary>
16 |         /// <param name="sentence">Text in which the token texts are searched.</param>
17 |         /// <param name="tokens">Tokens whose <c>Start</c> is overwritten; expected to be in sentence order.</param>
18 |         protected void CorrectTokenPosition(string sentence, List<Token> tokens)
19 |         {
20 |             int searchFrom = 0;
21 | 
22 |             foreach (Token token in tokens)
23 |             {
24 |                 // Ordinal search: offsets must be exact character positions. The culture-sensitive
25 |                 // default overload can ignore characters or fail to find "\n" on ICU-based runtimes.
26 |                 int found = sentence.IndexOf(token.Text, searchFrom, StringComparison.Ordinal);
27 |                 token.Start = found;
28 | 
29 |                 // A token that is not present keeps Start = -1 as before, but it must not pull the
30 |                 // search cursor backwards (or to -1, which would make the next IndexOf call throw).
31 |                 if (found >= 0)
32 |                 {
33 |                     searchFrom = token.End;
34 |                 }
35 |             }
36 |         }
37 |     }
22 | }
23 | 


--------------------------------------------------------------------------------
/CherubNLP/Tokenize/TokenizerFactory.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Linq;
 4 | using System.Reflection;
 5 | using System.Text;
 6 | using System.Threading.Tasks;
 7 | 
 8 | namespace CherubNLP.Tokenize
 9 | {
10 |     /// <summary>
11 |     /// CherubNLP Tokenizer Factory
12 |     /// Tokenizers divide strings into lists of substrings.
13 |     /// Any class implementing <see cref="ITokenizer"/> can be selected. CherubNLP also provides a simpler,
14 |     /// regular-expression based tokenizer, which splits text on whitespace and punctuation.
15 |     /// </summary>
16 |     public class TokenizerFactory
17 |     {
18 |         private SupportedLanguage _lang;
19 | 
20 |         private ITokenizer _tokenizer;
21 | 
22 |         private TokenizationOptions _options;
23 | 
24 |         /// <summary>
25 |         /// Creates a tokenizer of the given type, stores it as the active tokenizer and returns it.
26 |         /// </summary>
27 |         /// <typeparam name="TTokenizer">Tokenizer type with a public parameterless constructor.</typeparam>
28 |         public ITokenizer GetTokenizer<TTokenizer>() where TTokenizer : ITokenizer, new()
29 |         {
30 |             return _tokenizer = new TTokenizer();
31 |         }
32 | 
33 |         /// <summary>
34 |         /// Creates the tokenizer whose class name equals <paramref name="name"/>, stores it as the active tokenizer and returns it.
35 |         /// </summary>
36 |         /// <param name="name">Unqualified class name of a non-abstract <see cref="ITokenizer"/> implementation in the CherubNLP assembly.</param>
37 |         /// <exception cref="ArgumentException">No tokenizer type with that name exists in the CherubNLP assembly.</exception>
38 |         public ITokenizer GetTokenizer(string name)
39 |         {
40 |             // Only concrete ITokenizer implementations are candidates, so an unrelated type with
41 |             // the same name cannot be picked and fail the cast below.
42 |             Type type = Assembly.Load(new AssemblyName("CherubNLP"))
43 |                 .GetTypes()
44 |                 .FirstOrDefault(x => !x.IsAbstract && typeof(ITokenizer).IsAssignableFrom(x) && x.Name == name);
45 | 
46 |             if (type == null)
47 |             {
48 |                 throw new ArgumentException($"No tokenizer named '{name}' was found in assembly CherubNLP.", nameof(name));
49 |             }
50 | 
51 |             return _tokenizer = (ITokenizer)Activator.CreateInstance(type);
52 |         }
53 | 
54 |         /// <summary>
55 |         /// Creates a factory that tokenizes with the given options.
56 |         /// </summary>
57 |         /// <param name="options">Options passed to the tokenizer on every call.</param>
58 |         /// <param name="lang">Language setting stored by the factory.</param>
59 |         public TokenizerFactory(TokenizationOptions options, SupportedLanguage lang)
60 |         {
61 |             _lang = lang;
62 |             _options = options;
63 |         }
64 | 
65 |         /// <summary>
66 |         /// Tokenizes one sentence with the active tokenizer and sets each token's <c>Lemma</c>
67 |         /// to the lower-cased token text.
68 |         /// </summary>
69 |         /// <exception cref="InvalidOperationException">No tokenizer has been selected with GetTokenizer.</exception>
70 |         public List<Token> Tokenize(string sentence)
71 |         {
72 |             EnsureTokenizerSelected();
73 | 
74 |             List<Token> tokens = _tokenizer.Tokenize(sentence, _options);
75 | 
76 |             foreach (Token token in tokens)
77 |             {
78 |                 // Invariant casing keeps lemmas identical on every machine; culture-sensitive
79 |                 // ToLower maps "I" to a dotless i under Turkish cultures.
80 |                 token.Lemma = token.Text.ToLowerInvariant();
81 |             }
82 | 
83 |             return tokens;
84 |         }
85 | 
86 |         /// <summary>
87 |         /// Wraps every string in a <see cref="Sentence"/> and tokenizes the sentences in parallel.
88 |         /// </summary>
89 |         /// <exception cref="InvalidOperationException">No tokenizer has been selected with GetTokenizer.</exception>
90 |         public List<Sentence> Tokenize(List<String> sentences)
91 |         {
92 |             // Checked up front so the error is not wrapped in an AggregateException by the parallel loop.
93 |             EnsureTokenizerSelected();
94 | 
95 |             var sents = sentences.Select(s => new Sentence { Text = s }).ToList();
96 | 
97 |             Parallel.ForEach(sents, (sentence) =>
98 |             {
99 |                 // Tokenize(string) already fills in Lemma; no second pass is needed.
100 |                 sentence.Words = Tokenize(sentence.Text);
101 |             });
102 | 
103 |             return sents;
104 |         }
105 | 
106 |         // Fails with a descriptive error instead of a NullReferenceException when GetTokenizer was never called.
107 |         private void EnsureTokenizerSelected()
108 |         {
109 |             if (_tokenizer == null)
110 |             {
111 |                 throw new InvalidOperationException("No tokenizer selected. Call GetTokenizer before Tokenize.");
112 |             }
113 |         }
114 |     }
75 | }
76 | 


--------------------------------------------------------------------------------
/CherubNLP/Txt2Vec/OneHotEncoder.cs:
--------------------------------------------------------------------------------
 1 | ﻿using CherubNLP.Tokenize;
 2 | using System;
 3 | using System.Collections.Generic;
 4 | using System.Linq;
 5 | using System.Text;
 6 | using System.Threading.Tasks;
 7 | 
 8 | namespace CherubNLP.Txt2Vec
 9 | {
10 |     /// <summary>
11 |     /// A one hot encoding is a representation of categorical variables as binary vectors.
12 |     /// Each integer value is represented as a binary vector that is all zero values except the index of the integer, which is marked with a 1.
13 |     /// </summary>
14 |     public class OneHotEncoder
15 |     {
16 |         /// <summary>
17 |         /// Sentences used to build the dictionary and encoded by <see cref="EncodeAll"/>.
18 |         /// </summary>
19 |         public List<Sentence> Sentences { get; set; }
20 | 
21 |         /// <summary>
22 |         /// The dictionary: position i of a sentence vector corresponds to <c>Words[i]</c>.
23 |         /// Built from <see cref="Sentences"/> on first use when left null.
24 |         /// </summary>
25 |         public List<string> Words { get; set; }
26 | 
27 |         /// <summary>
28 |         /// Sets <c>sentence.Vector</c> to a vector with one slot per dictionary word, holding 1 for
29 |         /// every dictionary word whose lemma occurs in the sentence and 0 elsewhere.
30 |         /// </summary>
31 |         /// <param name="sentence">Tokenized sentence to encode.</param>
32 |         public void Encode(Sentence sentence)
33 |         {
34 |             InitDictionary();
35 | 
36 |             var vector = new double[Words.Count];
37 | 
38 |             foreach (var word in sentence.Words)
39 |             {
40 |                 int index = Words.IndexOf(word.Lemma);
41 | 
42 |                 // IndexOf returns -1 for a missing word; 0 is the valid position of the first
43 |                 // dictionary word and must be encoded as well.
44 |                 if (index >= 0)
45 |                 {
46 |                     vector[index] = 1;
47 |                 }
48 |             }
49 | 
50 |             sentence.Vector = vector;
51 |         }
52 | 
53 |         /// <summary>
54 |         /// Encodes every sentence in <see cref="Sentences"/>.
55 |         /// </summary>
56 |         /// <returns>The dictionary the vectors are indexed by.</returns>
57 |         public List<string> EncodeAll()
58 |         {
59 |             InitDictionary();
60 | 
61 |             foreach (var sent in Sentences)
62 |             {
63 |                 Encode(sent);
64 |             }
65 | 
66 |             return Words;
67 |         }
68 | 
69 |         // Builds the sorted list of distinct lemmas of all alphabetic tokens, unless a dictionary already exists.
70 |         private List<string> InitDictionary()
71 |         {
72 |             if (Words == null)
73 |             {
74 |                 if (Sentences == null)
75 |                 {
76 |                     throw new InvalidOperationException("Set Sentences or Words before encoding.");
77 |                 }
78 | 
79 |                 Words = Sentences
80 |                     .SelectMany(x => x.Words.Where(w => w.IsAlpha).Select(w => w.Lemma))
81 |                     .Distinct()
82 |                     .OrderBy(x => x)
83 |                     .ToList();
84 |             }
85 | 
86 |             return Words;
87 |         }
88 |     }
63 | }
64 | 


--------------------------------------------------------------------------------
/CherubNLP/Txt2Vec/Shrink.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using System.Linq;
 4 | using System.Text;
 5 | using System.Threading.Tasks;
 6 | using System.IO;
 7 | 
 8 | namespace Txt2Vec
 9 | {
10 |     /// <summary>
11 |     /// Reduces a binary word-vector model to the terms listed in a dictionary file.
12 |     /// </summary>
13 |     public class Shrink
14 |     {
15 |         /// <summary>
16 |         /// Copies the model at <paramref name="strModelFileName"/> to <paramref name="strNewModelFileName"/>,
17 |         /// keeping only the terms found in the first tab-separated column of <paramref name="strDictFileName"/>.
18 |         /// </summary>
19 |         /// <remarks>
20 |         /// Model layout as read and written here: three Int32 values (word count, vector size, VQ size)
21 |         /// followed, per word, by a length-prefixed string and "vector size" Single values.
22 |         /// Models with a non-zero VQ size are not supported; the method then returns without writing any output.
23 |         /// </remarks>
24 |         /// <param name="strModelFileName">Path of the model to read.</param>
25 |         /// <param name="strNewModelFileName">Path of the reduced model to create (overwritten if present).</param>
26 |         /// <param name="strDictFileName">Path of the text file listing one term per line.</param>
27 |         public void Run(string strModelFileName, string strNewModelFileName, string strDictFileName)
28 |         {
29 |             HashSet<string> setTerm = LoadTerms(strDictFileName);
30 | 
31 |             // Only the kept vectors are held in memory, instead of one array sized for the whole
32 |             // vocabulary (whose length computation could also overflow Int32).
33 |             var termList = new List<string>();
34 |             var vectorList = new List<float[]>();
35 |             int size;
36 |             int vqSize;
37 | 
38 |             // "using" guarantees the model file is released on the early return and on exceptions.
39 |             using (var input = new FileStream(strModelFileName, FileMode.Open, FileAccess.Read, FileShare.Read))
40 |             using (var br = new BinaryReader(input))
41 |             {
42 |                 int words = br.ReadInt32();
43 |                 size = br.ReadInt32();
44 |                 vqSize = br.ReadInt32();
45 | 
46 |                 if (vqSize != 0)
47 |                 {
48 |                     // Vector-quantization models are not supported; nothing is written.
49 |                     return;
50 |                 }
51 | 
52 |                 for (int b = 0; b < words; b++)
53 |                 {
54 |                     string strTerm = br.ReadString();
55 | 
56 |                     if (setTerm.Contains(strTerm))
57 |                     {
58 |                         var vector = new float[size];
59 |                         for (int a = 0; a < size; a++)
60 |                         {
61 |                             vector[a] = br.ReadSingle();
62 |                         }
63 | 
64 |                         termList.Add(strTerm);
65 |                         vectorList.Add(vector);
66 |                     }
67 |                     else
68 |                     {
69 |                         // Skip the vector of a word that is not kept.
70 |                         for (int a = 0; a < size; a++)
71 |                         {
72 |                             br.ReadSingle();
73 |                         }
74 |                     }
75 |                 }
76 |             }
77 | 
78 |             // Save the shrunken model.
79 |             using (var output = new FileStream(strNewModelFileName, FileMode.Create, FileAccess.Write, FileShare.Read))
80 |             using (var bw = new BinaryWriter(output))
81 |             {
82 |                 bw.Write(termList.Count);
83 |                 bw.Write(size);
84 |                 bw.Write(vqSize);
85 | 
86 |                 for (int i = 0; i < termList.Count; i++)
87 |                 {
88 |                     bw.Write(termList[i]);
89 |                     foreach (float value in vectorList[i])
90 |                     {
91 |                         bw.Write(value);
92 |                     }
93 |                 }
94 |             }
95 |         }
96 | 
97 |         // Reads the dictionary file and returns the first tab-separated field of every line.
98 |         private static HashSet<string> LoadTerms(string strDictFileName)
99 |         {
100 |             var terms = new HashSet<string>();
101 | 
102 |             using (var sr = new StreamReader(strDictFileName))
103 |             {
104 |                 string strLine;
105 |                 while ((strLine = sr.ReadLine()) != null)
106 |                 {
107 |                     terms.Add(strLine.Split('\t')[0]);
108 |                 }
109 |             }
110 | 
111 |             return terms;
112 |         }
113 |     }
93 | }
94 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CherubNLP
 2 | Natural Language Processing in .NET Standard.
 3 | 
 4 | ```powershell
 5 | PM > Install-Package CherubNLP
 6 | ```
 7 | 
 8 | 
 9 | 
10 | 
11 | #### Word Vector
12 | #### Text Vector
13 | 
14 | #### Text Similarity
15 | 
16 | ```csharp
17 | using NumSharp;
18 | 
19 | var similarities = Similarity.Cosine("We can use Cosine to compute the similarity of two hardcoded lists.", new[]
20 |             {
21 |                 "Cosine Similarity algorithm function sample.",
22 |                 "The Cosine Similarity function computes the similarity of two lists of numbers.",
23 |                 "Compute the similarity of two hardcoded lists.",
24 |                 "We can compute the similarity of two hardcoded lists.",
25 |                 "Coronavirus app could trace your contacts without sacrificing your privacy"
26 |             }, "dbpedia.ftz");
27 | 
28 | Assert.AreEqual(new[] { 0, 4, 1, 3, 2 }, np.argsort<double>(similarities));
29 | ```
30 | 
31 | 


--------------------------------------------------------------------------------
/Settings/app.json:
--------------------------------------------------------------------------------
1 | {
2 | 
3 | }


--------------------------------------------------------------------------------
/data/dbpedia.ftz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciSharp/CherubNLP/7e80875a7909288c15c0e5ee1fafcb4c8df41198/data/dbpedia.ftz


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = CherubNLP
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # CherubNLP documentation build configuration file, created by
  5 | # sphinx-quickstart on Sun Oct 14 08:24:22 2018.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | # import os
 21 | # import sys
 22 | # sys.path.insert(0, os.path.abspath('.'))
 23 | 
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #
 29 | # needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = ['sphinx.ext.mathjax']
 35 | 
 36 | # Add any paths that contain templates here, relative to this directory.
 37 | templates_path = ['_templates']
 38 | 
 39 | # The suffix(es) of source filenames.
 40 | # You can specify multiple suffix as a list of string:
 41 | #
 42 | # source_suffix = ['.rst', '.md']
 43 | source_suffix = ['.rst', '.md']
 44 | 
 45 | # The master toctree document.
 46 | master_doc = 'index'
 47 | 
 48 | # General information about the project.
 49 | project = 'CherubNLP'
 50 | copyright = '2018, Haiping Chen'
 51 | author = 'Haiping Chen'
 52 | 
 53 | # The version info for the project you're documenting, acts as replacement for
 54 | # |version| and |release|, also used in various other places throughout the
 55 | # built documents.
 56 | #
 57 | # The short X.Y version.
 58 | version = '0.1'
 59 | # The full version, including alpha/beta/rc tags.
 60 | release = '0.1.0'
 61 | 
 62 | # The language for content autogenerated by Sphinx. Refer to documentation
 63 | # for a list of supported languages.
 64 | #
 65 | # This is also used if you do content translation via gettext catalogs.
 66 | # Usually you set "language" from the command line for these cases.
 67 | language = None
 68 | 
 69 | # List of patterns, relative to source directory, that match files and
 70 | # directories to ignore when looking for source files.
 71 | # These patterns also affect html_static_path and html_extra_path
 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 73 | 
 74 | # The name of the Pygments (syntax highlighting) style to use.
 75 | pygments_style = 'sphinx'
 76 | 
 77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 78 | todo_include_todos = False
 79 | 
 80 | 
 81 | # -- Options for HTML output ----------------------------------------------
 82 | 
 83 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 84 | # a list of builtin themes.
 85 | #
 86 | html_theme = 'sphinx_rtd_theme'
 87 | 
 88 | # Theme options are theme-specific and customize the look and feel of a theme
 89 | # further.  For a list of options available for each theme, see the
 90 | # documentation.
 91 | #
 92 | # html_theme_options = {}
 93 | 
 94 | # Add any paths that contain custom static files (such as style sheets) here,
 95 | # relative to this directory. They are copied after the builtin static files,
 96 | # so a file named "default.css" will overwrite the builtin "default.css".
 97 | html_static_path = ['_static']
 98 | 
 99 | # Custom sidebar templates, must be a dictionary that maps document names
100 | # to template names.
101 | #
102 | # This is required for the alabaster theme
103 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
104 | html_sidebars = {
105 |     '**': [
106 |         'relations.html',  # needs 'show_related': True theme option to display
107 |         'searchbox.html',
108 |     ]
109 | }
110 | 
111 | 
112 | # -- Options for HTMLHelp output ------------------------------------------
113 | 
114 | # Output file base name for HTML help builder.
115 | htmlhelp_basename = 'CherubNLPdoc'
116 | 
117 | 
118 | # -- Options for LaTeX output ---------------------------------------------
119 | 
120 | latex_elements = {
121 |     # The paper size ('letterpaper' or 'a4paper').
122 |     #
123 |     # 'papersize': 'letterpaper',
124 | 
125 |     # The font size ('10pt', '11pt' or '12pt').
126 |     #
127 |     # 'pointsize': '10pt',
128 | 
129 |     # Additional stuff for the LaTeX preamble.
130 |     #
131 |     # 'preamble': '',
132 | 
133 |     # Latex figure (float) alignment
134 |     #
135 |     # 'figure_align': 'htbp',
136 | }
137 | 
138 | # Grouping the document tree into LaTeX files. List of tuples
139 | # (source start file, target name, title,
140 | #  author, documentclass [howto, manual, or own class]).
141 | latex_documents = [
142 |     (master_doc, 'CherubNLP.tex', 'CherubNLP Documentation',
143 |      'Haiping Chen', 'manual'),
144 | ]
145 | 
146 | 
147 | # -- Options for manual page output ---------------------------------------
148 | 
149 | # One entry per manual page. List of tuples
150 | # (source start file, name, description, authors, manual section).
151 | man_pages = [
152 |     (master_doc, 'cherubnlp', 'CherubNLP Documentation',
153 |      [author], 1)
154 | ]
155 | 
156 | 
157 | # -- Options for Texinfo output -------------------------------------------
158 | 
159 | # Grouping the document tree into Texinfo files. List of tuples
160 | # (source start file, target name, title, author,
161 | #  dir menu entry, description, category)
162 | texinfo_documents = [
163 |     (master_doc, 'CherubNLP', 'CherubNLP Documentation',
164 |      author, 'CherubNLP', 'One line description of project.',
165 |      'Miscellaneous'),
166 | ]
167 | 
168 | 
169 | 
170 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. CherubNLP documentation master file, created by
 2 |    sphinx-quickstart on Sun Oct 14 08:24:22 2018.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to CherubNLP's documentation!
 7 | =====================================
 8 | 
 9 | .. toctree::
10 |    :maxdepth: 2
11 |    :caption: Contents:
12 | 
13 | 
14 | 
15 | Indices and tables
16 | ==================
17 | 
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=CherubNLP
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | 	echo.installed, then set the SPHINXBUILD environment variable to point
21 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | 	echo.may add the Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------