├── .gitignore ├── LICENSE ├── LemmaSharp ├── Classes │ ├── 7zip │ │ ├── Common │ │ │ ├── CRC.cs │ │ │ ├── CommandLineParser.cs │ │ │ ├── InBuffer.cs │ │ │ └── OutBuffer.cs │ │ ├── Compress │ │ │ ├── LZ │ │ │ │ ├── IMatchFinder.cs │ │ │ │ ├── LzBinTree.cs │ │ │ │ ├── LzInWindow.cs │ │ │ │ └── LzOutWindow.cs │ │ │ ├── LZMA │ │ │ │ ├── LzmaBase.cs │ │ │ │ ├── LzmaDecoder.cs │ │ │ │ └── LzmaEncoder.cs │ │ │ ├── LzmaAlone │ │ │ │ ├── LzmaAlone.cs │ │ │ │ ├── LzmaAlone.csproj │ │ │ │ ├── LzmaAlone.sln │ │ │ │ ├── LzmaBench.cs │ │ │ │ └── Properties │ │ │ │ │ ├── AssemblyInfo.cs │ │ │ │ │ ├── Resources.cs │ │ │ │ │ └── Settings.cs │ │ │ └── RangeCoder │ │ │ │ ├── RangeCoder.cs │ │ │ │ ├── RangeCoderBit.cs │ │ │ │ └── RangeCoderBitTree.cs │ │ └── ICoder.cs │ ├── Constants.cs │ ├── ExampleList.cs │ ├── LemmaExample.cs │ ├── LemmaRule.cs │ ├── LemmaTreeNode.cs │ ├── Lemmatizer.cs │ ├── LemmatizerSettings.cs │ ├── RuleList.cs │ └── RuleWeighted.cs ├── Interfaces │ ├── ILemmatizer.cs │ ├── ILemmatizerModel.cs │ └── ILemmatizerTrainable.cs ├── LatinoCompatibility │ └── BinarySerializer.cs ├── LemmaSharp.csproj └── LemmaSharp.nuspec ├── LemmaSharpPrebuiltFull.sln ├── README.md ├── SourceFileBuilder ├── App.config ├── Classes │ └── EnricherFileReader.cs ├── Input │ ├── english-acronyms.txt │ ├── english-contractions.txt │ ├── english-irregular_verbs-enricher.txt │ └── english-lemma-enricher.txt ├── Program.cs ├── Properties │ └── AssemblyInfo.cs └── SourceFileBuilder.csproj ├── Test ├── App.config ├── Classes │ ├── LemmatizerPrebuilt.cs │ └── LemmatizerPrebuiltFull.cs ├── Data │ ├── Custom │ │ ├── english.lem │ │ └── full7z-mlteast-en-modified.lem │ ├── full7z-mlteast-bg.lem │ ├── full7z-mlteast-cs.lem │ ├── full7z-mlteast-en.lem │ ├── full7z-mlteast-et.lem │ ├── full7z-mlteast-fa.lem │ ├── full7z-mlteast-fr.lem │ ├── full7z-mlteast-hu.lem │ ├── full7z-mlteast-mk.lem │ ├── full7z-mlteast-pl.lem │ ├── full7z-mlteast-ro.lem │ ├── full7z-mlteast-ru.lem │ ├── full7z-mlteast-sk.lem │ 
├── full7z-mlteast-sl.lem │ ├── full7z-mlteast-sr.lem │ ├── full7z-mlteast-uk.lem │ ├── full7z-multext-en.lem │ ├── full7z-multext-fr.lem │ ├── full7z-multext-ge.lem │ ├── full7z-multext-it.lem │ └── full7z-multext-sp.lem ├── Program.cs ├── Properties │ └── AssemblyInfo.cs └── Test.csproj └── nuget ├── NuGet.exe └── NuGet.targets /.gitignore: -------------------------------------------------------------------------------- 1 | # Build Folders (you can keep bin if you'd like, to store dlls and pdbs) 2 | [Bb]in/ 3 | [Oo]bj/ 4 | 5 | # mstest test results 6 | TestResults 7 | 8 | ## Ignore Visual Studio temporary files, build results, and 9 | ## files generated by popular Visual Studio add-ons. 10 | .vs 11 | 12 | # User-specific files 13 | *.suo 14 | *.user 15 | *.sln.docstates 16 | 17 | # Build results 18 | [Dd]ebug/ 19 | [Rr]elease/ 20 | x64/ 21 | *_i.c 22 | *_p.c 23 | *.ilk 24 | *.meta 25 | *.obj 26 | *.pch 27 | *.pdb 28 | *.pgc 29 | *.pgd 30 | *.rsp 31 | *.sbr 32 | *.tlb 33 | *.tli 34 | *.tlh 35 | *.tmp 36 | *.log 37 | *.vspscc 38 | *.vssscc 39 | .builds 40 | 41 | # Visual C++ cache files 42 | ipch/ 43 | *.aps 44 | *.ncb 45 | *.opensdf 46 | *.sdf 47 | 48 | # Visual Studio profiler 49 | *.psess 50 | *.vsp 51 | *.vspx 52 | 53 | # Guidance Automation Toolkit 54 | *.gpState 55 | 56 | # ReSharper is a .NET coding add-in 57 | _ReSharper* 58 | 59 | # NCrunch 60 | *.ncrunch* 61 | .*crunch*.local.xml 62 | 63 | # Installshield output folder 64 | [Ee]xpress 65 | 66 | # DocProject is a documentation generator add-in 67 | DocProject/buildhelp/ 68 | DocProject/Help/*.HxT 69 | DocProject/Help/*.HxC 70 | DocProject/Help/*.hhc 71 | DocProject/Help/*.hhk 72 | DocProject/Help/*.hhp 73 | DocProject/Help/Html2 74 | DocProject/Help/html 75 | 76 | # Click-Once directory 77 | publish 78 | 79 | # Publish Web Output 80 | *.Publish.xml 81 | 82 | # NuGet Packages Directory 83 | packages 84 | 85 | # Windows Azure Build Output 86 | csx 87 | *.build.csdef 88 | 89 | # Windows Store app package 
directory 90 | AppPackages/ 91 | 92 | # Others 93 | [Bb]in 94 | [Oo]bj 95 | sql 96 | TestResults 97 | [Tt]est[Rr]esult* 98 | *.Cache 99 | ClientBin 100 | [Ss]tyle[Cc]op.* 101 | ~$* 102 | *.dbmdl 103 | Generated_Code #added for RIA/Silverlight projects 104 | 105 | # Backup & report files from converting an old project file to a newer 106 | # Visual Studio version. Backup files are not needed, because we have git ;-) 107 | _UpgradeReport_Files/ 108 | Backup*/ 109 | UpgradeLog*.XML 110 | 111 | # binaries 112 | *.exe 113 | *.snk 114 | 115 | # Output folders 116 | Output/ 117 | Uncompressed/ -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Common/CRC.cs: -------------------------------------------------------------------------------- 1 | // Common/CRC.cs 2 | 3 | namespace SevenZip 4 | { 5 | class CRC // Incremental, table-driven CRC over bytes (polynomial constant 0xEDB88320, processed least-significant bit first). 6 | { 7 | public static readonly uint[] Table; 8 | // Builds the 256-entry lookup table once: entry i is i pushed through 8 shift/xor steps with kPoly. 9 | static CRC() 10 | { 11 | Table = new uint[256]; 12 | const uint kPoly = 0xEDB88320; 13 | for (uint i = 0; i < 256; i++) 14 | { 15 | uint r = i; 16 | for (int j = 0; j < 8; j++) 17 | if ((r & 1) != 0) 18 | r = (r >> 1) ^ kPoly; 19 | else 20 | r >>= 1; 21 | Table[i] = r; 22 | } 23 | } 24 | // Running register; starts as all ones and is xor-ed with all ones again by GetDigest(). 25 | uint _value = 0xFFFFFFFF; 26 | // Resets the running register so the instance can be reused for a new digest. 27 | public void Init() { _value = 0xFFFFFFFF; } 28 | // Folds a single byte into the running register. 29 | public void UpdateByte(byte b) 30 | { 31 | _value = Table[(((byte)(_value)) ^ b)] ^ (_value >> 8); 32 | } 33 | // Folds the size bytes of data starting at offset into the running register. 34 | public void Update(byte[] data, uint offset, uint size) 35 | { 36 | for (uint i = 0; i < size; i++) 37 | _value = Table[(((byte)(_value)) ^ data[offset + i])] ^ (_value >> 8); 38 | } 39 | // Returns the digest of the bytes fed so far; the running register itself is left unchanged. 40 | public uint GetDigest() { return _value ^ 0xFFFFFFFF; } 41 | // One-shot digest of a byte range. Declared without an access modifier, so it is private to CRC. 42 | static uint CalculateDigest(byte[] data, uint offset, uint size) 43 | { 44 | CRC crc = new CRC(); 45 | // crc.Init(); 46 | crc.Update(data, offset, size); 47 | return crc.GetDigest(); 48 | } 49 | // Checks a byte range against an expected digest. Also private (no access modifier). 50 | static bool VerifyDigest(uint digest, byte[] data, uint offset, uint size) 51 | { 52 | return (CalculateDigest(data, 
offset, size) == digest); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Common/CommandLineParser.cs: -------------------------------------------------------------------------------- 1 | // CommandLineParser.cs 2 | 3 | using System; 4 | using System.Collections; 5 | 6 | namespace SevenZip.CommandLineParser 7 | { 8 | public enum SwitchType 9 | { 10 | Simple, 11 | PostMinus, 12 | LimitedPostString, 13 | UnLimitedPostString, 14 | PostChar 15 | } 16 | 17 | public class SwitchForm 18 | { 19 | public string IDString; 20 | public SwitchType Type; 21 | public bool Multi; 22 | public int MinLen; 23 | public int MaxLen; 24 | public string PostCharSet; 25 | 26 | public SwitchForm(string idString, SwitchType type, bool multi, 27 | int minLen, int maxLen, string postCharSet) 28 | { 29 | IDString = idString; 30 | Type = type; 31 | Multi = multi; 32 | MinLen = minLen; 33 | MaxLen = maxLen; 34 | PostCharSet = postCharSet; 35 | } 36 | public SwitchForm(string idString, SwitchType type, bool multi, int minLen): 37 | this(idString, type, multi, minLen, 0, "") 38 | { 39 | } 40 | public SwitchForm(string idString, SwitchType type, bool multi): 41 | this(idString, type, multi, 0) 42 | { 43 | } 44 | } 45 | 46 | public class SwitchResult 47 | { 48 | public bool ThereIs; 49 | public bool WithMinus; 50 | public ArrayList PostStrings = new ArrayList(); 51 | public int PostCharIndex; 52 | public SwitchResult() 53 | { 54 | ThereIs = false; 55 | } 56 | } 57 | 58 | public class Parser 59 | { 60 | public ArrayList NonSwitchStrings = new ArrayList(); 61 | SwitchResult[] _switches; 62 | 63 | public Parser(int numSwitches) 64 | { 65 | _switches = new SwitchResult[numSwitches]; 66 | for (int i = 0; i < numSwitches; i++) 67 | _switches[i] = new SwitchResult(); 68 | } 69 | 70 | bool ParseString(string srcString, SwitchForm[] switchForms) 71 | { 72 | int len = srcString.Length; 73 | if (len == 0) 74 | return false; 75 | 
int pos = 0; 76 | if (!IsItSwitchChar(srcString[pos])) 77 | return false; 78 | while (pos < len) 79 | { 80 | if (IsItSwitchChar(srcString[pos])) 81 | pos++; 82 | const int kNoLen = -1; 83 | int matchedSwitchIndex = 0; 84 | int maxLen = kNoLen; 85 | for (int switchIndex = 0; switchIndex < _switches.Length; switchIndex++) 86 | { 87 | int switchLen = switchForms[switchIndex].IDString.Length; 88 | if (switchLen <= maxLen || pos + switchLen > len) 89 | continue; 90 | if (String.Compare(switchForms[switchIndex].IDString, 0, 91 | srcString, pos, switchLen, true) == 0) 92 | { 93 | matchedSwitchIndex = switchIndex; 94 | maxLen = switchLen; 95 | } 96 | } 97 | if (maxLen == kNoLen) 98 | throw new Exception("maxLen == kNoLen"); 99 | SwitchResult matchedSwitch = _switches[matchedSwitchIndex]; 100 | SwitchForm switchForm = switchForms[matchedSwitchIndex]; 101 | if ((!switchForm.Multi) && matchedSwitch.ThereIs) 102 | throw new Exception("switch must be single"); 103 | matchedSwitch.ThereIs = true; 104 | pos += maxLen; 105 | int tailSize = len - pos; 106 | SwitchType type = switchForm.Type; 107 | switch (type) 108 | { 109 | case SwitchType.PostMinus: 110 | { 111 | if (tailSize == 0) 112 | matchedSwitch.WithMinus = false; 113 | else 114 | { 115 | matchedSwitch.WithMinus = (srcString[pos] == kSwitchMinus); 116 | if (matchedSwitch.WithMinus) 117 | pos++; 118 | } 119 | break; 120 | } 121 | case SwitchType.PostChar: 122 | { 123 | if (tailSize < switchForm.MinLen) 124 | throw new Exception("switch is not full"); 125 | string charSet = switchForm.PostCharSet; 126 | const int kEmptyCharValue = -1; 127 | if (tailSize == 0) 128 | matchedSwitch.PostCharIndex = kEmptyCharValue; 129 | else 130 | { 131 | int index = charSet.IndexOf(srcString[pos]); 132 | if (index < 0) 133 | matchedSwitch.PostCharIndex = kEmptyCharValue; 134 | else 135 | { 136 | matchedSwitch.PostCharIndex = index; 137 | pos++; 138 | } 139 | } 140 | break; 141 | } 142 | case SwitchType.LimitedPostString: 143 | case 
SwitchType.UnLimitedPostString: 144 | { 145 | int minLen = switchForm.MinLen; 146 | if (tailSize < minLen) 147 | throw new Exception("switch is not full"); 148 | if (type == SwitchType.UnLimitedPostString) 149 | { 150 | matchedSwitch.PostStrings.Add(srcString.Substring(pos)); 151 | return true; 152 | } 153 | String stringSwitch = srcString.Substring(pos, minLen); 154 | pos += minLen; 155 | for (int i = minLen; i < switchForm.MaxLen && pos < len; i++, pos++) 156 | { 157 | char c = srcString[pos]; 158 | if (IsItSwitchChar(c)) 159 | break; 160 | stringSwitch += c; 161 | } 162 | matchedSwitch.PostStrings.Add(stringSwitch); 163 | break; 164 | } 165 | } 166 | } 167 | return true; 168 | 169 | } 170 | 171 | public void ParseStrings(SwitchForm[] switchForms, string[] commandStrings) 172 | { 173 | int numCommandStrings = commandStrings.Length; 174 | bool stopSwitch = false; 175 | for (int i = 0; i < numCommandStrings; i++) 176 | { 177 | string s = commandStrings[i]; 178 | if (stopSwitch) 179 | NonSwitchStrings.Add(s); 180 | else 181 | if (s == kStopSwitchParsing) 182 | stopSwitch = true; 183 | else 184 | if (!ParseString(s, switchForms)) 185 | NonSwitchStrings.Add(s); 186 | } 187 | } 188 | 189 | public SwitchResult this[int index] { get { return _switches[index]; } } 190 | // Returns the index of the first form that matches commandString, or -1 if none does. Forms with PostStringMode match as an ordinal prefix and hand back the rest of the string in postString; other forms must match exactly and yield an empty postString. 191 | public static int ParseCommand(CommandForm[] commandForms, string commandString, 192 | out string postString) 193 | { 194 | for (int i = 0; i < commandForms.Length; i++) 195 | { 196 | string id = commandForms[i].IDString; 197 | if (commandForms[i].PostStringMode) 198 | { 199 | if (commandString.StartsWith(id, StringComparison.Ordinal)) // prefix test only: no culture rules, no scan of the whole string 200 | { 201 | postString = commandString.Substring(id.Length); // text following the matched id 202 | return i; 203 | } 204 | } 205 | else 206 | if (commandString == id) 207 | { 208 | postString = ""; 209 | return i; 210 | } 211 | } 212 | postString = ""; 213 | return -1; 214 | } 215 | 216 | static bool ParseSubCharsCommand(int numForms, CommandSubCharsSet[] forms, 217 | string commandString, ArrayList indices) 218 | { 219 | 
indices.Clear(); 220 | int numUsedChars = 0; 221 | for (int i = 0; i < numForms; i++) 222 | { 223 | CommandSubCharsSet charsSet = forms[i]; 224 | int currentIndex = -1; 225 | int len = charsSet.Chars.Length; 226 | for (int j = 0; j < len; j++) 227 | { 228 | char c = charsSet.Chars[j]; 229 | int newIndex = commandString.IndexOf(c); 230 | if (newIndex >= 0) 231 | { 232 | if (currentIndex >= 0) 233 | return false; 234 | if (commandString.IndexOf(c, newIndex + 1) >= 0) 235 | return false; 236 | currentIndex = j; 237 | numUsedChars++; 238 | } 239 | } 240 | if (currentIndex == -1 && !charsSet.EmptyAllowed) 241 | return false; 242 | indices.Add(currentIndex); 243 | } 244 | return (numUsedChars == commandString.Length); 245 | } 246 | const char kSwitchID1 = '-'; 247 | const char kSwitchID2 = '/'; 248 | 249 | const char kSwitchMinus = '-'; 250 | const string kStopSwitchParsing = "--"; 251 | 252 | static bool IsItSwitchChar(char c) 253 | { 254 | return (c == kSwitchID1 || c == kSwitchID2); 255 | } 256 | } 257 | 258 | public class CommandForm 259 | { 260 | public string IDString = ""; 261 | public bool PostStringMode = false; 262 | public CommandForm(string idString, bool postStringMode) 263 | { 264 | IDString = idString; 265 | PostStringMode = postStringMode; 266 | } 267 | } 268 | 269 | class CommandSubCharsSet 270 | { 271 | public string Chars = ""; 272 | public bool EmptyAllowed = false; 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Common/InBuffer.cs: -------------------------------------------------------------------------------- 1 | // InBuffer.cs 2 | 3 | namespace SevenZip.Buffer 4 | { 5 | public class InBuffer 6 | { 7 | byte[] m_Buffer; 8 | uint m_Pos; 9 | uint m_Limit; 10 | uint m_BufferSize; 11 | System.IO.Stream m_Stream; 12 | bool m_StreamWasExhausted; 13 | ulong m_ProcessedSize; 14 | 15 | public InBuffer(uint bufferSize) 16 | { 17 | m_Buffer = new byte[bufferSize]; 18 | m_BufferSize 
= bufferSize; 19 | } 20 | // Attaches a stream and resets the read position, limit, processed-size counter and exhausted flag. 21 | public void Init(System.IO.Stream stream) 22 | { 23 | m_Stream = stream; 24 | m_ProcessedSize = 0; 25 | m_Limit = 0; 26 | m_Pos = 0; 27 | m_StreamWasExhausted = false; 28 | } 29 | // Refills the buffer from the stream. Returns false once a read has returned 0 bytes, and on every later call. 30 | public bool ReadBlock() 31 | { 32 | if (m_StreamWasExhausted) 33 | return false; 34 | m_ProcessedSize += m_Pos; 35 | int aNumProcessedBytes = m_Stream.Read(m_Buffer, 0, (int)m_BufferSize); 36 | m_Pos = 0; 37 | m_Limit = (uint)aNumProcessedBytes; 38 | m_StreamWasExhausted = (aNumProcessedBytes == 0); 39 | return (!m_StreamWasExhausted); 40 | } 41 | 42 | // Drops the reference to the stream without closing it. 43 | public void ReleaseStream() 44 | { 45 | // m_Stream.Close(); 46 | m_Stream = null; 47 | } 48 | // NOTE(review): b is passed by value, so the byte stored into it below never reaches the caller; only the bool result and the advanced position are observable. 49 | public bool ReadByte(byte b) // check it 50 | { 51 | if (m_Pos >= m_Limit) 52 | if (!ReadBlock()) 53 | return false; 54 | b = m_Buffer[m_Pos++]; 55 | return true; 56 | } 57 | // Returns the next buffered byte, refilling the buffer when it is used up; returns 0xFF when no more data can be read. 58 | public byte ReadByte() 59 | { 60 | // return (byte)m_Stream.ReadByte(); 61 | if (m_Pos >= m_Limit) 62 | if (!ReadBlock()) 63 | return 0xFF; 64 | return m_Buffer[m_Pos++]; 65 | } 66 | // Bytes consumed from earlier blocks plus the read position inside the current block. 67 | public ulong GetProcessedSize() 68 | { 69 | return m_ProcessedSize + m_Pos; 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Common/OutBuffer.cs: -------------------------------------------------------------------------------- 1 | // OutBuffer.cs 2 | 3 | namespace SevenZip.Buffer 4 | { 5 | public class OutBuffer 6 | { 7 | byte[] m_Buffer; 8 | uint m_Pos; 9 | uint m_BufferSize; 10 | System.IO.Stream m_Stream; 11 | ulong m_ProcessedSize; 12 | 13 | public OutBuffer(uint bufferSize) 14 | { 15 | m_Buffer = new byte[bufferSize]; 16 | m_BufferSize = bufferSize; 17 | } 18 | 19 | public void SetStream(System.IO.Stream stream) { m_Stream = stream; } 20 | public void FlushStream() { m_Stream.Flush(); } 21 | public void CloseStream() { m_Stream.Close(); } 22 | public void ReleaseStream() { m_Stream = null; } 23 | 24 | public void Init() 25 | { 26 | m_ProcessedSize = 0; 27 
| m_Pos = 0; 28 | } 29 | // Appends one byte and flushes to the stream as soon as the buffer is full. 30 | public void WriteByte(byte b) 31 | { 32 | m_Buffer[m_Pos++] = b; 33 | if (m_Pos >= m_BufferSize) 34 | FlushData(); 35 | } 36 | // Writes the buffered bytes to the stream, adds their count to the processed-size total and empties the buffer. Does nothing when the buffer is empty. 37 | public void FlushData() 38 | { 39 | if (m_Pos == 0) 40 | return; 41 | m_Stream.Write(m_Buffer, 0, (int)m_Pos); 42 | m_ProcessedSize += m_Pos; m_Pos = 0; // account for the flushed bytes first, so GetProcessedSize() stays cumulative across flushes 43 | } 44 | // Total bytes accepted since Init(): bytes already flushed plus bytes still waiting in the buffer. 45 | public ulong GetProcessedSize() { return m_ProcessedSize + m_Pos; } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Compress/LZ/IMatchFinder.cs: -------------------------------------------------------------------------------- 1 | // IMatchFinder.cs 2 | 3 | using System; 4 | 5 | namespace SevenZip.Compression.LZ 6 | { 7 | interface IInWindowStream 8 | { 9 | void SetStream(System.IO.Stream inStream); 10 | void Init(); 11 | void ReleaseStream(); 12 | Byte GetIndexByte(Int32 index); 13 | UInt32 GetMatchLen(Int32 index, UInt32 distance, UInt32 limit); 14 | UInt32 GetNumAvailableBytes(); 15 | } 16 | 17 | interface IMatchFinder : IInWindowStream 18 | { 19 | void Create(UInt32 historySize, UInt32 keepAddBufferBefore, 20 | UInt32 matchMaxLen, UInt32 keepAddBufferAfter); 21 | UInt32 GetMatches(UInt32[] distances); 22 | void Skip(UInt32 num); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Compress/LZ/LzBinTree.cs: -------------------------------------------------------------------------------- 1 | // LzBinTree.cs 2 | 3 | using System; 4 | 5 | namespace SevenZip.Compression.LZ 6 | { 7 | public class BinTree : InWindow, IMatchFinder 8 | { 9 | UInt32 _cyclicBufferPos; 10 | UInt32 _cyclicBufferSize = 0; 11 | UInt32 _matchMaxLen; 12 | 13 | UInt32[] _son; 14 | UInt32[] _hash; 15 | 16 | UInt32 _cutValue = 0xFF; 17 | UInt32 _hashMask; 18 | UInt32 _hashSizeSum = 0; 19 | 20 | bool HASH_ARRAY = true; 21 | 22 | const UInt32 kHash2Size = 1 << 10; 23 | const UInt32 kHash3Size = 1 << 16; 24 | const UInt32 kBT2HashSize = 1 << 16; 25 | 
const UInt32 kStartMaxLen = 1; 26 | const UInt32 kHash3Offset = kHash2Size; 27 | const UInt32 kEmptyHashValue = 0; 28 | const UInt32 kMaxValForNormalize = ((UInt32)1 << 31) - 1; 29 | 30 | UInt32 kNumHashDirectBytes = 0; 31 | UInt32 kMinMatchCheck = 4; 32 | UInt32 kFixHashSize = kHash2Size + kHash3Size; 33 | 34 | public void SetType(int numHashBytes) 35 | { 36 | HASH_ARRAY = (numHashBytes > 2); 37 | if (HASH_ARRAY) 38 | { 39 | kNumHashDirectBytes = 0; 40 | kMinMatchCheck = 4; 41 | kFixHashSize = kHash2Size + kHash3Size; 42 | } 43 | else 44 | { 45 | kNumHashDirectBytes = 2; 46 | kMinMatchCheck = 2 + 1; 47 | kFixHashSize = 0; 48 | } 49 | } 50 | 51 | public new void SetStream(System.IO.Stream stream) { base.SetStream(stream); } 52 | public new void ReleaseStream() { base.ReleaseStream(); } 53 | 54 | public new void Init() 55 | { 56 | base.Init(); 57 | for (UInt32 i = 0; i < _hashSizeSum; i++) 58 | _hash[i] = kEmptyHashValue; 59 | _cyclicBufferPos = 0; 60 | ReduceOffsets(-1); 61 | } 62 | 63 | public new void MovePos() 64 | { 65 | if (++_cyclicBufferPos >= _cyclicBufferSize) 66 | _cyclicBufferPos = 0; 67 | base.MovePos(); 68 | if (_pos == kMaxValForNormalize) 69 | Normalize(); 70 | } 71 | 72 | public new Byte GetIndexByte(Int32 index) { return base.GetIndexByte(index); } 73 | 74 | public new UInt32 GetMatchLen(Int32 index, UInt32 distance, UInt32 limit) 75 | { return base.GetMatchLen(index, distance, limit); } 76 | 77 | public new UInt32 GetNumAvailableBytes() { return base.GetNumAvailableBytes(); } 78 | 79 | public void Create(UInt32 historySize, UInt32 keepAddBufferBefore, 80 | UInt32 matchMaxLen, UInt32 keepAddBufferAfter) 81 | { 82 | if (historySize > kMaxValForNormalize - 256) 83 | throw new Exception(); 84 | _cutValue = 16 + (matchMaxLen >> 1); 85 | 86 | UInt32 windowReservSize = (historySize + keepAddBufferBefore + 87 | matchMaxLen + keepAddBufferAfter) / 2 + 256; 88 | 89 | base.Create(historySize + keepAddBufferBefore, matchMaxLen + keepAddBufferAfter, 
windowReservSize); 90 | 91 | _matchMaxLen = matchMaxLen; 92 | 93 | UInt32 cyclicBufferSize = historySize + 1; 94 | if (_cyclicBufferSize != cyclicBufferSize) 95 | _son = new UInt32[(_cyclicBufferSize = cyclicBufferSize) * 2]; 96 | 97 | UInt32 hs = kBT2HashSize; 98 | 99 | if (HASH_ARRAY) 100 | { 101 | hs = historySize - 1; 102 | hs |= (hs >> 1); 103 | hs |= (hs >> 2); 104 | hs |= (hs >> 4); 105 | hs |= (hs >> 8); 106 | hs >>= 1; 107 | hs |= 0xFFFF; 108 | if (hs > (1 << 24)) 109 | hs >>= 1; 110 | _hashMask = hs; 111 | hs++; 112 | hs += kFixHashSize; 113 | } 114 | if (hs != _hashSizeSum) 115 | _hash = new UInt32[_hashSizeSum = hs]; 116 | } 117 | 118 | public UInt32 GetMatches(UInt32[] distances) 119 | { 120 | UInt32 lenLimit; 121 | if (_pos + _matchMaxLen <= _streamPos) 122 | lenLimit = _matchMaxLen; 123 | else 124 | { 125 | lenLimit = _streamPos - _pos; 126 | if (lenLimit < kMinMatchCheck) 127 | { 128 | MovePos(); 129 | return 0; 130 | } 131 | } 132 | 133 | UInt32 offset = 0; 134 | UInt32 matchMinPos = (_pos > _cyclicBufferSize) ? 
(_pos - _cyclicBufferSize) : 0; 135 | UInt32 cur = _bufferOffset + _pos; 136 | UInt32 maxLen = kStartMaxLen; // to avoid items for len < hashSize; 137 | UInt32 hashValue, hash2Value = 0, hash3Value = 0; 138 | 139 | if (HASH_ARRAY) 140 | { 141 | UInt32 temp = CRC.Table[_bufferBase[cur]] ^ _bufferBase[cur + 1]; 142 | hash2Value = temp & (kHash2Size - 1); 143 | temp ^= ((UInt32)(_bufferBase[cur + 2]) << 8); 144 | hash3Value = temp & (kHash3Size - 1); 145 | hashValue = (temp ^ (CRC.Table[_bufferBase[cur + 3]] << 5)) & _hashMask; 146 | } 147 | else 148 | hashValue = _bufferBase[cur] ^ ((UInt32)(_bufferBase[cur + 1]) << 8); 149 | 150 | UInt32 curMatch = _hash[kFixHashSize + hashValue]; 151 | if (HASH_ARRAY) 152 | { 153 | UInt32 curMatch2 = _hash[hash2Value]; 154 | UInt32 curMatch3 = _hash[kHash3Offset + hash3Value]; 155 | _hash[hash2Value] = _pos; 156 | _hash[kHash3Offset + hash3Value] = _pos; 157 | if (curMatch2 > matchMinPos) 158 | if (_bufferBase[_bufferOffset + curMatch2] == _bufferBase[cur]) 159 | { 160 | distances[offset++] = maxLen = 2; 161 | distances[offset++] = _pos - curMatch2 - 1; 162 | } 163 | if (curMatch3 > matchMinPos) 164 | if (_bufferBase[_bufferOffset + curMatch3] == _bufferBase[cur]) 165 | { 166 | if (curMatch3 == curMatch2) 167 | offset -= 2; 168 | distances[offset++] = maxLen = 3; 169 | distances[offset++] = _pos - curMatch3 - 1; 170 | curMatch2 = curMatch3; 171 | } 172 | if (offset != 0 && curMatch2 == curMatch) 173 | { 174 | offset -= 2; 175 | maxLen = kStartMaxLen; 176 | } 177 | } 178 | 179 | _hash[kFixHashSize + hashValue] = _pos; 180 | 181 | UInt32 ptr0 = (_cyclicBufferPos << 1) + 1; 182 | UInt32 ptr1 = (_cyclicBufferPos << 1); 183 | 184 | UInt32 len0, len1; 185 | len0 = len1 = kNumHashDirectBytes; 186 | 187 | if (kNumHashDirectBytes != 0) 188 | { 189 | if (curMatch > matchMinPos) 190 | { 191 | if (_bufferBase[_bufferOffset + curMatch + kNumHashDirectBytes] != 192 | _bufferBase[cur + kNumHashDirectBytes]) 193 | { 194 | distances[offset++] = 
maxLen = kNumHashDirectBytes; 195 | distances[offset++] = _pos - curMatch - 1; 196 | } 197 | } 198 | } 199 | 200 | UInt32 count = _cutValue; 201 | 202 | while(true) 203 | { 204 | if(curMatch <= matchMinPos || count-- == 0) 205 | { 206 | _son[ptr0] = _son[ptr1] = kEmptyHashValue; 207 | break; 208 | } 209 | UInt32 delta = _pos - curMatch; 210 | UInt32 cyclicPos = ((delta <= _cyclicBufferPos) ? 211 | (_cyclicBufferPos - delta) : 212 | (_cyclicBufferPos - delta + _cyclicBufferSize)) << 1; 213 | 214 | UInt32 pby1 = _bufferOffset + curMatch; 215 | UInt32 len = Math.Min(len0, len1); 216 | if (_bufferBase[pby1 + len] == _bufferBase[cur + len]) 217 | { 218 | while(++len != lenLimit) 219 | if (_bufferBase[pby1 + len] != _bufferBase[cur + len]) 220 | break; 221 | if (maxLen < len) 222 | { 223 | distances[offset++] = maxLen = len; 224 | distances[offset++] = delta - 1; 225 | if (len == lenLimit) 226 | { 227 | _son[ptr1] = _son[cyclicPos]; 228 | _son[ptr0] = _son[cyclicPos + 1]; 229 | break; 230 | } 231 | } 232 | } 233 | if (_bufferBase[pby1 + len] < _bufferBase[cur + len]) 234 | { 235 | _son[ptr1] = curMatch; 236 | ptr1 = cyclicPos + 1; 237 | curMatch = _son[ptr1]; 238 | len1 = len; 239 | } 240 | else 241 | { 242 | _son[ptr0] = curMatch; 243 | ptr0 = cyclicPos; 244 | curMatch = _son[ptr0]; 245 | len0 = len; 246 | } 247 | } 248 | MovePos(); 249 | return offset; 250 | } 251 | 252 | public void Skip(UInt32 num) 253 | { 254 | do 255 | { 256 | UInt32 lenLimit; 257 | if (_pos + _matchMaxLen <= _streamPos) 258 | lenLimit = _matchMaxLen; 259 | else 260 | { 261 | lenLimit = _streamPos - _pos; 262 | if (lenLimit < kMinMatchCheck) 263 | { 264 | MovePos(); 265 | continue; 266 | } 267 | } 268 | 269 | UInt32 matchMinPos = (_pos > _cyclicBufferSize) ? 
(_pos - _cyclicBufferSize) : 0; 270 | UInt32 cur = _bufferOffset + _pos; 271 | 272 | UInt32 hashValue; 273 | 274 | if (HASH_ARRAY) 275 | { 276 | UInt32 temp = CRC.Table[_bufferBase[cur]] ^ _bufferBase[cur + 1]; 277 | UInt32 hash2Value = temp & (kHash2Size - 1); 278 | _hash[hash2Value] = _pos; 279 | temp ^= ((UInt32)(_bufferBase[cur + 2]) << 8); 280 | UInt32 hash3Value = temp & (kHash3Size - 1); 281 | _hash[kHash3Offset + hash3Value] = _pos; 282 | hashValue = (temp ^ (CRC.Table[_bufferBase[cur + 3]] << 5)) & _hashMask; 283 | } 284 | else 285 | hashValue = _bufferBase[cur] ^ ((UInt32)(_bufferBase[cur + 1]) << 8); 286 | 287 | UInt32 curMatch = _hash[kFixHashSize + hashValue]; 288 | _hash[kFixHashSize + hashValue] = _pos; 289 | 290 | UInt32 ptr0 = (_cyclicBufferPos << 1) + 1; 291 | UInt32 ptr1 = (_cyclicBufferPos << 1); 292 | 293 | UInt32 len0, len1; 294 | len0 = len1 = kNumHashDirectBytes; 295 | 296 | UInt32 count = _cutValue; 297 | while (true) 298 | { 299 | if (curMatch <= matchMinPos || count-- == 0) 300 | { 301 | _son[ptr0] = _son[ptr1] = kEmptyHashValue; 302 | break; 303 | } 304 | 305 | UInt32 delta = _pos - curMatch; 306 | UInt32 cyclicPos = ((delta <= _cyclicBufferPos) ? 
307 | (_cyclicBufferPos - delta) : 308 | (_cyclicBufferPos - delta + _cyclicBufferSize)) << 1; 309 | 310 | UInt32 pby1 = _bufferOffset + curMatch; 311 | UInt32 len = Math.Min(len0, len1); 312 | if (_bufferBase[pby1 + len] == _bufferBase[cur + len]) 313 | { 314 | while (++len != lenLimit) 315 | if (_bufferBase[pby1 + len] != _bufferBase[cur + len]) 316 | break; 317 | if (len == lenLimit) 318 | { 319 | _son[ptr1] = _son[cyclicPos]; 320 | _son[ptr0] = _son[cyclicPos + 1]; 321 | break; 322 | } 323 | } 324 | if (_bufferBase[pby1 + len] < _bufferBase[cur + len]) 325 | { 326 | _son[ptr1] = curMatch; 327 | ptr1 = cyclicPos + 1; 328 | curMatch = _son[ptr1]; 329 | len1 = len; 330 | } 331 | else 332 | { 333 | _son[ptr0] = curMatch; 334 | ptr0 = cyclicPos; 335 | curMatch = _son[ptr0]; 336 | len0 = len; 337 | } 338 | } 339 | MovePos(); 340 | } 341 | while (--num != 0); 342 | } 343 | 344 | void NormalizeLinks(UInt32[] items, UInt32 numItems, UInt32 subValue) 345 | { 346 | for (UInt32 i = 0; i < numItems; i++) 347 | { 348 | UInt32 value = items[i]; 349 | if (value <= subValue) 350 | value = kEmptyHashValue; 351 | else 352 | value -= subValue; 353 | items[i] = value; 354 | } 355 | } 356 | 357 | void Normalize() 358 | { 359 | UInt32 subValue = _pos - _cyclicBufferSize; 360 | NormalizeLinks(_son, _cyclicBufferSize * 2, subValue); 361 | NormalizeLinks(_hash, _hashSizeSum, subValue); 362 | ReduceOffsets((Int32)subValue); 363 | } 364 | 365 | public void SetCutValue(UInt32 cutValue) { _cutValue = cutValue; } 366 | } 367 | } 368 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Compress/LZ/LzInWindow.cs: -------------------------------------------------------------------------------- 1 | // LzInWindow.cs 2 | 3 | using System; 4 | 5 | namespace SevenZip.Compression.LZ 6 | { 7 | public class InWindow 8 | { 9 | public Byte[] _bufferBase = null; // pointer to buffer with data 10 | System.IO.Stream _stream; 11 | UInt32 _posLimit; // 
offset (from _buffer) of first byte when new block reading must be done 12 | bool _streamEndWasReached; // if (true) then _streamPos shows real end of stream 13 | 14 | UInt32 _pointerToLastSafePosition; 15 | 16 | public UInt32 _bufferOffset; 17 | 18 | public UInt32 _blockSize; // Size of Allocated memory block 19 | public UInt32 _pos; // offset (from _buffer) of curent byte 20 | UInt32 _keepSizeBefore; // how many BYTEs must be kept in buffer before _pos 21 | UInt32 _keepSizeAfter; // how many BYTEs must be kept buffer after _pos 22 | public UInt32 _streamPos; // offset (from _buffer) of first not read byte from Stream 23 | // Slides the bytes that are still needed (starting _keepSizeBefore + 1 bytes before the current position, up to the end of the data read so far) to the start of the buffer, and lowers _bufferOffset by the same amount. 24 | public void MoveBlock() 25 | { 26 | UInt32 offset = (UInt32)(_bufferOffset) + _pos - _keepSizeBefore; 27 | // we need one additional byte, since MovePos moves on 1 byte. 28 | if (offset > 0) 29 | offset--; 30 | 31 | UInt32 numBytes = (UInt32)(_bufferOffset) + _streamPos - offset; 32 | 33 | // check negative offset ???? 34 | // Array.Copy treats overlapping ranges as if the source were saved to a temporary first, so moving the data down in place is safe. 35 | Array.Copy(_bufferBase, offset, _bufferBase, 0, numBytes); 36 | _bufferOffset -= offset; 37 | } 38 | 39 | public virtual void ReadBlock() 40 | { 41 | if (_streamEndWasReached) 42 | return; 43 | while (true) 44 | { 45 | int size = (int)((0 - _bufferOffset) + _blockSize - _streamPos); 46 | if (size == 0) 47 | return; 48 | int numReadBytes = _stream.Read(_bufferBase, (int)(_bufferOffset + _streamPos), size); 49 | if (numReadBytes == 0) 50 | { 51 | _posLimit = _streamPos; 52 | UInt32 pointerToPostion = _bufferOffset + _posLimit; 53 | if (pointerToPostion > _pointerToLastSafePosition) 54 | _posLimit = (UInt32)(_pointerToLastSafePosition - _bufferOffset); 55 | 56 | _streamEndWasReached = true; 57 | return; 58 | } 59 | _streamPos += (UInt32)numReadBytes; 60 | if (_streamPos >= _pos + _keepSizeAfter) 61 | _posLimit = _streamPos - _keepSizeAfter; 62 | } 63 | } 64 | 65 | void Free() { _bufferBase = null; } 66 | 67 | public void Create(UInt32 keepSizeBefore, UInt32 keepSizeAfter, UInt32 keepSizeReserv) 68 
| { 69 | _keepSizeBefore = keepSizeBefore; 70 | _keepSizeAfter = keepSizeAfter; 71 | UInt32 blockSize = keepSizeBefore + keepSizeAfter + keepSizeReserv; 72 | if (_bufferBase == null || _blockSize != blockSize) 73 | { 74 | Free(); 75 | _blockSize = blockSize; 76 | _bufferBase = new Byte[_blockSize]; 77 | } 78 | _pointerToLastSafePosition = _blockSize - keepSizeAfter; 79 | } 80 | // SetStream / ReleaseStream only store or clear the stream reference; nothing is read or closed here. 81 | public void SetStream(System.IO.Stream stream) { _stream = stream; } 82 | public void ReleaseStream() { _stream = null; } 83 | // Resets all offsets and the end-of-stream flag, then reads the first block. 84 | public void Init() 85 | { 86 | _bufferOffset = 0; 87 | _pos = 0; 88 | _streamPos = 0; 89 | _streamEndWasReached = false; 90 | ReadBlock(); 91 | } 92 | // Advances the current position by one byte. Once it passes _posLimit, calls MoveBlock() if the buffer index is beyond _pointerToLastSafePosition, then ReadBlock(). 93 | public void MovePos() 94 | { 95 | _pos++; 96 | if (_pos > _posLimit) 97 | { 98 | UInt32 pointerToPostion = _bufferOffset + _pos; 99 | if (pointerToPostion > _pointerToLastSafePosition) 100 | MoveBlock(); 101 | ReadBlock(); 102 | } 103 | } 104 | // Returns the byte at the given signed offset from the current position. 105 | public Byte GetIndexByte(Int32 index) { return _bufferBase[_bufferOffset + _pos + index]; } 106 | // Counts how many bytes starting at (_pos + index) equal the bytes (distance + 1) positions earlier, stopping at limit. 107 | // index + limit must not exceed _keepSizeAfter; 108 | public UInt32 GetMatchLen(Int32 index, UInt32 distance, UInt32 limit) 109 | { 110 | if (_streamEndWasReached) 111 | if ((_pos + index) + limit > _streamPos) 112 | limit = _streamPos - (UInt32)(_pos + index); 113 | distance++; // an argument of 0 refers to the byte immediately before 114 | // Byte *pby = _buffer + (size_t)_pos + index; 115 | UInt32 pby = _bufferOffset + _pos + (UInt32)index; 116 | 117 | UInt32 i; 118 | for (i = 0; i < limit && _bufferBase[pby + i] == _bufferBase[pby + i - distance]; i++); 119 | return i; 120 | } 121 | // Number of bytes between the current position and the end of the data read so far. 122 | public UInt32 GetNumAvailableBytes() { return _streamPos - _pos; } 123 | // Adds subValue to _bufferOffset and subtracts it from the three positions, so _bufferOffset + position stays the same. 124 | public void ReduceOffsets(Int32 subValue) 125 | { 126 | _bufferOffset += (UInt32)subValue; 127 | _posLimit -= (UInt32)subValue; 128 | _pos -= (UInt32)subValue; 129 | _streamPos -= (UInt32)subValue; 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- 
/LemmaSharp/Classes/7zip/Compress/LZ/LzOutWindow.cs: -------------------------------------------------------------------------------- 1 | // LzOutWindow.cs 2 | 3 | namespace SevenZip.Compression.LZ 4 | { 5 | public class OutWindow 6 | { 7 | byte[] _buffer = null; 8 | uint _pos; 9 | uint _windowSize = 0; 10 | uint _streamPos; 11 | System.IO.Stream _stream; 12 | 13 | public uint TrainSize = 0; 14 | // Allocates a new window buffer only when the requested size differs from the current one, and resets both positions. 15 | public void Create(uint windowSize) 16 | { 17 | if (_windowSize != windowSize) 18 | { 19 | // System.GC.Collect(); 20 | _buffer = new byte[windowSize]; 21 | } 22 | _windowSize = windowSize; 23 | _pos = 0; 24 | _streamPos = 0; 25 | } 26 | // Flushes and detaches any previous stream, attaches the new one, and resets the positions and TrainSize unless solid is true. 27 | public void Init(System.IO.Stream stream, bool solid) 28 | { 29 | ReleaseStream(); 30 | _stream = stream; 31 | if (!solid) 32 | { 33 | _streamPos = 0; 34 | _pos = 0; 35 | TrainSize = 0; 36 | } 37 | } 38 | // Preloads the window with up to _windowSize bytes taken from the end of the given stream. Returns false if a read returns 0 bytes before that many were loaded. 39 | public bool Train(System.IO.Stream stream) 40 | { 41 | long len = stream.Length; 42 | uint size = (len < _windowSize) ? (uint)len : _windowSize; 43 | TrainSize = size; 44 | stream.Position = len - size; 45 | _streamPos = _pos = 0; 46 | while (size > 0) 47 | { 48 | uint curSize = _windowSize - _pos; 49 | if (size < curSize) 50 | curSize = size; 51 | int numReadBytes = stream.Read(_buffer, (int)_pos, (int)curSize); 52 | if (numReadBytes == 0) 53 | return false; 54 | size -= (uint)numReadBytes; 55 | _pos += (uint)numReadBytes; 56 | _streamPos += (uint)numReadBytes; 57 | if (_pos == _windowSize) 58 | _streamPos = _pos = 0; 59 | } 60 | return true; 61 | } 62 | // Flushes any pending bytes, then drops the stream reference. 63 | public void ReleaseStream() 64 | { 65 | Flush(); 66 | _stream = null; 67 | } 68 | // Writes the bytes between _streamPos and _pos to the stream; wraps _pos back to 0 once it has reached the end of the window. 69 | public void Flush() 70 | { 71 | uint size = _pos - _streamPos; 72 | if (size == 0) 73 | return; 74 | _stream.Write(_buffer, (int)_streamPos, (int)size); 75 | if (_pos >= _windowSize) 76 | _pos = 0; 77 | _streamPos = _pos; 78 | } 79 | // Copies len bytes from (distance + 1) positions back in the window to the current position, wrapping around the window as needed. 80 | public void CopyBlock(uint distance, uint len) 81 | { 82 | uint pos = _pos - distance - 1; 83 | if (pos >= _windowSize) 84 | pos += _windowSize; 
85 | for (; len > 0; len--) 86 | { 87 | if (pos >= _windowSize) 88 | pos = 0; 89 | _buffer[_pos++] = _buffer[pos++]; 90 | if (_pos >= _windowSize) 91 | Flush(); 92 | } 93 | } 94 | 95 | public void PutByte(byte b) 96 | { 97 | _buffer[_pos++] = b; 98 | if (_pos >= _windowSize) 99 | Flush(); 100 | } 101 | 102 | public byte GetByte(uint distance) 103 | { 104 | uint pos = _pos - distance - 1; 105 | if (pos >= _windowSize) 106 | pos += _windowSize; 107 | return _buffer[pos]; 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Compress/LZMA/LzmaBase.cs: -------------------------------------------------------------------------------- 1 | // LzmaBase.cs 2 | 3 | namespace SevenZip.Compression.LZMA 4 | { 5 | internal abstract class Base 6 | { 7 | public const uint kNumRepDistances = 4; 8 | public const uint kNumStates = 12; 9 | 10 | // static byte []kLiteralNextStates = {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5}; 11 | // static byte []kMatchNextStates = {7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10}; 12 | // static byte []kRepNextStates = {8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11}; 13 | // static byte []kShortRepNextStates = {9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11}; 14 | 15 | public struct State 16 | { 17 | public uint Index; 18 | public void Init() { Index = 0; } 19 | public void UpdateChar() 20 | { 21 | if (Index < 4) Index = 0; 22 | else if (Index < 10) Index -= 3; 23 | else Index -= 6; 24 | } 25 | public void UpdateMatch() { Index = (uint)(Index < 7 ? 7 : 10); } 26 | public void UpdateRep() { Index = (uint)(Index < 7 ? 8 : 11); } 27 | public void UpdateShortRep() { Index = (uint)(Index < 7 ? 
// Maps a match length to the index of the position-slot coder to use:
// (len - kMatchMinLen), capped at kNumLenToPosStates - 1.
public static uint GetLenToPosState(uint len)
{
    uint offset = len - kMatchMinLen;
    return offset < kNumLenToPosStates ? offset : (uint)(kNumLenToPosStates - 1);
}
// Decodes a length symbol with the range decoder. Two choice bits select
// one of three bit-tree coders: a per-posState "low" coder, a per-posState
// "mid" coder, or the shared "high" coder.
class LenDecoder
{
    BitDecoder m_Choice = new BitDecoder();
    BitDecoder m_Choice2 = new BitDecoder();
    BitTreeDecoder[] m_LowCoder = new BitTreeDecoder[Base.kNumPosStatesMax];
    BitTreeDecoder[] m_MidCoder = new BitTreeDecoder[Base.kNumPosStatesMax];
    BitTreeDecoder m_HighCoder = new BitTreeDecoder(Base.kNumHighLenBits);
    uint m_NumPosStates = 0;

    // Makes sure low/mid coders exist for the first numPosStates position
    // states. Only slots at or beyond the previously created count are
    // (re)created.
    public void Create(uint numPosStates)
    {
        uint slot = m_NumPosStates;
        while (slot < numPosStates)
        {
            m_LowCoder[slot] = new BitTreeDecoder(Base.kNumLowLenBits);
            m_MidCoder[slot] = new BitTreeDecoder(Base.kNumMidLenBits);
            slot++;
        }
        m_NumPosStates = numPosStates;
    }

    // Calls Init() on both choice bits and on every coder in use.
    public void Init()
    {
        m_Choice.Init();
        for (uint slot = 0; slot != m_NumPosStates; slot++)
        {
            m_LowCoder[slot].Init();
            m_MidCoder[slot].Init();
        }
        m_Choice2.Init();
        m_HighCoder.Init();
    }

    // Decodes one length symbol for the given position state.
    // Low symbols start at 0, mid symbols at kNumLowLenSymbols, and high
    // symbols at kNumLowLenSymbols + kNumMidLenSymbols.
    public uint Decode(RangeCoder.Decoder rangeDecoder, uint posState)
    {
        if (m_Choice.Decode(rangeDecoder) == 0)
            return m_LowCoder[posState].Decode(rangeDecoder);

        uint symbol = Base.kNumLowLenSymbols;
        if (m_Choice2.Decode(rangeDecoder) == 0)
            return symbol + m_MidCoder[posState].Decode(rangeDecoder);

        symbol += Base.kNumMidLenSymbols;
        return symbol + m_HighCoder.Decode(rangeDecoder);
    }
}
// Decodes one literal byte, using matchByte as context.
// While each decoded bit equals the corresponding bit of matchByte (taken
// from the most significant bit down), the decoder slot is selected by both
// that match bit and the bits decoded so far. After the first mismatch the
// remaining bits are decoded with the plain slots indexed by `symbol` alone.
public byte DecodeWithMatchByte(RangeCoder.Decoder rangeDecoder, byte matchByte)
{
    uint symbol = 1;
    bool stillMatching = true;

    // Phase 1: bits that are predicted by matchByte.
    while (stillMatching && symbol < 0x100)
    {
        uint matchBit = (uint)(matchByte >> 7) & 1;
        matchByte <<= 1;
        uint bit = m_Decoders[((1 + matchBit) << 8) + symbol].Decode(rangeDecoder);
        symbol = (symbol << 1) | bit;
        stillMatching = (matchBit == bit);
    }

    // Phase 2: whatever is left after a mismatch.
    while (symbol < 0x100)
        symbol = (symbol << 1) | m_Decoders[symbol].Decode(rangeDecoder);

    return (byte)symbol;
}
// Records a new dictionary size and (re)creates the output window for it.
// Does nothing when the size is unchanged. The size used for distance
// checks is at least 1, and the window is at least 4096 (1 << 12) bytes.
void SetDictionarySize(uint dictionarySize)
{
    if (m_DictionarySize == dictionarySize)
        return;

    m_DictionarySize = dictionarySize;
    m_DictionarySizeCheck = Math.Max(m_DictionarySize, 1);

    uint windowSize = Math.Max(m_DictionarySizeCheck, (1 << 12));
    m_OutWindow.Create(windowSize);
}
// Prepares the decoder for a run: attaches inStream to the range decoder
// and outStream to the output window (passing the current _solid flag),
// then calls Init() on every bit decoder, bit-tree decoder and sub-decoder
// owned by this instance.
void Init(System.IO.Stream inStream, System.IO.Stream outStream)
{
    m_RangeDecoder.Init(inStream);
    m_OutWindow.Init(outStream, _solid);

    for (uint state = 0; state < Base.kNumStates; state++)
    {
        // Decoders that depend on posState are laid out as
        // [state << kNumPosStatesBitsMax + posState].
        uint rowStart = state << Base.kNumPosStatesBitsMax;
        for (uint posState = 0; posState <= m_PosStateMask; posState++)
        {
            uint slot = rowStart + posState;
            m_IsMatchDecoders[slot].Init();
            m_IsRep0LongDecoders[slot].Init();
        }

        m_IsRepDecoders[state].Init();
        m_IsRepG0Decoders[state].Init();
        m_IsRepG1Decoders[state].Init();
        m_IsRepG2Decoders[state].Init();
    }

    m_LiteralDecoder.Init();

    for (uint lenState = 0; lenState < Base.kNumLenToPosStates; lenState++)
        m_PosSlotDecoder[lenState].Init();

    for (uint k = 0; k < Base.kNumFullDistances - Base.kEndPosModelIndex; k++)
        m_PosDecoders[k].Init();

    m_LenDecoder.Init();
    m_RepLenDecoder.Init();
    m_PosAlignDecoder.Init();
}
Base.kNumPosStatesBitsMax].Decode(m_RangeDecoder) != 0) 244 | throw new DataErrorException(); 245 | state.UpdateChar(); 246 | byte b = m_LiteralDecoder.DecodeNormal(m_RangeDecoder, 0, 0); 247 | m_OutWindow.PutByte(b); 248 | nowPos64++; 249 | } 250 | while (nowPos64 < outSize64) 251 | { 252 | // UInt64 next = Math.Min(nowPos64 + (1 << 18), outSize64); 253 | // while(nowPos64 < next) 254 | { 255 | uint posState = (uint)nowPos64 & m_PosStateMask; 256 | if (m_IsMatchDecoders[(state.Index << Base.kNumPosStatesBitsMax) + posState].Decode(m_RangeDecoder) == 0) 257 | { 258 | byte b; 259 | byte prevByte = m_OutWindow.GetByte(0); 260 | if (!state.IsCharState()) 261 | b = m_LiteralDecoder.DecodeWithMatchByte(m_RangeDecoder, 262 | (uint)nowPos64, prevByte, m_OutWindow.GetByte(rep0)); 263 | else 264 | b = m_LiteralDecoder.DecodeNormal(m_RangeDecoder, (uint)nowPos64, prevByte); 265 | m_OutWindow.PutByte(b); 266 | state.UpdateChar(); 267 | nowPos64++; 268 | } 269 | else 270 | { 271 | uint len; 272 | if (m_IsRepDecoders[state.Index].Decode(m_RangeDecoder) == 1) 273 | { 274 | if (m_IsRepG0Decoders[state.Index].Decode(m_RangeDecoder) == 0) 275 | { 276 | if (m_IsRep0LongDecoders[(state.Index << Base.kNumPosStatesBitsMax) + posState].Decode(m_RangeDecoder) == 0) 277 | { 278 | state.UpdateShortRep(); 279 | m_OutWindow.PutByte(m_OutWindow.GetByte(rep0)); 280 | nowPos64++; 281 | continue; 282 | } 283 | } 284 | else 285 | { 286 | UInt32 distance; 287 | if (m_IsRepG1Decoders[state.Index].Decode(m_RangeDecoder) == 0) 288 | { 289 | distance = rep1; 290 | } 291 | else 292 | { 293 | if (m_IsRepG2Decoders[state.Index].Decode(m_RangeDecoder) == 0) 294 | distance = rep2; 295 | else 296 | { 297 | distance = rep3; 298 | rep3 = rep2; 299 | } 300 | rep2 = rep1; 301 | } 302 | rep1 = rep0; 303 | rep0 = distance; 304 | } 305 | len = m_RepLenDecoder.Decode(m_RangeDecoder, posState) + Base.kMatchMinLen; 306 | state.UpdateRep(); 307 | } 308 | else 309 | { 310 | rep3 = rep2; 311 | rep2 = rep1; 312 | rep1 = 
// Configures the decoder from a properties header of at least 5 bytes.
//   properties[0]    packs lc, lp and pb as (pb * 5 + lp) * 9 + lc
//   properties[1..4] dictionary size, least significant byte first
// Throws InvalidParamException when fewer than 5 bytes are supplied or when
// pb is larger than Base.kNumPosStatesBitsMax.
public void SetDecoderProperties(byte[] properties)
{
    if (properties.Length < 5)
        throw new InvalidParamException();

    int packed = properties[0];
    int lc = packed % 9;
    int lpAndPb = packed / 9;
    int lp = lpAndPb % 5;
    int pb = lpAndPb / 5;
    if (pb > Base.kNumPosStatesBitsMax)
        throw new InvalidParamException();

    UInt32 dictionarySize =
        (UInt32)properties[1]
        | ((UInt32)properties[2] << 8)
        | ((UInt32)properties[3] << 16)
        | ((UInt32)properties[4] << 24);

    SetDictionarySize(dictionarySize);
    SetLiteralProperties(lp, lc);
    SetPosBitsProperties(pb);
}
// Read-only, non-seekable stream that presents two streams as one: reads
// are served from s1 until it returns 0 bytes, and from s2 afterwards.
// Length is reported as s1.Length + s2.Length - skipSize. Position always
// reads as 0 and assignments to it are ignored.
public class CDoubleStream: Stream
{
    public System.IO.Stream s1;
    public System.IO.Stream s2;
    // 0 while reading from s1, 1 once reading has moved on to s2.
    public int fileIndex;
    // Number of bytes subtracted from the combined length.
    public long skipSize;

    public override bool CanRead { get { return true; } }
    public override bool CanWrite { get { return false; } }
    public override bool CanSeek { get { return false; } }
    public override long Length { get { return s1.Length + s2.Length - skipSize; } }
    public override long Position
    {
        get { return 0; }
        set { }
    }
    public override void Flush() { }

    // Reads up to `count` bytes into buffer starting at `offset` and returns
    // the number of bytes read. s1 is read repeatedly until it is exhausted
    // or the request is satisfied; once s1 returns 0, a single read from s2
    // completes the call.
    public override int Read(byte[] buffer, int offset, int count)
    {
        int numTotal = 0;
        while (count > 0)
        {
            if (fileIndex == 0)
            {
                int num = s1.Read(buffer, offset, count);
                offset += num;
                count -= num;
                numTotal += num;
                if (num == 0)
                    fileIndex++;   // s1 is exhausted; continue with s2
            }
            if (fileIndex == 1)
            {
                numTotal += s2.Read(buffer, offset, count);
                return numTotal;
            }
            // fileIndex is a public field; any value other than 0 or 1 has
            // no stream to read from. Stop instead of looping forever.
            if (fileIndex != 0)
                break;
        }
        return numTotal;
    }

    // Unsupported operations throw NotSupportedException, the exception the
    // Stream contract specifies. It derives from Exception, so existing
    // catch (Exception) handlers keep working.
    public override void Write(byte[] buffer, int offset, int count)
    {
        throw new NotSupportedException("can't Write");
    }
    public override long Seek(long offset, System.IO.SeekOrigin origin)
    {
        throw new NotSupportedException("can't Seek");
    }
    public override void SetLength(long value)
    {
        throw new NotSupportedException("can't SetLength");
    }
}
2008-11-23\n"); 122 | 123 | if (args.Length == 0) 124 | { 125 | PrintHelp(); 126 | return 0; 127 | } 128 | 129 | SwitchForm[] kSwitchForms = new SwitchForm[13]; 130 | int sw = 0; 131 | kSwitchForms[sw++] = new SwitchForm("?", SwitchType.Simple, false); 132 | kSwitchForms[sw++] = new SwitchForm("H", SwitchType.Simple, false); 133 | kSwitchForms[sw++] = new SwitchForm("A", SwitchType.UnLimitedPostString, false, 1); 134 | kSwitchForms[sw++] = new SwitchForm("D", SwitchType.UnLimitedPostString, false, 1); 135 | kSwitchForms[sw++] = new SwitchForm("FB", SwitchType.UnLimitedPostString, false, 1); 136 | kSwitchForms[sw++] = new SwitchForm("LC", SwitchType.UnLimitedPostString, false, 1); 137 | kSwitchForms[sw++] = new SwitchForm("LP", SwitchType.UnLimitedPostString, false, 1); 138 | kSwitchForms[sw++] = new SwitchForm("PB", SwitchType.UnLimitedPostString, false, 1); 139 | kSwitchForms[sw++] = new SwitchForm("MF", SwitchType.UnLimitedPostString, false, 1); 140 | kSwitchForms[sw++] = new SwitchForm("EOS", SwitchType.Simple, false); 141 | kSwitchForms[sw++] = new SwitchForm("SI", SwitchType.Simple, false); 142 | kSwitchForms[sw++] = new SwitchForm("SO", SwitchType.Simple, false); 143 | kSwitchForms[sw++] = new SwitchForm("T", SwitchType.UnLimitedPostString, false, 1); 144 | 145 | 146 | Parser parser = new Parser(sw); 147 | try 148 | { 149 | parser.ParseStrings(kSwitchForms, args); 150 | } 151 | catch 152 | { 153 | return IncorrectCommand(); 154 | } 155 | 156 | if (parser[(int)Key.Help1].ThereIs || parser[(int)Key.Help2].ThereIs) 157 | { 158 | PrintHelp(); 159 | return 0; 160 | } 161 | 162 | System.Collections.ArrayList nonSwitchStrings = parser.NonSwitchStrings; 163 | 164 | int paramIndex = 0; 165 | if (paramIndex >= nonSwitchStrings.Count) 166 | return IncorrectCommand(); 167 | string command = (string)nonSwitchStrings[paramIndex++]; 168 | command = command.ToLower(); 169 | 170 | bool dictionaryIsDefined = false; 171 | Int32 dictionary = 1 << 21; 172 | if 
(parser[(int)Key.Dictionary].ThereIs) 173 | { 174 | Int32 dicLog; 175 | if (!GetNumber((string)parser[(int)Key.Dictionary].PostStrings[0], out dicLog)) 176 | IncorrectCommand(); 177 | dictionary = (Int32)1 << dicLog; 178 | dictionaryIsDefined = true; 179 | } 180 | string mf = "bt4"; 181 | if (parser[(int)Key.MatchFinder].ThereIs) 182 | mf = (string)parser[(int)Key.MatchFinder].PostStrings[0]; 183 | mf = mf.ToLower(); 184 | 185 | if (command == "b") 186 | { 187 | const Int32 kNumDefaultItereations = 10; 188 | Int32 numIterations = kNumDefaultItereations; 189 | if (paramIndex < nonSwitchStrings.Count) 190 | if (!GetNumber((string)nonSwitchStrings[paramIndex++], out numIterations)) 191 | numIterations = kNumDefaultItereations; 192 | return LzmaBench.LzmaBenchmark(numIterations, (UInt32)dictionary); 193 | } 194 | 195 | string train = ""; 196 | if (parser[(int)Key.Train].ThereIs) 197 | train = (string)parser[(int)Key.Train].PostStrings[0]; 198 | 199 | bool encodeMode = false; 200 | if (command == "e") 201 | encodeMode = true; 202 | else if (command == "d") 203 | encodeMode = false; 204 | else 205 | IncorrectCommand(); 206 | 207 | bool stdInMode = parser[(int)Key.StdIn].ThereIs; 208 | bool stdOutMode = parser[(int)Key.StdOut].ThereIs; 209 | 210 | Stream inStream = null; 211 | if (stdInMode) 212 | { 213 | throw (new Exception("Not implemeted")); 214 | } 215 | else 216 | { 217 | if (paramIndex >= nonSwitchStrings.Count) 218 | IncorrectCommand(); 219 | string inputName = (string)nonSwitchStrings[paramIndex++]; 220 | inStream = new FileStream(inputName, FileMode.Open, FileAccess.Read); 221 | } 222 | 223 | FileStream outStream = null; 224 | if (stdOutMode) 225 | { 226 | throw (new Exception("Not implemeted")); 227 | } 228 | else 229 | { 230 | if (paramIndex >= nonSwitchStrings.Count) 231 | IncorrectCommand(); 232 | string outputName = (string)nonSwitchStrings[paramIndex++]; 233 | outStream = new FileStream(outputName, FileMode.Create, FileAccess.Write); 234 | } 235 | 236 | 
FileStream trainStream = null; 237 | if (train.Length != 0) 238 | trainStream = new FileStream(train, FileMode.Open, FileAccess.Read); 239 | 240 | if (encodeMode) 241 | { 242 | if (!dictionaryIsDefined) 243 | dictionary = 1 << 23; 244 | 245 | Int32 posStateBits = 2; 246 | Int32 litContextBits = 3; // for normal files 247 | // UInt32 litContextBits = 0; // for 32-bit data 248 | Int32 litPosBits = 0; 249 | // UInt32 litPosBits = 2; // for 32-bit data 250 | Int32 algorithm = 2; 251 | Int32 numFastBytes = 128; 252 | 253 | bool eos = parser[(int)Key.EOS].ThereIs || stdInMode; 254 | 255 | if (parser[(int)Key.Mode].ThereIs) 256 | if (!GetNumber((string)parser[(int)Key.Mode].PostStrings[0], out algorithm)) 257 | IncorrectCommand(); 258 | 259 | if (parser[(int)Key.FastBytes].ThereIs) 260 | if (!GetNumber((string)parser[(int)Key.FastBytes].PostStrings[0], out numFastBytes)) 261 | IncorrectCommand(); 262 | if (parser[(int)Key.LitContext].ThereIs) 263 | if (!GetNumber((string)parser[(int)Key.LitContext].PostStrings[0], out litContextBits)) 264 | IncorrectCommand(); 265 | if (parser[(int)Key.LitPos].ThereIs) 266 | if (!GetNumber((string)parser[(int)Key.LitPos].PostStrings[0], out litPosBits)) 267 | IncorrectCommand(); 268 | if (parser[(int)Key.PosBits].ThereIs) 269 | if (!GetNumber((string)parser[(int)Key.PosBits].PostStrings[0], out posStateBits)) 270 | IncorrectCommand(); 271 | 272 | CoderPropID[] propIDs = 273 | { 274 | CoderPropID.DictionarySize, 275 | CoderPropID.PosStateBits, 276 | CoderPropID.LitContextBits, 277 | CoderPropID.LitPosBits, 278 | CoderPropID.Algorithm, 279 | CoderPropID.NumFastBytes, 280 | CoderPropID.MatchFinder, 281 | CoderPropID.EndMarker 282 | }; 283 | object[] properties = 284 | { 285 | (Int32)(dictionary), 286 | (Int32)(posStateBits), 287 | (Int32)(litContextBits), 288 | (Int32)(litPosBits), 289 | (Int32)(algorithm), 290 | (Int32)(numFastBytes), 291 | mf, 292 | eos 293 | }; 294 | 295 | Compression.LZMA.Encoder encoder = new Compression.LZMA.Encoder(); 
// Process entry point. Delegates to Main2 and returns its result. Any
// exception is printed to the console and turned into exit code 1.
[STAThread]
static int Main(string[] args)
{
    int exitCode;
    try
    {
        exitCode = Main2(args);
    }
    catch (Exception e)
    {
        Console.WriteLine("{0} Caught exception #1.", e);
        exitCode = 1;
    }
    return exitCode;
}
// Small deterministic pseudo-random generator built from two 16-bit
// multiply-with-carry style recurrences whose states are combined with a
// shift and XOR. Init() restores the fixed starting state, so the sequence
// is the same on every run.
class CRandomGenerator
{
    UInt32 A1;
    UInt32 A2;

    public CRandomGenerator()
    {
        Init();
    }

    // Restores the fixed seed values.
    public void Init()
    {
        A1 = 362436069;
        A2 = 521288629;
    }

    // Advances both state words and returns the next 32-bit value.
    public UInt32 GetRnd()
    {
        A1 = 36969 * (A1 & 0xffff) + (A1 >> 16);
        A2 = 18000 * (A2 & 0xffff) + (A2 >> 16);
        return (A1 << 16) ^ A2;
    }
}
// Fills a buffer with pseudo-random benchmark data made of random literal
// bytes mixed with copies of earlier buffer content (repeats at a
// remembered or freshly drawn offset).
class CBenchRandomGenerator
{
    CBitRandomGenerator RG = new CBitRandomGenerator();
    UInt32 Pos;    // next index to fill in Buffer
    UInt32 Rep0;   // distance back to the source of the current repeat

    public UInt32 BufferSize;
    public Byte[] Buffer = null;

    public CBenchRandomGenerator() { }

    // Allocates a buffer of bufferSize bytes and rewinds the fill position.
    public void Set(UInt32 bufferSize)
    {
        Buffer = new Byte[bufferSize];
        Pos = 0;
        BufferSize = bufferSize;
    }

    UInt32 GetRndBit()
    {
        return RG.GetRnd(1);
    }

    // Draws a bit count using numBits bits, then draws that many bits.
    UInt32 GetLogRandBits(int numBits)
    {
        UInt32 bitCount = RG.GetRnd(numBits);
        return RG.GetRnd((int)bitCount);
    }

    // Draws a copy offset: one random bit chooses between a short form and
    // a long form that carries 10 extra low bits.
    UInt32 GetOffset()
    {
        if (GetRndBit() == 0)
            return GetLogRandBits(4);

        UInt32 high = GetLogRandBits(4);
        UInt32 low = RG.GetRnd(10);
        return (high << 10) | low;
    }

    UInt32 GetLen1()
    {
        int bits = 1 + (int)RG.GetRnd(2);
        return RG.GetRnd(bits);
    }

    UInt32 GetLen2()
    {
        int bits = 2 + (int)RG.GetRnd(2);
        return RG.GetRnd(bits);
    }

    // Fills Buffer from the current position up to BufferSize.
    public void Generate()
    {
        RG.Init();
        Rep0 = 1;
        while (Pos < BufferSize)
        {
            // Literal byte: chosen by a random bit, and forced while nothing
            // has been written yet (there is nothing to copy from).
            if (GetRndBit() == 0 || Pos < 1)
            {
                Buffer[Pos++] = (Byte)RG.GetRnd(8);
                continue;
            }

            UInt32 len;
            if (RG.GetRnd(3) == 0)
            {
                // Repeat at the previously used distance.
                len = 1 + GetLen1();
            }
            else
            {
                // Draw offsets until one points inside the data written so far.
                do
                {
                    Rep0 = GetOffset();
                }
                while (Rep0 >= Pos);
                Rep0++;
                len = 2 + GetLen2();
            }

            UInt32 remaining = len;
            while (remaining > 0 && Pos < BufferSize)
            {
                Buffer[Pos] = Buffer[Pos - Rep0];
                Pos++;
                remaining--;
            }
        }
    }
}
return CRC.GetDigest(); } 125 | 126 | public override bool CanRead { get { return false; } } 127 | public override bool CanSeek { get { return false; } } 128 | public override bool CanWrite { get { return true; } } 129 | public override Int64 Length { get { return 0; } } 130 | public override Int64 Position { get { return 0; } set { } } 131 | public override void Flush() { } 132 | public override long Seek(long offset, SeekOrigin origin) { return 0; } 133 | public override void SetLength(long value) { } 134 | public override int Read(byte[] buffer, int offset, int count) { return 0; } 135 | 136 | public override void WriteByte(byte b) 137 | { 138 | CRC.UpdateByte(b); 139 | } 140 | public override void Write(byte[] buffer, int offset, int count) 141 | { 142 | CRC.Update(buffer, (uint)offset, (uint)count); 143 | } 144 | }; 145 | 146 | class CProgressInfo : ICodeProgress 147 | { 148 | public Int64 ApprovedStart; 149 | public Int64 InSize; 150 | public System.DateTime Time; 151 | public void Init() { InSize = 0; } 152 | public void SetProgress(Int64 inSize, Int64 outSize) 153 | { 154 | if (inSize >= ApprovedStart && InSize == 0) 155 | { 156 | Time = DateTime.UtcNow; 157 | InSize = inSize; 158 | } 159 | } 160 | } 161 | const int kSubBits = 8; 162 | 163 | static UInt32 GetLogSize(UInt32 size) 164 | { 165 | for (int i = kSubBits; i < 32; i++) 166 | for (UInt32 j = 0; j < (1 << kSubBits); j++) 167 | if (size <= (((UInt32)1) << i) + (j << (i - kSubBits))) 168 | return (UInt32)(i << kSubBits) + j; 169 | return (32 << kSubBits); 170 | } 171 | 172 | static UInt64 MyMultDiv64(UInt64 value, UInt64 elapsedTime) 173 | { 174 | UInt64 freq = TimeSpan.TicksPerSecond; 175 | UInt64 elTime = elapsedTime; 176 | while (freq > 1000000) 177 | { 178 | freq >>= 1; 179 | elTime >>= 1; 180 | } 181 | if (elTime == 0) 182 | elTime = 1; 183 | return value * freq / elTime; 184 | } 185 | 186 | static UInt64 GetCompressRating(UInt32 dictionarySize, UInt64 elapsedTime, UInt64 size) 187 | { 188 | 
UInt64 t = GetLogSize(dictionarySize) - (18 << kSubBits); 189 | UInt64 numCommandsForOne = 1060 + ((t * t * 10) >> (2 * kSubBits)); 190 | UInt64 numCommands = (UInt64)(size) * numCommandsForOne; 191 | return MyMultDiv64(numCommands, elapsedTime); 192 | } 193 | 194 | static UInt64 GetDecompressRating(UInt64 elapsedTime, UInt64 outSize, UInt64 inSize) 195 | { 196 | UInt64 numCommands = inSize * 220 + outSize * 20; 197 | return MyMultDiv64(numCommands, elapsedTime); 198 | } 199 | 200 | static UInt64 GetTotalRating( 201 | UInt32 dictionarySize, 202 | UInt64 elapsedTimeEn, UInt64 sizeEn, 203 | UInt64 elapsedTimeDe, 204 | UInt64 inSizeDe, UInt64 outSizeDe) 205 | { 206 | return (GetCompressRating(dictionarySize, elapsedTimeEn, sizeEn) + 207 | GetDecompressRating(elapsedTimeDe, inSizeDe, outSizeDe)) / 2; 208 | } 209 | 210 | static void PrintValue(UInt64 v) 211 | { 212 | string s = v.ToString(); 213 | for (int i = 0; i + s.Length < 6; i++) 214 | System.Console.Write(" "); 215 | System.Console.Write(s); 216 | } 217 | 218 | static void PrintRating(UInt64 rating) 219 | { 220 | PrintValue(rating / 1000000); 221 | System.Console.Write(" MIPS"); 222 | } 223 | 224 | static void PrintResults( 225 | UInt32 dictionarySize, 226 | UInt64 elapsedTime, 227 | UInt64 size, 228 | bool decompressMode, UInt64 secondSize) 229 | { 230 | UInt64 speed = MyMultDiv64(size, elapsedTime); 231 | PrintValue(speed / 1024); 232 | System.Console.Write(" KB/s "); 233 | UInt64 rating; 234 | if (decompressMode) 235 | rating = GetDecompressRating(elapsedTime, size, secondSize); 236 | else 237 | rating = GetCompressRating(dictionarySize, elapsedTime, size); 238 | PrintRating(rating); 239 | } 240 | 241 | static public int LzmaBenchmark(Int32 numIterations, UInt32 dictionarySize) 242 | { 243 | if (numIterations <= 0) 244 | return 0; 245 | if (dictionarySize < (1 << 18)) 246 | { 247 | System.Console.WriteLine("\nError: dictionary size for benchmark must be >= 19 (512 KB)"); 248 | return 1; 249 | } 250 | 
System.Console.Write("\n Compressing Decompressing\n\n"); 251 | 252 | Compression.LZMA.Encoder encoder = new Compression.LZMA.Encoder(); 253 | Compression.LZMA.Decoder decoder = new Compression.LZMA.Decoder(); 254 | 255 | 256 | CoderPropID[] propIDs = 257 | { 258 | CoderPropID.DictionarySize, 259 | }; 260 | object[] properties = 261 | { 262 | (Int32)(dictionarySize), 263 | }; 264 | 265 | UInt32 kBufferSize = dictionarySize + kAdditionalSize; 266 | UInt32 kCompressedBufferSize = (kBufferSize / 2) + kCompressedAdditionalSize; 267 | 268 | encoder.SetCoderProperties(propIDs, properties); 269 | System.IO.MemoryStream propStream = new System.IO.MemoryStream(); 270 | encoder.WriteCoderProperties(propStream); 271 | byte[] propArray = propStream.ToArray(); 272 | 273 | CBenchRandomGenerator rg = new CBenchRandomGenerator(); 274 | 275 | rg.Set(kBufferSize); 276 | rg.Generate(); 277 | CRC crc = new CRC(); 278 | crc.Init(); 279 | crc.Update(rg.Buffer, 0, rg.BufferSize); 280 | 281 | CProgressInfo progressInfo = new CProgressInfo(); 282 | progressInfo.ApprovedStart = dictionarySize; 283 | 284 | UInt64 totalBenchSize = 0; 285 | UInt64 totalEncodeTime = 0; 286 | UInt64 totalDecodeTime = 0; 287 | UInt64 totalCompressedSize = 0; 288 | 289 | MemoryStream inStream = new MemoryStream(rg.Buffer, 0, (int)rg.BufferSize); 290 | MemoryStream compressedStream = new MemoryStream((int)kCompressedBufferSize); 291 | CrcOutStream crcOutStream = new CrcOutStream(); 292 | for (Int32 i = 0; i < numIterations; i++) 293 | { 294 | progressInfo.Init(); 295 | inStream.Seek(0, SeekOrigin.Begin); 296 | compressedStream.Seek(0, SeekOrigin.Begin); 297 | encoder.Code(inStream, compressedStream, -1, -1, progressInfo); 298 | TimeSpan sp2 = DateTime.UtcNow - progressInfo.Time; 299 | UInt64 encodeTime = (UInt64)sp2.Ticks; 300 | 301 | long compressedSize = compressedStream.Position; 302 | if (progressInfo.InSize == 0) 303 | throw (new Exception("Internal ERROR 1282")); 304 | 305 | UInt64 decodeTime = 0; 306 | for 
(int j = 0; j < 2; j++) 307 | { 308 | compressedStream.Seek(0, SeekOrigin.Begin); 309 | crcOutStream.Init(); 310 | 311 | decoder.SetDecoderProperties(propArray); 312 | UInt64 outSize = kBufferSize; 313 | System.DateTime startTime = DateTime.UtcNow; 314 | decoder.Code(compressedStream, crcOutStream, 0, (Int64)outSize, null); 315 | TimeSpan sp = (DateTime.UtcNow - startTime); 316 | decodeTime = (ulong)sp.Ticks; 317 | if (crcOutStream.GetDigest() != crc.GetDigest()) 318 | throw (new Exception("CRC Error")); 319 | } 320 | UInt64 benchSize = kBufferSize - (UInt64)progressInfo.InSize; 321 | PrintResults(dictionarySize, encodeTime, benchSize, false, 0); 322 | System.Console.Write(" "); 323 | PrintResults(dictionarySize, decodeTime, kBufferSize, true, (ulong)compressedSize); 324 | System.Console.WriteLine(); 325 | 326 | totalBenchSize += benchSize; 327 | totalEncodeTime += encodeTime; 328 | totalDecodeTime += decodeTime; 329 | totalCompressedSize += (ulong)compressedSize; 330 | } 331 | System.Console.WriteLine("---------------------------------------------------"); 332 | PrintResults(dictionarySize, totalEncodeTime, totalBenchSize, false, 0); 333 | System.Console.Write(" "); 334 | PrintResults(dictionarySize, totalDecodeTime, 335 | kBufferSize * (UInt64)numIterations, true, totalCompressedSize); 336 | System.Console.WriteLine(" Average"); 337 | return 0; 338 | } 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Compress/LzmaAlone/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | #region Using directives 2 | 3 | using System.Reflection; 4 | using System.Runtime.CompilerServices; 5 | 6 | #endregion 7 | 8 | // General Information about an assembly is controlled through the following 9 | // set of attributes. Change these attribute values to modify the information 10 | // associated with an assembly. 
11 | [assembly: AssemblyTitle("LZMA#")] 12 | [assembly: AssemblyDescription("")] 13 | [assembly: AssemblyConfiguration("")] 14 | [assembly: AssemblyCompany("Igor Pavlov")] 15 | [assembly: AssemblyProduct("LZMA# SDK")] 16 | [assembly: AssemblyCopyright("Copyright @ Igor Pavlov 1999-2004")] 17 | [assembly: AssemblyTrademark("")] 18 | [assembly: AssemblyCulture("")] 19 | 20 | // Version information for an assembly consists of the following four values: 21 | // 22 | // Major Version 23 | // Minor Version 24 | // Build Number 25 | // Revision 26 | // 27 | // You can specify all the values or you can default the Revision and Build Numbers 28 | // by using the '*' as shown below: 29 | [assembly: AssemblyVersion("4.12.*")] 30 | -------------------------------------------------------------------------------- /LemmaSharp/Classes/7zip/Compress/LzmaAlone/Properties/Resources.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This code was generated by a tool. 4 | // Runtime Version:2.0.40607.42 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | namespace LzmaAlone.Properties 12 | { 13 | using System; 14 | using System.IO; 15 | using System.Resources; 16 | 17 | /// 18 | /// A strongly-typed resource class, for looking up localized strings, etc. 19 | /// 20 | // This class was auto-generated by the Strongly Typed Resource Builder 21 | // class via a tool like ResGen or Visual Studio.NET. 22 | // To add or remove a member, edit your .ResX file then rerun ResGen 23 | // with the /str option, or rebuild your VS project. 
/// <summary>
/// Strongly-typed resource accessor: exposes a lazily created
/// ResourceManager and the culture override used for lookups.
/// </summary>
class Resources
{
    // Created on first access of ResourceManager.
    private static System.Resources.ResourceManager _resMgr;

    // Culture override; null until a caller sets it.
    private static System.Globalization.CultureInfo _resCulture;

    /*FamANDAssem*/
    internal Resources()
    {
    }

    /// <summary>
    /// Gets the cached ResourceManager instance, creating it on first use.
    /// </summary>
    [System.ComponentModel.EditorBrowsableAttribute(System.ComponentModel.EditorBrowsableState.Advanced)]
    public static System.Resources.ResourceManager ResourceManager
    {
        get
        {
            if (_resMgr != null)
            {
                return _resMgr;
            }

            _resMgr = new System.Resources.ResourceManager("Resources", typeof(Resources).Assembly);
            return _resMgr;
        }
    }

    /// <summary>
    /// Gets or sets the culture stored for resource lookups made through this class.
    /// </summary>
    [System.ComponentModel.EditorBrowsableAttribute(System.ComponentModel.EditorBrowsableState.Advanced)]
    public static System.Globalization.CultureInfo Culture
    {
        get { return _resCulture; }
        set { _resCulture = value; }
    }
}
/// <summary>
/// Application settings with a lazily created, process-wide instance.
/// </summary>
public partial class Settings : System.Configuration.ApplicationSettingsBase
{
    // volatile: the first, unlocked null check must not observe a stale value.
    private static volatile Settings m_Value;

    private static readonly object m_SyncObject = new object();

    /// <summary>
    /// Gets the shared <see cref="Settings"/> instance, creating it on first access.
    /// </summary>
    public static Settings Value
    {
        get
        {
            if (Settings.m_Value == null)
            {
                // The lock statement releases the monitor on every path. The previous
                // Monitor.Enter/Exit pairing only exited inside the inner null check,
                // so a thread that found the instance already created kept the lock forever.
                lock (Settings.m_SyncObject)
                {
                    if (Settings.m_Value == null)
                    {
                        Settings.m_Value = new Settings();
                    }
                }
            }
            return Settings.m_Value;
        }
    }
}
/// <summary>
/// Range decoder: keeps the current range and code value and pulls further
/// bytes from the underlying stream whenever the range drops below kTopValue.
/// </summary>
class Decoder
{
    public const uint kTopValue = (1 << 24);
    public uint Range;
    public uint Code;
    // public Buffer.InBuffer Stream = new Buffer.InBuffer(1 << 16);
    public System.IO.Stream Stream;

    // Reads the next byte from the stream, keeping only the low 8 bits of ReadByte's result.
    private byte NextByte()
    {
        return (byte)Stream.ReadByte();
    }

    /// <summary>Attaches <paramref name="stream"/> and primes Code with the first five bytes.</summary>
    public void Init(System.IO.Stream stream)
    {
        // Stream.Init(stream);
        Stream = stream;

        Code = 0;
        Range = 0xFFFFFFFF;
        int remaining = 5;
        while (remaining-- > 0)
        {
            Code = (Code << 8) | NextByte();
        }
    }

    /// <summary>Drops the reference to the stream without closing it.</summary>
    public void ReleaseStream()
    {
        // Stream.ReleaseStream();
        Stream = null;
    }

    /// <summary>Closes the underlying stream.</summary>
    public void CloseStream()
    {
        Stream.Close();
    }

    /// <summary>Shifts in bytes until Range is at least kTopValue.</summary>
    public void Normalize()
    {
        while (Range < kTopValue)
        {
            Code = (Code << 8) | NextByte();
            Range <<= 8;
        }
    }

    /// <summary>Like <see cref="Normalize"/>, but shifts in at most one byte.</summary>
    public void Normalize2()
    {
        if (Range >= kTopValue)
        {
            return;
        }
        Code = (Code << 8) | NextByte();
        Range <<= 8;
    }

    /// <summary>Divides Range by <paramref name="total"/> and returns Code divided by the new Range.</summary>
    public uint GetThreshold(uint total)
    {
        Range /= total;
        return Code / Range;
    }

    /// <summary>Consumes the sub-range starting at <paramref name="start"/> with width <paramref name="size"/>, then normalizes.</summary>
    public void Decode(uint start, uint size, uint total)
    {
        Code -= start * Range;
        Range *= size;
        Normalize();
    }

    /// <summary>Decodes <paramref name="numTotalBits"/> bits, each with a halved range, most significant bit first.</summary>
    public uint DecodeDirectBits(int numTotalBits)
    {
        // Work on locals; the fields are written back only after all bits are decoded.
        uint r = Range;
        uint c = Code;
        uint bits = 0;

        int left = numTotalBits;
        while (left > 0)
        {
            r >>= 1;

            // Branch-free step: 'wrapped' is the top bit of the 32-bit difference c - r.
            // When it is 0 the range is subtracted and a 1 bit is emitted; otherwise a 0 bit.
            uint wrapped = (c - r) >> 31;
            c -= r & (wrapped - 1);
            bits = (bits << 1) | (1 - wrapped);

            if (r < kTopValue)
            {
                c = (c << 8) | NextByte();
                r <<= 8;
            }

            left--;
        }

        Range = r;
        Code = c;
        return bits;
    }

    /// <summary>
    /// Decodes one bit where symbol 0 owns <paramref name="size0"/> parts out of
    /// 2^<paramref name="numTotalBits"/> of the range.
    /// </summary>
    public uint DecodeBit(uint size0, int numTotalBits)
    {
        uint bound = (Range >> numTotalBits) * size0;
        uint bit = 0;
        if (Code >= bound)
        {
            bit = 1;
            Code -= bound;
            Range -= bound;
        }
        else
        {
            Range = bound;
        }
        Normalize();
        return bit;
    }

    // ulong GetProcessedSize() {return Stream.GetProcessedSize(); }
}
/// <summary>
/// Adaptive single-bit model for the range decoder. Prob is the estimate for
/// symbol 0, scaled to kBitModelTotal.
/// </summary>
struct BitDecoder
{
    public const int kNumBitModelTotalBits = 11;
    public const uint kBitModelTotal = (1 << kNumBitModelTotalBits);
    const int kNumMoveBits = 5;

    uint Prob;

    /// <summary>
    /// Moves Prob towards kBitModelTotal after a 0 and towards zero after any
    /// other symbol, by a fraction of 1 / 2^<paramref name="numMoveBits"/>.
    /// </summary>
    public void UpdateModel(int numMoveBits, uint symbol)
    {
        if (symbol != 0)
        {
            Prob -= (Prob) >> numMoveBits;
            return;
        }
        Prob += (kBitModelTotal - Prob) >> numMoveBits;
    }

    /// <summary>Resets Prob to half of kBitModelTotal.</summary>
    public void Init() { Prob = kBitModelTotal >> 1; }

    /// <summary>
    /// Decodes one bit from <paramref name="rangeDecoder"/>, adapts the model to
    /// the decoded bit and shifts in at most one more input byte.
    /// </summary>
    /// <returns>The decoded bit, 0 or 1.</returns>
    public uint Decode(RangeCoder.Decoder rangeDecoder)
    {
        uint bound = (uint)(rangeDecoder.Range >> kNumBitModelTotalBits) * (uint)Prob;

        uint bit;
        if (rangeDecoder.Code < bound)
        {
            bit = 0;
            rangeDecoder.Range = bound;
        }
        else
        {
            bit = 1;
            rangeDecoder.Range -= bound;
            rangeDecoder.Code -= bound;
        }

        // Same adaptation as UpdateModel with the fixed kNumMoveBits shift.
        UpdateModel(kNumMoveBits, bit);

        // Single-step renormalisation, shared by both outcomes.
        if (rangeDecoder.Range < Decoder.kTopValue)
        {
            rangeDecoder.Code = (rangeDecoder.Code << 8) | (byte)rangeDecoder.Stream.ReadByte();
            rangeDecoder.Range <<= 8;
        }

        return bit;
    }
}
/// <summary>
/// Decodes fixed-width symbols bit by bit, selecting a separate BitDecoder
/// model for every prefix of the bits decoded so far.
/// </summary>
struct BitTreeDecoder
{
    BitDecoder[] Models;
    int NumBitLevels;

    /// <summary>Allocates 2^<paramref name="numBitLevels"/> bit models.</summary>
    public BitTreeDecoder(int numBitLevels)
    {
        NumBitLevels = numBitLevels;
        Models = new BitDecoder[1 << numBitLevels];
    }

    /// <summary>Initialises every model from index 1 upwards; index 0 is never used.</summary>
    public void Init()
    {
        uint index = 1;
        while (index < (1 << NumBitLevels))
        {
            Models[index].Init();
            index++;
        }
    }

    /// <summary>Decodes a NumBitLevels-bit symbol, most significant bit first.</summary>
    public uint Decode(RangeCoder.Decoder rangeDecoder)
    {
        uint node = 1;
        int levelsLeft = NumBitLevels;
        while (levelsLeft > 0)
        {
            uint bit = Models[node].Decode(rangeDecoder);
            node = (node << 1) + bit;
            levelsLeft--;
        }
        // Strip the leading 1 that marked the root.
        return node - ((uint)1 << NumBitLevels);
    }

    /// <summary>Decodes a NumBitLevels-bit symbol, least significant bit first.</summary>
    public uint ReverseDecode(RangeCoder.Decoder rangeDecoder)
    {
        // Same walk as the static overload, with this instance's models and no offset.
        return ReverseDecode(Models, 0, rangeDecoder, NumBitLevels);
    }

    /// <summary>
    /// Decodes a symbol of <paramref name="NumBitLevels"/> bits, least significant
    /// bit first, using the models located at <paramref name="startIndex"/> plus
    /// the current tree node index.
    /// </summary>
    public static uint ReverseDecode(BitDecoder[] Models, UInt32 startIndex,
        RangeCoder.Decoder rangeDecoder, int NumBitLevels)
    {
        uint node = 1;
        uint value = 0;
        for (int level = 0; level < NumBitLevels; level++)
        {
            uint bit = Models[startIndex + node].Decode(rangeDecoder);
            node = (node << 1) + bit;
            value |= (bit << level);
        }
        return value;
    }
}
/// <summary>
/// Provides the fields that represent property identifiers for compressing.
/// </summary>
public enum CoderPropID
{
    /// <summary>
    /// Specifies default property.
    /// </summary>
    DefaultProp = 0,
    /// <summary>
    /// Specifies size of dictionary.
    /// </summary>
    DictionarySize,
    /// <summary>
    /// Specifies size of memory for PPM*.
    /// </summary>
    UsedMemorySize,
    /// <summary>
    /// Specifies order for PPM methods.
    /// </summary>
    Order,
    /// <summary>
    /// Specifies Block Size.
    /// </summary>
    BlockSize,
    /// <summary>
    /// Specifies number of position state bits for LZMA (0 &lt;= x &lt;= 4).
    /// </summary>
    PosStateBits,
    /// <summary>
    /// Specifies number of literal context bits for LZMA (0 &lt;= x &lt;= 8).
    /// </summary>
    LitContextBits,
    /// <summary>
    /// Specifies number of literal position bits for LZMA (0 &lt;= x &lt;= 4).
    /// </summary>
    LitPosBits,
    /// <summary>
    /// Specifies number of fast bytes for LZ*.
    /// </summary>
    NumFastBytes,
    /// <summary>
    /// Specifies match finder. LZMA: "BT2", "BT4" or "BT4B".
    /// </summary>
    MatchFinder,
    /// <summary>
    /// Specifies the number of match finder cycles.
    /// </summary>
    MatchFinderCycles,
    /// <summary>
    /// Specifies number of passes.
    /// </summary>
    NumPasses,
    /// <summary>
    /// Specifies the number of the algorithm.
    /// </summary>
    Algorithm,
    /// <summary>
    /// Specifies the number of threads.
    /// </summary>
    NumThreads,
    /// <summary>
    /// Specifies mode with end marker.
    /// </summary>
    EndMarker
};
/// <summary>
/// Reads tab-separated examples from <paramref name="srIn"/> and adds each
/// well-formed line through AddExample.
/// </summary>
/// <param name="srIn">Reader to consume line by line.</param>
/// <param name="sFormat">
/// Column layout: the position of 'W' (word), 'L' (lemma), 'M' (MSD) and
/// 'F' (weight) inside this string is the zero-based column of that field.
/// 'W' and 'L' are required; 'M' and 'F' are optional.
/// </param>
/// <remarks>
/// Problems are reported on the console. A line with too few columns counts as
/// an error and is skipped; reading stops after 50 such lines. A lemma of "="
/// means "same as the word".
/// </remarks>
/// <exception cref="ArgumentNullException">An argument is null.</exception>
public void AddMultextFile(StreamReader srIn, string sFormat) {
    if (srIn == null) { throw new ArgumentNullException("srIn"); }
    if (sFormat == null) { throw new ArgumentNullException("sFormat"); }

    const int iMaxErrors = 50;

    int iW = sFormat.IndexOf('W');
    int iL = sFormat.IndexOf('L');
    int iM = sFormat.IndexOf('M');
    int iF = sFormat.IndexOf('F');
    // Minimum number of columns a line needs so that every requested field exists.
    int iLen = Math.Max(Math.Max(iW, iL), Math.Max(iM, iF)) + 1;

    if (iW < 0 || iL < 0) {
        Console.WriteLine(" Can not find word and lemma location in the format specification");
        return;
    }

    string sLine = null;
    int iError = 0;
    int iLine = 0;

    while ((sLine = srIn.ReadLine()) != null && iError < iMaxErrors) {
        iLine++;

        string[] asWords = sLine.Split(new char[] { '\t' });
        if (asWords.Length < iLen) {
            Console.WriteLine(" ERROR: Line doesn't confirm to the given format \"" + sFormat + "\"! Line " + iLine.ToString() + ".");
            iError++;
            continue;
        }

        string sWord = asWords[iW];
        string sLemma = asWords[iL];
        if (sLemma == "=") { sLemma = sWord; }

        string sMsd = null;
        if (iM > -1) { sMsd = asWords[iM]; }

        // The weight lives in the 'F' column. It used to be read from the 'M'
        // column, which parsed the MSD text (or indexed -1 when 'M' was absent).
        double dWeight = 1;
        if (iF > -1 && !Double.TryParse(asWords[iF], out dWeight)) {
            // TryParse zeroes its out argument on failure; keep the default weight instead.
            dWeight = 1;
        }

        AddExample(sWord, sLemma, dWeight, sMsd);
    }

    if (iError == iMaxErrors) { Console.WriteLine("Parsing stopped because of too many (50) errors. Check format specification"); }
}
/// <summary>
/// Returns all examples as text, one example per line (each rendered with its own ToString()).
/// </summary>
public override string ToString() {
    // lstExamples is null after construction and after every Add/DropExamples;
    // build it first, exactly as the indexer, Count, WeightSum and ListExamples do.
    if (lstExamples == null) { FinalizeAdditions(); }

    var sb = new StringBuilder();
    foreach (LemmaExample exm in lstExamples) {
        sb.AppendLine(exm.ToString());
    }

    return sb.ToString();
}
/// <summary>
/// ISerializable deserialization constructor: restores the settings and re-adds every stored example.
/// </summary>
/// <param name="info">Must contain "lsett", "aWords", "aLemmas", "aWeights" and "aMsds" as written by GetObjectData.</param>
/// <param name="context">Not used.</param>
public ExampleList(SerializationInfo info, StreamingContext context) {
    lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));

    // NOTE(review): the generic arguments were missing in this copy of the file;
    // <string, LemmaExample> is taken from dictExamples.Add(le.Signature, le) — confirm
    // against the field declaration.
    this.dictExamples = new Dictionary<string, LemmaExample>();
    this.lstExamples = null;
    this.rlRules = new RuleList(lsett);

    var aWords = (string[])info.GetValue("aWords", typeof(string[]));
    var aLemmas = (string[])info.GetValue("aLemmas", typeof(string[]));
    var aWeights = (double[])info.GetValue("aWeights", typeof(double[]));
    var aMsds = (string[])info.GetValue("aMsds", typeof(string[]));

    // the four arrays are parallel: index i describes one example.
    // Rules are not stored; AddExample recreates them in rlRules.
    for (int iExm = 0; iExm < aWords.Length; iExm++) {
        AddExample(aWords[iExm], aLemmas[iExm], aWeights[iExm], aMsds[iExm]);
    }
}
/// <summary>
/// Writes this list in binary form: top-object flag, optional settings, the rules, then the examples.
/// </summary>
/// <param name="binWrt">Destination writer.</param>
/// <param name="bSerializeExamples">When false, an empty example section (flag false, count 0) is written.</param>
/// <param name="bThisTopObject">When true, the settings are written too; otherwise the reader must supply them.</param>
/// <remarks>
/// Example section layout, as read back by Deserialize: bool "sorted list was built", int count,
/// then per example its rule signature followed by the example itself.
/// </remarks>
public void Serialize(BinaryWriter binWrt, bool bSerializeExamples, bool bThisTopObject) {
    //save metadata
    binWrt.Write(bThisTopObject);

    //save reference types if needed
    if (bThisTopObject) {
        lsett.Serialize(binWrt);
    }

    // serialize rules
    rlRules.Serialize(binWrt, false);

    // serialize examples
    if (!bSerializeExamples) {
        binWrt.Write(false); // lstExamples == null
        binWrt.Write(0);     // dictExamples.Count == 0
        return;
    }

    bool bListBuilt = lstExamples != null;
    binWrt.Write(bListBuilt);

    if (bListBuilt) {
        //save items in list order
        binWrt.Write(lstExamples.Count);
        foreach (LemmaExample le in lstExamples) {
            binWrt.Write(le.Rule.Signature);
            le.Serialize(binWrt, false);
        }
    }
    else {
        //save items in dictionary order (Values enumerates in the same order as the key/value pairs)
        binWrt.Write(dictExamples.Count);
        foreach (LemmaExample le in dictExamples.Values) {
            binWrt.Write(le.Rule.Signature);
            le.Serialize(binWrt, false);
        }
    }
}
/// <summary>
/// Replaces the content of this list with data read from a Latino serializer (counterpart of Save).
/// </summary>
/// <param name="binRead">Source positioned at the start of a saved ExampleList.</param>
/// <param name="lsett">Settings to use when the stream does not carry its own (top-object flag is false).</param>
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
    //load metadata
    bool bThisTopObject = binRead.ReadBool();

    //load reference types if needed
    this.lsett = bThisTopObject ? new LemmatizerSettings(binRead) : lsett;

    rlRules = new RuleList(binRead, this.lsett);

    // the stored flag says whether the sorted list existed when the data was saved
    bool bCreateLstExamples = binRead.ReadBool();

    // NOTE(review): generic arguments were missing in this copy of the file; restored from
    // usage (values are LemmaExample, keys are le.Signature) — confirm against the field declarations.
    lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
    dictExamples = new Dictionary<string, LemmaExample>();

    //load dictionary items
    int iCount = binRead.ReadInt();
    for (int iId = 0; iId < iCount; iId++) {
        // each example is preceded by the signature of its rule, which must already be in rlRules
        LemmaRule lrRule = rlRules[binRead.ReadString()];
        LemmaExample le = new LemmaExample(binRead, this.lsett, lrRule);

        dictExamples.Add(le.Signature, le);
        if (bCreateLstExamples) { lstExamples.Add(le); }
    }
}
/// <summary>
/// Applies this rule to <paramref name="sWord"/>: the last iFrom characters are cut off and sTo is appended.
/// </summary>
/// <param name="sWord">Word to transform; it must be at least iFrom characters long.</param>
/// <returns>
/// The stem followed by sTo. When the removed ending contains no lowercase letter (and is not empty),
/// sTo is appended in upper case.
/// </returns>
public string Lemmatize(string sWord)
{
    int iStemLength = sWord.Length - iFrom;

    // ending that the rule removes; it decides the casing of the replacement
    string sRemoved = sWord.Substring(iStemLength, iFrom);
    string sStem = sWord.Substring(0, iStemLength);

    if (IsFullyUpper(sRemoved))
    {
        return sStem + sTo.ToUpper();
    }
    return sStem + sTo;
}
/// <summary>
/// Fills this rule from a binary stream written by Serialize(BinaryWriter, bool).
/// </summary>
/// <param name="binRead">Source positioned at the start of a serialized rule.</param>
/// <param name="lsett">Settings to use when the stream does not carry its own (top-object flag is false).</param>
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett) {
    // metadata: were the settings stored together with this rule?
    bool bThisTopObject = binRead.ReadBoolean();

    // value fields, in the order they were written
    iId = binRead.ReadInt32();
    iFrom = binRead.ReadInt32();

    // sFrom is optional: a flag tells whether a string follows
    bool bHasFrom = binRead.ReadBoolean();
    sFrom = bHasFrom ? binRead.ReadString() : null;

    sTo = binRead.ReadString();
    sSignature = binRead.ReadString();

    // settings come last in the stream, and only for a top object
    this.lsett = bThisTopObject ? new LemmatizerSettings(binRead) : lsett;
}
/// <summary>
/// Tells whether <paramref name="value"/> counts as upper case, i.e. it is non-empty and
/// contains no lowercase letter. Digits and punctuation do not prevent a true result.
/// </summary>
/// <param name="value">Text to inspect; may be null.</param>
/// <returns>False for null, for an empty string and for any string with a lowercase letter; true otherwise.</returns>
public static bool IsFullyUpper(string value)
{
    if (string.IsNullOrEmpty(value))
    {
        return false;
    }

    foreach (char c in value)
    {
        if (char.IsLower(c)) { return false; }
    }

    return true;
}
/// <summary>
/// How the algorithm considers msd tags.
/// MSD stands for the wordform morphosyntactic description.
/// </summary>
/// <remarks>
/// The serialization functions of LemmatizerSettings store this value as its underlying int,
/// so the order of the members must not change.
/// NOTE(review): the original comment also contained the sentence
/// "This is the set of all lemmas starting with "writ-", as they appear in the Multext English lexicon",
/// which looks misplaced here — confirm and remove.
/// </remarks>
public enum MsdConsideration {
    /// <summary>
    /// Completely ignores msd tags (joins examples with different tags and sums their weights).
    /// </summary>
    Ignore,
    /// <summary>
    /// Same examples with different msd's are not considered equal and are not joined.
    /// </summary>
    Distinct,
    /// <summary>
    /// Joins examples with different tags (concatenates all msd tags).
    /// </summary>
    JoinAll,
    /// <summary>
    /// Joins examples with different tags (concatenates just distinct msd tags - somewhat slower).
    /// </summary>
    JoinDistinct,
    /// <summary>
    /// Joins examples with different tags (new tag is the left to right substring that all joined examples share).
    /// </summary>
    JoinSameSubstring
}
/// <summary>
/// Creates an independent copy of these settings.
/// </summary>
/// <returns>A new LemmatizerSettings instance carrying the same four values.</returns>
public LemmatizerSettings CloneDeep() {
    var lsettCopy = new LemmatizerSettings();

    // all settings are value types, so plain assignment already yields a deep copy
    lsettCopy.bUseFromInRules = this.bUseFromInRules;
    lsettCopy.eMsdConsider = this.eMsdConsider;
    lsettCopy.iMaxRulesPerNode = this.iMaxRulesPerNode;
    lsettCopy.bBuildFrontLemmatizer = this.bBuildFrontLemmatizer;

    return lsettCopy;
}
/// <summary>
/// Overwrites all four settings with values read from <paramref name="binRead"/>.
/// </summary>
/// <param name="binRead">Source positioned at settings written by Serialize(BinaryWriter).</param>
/// <remarks>
/// The read order mirrors Serialize(BinaryWriter): bool, int (enum value), int, bool.
/// Fields are assigned one by one, so a failing read leaves the earlier fields already changed.
/// </remarks>
public void Deserialize(BinaryReader binRead) {
    bUseFromInRules = binRead.ReadBoolean();
    // stored as the enum's underlying int; cast back without a range check
    eMsdConsider = (MsdConsideration)binRead.ReadInt32();
    iMaxRulesPerNode = binRead.ReadInt32();
    bBuildFrontLemmatizer = binRead.ReadBoolean();
}
/// <summary>
/// Registers <paramref name="lrRuleNew"/> under its signature unless a rule with that signature is already present.
/// </summary>
/// <param name="lrRuleNew">Candidate rule.</param>
/// <returns>The rule stored for that signature: the existing one if there was one, otherwise <paramref name="lrRuleNew"/>.</returns>
private LemmaRule AddRule(LemmaRule lrRuleNew) {
    LemmaRule lrExisting;

    // an equal signature means the same rule: keep the first instance (and its id)
    if (this.TryGetValue(lrRuleNew.Signature, out lrExisting)) {
        return lrExisting;
    }

    this.Add(lrRuleNew.Signature, lrRuleNew);
    return lrRuleNew;
}
/// <summary>
/// Replaces the content of this list with rules read from a binary stream written by Serialize(BinaryWriter, bool).
/// </summary>
/// <param name="binRead">Source positioned at the start of a serialized rule list.</param>
/// <param name="lsett">Settings to use when the stream does not carry its own (top-object flag is false).</param>
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett) {
    // metadata: were the settings stored together with this list?
    bool bThisTopObject = binRead.ReadBoolean();

    if (bThisTopObject) {
        this.lsett = new LemmatizerSettings(binRead);
    }
    else {
        this.lsett = lsett;
    }

    // list items: a count, then (key, rule) pairs
    this.Clear();
    int iCount = binRead.ReadInt32();
    for (int iLeft = iCount; iLeft > 0; iLeft--) {
        string sKey = binRead.ReadString();
        var lrVal = new LemmaRule(binRead, this.lsett);
        this.Add(sKey, lrVal);
    }

    // only the signature of the default rule was saved; the rule itself is one of the items above
    string sDefaultSignature = binRead.ReadString();
    lrDefaultRule = this[sDefaultSignature];
}
/// <summary>
/// A lemmatization rule paired with a weight. Instances sort by descending weight.
/// </summary>
[Serializable]
class RuleWeighted : IComparable<RuleWeighted> {

    // Private Variables ---------------------

    private LemmaRule lrRule;
    private double dWeight;


    // Constructor(s) & Destructor(s) -------

    /// <summary>Pairs <paramref name="lrRule"/> with <paramref name="dWeight"/>.</summary>
    public RuleWeighted(LemmaRule lrRule, double dWeight) {
        this.lrRule = lrRule;
        this.dWeight = dWeight;
    }


    // Public Properties --------------------

    /// <summary>The wrapped rule.</summary>
    public LemmaRule Rule {
        get { return lrRule; }
    }
    /// <summary>The weight given at construction.</summary>
    public double Weight {
        get { return dWeight; }
    }


    // Essential Class Functions (comparing objects, eg.: for sorting) -------

    /// <summary>
    /// Orders by weight, heavier first; equal weights are ordered by rule id, higher id first.
    /// </summary>
    /// <param name="rl">Other instance; null is allowed.</param>
    /// <returns>1 when this instance sorts after <paramref name="rl"/>, -1 when before, 0 when equal.</returns>
    public int CompareTo(RuleWeighted rl) {
        // IComparable<T> convention: every instance compares greater than null
        // (the original dereferenced rl and threw NullReferenceException).
        if (rl == null) { return 1; }

        if (this.dWeight < rl.dWeight) { return 1; }
        if (this.dWeight > rl.dWeight) { return -1; }
        if (this.lrRule.Id < rl.lrRule.Id) { return 1; }
        if (this.lrRule.Id > rl.lrRule.Id) { return -1; }
        return 0;
    }


    // Output & Serialization Functions -----------

    /// <summary>Returns the rule text followed by the weight formatted as a percentage in parentheses.</summary>
    public override string ToString() {
        return lrRule.ToString() + dWeight.ToString("(0.00%)");
    }

}
/// <summary>
/// Contract of a lemmatization model: something that can turn a word into its lemma.
/// </summary>
public interface ILemmatizerModel
{
    /// <summary>Returns the lemma for <paramref name="sWord"/>.</summary>
    /// <param name="sWord">Word form to lemmatize.</param>
    string Lemmatize(string sWord);

    /// <summary>
    /// Returns a textual representation of the model.
    /// NOTE(review): the content of the text is defined by the implementing class — not visible from here.
    /// </summary>
    string ToString();
}
| bin\x86\Debug\ 59 | DEBUG;TRACE 60 | full 61 | x86 62 | true 63 | GlobalSuppressions.cs 64 | prompt 65 | 66 | 67 | bin\x86\Release\ 68 | TRACE 69 | true 70 | pdbonly 71 | x86 72 | true 73 | GlobalSuppressions.cs 74 | prompt 75 | 76 | 77 | true 78 | bin\x64\Debug\ 79 | DEBUG;TRACE 80 | full 81 | x64 82 | true 83 | GlobalSuppressions.cs 84 | prompt 85 | 86 | 87 | bin\x64\Release\ 88 | TRACE 89 | true 90 | pdbonly 91 | x64 92 | true 93 | GlobalSuppressions.cs 94 | prompt 95 | 96 | 97 | false 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | Code 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | False 148 | .NET Framework 3.5 SP1 Client Profile 149 | false 150 | 151 | 152 | False 153 | .NET Framework 2.0 %28x86%29 154 | true 155 | 156 | 157 | False 158 | .NET Framework 3.0 %28x86%29 159 | false 160 | 161 | 162 | False 163 | .NET Framework 3.5 164 | false 165 | 166 | 167 | False 168 | .NET Framework 3.5 SP1 169 | false 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 213 | -------------------------------------------------------------------------------- /LemmaSharp/LemmaSharp.nuspec: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | LemmaGenerator 5 | 1.1.0 6 | Lemmatizer generator 7 | AlexPoint 8 | AlexPoint 9 | https://github.com/AlexPoint/LemmaGenerator 10 | false 11 | Generator of lemmatizers for several European languages 12 | Added exceptions to lemmatizers 13 | Copyright 2014 14 | lemmatization lemmatisation lemmatizer stemming stemmer 15 | 16 | -------------------------------------------------------------------------------- 
/LemmaSharpPrebuiltFull.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.21005.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LemmaSharp", "LemmaSharp\LemmaSharp.csproj", "{A39293C1-92D8-47B9-93A4-41F443B4F9E4}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}" 9 | EndProject 10 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SourceFileBuilder", "SourceFileBuilder\SourceFileBuilder.csproj", "{D5450D07-4F00-4EA5-A99A-A570845CE6F9}" 11 | EndProject 12 | Global 13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 14 | Debug|Any CPU = Debug|Any CPU 15 | Debug|x64 = Debug|x64 16 | Debug|x86 = Debug|x86 17 | Release|Any CPU = Release|Any CPU 18 | Release|x64 = Release|x64 19 | Release|x86 = Release|x86 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x64.ActiveCfg = Debug|x64 25 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x64.Build.0 = Debug|x64 26 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x86.ActiveCfg = Debug|x86 27 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Debug|x86.Build.0 = Debug|x86 28 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|Any CPU.ActiveCfg = Release|Any CPU 29 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|Any CPU.Build.0 = Release|Any CPU 30 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x64.ActiveCfg = Release|x64 31 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x64.Build.0 = Release|x64 32 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x86.ActiveCfg = Release|x86 
33 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4}.Release|x86.Build.0 = Release|x86 34 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 35 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Debug|Any CPU.Build.0 = Debug|Any CPU 36 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Debug|x64.ActiveCfg = Debug|Any CPU 37 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Debug|x86.ActiveCfg = Debug|Any CPU 38 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Release|Any CPU.ActiveCfg = Release|Any CPU 39 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Release|Any CPU.Build.0 = Release|Any CPU 40 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Release|x64.ActiveCfg = Release|Any CPU 41 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA}.Release|x86.ActiveCfg = Release|Any CPU 42 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 43 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Debug|Any CPU.Build.0 = Debug|Any CPU 44 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Debug|x64.ActiveCfg = Debug|Any CPU 45 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Debug|x86.ActiveCfg = Debug|Any CPU 46 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Release|Any CPU.ActiveCfg = Release|Any CPU 47 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Release|Any CPU.Build.0 = Release|Any CPU 48 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Release|x64.ActiveCfg = Release|Any CPU 49 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9}.Release|x86.ActiveCfg = Release|Any CPU 50 | EndGlobalSection 51 | GlobalSection(SolutionProperties) = preSolution 52 | HideSolutionNode = FALSE 53 | EndGlobalSection 54 | EndGlobal 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LemmaGenerator 2 | ============== 3 | 4 | LemmaGenerator creates lemmatizers for several European languages that you can customize. 
5 | 6 | This package is available on NuGet: 7 | > Install-Package LemmaGenerator 8 | 9 | This project was created by Matjaz Jursic and was retrieved from http://lemmatise.ijs.si/. He is the expert and did a great job, so for any questions you should check his website. 10 | 11 | Quickstart 12 | ---------------- 13 | 14 | If you just want to lemmatize words, you can use the precompiled lemmatizer files available here: https://github.com/AlexPoint/LemmaGenerator/tree/master/Test/Data. 15 | 16 | Load the selected file into a stream and build a lemmatizer with it: 17 | 18 | ```csharp 19 | var dataFilePath = "/path/to/the/lemmatizer/file"; 20 | var stream = File.OpenRead(dataFilePath); 21 | var lemmatizer = new Lemmatizer(stream); 22 | var result = lemmatizer.Lemmatize("words"); 23 | Console.WriteLine(result); 24 | // prints "word" 25 | ``` 26 | 27 | Note: Since this is an old NuGet package, some newer environments may not support directly referencing the namespace after installing it via nuget.org. In such cases, you can add the .dll file from http://lemmatise.ijs.si/Software/Version3 to your project's References. 28 | 29 | Customizing the lemmatizer 30 | ---------------- 31 | 32 | As mentioned above, you can customize your lemmatizer by using your own dictionary { word, lemma }. 33 | For more information, check [Matjaz's website](http://lemmatise.ijs.si/). 
namespace SourceFileBuilder.Classes
{
    /// <summary>
    /// Reads "enricher" files: plain-text files holding one
    /// <c>word lemma [weight]</c> entry per line. Blank lines and lines
    /// starting with <c>//</c> or <c>#</c> are ignored.
    /// </summary>
    public class EnricherFileReader : StreamReader
    {
        /// <summary>Weight assigned to an entry whose line has no third column.</summary>
        private const int DefaultWeight = 1;

        public EnricherFileReader(Stream stream) : base(stream) { }
        public EnricherFileReader(string filePath) : base(filePath) { }

        /// <summary>
        /// Reads the next line and parses it as a lemma entry.
        /// </summary>
        /// <returns>
        /// A (word, lemma, weight) tuple, or <c>null</c> when the line is blank,
        /// is a comment, or the end of the stream has been reached.
        /// </returns>
        /// <exception cref="FormatException">
        /// The line has fewer than two columns, or its weight column is not an integer.
        /// </exception>
        public Tuple<string, string, int> ReadLemmaEntry()
        {
            var line = this.ReadLine();
            if (line == null)
            {
                return null;
            }

            // Trim first so indented comments and whitespace-only lines are skipped
            // instead of being parsed as data.
            line = line.Trim();
            if (line.Length == 0
                || line.StartsWith("//", StringComparison.Ordinal)
                || line.StartsWith("#", StringComparison.Ordinal))
            {
                return null;
            }

            // Split on any whitespace and drop empty tokens: a doubled space, a tab or a
            // trailing space must not shift the columns or yield an empty weight token.
            var parts = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length < 2)
            {
                throw new FormatException(
                    string.Format("Invalid lemma entry (expected \"word lemma [weight]\"): '{0}'", line));
            }

            // The files are machine-readable data, so parse the weight culture-independently.
            var weight = parts.Length > 2
                ? int.Parse(
                    parts[2],
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture)
                : DefaultWeight;

            return new Tuple<string, string, int>(parts[0], parts[1], weight);
        }

        /// <summary>
        /// Reads every remaining line of the stream and returns the entries found,
        /// in file order. Blank and comment lines are skipped.
        /// </summary>
        public List<Tuple<string, string, int>> ReadAllLemmaEntries()
        {
            var results = new List<Tuple<string, string, int>>();
            while (!this.EndOfStream)
            {
                var lemmaEntry = this.ReadLemmaEntry();
                if (lemmaEntry != null)
                {
                    results.Add(lemmaEntry);
                }
            }
            return results;
        }
    }
}
BS BS 97 | ses ses 98 | sed sed 99 | ecs ecs 100 | dps dps 101 | Bos Bos 102 | DEd DEd 103 | MD MD 104 | MEd MEd 105 | ovs ovs 106 | https https 107 | cest cest 108 | fos fos 109 | mes mes 110 | aks aks 111 | pdas pdas 112 | icbms icbms 113 | dcs dcs 114 | pcs pcs 115 | ged ged 116 | EAS EAS 117 | DVD DVD 118 | gcs gcs 119 | vas vas 120 | BD BD 121 | DCI DCI 122 | pts pts 123 | aus aus 124 | sms sms 125 | das das 126 | WMD WMD 127 | fd fd 128 | wys wys 129 | les les 130 | hd hd 131 | ws ws 132 | zs zs 133 | zd zd 134 | blts blts 135 | mfs mfs 136 | crts crts 137 | qd qd 138 | PBS PBS 139 | KS KS 140 | cas cas 141 | cae cae 142 | PMS PMS 143 | DS DS 144 | ced ced 145 | GI GI 146 | arps arps 147 | lps lps 148 | DOS DOS 149 | uns uns 150 | uvd uvd 151 | OAS OAS 152 | MLS MLS 153 | USPS USPS 154 | HHS HHS 155 | FPS FPS 156 | ns ns 157 | nhs nhs 158 | MIPS MIPS 159 | rns rns 160 | SARS SARS 161 | coed coed 162 | VD VD 163 | PhD PhD 164 | PHS PHS 165 | mpd mpd 166 | tbd tbd 167 | amd amd 168 | ews ews 169 | FRS FRS 170 | bbs bbs 171 | eegs eegs 172 | SOS SOS 173 | LCD LCD 174 | foed foed 175 | acs acs 176 | ges ges 177 | npd npd 178 | FAE FAE 179 | nes nes 180 | gpd gpd 181 | ias ias 182 | aux aux 183 | nae nae 184 | dae dae 185 | fs fs 186 | mbs mbs 187 | PTSD PTSD 188 | tbs tbs 189 | GD GD 190 | ncs ncs 191 | Ted Ted 192 | ARDS ARDS 193 | wbd wbd 194 | CNS CNS 195 | ios ios 196 | lpd lpd 197 | tms tms 198 | ods ods 199 | abd abd 200 | os os 201 | SIDS SIDS 202 | AIDS AIDS 203 | avd avd 204 | bsd bsd 205 | SD SD 206 | ctd ctd 207 | ces ces 208 | usd usd 209 | INS INS 210 | hbs hbs 211 | FWS FWS 212 | GPS GPS 213 | sws sws 214 | pds pds 215 | LSD LSD -------------------------------------------------------------------------------- /SourceFileBuilder/Input/english-contractions.txt: -------------------------------------------------------------------------------- 1 | // Add missing lemmas in this file to enrich the lemmatizer. 
2 | don't do 3 | doesn't do 4 | didn't did 5 | won't will 6 | shan't shall 7 | can't can 8 | couldn't could 9 | wouldn't would 10 | shouldn't should 11 | mustn't must 12 | mightn't might 13 | oughtn't ought 14 | needn't need 15 | aren't are 16 | isn't be 17 | wasn't be 18 | weren't be 19 | haven't have 20 | hasn't have 21 | hadn't have 22 | 's 's 23 | 've have 24 | 'm be 25 | 're be 26 | 'll will -------------------------------------------------------------------------------- /SourceFileBuilder/Input/english-irregular_verbs-enricher.txt: -------------------------------------------------------------------------------- 1 | // List of irregular verbs associated to their infinitive form 2 | // The format is "word lemma weight" 3 | abides abide 10 4 | abiding abide 10 5 | abode abide 10 6 | abided abide 10 7 | abidden abide 10 8 | alights alight 10 9 | alighting alight 10 10 | alit alight 10 11 | alighted alight 10 12 | arises arise 10 13 | arising arise 10 14 | arose arise 10 15 | arisen arise 10 16 | awakes awake 10 17 | awaking awake 10 18 | awoke awake 10 19 | awoken awake 10 20 | is be 10 21 | being be 10 22 | was be 10 23 | were be 10 24 | been be 10 25 | bears bear 10 26 | bearing bear 10 27 | bore bear 10 28 | born bear 10 29 | borne bear 10 30 | beats beat 10 31 | beating beat 10 32 | beat beat 10 33 | beaten beat 10 34 | becomes become 10 35 | becoming become 10 36 | became become 10 37 | become become 10 38 | begins begin 10 39 | beginning begin 10 40 | began begin 10 41 | begun begin 10 42 | beholds behold 10 43 | beholding behold 10 44 | beheld behold 10 45 | bends bend 10 46 | bending bend 10 47 | bent bend 10 48 | bets bet 10 49 | betting bet 10 50 | bet bet 10 51 | bids bid 10 52 | bidding bid 10 53 | bade bid 10 54 | bidden bid 10 55 | bid bid 10 56 | binds bind 10 57 | binding bind 10 58 | bound bind 10 59 | bites bite 10 60 | biting bite 10 61 | bit bite 10 62 | bitten bite 10 63 | bleeds bleed 10 64 | bleeding bleed 10 65 | bled bleed 10 66 | blows 
blow 10 67 | blowing blow 10 68 | blew blow 10 69 | blown blow 10 70 | breaks break 10 71 | breaking break 10 72 | broke break 10 73 | broken break 10 74 | breeds breed 10 75 | breeding breed 10 76 | bred breed 10 77 | brings bring 10 78 | bringing bring 10 79 | brought bring 10 80 | broadcasts broadcast 10 81 | broadcasting broadcast 10 82 | broadcast broadcast 10 83 | broadcasted broadcast 10 84 | builds build 10 85 | building build 10 86 | built build 10 87 | burns burn 10 88 | burning burn 10 89 | burnt burn 10 90 | burned burn 10 91 | bursts burst 10 92 | bursting burst 10 93 | burst burst 10 94 | busts bust 10 95 | busting bust 10 96 | bust bust 10 97 | buys buy 10 98 | buying buy 10 99 | bought buy 10 100 | casts cast 10 101 | casting cast 10 102 | cast cast 10 103 | catches catch 10 104 | catching catch 10 105 | caught catch 10 106 | chooses choose 10 107 | choosing choose 10 108 | chose choose 10 109 | chosen choose 10 110 | claps clap 10 111 | clapping clap 10 112 | clapped clap 10 113 | clapt clap 10 114 | clings cling 10 115 | clinging cling 10 116 | clung cling 10 117 | clothes clothe 10 118 | clothing clothe 10 119 | clad clothe 10 120 | clothed clothe 10 121 | comes come 10 122 | coming come 10 123 | came come 10 124 | come come 10 125 | costs cost 10 126 | costing cost 10 127 | cost cost 10 128 | creeps creep 10 129 | creeping creep 10 130 | crept creep 10 131 | cuts cut 10 132 | cutting cut 10 133 | cut cut 10 134 | dares dare 10 135 | daring dare 10 136 | dared dare 10 137 | durst dare 10 138 | deals deal 10 139 | dealing deal 10 140 | dealt deal 10 141 | digs dig 10 142 | digging dig 10 143 | dug dig 10 144 | dives dive 10 145 | diving dive 10 146 | dived dive 10 147 | dove dive 10 148 | does do 10 149 | doing do 10 150 | did do 10 151 | done do 10 152 | draws draw 10 153 | drawing draw 10 154 | drew draw 10 155 | drawn draw 10 156 | dreams dream 10 157 | dreaming dream 10 158 | dreamt dream 10 159 | dreamed dream 10 160 | drinks drink 10 161 | 
drinking drink 10 162 | drank drink 10 163 | drunk drink 10 164 | drives drive 10 165 | driving drive 10 166 | drove drive 10 167 | driven drive 10 168 | dwells dwell 10 169 | dwelling dwell 10 170 | dwelt dwell 10 171 | eats eat 10 172 | eating eat 10 173 | ate eat 10 174 | eaten eat 10 175 | falls fall 10 176 | falling fall 10 177 | fell fall 10 178 | fallen fall 10 179 | feeds feed 10 180 | feeding feed 10 181 | fed feed 10 182 | feels feel 10 183 | feeling feel 10 184 | felt feel 10 185 | fights fight 10 186 | fighting fight 10 187 | fought fight 10 188 | finds find 10 189 | finding find 10 190 | found find 10 191 | fits fit 10 192 | fitting fit 10 193 | fit fit 10 194 | fitted fit 10 195 | flees flee 10 196 | fleeing flee 10 197 | fled flee 10 198 | flings fling 10 199 | flinging fling 10 200 | flung fling 10 201 | flies fly 10 202 | flying fly 10 203 | flew fly 10 204 | flown fly 10 205 | forbids forbid 10 206 | forbidding forbid 10 207 | forbade forbid 10 208 | forbad forbid 10 209 | forbidden forbid 10 210 | forecasts forecast 10 211 | forecasting forecast 10 212 | forecast forecast 10 213 | forecasted forecast 10 214 | foresees foresee 10 215 | foreseeing foresee 10 216 | foresaw foresee 10 217 | foreseen foresee 10 218 | foretells foretell 10 219 | foretelling foretell 10 220 | foretold foretell 10 221 | forgets forget 10 222 | forgetting forget 10 223 | forgot forget 10 224 | forgotten forget 10 225 | forgives forgive 10 226 | forgiving forgive 10 227 | forgave forgive 10 228 | forgiven forgive 10 229 | forsakes forsake 10 230 | forsaking forsake 10 231 | forsook forsake 10 232 | forsaken forsake 10 233 | freezes freeze 10 234 | freezing freeze 10 235 | froze freeze 10 236 | frozen freeze 10 237 | frostbites frostbite 10 238 | frostbiting frostbite 10 239 | frostbit frostbite 10 240 | frostbitten frostbite 10 241 | gets get 10 242 | getting get 10 243 | got get 10 244 | gotten get 10 245 | gives give 10 246 | giving give 10 247 | gave give 10 248 | 
given give 10 249 | goes go 10 250 | going go 10 251 | went go 10 252 | gone go 10 253 | grinds grind 10 254 | grinding grind 10 255 | ground grind 10 256 | grows grow 10 257 | growing grow 10 258 | grew grow 10 259 | grown grow 10 260 | handwrites handwrite 10 261 | handwriting handwrite 10 262 | handwrote handwrite 10 263 | handwritten handwrite 10 264 | hangs hang 10 265 | hanging hang 10 266 | hung hang 10 267 | hanged hang 10 268 | has have 10 269 | having have 10 270 | had have 10 271 | hears hear 10 272 | hearing hear 10 273 | heard hear 10 274 | hides hide 10 275 | hiding hide 10 276 | hid hide 10 277 | hidden hide 10 278 | hits hit 10 279 | hitting hit 10 280 | hit hit 10 281 | holds hold 10 282 | holding hold 10 283 | held hold 10 284 | hurts hurt 10 285 | hurting hurt 10 286 | hurt hurt 10 287 | inlays inlay 10 288 | inlaying inlay 10 289 | inlaid inlay 10 290 | inputs input 10 291 | inputting input 10 292 | input input 10 293 | inputted input 10 294 | interlays interlay 10 295 | interlaying interlay 10 296 | interlaid interlay 10 297 | keeps keep 10 298 | keeping keep 10 299 | kept keep 10 300 | kneels kneel 10 301 | kneeling kneel 10 302 | knelt kneel 10 303 | kneeled kneel 10 304 | knits knit 10 305 | knitting knit 10 306 | knit knit 10 307 | knitted knit 10 308 | knows know 10 309 | knowing know 10 310 | knew know 10 311 | known know 10 312 | lays lay 10 313 | laying lay 10 314 | laid lay 10 315 | leads lead 10 316 | leading lead 10 317 | led lead 10 318 | leans lean 10 319 | leaning lean 10 320 | leant lean 10 321 | leaned lean 10 322 | leaps leap 10 323 | leaping leap 10 324 | leapt leap 10 325 | leaped leap 10 326 | learns learn 10 327 | learning learn 10 328 | learnt learn 10 329 | learned learn 10 330 | leaves leave 10 331 | leaving leave 10 332 | left leave 10 333 | lends lend 10 334 | lending lend 10 335 | lent lend 10 336 | lets let 10 337 | letting let 10 338 | let let 10 339 | lies lie 10 340 | lying lie 10 341 | lay lie 10 342 | lain lie 
10 343 | lights light 10 344 | lighting light 10 345 | lit light 10 346 | loses lose 10 347 | losing lose 10 348 | lost lose 10 349 | makes make 10 350 | making make 10 351 | made make 10 352 | means mean 10 353 | meaning mean 10 354 | meant mean 10 355 | meets meet 10 356 | meeting meet 10 357 | met meet 10 358 | melts melt 10 359 | melting melt 10 360 | melted melt 10 361 | molten melt 10 362 | misleads mislead 10 363 | misleading mislead 10 364 | misled mislead 10 365 | mistakes mistake 10 366 | mistaking mistake 10 367 | mistook mistake 10 368 | mistaken mistake 10 369 | misunderstands misunderstand 10 370 | misunderstanding misunderstand 10 371 | misunderstood misunderstand 10 372 | misweds miswed 10 373 | miswedding miswed 10 374 | miswed miswed 10 375 | miswedded miswed 10 376 | mows mow 10 377 | mowing mow 10 378 | mowed mow 10 379 | mown mow 10 380 | overdraws overdraw 10 381 | overdrawing overdraw 10 382 | overdrew overdraw 10 383 | overdrawn overdraw 10 384 | overhears overhear 10 385 | overhearing overhear 10 386 | overheard overhear 10 387 | overtakes overtake 10 388 | overtaking overtake 10 389 | overtook overtake 10 390 | overtaken overtake 10 391 | pays pay 10 392 | paying pay 10 393 | paid pay 10 394 | presets preset 10 395 | presetting preset 10 396 | preset preset 10 397 | proves prove 10 398 | proving prove 10 399 | proved prove 10 400 | proven prove 10 401 | puts put 10 402 | putting put 10 403 | put put 10 404 | quits quit 10 405 | quitting quit 10 406 | quit quit 10 407 | re-proves re-prove 10 408 | re-proving re-prove 10 409 | re-proved re-prove 10 410 | re-proven re-prove 10 411 | reads read 10 412 | reading read 10 413 | read read 10 414 | rids rid 10 415 | ridding rid 10 416 | rid rid 10 417 | ridded rid 10 418 | rides ride 10 419 | riding ride 10 420 | rode ride 10 421 | ridden ride 10 422 | rings ring 10 423 | ringing ring 10 424 | rang ring 10 425 | rung ring 10 426 | rises rise 10 427 | rising rise 10 428 | rose rise 10 429 | risen 
rise 10 430 | rives rive 10 431 | riving rive 10 432 | rived rive 10 433 | riven rive 10 434 | runs run 10 435 | running run 10 436 | ran run 10 437 | run run 10 438 | saws saw 10 439 | sawing saw 10 440 | sawed saw 10 441 | sawn saw 10 442 | says say 10 443 | saying say 10 444 | said say 10 445 | sees see 10 446 | seeing see 10 447 | saw see 10 448 | seen see 10 449 | seeks seek 10 450 | seeking seek 10 451 | sought seek 10 452 | sells sell 10 453 | selling sell 10 454 | sold sell 10 455 | sends send 10 456 | sending send 10 457 | sent send 10 458 | sets set 10 459 | setting set 10 460 | set set 10 461 | sews sew 10 462 | sewing sew 10 463 | sewed sew 10 464 | sewn sew 10 465 | shakes shake 10 466 | shaking shake 10 467 | shook shake 10 468 | shaken shake 10 469 | shaves shave 10 470 | shaving shave 10 471 | shaved shave 10 472 | shaven shave 10 473 | shears shear 10 474 | shearing shear 10 475 | shore shear 10 476 | sheared shear 10 477 | shorn shear 10 478 | sheds shed 10 479 | shedding shed 10 480 | shed shed 10 481 | shines shine 10 482 | shining shine 10 483 | shone shine 10 484 | shoes shoe 10 485 | shoeing shoe 10 486 | shod shoe 10 487 | shoots shoot 10 488 | shooting shoot 10 489 | shot shoot 10 490 | shows show 10 491 | showing show 10 492 | showed show 10 493 | shown show 10 494 | shrinks shrink 10 495 | shrinking shrink 10 496 | shrank shrink 10 497 | shrunk shrink 10 498 | shuts shut 10 499 | shutting shut 10 500 | shut shut 10 501 | sings sing 10 502 | singing sing 10 503 | sang sing 10 504 | sung sing 10 505 | sinks sink 10 506 | sinking sink 10 507 | sank sink 10 508 | sunk sink 10 509 | sits sit 10 510 | sitting sit 10 511 | sat sit 10 512 | slays slay 10 513 | slaying slay 10 514 | slew slay 10 515 | slain slay 10 516 | sleeps sleep 10 517 | sleeping sleep 10 518 | slept sleep 10 519 | slides slide 10 520 | sliding slide 10 521 | slid slide 10 522 | slidden slide 10 523 | slings sling 10 524 | slinging sling 10 525 | slung sling 10 526 | slinks 
slink 10 527 | slinking slink 10 528 | slunk slink 10 529 | slits slit 10 530 | slitting slit 10 531 | slit slit 10 532 | smells smell 10 533 | smelling smell 10 534 | smelt smell 10 535 | smelled smell 10 536 | sneaks sneak 10 537 | sneaking sneak 10 538 | sneaked sneak 10 539 | snuck sneak 10 540 | soothsays soothsay 10 541 | soothsaying soothsay 10 542 | soothsaid soothsay 10 543 | sows sow 10 544 | sowing sow 10 545 | sowed sow 10 546 | sown sow 10 547 | speaks speak 10 548 | speaking speak 10 549 | spoke speak 10 550 | spoken speak 10 551 | speeds speed 10 552 | speeding speed 10 553 | sped speed 10 554 | speeded speed 10 555 | spells spell 10 556 | spelling spell 10 557 | spelt spell 10 558 | spelled spell 10 559 | spends spend 10 560 | spending spend 10 561 | spent spend 10 562 | spills spill 10 563 | spilling spill 10 564 | spilt spill 10 565 | spilled spill 10 566 | spins spin 10 567 | spinning spin 10 568 | span spin 10 569 | spun spin 10 570 | spits spit 10 571 | spitting spit 10 572 | spat spit 10 573 | spit spit 10 574 | splits split 10 575 | splitting split 10 576 | split split 10 577 | spoils spoil 10 578 | spoiling spoil 10 579 | spoilt spoil 10 580 | spoiled spoil 10 581 | spreads spread 10 582 | spreading spread 10 583 | spread spread 10 584 | springs spring 10 585 | springing spring 10 586 | sprang spring 10 587 | sprung spring 10 588 | stands stand 10 589 | standing stand 10 590 | stood stand 10 591 | steals steal 10 592 | stealing steal 10 593 | stole steal 10 594 | stolen steal 10 595 | sticks stick 10 596 | sticking stick 10 597 | stuck stick 10 598 | stings sting 10 599 | stinging sting 10 600 | stung sting 10 601 | stinks stink 10 602 | stinking stink 10 603 | stank stink 10 604 | stunk stink 10 605 | strides stride 10 606 | striding stride 10 607 | strode stride 10 608 | strided stride 10 609 | stridden stride 10 610 | strikes strike 10 611 | striking strike 10 612 | struck strike 10 613 | stricken strike 10 614 | strings string 10 615 | 
stringing string 10 616 | strung string 10 617 | strips strip 10 618 | stripping strip 10 619 | stript strip 10 620 | stripped strip 10 621 | strives strive 10 622 | striving strive 10 623 | strove strive 10 624 | striven strive 10 625 | sublets sublet 10 626 | subletting sublet 10 627 | sublet sublet 10 628 | sunburns sunburn 10 629 | sunburning sunburn 10 630 | sunburned sunburn 10 631 | sunburnt sunburn 10 632 | swears swear 10 633 | swearing swear 10 634 | swore swear 10 635 | sworn swear 10 636 | sweats sweat 10 637 | sweating sweat 10 638 | sweat sweat 10 639 | sweated sweat 10 640 | sweeps sweep 10 641 | sweeping sweep 10 642 | swept sweep 10 643 | sweeped sweep 10 644 | swells swell 10 645 | swelling swell 10 646 | swelled swell 10 647 | swollen swell 10 648 | swims swim 10 649 | swimming swim 10 650 | swam swim 10 651 | swum swim 10 652 | swings swing 10 653 | swinging swing 10 654 | swung swing 10 655 | takes take 10 656 | taking take 10 657 | took take 10 658 | taken take 10 659 | teaches teach 10 660 | teaching teach 10 661 | taught teach 10 662 | tears tear 10 663 | tearing tear 10 664 | tore tear 10 665 | torn tear 10 666 | tells tell 10 667 | telling tell 10 668 | told tell 10 669 | thinks think 10 670 | thinking think 10 671 | thought think 10 672 | thrives thrive 10 673 | thriving thrive 10 674 | throve thrive 10 675 | thrived thrive 10 676 | thriven thrive 10 677 | throws throw 10 678 | throwing throw 10 679 | threw throw 10 680 | thrown throw 10 681 | thrusts thrust 10 682 | thrusting thrust 10 683 | thrust thrust 10 684 | treads tread 10 685 | treading tread 10 686 | trod tread 10 687 | trodden tread 10 688 | undergoes undergo 10 689 | undergoing undergo 10 690 | underwent undergo 10 691 | undergone undergo 10 692 | understands understand 10 693 | understanding understand 10 694 | understood understand 10 695 | undertakes undertake 10 696 | undertaking undertake 10 697 | undertook undertake 10 698 | undertaken undertake 10 699 | upsets upset 10 
700 | upsetting upset 10 701 | upset upset 10 702 | vexes vex 10 703 | vexing vex 10 704 | vext vex 10 705 | vexed vex 10 706 | wakes wake 10 707 | waking wake 10 708 | woke wake 10 709 | woken wake 10 710 | wears wear 10 711 | wearing wear 10 712 | wore wear 10 713 | worn wear 10 714 | weaves weave 10 715 | weaving weave 10 716 | wove weave 10 717 | woven weave 10 718 | weds wed 10 719 | wedding wed 10 720 | wed wed 10 721 | wedded wed 10 722 | weeps weep 10 723 | weeping weep 10 724 | wept weep 10 725 | wends wend 10 726 | wending wend 10 727 | wended wend 10 728 | //went wend 10 729 | wets wet 10 730 | wetting wet 10 731 | wet wet 10 732 | wetted wet 10 733 | wins win 10 734 | winning win 10 735 | won win 10 736 | winds wind 10 737 | winding wind 10 738 | wound wind 10 739 | withdraws withdraw 10 740 | withdrawing withdraw 10 741 | withdrew withdraw 10 742 | withdrawn withdraw 10 743 | withholds withhold 10 744 | withholding withhold 10 745 | withheld withhold 10 746 | withstands withstand 10 747 | withstanding withstand 10 748 | withstood withstand 10 749 | wrings wring 10 750 | wringing wring 10 751 | wrung wring 10 752 | writes write 10 753 | writing write 10 754 | wrote write 10 755 | written write 10 756 | zincs zinc 10 757 | zincks zinc 10 758 | zincking zinc 10 759 | zinced zinc 10 760 | zincked zinc 10 -------------------------------------------------------------------------------- /SourceFileBuilder/Input/english-lemma-enricher.txt: -------------------------------------------------------------------------------- 1 | // Add missing lemmas in this file to enrich the lemmatizer. 
namespace SourceFileBuilder
{
    /// <summary>
    /// Console tool that loads the prebuilt English lemmatizer, enriches it with every
    /// file found in the <c>Input/</c> directory and writes the result to <c>Output/</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Paths are resolved relative to the current directory
        /// (two levels up, i.e. the project directory when run from <c>bin/&lt;cfg&gt;</c>).
        /// </summary>
        static void Main(string[] args)
        {
            var currentDirectory = Environment.CurrentDirectory + "/../../";
            var lemmatizerFilePath = currentDirectory + "../Test/Data/full7z-mlteast-en.lem";

            var fileName = Path.GetFileNameWithoutExtension(lemmatizerFilePath) + "-modified";
            var extension = Path.GetExtension(lemmatizerFilePath);
            var outputFilePath = string.Format("{0}Output/{1}{2}", currentDirectory, fileName, extension);

            var enricherFilePaths = Directory.GetFiles(currentDirectory + "Input/");
            // Directory.GetFiles does not guarantee any order, and the order matters here:
            // the same word can appear in several input files with different lemmas.
            // Sort so that the generated file is reproducible.
            Array.Sort(enricherFilePaths, StringComparer.Ordinal);

            using (var stream = File.OpenRead(lemmatizerFilePath))
            {
                // create base lemmatizer with data in the base source file
                var lemmatizer = new Lemmatizer(stream);

                // then, enrich lemmatizer with every other file
                foreach (var filePath in enricherFilePaths)
                {
                    EnrichLemmatizerWithDataFile(lemmatizer, filePath);
                }

                // persist lemmatizer in output file
                Console.WriteLine("Writing output file...");
                // File.Create fails if the target directory is missing; create it first
                // (CreateDirectory is a no-op when the directory already exists).
                Directory.CreateDirectory(Path.GetDirectoryName(outputFilePath));
                using (var oStream = File.Create(outputFilePath))
                {
                    lemmatizer.Serialize(oStream, true, Lemmatizer.Compression.Lzma, true);
                }
                Console.WriteLine("Output file written at {0}", outputFilePath);
            }

            Console.WriteLine("OK");
            Console.ReadKey();
        }

        /// <summary>
        /// Reads all entries of one enricher file and feeds them to the lemmatizer.
        /// </summary>
        private static void EnrichLemmatizerWithDataFile(Lemmatizer lemmatizer, string enricherFilePath)
        {
            // The reader is a StreamReader over a file: dispose it to release the handle.
            using (var fileReader = new EnricherFileReader(enricherFilePath))
            {
                var newLemmas = fileReader.ReadAllLemmaEntries();
                EnrichLemmatizerWithExamples(lemmatizer, newLemmas);
            }
        }

        /// <summary>
        /// Adds each (word, lemma, weight) entry to the lemmatizer, in enumeration order.
        /// </summary>
        private static void EnrichLemmatizerWithExamples(Lemmatizer lemmatizer, IEnumerable<Tuple<string, string, int>> wordsAndLemmaToAdd)
        {
            // add new words and lemma
            foreach (var wordAndLemma in wordsAndLemmaToAdd)
            {
                // NOTE(review): the weight column (Item3) is parsed from the input files but
                // never used here - confirm whether it should be passed to AddExample.
                AddExampleOrException(lemmatizer, wordAndLemma.Item1, wordAndLemma.Item2);
            }
        }

        /// <summary>
        /// Makes sure <paramref name="word"/> lemmatizes to <paramref name="lemma"/>:
        /// does nothing if it already does, otherwise adds it as an example and, if the
        /// lemmatizer still answers differently, registers it as an explicit exception.
        /// </summary>
        private static void AddExampleOrException(Lemmatizer lemmatizer, string word, string lemma)
        {
            // compute the lemma of this example
            var computedLemma = lemmatizer.Lemmatize(word);
            if (computedLemma == lemma)
            {
                return;
            }

            // the computed lemma is different from what we expect: add this example to the
            // lemmatizer (it can then deduce a new rule and succeed, or still fail)
            lemmatizer.AddExample(word, lemma);

            // if it still doesn't work --> add exception
            var computedLemma2 = lemmatizer.Lemmatize(word);
            if (computedLemma2 != lemma)
            {
                Console.WriteLine("Added lemma exception: {0} -> {1}", word, lemma);
                lemmatizer.AddException(word, lemma);
            }
        }
    }
}
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("9ec3fd02-38fa-4383-a4d7-a66d9b1e3d7c")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /SourceFileBuilder/SourceFileBuilder.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {D5450D07-4F00-4EA5-A99A-A570845CE6F9} 8 | Exe 9 | Properties 10 | SourceFileBuilder 11 | SourceFileBuilder 12 | v4.5 13 | 512 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4} 57 | LemmaSharp 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 72 | -------------------------------------------------------------------------------- /Test/App.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Test/Classes/LemmatizerPrebuilt.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Runtime.Serialization; 4 | using System.Reflection; 5 | using LemmaSharp.Classes; 6 | 7 | 
namespace Test.Classes
{
    /// <summary>
    /// Languages for which a prebuilt lemmatizer model can be requested.
    /// The declaration order is significant: the numeric value of each member is used
    /// as the row index into the language/lexicon table of <see cref="LemmatizerPrebuilt"/>.
    /// </summary>
    public enum LanguagePrebuilt
    {
        //from Multext-East v4 lexicons
        Bulgarian,
        Czech,
        English,
        Estonian,
        Persian,
        French,
        Hungarian,
        Macedonian,
        Polish,
        Romanian,
        Russian,
        Slovak,
        Slovene,
        Serbian,
        Ukrainian,
        //from Multext lexicons
        EnglishMt,
        FrenchMt,
        German,
        Italian,
        Spanish,
    }

    /// <summary>
    /// Lexicon families the prebuilt models are derived from. The member names must match
    /// (case-insensitively) the lexicon strings stored in the mapping table of
    /// <see cref="LemmatizerPrebuilt"/>, because <see cref="LemmatizerPrebuilt.GetLexicon"/>
    /// parses them by name.
    /// </summary>
    public enum LexiconPrebuilt
    {
        MltEast,
        Multext
    }

    /// <summary>
    /// Base class for lemmatizers whose model is loaded from a resource embedded in an assembly.
    /// It maps a <see cref="LanguagePrebuilt"/> value to a resource file name and locates the
    /// matching manifest resource in the assembly supplied by the derived class.
    /// </summary>
    [Serializable]
    public abstract class LemmatizerPrebuilt : Lemmatizer
    {
        // Private Variables --------------------------------

        // Flattened table of (language code, lexicon name) pairs. Row i (elements 2*i and
        // 2*i + 1) belongs to the LanguagePrebuilt member whose numeric value is i, so the
        // rows must stay in the same order as the enum members.
        private static readonly string[] AsLangMapping =
        {
            "bg", "mlteast",
            "cs", "mlteast",
            "en", "mlteast",
            "et", "mlteast",
            "fa", "mlteast",
            "fr", "mlteast",
            "hu", "mlteast",
            "mk", "mlteast",
            "pl", "mlteast",
            "ro", "mlteast",
            "ru", "mlteast",
            "sk", "mlteast",
            "sl", "mlteast",
            "sr", "mlteast",
            "uk", "mlteast",
            "en", "multext",
            "fr", "multext",
            "ge", "multext",
            "it", "multext",
            "sp", "multext",
        };

        private readonly LanguagePrebuilt lang;


        // Constructor(s) & Destructor(s) ----------------------

        /// <summary>Creates a lemmatizer bound to <paramref name="lang"/> with the base class defaults.</summary>
        /// <param name="lang">Language whose prebuilt model this instance represents.</param>
        public LemmatizerPrebuilt(LanguagePrebuilt lang)
        {
            this.lang = lang;
        }

        /// <summary>Creates a lemmatizer bound to <paramref name="lang"/> using the given settings.</summary>
        /// <param name="lang">Language whose prebuilt model this instance represents.</param>
        /// <param name="lsett">Settings forwarded unchanged to the base constructor.</param>
        public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
            : base(lsett)
        {
            this.lang = lang;
        }


        // Private Properties Helping Functions ---------------

        /// <summary>Builds the resource file name for this instance's language.</summary>
        /// <param name="sFileMask">Composite format string with a single <c>{0}</c> placeholder.</param>
        /// <returns><paramref name="sFileMask"/> with <c>{0}</c> replaced by "lexicon-language".</returns>
        protected string GetResourceFileName(string sFileMask)
        {
            return GetResourceFileName(sFileMask, lang);
        }

        /// <summary>
        /// Builds the resource file name for <paramref name="lang"/>, e.g. the mask
        /// "full7z-{0}.lem" and <see cref="LanguagePrebuilt.English"/> give "full7z-mlteast-en.lem".
        /// </summary>
        /// <param name="sFileMask">Composite format string with a single <c>{0}</c> placeholder.</param>
        /// <param name="lang">Language to build the name for.</param>
        /// <returns>The formatted file name.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="lang"/> has no row in the mapping table.
        /// </exception>
        public static string GetResourceFileName(string sFileMask, LanguagePrebuilt lang)
        {
            int row = GetMappingRow(lang);
            string langFileName = AsLangMapping[row + 1] + '-' + AsLangMapping[row];
            return String.Format(sFileMask, langFileName);
        }

        // Returns the index of the first element of the table row that belongs to the given
        // language, rejecting enum values (e.g. produced by a cast) that have no row.
        private static int GetMappingRow(LanguagePrebuilt lang)
        {
            int index = (int)lang;
            // Range-check before multiplying so that extreme values cannot wrap around.
            if (index < 0 || index >= AsLangMapping.Length / 2)
            {
                throw new ArgumentOutOfRangeException("lang", lang, "No prebuilt model is registered for this language value.");
            }

            return index * 2;
        }


        // Public Properties ----------------------------------

        /// <summary>Language this instance was constructed for.</summary>
        // NOTE(review): the serialization constructor below does not assign the backing field,
        // so an instance created through it reports the enum's zero value (Bulgarian).
        public LanguagePrebuilt Language
        {
            get
            {
                return lang;
            }
        }

        /// <summary>Lexicon family of <see cref="Language"/>.</summary>
        public LexiconPrebuilt Lexicon
        {
            get
            {
                return GetLexicon(lang);
            }
        }


        // Public Static Functions -----------------------------

        /// <summary>Looks up the lexicon family a language's prebuilt model belongs to.</summary>
        /// <param name="lang">Language to look up.</param>
        /// <returns>The lexicon parsed (case-insensitively) from the mapping table.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="lang"/> has no row in the mapping table.
        /// </exception>
        public static LexiconPrebuilt GetLexicon(LanguagePrebuilt lang)
        {
            string lexiconName = AsLangMapping[GetMappingRow(lang) + 1];
            return (LexiconPrebuilt)Enum.Parse(typeof(LexiconPrebuilt), lexiconName, true);
        }


        // Resource Management Functions --------------------

        /// <summary>Supplies the assembly whose manifest resources are searched for model files.</summary>
        protected abstract Assembly GetExecutingAssembly();

        /// <summary>
        /// Opens the first manifest resource whose full name ends with
        /// <paramref name="sResourceShortName"/>.
        /// </summary>
        /// <param name="sResourceShortName">Trailing part of the resource name, typically the file name.</param>
        /// <returns>The resource stream, or <c>null</c> when no resource name matches.</returns>
        protected Stream GetResourceStream(string sResourceShortName)
        {
            Assembly assembly = GetExecutingAssembly();

            foreach (string sResource in assembly.GetManifestResourceNames())
            {
                // Resource names are identifiers, not linguistic text: compare ordinally so the
                // match does not depend on the current culture.
                if (sResource.EndsWith(sResourceShortName, StringComparison.Ordinal))
                {
                    return assembly.GetManifestResourceStream(sResource);
                }
            }

            return null;
        }


        // Serialization Functions -------------------------

        /// <summary>Deserialization constructor; forwards both arguments to the base class.</summary>
        public LemmatizerPrebuilt(SerializationInfo info, StreamingContext context)
            : base(info, context)
        {
        }
    }
}

--------------------------------------------------------------------------------
/Test/Classes/LemmatizerPrebuiltFull.cs:
--------------------------------------------------------------------------------
using System;
using System.IO;
using System.Reflection;

namespace Test.Classes
{
    /// <summary>
    /// Prebuilt lemmatizer that loads its model from a "full7z-*.lem" resource embedded in
    /// the assembly that contains this class.
    /// </summary>
    [Serializable]
    public class LemmatizerPrebuiltFull : LemmatizerPrebuilt
    {
        /// <summary>File name mask of the model resources; <c>{0}</c> is "lexicon-language".</summary>
        public const string Filemask = "full7z-{0}.lem";

        // Constructor(s) & Destructor(s) ---------------------

        /// <summary>Loads the embedded model for <paramref name="lang"/>.</summary>
        /// <param name="lang">Language whose model should be loaded.</param>
        /// <exception cref="System.Resources.MissingManifestResourceException">
        /// No embedded resource matches the file name derived from <paramref name="lang"/>.
        /// </exception>
        public LemmatizerPrebuiltFull(LanguagePrebuilt lang)
            : base(lang)
        {
            string resourceFileName = GetResourceFileName(Filemask);

            // 'using' closes the stream even when Deserialize throws.
            using (Stream stream = GetResourceStream(resourceFileName))
            {
                // GetResourceStream returns null when nothing matches; fail with a message that
                // names the missing resource instead of a NullReferenceException further down.
                if (stream == null)
                {
                    throw new System.Resources.MissingManifestResourceException(
                        "Embedded lemmatizer model '" + resourceFileName + "' was not found.");
                }

                this.Deserialize(stream);
            }
        }


        // Resource Management Functions ----------------------

        /// <summary>Returns the assembly this class is compiled into, where the models are embedded.</summary>
        protected override Assembly GetExecutingAssembly()
        {
            return Assembly.GetExecutingAssembly();
        }
    }
}

--------------------------------------------------------------------------------
/Test/Data/Custom/english.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/Custom/english.lem
--------------------------------------------------------------------------------
/Test/Data/Custom/full7z-mlteast-en-modified.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/Custom/full7z-mlteast-en-modified.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-mlteast-bg.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-bg.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-mlteast-cs.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-cs.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-mlteast-en.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-en.lem
-------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-et.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-et.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-fa.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-fa.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-fr.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-fr.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-hu.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-hu.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-mk.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-mk.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-pl.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-pl.lem 
-------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-ro.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-ro.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-ru.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-ru.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-sk.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-sk.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-sl.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-sl.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-sr.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-sr.lem -------------------------------------------------------------------------------- /Test/Data/full7z-mlteast-uk.lem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-mlteast-uk.lem 
--------------------------------------------------------------------------------
/Test/Data/full7z-multext-en.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-multext-en.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-multext-fr.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-multext-fr.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-multext-ge.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-multext-ge.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-multext-it.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-multext-it.lem
--------------------------------------------------------------------------------
/Test/Data/full7z-multext-sp.lem:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/Test/Data/full7z-multext-sp.lem
--------------------------------------------------------------------------------
/Test/Program.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using LemmaSharp.Classes;
using Test.Classes;

namespace Test
{
    /// <summary>
    /// Manual test harness: loads the custom English model, re-exports it to a second file
    /// and prints the lemma computed for a fixed list of words next to the expected lemma.
    /// </summary>
    class Program
    {
        // Folder holding the custom model files, relative to the working directory.
        private const string CustomDataFolder = "../../Data/Custom";

        /// <summary>Entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            // Load the model first. The input stream is closed again before the export below,
            // so the export can never collide with a still-open read handle.
            // ("full7z-mlteast-en-modified.lem" in the same folder is an alternative input.)
            var lemmatizer = CreateLemmatizerFromFile();

            // Create readable file. File.Create truncates an existing file, so no stale bytes
            // of a previous, longer export remain at the end.
            // NOTE(review): the meaning of the 'true' flag is defined by Lemmatizer.Serialize,
            // which is not visible here.
            var exportFilePath = GetCustomDataPath("english2.lem");
            using (var outStream = File.Create(exportFilePath))
            using (var writer = new BinaryWriter(outStream))
            {
                lemmatizer.Serialize(writer, true);
            }

            // Words to check, each paired with the lemma we expect for it.
            var examples = new List<Tuple<string, string>>()
            {
                Tuple.Create("acting", "act"),
                Tuple.Create("balled", "ball"),
                Tuple.Create("balled", "ball"),
                Tuple.Create("ballsed", "balls"),
                Tuple.Create("bogged", "bog"),
                Tuple.Create("bottomed", "bottom"),
                Tuple.Create("bounced", "bounce"),
                Tuple.Create("boxed", "box"),
                Tuple.Create("brought", "bring"),
                Tuple.Create("cashed", "cash"),
                Tuple.Create("clouded", "cloud"),
                Tuple.Create("cozied", "cozy"),
                Tuple.Create("divided", "divide"),
                Tuple.Create("felt", "feel"),
                Tuple.Create("fiddling", "fiddle"),
                Tuple.Create("fishing", "fish"),
                Tuple.Create("fleshed", "flesh"),
                Tuple.Create("fobbed", "fob"),
                Tuple.Create("following", "follow"),
                Tuple.Create("homing", "home"),
                Tuple.Create("hunkered", "hunker"),
                Tuple.Create("leveled", "level"),
                Tuple.Create("laid", "lay"),
                Tuple.Create("limbered", "limber"),
                Tuple.Create("livened", "liven"),
                Tuple.Create("livened", "liven"),
                Tuple.Create("loaded", "load"),
                Tuple.Create("magicked", "magic"),
                Tuple.Create("messing", "mess"),
                Tuple.Create("meted", "mete"),
                Tuple.Create("mouthing", "mouth"),
                Tuple.Create("perked", "perk"),
                Tuple.Create("pootling", "pootle"),
                Tuple.Create("sacked", "sack"),
                Tuple.Create("screwing", "screw"),
                Tuple.Create("sexed", "sex"),
                Tuple.Create("shacked", "shack"),
                Tuple.Create("speeded", "speed"),
                Tuple.Create("spirited", "spirit"),
                Tuple.Create("started", "start"),
                Tuple.Create("stove", "stave"),
                Tuple.Create("swung", "swing"),
                Tuple.Create("teed", "tee"),
                Tuple.Create("tired", "tire"),
                Tuple.Create("used", "use"),
                Tuple.Create("vacuumed", "vacuum"),
                Tuple.Create("whiled", "while"),
                Tuple.Create("wigged", "wig"),
                Tuple.Create("zoned", "zone"),
                Tuple.Create("don't", "do"),
                Tuple.Create("doesn't", "do"),
                Tuple.Create("didn't", "did"),
                Tuple.Create("won't", "will"),
                Tuple.Create("shan't", "shall"),
                Tuple.Create("can't", "can"),
                Tuple.Create("couldn't", "could"),
                Tuple.Create("wouldn't", "would"),
                Tuple.Create("shouldn't", "should"),
                Tuple.Create("mustn't", "must"),
                Tuple.Create("mightn't", "might"),
                Tuple.Create("oughtn't", "ought"),
                Tuple.Create("needn't", "need"),
                Tuple.Create("aren't", "are"),
                Tuple.Create("isn't", "be"),
                Tuple.Create("wasn't", "be"),
                Tuple.Create("weren't", "be"),
                Tuple.Create("haven't", "have"),
                Tuple.Create("hasn't", "have"),
                Tuple.Create("hadn't", "have"),
                Tuple.Create("'s", "'s"),
                Tuple.Create("'ve", "have"),
                Tuple.Create("'m", "be"),
                Tuple.Create("'re", "be"),
                Tuple.Create("'ll", "will"),
            };

            // Print "word --> computed lemma", followed by "!= expected" on a mismatch.
            foreach (var example in examples)
            {
                var lemma = lemmatizer.Lemmatize(example.Item1);
                Console.WriteLine("{0} --> {1} {2}", example.Item1, lemma, lemma != example.Item2 ? ("!= " + example.Item2) : "");
            }


            Console.WriteLine("==========");
            Console.WriteLine("OK");
            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        // Builds the full path of a file inside the custom data folder.
        private static string GetCustomDataPath(string fileName)
        {
            return Path.Combine(Directory.GetCurrentDirectory(), CustomDataFolder, fileName);
        }

        // Creates the lemmatizer from the English model embedded in this assembly.
        private static Lemmatizer CreatePreBuiltLemmatizer()
        {
            var lemmatizer = new LemmatizerPrebuiltFull(LanguagePrebuilt.English);
            return lemmatizer;
        }

        // Creates the lemmatizer from Data/Custom/english.lem; the file is closed on return.
        private static Lemmatizer CreateLemmatizerFromFile()
        {
            var dataFilePath = GetCustomDataPath("english.lem");
            using (var stream = File.OpenRead(dataFilePath))
            {
                var lemmatizer = new Lemmatizer(stream);
                return lemmatizer;
            }
        }
    }
}

--------------------------------------------------------------------------------
/Test/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("Test")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("Test")]
[assembly: AssemblyCopyright("Copyright © 2013")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("df5d90ec-fbfb-479a-9863-9bf9ddccbff9")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /Test/Test.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {C5B09C13-EA3B-4A00-B3D6-F8B790B108CA} 8 | Exe 9 | Properties 10 | Test 11 | Test 12 | v4.5 13 | 512 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | {A39293C1-92D8-47B9-93A4-41F443B4F9E4} 77 | LemmaSharp 78 | 79 | 80 | 81 | 82 | 83 | 84 | 91 | -------------------------------------------------------------------------------- /nuget/NuGet.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexPoint/LemmaGenerator/05f6883970dc863d61c52e169d1e016fa5c8d67b/nuget/NuGet.exe -------------------------------------------------------------------------------- /nuget/NuGet.targets: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
$(MSBuildProjectDirectory)\..\ 5 | 6 | 7 | false 8 | 9 | 10 | false 11 | 12 | 13 | true 14 | 15 | 16 | false 17 | 18 | 19 | 20 | 21 | 22 | 26 | 27 | 28 | 29 | 30 | $([System.IO.Path]::Combine($(SolutionDir), "nuget")) 31 | $([System.IO.Path]::Combine($(ProjectDir), "packages.config")) 32 | 33 | 34 | 35 | 36 | $(SolutionDir)nuget 37 | packages.config 38 | 39 | 40 | 41 | 42 | $(NuGetToolsPath)\NuGet.exe 43 | @(PackageSource) 44 | 45 | "$(NuGetExePath)" 46 | mono --runtime=v4.0.30319 $(NuGetExePath) 47 | 48 | $(TargetDir.Trim('\\')) 49 | 50 | -RequireConsent 51 | 52 | $(NuGetCommand) install "$(PackagesConfig)" -source "$(PackageSources)" $(RequireConsentSwitch) -solutionDir "$(SolutionDir) " 53 | $(NuGetCommand) pack "$(ProjectPath)" -p Configuration=$(Configuration) -o "$(PackageOutputDir)" -symbols 54 | 55 | 56 | 57 | RestorePackages; 58 | $(BuildDependsOn); 59 | 60 | 61 | 62 | 63 | $(BuildDependsOn); 64 | BuildPackage; 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 87 | 88 | 91 | 92 | 93 | 94 | 96 | 97 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 148 | 149 | 150 | 151 | --------------------------------------------------------------------------------