├── dll └── AdvUtils.dll ├── Core ├── CRFSharp │ ├── base │ │ ├── IGenerateFeature.cs │ │ ├── Node.cs │ │ ├── Path.cs │ │ ├── Pool.cs │ │ ├── BaseModel.cs │ │ ├── Utils.cs │ │ └── Tagger.cs │ ├── encoder │ │ ├── FeatureIdPair.cs │ │ ├── IFeatureLexicalDict.cs │ │ ├── FeatureItem.cs │ │ ├── CRFEncoderThread.cs │ │ ├── DefaultFeatureLexicalDict.cs │ │ ├── EncoderTagger.cs │ │ ├── HugeFeatureLexicalDict.cs │ │ ├── LBFGS.cs │ │ ├── ModelWriter.cs │ │ └── Mcsrch.cs │ ├── CRFSharp.csproj │ ├── decoder │ │ ├── ModelReaderExtensions.cs │ │ ├── ModelReader.cs │ │ └── DecoderTagger.cs │ └── Properties │ │ └── AssemblyInfo.cs └── CRFSharpWrapper │ ├── CRFSharpWrapper.csproj │ ├── CRFSharpHelper.cs │ ├── Properties │ └── AssemblyInfo.cs │ ├── Args.cs │ ├── SegDecoderTagger.cs │ ├── Decoder.cs │ └── Encoder.cs ├── CRFSharpConsole ├── App.config ├── CRFSharpConsole.csproj ├── Properties │ └── AssemblyInfo.cs ├── Program.cs ├── EncoderConsole.cs └── DecoderConsole.cs ├── LICENSE ├── .gitignore ├── CRFSharp.sln └── README.md /dll/AdvUtils.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhongkaifu/CRFSharp/HEAD/dll/AdvUtils.dll -------------------------------------------------------------------------------- /Core/CRFSharp/base/IGenerateFeature.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharp 7 | { 8 | public interface IGenerateFeature 9 | { 10 | bool Initialize(); 11 | List> GenerateFeature(string strText); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /CRFSharpConsole/App.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- 
/Core/CRFSharp/encoder/FeatureIdPair.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharp 7 | { 8 | public sealed class FeatureIdPair 9 | { 10 | public long Key; 11 | public int Value; 12 | 13 | public FeatureIdPair(long key, int value) 14 | { 15 | Key = key; 16 | Value = value; 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Core/CRFSharp/base/Node.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharp 7 | { 8 | public class Node 9 | { 10 | public int fid; 11 | public short x; 12 | public short y; 13 | public double alpha; 14 | public double beta; 15 | public double cost; 16 | public double bestCost; 17 | public Node prev; 18 | 19 | public List lpathList; 20 | public List rpathList; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/IFeatureLexicalDict.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using AdvUtils; 6 | 7 | namespace CRFSharp 8 | { 9 | public interface IFeatureLexicalDict 10 | { 11 | void Shrink(int freq); 12 | long GetOrAddId(string strFeature); 13 | long RegenerateFeatureId(BTreeDictionary old2new, long ysize); 14 | void GenerateLexicalIdList(out IList fea, out IList val); 15 | void Clear(); 16 | 17 | long Size 18 | { 19 | get; 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/FeatureItem.cs: -------------------------------------------------------------------------------- 1 | using 
System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharp 7 | { 8 | public sealed class FeatureItem : IComparable 9 | { 10 | public string strFeature; 11 | public FeatureIdPair feaIdPair; 12 | 13 | public FeatureItem(string s, FeatureIdPair item) 14 | { 15 | strFeature = s; 16 | feaIdPair = item; 17 | } 18 | 19 | public int CompareTo(FeatureItem fi) 20 | { 21 | return StringComparer.Ordinal.Compare(strFeature, fi.strFeature); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /Core/CRFSharp/base/Path.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharp 7 | { 8 | public class Path 9 | { 10 | public int fid; 11 | public Node rnode; 12 | public Node lnode; 13 | public double cost; 14 | 15 | public Path() 16 | { 17 | rnode = null; 18 | lnode = null; 19 | cost = 0; 20 | } 21 | 22 | public void add(Node _lnode, Node _rnode) 23 | { 24 | lnode = _lnode; 25 | rnode = _rnode; 26 | 27 | lnode.rpathList.Add(this); 28 | rnode.lpathList.Add(this); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /Core/CRFSharpWrapper/CRFSharpWrapper.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | netcoreapp3.1 4 | Library 5 | false 6 | 7 | 8 | 9 | ..\..\dll\AdvUtils.dll 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /CRFSharpConsole/CRFSharpConsole.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | netcoreapp3.1 4 | Exe 5 | false 6 | 7 | 8 | 9 | False 10 | ..\dll\AdvUtils.dll 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 
-------------------------------------------------------------------------------- /Core/CRFSharp/CRFSharp.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | netcoreapp3.1 4 | Library 5 | false 6 | 7 | 8 | 9 | 10 | true 11 | 12 | 13 | 14 | ..\..\dll\AdvUtils.dll 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Core/CRFSharp/decoder/ModelReaderExtensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace CRFSharp.decoder 8 | { 9 | internal static class ModelReaderExtensions 10 | { 11 | private static readonly string featureFileNameExtension = ".feature"; 12 | private static readonly string weightFileNameExtension = ".alpha"; 13 | 14 | internal static string ToMetadataModelName(this string modelName) 15 | { 16 | return modelName; 17 | } 18 | 19 | internal static string ToFeatureSetFileName(this string modelName) 20 | { 21 | return String.Concat(modelName, featureFileNameExtension); 22 | } 23 | 24 | internal static string ToFeatureWeightFileName(this string modelName) 25 | { 26 | return String.Concat(modelName, weightFileNameExtension); 27 | } 28 | 29 | internal static void ThrowIfNotExists(this string fileName) 30 | { 31 | if (String.IsNullOrWhiteSpace(fileName)) 32 | throw new ArgumentNullException("fileName", 33 | "Please specify a valid model path"); 34 | 35 | if (!File.Exists(fileName)) 36 | throw new FileNotFoundException("fileName", 37 | "Please specify a valid model path"); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /Core/CRFSharpWrapper/CRFSharpHelper.cs: -------------------------------------------------------------------------------- 1 | /**********************************************/ 2 | /*Project: 
CRF# */ 3 | /*Author: Zhongkai Fu */ 4 | /*Email: fuzhongkai@gmail.com */ 5 | /**********************************************/ 6 | 7 | using System; 8 | using System.Collections.Generic; 9 | using System.Text; 10 | using CRFSharp; 11 | 12 | namespace CRFSharpWrapper 13 | { 14 | public class SegToken 15 | { 16 | public int offset; 17 | public int length; 18 | public string strTag; //CRF对应于term组合后的Tag字符串 19 | public double fWeight; //对应属性id的概率值,或者得分 20 | }; 21 | 22 | public class crf_seg_out : crf_term_out 23 | { 24 | //Segmented token by merging raw CRF model output 25 | public int termTotalLength; // the total term length in character 26 | public List tokenList; 27 | 28 | public int Count 29 | { 30 | get { return tokenList.Count; } 31 | } 32 | 33 | public void Clear() 34 | { 35 | termTotalLength = 0; 36 | tokenList.Clear(); 37 | } 38 | 39 | public crf_seg_out(int max_word_num = Utils.DEFAULT_CRF_MAX_WORD_NUM): 40 | base(max_word_num) 41 | { 42 | termTotalLength = 0; 43 | tokenList = new List(); 44 | } 45 | }; 46 | 47 | } 48 | -------------------------------------------------------------------------------- /CRFSharpConsole/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("CRFSharpConsole")] 9 | [assembly: AssemblyDescription("The console tool for CRFSharp encoder and decoder")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Zhongkai Fu")] 12 | [assembly: AssemblyProduct("CRFSharpConsole")] 13 | [assembly: AssemblyCopyright("Copyright © 2016")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("c804ee38-9448-4ec0-8ac0-67084194fc98")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | [assembly: AssemblyVersion("1.0.0.0")] 33 | [assembly: AssemblyFileVersion("1.0.0.0")] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Zhongkai Fu 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | * Neither the name of CRFSharp nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Core/CRFSharp/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("CRFSharp")] 9 | [assembly: AssemblyDescription("CRFSharp core algorithm APIs")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Zhongkai Fu")] 12 | [assembly: AssemblyProduct("CRFSharp")] 13 | [assembly: AssemblyCopyright("Copyright © 2012")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("08e14d97-dd83-4873-959c-1a190fa4e197")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /Core/CRFSharpWrapper/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("CRFSharpWrapper")] 9 | [assembly: AssemblyDescription("The APIs wrapper for CRFSharp core algorithm")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Zhongkai Fu")] 12 | [assembly: AssemblyProduct("CRFSharpWrapper")] 13 | [assembly: AssemblyCopyright("Copyright © 2012")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("0ad03326-77fc-4135-9914-56f486a89c1e")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /Core/CRFSharpWrapper/Args.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharpWrapper 7 | { 8 | public class EncoderArgs 9 | { 10 | public int max_iter = 1000; //maximum iteration, when encoding iteration reaches this value, the process will be ended. 11 | public int min_feature_freq = 2; //minimum feature frequency, if one feature's frequency is less than this value, the feature will be dropped. 
12 | public double min_diff = 0.0001; //minimum diff value, when diff less than the value consecutive 3 times, the process will be ended. 13 | public double slot_usage_rate_threshold = 0.95; //the maximum slot usage rate threshold when building feature set. 14 | public int threads_num = 1; //the amount of threads used to train model. 15 | public CRFSharpWrapper.Encoder.REG_TYPE regType = CRFSharpWrapper.Encoder.REG_TYPE.L2; //regularization type 16 | public string strTemplateFileName = null; //template file name 17 | public string strTrainingCorpus = null; //training corpus file name 18 | public string strEncodedModelFileName = null; //encoded model file name 19 | public string strRetrainModelFileName = null; //the model file name for re-training 20 | public int debugLevel = 0; //Debug level 21 | public uint hugeLexMemLoad = 0; 22 | public double C = 1.0; //cost factor, too big or small value may lead encoded model over tune or under tune 23 | public bool bVQ = false; //If we build vector quantization model for feature weights 24 | } 25 | 26 | public class DecoderArgs 27 | { 28 | public string strModelFileName; 29 | public string strInputFileName; 30 | public string strOutputFileName; 31 | public string strOutputSegFileName; 32 | public int nBest; 33 | public int thread; 34 | public int probLevel; 35 | public int maxword; 36 | 37 | public DecoderArgs() 38 | { 39 | thread = 1; 40 | nBest = 1; 41 | probLevel = 0; 42 | maxword = 100; 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /CRFSharpConsole/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using AdvUtils; 3 | 4 | namespace CRFSharpConsole 5 | { 6 | class Program 7 | { 8 | static void Usage() 9 | { 10 | Console.WriteLine("Linear-chain CRF encoder & decoder by Zhongkai Fu (fuzhongkai@gmail.com)"); 11 | Console.WriteLine("CRFSharpConsole.exe [parameters list...]"); 12 | Console.WriteLine(" 
-encode [parameters list...] - Encode CRF model from training corpus"); 13 | Console.WriteLine(" -decode [parameters list...] - Decode CRF model on test corpus"); 14 | } 15 | 16 | static void Main(string[] args) 17 | { 18 | Logger.LogFile = "CRFSharpConsole.log"; 19 | 20 | if (args.Length < 1) 21 | { 22 | Usage(); 23 | return; 24 | } 25 | 26 | var bEncoder = false; 27 | var bDecoder = false; 28 | 29 | for (int index = 0; index < args.Length; index++) 30 | { 31 | var item = args[index]; 32 | if (item.Length <= 1) 33 | { 34 | continue; 35 | } 36 | 37 | if (item[0] != '-') 38 | { 39 | continue; 40 | } 41 | 42 | var strType = item.Substring(1).ToLower().Trim(); 43 | if (strType == "encode") 44 | { 45 | bEncoder = true; 46 | } 47 | if (strType == "decode") 48 | { 49 | bDecoder = true; 50 | } 51 | } 52 | 53 | //Invalidated parameter 54 | if (bEncoder == false && bDecoder == false) 55 | { 56 | Usage(); 57 | return; 58 | } 59 | 60 | if (bEncoder == true) 61 | { 62 | var encoderConsole = new EncoderConsole(); 63 | encoderConsole.Run(args); 64 | } 65 | else if (bDecoder == true) 66 | { 67 | var decoderConsole = new DecoderConsole(); 68 | decoderConsole.Run(args); 69 | } 70 | else 71 | { 72 | Usage(); 73 | } 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/CRFEncoderThread.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading; 6 | using AdvUtils; 7 | 8 | namespace CRFSharp 9 | { 10 | public class CRFEncoderThread 11 | { 12 | public EncoderTagger[] x; 13 | public int start_i; 14 | public int thread_num; 15 | public int zeroone; 16 | public int err; 17 | public double obj; 18 | public Node[,] node_; 19 | short[] result_; 20 | public short max_xsize_; 21 | public LBFGS lbfgs; 22 | public int[,] merr; 23 | 24 | public void Init() 25 | { 26 | 
if (x.Length == 0) 27 | { 28 | return; 29 | } 30 | 31 | var ysize_ = x[0].ysize_; 32 | max_xsize_ = 0; 33 | for (var i = start_i; i < x.Length; i += thread_num) 34 | { 35 | if (max_xsize_ < x[i].word_num) 36 | { 37 | max_xsize_ = x[i].word_num; 38 | } 39 | } 40 | 41 | result_ = new short[max_xsize_]; 42 | node_ = new Node[max_xsize_, ysize_]; 43 | for (var i = 0; i < max_xsize_; i++) 44 | { 45 | for (var j = 0; j < ysize_; j++) 46 | { 47 | node_[i, j] = new Node(); 48 | node_[i, j].x = (short)i; 49 | node_[i, j].y = (short)j; 50 | node_[i, j].lpathList = new List(ysize_); 51 | node_[i, j].rpathList = new List(ysize_); 52 | } 53 | } 54 | 55 | for (short cur = 1; cur < max_xsize_; ++cur) 56 | { 57 | for (short j = 0; j < ysize_; ++j) 58 | { 59 | for (short i = 0; i < ysize_; ++i) 60 | { 61 | var path = new Path(); 62 | path.fid = -1; 63 | path.cost = 0.0; 64 | path.add(node_[cur - 1, j], node_[cur, i]); 65 | } 66 | } 67 | } 68 | 69 | merr = new int[ysize_, ysize_]; 70 | } 71 | 72 | public void Run() 73 | { 74 | //Initialize thread self data structure 75 | obj = 0.0f; 76 | err = zeroone = 0; 77 | //expected.Clear(); 78 | Array.Clear(merr, 0, merr.Length); 79 | for (var i = start_i; i < x.Length; i += thread_num) 80 | { 81 | x[i].Init(result_, node_); 82 | obj += x[i].gradient(lbfgs.expected); 83 | var error_num = x[i].eval(merr); 84 | err += error_num; 85 | if (error_num > 0) 86 | { 87 | ++zeroone; 88 | } 89 | } 90 | } 91 | 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /Core/CRFSharp/base/Pool.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Concurrent; 3 | using System.Threading; 4 | 5 | namespace CRFSharp 6 | { 7 | /// 8 | /// Represents general purpose pool that has no restrictions (e.g. 
grows if it's required) 9 | /// 10 | /// 11 | internal sealed class Pool 12 | { 13 | private int _totalCount; 14 | private readonly ConcurrentStack _container = new ConcurrentStack(); 15 | private readonly Func, T> _creator; 16 | private readonly Action _cleaner; 17 | 18 | /// 19 | /// Initializes a new instance of the class. 20 | /// 21 | public Pool(Func, T> creator, Action cleaner = null) 22 | { 23 | _creator = creator; 24 | _cleaner = cleaner; 25 | } 26 | 27 | /// 28 | /// Gets item from pool or creates a new item 29 | /// 30 | /// 31 | public PoolItem GetOrCreate() 32 | { 33 | T item; 34 | if (_container.TryPop(out item)) 35 | { 36 | return new PoolItem(item, _cleaner, this); 37 | } 38 | var newItem = _creator(this); 39 | if (newItem == null) 40 | { 41 | throw new ApplicationException("Unable to create new pool item"); 42 | } 43 | Interlocked.Increment(ref _totalCount); 44 | return new PoolItem(newItem, _cleaner, this); 45 | } 46 | 47 | /// 48 | /// Returns amount of free items in the bag 49 | /// 50 | public int FreeCount { get { return _container.Count; } } 51 | 52 | /// 53 | /// Returns amount items created by pool 54 | /// 55 | public int TotalCount { get { return _totalCount; } } 56 | 57 | private void Return(T item) 58 | { 59 | _container.Push(item); 60 | } 61 | 62 | /// 63 | /// Pool item that is return when pool request is processed 64 | /// 65 | /// 66 | internal struct PoolItem : IDisposable 67 | { 68 | /// 69 | /// Pooled item 70 | /// 71 | public readonly T1 Item; 72 | private readonly Pool _owner; 73 | private readonly Action _cleaner; 74 | 75 | /// 76 | /// Creates a new pool item 77 | /// 78 | /// 79 | /// 80 | /// 81 | internal PoolItem(T1 item, Action cleaner, Pool owner) 82 | { 83 | Item = item; 84 | _cleaner = cleaner; 85 | _owner = owner; 86 | } 87 | 88 | /// 89 | /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources. 
90 | /// 91 | public void Dispose() 92 | { 93 | _cleaner?.Invoke(Item); 94 | _owner.Return(Item); 95 | } 96 | } 97 | } 98 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | 46 | [Dd]ebug/ 47 | [Rr]elease/ 48 | x64/ 49 | build/ 50 | [Bb]in/ 51 | [Oo]bj/ 52 | 53 | # MSTest test Results 54 | [Tt]est[Rr]esult*/ 55 | [Bb]uild[Ll]og.* 56 | 57 | *_i.c 58 | *_p.c 59 | *.ilk 60 | *.meta 61 | *.obj 62 | *.pch 63 | *.pdb 64 | *.pgc 65 | *.pgd 66 | *.rsp 67 | *.sbr 68 | *.tlb 69 | *.tli 70 | *.tlh 71 | *.tmp 72 | *.tmp_proj 73 | *.log 74 | *.vspscc 75 | *.vssscc 76 | .builds 77 | *.pidb 78 | *.log 79 | *.scc 80 | 81 | # Visual C++ cache files 82 | ipch/ 83 | *.aps 84 | *.ncb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | 101 | # TeamCity is a build add-in 102 | _TeamCity* 103 | 104 | # DotCover is a Code Coverage Tool 105 | *.dotCover 106 | 107 | # NCrunch 108 | *.ncrunch* 109 | 
.*crunch*.local.xml 110 | 111 | # Installshield output folder 112 | [Ee]xpress/ 113 | 114 | # DocProject is a documentation generator add-in 115 | DocProject/buildhelp/ 116 | DocProject/Help/*.HxT 117 | DocProject/Help/*.HxC 118 | DocProject/Help/*.hhc 119 | DocProject/Help/*.hhk 120 | DocProject/Help/*.hhp 121 | DocProject/Help/Html2 122 | DocProject/Help/html 123 | 124 | # Click-Once directory 125 | publish/ 126 | 127 | # Publish Web Output 128 | *.Publish.xml 129 | *.pubxml 130 | *.publishproj 131 | 132 | # NuGet Packages Directory 133 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 134 | #packages/ 135 | 136 | # Windows Azure Build Output 137 | csx 138 | *.build.csdef 139 | 140 | # Windows Store app package directory 141 | AppPackages/ 142 | 143 | # Others 144 | sql/ 145 | *.Cache 146 | ClientBin/ 147 | [Ss]tyle[Cc]op.* 148 | ~$* 149 | *~ 150 | *.dbmdl 151 | *.[Pp]ublish.xml 152 | *.pfx 153 | *.publishsettings 154 | 155 | # RIA/Silverlight projects 156 | Generated_Code/ 157 | 158 | # Backup & report files from converting an old project file to a newer 159 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 160 | _UpgradeReport_Files/ 161 | Backup*/ 162 | UpgradeLog*.XML 163 | UpgradeLog*.htm 164 | 165 | # SQL Server files 166 | App_Data/*.mdf 167 | App_Data/*.ldf 168 | 169 | ############# 170 | ## Windows detritus 171 | ############# 172 | 173 | # Windows image file caches 174 | Thumbs.db 175 | ehthumbs.db 176 | 177 | # Folder config file 178 | Desktop.ini 179 | 180 | # Recycle Bin used on file shares 181 | $RECYCLE.BIN/ 182 | 183 | # Mac crap 184 | .DS_Store 185 | 186 | 187 | ############# 188 | ## Python 189 | ############# 190 | 191 | *.py[cod] 192 | 193 | # Packages 194 | *.egg 195 | *.egg-info 196 | dist/ 197 | build/ 198 | eggs/ 199 | parts/ 200 | var/ 201 | sdist/ 202 | develop-eggs/ 203 | .installed.cfg 204 | 205 | # Installer logs 206 | pip-log.txt 207 | 208 | # Unit test / coverage reports 209 | .coverage 210 | .tox 211 | 212 | #Translations 213 | *.mo 214 | 215 | #Mr Developer 216 | .mr.developer.cfg 217 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/DefaultFeatureLexicalDict.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Threading; 4 | using AdvUtils; 5 | using System.Threading.Tasks; 6 | 7 | namespace CRFSharp 8 | { 9 | public class DefaultFeatureLexicalDict : IFeatureLexicalDict 10 | { 11 | BTreeDictionary featureset_dict_; 12 | long maxid_; 13 | Object thisLock = new object(); 14 | ParallelOptions parallelOption; 15 | 16 | public DefaultFeatureLexicalDict(int thread_num) 17 | { 18 | featureset_dict_ = new BTreeDictionary(StringComparer.Ordinal, 128); 19 | maxid_ = 0; 20 | parallelOption = new ParallelOptions(); 21 | parallelOption.MaxDegreeOfParallelism = thread_num; 22 | } 23 | 24 | public void Clear() 25 | { 26 | featureset_dict_.Clear(); 27 | featureset_dict_ = null; 28 | } 29 | 30 | public long Size 31 | { 32 | get 33 | { 34 | 
return featureset_dict_.Count; 35 | } 36 | } 37 | 38 | public void Shrink(int freq) 39 | { 40 | var i = 0; 41 | while (i < featureset_dict_.Count) 42 | { 43 | if (featureset_dict_.ValueList[i].Value < freq) 44 | { 45 | //If the feature's frequency is less than specific frequency, drop the feature. 46 | featureset_dict_.RemoveAt(i); 47 | } 48 | else 49 | { 50 | i++; 51 | } 52 | } 53 | } 54 | 55 | public void GenerateLexicalIdList(out IList keyList, out IList valList) 56 | { 57 | keyList = featureset_dict_.KeyList; 58 | var fixArrayValue = new int[Size]; 59 | valList = fixArrayValue; 60 | 61 | Parallel.For(0, featureset_dict_.ValueList.Count, parallelOption, i => 62 | { 63 | fixArrayValue[i] = (int)featureset_dict_.ValueList[i].Key; 64 | }); 65 | 66 | } 67 | 68 | public long RegenerateFeatureId(BTreeDictionary old2new, long ysize) 69 | { 70 | long new_maxid = 0; 71 | //Regenerate new feature id and create feature ids mapping 72 | foreach (var it in featureset_dict_) 73 | { 74 | var strFeature = it.Key; 75 | //Regenerate new feature id 76 | old2new.Add(it.Value.Key, new_maxid); 77 | it.Value.Key = new_maxid; 78 | 79 | var addValue = (strFeature[0] == 'U' ? 
ysize : ysize * ysize); 80 | new_maxid += addValue; 81 | } 82 | 83 | return new_maxid; 84 | } 85 | 86 | //Get feature id from feature set by feature string 87 | //If feature string is not existed in the set, generate a new id and return it 88 | private long GetId(string key) 89 | { 90 | FeatureIdPair pair; 91 | if (featureset_dict_.TryGetValue(key, out pair) == true) 92 | { 93 | return pair.Key; 94 | } 95 | 96 | return Utils.ERROR_INVALIDATED_FEATURE; 97 | } 98 | 99 | public long GetOrAddId(string key) 100 | { 101 | FeatureIdPair pair; 102 | if (featureset_dict_.TryGetValue(key, out pair) == true && pair != null) 103 | { 104 | //Find its feature id 105 | System.Threading.Interlocked.Increment(ref pair.Value); 106 | } 107 | else 108 | { 109 | lock (thisLock) 110 | { 111 | if (featureset_dict_.TryGetValue(key, out pair) == true) 112 | { 113 | System.Threading.Interlocked.Increment(ref pair.Value); 114 | } 115 | else 116 | { 117 | var oldValue = Interlocked.Increment(ref maxid_) - 1; 118 | pair = new FeatureIdPair(oldValue, 1); 119 | featureset_dict_.Add(key, pair); 120 | } 121 | } 122 | } 123 | return pair.Key; 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /Core/CRFSharpWrapper/SegDecoderTagger.cs: -------------------------------------------------------------------------------- 1 | /**********************************************/ 2 | /*Project: CRFSharp */ 3 | /*Author: Zhongkai Fu */ 4 | /*Email: fuzhongkai@gmail.com */ 5 | /**********************************************/ 6 | 7 | using System; 8 | using System.Collections.Generic; 9 | using System.Text; 10 | using CRFSharp; 11 | 12 | namespace CRFSharpWrapper 13 | { 14 | public class SegDecoderTagger : DecoderTagger 15 | { 16 | public SegDecoderTagger(int nbest, int this_crf_max_word_num = Utils.DEFAULT_CRF_MAX_WORD_NUM) 17 | : base(nbest, this_crf_max_word_num) 18 | { 19 | crf_max_word_num = this_crf_max_word_num; 20 | } 21 | 22 | int 
seg_termbuf_build(crf_seg_out term_buf)
        {
            term_buf.Clear();

            //Build the raw tag-per-word result first.
            var iRet = termbuf_build(term_buf);
            if (iRet != Utils.ERROR_SUCCESS)
            {
                return iRet;
            }

            //Then merge the tagged words into segmented tokens.
            var term_len = 0;
            var weight = 0.0;
            var num = 0;
            for (var i = 0; i < x_.Count; i++)
            {
                var strTag = term_buf.result_[i];

                //Accumulate the current token's length and weight.
                term_len += x_[i][0].Length;
                weight += term_buf.weight_[i];
                num++;

                //The token ends when the current tag is neither a B_ nor an
                //M_ continuation, or when the sequence ends.
                if ((strTag.StartsWith("B_") == false &&
                     strTag.StartsWith("M_") == false) ||
                    i == x_.Count - 1)
                {
                    var tkn = new SegToken();
                    tkn.length = term_len;
                    tkn.offset = term_buf.termTotalLength;

                    //Strip the position prefix ("B_", "M_", ...) from the
                    //tag; a bare "NOR" tag maps to the empty tag string.
                    var spos = strTag.IndexOf('_');
                    if (spos < 0)
                    {
                        tkn.strTag = strTag == "NOR" ? "" : strTag;
                    }
                    else
                    {
                        tkn.strTag = strTag.Substring(spos + 1);
                    }

                    term_buf.termTotalLength += term_len;

                    //Token weight depends on the verbose level: 0 reports no
                    //weight, 2 reports the average weight of the token's words.
                    switch (vlevel_)
                    {
                        case 0:
                            tkn.fWeight = 0.0;
                            break;
                        case 2:
                            tkn.fWeight = weight / num;
                            weight = 0.0;
                            num = 0;
                            break;
                    }

                    term_buf.tokenList.Add(tkn);
                    term_len = 0;
                }
            }

            return Utils.ERROR_SUCCESS;
        }

        /// <summary>
        /// Fills <paramref name="pout"/> with up to nbest_ segmentation
        /// results. Returns Utils.ERROR_SUCCESS or a negative error code.
        /// </summary>
        public int output(crf_seg_out[] pout)
        {
            int ret;

            if (nbest_ == 1)
            {
                //Single-best decoding: "next" is not needed.
                ret = seg_termbuf_build(pout[0]);
                if (ret < 0)
                {
                    return ret;
                }
            }
            else
            {
                //Fill as many n-best results as the output buffer can hold.
                var iNBest = nbest_;
                if (pout.Length < iNBest)
                {
                    iNBest = pout.Length;
                }

                for (var n = 0; n < iNBest; ++n)
                {
                    ret = next();
                    if (ret < 0)
                    {
                        break;
                    }

                    ret = seg_termbuf_build(pout[n]);
                    if (ret < 0)
                    {
                        return ret;
                    }
                }
            }

            return Utils.ERROR_SUCCESS;
        }
    }
}
-------------------------------------------------------------------------------- /Core/CRFSharpWrapper/Decoder.cs: --------------------------------------------------------------------------------
/**********************************************/
/*Project: CRF#                               */
/*Author: Zhongkai Fu                         */
/*Email: fuzhongkai@gmail.com                 */
/**********************************************/

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using CRFSharp;
using CRFSharp.decoder;

namespace CRFSharpWrapper
{
    public class Decoder
    {
        ModelReader _modelReader;

        /// <summary>
        /// Loads an encoded model from the file system.
        /// </summary>
        /// <param name="modelFilename">The model path.</param>
        public void LoadModel(string modelFilename)
        {
            _modelReader = new ModelReader(modelFilename);
            _modelReader.LoadModel();
        }

        /// <summary>
        /// Loads an encoded model using the specified delegate. Using this
        /// overload the model can be read e.g. from network, zipped archives
        /// or other locations.
        /// </summary>
        /// <param name="modelLoader">Resolves a file name into a readable stream.</param>
        /// <param name="modelFilename">The model file name handed to the loader.</param>
        public void LoadModel(Func<string, Stream> modelLoader, string modelFilename)
        {
            this._modelReader = new ModelReader(modelLoader, modelFilename);
            _modelReader.LoadModel();
        }

        /// <summary>
        /// Creates a segmentation tagger bound to the loaded model, or null
        /// when no model has been loaded yet.
        /// </summary>
        public SegDecoderTagger CreateTagger(int nbest, int this_crf_max_word_num = Utils.DEFAULT_CRF_MAX_WORD_NUM)
        {
            if (_modelReader == null)
            {
                return null;
            }

            var tagger = new SegDecoderTagger(nbest, this_crf_max_word_num);
            tagger.init_by_model(_modelReader);

            return tagger;
        }

        /// <summary>
        /// Segments the text described by the feature set in
        /// <paramref name="inbuf"/>, writing the result into
        /// <paramref name="pout"/>.
        /// </summary>
        public int Segment(crf_seg_out[] pout,        //segment result
            SegDecoderTagger tagger,                  //tagger per thread
            List<List<string>> inbuf                  //feature set for segment
            )
        {
            if (inbuf.Count == 0)
            {
                //Empty input string
                return Utils.ERROR_SUCCESS;
            }

            var ret = tagger.reset();
            if (ret < 0)
            {
                return ret;
            }

            ret = tagger.add(inbuf);
            if (ret < 0)
            {
                return ret;
            }

            //parse
            ret = tagger.parse();
            if (ret < 0)
            {
                return ret;
            }

            //wrap result
            ret = tagger.output(pout);
            if (ret < 0)
            {
                return ret;
            }

            return Utils.ERROR_SUCCESS;
        }

        /// <summary>
        /// Raw (non-segmenting) variant: labels the feature set in
        /// <paramref name="inbuf"/> and writes the tagged terms into
        /// <paramref name="pout"/>.
        /// </summary>
        public int Segment(crf_term_out[] pout,       //tagging result
            DecoderTagger tagger,                     //tagger per thread
            List<List<string>> inbuf                  //feature set for tagging
            )
        {
            if (inbuf.Count == 0)
            {
                //Empty input string
                return Utils.ERROR_SUCCESS;
            }

            var ret = tagger.reset();
            if (ret < 0)
            {
                return ret;
            }

            ret = tagger.add(inbuf);
            if (ret < 0)
            {
                return ret;
            }

            //parse
            ret = tagger.parse();
            if (ret < 0)
            {
                return ret;
            }

            //wrap result
            ret = tagger.output(pout);
            if (ret < 0)
            {
                return ret;
            }
147 | 148 | return Utils.ERROR_SUCCESS; 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /Core/CRFSharp/base/BaseModel.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Text; 3 | 4 | namespace CRFSharp 5 | { 6 | public class BaseModel 7 | { 8 | public long maxid_; 9 | public double cost_factor_; 10 | 11 | public List unigram_templs_; 12 | public List bigram_templs_; 13 | 14 | //Labeling tag list 15 | public List y_; 16 | public uint ysize() { return (uint)y_.Count; } 17 | 18 | //The dimension training corpus 19 | public uint xsize_; 20 | 21 | //Feature set value array 22 | public double[] alpha_; 23 | 24 | public BaseModel() 25 | { 26 | cost_factor_ = 1.0; 27 | } 28 | 29 | //获取类别i的字符表示 30 | public string y(int i) { return y_[i]; } 31 | 32 | public long feature_size() { return maxid_; } 33 | 34 | public StringBuilder apply_rule(string p, int pos, StringBuilder resultContainer, Tagger tagger) 35 | { 36 | resultContainer.Clear(); 37 | for (var i = 0; i < p.Length; i++) 38 | { 39 | if (p[i] == '%') 40 | { 41 | i++; 42 | if (p[i] == 'x') 43 | { 44 | i++; 45 | var res = get_index(p, pos, i, tagger); 46 | i = res.idx; 47 | if (res.value == null) 48 | { 49 | return null; 50 | } 51 | resultContainer.Append(res.value); 52 | } 53 | else 54 | { 55 | return null; 56 | } 57 | } 58 | else 59 | { 60 | resultContainer.Append(p[i]); 61 | } 62 | } 63 | return resultContainer; 64 | } 65 | 66 | Index get_index(string p, int pos, int i, Tagger tagger) 67 | { 68 | if (p[i] != '[') 69 | { 70 | return new Index(null, i); 71 | } 72 | i++; 73 | var isInRow = true; 74 | var col = 0; 75 | var row = 0; 76 | var neg = 1; 77 | 78 | if (p[i] == '-') 79 | { 80 | neg = -1; 81 | i++; 82 | } 83 | 84 | for (; i < p.Length; i++) 85 | { 86 | var c = p[i]; 87 | if (isInRow) 88 | { 89 | if (c >= '0' && c <= '9') 90 | { 91 | row = 10 * row + (c - '0'); 92 | 
} 93 | else if (c == ',') 94 | { 95 | isInRow = false; 96 | } 97 | else 98 | { 99 | return new Index(null, i); 100 | } 101 | } 102 | else 103 | { 104 | if (c >= '0' && c <= '9') 105 | { 106 | col = 10 * col + (c - '0'); 107 | } 108 | else if (c == ']') 109 | { 110 | break; 111 | } 112 | else 113 | { 114 | return new Index(null, i); 115 | } 116 | } 117 | } 118 | 119 | row *= neg; 120 | 121 | if (col < 0 || col >= xsize_) 122 | { 123 | return new Index(null, i); 124 | } 125 | var idx = pos + row; 126 | if (idx < 0) 127 | { 128 | return new Index("_B-" + (-idx).ToString(), i); ; 129 | } 130 | if (idx >= tagger.word_num) 131 | { 132 | return new Index("_B+" + (idx - tagger.word_num + 1).ToString(), i); 133 | } 134 | 135 | return new Index(tagger.x_[idx][col], i); 136 | 137 | } 138 | 139 | private struct Index 140 | { 141 | public int idx; 142 | public string value; 143 | 144 | /// 145 | /// Initializes a new instance of the class. 146 | /// 147 | public Index(string value, int idx) 148 | { 149 | this.idx = idx; 150 | this.value = value; 151 | } 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /Core/CRFSharp/base/Utils.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace CRFSharp 7 | { 8 | public class QueueElement 9 | { 10 | public Node node; 11 | public QueueElement next; 12 | public double fx; 13 | public double gx; 14 | }; 15 | 16 | public class Heap 17 | { 18 | public int capacity; 19 | public int elem_size; //size of elem_list 20 | public int size; // size of elem_ptr_list 21 | public List elem_ptr_list; 22 | public List elem_list; 23 | }; 24 | 25 | public class Utils 26 | { 27 | public const double eps = 1e-7; 28 | 29 | 30 | public const int MINUS_LOG_EPSILON = 13; 31 | public const int DEFAULT_CRF_MAX_WORD_NUM = 100; 32 | 33 | public const int 
MODEL_TYPE_NORM = 100;

        public const int ERROR_INVALIDATED_FEATURE = -8;
        public const int ERROR_HEAP_SIZE_TOO_BIG = -7;
        public const int ERROR_INSERT_HEAP_FAILED = -6;
        public const int ERROR_EMPTY_FEATURE = -5;
        public const int ERROR_INVALIDATED_PARAMETER = -4;
        public const int ERROR_WRONG_STATUS = -3;
        public const int ERROR_TOO_LONG_WORD = -2;
        public const int ERROR_UNKNOWN = -1;
        public const int ERROR_SUCCESS = 0;

        /// <summary>
        /// Pre-allocates a heap that can hold at most max_size elements.
        /// Slot 0 of elem_list carries a double.MinValue sentinel.
        /// </summary>
        public static Heap heap_init(int max_size)
        {
            var H = new Heap();
            H.capacity = max_size;
            H.size = 0;
            H.elem_size = 0;

            H.elem_ptr_list = new List<QueueElement>(max_size + 1);
            H.elem_list = new List<QueueElement>(max_size + 1);

            for (var z = 0; z < max_size; z++)
            {
                H.elem_list.Add(new QueueElement());
                H.elem_ptr_list.Add(null);
            }
            H.elem_list[0].fx = double.MinValue;
            H.elem_ptr_list.Add(H.elem_list[0]);

            return H;
        }

        //Hands out the next pre-allocated element, or null when exhausted.
        public static QueueElement allc_from_heap(Heap H)
        {
            if (H.elem_size >= H.capacity)
            {
                return null;
            }
            return H.elem_list[++H.elem_size];
        }

        /// <summary>
        /// Inserts qe into the min-heap (ordered by fx). The heap is rooted
        /// at index 1; the new element bubbles up while its parent is larger.
        /// </summary>
        public static int heap_insert(QueueElement qe, Heap H)
        {
            if (H.size >= H.capacity)
            {
                return Utils.ERROR_HEAP_SIZE_TOO_BIG;
            }
            var i = ++H.size;
            while (i != 1 && H.elem_ptr_list[i / 2].fx > qe.fx)
            {
                //Pull the parent down one level; i has not been halved yet.
                H.elem_ptr_list[i] = H.elem_ptr_list[i / 2];
                i /= 2;
            }
            H.elem_ptr_list[i] = qe;
            return 0;
        }

        /// <summary>
        /// Removes and returns the smallest element (the heap starts at
        /// index 1), then sifts the last element down to restore the heap.
        /// </summary>
        public static QueueElement heap_delete_min(Heap H)
        {
            var min_elem = H.elem_ptr_list[1];
            var last_elem = H.elem_ptr_list[H.size--];
            int i = 1, ci = 2;
            while (ci <= H.size)
            {
                //Pick the smaller of the two children.
                if (ci < H.size && H.elem_ptr_list[ci].fx > H.elem_ptr_list[ci + 1].fx)
                {
                    ci++;
                }
                if (last_elem.fx <= H.elem_ptr_list[ci].fx)
                {
                    break;
                }

                H.elem_ptr_list[i] = H.elem_ptr_list[ci];
                i = ci;
                ci *= 2;
            }
            H.elem_ptr_list[i] = last_elem;
            return min_elem;
        }

        public static bool is_heap_empty(Heap H)
        {
            return H.size == 0;
        }

        //Resets the heap without releasing the pre-allocated elements.
        public static void heap_reset(Heap H)
        {
            if (H != null)
            {
                H.size = 0;
                H.elem_size = 0;
            }
        }

        /// <summary>
        /// Numerically stable log(exp(x) + exp(y)). When flg is true the
        /// accumulator is being initialized and y is returned unchanged.
        /// </summary>
        public static double logsumexp(double x, double y, bool flg)
        {
            if (flg)
            {
                return y; // init mode
            }
            double vmin;
            double vmax;
            if (x > y)
            {
                vmin = y;
                vmax = x;
            }
            else
            {
                vmin = x;
                vmax = y;
            }

            //Beyond this gap exp(vmin - vmax) underflows to zero, so skip it.
            if (vmax > vmin + MINUS_LOG_EPSILON)
            {
                return vmax;
            }
            return vmax + Math.Log(Math.Exp(vmin - vmax) + 1.0);
        }
    }
}
-------------------------------------------------------------------------------- /Core/CRFSharp/base/Tagger.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;

namespace CRFSharp
{
    public class Tagger
    {
        public List<List<string>> x_;   //token matrix: one row of feature columns per word
        public Node[,] node_;           //Node matrix (word x tag)
        public short ysize_;
        public short word_num;          //the number of tokens to be labeled
        public double Z_;               //partition value (log domain)
        public double cost_;            //the decoded path cost
        public short[] result_;
        public List<long[]> feature_cache_;

        //Calculate the cost of each path.
// It's used for finding the best or N-best result.
        /// <summary>
        /// Viterbi decoding: fills result_ with the tag sequence of the
        /// highest-cost path and stores that path's negated cost in cost_.
        /// </summary>
        public int viterbi()
        {
            var bestc = double.MinValue;
            Node bestNode = null;

            //Forward pass: for every node remember the best incoming path.
            for (var i = 0; i < word_num; ++i)
            {
                for (var j = 0; j < ysize_; ++j)
                {
                    bestc = double.MinValue;
                    bestNode = null;

                    var node_i_j = node_[i, j];

                    for (int index = 0; index < node_i_j.lpathList.Count; ++index)
                    {
                        var p = node_i_j.lpathList[index];
                        var cost = p.lnode.bestCost + p.cost + node_i_j.cost;
                        if (cost > bestc)
                        {
                            bestc = cost;
                            bestNode = p.lnode;
                        }
                    }

                    node_i_j.prev = bestNode;
                    //The first column has no incoming paths; fall back to the
                    //node's own cost.
                    node_i_j.bestCost = bestNode != null ? bestc : node_i_j.cost;
                }
            }

            //Pick the best final node...
            bestc = double.MinValue;
            bestNode = null;

            var s = (short)(word_num - 1);
            for (short j = 0; j < ysize_; ++j)
            {
                if (bestc < node_[s, j].bestCost)
                {
                    bestNode = node_[s, j];
                    bestc = node_[s, j].bestCost;
                }
            }

            //...and walk the prev links back to recover the tag sequence.
            var n = bestNode;
            while (n != null)
            {
                result_[n.x] = n.y;
                n = n.prev;
            }

            cost_ = -node_[s, result_[s]].bestCost;

            return Utils.ERROR_SUCCESS;
        }

        //Forward (alpha) value of node (m, n) in the log domain.
        private void calcAlpha(int m, int n)
        {
            var nd = node_[m, n];
            nd.alpha = 0.0;

            var first = true;
            for (int index = 0; index < nd.lpathList.Count; index++)
            {
                var p = nd.lpathList[index];
                nd.alpha = Utils.logsumexp(nd.alpha, p.cost + p.lnode.alpha, first);
                first = false;
            }
            nd.alpha += nd.cost;
        }

        //Backward (beta) value of node (m, n) in the log domain.
        private void calcBeta(int m, int n)
        {
            var nd = node_[m, n];
            nd.beta = 0.0f;
            if (m + 1 < word_num)
            {
                var first = true;
                for (int index = 0; index < nd.rpathList.Count; index++)
                {
                    var p = nd.rpathList[index];
                    nd.beta = Utils.logsumexp(nd.beta, p.cost + p.rnode.beta, first);
                    first = false;
                }
            }
            nd.beta += nd.cost;
        }

        /// <summary>
        /// Runs the forward and backward passes in one sweep (alpha from the
        /// front, beta from the back) and computes the partition value Z_.
        /// </summary>
        public void forwardbackward()
        {
            for (int i = 0, k = word_num - 1; i < word_num; ++i, --k)
            {
                for (var j = 0; j < ysize_; ++j)
                {
                    calcAlpha(i, j);
                    calcBeta(k, j);
                }
            }

            Z_ = 0.0;
            for (var j = 0; j < ysize_; ++j)
            {
                Z_ = Utils.logsumexp(Z_, node_[0, j].beta, j == 0);
            }
        }

        //Assign feature ids to node and path
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int RebuildFeatures()
        {
            var fid = 0;
            for (short cur = 0; cur < word_num; ++cur)
            {
                for (short i = 0; i < ysize_; ++i)
                {
                    node_[cur, i].fid = fid;
                    if (cur > 0)
                    {
                        //Bigram rows are stored after the word_num unigram
                        //rows: paths into column cur share id fid + word_num - 1.
                        Node previousNode = node_[cur - 1, i];
                        for (int index = 0; index < previousNode.rpathList.Count; ++index)
                        {
                            Path path = previousNode.rpathList[index];
                            path.fid = fid + word_num - 1;
                        }
                    }
                }

                ++fid;
            }

            return 0;
        }
    }
}
-------------------------------------------------------------------------------- /CRFSharp.sln: --------------------------------------------------------------------------------

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30320.27
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Core", "Core", "{0B4B6DD1-D8DC-4A72-8441-37557CA618C9}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CRFSharp", "Core\CRFSharp\CRFSharp.csproj", "{A9888D02-572E-45B2-8999-F1202ADDCF03}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CRFSharpWrapper", "Core\CRFSharpWrapper\CRFSharpWrapper.csproj", "{ED50CC60-D581-42ED-A608-22F7ED59A75E}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CRFSharpConsole", "CRFSharpConsole\CRFSharpConsole.csproj", "{16C2FDF6-85D3-494E-9866-AE98C1796FE4}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{9BEFE991-DCEE-43BA-A1E3-2B024C9CCAE0}" 15 | ProjectSection(SolutionItems) = preProject 16 | LICENSE = LICENSE 17 | README.md = README.md 18 | EndProjectSection 19 | EndProject 20 | Global 21 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 22 | Debug|Any CPU = Debug|Any CPU 23 | Debug|Mixed Platforms = Debug|Mixed Platforms 24 | Debug|Win32 = Debug|Win32 25 | Debug|x64 = Debug|x64 26 | Debug|x86 = Debug|x86 27 | Release|Any CPU = Release|Any CPU 28 | Release|Mixed Platforms = Release|Mixed Platforms 29 | Release|Win32 = Release|Win32 30 | Release|x64 = Release|x64 31 | Release|x86 = Release|x86 32 | EndGlobalSection 33 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 34 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 35 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|Any CPU.Build.0 = Debug|Any CPU 36 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU 37 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU 38 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|Win32.ActiveCfg = Debug|Any CPU 39 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|x64.ActiveCfg = Debug|Any CPU 40 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Debug|x86.ActiveCfg = Debug|Any CPU 41 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|Any CPU.ActiveCfg = Release|Any CPU 42 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|Any CPU.Build.0 = Release|Any CPU 43 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU 44 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|Mixed Platforms.Build.0 = Release|Any CPU 45 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|Win32.ActiveCfg = Release|Any CPU 46 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|x64.ActiveCfg = Release|Any CPU 47 | {A9888D02-572E-45B2-8999-F1202ADDCF03}.Release|x86.ActiveCfg = Release|Any CPU 
48 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 49 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|Any CPU.Build.0 = Debug|Any CPU 50 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU 51 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU 52 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|Win32.ActiveCfg = Debug|Any CPU 53 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|x64.ActiveCfg = Debug|Any CPU 54 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Debug|x86.ActiveCfg = Debug|Any CPU 55 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|Any CPU.ActiveCfg = Release|Any CPU 56 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|Any CPU.Build.0 = Release|Any CPU 57 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU 58 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|Mixed Platforms.Build.0 = Release|Any CPU 59 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|Win32.ActiveCfg = Release|Any CPU 60 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|x64.ActiveCfg = Release|Any CPU 61 | {ED50CC60-D581-42ED-A608-22F7ED59A75E}.Release|x86.ActiveCfg = Release|Any CPU 62 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 63 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|Any CPU.Build.0 = Debug|Any CPU 64 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU 65 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU 66 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|Win32.ActiveCfg = Debug|Any CPU 67 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|Win32.Build.0 = Debug|Any CPU 68 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|x64.ActiveCfg = Debug|Any CPU 69 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|x64.Build.0 = Debug|Any CPU 70 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|x86.ActiveCfg = Debug|Any CPU 71 | 
{16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Debug|x86.Build.0 = Debug|Any CPU 72 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|Any CPU.ActiveCfg = Release|Any CPU 73 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|Any CPU.Build.0 = Release|Any CPU 74 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU 75 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|Mixed Platforms.Build.0 = Release|Any CPU 76 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|Win32.ActiveCfg = Release|Any CPU 77 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|Win32.Build.0 = Release|Any CPU 78 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|x64.ActiveCfg = Release|Any CPU 79 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|x64.Build.0 = Release|Any CPU 80 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|x86.ActiveCfg = Release|Any CPU 81 | {16C2FDF6-85D3-494E-9866-AE98C1796FE4}.Release|x86.Build.0 = Release|Any CPU 82 | EndGlobalSection 83 | GlobalSection(SolutionProperties) = preSolution 84 | HideSolutionNode = FALSE 85 | EndGlobalSection 86 | GlobalSection(NestedProjects) = preSolution 87 | {A9888D02-572E-45B2-8999-F1202ADDCF03} = {0B4B6DD1-D8DC-4A72-8441-37557CA618C9} 88 | {ED50CC60-D581-42ED-A608-22F7ED59A75E} = {0B4B6DD1-D8DC-4A72-8441-37557CA618C9} 89 | EndGlobalSection 90 | GlobalSection(ExtensibilityGlobals) = postSolution 91 | SolutionGuid = {BEB42FC2-4888-42B8-9EBC-063E22DF9FC1} 92 | EndGlobalSection 93 | GlobalSection(SubversionScc) = preSolution 94 | Svn-Managed = True 95 | Manager = AnkhSVN - Subversion Support for Visual Studio 96 | EndGlobalSection 97 | EndGlobal 98 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/EncoderTagger.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Threading; 4 | using System.Linq; 5 | using System.Text; 6 | using System.IO; 7 | using 
System.Security.Cryptography;
using AdvUtils;

namespace CRFSharp
{
    public class EncoderTagger : Tagger
    {
        public ModelWriter feature_index_;
        public short[] answer_;    //gold tag index per word

        //Counts decoding errors against the gold answer; merr accumulates a
        //confusion matrix indexed by [gold, predicted].
        public int eval(int[,] merr)
        {
            var err = 0;
            for (var i = 0; i < word_num; ++i)
            {
                if (answer_[i] != result_[i])
                {
                    ++err;
                    merr[answer_[i], result_[i]]++;
                }
            }
            return err;
        }

        public EncoderTagger(ModelWriter modelWriter)
        {
            feature_index_ = modelWriter;
            ysize_ = (short)feature_index_.ysize();
        }

        /// <summary>
        /// Builds the feature cache for one training sentence. The last
        /// column of each record holds the gold tag. Returns false for an
        /// empty sentence.
        /// </summary>
        public bool GenerateFeature(List<List<string>> recordList)
        {
            word_num = (short)recordList.Count;
            if (word_num == 0)
            {
                return false;
            }

            //Resolve each record's gold tag into its index in the tag set.
            var x_num = 0;
            var xsize = (int)feature_index_.xsize_;
            answer_ = new short[word_num];
            for (int index = 0; index < recordList.Count; index++)
            {
                var record = recordList[index];
                for (short k = 0; k < ysize_; ++k)
                {
                    if (feature_index_.y(k) == record[xsize])
                    {
                        answer_[x_num] = k;
                        break;
                    }
                }
                x_num++;
            }

            //Build record feature set
            x_ = recordList;
            Z_ = 0.0;
            feature_cache_ = new List<long[]>();
            feature_index_.BuildFeatures(this);
            x_ = null;

            return true;
        }

        //Lock-free addition into the shared gradient array via a CAS loop.
        private void LockFreeAdd(double[] expected, long exp_offset, double addValue)
        {
            double initialValue;
            double newValue;
            do
            {
                initialValue = expected[exp_offset]; //read current value
                newValue = initialValue + addValue;  //calculate new value
            }
            while (initialValue != Interlocked.CompareExchange(ref expected[exp_offset], newValue, initialValue));
        }

        //Accumulates the model expectation of node (x, y) and its incoming
        //paths into the shared gradient array.
        private void calcExpectation(int x, int y, double[] expected)
        {
            var n = node_[x, y];
            var c = Math.Exp(n.alpha + n.beta - n.cost - Z_);
            var offset = y + 1; //the expected array is 1-based
            for (int index = 0; index < feature_cache_[n.fid].Length; index++)
            {
                var item = feature_cache_[n.fid][index];
                LockFreeAdd(expected, item + offset, c);
            }

            for (int index = 0; index < n.lpathList.Count; index++)
            {
                var p = n.lpathList[index];
                c = Math.Exp(p.lnode.alpha + p.cost + p.rnode.beta - Z_);
                offset = p.lnode.y * ysize_ + p.rnode.y + 1; //1-based as well
                for (int i = 0; i < feature_cache_[p.fid].Length; i++)
                {
                    var item = feature_cache_[p.fid][i];
                    LockFreeAdd(expected, item + offset, c);
                }
            }
        }

        /// <summary>
        /// Computes this sentence's negative log-likelihood and adds its
        /// gradient (model expectation minus empirical counts) to expected.
        /// </summary>
        public double gradient(double[] expected)
        {
            buildLattice();
            forwardbackward();
            var s = 0.0;

            for (var i = 0; i < word_num; ++i)
            {
                for (var j = 0; j < ysize_; ++j)
                {
                    calcExpectation(i, j, expected);
                }
            }

            for (var i = 0; i < word_num; ++i)
            {
                var answer_val = answer_[i];
                var answer_Node = node_[i, answer_val];
                var offset = answer_val + 1; //the expected array is 1-based
                for (int index = 0; index < feature_cache_[answer_Node.fid].Length; index++)
                {
                    var fid = feature_cache_[answer_Node.fid][index];
                    LockFreeAdd(expected, fid + offset, -1.0f);
                }
                s += answer_Node.cost; // UNIGRAM cost

                //Subtract the empirical bigram count of the gold transition.
                for (int index = 0; index < answer_Node.lpathList.Count; index++)
                {
                    var lpath = answer_Node.lpathList[index];
                    if (lpath.lnode.y == answer_[lpath.lnode.x])
                    {
                        offset = lpath.lnode.y * ysize_ + lpath.rnode.y + 1;
                        for (int index1 = 0; index1 < feature_cache_[lpath.fid].Length; index1++)
                        {
                            var fid = feature_cache_[lpath.fid][index1];
                            LockFreeAdd(expected, fid + offset, -1.0f);
                        }

                        s += lpath.cost; // BIGRAM COST
                        break;
                    }
                }
            }

            viterbi(); // call for eval()

            return Z_ - s;
        }

        public void Init(short[] result, Node[,] node)
        {
            result_ = result;
            node_ = node;
        }

        //Fills every node and path cost from the current weight vector.
        public void buildLattice()
        {
            RebuildFeatures();
            for (var i = 0; i < word_num; ++i)
            {
                for (var j = 0; j < ysize_; ++j)
                {
                    var node_i_j = node_[i, j];
                    node_i_j.cost = calcCost(node_i_j.fid, j);
                    for (int index = 0; index < node_i_j.lpathList.Count; index++)
                    {
                        var p = node_i_j.lpathList[index];
                        var offset = p.lnode.y * ysize_ + p.rnode.y;
                        p.cost = calcCost(p.fid, offset);
                    }
                }
            }
        }

        //Sums the weights of all features firing at the given cache row,
        //scaled by the model's cost factor.
        public double calcCost(int featureListIdx, int offset)
        {
            double c = 0.0f;
            offset++; //the alpha_ array is 1-based
            for (int index = 0; index < feature_cache_[featureListIdx].Length; index++)
            {
                var fid = feature_cache_[featureListIdx][index];
                c += feature_index_.alpha_[fid + offset];
            }
            return feature_index_.cost_factor_ * c;
        }
    }
}
-------------------------------------------------------------------------------- /Core/CRFSharp/decoder/ModelReader.cs: --------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using AdvUtils;
using CRFSharp.decoder;

namespace CRFSharp
{
    public class ModelReader : BaseModel
    {
        private readonly Func<string, Stream> modelLoader = null;

        public uint version;                //model version, read while loading
        private DoubleArrayTrieSearch da;   //feature set trie

        /// <summary>
        /// Returns the model path.
        /// </summary>
        public string ModelPath { get; private set; }

        /// <summary>
        /// Creates a new reader that loads the model from the file system.
        /// </summary>
        /// <param name="modelPath">
        /// Path to the model.
/// </param>
        public ModelReader(string modelPath) :
            this(GetStreamFromFileSystem, modelPath)
        {
        }

        /// <summary>
        /// Creates a new reader that resolves model files through the given
        /// delegate (file system, network, zipped archives, ...).
        /// </summary>
        /// <param name="modelLoader">
        /// A delegate capable of resolving the given model path into a
        /// stream with the model file.
        /// </param>
        /// <param name="modelPath">Path to the model.</param>
        public ModelReader(Func<string, Stream> modelLoader,
            string modelPath)
        {
            this.modelLoader = modelLoader;
            this.ModelPath = modelPath;
        }

        /// <summary>
        /// Loads the model into memory: metadata, the feature set trie and
        /// the feature weights.
        /// </summary>
        public void LoadModel()
        {
            //Load model meta data
            LoadMetadata();

            //Load all feature set data
            LoadFeatureSet();

            //Load all features alpha data
            LoadFeatureWeights();
        }

        //Returns the feature id of the given key, or a negative value when
        //the key is not part of the model.
        public virtual int get_id(string str)
        {
            return da.SearchByPerfectMatch(str);
        }

        public virtual double GetAlpha(long index)
        {
            return alpha_[index];
        }

        //Default model loading strategy: open files from the file system.
        private static Stream GetStreamFromFileSystem(string path)
        {
            path.ThrowIfNotExists();
            return File.OpenRead(path);
        }

        //Stream over the model metadata file.
        private Stream GetMetadataStream()
        {
            return modelLoader(ModelPath.ToMetadataModelName());
        }

        //Stream over the model feature set file.
        private Stream GetFeatureSetStream()
        {
            return modelLoader(ModelPath.ToFeatureSetFileName());
        }

        //Stream over the model feature weight file.
        private Stream GetFeatureWeightStream()
        {
            return modelLoader(ModelPath.ToFeatureWeightFileName());
        }

        //Parses the plain-text metadata: version, cost factor, feature
        //count, xsize, the tag list and the unigram/bigram templates.
        private void LoadMetadata()
        {
            using (Stream metadataStream = GetMetadataStream())
            {
                var sr = new StreamReader(metadataStream);
                string strLine;

                //Version number
                strLine = sr.ReadLine();
                version = uint.Parse(strLine.Split(':')[1].Trim());

                //cost_factor
                strLine = sr.ReadLine();
                cost_factor_ = double.Parse(strLine.Split(':')[1].Trim());

                //maxid
                strLine = sr.ReadLine();
                maxid_ = long.Parse(strLine.Split(':')[1].Trim());

                //xsize
                strLine = sr.ReadLine();
                xsize_ = uint.Parse(strLine.Split(':')[1].Trim());

                //Skip the empty separator line
                sr.ReadLine();

                //Tag list, terminated by an empty line. The null check
                //guards against a truncated metadata file (the original
                //would throw NullReferenceException here).
                y_ = new List<string>();
                while ((strLine = sr.ReadLine()) != null)
                {
                    if (strLine.Length == 0)
                    {
                        break;
                    }
                    y_.Add(strLine);
                }

                //Unigram ('U') and bigram ('B') templates
                unigram_templs_ = new List<string>();
                bigram_templs_ = new List<string>();
                while (sr.EndOfStream == false)
                {
                    strLine = sr.ReadLine();
                    if (string.IsNullOrEmpty(strLine))
                    {
                        break;
                    }
                    if (strLine[0] == 'U')
                    {
                        unigram_templs_.Add(strLine);
                    }
                    if (strLine[0] == 'B')
                    {
                        bigram_templs_.Add(strLine);
                    }
                }
            }
        }

        private void LoadFeatureSet()
        {
            da = new DoubleArrayTrieSearch();
            da.Load(GetFeatureSetStream());
        }

        //Loads the weight vector. A positive leading int32 marks a
        //vector-quantized model (a code book of doubles followed by one
        //index byte per weight); otherwise the weights are raw floats.
        private void LoadFeatureWeights()
        {
            //feature weight array. NOTE(review): decoding indexes alpha_
            //as alpha_[fid + offset] with offset >= 1, while this loader
            //fills indices 0..maxid_-1 - confirm the intended base index.
            alpha_ = new double[maxid_ + 1];

            using (Stream featureWeightStream = GetFeatureWeightStream())
            //Read the binary stream directly; the StreamReader wrapper the
            //original created around it added nothing. The using also
            //guarantees disposal on exceptions.
            using (var br_alpha = new BinaryReader(featureWeightStream))
            {
                //Get VQ Size
                int vqSize = br_alpha.ReadInt32();

                if (vqSize > 0)
                {
                    //This is a VQ model, so read its code book first
                    Logger.WriteLine("This is a VQ Model. VQSize: {0}", vqSize);
                    var vqCodeBook = new List<double>();
                    for (int i = 0; i < vqSize; i++)
                    {
                        vqCodeBook.Add(br_alpha.ReadDouble());
                    }

                    //Load weights as code book indexes
                    for (long i = 0; i < maxid_; i++)
                    {
                        int vqIdx = br_alpha.ReadByte();
                        alpha_[i] = vqCodeBook[vqIdx];
                    }
                }
                else
                {
                    //This is a normal model; weights are stored as floats
                    Logger.WriteLine("This is a normal model.");
                    for (long i = 0; i < maxid_; i++)
                    {
                        alpha_[i] = br_alpha.ReadSingle();
                    }
                }
            }
        }

    }
}
-------------------------------------------------------------------------------- /CRFSharpConsole/EncoderConsole.cs: --------------------------------------------------------------------------------
using System;
using CRFSharpWrapper;
using AdvUtils;

namespace CRFSharpConsole
{
    class EncoderConsole
    {
        public void Run(string [] args)
        {
            var encoder = new CRFSharpWrapper.Encoder();
            var options = new EncoderArgs();

            for (var i = 0; i < args.Length; i++)
            {
                if (args[i][0] == '-')
                {
                    var key = args[i].Substring(1).ToLower().Trim();
                    var value = "";

                    if (key == "encode")
                    {
                        continue;
                    }

                    if (key == "debug")
                    {
                        options.debugLevel = 1;

                        try
                        {
                            if (i
/// <summary>
/// Command line front-end for training (encoding) a CRF model.
/// Parses "-name value" style arguments into EncoderArgs and starts training.
/// </summary>
class EncoderConsole
{
    /// <summary>
    /// Parses command line arguments and runs the encoder.
    /// Unknown or malformed parameters print usage and abort.
    /// </summary>
    public void Run(string[] args)
    {
        var encoder = new CRFSharpWrapper.Encoder();
        var options = new EncoderArgs();

        for (var i = 0; i < args.Length; i++)
        {
            // FIX: guard against empty argument strings before indexing args[i][0]
            if (args[i].Length == 0 || args[i][0] != '-')
            {
                continue;
            }

            var key = args[i].Substring(1).ToLower().Trim();
            var value = "";

            if (key == "encode")
            {
                //The mode switch itself carries no value
                continue;
            }

            if (key == "debug")
            {
                //-debug takes an optional numeric level; defaults to 1
                options.debugLevel = 1;

                try
                {
                    if (i < args.Length - 1)
                    {
                        var debugLevel = int.Parse(args[i + 1]);
                        options.debugLevel = debugLevel;
                        i++;
                    }
                }
                catch (Exception)
                {
                    //Best effort: a non-numeric follower is treated as the next option
                }
            }
            else if (i < args.Length - 1)
            {
                i++;
                value = args[i];
                switch (key)
                {
                    case "template":
                        options.strTemplateFileName = value;
                        break;
                    case "trainfile":
                        options.strTrainingCorpus = value;
                        break;
                    case "modelfile":
                        options.strEncodedModelFileName = value;
                        break;
                    case "maxiter":
                        options.max_iter = int.Parse(value);
                        break;
                    case "minfeafreq":
                        options.min_feature_freq = int.Parse(value);
                        break;
                    case "mindiff":
                        options.min_diff = double.Parse(value);
                        break;
                    case "thread":
                        options.threads_num = int.Parse(value);
                        break;
                    case "costfactor":
                        options.C = double.Parse(value);
                        break;
                    case "slotrate":
                        options.slot_usage_rate_threshold = double.Parse(value);
                        break;
                    case "hugelexmem":
                        options.hugeLexMemLoad = uint.Parse(value);
                        break;
                    case "retrainmodel":
                        options.strRetrainModelFileName = value;
                        break;
                    case "vq":
                        // FIX: redundant "? true : false" removed
                        options.bVQ = int.Parse(value) != 0;
                        break;
                    case "regtype":
                        if (value.ToLower().Trim() == "l1")
                        {
                            options.regType = CRFSharpWrapper.Encoder.REG_TYPE.L1;
                        }
                        else if (value.ToLower().Trim() == "l2")
                        {
                            options.regType = CRFSharpWrapper.Encoder.REG_TYPE.L2;
                        }
                        else
                        {
                            Logger.WriteLine("Invalidated regularization type");
                            Usage();
                            return;
                        }
                        break;
                    default:
                        var cc = Console.ForegroundColor;
                        Console.ForegroundColor = ConsoleColor.Red;
                        Logger.WriteLine("No supported {0} parameter, exit", key);
                        Console.ForegroundColor = cc;
                        Usage();
                        return;
                }
            }
            else
            {
                //A value-taking option appeared as the last token with no value
                var cc = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Red;
                Logger.WriteLine("{0} is invalidated parameter.", key);
                Console.ForegroundColor = cc;
                Usage();
                return;
            }
        }

        //Template, training corpus and output model path are mandatory
        if (options.strTemplateFileName == null || options.strEncodedModelFileName == null || options.strTrainingCorpus == null)
        {
            Usage();
            return;
        }

        if (options.threads_num <= 0)
        {
            options.threads_num = Environment.ProcessorCount;
        }

        // FIX: unused "bool bRet" removed; the return value was ignored before
        // and failure reporting is done by Learn itself via the logger.
        encoder.Learn(options);
    }

    /// <summary>
    /// Prints the command line help text.
    /// </summary>
    private static void Usage()
    {
        Console.WriteLine("Linear-chain CRF encoder & decoder by Zhongkai Fu (fuzhongkai@gmail.com)");
        Console.WriteLine("CRFSharpConsole.exe -encode [parameters list]");
        Console.WriteLine("\t-template : template file name");
        Console.WriteLine("\t-trainfile : training corpus file name");
        Console.WriteLine("\t-modelfile : encoded model file name");
        Console.WriteLine("\t-maxiter : The maximum encoding iteration. Default value is 1000");
        Console.WriteLine("\t-minfeafreq : Any feature's frequency is less than the value will be dropped. Default value is 2");
        Console.WriteLine("\t-mindiff : If diff is less than the value consecutive 3 times, the encoding will be ended. Default value is 0.0001");
        Console.WriteLine("\t-thread : the amount of threads for encoding. Default value is 1");
        Console.WriteLine("\t-slotrate : the maximum slot usage rate threshold when building feature set. it is ranged in (0.0, 1.0). the higher value takes longer time to build feature set, but smaller feature set size. Default value is 0.95");
        Console.WriteLine("\t-regtype : regularization type (L1 and L2). L1 will generate a sparse model. Default is L2");
        Console.WriteLine("\t-hugelexmem : build lexical dictionary in huge mode and shrinking start when used memory reaches this value. This mode can build more lexical items, but slowly. Value ranges [1,100] and default is disabled.");
        Console.WriteLine("\t-retrainmodel : the existed model for re-training.");
        Console.WriteLine("\t-vq : model vector quantization value (0/1). The model size will be reduced to 1/4 original model size. Default value is 0");
        Console.WriteLine("\t-debug : debug level, default value is 1");
        Console.WriteLine("\t 0 - no debug information output");
        Console.WriteLine("\t 1 - only output raw lexical dictionary for feature set");
        Console.WriteLine("\t 2 - full debug information output, both raw lexical dictionary and detailed encoded information for each iteration");
        Console.WriteLine();
        Console.WriteLine("Note: either -maxiter reaches setting value or -mindiff reaches setting value in consecutive three times, the training process will be finished and saved encoded model.");
        Console.WriteLine("Note: -hugelexmem is only used for special task, and it is not recommended for common task, since it costs lots of time for memory shrink in order to load more lexical features into memory");
        Console.WriteLine();
        Console.WriteLine("A command line example as follows:");
        Console.WriteLine("\tCRFSharpConsole.exe -encode -template template.1 -trainfile ner.train -modelfile ner.model -maxiter 100 -minfeafreq 1 -mindiff 0.0001 -thread 4 -debug 2 -vq 1 -slotrate 0.95");
    }
}
//Native memory status structure for kernel32!GlobalMemoryStatusEx
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Auto)]
public class MEMORYSTATUSEX
{
    public uint dwLength;
    public uint dwMemoryLoad;          //percentage of physical memory in use
    public ulong ullTotalPhys;
    public ulong ullAvailPhys;
    public ulong ullTotalPageFile;
    public ulong ullAvailPageFile;
    public ulong ullTotalVirtual;
    public ulong ullAvailVirtual;
    public ulong ullAvailExtendedVirtual;

    public MEMORYSTATUSEX()
    {
        //dwLength must be set before GlobalMemoryStatusEx is called
        this.dwLength = (uint)Marshal.SizeOf(typeof(MEMORYSTATUSEX));
    }
}

//A feature string paired with its observed frequency; ordered by feature text
public sealed class FeatureFreq : IComparable<FeatureFreq>
{
    public string strFeature;
    public long value;

    public int CompareTo(FeatureFreq fi)
    {
        return StringComparer.Ordinal.Compare(strFeature, fi.strFeature);
    }
}

/// <summary>
/// Feature lexical dictionary for very large training corpora: features are
/// appended to a big array and periodically sorted+merged ("shrunk") when the
/// machine's memory load (queried via GlobalMemoryStatusEx) crosses a threshold.
/// NOTE(review): relies on kernel32.dll, so this mode is Windows-only.
/// </summary>
public sealed class HugeFeatureLexicalDict : IFeatureLexicalDict
{
    VarBigArray<FeatureFreq> arrayFeatureFreq;
    long arrayFeatureFreqSize;
    uint SHRINK_AVALI_MEM_LOAD;        //memory-load percentage that triggers a shrink
    AdvUtils.Security.Cryptography.MD5 md5;
    ParallelOptions parallelOption;

    [return: MarshalAs(UnmanagedType.Bool)]
    [DllImport("kernel32.dll", CharSet = CharSet.Auto, SetLastError = true)]
    static extern bool GlobalMemoryStatusEx([In, Out] MEMORYSTATUSEX lpBuffer);

    public HugeFeatureLexicalDict(int thread_num, uint shrinkMemLoad)
    {
        SHRINK_AVALI_MEM_LOAD = shrinkMemLoad;
        arrayFeatureFreq = new VarBigArray<FeatureFreq>(1024 * 1024);
        arrayFeatureFreqSize = 0;
        md5 = new AdvUtils.Security.Cryptography.MD5();
        parallelOption = new ParallelOptions();
        parallelOption.MaxDegreeOfParallelism = thread_num;
    }

    public void Clear()
    {
        arrayFeatureFreq.Clear();
        arrayFeatureFreq = null;
    }

    public VarBigArray<FeatureFreq> featureFreq
    {
        get
        {
            return arrayFeatureFreq;
        }
    }

    public long Size
    {
        get
        {
            return arrayFeatureFreqSize;
        }
    }

    //Merge duplicates: first merge per-thread sub-blocks in parallel,
    //then run a final sequential merge over the whole range.
    private long ParallelMerge(long startIndex, long endIndex, int freq)
    {
        var sizePerThread = (endIndex - startIndex + 1) / parallelOption.MaxDegreeOfParallelism;
        //Firstly, merge items in each block by parallel
        Parallel.For(0, parallelOption.MaxDegreeOfParallelism, parallelOption, i =>
        {
            Merge(startIndex + i * sizePerThread, startIndex + (i + 1) * sizePerThread - 1, 0);
        });

        //Secondly, merge all items
        return Merge(startIndex, endIndex, freq);
    }

    //Force a full GC so the shrink actually releases memory to the OS
    private void ForceCollectMemory()
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
    }

    //Merge same items in a sorted range; entries with frequency below
    //freq are dropped. Returns the index of the last kept item.
    private long Merge(long startIndex, long endIndex, int freq)
    {
        var newEndIndex = startIndex;

        //Try to find the first not-null item.
        //FIX: bounds are checked *before* dereferencing - the original
        //evaluated arrayFeatureFreq[startIndex] first and could read past
        //endIndex when the whole range was null.
        while (startIndex <= endIndex &&
               arrayFeatureFreq[startIndex] == null)
        {
            startIndex++;
        }
        if (startIndex > endIndex)
        {
            //Whole range was null; nothing to merge.
            return newEndIndex;
        }
        arrayFeatureFreq[newEndIndex] = arrayFeatureFreq[startIndex];
        for (var i = startIndex + 1; i <= endIndex; i++)
        {
            if (arrayFeatureFreq[i] == null)
            {
                continue;
            }

            if (arrayFeatureFreq[newEndIndex].strFeature == arrayFeatureFreq[i].strFeature)
            {
                //two same items, sum their value up
                arrayFeatureFreq[newEndIndex].value += arrayFeatureFreq[i].value;
                arrayFeatureFreq[i] = null;
            }
            else
            {
                //two different items: keep the finished one only if frequent enough
                if (arrayFeatureFreq[newEndIndex].value >= freq)
                {
                    newEndIndex++;
                }

                arrayFeatureFreq[newEndIndex] = arrayFeatureFreq[i];
                if (newEndIndex < i)
                {
                    arrayFeatureFreq[i] = null;
                }
            }
        }

        return newEndIndex;
    }

    //Generate feature string and its frequency list (parallel copy-out)
    public void GenerateLexicalIdList(out IList<string> keyList, out IList<int> valList)
    {
        var fixArrayKey = new FixedBigArray<string>(Size, 0);
        keyList = fixArrayKey;

        var fixArrayValue = new FixedBigArray<int>(Size, 0);
        valList = fixArrayValue;
        Parallel.For(0, arrayFeatureFreqSize, parallelOption, i =>
        {
            fixArrayKey[i] = arrayFeatureFreq[i].strFeature;
            fixArrayValue[i] = (int)(arrayFeatureFreq[i].value);
        });
    }

    Object thisLock = new object();

    //Generate compact feature ids. Each feature reserves ysize slots
    //(unigram, 'U') or ysize*ysize slots (bigram); the old MD5-hash id is
    //mapped to the new compact id in old2new. Returns the total id count.
    public long RegenerateFeatureId(BTreeDictionary<long, long> old2new, long ysize)
    {
        //FIX: removed an unused local MD5 instance that shadowed the field
        long maxid_ = 0;
        Parallel.For(0, arrayFeatureFreqSize, parallelOption, i =>
        {
            //Generate new feature id: lock-free reservation of an id block
            var addValue = (arrayFeatureFreq[i].strFeature[0] == 'U' ? ysize : ysize * ysize);
            var oldValue = maxid_;
            while (System.Threading.Interlocked.CompareExchange(ref maxid_, oldValue + addValue, oldValue) != oldValue)
            {
                oldValue = maxid_;
            }

            //Create existed and new feature ids mapping
            lock (thisLock)
            {
                old2new.Add(
                    GetId(arrayFeatureFreq[i].strFeature),
                    oldValue);
            }

            arrayFeatureFreq[i].value = oldValue;
        });
        return maxid_;
    }

    //Shrink the entire list, dropping features rarer than freq
    public void Shrink(int freq)
    {
        var newEndIndex = Shrink(0, arrayFeatureFreqSize - 1, freq);
        arrayFeatureFreqSize = newEndIndex + 1;
    }

    //Shrink a range: sort it, then merge duplicates
    private long Shrink(long startIndex, long endIndex, int freq)
    {
        Console.Write("Sorting...");
        arrayFeatureFreq.Sort(startIndex, endIndex - startIndex + 1, parallelOption.MaxDegreeOfParallelism);
        Console.Write("Merging...");

        var newEndIndex = ParallelMerge(startIndex, endIndex, freq);
        sortedEndIndex = newEndIndex;

        Console.WriteLine("Done!");
        ForceCollectMemory();

        return newEndIndex;
    }

    //Get a feature string id: 64-bit MD5 hash of its UTF-8 bytes.
    //The lock is required because the shared MD5 instance is stateful.
    private long GetId(string strFeature)
    {
        var rawbytes = Encoding.UTF8.GetBytes(strFeature);

        lock (thisLock)
        {
            return md5.Compute64BitHash(rawbytes);
        }
    }

    private long sortedEndIndex = 0;
    private int ShrinkingLock = 0;     //1 while a shrink is in progress
    private int AddLock = 0;           //count of threads currently inside the add path

    //Add the feature string into the list (or bump its frequency when it is
    //already in the sorted prefix) and return its hash id. May trigger a
    //shrink when memory load crosses SHRINK_AVALI_MEM_LOAD.
    public long GetOrAddId(string strFeature)
    {
        //Wait out any in-progress shrink
        while (ShrinkingLock == 1) { Thread.Sleep(5000); }

        //add item-adding lock
        Interlocked.Increment(ref AddLock);

        var newFFItem = new FeatureFreq();
        newFFItem.strFeature = strFeature;
        newFFItem.value = 1;
        if (sortedEndIndex > 0)
        {
            //Feature may already exist in the sorted prefix
            var ff = arrayFeatureFreq.BinarySearch(0, sortedEndIndex, newFFItem);
            if (ff != null)
            {
                Interlocked.Increment(ref ff.value);
                //free item-adding lock
                Interlocked.Decrement(ref AddLock);
                return GetId(strFeature);
            }
        }

        //Append to the unsorted tail
        var oldValue = Interlocked.Increment(ref arrayFeatureFreqSize) - 1;
        arrayFeatureFreq[oldValue] = newFFItem;

        //free item-adding lock
        Interlocked.Decrement(ref AddLock);

        //Check whether shrink process should be started
        //(memory load is only sampled every 10M insertions)
        uint memoryLoad = 0;
        if (oldValue % 10000000 == 0)
        {
            var msex = new MEMORYSTATUSEX();
            GlobalMemoryStatusEx(msex);
            memoryLoad = msex.dwMemoryLoad;
        }

        if (memoryLoad >= SHRINK_AVALI_MEM_LOAD)
        {
            if (Interlocked.CompareExchange(ref ShrinkingLock, 1, 0) == 0)
            {
                //Double check whether shrink should be started
                var msex = new MEMORYSTATUSEX();
                GlobalMemoryStatusEx(msex);
                if (msex.dwMemoryLoad >= SHRINK_AVALI_MEM_LOAD)
                {
                    //Drain concurrent adders before mutating the array
                    while (AddLock != 0) { Thread.Sleep(1000); }

                    var startDT = DateTime.Now;
                    Console.WriteLine("Begin to shrink [Feature Size: {0}]...", arrayFeatureFreqSize);
                    var newArrayFeatureFreqSize = Shrink(0, arrayFeatureFreqSize - 1, 0) + 1;

                    GlobalMemoryStatusEx(msex);
                    if (msex.dwMemoryLoad >= SHRINK_AVALI_MEM_LOAD - 1)
                    {
                        //Still have enough available memory, raise shrink threshold
                        SHRINK_AVALI_MEM_LOAD = msex.dwMemoryLoad + 1;
                        if (SHRINK_AVALI_MEM_LOAD >= 100)
                        {
                            //if use more than 100% memory, the performance will extremely reduce
                            SHRINK_AVALI_MEM_LOAD = 100;
                        }
                    }

                    arrayFeatureFreqSize = newArrayFeatureFreqSize;
                    var ts = DateTime.Now - startDT;
                    Console.WriteLine("Shrink has been done!");
                    Console.WriteLine("[Feature Size:{0}, TimeSpan:{1}, Next Shrink Rate:{2}%]", arrayFeatureFreqSize, ts, SHRINK_AVALI_MEM_LOAD);
                }

                Interlocked.Decrement(ref ShrinkingLock);
            }
        }
        return GetId(strFeature);
    }
}
/// <summary>
/// Limited-memory BFGS optimizer (with optional orthant-wise mode for L1
/// regularization) used by the CRF encoder. Port of the classic Fortran/CRF++
/// LBFGS routine; the work array w and the point/ispt/iypt indices follow the
/// original layout, so statement order here is significant.
/// Arrays are 1-based: index 0 of x/v/expected/diag is unused.
/// </summary>
public class LBFGS
{
    double[] diag;                       // diagonal scaling of the inverse Hessian approximation
    FixedBigArray<double> w;             // packed work array (steps, gradients, rho values)
    Mcsrch mcsrch_;                      // line search routine
    long nfev, point, npt, iter, info, ispt, iypt;
    int iflag_;                          // optimizer state flag: 0 done, 1 continue, <0 error
    double stp;                          // current line search step length
    public int zeroone;                  // sentence-level error count (filled by encoder threads)
    public int err;                      // token-level error count
    public double obj;                   // objective value

    public double[] expected;            // gradient accumulated by the encoder threads
    public double[] v;                   // working gradient (pseudo-gradient in orthant mode)
    public double[] xi;                  // orthant signs for the orthant-wise projection

    private ParallelOptions parallelOption;

    public LBFGS(int thread_num)
    {
        iflag_ = 0; nfev = 0;
        point = 0; npt = 0; iter = 0; info = 0;
        ispt = 0; iypt = 0;
        stp = 0.0;
        mcsrch_ = new Mcsrch(thread_num);

        parallelOption = new ParallelOptions();
        parallelOption.MaxDegreeOfParallelism = thread_num;
    }

    // Parallel dot product over two big-array slices (1-based offsets).
    private double ddot_(long size, FixedBigArray<double> dx, long dx_idx, FixedBigArray<double> dy, long dy_idx)
    {
        double ret = 0.0f;
        Parallel.For(0, size, parallelOption, () => 0, (i, loop, subtotal) =>
        {
            subtotal += dx[i + dx_idx] * dy[i + dy_idx];
            return subtotal;
        },
        (subtotal) => // lock free accumulator
        {
            double initialValue;
            double newValue;
            do
            {
                initialValue = ret;  // read current value
                newValue = initialValue + subtotal; //calculate new value
            }
            while (initialValue != Interlocked.CompareExchange(ref ret, newValue, initialValue));
        });
        return ret;
    }

    // Parallel dot product over two plain-array slices (1-based offsets).
    private double ddot_(long size, double[] dx, long dx_idx, double[] dy, long dy_idx)
    {
        double ret = 0.0f;
        Parallel.For(0, size, parallelOption, () => 0, (i, loop, subtotal) =>
        {
            subtotal += dx[i + dx_idx] * dy[i + dy_idx];
            return subtotal;
        },
        (subtotal) => // lock free accumulator
        {
            double initialValue;
            double newValue;
            do
            {
                initialValue = ret;  // read current value
                newValue = initialValue + subtotal; //calculate new value
            }
            while (initialValue != Interlocked.CompareExchange(ref ret, newValue, initialValue));
        });
        return ret;
    }

    // Pseudo-gradient for orthant-wise L1: at x[i]==0 the subgradient
    // closest to zero is chosen; elsewhere the L1 term contributes C*sign(x).
    void pseudo_gradient(double[] x, double C)
    {
        var size = expected.LongLength - 1;
        Parallel.For(1, size + 1, parallelOption, i =>
        {
            if (x[i] == 0)
            {
                if (expected[i] + C < 0)
                {
                    v[i] = (expected[i] + C);
                }
                else if (expected[i] - C > 0)
                {
                    v[i] = (expected[i] - C);
                }
                else
                {
                    v[i] = 0;
                }
            }
            else
            {
                v[i] = (expected[i] + C * sigma(x[i]));
            }
        });
    }

    // Sign function: +1, -1 or 0.
    double sigma(double x)
    {
        if (x > 0) return 1.0;
        else if (x < 0) return -1.0;
        return 0.0;
    }

    /// <summary>
    /// Runs one optimization step on the weight vector x (1-based).
    /// Allocates the work arrays lazily on first call.
    /// Returns iflag_: 1 to continue iterating, 0 when converged, -1 on error.
    /// </summary>
    public int optimize(double[] x, double C, bool orthant)
    {
        const long msize = 5;   // number of stored correction pairs
        var size = x.LongLength - 1;
        if (w == null || w.LongLength == 0)
        {
            iflag_ = 0;
            w = new FixedBigArray<double>(size * (2 * msize + 1) + 2 * msize, 1);
            diag = new double[size + 1];
            if (orthant == true)
            {
                xi = new double[size + 1];
                v = new double[size + 1];
            }
        }

        if (orthant == true)
        {
            pseudo_gradient(x, C);
        }
        else
        {
            v = expected;
        }

        lbfgs_optimize(msize, x, orthant, C);
        if (iflag_ < 0)
        {
            Console.WriteLine("routine stops with unexpected error");
            return -1;
        }

        return iflag_;
    }

    // Core L-BFGS loop: two-loop recursion to compute -H*g, then line search.
    void lbfgs_optimize(long msize, double[] x, bool orthant, double C)
    {
        var size = x.LongLength - 1;
        var yy = 0.0;
        var ys = 0.0;
        long bound = 0;
        long cp = 0;
        var bExit = false;

        // initialization
        if (iflag_ == 0)
        {
            point = 0;
            ispt = size + (msize << 1);      // offset of stored steps s_k in w
            iypt = ispt + size * msize;      // offset of stored gradient diffs y_k in w

            Parallel.For(1, size + 1, parallelOption, i =>
            {
                diag[i] = 1.0f;
                w[ispt + i] = -v[i];         // initial search direction: steepest descent
                w[i] = expected[i];
            });

            if (orthant == true)
            {
                Parallel.For(1, size + 1, parallelOption, i =>
                {
                    xi[i] = (x[i] != 0 ? sigma(x[i]) : sigma(-v[i]));
                });
            }

            // initial trial step length
            stp = 1.0f / Math.Sqrt(ddot_(size, v, 1, v, 1));

            ++iter;
            info = 0;
            nfev = 0;
        }

        // MAIN ITERATION LOOP
        bExit = LineSearchAndUpdateStepGradient(msize, x, orthant);
        while (bExit == false)
        {
            ++iter;
            info = 0;

            if (orthant == true)
            {
                Parallel.For(1, size + 1, parallelOption, i =>
                {
                    xi[i] = (x[i] != 0 ? sigma(x[i]) : sigma(-v[i]));
                });
            }

            if (iter > size)
            {
                bound = size;   // NOTE(review): overwritten by Math.Min below; kept from the original port
            }

            // COMPUTE -H*G USING THE FORMULA GIVEN IN: Nocedal, J. 1980,
            // "Updating quasi-Newton matrices with limited storage",
            // Mathematics of Computation, Vol.24, No.151, pp. 773-782.
            ys = ddot_(size, w, iypt + npt + 1, w, ispt + npt + 1);
            yy = ddot_(size, w, iypt + npt + 1, w, iypt + npt + 1);

            var r_ys_yy = ys / yy;   // initial Hessian scaling gamma_k
            Parallel.For(1, size + 1, parallelOption, i =>
            {
                diag[i] = r_ys_yy;
                w[i] = -v[i];
            });

            cp = point;
            if (point == 0)
            {
                cp = msize;
            }

            w[size + cp] = (1.0 / ys);   // rho_k for the newest pair
            // number of correction steps to apply
            bound = Math.Min(iter - 1, msize);
            cp = point;
            // first loop of the two-loop recursion (newest to oldest)
            for (var i = 1; i <= bound; ++i)
            {
                --cp;
                if (cp == -1) cp = msize - 1;
                var sq = ddot_(size, w, ispt + cp * size + 1, w, 1);
                var inmc = size + msize + cp + 1;
                var iycn = iypt + cp * size;
                w[inmc] = (w[size + cp + 1] * sq);
                var d = -w[inmc];

                Parallel.For(1, size + 1, parallelOption, j =>
                {
                    w[j] = (w[j] + d * w[iycn + j]);
                });
            }

            // scale by the initial Hessian approximation
            Parallel.For(1, size + 1, parallelOption, i =>
            {
                w[i] = (diag[i] * w[i]);
            });

            // second loop of the two-loop recursion (oldest to newest)
            for (var i = 1; i <= bound; ++i)
            {
                var yr = ddot_(size, w, iypt + cp * size + 1, w, 1);
                var beta = w[size + cp + 1] * yr;
                var inmc = size + msize + cp + 1;
                beta = w[inmc] - beta;
                var iscn = ispt + cp * size;

                Parallel.For(1, size + 1, parallelOption, j =>
                {
                    w[j] = (w[j] + beta * w[iscn + j]);
                });

                ++cp;
                if (cp == msize)
                {
                    cp = 0;
                }
            }

            if (orthant == true)
            {
                // orthant-wise projection: zero out direction components
                // pointing outside the current orthant
                Parallel.For(1, size + 1, parallelOption, i =>
                {
                    w[i] = (sigma(w[i]) == sigma(-v[i]) ? w[i] : 0);
                });
            }


            // STORE THE NEW SEARCH DIRECTION
            var offset = ispt + point * size;
            Parallel.For(1, size + 1, parallelOption, i =>
            {
                w[offset + i] = w[i];
                w[i] = expected[i];
            });

            stp = 1.0f;
            nfev = 0;
            bExit = LineSearchAndUpdateStepGradient(msize, x, orthant);
        }
    }

    // Runs the MCSRCH line search along the stored direction, then records
    // the new step and gradient difference. Returns true when the optimizer
    // should leave the main loop (converged, needs a new evaluation, or error).
    private bool LineSearchAndUpdateStepGradient(long msize, double[] x, bool orthant)
    {
        var size = x.LongLength - 1;
        var bExit = false;
        mcsrch_.mcsrch(x, obj, v, w, ispt + point * size,
            ref stp, ref info, ref nfev, diag);
        if (info == -1)
        {
            if (orthant == true)
            {
                // project the new point back onto its orthant
                Parallel.For(1, size + 1, parallelOption, i =>
                {
                    x[i] = (sigma(x[i]) == sigma(xi[i]) ? x[i] : 0);
                });
            }


            iflag_ = 1; // next value
            bExit = true;
        }
        else if (info != 1)
        {
            //MCSRCH error, please see error code in info
            iflag_ = -1;
            bExit = true;
        }
        else
        {
            // COMPUTE THE NEW STEP AND GRADIENT CHANGE
            npt = point * size;
            Parallel.For(1, size + 1, parallelOption, i =>
            {
                w[ispt + npt + i] = (stp * w[ispt + npt + i]);
                w[iypt + npt + i] = expected[i] - w[i];
            });

            ++point;
            if (point == msize)
            {
                point = 0;
            }

            // gradient-norm convergence test
            var gnorm = Math.Sqrt(ddot_(size, v, 1, v, 1));
            var xnorm = Math.Max(1.0, Math.Sqrt(ddot_(size, x, 1, x, 1)));
            if (gnorm / xnorm <= Utils.eps)
            {
                iflag_ = 0; // OK terminated
                bExit = true;
            }
        }

        return bExit;
    }
}
//Regularization types supported by the encoder
public enum REG_TYPE { L1, L2 };

/// <summary>
/// Encodes (trains) a CRF model from a training corpus: builds the feature
/// set, runs L-BFGS iterations and saves the encoded model.
/// </summary>
/// <param name="args">Validated training options (template, corpus, model path, hyper-parameters).</param>
/// <returns>false when validation or any save step fails; true otherwise.</returns>
public bool Learn(EncoderArgs args)
{
    //Validate hyper-parameters up front
    if (args.min_diff <= 0.0)
    {
        Logger.WriteLine(Logger.Level.err, "eta must be > 0.0");
        return false;
    }

    if (args.C < 0.0)
    {
        Logger.WriteLine(Logger.Level.err, "C must be >= 0.0");
        return false;
    }

    if (args.threads_num <= 0)
    {
        Logger.WriteLine(Logger.Level.err, "thread must be > 0");
        return false;
    }

    if (args.hugeLexMemLoad > 0)
    {
        Logger.WriteLine("Build feature lexical dictionary in huge mode[shrink when mem used rate:{0}%]", args.hugeLexMemLoad);
    }

    Logger.WriteLine("Open and check training corpus and templates...");
    var modelWriter = new ModelWriter(args.threads_num, args.C,
        args.hugeLexMemLoad, args.strRetrainModelFileName);

    if (modelWriter.Open(args.strTemplateFileName, args.strTrainingCorpus) == false)
    {
        Logger.WriteLine("Open training corpus or template file failed.");
        return false;
    }

    Logger.WriteLine("Load training data and generate lexical features: ");
    var xList = modelWriter.ReadAllRecords();

    Logger.WriteLine("");

    Logger.WriteLine("Shrinking feature set [frequency is less than {0}]...", args.min_feature_freq);
    modelWriter.Shrink(xList, args.min_feature_freq);

    Logger.WriteLine("Saving model meta data...");
    if (!modelWriter.SaveModelMetaData(args.strEncodedModelFileName))
    {
        Logger.WriteLine(Logger.Level.err, "Failed!");
        return false;
    }
    Logger.WriteLine("Success");

    Logger.WriteLine("Indexing feature set with {0} maximum slot usage rate threshold...", args.slot_usage_rate_threshold);
    if (!modelWriter.BuildFeatureSetIntoIndex(args.strEncodedModelFileName, args.slot_usage_rate_threshold, args.debugLevel))
    {
        Logger.WriteLine(Logger.Level.err, "Failed!");
        return false;
    }
    Logger.WriteLine("Success");

    Logger.WriteLine("Sentences size: " + xList.Length);
    Logger.WriteLine("Features size: " + modelWriter.feature_size());
    Logger.WriteLine("Thread(s): " + args.threads_num);
    Logger.WriteLine("Regularization type: " + args.regType.ToString());
    Logger.WriteLine("Freq: " + args.min_feature_freq);
    Logger.WriteLine("eta: " + args.min_diff);
    Logger.WriteLine("C: " + args.C);
    Logger.WriteLine("Vector quantization: " + args.bVQ);

    if (xList.Length == 0)
    {
        Logger.WriteLine(Logger.Level.err, "No sentence for training.");
        return false;
    }

    //L1 regularization is optimized orthant-wise
    var orthant = (args.regType == REG_TYPE.L1);
    if (runCRF(xList, modelWriter, orthant, args) == false)
    {
        Logger.WriteLine(Logger.Level.warn, "Some warnings are raised during encoding...");
    }

    Logger.WriteLine("Saving model feature's weight...");
    modelWriter.SaveFeatureWeight(args.strEncodedModelFileName, args.bVQ);

    return true;
}

/// <summary>
/// Runs the iterative L-BFGS training loop: multi-threaded gradient
/// computation, regularization of the objective, convergence test
/// (min_diff reached three consecutive times, or max_iter), and optional
/// intermediate weight dumps in debug mode.
/// </summary>
bool runCRF(EncoderTagger[] x, ModelWriter modelWriter, bool orthant, EncoderArgs args)
{
    var old_obj = double.MaxValue;
    var converge = 0;
    var lbfgs = new LBFGS(args.threads_num);
    //index 0 is unused; the optimizer works on 1-based arrays
    lbfgs.expected = new double[modelWriter.feature_size() + 1];

    var processList = new List<CRFEncoderThread>();
    var parallelOption = new ParallelOptions();
    parallelOption.MaxDegreeOfParallelism = args.threads_num;

    //Initialize encoding threads; each handles a strided slice of x
    for (var i = 0; i < args.threads_num; i++)
    {
        var thread = new CRFEncoderThread();
        thread.start_i = i;
        thread.thread_num = args.threads_num;
        thread.x = x;
        thread.lbfgs = lbfgs;
        thread.Init();
        processList.Add(thread);
    }

    //Statistic term and result tags frequency
    var termNum = 0;
    var yfreq = new int[modelWriter.y_.Count];
    for (int index = 0; index < x.Length; index++)
    {
        var tagger = x[index];
        termNum += tagger.word_num;
        for (var j = 0; j < tagger.word_num; j++)
        {
            yfreq[tagger.answer_[j]]++;
        }
    }

    //Iterative training
    var startDT = DateTime.Now;
    var dMinErrRecord = 1.0;
    for (var itr = 0; itr < args.max_iter; ++itr)
    {
        //Clear result container
        lbfgs.obj = 0.0f;
        lbfgs.err = 0;
        lbfgs.zeroone = 0;

        Array.Clear(lbfgs.expected, 0, lbfgs.expected.Length);

        //Run one gradient pass across all threads
        var threadList = new List<Thread>();
        for (var i = 0; i < args.threads_num; i++)
        {
            var thread = new Thread(processList[i].Run);
            thread.Start();
            threadList.Add(thread);
        }

        //Join and aggregate objective, error counts and the confusion matrix
        var merr = new int[modelWriter.y_.Count, modelWriter.y_.Count];
        for (var i = 0; i < args.threads_num; ++i)
        {
            threadList[i].Join();
            lbfgs.obj += processList[i].obj;
            lbfgs.err += processList[i].err;
            lbfgs.zeroone += processList[i].zeroone;

            //Calculate error
            for (var j = 0; j < modelWriter.y_.Count; j++)
            {
                for (var k = 0; k < modelWriter.y_.Count; k++)
                {
                    merr[j, k] += processList[i].merr[j, k];
                }
            }
        }

        long num_nonzero = 0;
        var fsize = modelWriter.feature_size();
        var alpha = modelWriter.alpha_;
        if (orthant == true)
        {
            //L1 regularization: obj += sum |alpha| / cost_factor, count non-zeros
            Parallel.For(1, fsize + 1, parallelOption, () => 0, (k, loop, subtotal) =>
            {
                subtotal += Math.Abs(alpha[k] / modelWriter.cost_factor_);
                if (alpha[k] != 0.0)
                {
                    Interlocked.Increment(ref num_nonzero);
                }
                return subtotal;
            },
            (subtotal) => // lock free accumulator
            {
                double initialValue;
                double newValue;
                do
                {
                    initialValue = lbfgs.obj;  // read current value
                    newValue = initialValue + subtotal; //calculate new value
                }
                while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
            });
        }
        else
        {
            //L2 regularization: obj += alpha^2/(2*cost_factor), gradient += alpha/cost_factor
            num_nonzero = fsize;
            Parallel.For(1, fsize + 1, parallelOption, () => 0, (k, loop, subtotal) =>
            {
                subtotal += (alpha[k] * alpha[k] / (2.0 * modelWriter.cost_factor_));
                lbfgs.expected[k] += (alpha[k] / modelWriter.cost_factor_);
                return subtotal;
            },
            (subtotal) => // lock free accumulator
            {
                double initialValue;
                double newValue;
                do
                {
                    initialValue = lbfgs.obj;  // read current value
                    newValue = initialValue + subtotal; //calculate new value
                }
                while (initialValue != Interlocked.CompareExchange(ref lbfgs.obj, newValue, initialValue));
            });
        }

        //Show each iteration result; relative objective change drives convergence
        var diff = (itr == 0 ? 1.0f : Math.Abs(old_obj - lbfgs.obj) / old_obj);
        old_obj = lbfgs.obj;

        ShowEvaluation(x.Length, modelWriter, lbfgs, termNum, itr, merr, yfreq, diff, startDT, num_nonzero, args);
        if (diff < args.min_diff)
        {
            converge++;
        }
        else
        {
            converge = 0;
        }
        //NOTE(review): "itr > args.max_iter" can never be true inside this
        //loop (condition itr < max_iter); kept for fidelity with the original.
        if (itr > args.max_iter || converge == 3)
        {
            break; // 3 is ad-hoc
        }

        if (args.debugLevel > 0 && (double)lbfgs.zeroone / (double)x.Length < dMinErrRecord)
        {
            var cc = Console.ForegroundColor;
            Console.ForegroundColor = ConsoleColor.Red;
            Console.Write("[Debug Mode] ");
            Console.ForegroundColor = cc;
            Logger.WriteLine("Save intermediate feature weights at current directory");

            //Save current best feature weight into file
            dMinErrRecord = (double)lbfgs.zeroone / (double)x.Length;
            modelWriter.SaveFeatureWeight("feature_weight_tmp", false);
        }

        //One L-BFGS step; <= 0 signals an optimizer failure
        var iret = lbfgs.optimize(alpha, modelWriter.cost_factor_, orthant);
        if (iret <= 0)
        {
            return false;
        }
    }

    return true;
}
307 | Console.ForegroundColor = ConsoleColor.Green; 308 | Console.Write("{0} ", feature_index.y_[i]); 309 | Console.ResetColor(); 310 | Console.Write("[FR={0}, TE=", yfreq[i]); 311 | Console.ForegroundColor = ConsoleColor.Yellow; 312 | Console.Write("{0:0.00}%", vet); 313 | Console.ResetColor(); 314 | Console.WriteLine("]"); 315 | 316 | var n = 0; 317 | foreach (var pair in sdict.Reverse()) 318 | { 319 | for (int index = 0; index < pair.Value.Count; index++) 320 | { 321 | var item = pair.Value[index]; 322 | n += item.Length + 1 + 7; 323 | if (n > 80) 324 | { 325 | //only show data in one line, more data in tail will not be show. 326 | break; 327 | } 328 | Console.Write("{0}:", item); 329 | Console.ForegroundColor = ConsoleColor.Red; 330 | Console.Write("{0:0.00}% ", pair.Key * 100); 331 | Console.ResetColor(); 332 | } 333 | if (n > 80) 334 | { 335 | break; 336 | } 337 | } 338 | Console.WriteLine(); 339 | } 340 | } 341 | 342 | var act_feature_rate = (double)(nonzero_feature_num) / (double)(feature_index.feature_size()) * 100.0; 343 | Logger.WriteLine("iter={0} terr={1:0.00000} serr={2:0.00000} diff={3:0.000000} fsize={4}({5:0.00}% act)", itr, 1.0 * lbfgs.err / termNum, 1.0 * lbfgs.zeroone / recordNum, diff, feature_index.feature_size(), act_feature_rate); 344 | Logger.WriteLine("Time span: {0}, Aver. 
time span per iter: {1}", ts, new TimeSpan(0, 0, (int)(ts.TotalSeconds / (itr + 1)))); 345 | } 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /CRFSharpConsole/DecoderConsole.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.IO; 6 | using CRFSharpWrapper; 7 | using CRFSharp; 8 | using System.Threading.Tasks; 9 | using System.Collections.Concurrent; 10 | using System.Diagnostics; 11 | using AdvUtils; 12 | 13 | namespace CRFSharpConsole 14 | { 15 | class DecoderConsole 16 | { 17 | public void Usage() 18 | { 19 | Console.WriteLine("CRFSharpConsole -decode "); 20 | Console.WriteLine("-modelfile : The model file used for decoding"); 21 | Console.WriteLine("-inputfile : The input file to predict its content tags"); 22 | Console.WriteLine("-outputfile : The output file to save raw tagged result"); 23 | Console.WriteLine("-outputsegfile : The output file to save segmented tagged result"); 24 | Console.WriteLine("-nbest : Output n-best result, default value is 1"); 25 | Console.WriteLine("-thread : threads used for decoding"); 26 | Console.WriteLine("-prob : output probability, default is not output"); 27 | Console.WriteLine(" 0 - not output probability"); 28 | Console.WriteLine(" 1 - only output the sequence label probability"); 29 | Console.WriteLine(" 2 - output both sequence label and individual entity probability"); 30 | Console.WriteLine("-maxword : max words per sentence, default value is 100"); 31 | Console.WriteLine("Example: "); 32 | Console.WriteLine(" CRFSharp_Console -decode -modelfile ner.model -inputfile ner_test.txt -outputfile ner_test_result.txt -outputsegfile ner_test_result_seg.txt -thread 4 -nbest 3 -prob 2 -maxword 500"); 33 | } 34 | 35 | public void Run(string[] args) 36 | { 37 | var options = new DecoderArgs(); 38 | for (var i = 0; i < 
args.Length; i++) 39 | { 40 | if (args[i][0] == '-') 41 | { 42 | var key = args[i].Substring(1).ToLower().Trim(); 43 | var value = ""; 44 | 45 | if (key == "decode") 46 | { 47 | continue; 48 | } 49 | else if (i < args.Length - 1) 50 | { 51 | i++; 52 | value = args[i]; 53 | switch (key) 54 | { 55 | case "outputfile": 56 | options.strOutputFileName = value; 57 | break; 58 | case "inputfile": 59 | options.strInputFileName = value; 60 | break; 61 | case "modelfile": 62 | options.strModelFileName = value; 63 | break; 64 | case "outputsegfile": 65 | options.strOutputSegFileName = value; 66 | break; 67 | case "thread": 68 | options.thread = int.Parse(value); 69 | break; 70 | case "nbest": 71 | options.nBest = int.Parse(value); 72 | break; 73 | case "prob": 74 | options.probLevel = int.Parse(value); 75 | break; 76 | case "maxword": 77 | options.maxword = int.Parse(value); 78 | break; 79 | 80 | default: 81 | var cc = Console.ForegroundColor; 82 | Console.ForegroundColor = ConsoleColor.Red; 83 | Console.WriteLine("No supported {0} parameter, exit", key); 84 | Console.ForegroundColor = cc; 85 | Usage(); 86 | return; 87 | } 88 | } 89 | else 90 | { 91 | var cc = Console.ForegroundColor; 92 | Console.ForegroundColor = ConsoleColor.Red; 93 | Console.WriteLine("{0} is invalidated parameter.", key); 94 | Console.ForegroundColor = cc; 95 | Usage(); 96 | return; 97 | } 98 | } 99 | } 100 | 101 | if (options.strInputFileName == null || options.strModelFileName == null) 102 | { 103 | Usage(); 104 | return; 105 | } 106 | 107 | Decode(options); 108 | } 109 | 110 | object rdLocker = new object(); 111 | 112 | bool Decode(CRFSharpWrapper.DecoderArgs options) 113 | { 114 | var parallelOption = new ParallelOptions(); 115 | var watch = Stopwatch.StartNew(); 116 | if (File.Exists(options.strInputFileName) == false) 117 | { 118 | Logger.WriteLine("FAILED: Open {0} file failed.", options.strInputFileName); 119 | return false; 120 | } 121 | 122 | if (File.Exists(options.strModelFileName) == false) 
123 | { 124 | Logger.WriteLine("FAILED: Open {0} file failed.", options.strModelFileName); 125 | return false; 126 | } 127 | 128 | var sr = new StreamReader(options.strInputFileName); 129 | StreamWriter sw = null, swSeg = null; 130 | 131 | if (options.strOutputFileName != null && options.strOutputFileName.Length > 0) 132 | { 133 | sw = new StreamWriter(options.strOutputFileName); 134 | } 135 | if (options.strOutputSegFileName != null && options.strOutputSegFileName.Length > 0) 136 | { 137 | swSeg = new StreamWriter(options.strOutputSegFileName); 138 | } 139 | 140 | //Create CRFSharp wrapper instance. It's a global instance 141 | var crfWrapper = new CRFSharpWrapper.Decoder(); 142 | 143 | //Load encoded model from file 144 | Logger.WriteLine("Loading model from {0}", options.strModelFileName); 145 | crfWrapper.LoadModel(options.strModelFileName); 146 | 147 | var queueRecords = new ConcurrentQueue>>(); 148 | var queueSegRecords = new ConcurrentQueue>>(); 149 | 150 | parallelOption.MaxDegreeOfParallelism = options.thread; 151 | Parallel.For(0, options.thread, parallelOption, t => 152 | { 153 | 154 | //Create decoder tagger instance. 
If the running environment is multi-threads, each thread needs a separated instance 155 | var tagger = crfWrapper.CreateTagger(options.nBest, options.maxword); 156 | tagger.set_vlevel(options.probLevel); 157 | 158 | //Initialize result 159 | var crf_out = new crf_seg_out[options.nBest]; 160 | for (var i = 0; i < options.nBest; i++) 161 | { 162 | crf_out[i] = new crf_seg_out(tagger.crf_max_word_num); 163 | } 164 | 165 | var inbuf = new List>(); 166 | while (true) 167 | { 168 | lock (rdLocker) 169 | { 170 | if (ReadRecord(inbuf, sr) == false) 171 | { 172 | break; 173 | } 174 | 175 | queueRecords.Enqueue(inbuf); 176 | queueSegRecords.Enqueue(inbuf); 177 | } 178 | 179 | //Call CRFSharp wrapper to predict given string's tags 180 | if (swSeg != null) 181 | { 182 | crfWrapper.Segment(crf_out, tagger, inbuf); 183 | } 184 | else 185 | { 186 | crfWrapper.Segment((crf_term_out[])crf_out, (DecoderTagger)tagger, inbuf); 187 | } 188 | 189 | List> peek = null; 190 | //Save segmented tagged result into file 191 | if (swSeg != null) 192 | { 193 | var rstList = ConvertCRFTermOutToStringList(inbuf, crf_out); 194 | while (peek != inbuf) 195 | { 196 | queueSegRecords.TryPeek(out peek); 197 | } 198 | for (int index = 0; index < rstList.Count; index++) 199 | { 200 | var item = rstList[index]; 201 | swSeg.WriteLine(item); 202 | } 203 | queueSegRecords.TryDequeue(out peek); 204 | peek = null; 205 | } 206 | 207 | //Save raw tagged result (with probability) into file 208 | if (sw != null) 209 | { 210 | while (peek != inbuf) 211 | { 212 | queueRecords.TryPeek(out peek); 213 | } 214 | OutputRawResultToFile(inbuf, crf_out, tagger, sw); 215 | queueRecords.TryDequeue(out peek); 216 | 217 | } 218 | } 219 | }); 220 | 221 | 222 | sr.Close(); 223 | 224 | if (sw != null) 225 | { 226 | sw.Close(); 227 | } 228 | if (swSeg != null) 229 | { 230 | swSeg.Close(); 231 | } 232 | watch.Stop(); 233 | Logger.WriteLine("Elapsed: {0} ms", watch.ElapsedMilliseconds); 234 | 235 | return true; 236 | } 237 | 238 | 
private bool ReadRecord(List> inbuf, StreamReader sr) 239 | { 240 | inbuf.Clear(); 241 | 242 | while (true) 243 | { 244 | var strLine = sr.ReadLine(); 245 | if (strLine == null) 246 | { 247 | //At the end of current file 248 | if (inbuf.Count == 0) 249 | { 250 | return false; 251 | } 252 | else 253 | { 254 | return true; 255 | } 256 | } 257 | strLine = strLine.Trim(); 258 | if (strLine.Length == 0) 259 | { 260 | return true; 261 | } 262 | 263 | //Read feature set for each record 264 | var items = strLine.Split(new char[] { '\t' }); 265 | inbuf.Add(new List()); 266 | for (int index = 0; index < items.Length; index++) 267 | { 268 | var item = items[index]; 269 | inbuf[inbuf.Count - 1].Add(item); 270 | } 271 | } 272 | } 273 | 274 | //Output raw result with probability 275 | private void OutputRawResultToFile(List> inbuf, crf_term_out[] crf_out, SegDecoderTagger tagger, StreamWriter sw) 276 | { 277 | for (var k = 0; k < crf_out.Length; k++) 278 | { 279 | if (crf_out[k] == null) 280 | { 281 | //No more result 282 | break; 283 | } 284 | 285 | var sb = new StringBuilder(); 286 | 287 | var crf_seg_out = crf_out[k]; 288 | //Show the entire sequence probability 289 | //For each token 290 | for (var i = 0; i < inbuf.Count; i++) 291 | { 292 | //Show all features 293 | for (var j = 0; j < inbuf[i].Count; j++) 294 | { 295 | sb.Append(inbuf[i][j]); 296 | sb.Append("\t"); 297 | } 298 | 299 | //Show the best result and its probability 300 | sb.Append(crf_seg_out.result_[i]); 301 | 302 | if (tagger.vlevel_ > 1) 303 | { 304 | sb.Append("\t"); 305 | sb.Append(crf_seg_out.weight_[i]); 306 | 307 | //Show the probability of all tags 308 | sb.Append("\t"); 309 | for (var j = 0; j < tagger.ysize_; j++) 310 | { 311 | sb.Append(tagger.yname(j)); 312 | sb.Append("/"); 313 | sb.Append(tagger.prob(i, j)); 314 | 315 | if (j < tagger.ysize_ - 1) 316 | { 317 | sb.Append("\t"); 318 | } 319 | } 320 | } 321 | sb.AppendLine(); 322 | } 323 | if (tagger.vlevel_ > 0) 324 | { 325 | sw.WriteLine("#{0}", 
crf_seg_out.prob); 326 | } 327 | sw.WriteLine(sb.ToString().Trim()); 328 | sw.WriteLine(); 329 | } 330 | } 331 | 332 | //Convert CRFSharp output format to string list 333 | private List ConvertCRFTermOutToStringList(List> inbuf, crf_seg_out[] crf_out) 334 | { 335 | var sb = new StringBuilder(); 336 | for (var i = 0; i < inbuf.Count; i++) 337 | { 338 | sb.Append(inbuf[i][0]); 339 | } 340 | 341 | var strText = sb.ToString(); 342 | var rstList = new List(); 343 | for (var i = 0; i < crf_out.Length; i++) 344 | { 345 | if (crf_out[i] == null) 346 | { 347 | //No more result 348 | break; 349 | } 350 | 351 | sb.Clear(); 352 | var crf_term_out = crf_out[i]; 353 | for (var j = 0; j < crf_term_out.Count; j++) 354 | { 355 | var str = strText.Substring(crf_term_out.tokenList[j].offset, crf_term_out.tokenList[j].length); 356 | var strNE = crf_term_out.tokenList[j].strTag; 357 | 358 | sb.Append(str); 359 | if (strNE.Length > 0) 360 | { 361 | sb.Append("[" + strNE + "]"); 362 | } 363 | sb.Append(" "); 364 | } 365 | rstList.Add(sb.ToString().Trim()); 366 | } 367 | 368 | return rstList; 369 | } 370 | } 371 | } 372 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/ModelWriter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Threading; 4 | using System.IO; 5 | using System.Text; 6 | using AdvUtils; 7 | using System.Threading.Tasks; 8 | using CRFSharp.decoder; 9 | 10 | namespace CRFSharp 11 | { 12 | public class ModelWriter : BaseModel 13 | { 14 | private readonly string modelFileName; 15 | 16 | private readonly Pool _buildersPool = 17 | new Pool(p => new StringBuilder(100), b => b.Clear()); 18 | 19 | 20 | int thread_num_; 21 | public IFeatureLexicalDict featureLexicalDict; 22 | List>> trainCorpusList; 23 | ParallelOptions parallelOption = new ParallelOptions(); 24 | 25 | public ModelWriter(int thread_num, double 
cost_factor, 26 | uint hugeLexShrinkMemLoad, string modelFileName) 27 | { 28 | cost_factor_ = cost_factor; 29 | maxid_ = 0; 30 | thread_num_ = thread_num; 31 | this.modelFileName = modelFileName; 32 | parallelOption.MaxDegreeOfParallelism = thread_num; 33 | 34 | if (hugeLexShrinkMemLoad > 0) 35 | { 36 | featureLexicalDict = new HugeFeatureLexicalDict(thread_num_, hugeLexShrinkMemLoad); 37 | } 38 | else 39 | { 40 | featureLexicalDict = new DefaultFeatureLexicalDict(thread_num_); 41 | } 42 | } 43 | 44 | //Regenerate feature id and shrink features with lower frequency 45 | public void Shrink(EncoderTagger[] xList, int freq) 46 | { 47 | var old2new = new BTreeDictionary(); 48 | featureLexicalDict.Shrink(freq); 49 | maxid_ = featureLexicalDict.RegenerateFeatureId(old2new, y_.Count); 50 | var feature_count = xList.Length; 51 | 52 | //Update feature ids 53 | Parallel.For(0, feature_count, parallelOption, i => 54 | { 55 | for (var j = 0; j < xList[i].feature_cache_.Count; j++) 56 | { 57 | var newfs = new List(); 58 | long rstValue = 0; 59 | for (int index = 0; index < xList[i].feature_cache_[j].Length; index++) 60 | { 61 | var v = xList[i].feature_cache_[j][index]; 62 | if (old2new.TryGetValue(v, out rstValue) == true) 63 | { 64 | newfs.Add(rstValue); 65 | } 66 | } 67 | xList[i].feature_cache_[j] = newfs.ToArray(); 68 | } 69 | }); 70 | 71 | Logger.WriteLine("Feature size in total : {0}", maxid_); 72 | } 73 | 74 | // Load all records and generate features 75 | public EncoderTagger[] ReadAllRecords() 76 | { 77 | var arrayEncoderTagger = new EncoderTagger[trainCorpusList.Count]; 78 | var arrayEncoderTaggerSize = 0; 79 | 80 | //Generate each record features 81 | Parallel.For(0, trainCorpusList.Count, parallelOption, i => 82 | { 83 | var _x = new EncoderTagger(this); 84 | if (_x.GenerateFeature(trainCorpusList[i]) == false) 85 | { 86 | Logger.WriteLine("Load a training sentence failed, skip it."); 87 | } 88 | else 89 | { 90 | var oldValue = Interlocked.Increment(ref 
arrayEncoderTaggerSize) - 1; 91 | arrayEncoderTagger[oldValue] = _x; 92 | 93 | if (oldValue % 10000 == 0) 94 | { 95 | //Show current progress on console 96 | Console.Write("{0}...", oldValue); 97 | } 98 | } 99 | }); 100 | 101 | trainCorpusList.Clear(); 102 | trainCorpusList = null; 103 | 104 | Console.WriteLine(); 105 | return arrayEncoderTagger; 106 | } 107 | 108 | //Open and check training and template file 109 | public bool Open(string strTemplateFileName, string strTrainCorpusFileName) 110 | { 111 | return OpenTemplateFile(strTemplateFileName) && OpenTrainCorpusFile(strTrainCorpusFileName); 112 | } 113 | 114 | //Build feature set into indexed data 115 | public bool BuildFeatureSetIntoIndex(string filename, double max_slot_usage_rate_threshold, int debugLevel) 116 | { 117 | Logger.WriteLine("Building {0} features into index...", featureLexicalDict.Size); 118 | 119 | IList keyList; 120 | IList valList; 121 | featureLexicalDict.GenerateLexicalIdList(out keyList, out valList); 122 | 123 | if (debugLevel > 0) 124 | { 125 | Logger.WriteLine("Debug: Write raw feature set into file"); 126 | var filename_featureset_raw_format = filename + ".feature.raw_text"; 127 | var sw = new StreamWriter(filename_featureset_raw_format); 128 | // save feature and its id into lists in raw format 129 | for (var i = 0; i < keyList.Count; i++) 130 | { 131 | sw.WriteLine("{0}\t{1}", keyList[i], valList[i]); 132 | } 133 | sw.Close(); 134 | } 135 | 136 | //Build feature index 137 | var filename_featureset = filename + ".feature"; 138 | var da = new DoubleArrayTrieBuilder(thread_num_); 139 | if (da.build(keyList, valList, max_slot_usage_rate_threshold) == false) 140 | { 141 | Logger.WriteLine("Build lexical dictionary failed."); 142 | return false; 143 | } 144 | //Save indexed feature set into file 145 | da.save(filename_featureset); 146 | 147 | if (string.IsNullOrWhiteSpace(modelFileName)) 148 | { 149 | //Clean up all data 150 | featureLexicalDict.Clear(); 151 | featureLexicalDict = null; 
152 | keyList = null; 153 | valList = null; 154 | 155 | GC.Collect(); 156 | 157 | //Create weight matrix 158 | alpha_ = new double[feature_size() + 1]; 159 | } 160 | else 161 | { 162 | Logger.WriteLine(""); 163 | Logger.WriteLine("Loading the existed model for re-training..."); 164 | //Create weight matrix 165 | alpha_ = new double[feature_size() + 1]; 166 | var modelReader = new ModelReader(this.modelFileName); 167 | modelReader.LoadModel(); 168 | 169 | if (modelReader.y_.Count == y_.Count) 170 | { 171 | for (var i = 0; i < keyList.Count; i++) 172 | { 173 | var index = modelReader.get_id(keyList[i]); 174 | if (index < 0) 175 | { 176 | continue; 177 | } 178 | var size = (keyList[i][0] == 'U' ? y_.Count : y_.Count * y_.Count); 179 | for (var j = 0; j < size; j++) 180 | { 181 | alpha_[valList[i] + j + 1] = modelReader.GetAlpha(index + j); 182 | } 183 | } 184 | } 185 | else 186 | { 187 | Logger.WriteLine("The number of tags isn't equal between two models, it cannot be re-trained."); 188 | } 189 | 190 | //Clean up all data 191 | featureLexicalDict.Clear(); 192 | featureLexicalDict = null; 193 | keyList = null; 194 | valList = null; 195 | 196 | GC.Collect(); 197 | } 198 | 199 | return true; 200 | } 201 | 202 | //Save model meta data into file 203 | public bool SaveModelMetaData(string filename) 204 | { 205 | var tofs = new StreamWriter(filename); 206 | 207 | // header 208 | tofs.WriteLine("version: " + Utils.MODEL_TYPE_NORM); 209 | tofs.WriteLine("cost-factor: " + cost_factor_); 210 | tofs.WriteLine("maxid: " + maxid_); 211 | tofs.WriteLine("xsize: " + xsize_); 212 | 213 | tofs.WriteLine(); 214 | 215 | // y 216 | for (var i = 0; i < y_.Count; ++i) 217 | { 218 | tofs.WriteLine(y_[i]); 219 | } 220 | tofs.WriteLine(); 221 | 222 | // template 223 | for (var i = 0; i < unigram_templs_.Count; ++i) 224 | { 225 | tofs.WriteLine(unigram_templs_[i]); 226 | } 227 | for (var i = 0; i < bigram_templs_.Count; ++i) 228 | { 229 | tofs.WriteLine(bigram_templs_[i]); 230 | } 231 | 232 | 
tofs.Close(); 233 | 234 | return true; 235 | } 236 | 237 | /// 238 | /// Save feature weights into file 239 | /// 240 | /// 241 | /// 242 | /// 243 | public void SaveFeatureWeight(string filename, bool bVQ) 244 | { 245 | var filename_alpha = filename + ".alpha"; 246 | var tofs = new StreamWriter(filename_alpha, false); 247 | var bw = new BinaryWriter(tofs.BaseStream); 248 | 249 | if (bVQ == true) 250 | { 251 | Logger.WriteLine("Save feature weights into a VQ model: {0}", filename_alpha); 252 | //Build code book 253 | VectorQuantization vq = new VectorQuantization(); 254 | for (long i = 1; i <= maxid_; i++) 255 | { 256 | vq.Add(alpha_[i]); 257 | } 258 | 259 | int vqSize = 256; 260 | double distortion = vq.BuildCodebook(vqSize); 261 | Logger.WriteLine("Weight vector quantization distortion: {0}", distortion); 262 | 263 | //VQ size 264 | bw.Write(vqSize); 265 | 266 | //Save VQ codebook into file 267 | for (int j = 0; j < vqSize; j++) 268 | { 269 | bw.Write(vq.CodeBook[j]); 270 | } 271 | 272 | //Save weights 273 | for (long i = 1; i <= maxid_; ++i) 274 | { 275 | bw.Write((byte)vq.ComputeVQ(alpha_[i])); 276 | } 277 | } 278 | else 279 | { 280 | Logger.WriteLine("Save feature weights into a normal model: {0}", filename_alpha); 281 | 282 | bw.Write(0); 283 | //Save weights 284 | for (long i = 1; i <= maxid_; ++i) 285 | { 286 | bw.Write((float)alpha_[i]); 287 | } 288 | } 289 | 290 | bw.Close(); 291 | } 292 | 293 | bool OpenTemplateFile(string filename) 294 | { 295 | var ifs = new StreamReader(filename); 296 | unigram_templs_ = new List(); 297 | bigram_templs_ = new List(); 298 | while (ifs.EndOfStream == false) 299 | { 300 | var line = ifs.ReadLine(); 301 | if (line.Length == 0 || line[0] == '#') 302 | { 303 | continue; 304 | } 305 | if (line[0] == 'U') 306 | { 307 | unigram_templs_.Add(line); 308 | } 309 | else if (line[0] == 'B') 310 | { 311 | bigram_templs_.Add(line); 312 | } 313 | else 314 | { 315 | Logger.WriteLine(Logger.Level.warn, "unknown type: {0}", line); 316 | } 
317 | } 318 | ifs.Close(); 319 | return true; 320 | } 321 | 322 | bool OpenTrainCorpusFile(string strTrainingCorpusFileName) 323 | { 324 | var ifs = new StreamReader(strTrainingCorpusFileName); 325 | y_ = new List(); 326 | trainCorpusList = new List>>(); 327 | var hashCand = new HashSet(); 328 | var recordList = new List>(); 329 | 330 | var last_xsize = -1; 331 | while (ifs.EndOfStream == false) 332 | { 333 | var line = ifs.ReadLine(); 334 | if (line.Length == 0 || line[0] == ' ' || line[0] == '\t') 335 | { 336 | //Current record is finished, save it into the list 337 | if (recordList.Count > 0) 338 | { 339 | trainCorpusList.Add(recordList); 340 | recordList = new List>(); 341 | } 342 | continue; 343 | } 344 | 345 | var items = line.Split('\t'); 346 | var size = items.Length; 347 | if (last_xsize >= 0 && last_xsize != size) 348 | { 349 | return false; 350 | } 351 | last_xsize = size; 352 | xsize_ = (uint)(size - 1); 353 | recordList.Add(new List(items)); 354 | 355 | if (hashCand.Contains(items[items.Length - 1]) == false) 356 | { 357 | hashCand.Add(items[items.Length - 1]); 358 | y_.Add(items[items.Length - 1]); 359 | } 360 | } 361 | ifs.Close(); 362 | 363 | Logger.WriteLine("Training corpus size: {0}", trainCorpusList.Count); 364 | return true; 365 | } 366 | 367 | //Get feature id from feature set by feature string 368 | //If feature string is not existed in the set, generate a new id and return it 369 | public bool BuildFeatures(EncoderTagger tagger) 370 | { 371 | var feature = new List(); 372 | using (var v = _buildersPool.GetOrCreate()) 373 | { 374 | var localBuilder = v.Item; 375 | //tagger.feature_id_ = tagger.feature_cache_.Count; 376 | for (var cur = 0; cur < tagger.word_num; ++cur) 377 | { 378 | for (int index = 0; index < unigram_templs_.Count; index++) 379 | { 380 | var it = unigram_templs_[index]; 381 | var strFeature = apply_rule(it, cur, localBuilder, tagger); 382 | if (strFeature == null) 383 | { 384 | Logger.WriteLine(Logger.Level.err, " format 
error: " + it); 385 | } 386 | else 387 | { 388 | var id = featureLexicalDict.GetOrAddId(strFeature.ToString()); 389 | feature.Add(id); 390 | } 391 | } 392 | tagger.feature_cache_.Add(feature.ToArray()); 393 | feature.Clear(); 394 | } 395 | 396 | for (var cur = 1; cur < tagger.word_num; ++cur) 397 | { 398 | for (int index = 0; index < bigram_templs_.Count; index++) 399 | { 400 | var it = bigram_templs_[index]; 401 | var strFeature = apply_rule(it, cur, localBuilder, tagger); 402 | if (strFeature == null) 403 | { 404 | Logger.WriteLine(Logger.Level.err, " format error: " + it); 405 | } 406 | else 407 | { 408 | var id = featureLexicalDict.GetOrAddId(strFeature.ToString()); 409 | feature.Add(id); 410 | } 411 | } 412 | 413 | tagger.feature_cache_.Add(feature.ToArray()); 414 | feature.Clear(); 415 | 416 | } 417 | 418 | } 419 | 420 | return true; 421 | } 422 | 423 | } 424 | } 425 | -------------------------------------------------------------------------------- /Core/CRFSharp/encoder/Mcsrch.cs: -------------------------------------------------------------------------------- 1 | using AdvUtils; 2 | using System; 3 | using System.Threading; 4 | using System.Threading.Tasks; 5 | 6 | namespace CRFSharp 7 | { 8 | class Mcsrch 9 | { 10 | private int infoc; 11 | private bool stage1, brackt; 12 | private double dginit; 13 | private double width, width1; 14 | private double fx, dgx, fy, dgy; 15 | private double finit; 16 | private double dgtest; 17 | private double stx, sty; 18 | private double stmin, stmax; 19 | 20 | private ParallelOptions parallelOption; 21 | 22 | public Mcsrch(int thread_num) 23 | { 24 | infoc = 0; 25 | stage1 = false; 26 | brackt = false; 27 | finit = 0.0; 28 | dginit = 0.0; 29 | dgtest = 0.0; 30 | width = 0.0; 31 | width1 = 0.0; 32 | stx = 0.0; 33 | fx = 0.0; 34 | dgx = 0.0; 35 | sty = 0.0; 36 | fy = 0.0; 37 | dgy = 0.0; 38 | stmin = 0.0; 39 | stmax = 0.0; 40 | 41 | parallelOption = new ParallelOptions(); 42 | parallelOption.MaxDegreeOfParallelism = 
thread_num;
        }



        /// <summary>
        /// MCSTEP (Moré-Thuente / MINPACK): computes a safeguarded trial step and
        /// updates the interval of uncertainty [stx, sty] for the line search.
        /// stx/fx/dx and sty/fy/dy are the endpoints with their function values and
        /// directional derivatives; stp/fp/dp is the current trial step.
        /// brackt becomes true once a minimizer has been bracketed; info reports
        /// which of the four interpolation cases was taken (0 = invalid input).
        /// NOTE(review): the case structure mirrors the original Fortran MCSTEP;
        /// statement order matters for the floating point results - do not reorder.
        /// </summary>
        void mcstep(ref double stx, ref double fx, ref double dx,
            ref double sty, ref double fy, ref double dy,
            ref double stp, double fp, double dp,
            ref bool brackt,
            double stpmin, double stpmax,
            ref int info)
        {
            var bound = true;
            double p, q, d3, r, stpq, stpc, stpf;
            double gamma;
            double s;
            double d1, d2;
            double theta;
            info = 0;

            // Reject invalid input: a bracketed step outside the interval, a
            // non-descent direction at stx, or an empty [stpmin, stpmax] range.
            if (brackt == true && ((stp <= Math.Min(stx, sty) || stp >= Math.Max(stx, sty)) ||
                dx * (stp - stx) >= 0.0 || stpmax < stpmin))
            {
                return;
            }

            // Sign of dp relative to dx (do the derivatives agree in direction?).
            var sgnd = dp * (dx / Math.Abs(dx));
            if (fp > fx)
            {
                // Case 1: higher function value - minimum is bracketed. Take the
                // closer of the cubic and quadratic steps (biased toward cubic).
                info = 1;
                bound = true;
                theta = (fx - fp) * 3 / (stp - stx) + dx + dp;
                d1 = Math.Abs(theta);
                d2 = Math.Abs(dx);
                d1 = Math.Max(d1, d2);
                d2 = Math.Abs(dp);
                s = Math.Max(d1, d2);
                // s scales the cubic terms to avoid overflow in the sqrt.
                d1 = theta / s;
                gamma = s * Math.Sqrt(d1 * d1 - dx / s * (dp / s));
                if (stp < stx)
                {
                    gamma = -gamma;
                }
                p = gamma - dx + theta;
                q = gamma - dx + gamma + dp;
                r = p / q;
                stpc = stx + r * (stp - stx);
                stpq = stx + dx / ((fx - fp) /
                    (stp - stx) + dx) / 2 * (stp - stx);
                d1 = stpc - stx;
                d2 = stpq - stx;
                if (Math.Abs(d1) < Math.Abs(d2))
                {
                    stpf = stpc;
                }
                else
                {
                    stpf = stpc + (stpq - stpc) / 2;
                }
                brackt = true;
            }
            else if (sgnd < 0.0)
            {
                // Case 2: lower value but derivative changed sign - minimum is
                // bracketed; take the farther of the cubic and secant steps.
                info = 2;
                bound = false;
                theta = (fx - fp) * 3 / (stp - stx) + dx + dp;
                d1 = Math.Abs(theta);
                d2 = Math.Abs(dx);
                d1 = Math.Max(d1, d2);
                d2 = Math.Abs(dp);
                s = Math.Max(d1, d2);
                d1 = theta / s;
                gamma = s * Math.Sqrt(d1 * d1 - dx / s * (dp / s));
                if (stp > stx)
                {
                    gamma = -gamma;
                }
                p = gamma - dp + theta;
                q = gamma - dp + gamma + dx;
                r = p / q;
                stpc = stp + r * (stx - stp);
                stpq = stp + dp / (dp - dx) * (stx - stp);

                d1 = stpc - stp;
                d2 = stpq - stp;
                if (Math.Abs(d1) > Math.Abs(d2))
                {
                    stpf = stpc;
                }
                else
                {
                    stpf = stpq;
                }
                brackt = true;
            }
            else if (Math.Abs(dp) < Math.Abs(dx))
            {
                // Case 3: lower value, same derivative sign, magnitude decreasing.
                // The cubic may have no minimizer ahead; clamp to the bounds.
                info = 3;
                bound = true;
                theta = (fx - fp) * 3 / (stp - stx) + dx + dp;
                d1 = Math.Abs(theta);
                d2 = Math.Abs(dx);
                d1 = Math.Max(d1, d2);
                d2 = Math.Abs(dp);
                s = Math.Max(d1, d2);
                d3 = theta / s;
                d1 = 0.0f;
                d2 = d3 * d3 - dx / s * (dp / s);
                // Max with zero guards the sqrt against a tiny negative discriminant.
                gamma = s * Math.Sqrt((Math.Max(d1, d2)));
                if (stp > stx)
                {
                    gamma = -gamma;
                }
                p = gamma - dp + theta;
                q = gamma + (dx - dp) + gamma;
                r = p / q;
                if (r < 0.0 && gamma != 0.0)
                {
                    stpc = stp + r * (stx - stp);
                }
                else if (stp > stx)
                {
                    stpc = stpmax;
                }
                else
                {
                    stpc = stpmin;
                }
                stpq = stp + dp / (dp - dx) * (stx - stp);
                if (brackt == true)
                {
                    // Bracketed: take the step closer to stp.
                    d1 = stp - stpc;
                    d2 = stp - stpq;
                    if (Math.Abs(d1) < Math.Abs(d2))
                    {
                        stpf = stpc;
                    }
                    else
                    {
                        stpf = stpq;
                    }
                }
                else
                {
                    // Not bracketed: take the step farther from stp.
                    d1 = stp - stpc;
                    d2 = stp - stpq;
                    if (Math.Abs(d1) > Math.Abs(d2))
                    {
                        stpf = stpc;
                    }
                    else
                    {
                        stpf = stpq;
                    }
                }
            }
            else
            {
                // Case 4: lower value, same sign, derivative magnitude not
                // decreasing - extrapolate toward the far endpoint or the bounds.
                info = 4;
                bound = false;
                if (brackt == true)
                {
                    theta = (fp - fy) * 3 / (sty - stp) + dy + dp;
                    d1 = Math.Abs(theta);
                    d2 = Math.Abs(dy);
                    d1 = Math.Max(d1, d2);
                    d2 = Math.Abs(dp);
                    s = Math.Max(d1, d2);
                    d1 = theta / s;
                    gamma = s * Math.Sqrt(d1 * d1 - dy / s * (dp / s));
                    if (stp > sty)
                    {
                        gamma = -gamma;
                    }
                    p = gamma - dp + theta;
                    q = gamma - dp + gamma + dy;
                    r = p / q;
                    stpc = stp + r * (sty - stp);
                    stpf = stpc;
                }
                else if (stp > stx)
                {
                    stpf = stpmax;
                }
                else
                {
                    stpf = stpmin;
                }
            }

            // Update the interval of uncertainty with the old trial point.
            if (fp > fx)
            {
                sty = stp;
                fy = fp;
                dy = dp;
            }
            else
            {
                if (sgnd < 0.0)
                {
                    sty = stx;
                    fy = fx;
                    dy = dx;
                }
                stx = stp;
                fx = fp;
                dx = dp;
            }

            // Clamp the new step to [stpmin, stpmax] and, when bracketed with an
            // interpolated (bounded) step, keep it inside 66% of the interval.
            stpf = Math.Min(stpmax, stpf);
            stpf = Math.Max(stpmin, stpf);
            stp = stpf;
            if (brackt == true && bound)
            {
                if (sty > stx)
                {
                    d1 = stx + (sty - stx) * 0.66;
                    stp = Math.Min(d1, stp);
                }
                else
                {
                    d1 = stx + (sty - stx) * 0.66;
                    stp = Math.Max(d1, stp);
                }
            }

            return;
        }



        // Line search tolerances and limits (names follow the original LB3/MCSRCH
        // Fortran common block).
        const double lb3_1_gtol = 0.9;      // curvature condition tolerance
        const double xtol = 1e-16;          // relative interval-width tolerance
        const double lb3_1_stpmin = 1e-20;  // minimum allowed step
        const double lb3_1_stpmax = 1e20;   // maximum allowed step
        const double ftol = 1e-4;           // sufficient decrease tolerance
        const double p5 = 0.5;
        const double p66 = 0.66;
        const double xtrapf = 4.0;          // extrapolation factor when not bracketed
        const int maxfev = 20;              // max function evaluations per search

        /// <summary>
        /// Parallel dot product of dx[dx_idx..dx_idx+size) with dy[dy_idx..dy_idx+size),
        /// accumulated lock-free via compare-and-swap.
        /// NOTE(review): parallel reduction order is nondeterministic, so results can
        /// differ in the last bits between runs.
        /// </summary>
        private double ddot_(long size, double[] dx, long dx_idx, FixedBigArray<double> dy, long dy_idx)
        {
            double ret = 0.0f;
            Parallel.For(0, size, parallelOption, () => 0, (i, loop, subtotal) =>
            {
                subtotal += dx[i + dx_idx] * dy[i + dy_idx];
                return subtotal;
            },
            (subtotal) => // lock free accumulator
            {
                double initialValue;
                double newValue;
                do
                {
                    initialValue = ret; // read current value
                    newValue = initialValue + subtotal; //calculate new value
                }
                while (initialValue != Interlocked.CompareExchange(ref ret, newValue, initialValue));
            });
            return ret;
        }

        /// <summary>
        /// MCSRCH (Moré-Thuente): one re-entrant round of the line search along
        /// direction s (starting at s_idx) for point x with value f and gradient g.
        /// Reverse-communication protocol: on info == -1 the caller must have
        /// evaluated f and g at the trial x; any other info starts a new search.
        /// On return, info == -1 requests another evaluation at the updated x;
        /// info == 1 means converged; 2..6 are warning/termination codes.
        /// Arrays are 1-based (index 0 unused); wa is scratch holding the start x.
        /// </summary>
        public void mcsrch(double[] x, double f, double[] g, FixedBigArray<double> s, long s_idx,
            ref double stp, ref long info, ref long nfev, double[] wa)
        {
            var size = x.LongLength - 1;
            /* Parameter adjustments */
            if (info == -1)
            {
                // Continuation: f and g were just evaluated at the trial point.
                info = 0;
                nfev++;

                // Directional derivative at the trial step.
                var dg = ddot_(size, g, 1, s, s_idx + 1);
                var ftest1 = finit + stp * dgtest;

                // Convergence / failure checks (info codes follow MINPACK).
                if (brackt && ((stp <= stmin || stp >= stmax) || infoc == 0))
                {
                    info = 6;
                    Console.WriteLine("MCSRCH warning: Rounding errors prevent further progress.There may not be a step which satisfies the sufficient decrease and curvature conditions. Tolerances may be too small.");
                    Console.WriteLine("bracket: {0}, stp:{1}, stmin:{2}, stmax:{3}, infoc:{4}", brackt, stp, stmin, stmax, infoc);
                }
                if (stp == lb3_1_stpmax && f <= ftest1 && dg <= dgtest)
                {
                    info = 5;
                    Console.WriteLine("MCSRCH warning: The step is too large.");
                }
                if (stp == lb3_1_stpmin && (f > ftest1 || dg >= dgtest))
                {
                    info = 4;
                    Console.WriteLine("MCSRCH warning: The step is too small.");
                    Console.WriteLine("stp:{0}, lb3_1_stpmin:{1}, f:{2}, ftest1:{3}, dg:{4}, dgtest:{5}", stp, lb3_1_stpmin, f, ftest1, dg, dgtest);
                }
                if (nfev >= maxfev)
                {
                    info = 3;
                    Console.WriteLine("MCSRCH warning: More than {0} function evaluations were required at the present iteration.", maxfev);
                }
                if (brackt && stmax - stmin <= xtol * stmax)
                {
                    info = 2;
                    Console.WriteLine("MCSRCH warning: Relative width of the interval of uncertainty is at most xtol.");
                }
                // Strong Wolfe conditions satisfied: success.
                if (f <= ftest1 && Math.Abs(dg) <= lb3_1_gtol * (-dginit))
                {
                    info = 1;
                }

                if (info != 0)
                {
                    return;
                }

                // Leave stage 1 once the sufficient-decrease test holds and the
                // derivative is no longer strongly negative.
                if (stage1 && f <= ftest1 && dg >= Math.Min(ftol, lb3_1_gtol) * dginit)
                {
                    stage1 = false;
                }

                if (stage1 && f <= fx && f > ftest1)
                {
                    // Use the modified function/derivative values (auxiliary
                    // function psi) to force the step toward sufficient decrease.
                    var fm = f - stp * dgtest;
                    var fxm = fx - stx * dgtest;
                    var fym = fy - sty * dgtest;
                    var dgm = dg - dgtest;
                    var dgxm = dgx - dgtest;
                    var dgym = dgy - dgtest;
                    mcstep(ref stx, ref fxm, ref dgxm, ref sty, ref fym, ref dgym, ref stp, fm, dgm, ref brackt,
                        stmin, stmax, ref infoc);
                    // Map the modified values back to the true function.
                    fx = fxm + stx * dgtest;
                    fy = fym + sty * dgtest;
                    dgx = dgxm + dgtest;
                    dgy = dgym + dgtest;
                }
                else
                {
                    mcstep(ref stx, ref fx, ref dgx, ref sty, ref fy, ref dgy, ref stp, f, dg, ref brackt,
                        stmin, stmax, ref infoc);
                }

                if (brackt)
                {
                    // Force the interval width to shrink; bisect if it does not
                    // decrease fast enough.
                    var d1 = 0.0;
                    d1 = sty - stx;
                    if (Math.Abs(d1) >= p66 * width1)
                    {
                        stp = stx + p5 * (sty - stx);
                    }
                    width1 = width;
                    d1 = sty - stx;
                    width = Math.Abs(d1);
                }
            }
            else
            {
                // First call of a new search: initialize state.
                infoc = 1;
                if (size <= 0 || stp <= 0.0)
                {
                    return;
                }

                // Initial directional derivative; must be a descent direction.
                dginit = ddot_(size, g, 1, s, s_idx + 1);
                if (dginit >= 0.0)
                {
                    return;
                }

                brackt = false;
                stage1 = true;
                nfev = 0;
                finit = f;
                dgtest = ftol * dginit;
                width = lb3_1_stpmax - lb3_1_stpmin;
                width1 = width / p5;

                // Save the starting point so trial points can be recomputed.
                Parallel.For(1, size + 1, parallelOption, i =>
                {
                    wa[i] = x[i];
                }
                );

                stx = 0.0;
                fx = finit;
                dgx = dginit;
                sty = 0.0;
                fy = finit;
                dgy = dginit;
            }

            // Set the bounds for the step: the bracket when available, otherwise
            // an extrapolated range ahead of the current step.
            if (brackt)
            {
                stmin = Math.Min(stx, sty);
                stmax = Math.Max(stx, sty);
            }
            else
            {
                stmin = stx;
                stmax = stp + xtrapf * (stp - stx);
            }

            stp = Math.Max(stp, lb3_1_stpmin);
            stp = Math.Min(stp, lb3_1_stpmax);

            // If no further progress is possible, fall back to the best step so far.
            if ((brackt && ((stp <= stmin || stp >= stmax) ||
                nfev >= maxfev - 1 || infoc == 0)) ||
                (brackt && (stmax - stmin <= xtol * stmax)))
            {
                stp = stx;
            }

            // Move x to the trial point x = wa + stp * s for the next evaluation.
            var stp_t = stp;
            Parallel.For(1, size + 1, parallelOption, i =>
            {
                x[i] = (wa[i] + stp_t * s[s_idx + i]);
            });

            // Request evaluation of f and g at the new x from the caller.
            info = -1;
        }

    }
}
-------------------------------------------------------------------------------- 1 | /**********************************************/ 2 | /*Project: CRF# */ 3 | /*Author: Zhongkai Fu */ 4 | /*Email: fuzhongkai@gmail.com */ 5 | /**********************************************/ 6 | 7 | using System; 8 | using System.Collections.Generic; 9 | using System.Runtime.CompilerServices; 10 | using System.Text; 11 | 12 | namespace CRFSharp 13 | { 14 | public class crf_term_out 15 | { 16 | //Sequence label probability 17 | public double prob; 18 | 19 | //Raw CRF model output 20 | public string[] result_; 21 | public double[] weight_; 22 | 23 | public crf_term_out(int max_word_num = Utils.DEFAULT_CRF_MAX_WORD_NUM) 24 | { 25 | prob = 0; 26 | result_ = new string[max_word_num]; 27 | weight_ = new double[max_word_num]; 28 | } 29 | } 30 | 31 | public class DecoderTagger : Tagger 32 | { 33 | private readonly Pool _buildersPool = 34 | new Pool(p => new StringBuilder(100), b => b.Clear()); 35 | 36 | public int forward_backward_stat; //前向后向过程运行状态,0为未运行,1为已经运行 37 | 38 | //概率计算函数 39 | double toprob(Node n, double Z) 40 | { 41 | return Math.Exp(n.alpha + n.beta - n.cost - Z); 42 | } 43 | 44 | //To get the fastest decoded result, please set vlevel=0 and nbest=1, since it only outputs 1-best result without probability (forward-backward and A* aren't performed, only run viterbi) 45 | public int vlevel_; //Need to calculate probability 0 - no need to calculate, 1 - calculate sequence label probability, 2 - calculate both sequence label and individual entity probability 46 | protected int nbest_; //output top N-best result 47 | //CrfModel model; 48 | ModelReader featureIndex; 49 | 50 | Node node(int i, int j) 51 | { 52 | return node_[i, j]; 53 | } 54 | 55 | Heap heap_queue; //Using min-heap to get next result, it's only used when nbest > 1 56 | public int crf_max_word_num; 57 | 58 | public DecoderTagger(int nbest, int this_crf_max_word_num = Utils.DEFAULT_CRF_MAX_WORD_NUM) 59 | { 60 | 
crf_max_word_num = this_crf_max_word_num; 61 | vlevel_ = 0; 62 | nbest_ = nbest; 63 | cost_ = 0.0; 64 | Z_ = 0; 65 | 66 | ysize_ = 0; 67 | word_num = 0; 68 | heap_queue = null; 69 | node_ = null; 70 | x_ = null; 71 | result_ = null; 72 | } 73 | 74 | public void InitializeFeatureCache() 75 | { 76 | feature_cache_ = new List(); 77 | var feature_cache_every_row_size = 0; 78 | if (featureIndex.unigram_templs_.Count > featureIndex.bigram_templs_.Count) 79 | { 80 | feature_cache_every_row_size = featureIndex.unigram_templs_.Count + 1; 81 | } 82 | else 83 | { 84 | feature_cache_every_row_size = featureIndex.bigram_templs_.Count + 1; 85 | } 86 | for (var i = 0; i < crf_max_word_num * 2; i++) 87 | { 88 | var features = new long[feature_cache_every_row_size]; 89 | for (var j = 0; j < feature_cache_every_row_size; j++) 90 | { 91 | features[j] = -1; 92 | } 93 | feature_cache_.Add(features); 94 | } 95 | } 96 | 97 | //获取序列的词数 98 | public short get_word_num() 99 | { 100 | return word_num; 101 | } 102 | 103 | public double prob(int i, int j) 104 | { 105 | return toprob(node_[i, j], Z_); 106 | } 107 | 108 | //Get the probability of the i-th word's best result 109 | public double prob(int i) 110 | { 111 | return toprob(node_[i, result_[i]], Z_); 112 | } 113 | 114 | //Get entire sequence probability 115 | public double prob() 116 | { 117 | return Math.Exp(-cost_ - Z_); 118 | } 119 | 120 | //Get the string of i-th tag 121 | public string yname(int i) { return featureIndex.y(i); } 122 | 123 | //设置vlevel 124 | public void set_vlevel(int vlevel_value) 125 | { 126 | vlevel_ = vlevel_value; 127 | } 128 | 129 | //使用模型初始化tag,必须先使用该函数初始化才能使用add和parse 130 | //正常返回为0, 错误返回<0 131 | public int init_by_model(ModelReader model_p) 132 | { 133 | featureIndex = model_p; 134 | ysize_ = (short)model_p.ysize(); 135 | 136 | if (nbest_ > 1) 137 | { 138 | //Only allocate heap when nbest is more than 1 139 | heap_queue = Utils.heap_init((int)(crf_max_word_num * ysize_ * ysize_)); 140 | } 141 | 142 | 
//Initialize feature set cache according unigram and bigram templates 143 | InitializeFeatureCache(); 144 | 145 | node_ = new Node[crf_max_word_num, ysize_]; 146 | result_ = new short[crf_max_word_num]; 147 | 148 | //Create node and path cache 149 | for (short cur = 0; cur < crf_max_word_num; cur++) 150 | { 151 | for (short i = 0; i < ysize_; i++) 152 | { 153 | var n = new Node(); 154 | node_[cur, i] = n; 155 | 156 | n.lpathList = new List(); 157 | n.rpathList = new List(); 158 | n.x = cur; 159 | n.y = i; 160 | } 161 | } 162 | 163 | for (var cur = 1; cur < crf_max_word_num; cur++) 164 | { 165 | for (var j = 0; j < ysize_; ++j) 166 | { 167 | for (var i = 0; i < ysize_; ++i) 168 | { 169 | var p = new CRFSharp.Path(); 170 | p.add(node_[cur - 1, j], node_[cur, i]); 171 | } 172 | } 173 | } 174 | 175 | return Utils.ERROR_SUCCESS; 176 | } 177 | 178 | public int initNbest() 179 | { 180 | var k = (int)word_num - 1; 181 | for (var i = 0; i < ysize_; ++i) 182 | { 183 | var eos = Utils.allc_from_heap(heap_queue); 184 | eos.node = node_[k, i]; 185 | eos.fx = -node_[k, i].bestCost; 186 | eos.gx = -node_[k, i].cost; 187 | eos.next = null; 188 | if (Utils.heap_insert(eos, heap_queue) < 0) 189 | { 190 | return Utils.ERROR_INSERT_HEAP_FAILED; 191 | } 192 | } 193 | return Utils.ERROR_SUCCESS; 194 | } 195 | 196 | public int next() 197 | { 198 | while (!Utils.is_heap_empty(heap_queue)) 199 | { 200 | var top = Utils.heap_delete_min(heap_queue); 201 | var rnode = top.node; 202 | 203 | if (rnode.x == 0) 204 | { 205 | for (var n = top; n != null; n = n.next) 206 | { 207 | result_[n.node.x] = n.node.y; 208 | } 209 | cost_ = top.gx; 210 | return 0; 211 | } 212 | 213 | for (int index = 0; index < rnode.lpathList.Count; index++) 214 | { 215 | var p = rnode.lpathList[index]; 216 | var n = Utils.allc_from_heap(heap_queue); 217 | var x_num = (rnode.x) - 1; 218 | n.node = p.lnode; 219 | n.gx = -p.lnode.cost - p.cost + top.gx; 220 | n.fx = -p.lnode.bestCost - p.cost + top.gx; 221 | // | h(x) | | 
g(x) | 222 | n.next = top; 223 | if (Utils.heap_insert(n, heap_queue) < 0) 224 | { 225 | return Utils.ERROR_INSERT_HEAP_FAILED; 226 | } 227 | } 228 | } 229 | return 0; 230 | } 231 | 232 | public int reset() 233 | { 234 | word_num = 0; 235 | Z_ = cost_ = 0.0; 236 | 237 | Utils.heap_reset(heap_queue); 238 | return Utils.ERROR_SUCCESS; 239 | } 240 | 241 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 242 | int buildLattice() 243 | { 244 | //Generate feature ids for all nodes and paths 245 | RebuildFeatures(); 246 | 247 | for (int i = 0; i < word_num; ++i) 248 | { 249 | for (int j = 0; j < ysize_; ++j) 250 | { 251 | var currentNode = node_[i, j]; 252 | calcCost(currentNode); 253 | for (int index = 0; index < currentNode.lpathList.Count; ++index) 254 | { 255 | var p = currentNode.lpathList[index]; 256 | calcCost(p); 257 | } 258 | } 259 | } 260 | 261 | return Utils.ERROR_SUCCESS; 262 | } 263 | 264 | public int add(List> row_p) 265 | { 266 | x_ = row_p; 267 | word_num = (short)x_.Count; 268 | 269 | return Utils.ERROR_SUCCESS; 270 | } 271 | 272 | 273 | public int termbuf_build(crf_term_out term_buf) 274 | { 275 | if (vlevel_ > 0) 276 | { 277 | //Calcuate the sequence label probability 278 | term_buf.prob = prob(); 279 | } 280 | 281 | var this_word_num = get_word_num(); 282 | 283 | for (var i = 0; i < this_word_num; ++i) 284 | { 285 | term_buf.result_[i] = yname(result_[i]); 286 | switch (vlevel_) 287 | { 288 | case 0: 289 | term_buf.weight_[i] = 0.0; 290 | break; 291 | case 2: 292 | term_buf.weight_[i] = prob(i); 293 | break; 294 | } 295 | } 296 | return Utils.ERROR_SUCCESS; 297 | } 298 | 299 | //Label input string. 
The result is saved as result [] 300 | //If nbest > 1, get nbest result by "next" 301 | //Returen value: Successed - 0, Failed < 0 302 | public int parse() 303 | { 304 | var ret = 0; 305 | //no word need to be labeled 306 | if (word_num == 0) 307 | { 308 | return Utils.ERROR_SUCCESS; 309 | } 310 | 311 | //building feature set 312 | ret = buildFeatures(); 313 | if (ret < 0) 314 | { 315 | return ret; 316 | } 317 | 318 | 319 | ret = buildLattice(); 320 | if (ret < 0) 321 | { 322 | return ret; 323 | } 324 | 325 | //4.forward-backward when we need to calcuate probability 326 | if (vlevel_ > 0) 327 | { 328 | forwardbackward(); 329 | } 330 | 331 | 332 | //5.using viterbi to search best result path 333 | ret = viterbi(); 334 | if (ret < 0) 335 | { 336 | return ret; 337 | } 338 | 339 | //6.initNbest 340 | // 求nbest(n>1)时的数据结构初始化,此后可以调用next()来获取nbest结果 341 | if (nbest_ > 1) 342 | { 343 | //如果只求1-best,不需要使用initNbest()和next()获取结果 344 | ret = initNbest(); 345 | if (ret < 0) 346 | { 347 | return ret; 348 | } 349 | 350 | } 351 | 352 | return Utils.ERROR_SUCCESS; 353 | } 354 | 355 | 356 | public int buildFeatures() 357 | { 358 | if (word_num <= 0) 359 | { 360 | return Utils.ERROR_INVALIDATED_PARAMETER; 361 | } 362 | using (var v = _buildersPool.GetOrCreate()) 363 | { 364 | var builder = v.Item; 365 | var id = 0; 366 | var feature_cache_row_size = 0; 367 | var feature_cache_size = 0; 368 | for (var cur = 0; cur < word_num; cur++) 369 | { 370 | feature_cache_row_size = 0; 371 | for (int index = 0; index < featureIndex.unigram_templs_.Count; index++) 372 | { 373 | var templ = featureIndex.unigram_templs_[index]; 374 | var res = featureIndex.apply_rule(templ, cur, builder, this); 375 | if (res == null) 376 | { 377 | return Utils.ERROR_EMPTY_FEATURE; 378 | } 379 | id = featureIndex.get_id(res.ToString()); 380 | if (id != -1) 381 | { 382 | feature_cache_[feature_cache_size][feature_cache_row_size] = id; 383 | feature_cache_row_size++; 384 | } 385 | } 386 | 
feature_cache_[feature_cache_size][feature_cache_row_size] = -1; 387 | feature_cache_size++; 388 | } 389 | 390 | for (var cur = 0; cur < word_num; cur++) 391 | { 392 | feature_cache_row_size = 0; 393 | for (int index = 0; index < featureIndex.bigram_templs_.Count; index++) 394 | { 395 | var templ = featureIndex.bigram_templs_[index]; 396 | var strFeature = featureIndex.apply_rule(templ, cur, builder, this); 397 | if (strFeature == null) 398 | { 399 | return Utils.ERROR_EMPTY_FEATURE; 400 | } 401 | 402 | id = featureIndex.get_id(strFeature.ToString()); 403 | if (id != -1) 404 | { 405 | feature_cache_[feature_cache_size][feature_cache_row_size] = id; 406 | feature_cache_row_size++; 407 | } 408 | } 409 | feature_cache_[feature_cache_size][feature_cache_row_size] = -1; 410 | feature_cache_size++; 411 | } 412 | 413 | return Utils.ERROR_SUCCESS; 414 | } 415 | } 416 | 417 | 418 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 419 | public void calcCost(Node n) 420 | { 421 | double c = 0; 422 | var f = feature_cache_[n.fid]; 423 | 424 | for (int i = 0; i < f.Length; ++i) 425 | { 426 | int fCurrent = (int)f[i]; 427 | if (fCurrent == -1) 428 | break; 429 | c += featureIndex.GetAlpha(fCurrent + n.y); 430 | } 431 | 432 | n.cost = featureIndex.cost_factor_ * c; 433 | } 434 | 435 | [MethodImpl(MethodImplOptions.AggressiveInlining)] 436 | public void calcCost(CRFSharp.Path p) 437 | { 438 | double c = 0; 439 | long[] f = feature_cache_[p.fid]; 440 | for (int i = 0; i < f.Length; ++i) 441 | { 442 | int fCurrent = (int)f[i]; 443 | if (fCurrent == -1) 444 | break; 445 | c += featureIndex.GetAlpha((fCurrent + p.lnode.y * ysize_ + p.rnode.y)); 446 | } 447 | 448 | p.cost = featureIndex.cost_factor_ * c; 449 | } 450 | 451 | 452 | public int output(crf_term_out[] pout) 453 | { 454 | var n = 0; 455 | var ret = 0; 456 | 457 | if (nbest_ == 1) 458 | { 459 | //If only best result and no need probability, "next" is not to be used 460 | ret = termbuf_build(pout[0]); 461 | if (ret < 0) 462 | 
{ 463 | return ret; 464 | } 465 | } 466 | else 467 | { 468 | //Fill the n best result 469 | var iNBest = nbest_; 470 | if (pout.Length < iNBest) 471 | { 472 | iNBest = pout.Length; 473 | } 474 | 475 | for (n = 0; n < iNBest; ++n) 476 | { 477 | ret = next(); 478 | if (ret < 0) 479 | { 480 | break; 481 | } 482 | 483 | ret = termbuf_build(pout[n]); 484 | if (ret < 0) 485 | { 486 | return ret; 487 | } 488 | } 489 | } 490 | 491 | return Utils.ERROR_SUCCESS; 492 | } 493 | } 494 | } 495 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Donate a beverage to help me to keep CRFSharp up to date :) [![Support via PayPal](https://www.paypalobjects.com/en_GB/i/btn/btn_donate_SM.gif)](https://www.paypal.me/fuzhongkai/) 2 | 3 | CRFSharp 4 | ======== 5 | CRFSharp is Conditional Random Fields (CRF) implemented by .Net Core(C#), a machine learning algorithm for learning from labeled sequences of examples. 6 | 7 | ## Overview 8 | CRFSharp is Conditional Random Fields implemented by .Net Core(C#), a machine learning algorithm for learning from labeled sequences of examples. It is widely used in Natural Language Process (NLP) tasks, for example: word breaker, postaging, named entity recognized and so on. 9 | 10 | CRFSharp (aka CRF#) is based on .NET Core, so it can run on Windows/Linux and other platforms .Net core is supporting. Its main algorithm is similar as CRF++ written by Taku Kudo. It encodes model parameters by L-BFGS. Moreover, it also has many significant improvements than CRF++, such as totally parallel encoding, optimizing memory usage and so on. 11 | 12 | Currently, when training corpus, compared with CRF++, CRFSharp can make full use of multi-core CPUs and use memory effectively, especially for very huge size training corpus and tags. So in the same environment, CRFSharp is able to encode much more complex models with less cost than CRF++. 
13 | 14 | The following screenshot is an example that CRFSharp is running on a machine with 16 cores CPUs and 96GB memory. 15 | ![](http://download-codeplex.sec.s-msft.com/Download?ProjectName=crfsharp&DownloadId=600636) 16 | The training corpus has 1.24 million records with nearly 1.2 billion features. From the screenshot, all CPU cores are fully used and memory usage is stable. The average encoding time per iteration is 3 minutes and 33 seconds. 17 | 18 | Besides the command line tool, CRFSharp has also provided APIs and these APIs can be used in other projects and services for key technical tasks. For example: WordSegment project has used CRFSharp to recognize named entity; Query Term Analyzer project has used it to analyze query term important level in word formation and Geography Coder project has used it to detect geo-entity from text. For detailed information about APIs, please see section [Use CRFSharp API in your project] below. 19 | 20 | To use CRFSharp, we need to prepare corpus and design feature templates at first. CRFSharp's file formats are compatible with CRF++(official website:https://taku910.github.io/crfpp/). The following paragraphs will introduce data formats and how to use CRFSharp in both command line and APIs. 21 | 22 | ## Training file format 23 | Training corpus contains many records to describe what the model should be. For each record, it is split into one or many tokens and each token has one or many dimension features to describe itself. 24 | 25 | In training file, each record can be represented as a matrix and ends with an empty line. In the matrix, each row describes one token and its features, and each column represents a feature in one dimension. In the entire training corpus, the number of columns must be fixed. 26 | 27 | When CRFSharp encodes, if the column size is N, according to what the template file describes, the first N-1 columns will usually be used as input data to generate binary feature set and train model.
The Nth column (aka last column) is the answer that the model should output. The means, for one record, if we have an ideal encoded model, given all tokens’ the first N-1 columns, the model should output each token’s Nth column data as the entire record’s answer. 28 | 29 | There is an example (a bigger training example file is at download section, you can see and download it there): 30 | 31 | Word | Pos | Tag 32 | -----------|------|---- 33 | ! | PUN | S 34 | Tokyo | NNP | S_LOCATION 35 | and | CC | S 36 | New | NNP | B_LOCATION 37 | York | NNP | E_LOCATION 38 | are | VBP | S 39 | major | JJ | S 40 | financial | JJ | S 41 | centers | NNS | S 42 | . | PUN | S 43 | | | 44 | ! | PUN | S 45 | p | FW | S 46 | ' | PUN | S 47 | y | NN | S 48 | h | FW | S 49 | 44 | CD | S 50 | University | NNP | B_ORGANIZATION 51 | of | IN | M_ORGANIZATION 52 | Texas | NNP | M_ORGANIZATION 53 | Austin | NNP | E_ORGANIZATION 54 | 55 | The example is for labeling named entities in records. It has two records and each token has three columns. The first column is the term of a token, the second column is the token’s pos-tag result and the third column is to describe whether the token is a named entity or a part of named entity and its type. The first and the second columns are input data for encoding model, and the third column is the model ideal output as answer. 56 | 57 | In above example, we designed output answer as "POS_TYPE". POS means the position of the term in the chunk or named entity, TYPE means the output type of the term. 58 | 59 | For POS, it supports four types as follows: 60 | S: the chunk has only one term 61 | B: the begin term of the chunk 62 | M: one of the middle term in the chunk 63 | E: the end term of the chunk 64 | 65 | For TYPE, the example contains many types as follows: 66 | ORGANIZATION : the name of one organization 67 | LOCATION : the name of one location 68 | For output answer without TYPE, it's just a normal term, not a named entity. 
69 | 70 | ## Test file format 71 | Test file has the similar format as training file. The only different between training and test file is the last column. In test file, all columns are features for CRF model. 72 | 73 | ## CRFSharp command line tools 74 | CRFSharpConsole.exe is a command line tool to encode and decode CRF model. By default, the help information showed as follows: 75 | Linear-chain CRF encoder & decoder by Zhongkai Fu (fuzhongkai@gmail.com) 76 | **CRFSharpConsole.exe** [parameter list...] 77 | **-encode** [parameter list...] - Encode CRF model from given training corpus 78 | **-decode** [parameter list...] - Decode CRF model to label text 79 | **-shrink** [parameter list...] - Shrink encoded CRF model size 80 | 81 | As the above information shows, the tool provides two run modes. Encode mode is for training model, and decode mode is for testing model. The following paragraphs introduces how to use these two modes. 82 | 83 | ## Encode model 84 | This mode is used to train CRF model from training corpus. Besides -encode parameter, the command line parameters as follows: 85 | **CRFSharpConsole.exe** -encode [parameters list] 86 | **-template** : template file name 87 | **-trainfile** : training corpus file name 88 | **-modelfile** : encoded model file name 89 | **-maxiter** : maximum iteration, when encoding iteration reaches this value, the process will be ended. Default value is 1000 90 | **-minfeafreq** : minimum feature frequency, if one feature's frequency is less than this value, the feature will be dropped. Default value is 2 91 | **-mindiff** : minimum diff value, when diff less than the value consecutive 3 times, the process will be ended. Default value is 0.0001 92 | **-thread** : threads used to train model. Default value is 1 93 | **-slotrate** : the maximum slot usage rate threshold when building feature set. it is ranged in (0.0, 1.0). the higher value means longer time to build feature set, but smaller feature set size. 
Default value is 0.95 94 | **-hugelexmem** : build lexical dictionary in huge mode and shrink starts when used memory reaches this value. This mode can build more lexical items, but slowly. Value ranges [1,100] and default is disabled. 95 | **-regtype** : regularization type. L1 and L2 regularization are supported. Default is L2 96 | **-retrainmodel** : the existing model for re-training. 97 | **-debug**: encode model as debug mode 98 | 99 | Note: either -maxiter reaches setting value or -mindiff reaches setting value in consecutive three times, the training process will be finished and saved encoded model. 100 | 101 | Note: -hugelexmem is only used for special task, and it is not recommended for common task, since it costs lots of time for memory shrink in order to load more lexical features into memory 102 | 103 | A command line example as follows: 104 | CRFSharpConsole.exe -encode -template template.1 -trainfile ner.train -modelfile ner.model -maxiter 100 -minfeafreq 1 -mindiff 0.0001 -thread 4 –debug 105 | 106 | The entire encoding process contains four main steps as follows: 107 | 1. Load train corpus from file, generate and select feature set according templates. 108 | 2. Build selected feature set index data as double array trie-tree format, and save them into file. 109 | 3. Run encoding process iteratively to tune feature values until reach end condition. 110 | 4. Save encoded feature values into file. 111 | In step 3, after run each iteration, some detailed encoding information will be show. For example: 112 | M_RANK_1 [FR=47658, TE=54.84%] 113 | M_RANK_2:27.07% M_RANK_0:26.65% E_RANK_0:0.31% B_RANK_0:0.21% E_RANK_1:0.19% 114 | iter=65 terr=0.320290 serr=0.717372 diff=0.0559666295793355 fsize=73762836(1.10% act) Time span: 00:31:56.4866295, Aver. time span per iter: 00:00:29 115 | The encoding information has two parts. The first part is information about each tag, the second part is information in overview.For each tag, it has two lines information. 
The first line shows the number of this tag in total (FR) and current token error rate (TE) about this tag. The second line shows this tag's token error distribution. In the above example, in No.65 iteration, M_RANK_1 tag's token error rate is 54.84% in total. In these token errors, 27.07% is M_RANK_2, 26.65% is M_RANK_0 and so on. For the second part (information in overview), some global information is shown. 116 | **iter** : the number of iterations processed 117 | **terr** : tag's token error rate in all 118 | **serr** : record's error rate in all 119 | **diff** : difference between current and previous iteration 120 | **fsize( x% act)** : the number of feature set in total, x% act means the number of non-zero value features. In L1 regularization, with the increase of iter, x% is reduced. In L2 regularization, x% is always 100%. 121 | **Time span** : how long the encoding process has taken 122 | **Aver. time span per iter** : the average time span for each iteration 123 | 124 | After the encoding process is finished, the following files will be generated. 125 | file1: **[model file name]** 126 | This is the model meta data file. It contains the model's global parameters, feature templates, output tags and so on. 127 | file2: **[model file name]**.feature 128 | This is the feature set lexical dictionary file. It contains all features' strings and corresponding ids. For high performance, it's built as a double array trie-tree. In debug mode, [model file name].feature.raw_text which saves the lexical dictionary in raw text will be generated. 129 | file3: **[model file name]**.alpha 130 | This is the feature set weight score file. It contains all features' weight scores. 131 | 132 | ## Decode model 133 | This mode is used to decode and test the encoded model.
Besides -decode parameter, there are some other required and optional parameters: 134 | CRFSharpConsole.exe -decode 135 | **-modelfile** : The model file used for decoding 136 | **-inputfile** : The input file to predict its content tags 137 | **-outputfile** : The output file to save predicted result 138 | **-nbest** : Output n-best result, default value is 1 139 | **-prob** : output probability, default is not output 140 | 141 | Here is an example: 142 | CRFSharpConsole.exe -decode -modelfile ner.model -inputfile ner_test.txt -outputfile ner_test_result.txt -nbest 5 -prob 143 | 144 | ## Shrink model 145 | Encoded model with L1 regularization is usually a sparse model. Shrink parameter is used to reduce model file size. With -shrink parameter, the command line as follows: 146 | CRFSharpConsole.exe -shrink [Encoded CRF model file name] [Shrinked CRF model file name] [thread num] 147 | An example as follows: 148 | CRFSharpConsole.exe -shrink ner.model ner_shrinked.model 16 149 | This example is used to shrink ner.model files and the working thread is 16. 150 | 151 | ## Incremental training 152 | For some complex tasks, encoding model is timing-cost. With "-retrainmodel " option and updated training corpus (both old and new training corpus), CRFSharp supports to train model incrementally and compared with full training, incremental training is able to save lots of time. There is an example: 153 | CRFSharpConsole.exe -encode -template template.1 -trainfile ner_new.train -modelfile ner_new.model -retrainmodel ner.model -maxiter 100 -minfeafreq 1 -mindiff 0.0001 -thread 4 –debug 154 | 155 | ## Feature templates 156 | CRFSharp template is totally compatible with CRF++ and used to generate feature set from training and testing corpus. 157 | 158 | In template file, each line describes one template which consists of prefix, id and rule-string. The prefix is used to indicate template type. There are two prefix, U for unigram template, and B for bigram template. 
Id is used to distinguish different templates. And rule-string is used to guide CRFSharp to generate features. 159 | 160 | The rule-string has two types of form, one is constant string, and the other is macro. The simplest macro form is {“%x[row,col]”}. Row specifies the offset between current focusing token and generating feature token in row. Col specifies the absolute column position in corpus. Moreover, combined macro is also supported, for example: {“%x[row1, col1]/%x[row2, col2]”}. When generating feature set, macro will be replaced as specific string. A template file example as follows: 161 | 162 | \# Unigram 163 | U01:%x[-1,0] 164 | U02:%x[0,0] 165 | U03:%x[1,0] 166 | U04:%x[-1,0]/%x[0,0] 167 | U05:%x[0,0]/%x[1,0] 168 | U06:%x[-1,0]/%x[1,0] 169 | U07:%x[-1,1] 170 | U08:%x[0,1] 171 | U09:%x[1,1] 172 | U10:%x[-1,1]/%x[0,1] 173 | U11:%x[0,1]/%x[1,1] 174 | U12:%x[-1,1]/%x[1,1] 175 | U13:C%x[-1,0]/%x[-1,1] 176 | U14:C%x[0,0]/%x[0,1] 177 | U15:C%x[1,0]/%x[1,1] 178 | \# Bigram 179 | B 180 | 181 | In this template file, it contains both unigram and bigram templates. Assuming current focusing token is “York NNP E_LOCATION” in the first record in training corpus above, the generated unigram feature set as follows: 182 | 183 | U01:New 184 | U02:York 185 | U03:are 186 | U04:New/York 187 | U05:York/are 188 | U06:New/are 189 | U07:NNP 190 | U08:NNP 191 | U09:are 192 | U10:NNP/NNP 193 | U11:NNP/VBP 194 | U12:NNP/VBP 195 | U13:CNew/NNP 196 | U14:CYork/NNP 197 | U15:Care/VBP 198 | 199 | Although U07 and U08, U11 and U12’s rule-string are the same, we can still distinguish them by id string. 200 | 201 | In encoding process, according templates, encoder will generate feature set (like the example in above) from records in training corpus and save them into model file. 202 | 203 | In decoding process, for each test record, decoder will also generate features by template, and check every feature whether it exists in model. 
If it is yes, feature’s alpha value will be applied while processing cost value. 204 | 205 | For each token, how many features will be generated from unigram templates? As the above said, if we have M unigram templates, each token will have M feature generated from the template set. Moreover, assuming each token has N different output classes, in order to indicate all possible statuses by binary function, we need to have {“M*N”} features for one token in total. For a record which contains L tokens, the feature size of this record is {“M*N*L”}. 206 | 207 | For bigram template, CRFSharp will enumerate all possible combined output classes of two contiguous tokens, and generate features for each combined one. So, if each token has N different output classes, and the number of features generated by templates is M, the total bigram feature set size is {“N*N*M”}. For a record which contains L tokens, the feature size of this record is {“M*N*N*(L-1)”}. 208 | 209 | ## Run on Linux/Mac 210 | 211 | CRFSharp is built by .Net core, so it can naturally run on Windows/Linux/MacOs and other platforms that .Net core supports. 212 | 213 | ## Use CRFSharp API in your project 214 | 215 | Besides command line tool, CRFSharp provides APIs for developers to use it in their projects. In this section, we will show you how to use it. Basically, CRFSharp has two dll files: One is CRFSharp.dll which contains core algorithm and provides many APIs in low level. The other is CRFSharpWrapper.dll which wraps above low level interfaces and provides interfaces in high level. 216 | 217 | ## Encode a CRF model in your project 218 | 1. Add CRFSharpWrapper.dll as reference 219 | 2. 
Add the following code snippet 220 | ```c# 221 | var encoder = new CRFSharpWrapper.Encoder(); 222 | var options = new EncoderArgs(); 223 | options.debugLevel = 1; 224 | options.strTemplateFileName = "template.txt"; //template file name 225 | options.strTrainingCorpus = "train.txt"; //training corpus file name 226 | options.strEncodedModelFileName = "ner_model"; //encoded model file name 227 | options.max_iter = 1000; 228 | options.min_feature_freq = 2; 229 | options.min_diff = 0.0001; 230 | options.threads_num = 4; 231 | options.C = 1.0; 232 | options.slot_usage_rate_threshold = 0.95; 233 | bool bRet = encoder.Learn(options); 234 | ``` 235 | For detailed information, please visit source code: https://github.com/zhongkaifu/CRFSharp/blob/master/CRFSharpConsole/EncoderConsole.cs 236 | 237 | ## Decode a CRFSharp model in your project 238 | 1. Add CRFSharpWrapper.dll as a reference 239 | 2. Add the following code snippet 240 | 241 | ```c# 242 | //Create CRFSharp wrapper instance. It's a global instance 243 | var crfWrapper = new CRFSharpWrapper.Decoder(); 244 | 245 | //Load encoded model from file 246 | crfWrapper.LoadModel(options.strModelFileName); 247 | 248 | //Create decoder tagger instance. If the running environment is multi-threaded, each thread needs a separate instance 249 | 250 | var tagger = crfWrapper.CreateTagger(options.nBest, options.maxword); 251 | tagger.set_vlevel(options.probLevel); 252 | 253 | //Initialize result 254 | var crf_out = new crf_seg_out[options.nBest]; 255 | for (var i = 0; i < options.nBest; i++) 256 | { 257 | crf_out[i] = new crf_seg_out(tagger.crf_max_word_num); 258 | } 259 | 260 | //Process 261 | List<List<string>> featureSet = BuildFeatureSet(strTestText); //Build feature set from given test text. 262 | crfWrapper.Segment(crf_out, tagger, featureSet); 263 | 264 | //An example for feature set building.
Only use 1-dim character based features 265 | private static List<List<string>> BuildFeatureSet(string str) 266 | { 267 | List<List<string>> sinbuf = new List<List<string>>(); 268 | foreach (char ch in str) 269 | { 270 | sinbuf.Add(new List<string>()); 271 | sinbuf[sinbuf.Count - 1].Add(ch.ToString()); 272 | } 273 | return sinbuf; 274 | } 275 | ``` 276 | Decoder.Segment is a wrapped decoder interface. It's defined as follows: 277 | ```c# 278 | //Segment given text 279 | public int Segment(crf_out pout, //segment result 280 | SegDecoderTagger tagger, //Tagger per thread 281 | List<List<string>> inbuf //feature set for segment 282 | ) 283 | ``` 284 | 285 | # CRFSharp referenced by the following published papers 286 | 1. [Reconhecimento de entidades nomeadas em textos em português do Brasil no domínio do e-commerce](http://www.lbd.dcc.ufmg.br/colecoes/tilic/2015/010.pdf) 287 | 2. [Multimodal Wearable Sensing for Fine-Grained Activity Recognition in Healthcare](http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=7155432) 288 | 3. [A CRF-based Method for Automatic Construction of Chinese Symptom Lexicon](http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=7429085) 289 | 4. [Bileşik Cümlelerde Yan Cümleciklerin Otomatik Etiketlenmesi](http://ab.org.tr/ab16/bildiri/20.pdf) 290 | 5. [Entity Recognition in Bengali language](http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=7377333) 291 | 6. [A Hybrid Semi-supervised Learning Approach to Identifying Protected Health Information in Electronic Medical Records](http://dl.acm.org/citation.cfm?id=2857630) 292 | 7. [Global Journal on Technology](https://www.ce.yildiz.edu.tr/personal/mfatih/file/15131/x.pdf) 293 | 8. [Einführung in Conditional Random Fields zum Taggen von sequentiellen Daten Tool: Wapiti](http://kitt.cl.uzh.ch/clab/crf/crf.pdf) 294 | 9. [Nghiên cứu phương pháp trích chọn thông tin thời tiết từ văn bản tiếng Việt](http://repository.vnu.edu.vn/bitstream/VNU_123/4980/1/00050005751.pdf) 295 | 10.
[Unsupervised Word and Dependency Path Embeddings for Aspect Term Extraction](http://arxiv.org/abs/1605.07843) 296 | 11. [A HYBRID INTELLIGENT SYSTEM TO IMPROVE DATA PREPROCESSING](http://www.sci-int.com/pdf/3596982411%20a%201%203631-3637%20Sohail%20Sarwar--IT--ISD--4-1-16--PAID.pdf) 297 | 12.   [Family Matters: Company Relations Extraction from Wikipedia](http://rd.springer.com/chapter/10.1007/978-3-319-45880-9_7) 298 | 13.   [Active Learning for Incremental POI Extraction and Pairing](http://ir.lib.ncu.edu.tw:88/thesis/view_etd.asp?URN=103522034) 299 | 14. [Semantic Role Labeling With Relative Clauses](http://dergipark.gov.tr/download/article-file/256977) 300 | 15. [Automatic de-identification of medical records with a multilevel hybrid semi-supervised learning approach](http://ieeexplore.ieee.org/abstract/document/7800267/) 301 | 16. [中文医学术语资源的自动构建方法研究及应用] 302 | 17. [Web2.0环境下的科技论文共享系统研究] 303 | 18. [基于条件随机场的入侵检测方法研究] 304 | 19. [Urdu part of speech tagging using conditional random fields] 305 | 20. [Deep recurrent neural networks with word embeddings for Urdu named entity recognition] 306 | 21. [Part of Speech Tagging in Urdu: Comparison of Machine and Deep Learning Approaches] 307 | 22. [Context based number normalization using skip-chain conditional random fields] 308 | 23. [Generic Urdu NLP Framework for Urdu Text Analysis: Hybridization of heuristics and Machine Learning Techniques] 309 | 24. [Semi-Automatic Corpus Expansion and Extraction of Uyghur-Named Entities and Relations Based on a Hybrid Method] 310 | 25. [Named Entity Recognition: A Survey for Indian Languages] 311 | 26. [A comprehensive review of conditional random fields: variants, hybrids and applications] 312 | 313 | And so on... 314 | 315 | --------------------------------------------------------------------------------