├── Output.png ├── cslogo.png ├── Test ├── py │ ├── doc.out │ ├── displacy.py │ ├── linguistic_features.py │ ├── Test.py │ ├── spaCy101.py │ └── serialization.py └── cs │ ├── DisplaCy.cs │ ├── Test.csproj │ ├── LinguisticFeatures.cs │ ├── Program.cs │ ├── Serialization.cs │ ├── SpaCy101.cs │ ├── Test.sln │ └── ExampleES.cs ├── PythonNetUtils ├── PythonNetUtils.csproj ├── Utils.cs ├── ToPy.cs ├── PythonRt.cs └── ToClr.cs ├── SpaCyDotNet ├── SpaCyDotNet.csproj ├── Serialization.cs └── api │ ├── Displacy.cs │ ├── Spacy.cs │ ├── StringStore.cs │ ├── Span.cs │ ├── Lang.cs │ ├── Lexeme.cs │ ├── Vocab.cs │ ├── DocBin.cs │ ├── Token.cs │ └── Doc.cs ├── LICENSE ├── .gitignore └── README.md /Output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/HEAD/Output.png -------------------------------------------------------------------------------- /cslogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/HEAD/cslogo.png -------------------------------------------------------------------------------- /Test/py/doc.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/HEAD/Test/py/doc.out -------------------------------------------------------------------------------- /Test/py/displacy.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy import displacy 3 | 4 | nlp = spacy.load("en_core_web_sm") 5 | doc = nlp("Apple is looking at buying U.K. 
startup for $1 billion") 6 | displacy.serve(doc, style="dep") -------------------------------------------------------------------------------- /Test/py/linguistic_features.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | nlp = spacy.load("en_core_web_sm") 4 | doc = nlp("Autonomous cars shift insurance liability toward manufacturers") 5 | for token in doc: 6 | print(token.text, token.dep_, token.head.text, token.head.pos_, 7 | [child for child in token.children]) -------------------------------------------------------------------------------- /PythonNetUtils/PythonNetUtils.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net9.0 5 | enable 6 | disable 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /SpaCyDotNet/SpaCyDotNet.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net9.0 5 | AnyCPU;x64 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /SpaCyDotNet/Serialization.cs: -------------------------------------------------------------------------------- 1 | namespace SpacyDotNet 2 | { 3 | public static class Serialization 4 | { 5 | public enum Mode 6 | { 7 | Spacy, 8 | SpacyAndDotNet, 9 | DotNet 10 | } 11 | 12 | public static Mode Selected { get; set; } = Mode.Spacy; 13 | 14 | public static string Namespace = "https://github.com/AMArostegui/SpacyDotNet"; 15 | 16 | public static string Prefix = "sdn"; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Test/cs/DisplaCy.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using SpacyDotNet; 5 | 6 | namespace Test 7 | { 8 | static class 
DisplaCy 9 | { 10 | public static void Run() 11 | { 12 | var spacy = new Spacy(); 13 | var nlp = spacy.Load("en_core_web_sm"); 14 | 15 | var doc = nlp.GetDocument("Apple is looking at buying U.K. startup for $1 billion"); 16 | var displacy = new Displacy(); 17 | displacy.Serve(doc, "dep"); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Test/cs/Test.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net9.0 6 | AnyCPU;x64 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /SpaCyDotNet/api/Displacy.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using Python.Runtime; 5 | 6 | namespace SpacyDotNet 7 | { 8 | public class Displacy 9 | { 10 | public Displacy() 11 | { 12 | } 13 | 14 | public void Serve(Doc doc, string style) 15 | { 16 | using (Py.GIL()) 17 | { 18 | dynamic spacy = Py.Import("spacy"); 19 | 20 | var pyDoc = doc.PyDoc; 21 | var pyStyle = new PyString(style); 22 | spacy.displacy.serve(pyDoc, pyStyle); 23 | } 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Test/py/Test.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | 3 | data = 
b'\x78\x9c\x6b\x5f\x9a\x58\x52\x52\x54\x3c\xcd\xd1\xcb\xc7\xd7\xcf\x7f\x59\x49\x7e\x76\x6a\x5e\xf1\x91\x09\xfb\x5d\x5d\x0a\xff\xef\x10\x77\x8c\x67\x80\x80\xb5\x8c\x10\x9a\x89\x01\x15\x40\x85\x19\x8e\x6e\xed\x50\x3f\x66\x95\x60\x9c\x02\xe5\xfb\x5c\xa8\x7d\xbf\xe4\xc8\x93\x42\x74\xf5\x30\x30\xf1\x9c\x93\xf0\x9f\xa0\x44\x9d\x18\x28\x7f\x01\x0e\xf3\xff\x43\xc1\xb2\xe2\x82\xc4\xe4\xd4\xe2\x23\xcc\x8c\x8c\x0c\xcb\x73\x52\xf3\xd2\x4b\x32\x8a\x8f\xb0\x30\x03\x55\x2c\x2f\x2e\x29\xca\xcc\x4b\x2f\x9e\xbc\x24\x27\xbf\x2c\x75\xa1\xe7\xb2\xe4\xfc\xb4\xb4\xd4\xd4\x25\xc9\x89\x25\xc5\x13\x1b\x56\x96\x16\xa7\x16\xc5\xa7\x24\x96\x24\x4e\x3c\xc2\xd8\x00\x00\xb5\x98\x41\x87' 4 | zlib.decompress(data) -------------------------------------------------------------------------------- /SpaCyDotNet/api/Spacy.cs: -------------------------------------------------------------------------------- 1 | using Python.Runtime; 2 | using PythonNetUtils; 3 | using System; 4 | 5 | namespace SpacyDotNet 6 | { 7 | public class Spacy 8 | { 9 | public Spacy() 10 | { 11 | if (!PythonRt.IsInitialized) 12 | { 13 | throw new InvalidOperationException("Initialize runtime before usage"); 14 | } 15 | } 16 | 17 | public Lang Load(string model) 18 | { 19 | using (Py.GIL()) 20 | { 21 | dynamic spacy = Py.Import("spacy"); 22 | var pyString = new PyString(model); 23 | var nlp = spacy.load(pyString); 24 | return new Lang(nlp); 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Test/cs/LinguisticFeatures.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using SpacyDotNet; 4 | 5 | namespace Test 6 | { 7 | static class LinguisticFeatures 8 | { 9 | public static void Run() 10 | { 11 | var spacy = new Spacy(); 12 | var nlp = spacy.Load("en_core_web_sm"); 13 | 14 | var text = "Autonomous cars shift insurance liability toward manufacturers"; 15 | var doc = 
nlp.GetDocument(text); 16 | 17 | foreach (var token in doc.Tokens) 18 | { 19 | var childs = new List(); 20 | token.Children.ForEach(c => childs.Add(c.Text)); 21 | Console.WriteLine($"{token.Text} {token.Dep} {token.Head.Text} [{string.Join(", ", childs)}]"); 22 | } 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /PythonNetUtils/Utils.cs: -------------------------------------------------------------------------------- 1 | using System.Numerics; 2 | 3 | namespace PythonNetUtils 4 | { 5 | public static class Utils 6 | { 7 | public static BigInteger AsBigInteger(this object obj) 8 | { 9 | if (obj is int) 10 | return new BigInteger((int)obj); 11 | if (obj is uint) 12 | return new BigInteger((uint)obj); 13 | if (obj is long) 14 | return new BigInteger((long)obj); 15 | if (obj is ulong) 16 | return new BigInteger((ulong)obj); 17 | if (obj is short) 18 | return new BigInteger((short)obj); 19 | if (obj is ushort) 20 | return new BigInteger((ushort)obj); 21 | 22 | throw new InvalidCastException("Wrong datatype to convert to BigInteger"); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Antonio Miras 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Test/py/spaCy101.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.tokens import Doc 3 | from spacy.vocab import Vocab 4 | 5 | nlp = spacy.load("en_core_web_sm") 6 | doc = nlp("Apple is looking at buying U.K. startup for $1 billion") 7 | 8 | for token in doc: 9 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 10 | token.shape_, token.is_alpha, token.is_stop) 11 | 12 | print("") 13 | for ent in doc.ents: 14 | print(ent.text, ent.start_char, ent.end_char, ent.label_) 15 | 16 | nlp = spacy.load("en_core_web_md") 17 | tokens = nlp("dog cat banana afskfsd") 18 | 19 | print("") 20 | for token in tokens: 21 | print(token.text, token.has_vector, token.vector_norm, token.is_oov) 22 | 23 | tokens = nlp("dog cat banana") 24 | print("") 25 | for token1 in tokens: 26 | for token2 in tokens: 27 | print(token1.text, token2.text, token1.similarity(token2)) 28 | 29 | doc = nlp("I love coffee") 30 | print("") 31 | print(doc.vocab.strings["coffee"]) # 3197928453018144401 32 | print(doc.vocab.strings[3197928453018144401]) # 'coffee' 33 | 34 | print("") 35 | for word in doc: 36 | lexeme = doc.vocab[word.text] 37 | print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, 38 | lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_) 
-------------------------------------------------------------------------------- /Test/py/serialization.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | 3 | from spacy.tokens import DocBin 4 | from spacy.tokens import Doc 5 | from spacy.vocab import Vocab 6 | 7 | def print_doc(adoc): 8 | for word in adoc: 9 | lexeme = adoc.vocab[word.text] 10 | print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, 11 | lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_) 12 | 13 | text = "I love coffee" 14 | 15 | # Load base document 16 | nlp = spacy.load("en_core_web_sm") 17 | doc_base = nlp(text) 18 | print("") 19 | print_doc(doc_base) 20 | 21 | # Serialize document to disk and bytes 22 | doc_base.to_disk("doc.spacy") 23 | doc_base_bytes = doc_base.to_bytes() 24 | 25 | # Serialize using DocBin 26 | docbin_base = DocBin(attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"], store_user_data=True) 27 | docbin_base.add(doc_base) 28 | docbin_base_bytes = docbin_base.to_bytes() 29 | 30 | # Restore document from disk 31 | doc = Doc(Vocab()) 32 | doc.from_disk("doc.spacy") 33 | print("") 34 | print_doc(doc) 35 | 36 | # Restore document from bytes 37 | doc = Doc(Vocab()) 38 | doc.from_bytes(doc_base_bytes) 39 | print("") 40 | print_doc(doc) 41 | 42 | # Restore using DocBin 43 | docbin = DocBin().from_bytes(docbin_base_bytes) 44 | docs = list(docbin.get_docs(nlp.vocab)) 45 | print("") 46 | print_doc(docs[0]) -------------------------------------------------------------------------------- /Test/cs/Program.cs: -------------------------------------------------------------------------------- 1 | using CommandLine; 2 | using PythonNetUtils; 3 | using System; 4 | using System.Collections.Generic; 5 | 6 | namespace Test 7 | { 8 | class Program 9 | { 10 | static void Main(string[] args) 11 | { 12 | Parser.Default.ParseArguments(args) 13 | .WithParsed(RunOptions) 14 | .WithNotParsed(HandleParseError); 15 | } 
16 | 17 | static void RunOptions(CliOptions cliOps) 18 | { 19 | using (new PythonRt(cliOps.Interpreter, cliOps.PathVirtualEnv)) 20 | { 21 | SpaCy101.Run(); 22 | LinguisticFeatures.Run(); 23 | ExampleES.Run(); 24 | Serialization.Run(); 25 | //DisplaCy.Run(); 26 | } 27 | } 28 | 29 | static void HandleParseError(IEnumerable errs) 30 | { 31 | Console.WriteLine("You need to specify virtual environment path"); 32 | } 33 | 34 | public class CliOptions 35 | { 36 | [Option("interpreter", Required = true, HelpText = "Filename for the interpreter. Usually python38.dll on Windows, libpython3.8.so on Linux and libpython3.8.dylib on Mac.")] 37 | public string Interpreter { get; set; } 38 | 39 | [Option("venv", Required = true, HelpText = "Set virtual environment path")] 40 | public string PathVirtualEnv { get; set; } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /PythonNetUtils/ToPy.cs: -------------------------------------------------------------------------------- 1 | using Python.Runtime; 2 | 3 | namespace PythonNetUtils 4 | { 5 | public class ToPy 6 | { 7 | public static dynamic GetBytes(byte[] bytes) 8 | { 9 | using (Py.GIL()) 10 | { 11 | // Seems like ToPython method doesn't convert properly in the case of a byte array 12 | // The lines below throw: 13 | // Python.Runtime.PythonException: 'TypeError : a bytes-like object is required, not 'Byte[]'' 14 | // var pyObj = bytes.ToPython(); 15 | // _doc.from_bytes(pyObj); 16 | 17 | // We need to make use of builtin function bytes() 18 | // Taken from: 19 | // https://github.com/pythonnet/pythonnet/issues/1150 20 | var builtins = Py.Import("builtins"); 21 | var toBytesFunc = builtins.GetAttr("bytes"); 22 | return toBytesFunc.Invoke(bytes.ToPython()); 23 | } 24 | } 25 | 26 | public static dynamic GetList(T[] list) 27 | { 28 | using (Py.GIL()) 29 | { 30 | var pyLst = new PyList(); 31 | if (list != null) 32 | { 33 | var type = typeof(T); 34 | 35 | foreach (var element in 
list) 36 | { 37 | if (type == typeof(string)) 38 | { 39 | var pyElement = new PyString((string)(object)element); 40 | pyLst.Append(pyElement); 41 | } 42 | else 43 | { 44 | throw new NotImplementedException(); 45 | } 46 | } 47 | } 48 | 49 | return pyLst; 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /Test/cs/Serialization.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using SpacyDotNet; 3 | 4 | namespace Test 5 | { 6 | static class Serialization 7 | { 8 | public static void PrintDoc(Doc adoc) 9 | { 10 | foreach (Token word in adoc.Tokens) 11 | { 12 | var lexeme = adoc.Vocab[word.Text]; 13 | Console.WriteLine($@"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}"); 14 | } 15 | } 16 | 17 | public static void Run() 18 | { 19 | var spacy = new Spacy(); 20 | 21 | var text = "I love coffee"; 22 | 23 | // Load base document 24 | var nlp = spacy.Load("en_core_web_sm"); 25 | var docBase = nlp.GetDocument(text); 26 | Console.WriteLine(""); 27 | PrintDoc(docBase); 28 | 29 | // Serialize document to disk and bytes 30 | docBase.ToDisk("doc.spacy"); 31 | var docBaseBytes = docBase.ToBytes(); 32 | 33 | // Serialize using DocBin 34 | var docBinBase = new DocBin(attrs: new string[] { "ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE" }, storeUserData: true); 35 | docBinBase.Add(docBase); 36 | var docBinBaseBytes = docBinBase.ToBytes(); 37 | 38 | // Restore document from disk 39 | var doc = new Doc(new Vocab()); 40 | doc.FromDisk("doc.spacy"); 41 | Console.WriteLine(""); 42 | PrintDoc(doc); 43 | 44 | // Restore document from bytes 45 | doc = new Doc(new Vocab()); 46 | doc.FromBytes(docBaseBytes); 47 | Console.WriteLine(""); 48 | PrintDoc(doc); 49 | 50 | // Restore using DocBin 51 | var docBin = new DocBin(); 52 | docBin.FromBytes(docBinBaseBytes); 53 | var docs = 
docBin.GetDocs(nlp.Vocab); 54 | Console.WriteLine(""); 55 | PrintDoc(docs[0]); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /Test/cs/SpaCy101.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using SpacyDotNet; 3 | 4 | namespace Test 5 | { 6 | static class SpaCy101 7 | { 8 | public static void Run() 9 | { 10 | var spacy = new Spacy(); 11 | 12 | var nlp = spacy.Load("en_core_web_sm"); 13 | var doc = nlp.GetDocument("Apple is looking at buying U.K. startup for $1 billion"); 14 | 15 | foreach (Token token in doc.Tokens) 16 | Console.WriteLine($"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}"); 17 | 18 | Console.WriteLine(""); 19 | foreach (Span ent in doc.Ents) 20 | Console.WriteLine($"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}"); 21 | 22 | nlp = spacy.Load("en_core_web_md"); 23 | var tokens = nlp.GetDocument("dog cat banana afskfsd"); 24 | 25 | Console.WriteLine(""); 26 | foreach (Token token in tokens.Tokens) 27 | Console.WriteLine($"{token.Text} {token.HasVector} {token.VectorNorm}, {token.IsOov}"); 28 | 29 | tokens = nlp.GetDocument("dog cat banana"); 30 | Console.WriteLine(""); 31 | foreach (Token token1 in tokens.Tokens) 32 | { 33 | foreach (Token token2 in tokens.Tokens) 34 | Console.WriteLine($"{token1.Text} {token2.Text} {token1.Similarity(token2) }"); 35 | } 36 | 37 | doc = nlp.GetDocument("I love coffee"); 38 | Console.WriteLine(""); 39 | Console.WriteLine(doc.Vocab.Strings["coffee"]); 40 | Console.WriteLine(doc.Vocab.Strings[3197928453018144401]); 41 | 42 | Console.WriteLine(""); 43 | foreach (Token word in doc.Tokens) 44 | { 45 | var lexeme = doc.Vocab[word.Text]; 46 | Console.WriteLine($@"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}"); 47 | } 48 | } 49 | } 50 | } 51 | 
-------------------------------------------------------------------------------- /SpaCyDotNet/api/StringStore.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Numerics; 4 | using Python.Runtime; 5 | using PythonNetUtils; 6 | 7 | namespace SpacyDotNet 8 | { 9 | public class StringStore 10 | { 11 | private dynamic _pyStringStore; 12 | 13 | private Dictionary _dictStrToNumber; 14 | private Dictionary _dictNumberToStr; 15 | 16 | internal StringStore(dynamic stringStore) 17 | { 18 | _pyStringStore = stringStore; 19 | _dictStrToNumber = new Dictionary(); 20 | _dictNumberToStr = new Dictionary(); 21 | } 22 | 23 | public object this[object key] 24 | { 25 | get 26 | { 27 | var keyStr = key as string; 28 | if (keyStr != null) 29 | { 30 | if (_dictStrToNumber.ContainsKey(keyStr)) 31 | return _dictStrToNumber[keyStr]; 32 | 33 | BigInteger valHash; 34 | using (Py.GIL()) 35 | { 36 | var dynPyNumber = _pyStringStore.__getitem__(key); 37 | var pyNumber = new PyInt(dynPyNumber); 38 | valHash = BigInteger.Parse(pyNumber.ToString()); 39 | _dictStrToNumber.Add(keyStr, valHash); 40 | } 41 | 42 | return valHash; 43 | } 44 | 45 | var keyHash = key.AsBigInteger(); 46 | if (_dictNumberToStr.ContainsKey(keyHash)) 47 | return _dictNumberToStr[keyHash]; 48 | 49 | var valStr = string.Empty; 50 | using (Py.GIL()) 51 | { 52 | var dynPyStr = _pyStringStore.__getitem__(key); 53 | var pyString = new PyString(dynPyStr); 54 | valStr = pyString.ToString(); 55 | _dictNumberToStr.Add(keyHash, valStr); 56 | } 57 | 58 | return valStr; 59 | } 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /SpaCyDotNet/api/Span.cs: -------------------------------------------------------------------------------- 1 | using PythonNetUtils; 2 | using System.Diagnostics; 3 | using System.Xml; 4 | using System.Xml.Schema; 5 | using System.Xml.Serialization; 6 | 7 
| namespace SpacyDotNet 8 | { 9 | public class Span : IXmlSerializable 10 | { 11 | private dynamic _pySpan; 12 | 13 | private string _text; 14 | private string _label; 15 | private int? _startChar; 16 | private int? _endChar; 17 | 18 | public Span() 19 | { 20 | // Needed to use generics 21 | } 22 | 23 | internal Span(dynamic sentence) 24 | { 25 | _pySpan = sentence; 26 | _startChar = null; 27 | _endChar = null; 28 | } 29 | 30 | public string Text => ToClr.GetMember(_pySpan?.text, ref _text); 31 | public string Label => ToClr.GetMember(_pySpan?.label_, ref _label); 32 | public int StartChar => ToClr.GetMember(_pySpan?.start_char, ref _startChar); 33 | public int EndChar => ToClr.GetMember(_pySpan?.end_char, ref _endChar); 34 | 35 | public XmlSchema GetSchema() 36 | { 37 | return null; 38 | } 39 | 40 | public void ReadXml(XmlReader reader) 41 | { 42 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text"); 43 | _text = reader.ReadElementContentAsString(); 44 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Label"); 45 | _label = reader.ReadElementContentAsString(); 46 | 47 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:StartChar"); 48 | _startChar = reader.ReadElementContentAsInt(); 49 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:EndChar"); 50 | _endChar = reader.ReadElementContentAsInt(); 51 | } 52 | 53 | public void WriteXml(XmlWriter writer) 54 | { 55 | // Using the properties is important for the members to be loaded 56 | writer.WriteElementString("Text", Serialization.Namespace, Text); 57 | writer.WriteElementString("Label", Serialization.Namespace, Label); 58 | writer.WriteStartElement("StartChar", Serialization.Namespace); 59 | writer.WriteValue(StartChar); 60 | writer.WriteEndElement(); 61 | writer.WriteStartElement("EndChar", Serialization.Namespace); 62 | writer.WriteValue(EndChar); 63 | writer.WriteEndElement(); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- 
/Test/cs/Test.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.13.35806.99 d17.13 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Test", "Test.csproj", "{FF70F2F9-2826-49EF-851C-5FF1697FCE6A}" 7 | ProjectSection(ProjectDependencies) = postProject 8 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1} = {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1} 9 | EndProjectSection 10 | EndProject 11 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SpaCyDotNet", "..\..\SpaCyDotNet\SpaCyDotNet.csproj", "{CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}" 12 | EndProject 13 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PythonNetUtils", "..\..\PythonNetUtils\PythonNetUtils.csproj", "{50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}" 14 | EndProject 15 | Global 16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 17 | Debug|Any CPU = Debug|Any CPU 18 | Debug|x64 = Debug|x64 19 | Release|Any CPU = Release|Any CPU 20 | Release|x64 = Release|x64 21 | EndGlobalSection 22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 23 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 24 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|Any CPU.Build.0 = Debug|Any CPU 25 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|x64.ActiveCfg = Debug|x64 26 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|x64.Build.0 = Debug|x64 27 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|Any CPU.ActiveCfg = Release|Any CPU 28 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|Any CPU.Build.0 = Release|Any CPU 29 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|x64.ActiveCfg = Release|x64 30 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|x64.Build.0 = Release|x64 31 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 32 | 
{CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|Any CPU.Build.0 = Debug|Any CPU 33 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|x64.ActiveCfg = Debug|x64 34 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|x64.Build.0 = Debug|x64 35 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|Any CPU.ActiveCfg = Release|Any CPU 36 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|Any CPU.Build.0 = Release|Any CPU 37 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|x64.ActiveCfg = Release|x64 38 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|x64.Build.0 = Release|x64 39 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 40 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|Any CPU.Build.0 = Debug|Any CPU 41 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|x64.ActiveCfg = Debug|Any CPU 42 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|x64.Build.0 = Debug|Any CPU 43 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|Any CPU.ActiveCfg = Release|Any CPU 44 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|Any CPU.Build.0 = Release|Any CPU 45 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|x64.ActiveCfg = Release|Any CPU 46 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|x64.Build.0 = Release|Any CPU 47 | EndGlobalSection 48 | GlobalSection(SolutionProperties) = preSolution 49 | HideSolutionNode = FALSE 50 | EndGlobalSection 51 | GlobalSection(ExtensibilityGlobals) = postSolution 52 | SolutionGuid = {A512806F-79F9-44AB-9626-EBAFBD21234D} 53 | EndGlobalSection 54 | EndGlobal 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # SpacyDotNet 5 | # Direct reference to Python Runtime, rather than using a NuGet package 6 | binPyRt/ 7 | SpaCyDotNet.PyRt.csproj 8 | Test.PyRt.csproj 9 | Test.PyRt.sln 10 | veSpacyDotNet*/ 11 | venv/ 12 | 13 | # User-specific files 14 | *.suo 15 | *.v12.suo 16 | *.user 17 | *.sln.docstates 18 | launchSettings.json 19 | *.out 20 | 21 | # Build results 22 | 23 | [Dd]ebug/ 24 | [Rr]elease/ 25 | x64/ 26 | build/ 27 | [Bb]in/ 28 | [Oo]bj/ 29 | 30 | # IDEA 31 | .idea/ 32 | 33 | # Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets 34 | !packages/*/build/ 35 | 36 | # MSTest test Results 37 | [Tt]est[Rr]esult*/ 38 | [Bb]uild[Ll]og.* 39 | 40 | *_i.c 41 | *_p.c 42 | *.ilk 43 | *.meta 44 | *.obj 45 | *.pch 46 | *.pdb 47 | *.pgc 48 | *.pgd 49 | *.rsp 50 | *.sbr 51 | *.tlb 52 | *.tli 53 | *.tlh 54 | *.tmp 55 | *.tmp_proj 56 | *.log 57 | *.vspscc 58 | *.vssscc 59 | .builds 60 | *.pidb 61 | *.log 62 | *.scc 63 | *.odt# 64 | 65 | # Visual C++ cache files 66 | ipch/ 67 | *.aps 68 | *.ncb 69 | *.opensdf 70 | *.sdf 71 | *.cachefile 72 | 73 | # Visual Studio profiler 74 | *.psess 75 | *.vsp 76 | *.vspx 77 | 78 | # Guidance Automation Toolkit 79 | *.gpState 80 | 81 | # ReSharper is a .NET coding add-in 82 | _ReSharper*/ 83 | *.[Rr]e[Ss]harper 84 | 85 | # TeamCity is a build add-in 86 | _TeamCity* 87 | 88 | # DotCover is a Code Coverage Tool 89 | *.dotCover 90 | 91 | # NCrunch 92 | *.ncrunch* 93 | .*crunch*.local.xml 94 | 95 | # Installshield output folder 96 | [Ee]xpress/ 97 | 98 | # DocProject is a documentation generator add-in 99 | DocProject/buildhelp/ 100 | DocProject/Help/*.HxT 101 | DocProject/Help/*.HxC 102 | DocProject/Help/*.hhc 103 | DocProject/Help/*.hhk 104 | DocProject/Help/*.hhp 105 | DocProject/Help/Html2 106 | DocProject/Help/html 107 | 108 | # Click-Once directory 109 | publish/ 110 | 111 | # Publish Web Output 112 | *.Publish.xml 113 | 114 | # NuGet Packages Directory 115 | ## TODO: If you have NuGet Package 
Restore enabled, uncomment the next line 116 | packages/ 117 | 118 | # Windows Azure Build Output 119 | csx 120 | *.build.csdef 121 | 122 | # Windows Store app package directory 123 | AppPackages/ 124 | 125 | # Others 126 | *.Cache 127 | ClientBin/ 128 | [Ss]tyle[Cc]op.* 129 | ~$* 130 | *~ 131 | *.dbmdl 132 | *.[Pp]ublish.xml 133 | *.pfx 134 | *.publishsettings 135 | .vs/ 136 | 137 | # RIA/Silverlight projects 138 | Generated_Code/ 139 | 140 | # Backup & report files from converting an old project file to a newer 141 | # Visual Studio version. Backup files are not needed, because we have git ;-) 142 | _UpgradeReport_Files/ 143 | Backup*/ 144 | UpgradeLog*.XML 145 | UpgradeLog*.htm 146 | 147 | # SQL Server files 148 | App_Data/*.mdf 149 | App_Data/*.ldf 150 | 151 | 152 | #LightSwitch generated files 153 | GeneratedArtifacts/ 154 | _Pvt_Extensions/ 155 | ModelManifest.xml 156 | 157 | # ========================= 158 | # Windows detritus 159 | # ========================= 160 | 161 | # Windows image file caches 162 | Thumbs.db 163 | ehthumbs.db 164 | 165 | # Folder config file 166 | Desktop.ini 167 | 168 | # Recycle Bin used on file shares 169 | $RECYCLE.BIN/ 170 | 171 | # Mac desktop service store files 172 | .DS_Store 173 | -------------------------------------------------------------------------------- /Test/cs/ExampleES.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using SpacyDotNet; 3 | 4 | namespace Test 5 | { 6 | static class ExampleES 7 | { 8 | public static void Run() 9 | { 10 | var spacy = new Spacy(); 11 | var nlp = spacy.Load("es_core_news_sm"); 12 | 13 | var separator = "____________________________________________________________________________"; 14 | var text = @"Cuando Sebastian Thrun empezó a trabajar en coches de conducción autónoma, en 2007, para "; 15 | text += "Google, muy poca gente fuera de la empresa le tomó en serio. 
“Podría contaros como CEOs muy "; 16 | text += "veteranos de las empresas automotrices más grandes de América me daban la mano para después "; 17 | text += "ignorarme porque no merecía la pena hablar conmigo”, comentaba Thrun, en una entrevista a Recode "; 18 | text += "a principios de semana"; 19 | 20 | var doc = nlp.GetDocument(text); 21 | 22 | Console.WriteLine("Pipeline:"); 23 | Console.WriteLine(string.Join(",", nlp.PipeNames)); 24 | Console.WriteLine(separator); 25 | 26 | Console.WriteLine("Tokenization"); 27 | Console.Write("["); 28 | foreach (var token in doc.Tokens) 29 | Console.Write("'" + token.Text + "', "); 30 | Console.WriteLine("\b\b]"); 31 | Console.WriteLine(separator); 32 | 33 | Console.WriteLine("Pos"); 34 | Console.Write("["); 35 | foreach (var token in doc.Tokens) 36 | Console.Write("'" + token.PoS + "', "); 37 | Console.WriteLine("\b\b]"); 38 | Console.WriteLine(separator); 39 | 40 | Console.WriteLine("PoS[0]:"); 41 | var token0 = doc.Tokens[0]; 42 | Console.WriteLine("Coarse-grained POS tag " + token0.PoS); 43 | Console.WriteLine("Fine-grained POS tag " + token0.Tag); 44 | Console.WriteLine("Word shape " + token0.Shape); 45 | Console.WriteLine("Alphabetic characters? " + token0.IsAlpha); 46 | Console.WriteLine("Punctuation mark? " + token0.IsPunct); 47 | Console.WriteLine("Digit? " + token0.IsDigit); 48 | Console.WriteLine("Like a number? " + token0.LikeNum); 49 | Console.WriteLine("Like an email address? 
" + token0.LikeEMail); 50 | Console.WriteLine(separator); 51 | 52 | Console.WriteLine("Lemmatization:"); 53 | Console.Write("["); 54 | foreach (var token in doc.Tokens) 55 | Console.Write("'" + token.Lemma + "', "); 56 | Console.WriteLine("\b\b]"); 57 | Console.WriteLine(separator); 58 | 59 | Console.WriteLine("Sentences:"); 60 | Console.Write("["); 61 | foreach (var sentence in doc.Sents) 62 | Console.Write("'" + sentence.Text + "', "); 63 | Console.WriteLine("\b\b]"); 64 | Console.WriteLine(separator); 65 | 66 | Console.WriteLine("Noun Phrases:"); 67 | Console.Write("["); 68 | foreach (var nounChunk in doc.NounChunks) 69 | Console.Write("'" + nounChunk.Text + "', "); 70 | Console.WriteLine("\b\b]"); 71 | Console.WriteLine(separator); 72 | 73 | Console.WriteLine("Entities (Named entities, phrases and concepts):"); 74 | foreach (var entity in doc.Ents) 75 | Console.WriteLine("Entity: " + entity.Text + "\tLabel: " + entity.Label); 76 | Console.WriteLine(separator); 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /PythonNetUtils/PythonRt.cs: -------------------------------------------------------------------------------- 1 | using Python.Runtime; 2 | using System.Diagnostics; 3 | 4 | namespace PythonNetUtils 5 | { 6 | public class PythonRt : IDisposable 7 | { 8 | private static bool _initialized = false; 9 | 10 | public PythonRt(string interpreter, string pathVirtualEnv) 11 | { 12 | if (_initialized) 13 | { 14 | throw new InvalidOperationException("Python runtime already initialized"); 15 | } 16 | 17 | Init(interpreter, pathVirtualEnv); 18 | _initialized = true; 19 | } 20 | 21 | public static bool IsInitialized => _initialized; 22 | 23 | /// 24 | /// Python.NET project provides a WIKI to initialize the library using virtual environments. 
/// <summary>
/// Initializes Python.NET against a virtual environment.
/// Python.NET project provides a WIKI to initialize the library using virtual environments. See:
/// https://github.com/pythonnet/pythonnet/wiki/Using-Python.NET-with-Virtual-Environments
/// Sadly, I couldn't make the code provided in the official wiki work properly, so I created my own initialization.
/// I've experienced all problems below
/// 1) Inability to locate python interpreter
/// 2) Inability to load Python system libraries
/// 3) Inability to load Python virtual env libraries (site-packages)
/// This method aims to solve both 2) and 3) and is an ugly HACK
/// Using the regular workflow everything is fine; activate virtual environment and run the CPython interpreter. Only Python.NET fails.
/// Fixing Python.NET itself would be better but for now, I'm just going to copy sys.path
/// </summary>
/// <param name="interpreter">Value assigned to <c>Runtime.PythonDLL</c> (the Python shared library)</param>
/// <param name="pathVirtualEnv">Path to virtual environment</param>
/// <exception cref="Exception">
/// The virtual environment path is empty or missing, the venv interpreter could not be started,
/// or it printed an empty sys.path.
/// </exception>
public void Init(string interpreter, string pathVirtualEnv)
{
    // See CliOptions.Interpreter
    Runtime.PythonDLL = interpreter;

    if (string.IsNullOrEmpty(pathVirtualEnv))
        throw new Exception("You need to define PathVirtualEnv before using the wrapper");
    if (!Directory.Exists(pathVirtualEnv))
        throw new Exception("The directory specified for PathVirtualEnv is invalid");

    // Executables live under "Scripts" on Windows and "bin" elsewhere.
    var scriptsFolder = Environment.OSVersion.Platform == PlatformID.Win32NT ? "Scripts" : "bin";
    var pathVeScripts = Path.Combine(pathVirtualEnv, scriptsFolder);

    // NOTE: this replaces (does not extend) PATH for the current process, as the original code did.
    Environment.SetEnvironmentVariable("PATH", pathVeScripts, EnvironmentVariableTarget.Process);

    string pythonPath;

    // Ask the interpreter of the virtual environment for its sys.path
    using (var proc = new Process())
    {
        proc.StartInfo.FileName = Path.Combine(pathVeScripts, "python");
        proc.StartInfo.Arguments = $"-c \"import sys; print('{Path.PathSeparator}'.join(sys.path))\"";
        proc.StartInfo.UseShellExecute = false;     // Required for stream redirection
        proc.StartInfo.RedirectStandardOutput = true;
        if (!proc.Start())
            throw new Exception("Couldn't initialize Python in virtual environment");

        // Drain stdout BEFORE waiting: waiting first can deadlock if the child
        // blocks on a full pipe that nobody is reading.
        pythonPath = proc.StandardOutput.ReadToEnd();
        proc.WaitForExit();
    }

    pythonPath = pythonPath.Replace(Environment.NewLine, "");
    if (string.IsNullOrEmpty(pythonPath))
        throw new Exception("Couldn't initialize Python.NET");

    Environment.SetEnvironmentVariable("PYTHONPATH", pythonPath, EnvironmentVariableTarget.Process);
    PythonEngine.PythonPath = pythonPath;

    PythonEngine.Initialize();
}
/// <summary>
/// Restores the pipeline from XML: a Base64 "PyObj" element handed to the Python
/// object's from_bytes, followed by a comma-separated "PipeNames" element.
/// </summary>
/// <param name="reader">Reader expected to be positioned on the "PyObj" element.</param>
public void ReadXml(XmlReader reader)
{
    Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
    var bytesB64 = reader.ReadElementContentAsString();
    var bytes = Convert.FromBase64String(bytesB64);
    var pyBytes = ToPy.GetBytes(bytes);
    using (Py.GIL())
    {
        _pyLang.from_bytes(pyBytes);
    }

    Debug.Assert(reader.Name == $"{Serialization.Prefix}:PipeNames");
    var pipeNames = reader.ReadElementContentAsString();

    // "".Split(',') yields a single empty entry; an empty pipeline must stay empty
    _pipeNames = pipeNames.Length == 0
        ? new List<string>()
        : pipeNames.Split(',').ToList();

    // TODO: Yet to debug. It's not being used so far
    _meta = new PipelineMeta(this);
}

/// <summary>
/// Writes the pipeline as XML: the Python object's to_bytes output as Base64 in "PyObj",
/// then the pipe names joined by commas in "PipeNames".
/// </summary>
/// <param name="writer">Destination writer.</param>
public void WriteXml(XmlWriter writer)
{
    // XmlWriter.WriteElementString takes (localName, ns, value). The namespace and the
    // value used to be passed the other way round, so ReadXml could never read them back.
    using (Py.GIL())
    {
        byte[] pyObj = ToClr.GetBytes(_pyLang.to_bytes());
        var pyObjB64 = Convert.ToBase64String(pyObj);
        writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64);
    }

    // Using the property is important for the members to be loaded
    writer.WriteElementString("PipeNames", Serialization.Namespace, string.Join(',', PipeNames));
}

/// <summary>
/// Lazily populated view over the Python pipeline's "meta" mapping.
/// Values are fetched on first access and cached in the dictionary.
/// </summary>
public class PipelineMeta : Dictionary<string, object>
{
    private Lang _lang;

    public PipelineMeta(Lang lang)
    {
        _lang = lang;
    }

    /// <summary>
    /// Returns the cached value for <paramref name="key"/>, or reads it from the Python
    /// object's meta mapping and caches it. Returns null when there is no Python object.
    /// </summary>
    /// <exception cref="NotImplementedException">The Python value is not a string.</exception>
    public new object this[string key]
    {
        get
        {
            if (TryGetValue(key, out var cached))
                return cached;

            if (_lang.PyLang == null)
                return null;

            object ret = null;
            using (Py.GIL())
            {
                var pyKeyStr = new PyString(key);
                var pyObj = (PyObject)_lang.PyLang.meta.__getitem__(pyKeyStr);

                // Only string values are supported so far
                if (!PyString.IsStringType(pyObj))
                    throw new NotImplementedException();

                var pyValStr = new PyString(pyObj);
                ret = pyValStr.ToString();
                Add(key, ret);
            }

            return ret;
        }
    }
}
/// <summary>
/// Wrapper around a spaCy Lexeme. Attribute values are read from the Python object
/// on first access and cached in the private fields.
/// </summary>
public class Lexeme : IXmlSerializable
{
    private dynamic _pyLexeme;

    private string _text;
    private string _shape;
    private string _prefix;
    private string _suffix;
    private string _lang;

    private BigInteger? _orth;

    private bool? _isAlpha;
    private bool? _isDigit;
    private bool? _isTitle;

    /// <summary>Creates an empty lexeme with no Python object attached.</summary>
    public Lexeme()
    {
    }

    /// <summary>Wraps an existing Python lexeme; every cached member starts unset.</summary>
    internal Lexeme(dynamic lexeme)
    {
        _pyLexeme = lexeme;
        _text = null;
        _shape = null;
        _prefix = null;
        _suffix = null;     // Was the only cached member not reset here
        _lang = null;

        _orth = null;

        _isAlpha = null;
        _isDigit = null;
        _isTitle = null;
    }

    // Each property maps to the Python attribute named in its expression
    public string Text => ToClr.GetMember(_pyLexeme?.text, ref _text);
    public string Shape => ToClr.GetMember(_pyLexeme?.shape_, ref _shape);
    public string Prefix => ToClr.GetMember(_pyLexeme?.prefix_, ref _prefix);
    public string Suffix => ToClr.GetMember(_pyLexeme?.suffix_, ref _suffix);
    public string Lang => ToClr.GetMember(_pyLexeme?.lang_, ref _lang);
    public BigInteger Orth => ToClr.GetMember(_pyLexeme?.orth, ref _orth);
    public bool IsAlpha => ToClr.GetMember(_pyLexeme?.is_alpha, ref _isAlpha);
    public bool IsDigit => ToClr.GetMember(_pyLexeme?.is_digit, ref _isDigit);
    public bool IsTitle => ToClr.GetMember(_pyLexeme?.is_title, ref _isTitle);

    public XmlSchema GetSchema()
    {
        return null;
    }

    /// <summary>
    /// Reads the elements written by <see cref="WriteXml"/>, in the same order:
    /// PyObj (Base64), Text, Shape, Prefix, Suffix, Lang, Orth, IsAlpha, IsDigit, IsTitle.
    /// </summary>
    /// <param name="reader">Reader expected to be positioned on the "PyObj" element.</param>
    public void ReadXml(XmlReader reader)
    {
        // TODO: Yet to debug. It's not being used so far
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
        var bytesB64 = reader.ReadElementContentAsString();
        var bytes = Convert.FromBase64String(bytesB64);
        var pyBytes = ToPy.GetBytes(bytes);

        using (Py.GIL())
        {
            _pyLexeme.from_bytes(pyBytes);
        }

        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text");
        _text = reader.ReadElementContentAsString();
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Shape");
        _shape = reader.ReadElementContentAsString();
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Prefix");
        _prefix = reader.ReadElementContentAsString();
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Suffix");
        _suffix = reader.ReadElementContentAsString();
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Lang");
        _lang = reader.ReadElementContentAsString();

        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Orth");
        var orth = reader.ReadElementContentAsString();
        _orth = BigInteger.Parse(orth);

        Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsAlpha");
        _isAlpha = reader.ReadElementContentAsBoolean();
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsDigit");
        _isDigit = reader.ReadElementContentAsBoolean();
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsTitle");
        _isTitle = reader.ReadElementContentAsBoolean();
    }

    /// <summary>
    /// Writes the Python object (Base64) followed by every cached attribute.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    public void WriteXml(XmlWriter writer)
    {
        // XmlWriter.WriteElementString takes (localName, ns, value). The namespace and the
        // value used to be swapped, and the raw byte[] was written without the Base64
        // encoding that ReadXml decodes.
        using (Py.GIL())
        {
            byte[] pyObj = ToClr.GetBytes(_pyLexeme.to_bytes());
            var pyObjB64 = Convert.ToBase64String(pyObj);
            writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64);
        }

        // Using the property is important for the members to be loaded
        writer.WriteElementString("Text", Serialization.Namespace, Text);
        writer.WriteElementString("Shape", Serialization.Namespace, Shape);
        writer.WriteElementString("Prefix", Serialization.Namespace, Prefix);
        writer.WriteElementString("Suffix", Serialization.Namespace, Suffix);
        writer.WriteElementString("Lang", Serialization.Namespace, Lang);

        writer.WriteElementString("Orth", Serialization.Namespace, Orth.ToString());

        writer.WriteStartElement("IsAlpha", Serialization.Namespace);
        writer.WriteValue(IsAlpha);
        writer.WriteEndElement();
        writer.WriteStartElement("IsDigit", Serialization.Namespace);
        writer.WriteValue(IsDigit);
        writer.WriteEndElement();
        writer.WriteStartElement("IsTitle", Serialization.Namespace);
        writer.WriteValue(IsTitle);
        writer.WriteEndElement();
    }
}
/// <summary>
/// String store of this vocabulary. The wrapper is created from the Python
/// object's "strings" attribute on first access and reused afterwards.
/// </summary>
public StringStore Strings
{
    get
    {
        if (_stringStore == null)
        {
            using (Py.GIL())
            {
                var pyStrings = PyVocab.strings;
                _stringStore = new StringStore(pyStrings);
            }
        }

        return _stringStore;
    }
}

/// <summary>Saves the vocabulary through the Python object's to_disk.</summary>
/// <param name="path">Destination path handed to Python.</param>
/// <exception cref="NotImplementedException">Selected serialization mode is not Spacy.</exception>
public void ToDisk(string path)
{
    var spacyMode = Serialization.Selected == Serialization.Mode.Spacy;
    if (!spacyMode)
        throw new NotImplementedException();

    using (Py.GIL())
    {
        var target = new PyString(path);
        PyVocab.to_disk(target);
    }
}

/// <summary>Loads the vocabulary through the Python object's from_disk.</summary>
/// <param name="path">Source path handed to Python.</param>
/// <exception cref="NotImplementedException">Selected serialization mode is not Spacy.</exception>
public void FromDisk(string path)
{
    var spacyMode = Serialization.Selected == Serialization.Mode.Spacy;
    if (!spacyMode)
        throw new NotImplementedException();

    using (Py.GIL())
    {
        var source = new PyString(path);
        PyVocab.from_disk(source);
    }
}

/// <summary>No schema is provided for this type.</summary>
public XmlSchema GetSchema() => null;
/// <summary>
/// Helpers that convert Python.NET objects into CLR values, lists and byte arrays.
/// </summary>
public static class ToClr
{
    /// <summary>
    /// Returns <paramref name="member"/> when it is already set; otherwise converts the
    /// Python object with <see cref="Get{T}"/>, stores the result in the member and returns it.
    /// </summary>
    public static T GetMember<T>(dynamic dynBoolPyObj, ref T member)
    {
        if (member != null)
        {
            return member;
        }

        member = Get<T>(dynBoolPyObj);
        return member;
    }

    /// <summary>
    /// Converts a Python basic value to <typeparamref name="T"/>. Supported targets:
    /// bool, string, double, int, long and BigInteger (value types also in nullable form).
    /// </summary>
    /// <exception cref="NotImplementedException">T is none of the supported types.</exception>
    public static T Get<T>(dynamic dynPyBasicType)
    {
        using (Py.GIL())
        {
            var type = typeof(T);

            if (type == typeof(bool) || type == typeof(bool?))
            {
                // Booleans are read as integers: anything different from zero is true
                var boolPyInt = new PyInt(dynPyBasicType);
                T boolVar = (T)(object)(boolPyInt.ToInt32() != 0);
                return boolVar;
            }
            else if (type == typeof(string))
            {
                var depPy = new PyString(dynPyBasicType);
                T stringVar = (T)(object)depPy.ToString();
                return stringVar;
            }
            else if (type == typeof(double) || type == typeof(double?))
            {
                var dynDoublePyFloat = PyFloat.AsFloat(dynPyBasicType);
                T doubleVar = (T)(object)dynDoublePyFloat.As<double>();
                return doubleVar;
            }
            else if (type == typeof(int) || type == typeof(int?))
            {
                var intPy = new PyInt(dynPyBasicType);
                T intVar = (T)(object)intPy.ToInt32();
                return intVar;
            }
            else if (type == typeof(long) || type == typeof(long?))
            {
                var longPy = new PyInt(dynPyBasicType);
                T longVar = (T)(object)longPy.ToInt64();
                return longVar;
            }
            else if (type == typeof(BigInteger) || type == typeof(BigInteger?))
            {
                var pyInt = new PyInt(dynPyBasicType);

                // This is inefficient, and should be reworked in the future.
                // Python's text form is culture-independent, so it is parsed with the
                // invariant culture rather than the current one (whose negative sign may differ).
                var str = pyInt.ToString();
                T bigInt = (T)(object)BigInteger.Parse(str, CultureInfo.InvariantCulture);
                return bigInt;
            }
            else
            {
                throw new NotImplementedException();
            }
        }
    }

    /// <summary>Cached variant of <see cref="GetListFromGenerator{T}"/>.</summary>
    public static List<T> GetListFromGeneratorMember<T>(dynamic pyGenerator, ref List<T> lstMember) where T : new()
    {
        if (lstMember != null)
        {
            return lstMember;
        }

        lstMember = GetListFromGenerator<T>(pyGenerator);
        return lstMember;
    }

    /// <summary>
    /// Materializes a Python generator with builtins.list and wraps every element
    /// as described in <see cref="GetListFromCollection{T}"/>.
    /// </summary>
    public static List<T> GetListFromGenerator<T>(dynamic pyGenerator) where T : new()
    {
        dynamic list;

        using (Py.GIL())
        {
            dynamic builtins = Py.Import("builtins");
            list = builtins.list(pyGenerator);
        }

        return GetListFromCollection<T>(list);
    }

    /// <summary>Cached variant of <see cref="GetListFromCollection{T}"/>.</summary>
    public static List<T> GetListFromCollectionMember<T>(dynamic pyCollection, ref List<T> lstMember) where T : new()
    {
        if (lstMember != null)
        {
            return lstMember;
        }

        lstMember = GetListFromCollection<T>(pyCollection);
        return lstMember;
    }

    /// <summary>
    /// Builds a list of <typeparamref name="T"/> from an indexable Python collection. Each
    /// element is passed to a single-argument instance constructor of T (public or non-public).
    /// </summary>
    public static List<T> GetListFromCollection<T>(dynamic pyCollection) where T : new()
    {
        var lstVar = new List<T>();

        // Invariant for every element, so resolved once outside the loop
        const BindingFlags flags = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance;
        Binder binder = null;
        CultureInfo culture = null;

        using (Py.GIL())
        {
            dynamic builtins = Py.Import("builtins");
            var pyCount = new PyInt(builtins.len(pyCollection));
            var count = pyCount.ToInt32();

            for (var i = 0; i < count; i++)
            {
                var element = pyCollection[i];
                var parameters = new object[] { element };

                lstVar.Add((T)Activator.CreateInstance(typeof(T), flags, binder, parameters, culture));
            }

            return lstVar;
        }
    }

    /// <summary>Cached variant of <see cref="GetListFromList{T}"/>.</summary>
    public static List<T> GetListFromListMember<T>(dynamic pyList, ref List<T> lstMember)
    {
        if (lstMember != null)
        {
            return lstMember;
        }

        lstMember = GetListFromList<T>(pyList);
        return lstMember;
    }

    /// <summary>
    /// Converts a Python list of basic values into a CLR list. Only string elements are
    /// supported; for any other T a non-empty list asserts in debug builds and yields null.
    /// </summary>
    public static List<T> GetListFromList<T>(dynamic pyList)
    {
        var lstVar = new List<T>();

        using (Py.GIL())
        {
            dynamic builtins = Py.Import("builtins");
            var pyCount = new PyInt(builtins.len(pyList));
            var count = pyCount.ToInt32();

            for (var i = 0; i < count; i++)
            {
                var element = pyList[i];

                object created = null;
                if (typeof(T) == typeof(string))
                {
                    var pyObj = new PyString(element);
                    created = pyObj.ToString();
                }
                else
                {
                    Debug.Assert(false);
                    return null;
                }

                lstVar.Add((T)created);
            }

            return lstVar;
        }
    }

    /// <summary>
    /// Copies the contents of a Python object exposing the buffer protocol into a new byte array.
    /// This method does not acquire the GIL itself.
    /// </summary>
    public static byte[] GetBytes(dynamic dpyBytes)
    {
        var pyBytes = (PyObject)dpyBytes;

        // PyBuffer is IDisposable: release the buffer view once the bytes are copied
        using (var pyBuff = pyBytes.GetBuffer())
        {
            var buff = new byte[pyBuff.Length];
            pyBuff.Read(buff, 0, (int)pyBuff.Length, 0);
            return buff;
        }
    }
}
/README.md: -------------------------------------------------------------------------------- 1 | # SpacyDotNet 2 | 3 | SpacyDotNet is a .NET wrapper for the natural language library [spaCy](https://spacy.io/) 4 | 5 | ## Project scope and limitations 6 | 7 | This project is not meant to be a complete and exhaustive implementation of all spaCy features and [APIs](https://spacy.io/api). Although it should be enough for basic tasks, think of it as a starting point, if you need to build a complex project using spaCy in .NET 8 | 9 | Most of the basic features in _Spacy101 section_ of the docs are available. All **Containers** classes are present (_Doc_, _DocBin_, _Token_, _Span_ and _Lexeme_) with their basic properties/methods running. Also _Vocab_ and _StringStore_ in a limited form. 10 | 11 | Nevertheless any developer should be ready to add the missing properties or classes in a straightforward manner. 12 | 13 | ## Requirements 14 | 15 | This project relies on [Python.NET](http://pythonnet.github.io/) to interop with spaCy, which is written in Python/Cython. 16 | 17 | It's been tested under **Windows 11** and **Ubuntu Linux 20.04**, using the following environment 18 | 19 | - .NET 9.0 / .NET Core 3.1 20 | - spaCy 3.8.5 21 | - Python 3.12 22 | - Python.NET: Latest official NuGet: [3.0.5](https://www.nuget.org/api/v2/package/pythonnet/3.0.5) 23 | 24 | but it might work under different conditions: 25 | 26 | - It was previously tested on 27 | - .NET Core 3.1 28 | - spaCy 3.0.5 29 | - Python 3.8 30 | - Python.NET release: [3.0.0-preview2021-04-03](https://www.nuget.org/packages/pythonnet/3.0.0-preview2021-04-03) 31 | - It should work with spaCy 2.3.5 and any other spaCy version that changes only its minor/patch version number 32 | 33 | Python.NET has been tested with Python releases 3.7 to 3.13 34 | 35 | ## Setup 36 | 37 | ### 1) Create a Python virtual environment and install spaCy 38 | 39 | It's advised to create a virtual environment to install spaCy.
Depending on the host system this is done in different ways. The spaCy official [installation guide](https://spacy.io/usage) is fine 40 | 41 | To run the examples, we'll also need to install the corresponding language package (_es_core_news_sm_) as shown in the guide. 42 | 43 | ### 2) Check for Python shared library 44 | 45 | Python.NET makes use of Python as a shared library. Sadly, it seems the shared library is not copied with recent versions of _virtualenv_ and it's not even distributed in some flavours of Linux/Python >= 3.8 46 | 47 | While I don't understand the rationale behind those changes, we should check the following: 48 | 49 | **Windows** 50 | 51 | Check whether _python312.dll_ is located under _\Scripts_ folder. Otherwise, go to your main Python folder and copy all dlls. In my case: _python3.dll_, _python312.dll_ and the _vcruntime140.dll_ 52 | 53 | **Linux** 54 | 55 | Check whether a libpython shared object is located under _/bin_ folder. 56 | 57 | If not, we first need to check if the shared object is present on our system. [find_libpython](https://pypi.org/project/find-libpython/) can help with this task. 58 | 59 | If the library is nowhere to be found, it's likely that installing _python-dev_ package with the package manager of your favorite distribution will place the file in your system. 60 | 61 | Once we locate the library, drop it to the _bin_ folder. In my case, the file is named _libpython3.12.so.1.0_ 62 | 63 | ## Usage 64 | 65 | SpaCyDotNet is built to be used as a library. However I provide an example project as a CLI program. 66 | 67 | ### 1) Compile and Build 68 | 69 | If using the CLI to run .NET, (Linux), we should simply browse to _Test/cs_ folder and compile the project with `dotnet build`. Under Visual Studio, just load _Test.sln_ solution 70 | 71 | ### 2) Run the project 72 | 73 | The program expects two parameters 74 | 75 | - **interpreter:** Name of Python shared library file.
Usually _python312.dll_ on Windows, _libpython3.12.so_ on Linux and _libpython3.12.dylib_ on Mac 76 | - **venv:** Location of the virtual environment created with a compatible python and spaCy versions 77 | 78 | Run the example with `dotnet run --interpreter --venv ` or if using Visual Studio, set the command line in _Project => Properties => Debug => Application arguments_ 79 | 80 | In my case: 81 | 82 | **Linux** 83 | 84 | dotnet run --interpreter libpython3.12.so.1.0 --venv /home/user/Dev/venvSpaCyPy312 85 | 86 | **Windows** 87 | 88 | dotnet run --interpreter python312.dll --venv C:\Users\user\Dev\venvSpaCyPy312 89 | 90 | ## Code comparison 91 | 92 | I've tried to mimic spaCy API as much as possible, considering the different nature of both C# and Python languages 93 | 94 | ### C# SpacyDotNet code 95 | 96 | ```c# 97 | var nlp = spacy.Load("en_core_web_sm"); 98 | var doc = nlp.GetDocument("Apple is looking at buying U.K. startup for $1 billion"); 99 | 100 | foreach (Token token in doc.Tokens) 101 | Console.WriteLine($"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}"); 102 | 103 | Console.WriteLine(""); 104 | foreach (Span ent in doc.Ents) 105 | Console.WriteLine($"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}"); 106 | 107 | nlp = spacy.Load("en_core_web_md"); 108 | var tokens = nlp.GetDocument("dog cat banana afskfsd"); 109 | 110 | Console.WriteLine(""); 111 | foreach (Token token in tokens.Tokens) 112 | Console.WriteLine($"{token.Text} {token.HasVector} {token.VectorNorm}, {token.IsOov}"); 113 | 114 | tokens = nlp.GetDocument("dog cat banana"); 115 | Console.WriteLine(""); 116 | foreach (Token token1 in tokens.Tokens) 117 | { 118 | foreach (Token token2 in tokens.Tokens) 119 | Console.WriteLine($"{token1.Text} {token2.Text} {token1.Similarity(token2) }"); 120 | } 121 | 122 | doc = nlp.GetDocument("I love coffee"); 123 | Console.WriteLine(""); 124 | Console.WriteLine(doc.Vocab.Strings["coffee"]); 
125 | Console.WriteLine(doc.Vocab.Strings[3197928453018144401]); 126 | 127 | Console.WriteLine(""); 128 | foreach (Token word in doc.Tokens) 129 | { 130 | var lexeme = doc.Vocab[word.Text]; 131 | Console.WriteLine($@"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} 132 | {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}"); 133 | } 134 | ``` 135 | ### Python spaCy code 136 | 137 | ```python 138 | nlp = spacy.load("en_core_web_sm") 139 | doc = nlp("Apple is looking at buying U.K. startup for $1 billion") 140 | 141 | for token in doc: 142 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 143 | token.shape_, token.is_alpha, token.is_stop) 144 | 145 | print("") 146 | for ent in doc.ents: 147 | print(ent.text, ent.start_char, ent.end_char, ent.label_) 148 | 149 | nlp = spacy.load("en_core_web_md") 150 | tokens = nlp("dog cat banana afskfsd") 151 | 152 | print("") 153 | for token in tokens: 154 | print(token.text, token.has_vector, token.vector_norm, token.is_oov) 155 | 156 | tokens = nlp("dog cat banana") 157 | print("") 158 | for token1 in tokens: 159 | for token2 in tokens: 160 | print(token1.text, token2.text, token1.similarity(token2)) 161 | 162 | doc = nlp("I love coffee") 163 | print("") 164 | print(doc.vocab.strings["coffee"]) # 3197928453018144401 165 | print(doc.vocab.strings[3197928453018144401]) # 'coffee' 166 | 167 | print("") 168 | for word in doc: 169 | lexeme = doc.vocab[word.text] 170 | print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, 171 | lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_) 172 | ``` 173 | ### Output 174 | 175 | ![Output](https://github.com/AMArostegui/SpacyDotNet/blob/master/Output.png) 176 | -------------------------------------------------------------------------------- /SpaCyDotNet/api/DocBin.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 
/// <summary>
/// Restores this DocBin from a byte array. In Spacy mode the bytes go to the Python
/// object's from_bytes; otherwise they are parsed as the XML produced by ToBytes.
/// </summary>
/// <param name="bytes">Serialized content.</param>
public void FromBytes(byte[] bytes)
{
    if (Serialization.Selected == Serialization.Mode.Spacy)
    {
        var pyObj = ToPy.GetBytes(bytes);
        using (Py.GIL())
        {
            _pyDocBin.from_bytes(pyObj);
        }
    }
    else
    {
        // Stream and reader are now disposed, matching the write path
        using var stream = new MemoryStream(bytes);

        var settings = new XmlReaderSettings();
        settings.IgnoreComments = true;
        settings.IgnoreWhitespace = true;
        using var reader = XmlReader.Create(stream, settings);

        var docBin = new DocBin();
        docBin.ReadXml(reader);
        Copy(docBin);
    }
}

/// <summary>
/// Saves this DocBin to a file. In Spacy mode the Python object's to_disk is used;
/// otherwise the file is created (or overwritten) with indented XML.
/// </summary>
/// <param name="pathFile">Destination file.</param>
public void ToDisk(string pathFile)
{
    if (Serialization.Selected == Serialization.Mode.Spacy)
    {
        using (Py.GIL())
        {
            var pyPath = new PyString(pathFile);
            _pyDocBin.to_disk(pyPath);
        }
    }
    else
    {
        using var stream = new FileStream(pathFile, FileMode.Create);

        var settings = new XmlWriterSettings();
        settings.Indent = true;
        using var writer = XmlWriter.Create(stream, settings);

        WriteXml(writer);
    }
}

/// <summary>
/// Restores this DocBin from a file. In Spacy mode the Python object's from_disk is used;
/// otherwise the file is parsed as the XML produced by ToDisk.
/// </summary>
/// <param name="pathFile">Source file.</param>
public void FromDisk(string pathFile)
{
    if (Serialization.Selected == Serialization.Mode.Spacy)
    {
        using (Py.GIL())
        {
            var pyPath = new PyString(pathFile);
            _pyDocBin.from_disk(pyPath);
        }
    }
    else
    {
        using var stream = new FileStream(pathFile, FileMode.Open, FileAccess.Read);

        var settings = new XmlReaderSettings();
        settings.IgnoreComments = true;
        settings.IgnoreWhitespace = true;
        // The reader is now disposed too; only the stream was before
        using var reader = XmlReader.Create(stream, settings);

        var docBin = new DocBin();
        docBin.ReadXml(reader);
        Copy(docBin);
    }
}
| using (Py.GIL()) 170 | { 171 | dynamic spacy = Py.Import("spacy"); 172 | 173 | dynamic pyVocab = spacy.vocab.Vocab.__call__(); 174 | dynamic pyDocs = _pyDocBin.get_docs(pyVocab); 175 | 176 | dynamic builtins = Py.Import("builtins"); 177 | dynamic listDocs = builtins.list(pyDocs); 178 | 179 | var pyCount = new PyInt(builtins.len(listDocs)); 180 | var count = pyCount.ToInt32(); 181 | 182 | for (var i = 0; i < count; i++) 183 | { 184 | dynamic pyDoc = listDocs[i]; 185 | _docs[i].PyDoc = pyDoc; 186 | _docs[i].Vocab.PyVocab = pyDoc.vocab; 187 | } 188 | } 189 | } 190 | } 191 | 192 | public XmlSchema GetSchema() 193 | { 194 | return null; 195 | } 196 | 197 | public void ReadXml(XmlReader reader) 198 | { 199 | var serializationMode = Serialization.Selected; 200 | reader.MoveToContent(); 201 | 202 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:DocBin"); 203 | reader.ReadStartElement(); 204 | 205 | if (serializationMode == Serialization.Mode.SpacyAndDotNet) 206 | { 207 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj"); 208 | var bytesB64 = reader.ReadElementContentAsString(); 209 | var bytes = Convert.FromBase64String(bytesB64); 210 | var pyBytes = ToPy.GetBytes(bytes); 211 | 212 | using (Py.GIL()) 213 | { 214 | dynamic spacy = Py.Import("spacy"); 215 | _pyDocBin = spacy.tokens.DocBin.__call__(); 216 | _pyDocBin.from_bytes(pyBytes); 217 | } 218 | } 219 | 220 | Debug.Assert(serializationMode != Serialization.Mode.Spacy); 221 | 222 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Docs"); 223 | reader.ReadStartElement(); 224 | _docs = new List(); 225 | 226 | while (reader.MoveToContent() != XmlNodeType.EndElement) 227 | { 228 | if (reader.NodeType != XmlNodeType.EndElement) 229 | { 230 | var doc = new Doc(); 231 | doc.ReadXml(reader); 232 | _docs.Add(doc); 233 | } 234 | } 235 | 236 | reader.ReadEndElement(); 237 | } 238 | 239 | public void WriteXml(XmlWriter writer) 240 | { 241 | writer.WriteStartElement(Serialization.Prefix, "DocBin", 
Serialization.Namespace); 242 | 243 | var serializationMode = Serialization.Selected; 244 | 245 | if (serializationMode == Serialization.Mode.SpacyAndDotNet) 246 | { 247 | using (Py.GIL()) 248 | { 249 | var pyObj = ToClr.GetBytes(_pyDocBin.to_bytes()); 250 | var pyObjB64 = Convert.ToBase64String(pyObj); 251 | writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64); 252 | } 253 | } 254 | 255 | Debug.Assert(serializationMode != Serialization.Mode.Spacy); 256 | 257 | writer.WriteStartElement("Docs", Serialization.Namespace); 258 | foreach (var doc in _docs) 259 | doc.WriteXml(writer); 260 | writer.WriteEndElement(); 261 | 262 | writer.WriteEndElement(); 263 | } 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /SpaCyDotNet/api/Token.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Diagnostics; 3 | using System.Xml; 4 | using System.Xml.Schema; 5 | using System.Xml.Serialization; 6 | using Python.Runtime; 7 | using PythonNetUtils; 8 | 9 | namespace SpacyDotNet 10 | { 11 | public class Token : IXmlSerializable 12 | { 13 | private dynamic _pyToken; 14 | 15 | private string _text; 16 | private string _lemma; 17 | 18 | private string _pos; 19 | private string _tag; 20 | private string _dep; 21 | private string _shape; 22 | 23 | private bool? _isAlpha; 24 | private bool? _isStop; 25 | private bool? _isPunct; 26 | private bool? _isDigit; 27 | private bool? _likeNum; 28 | private bool? _likeEMail; 29 | 30 | private bool? _hasVector; 31 | private double? _vectorNorm; 32 | private bool? _isOov; 33 | 34 | private int? 
_i; 35 | 36 | private Token _head; 37 | private int _headPos; 38 | 39 | private List _children; 40 | 41 | public Token() 42 | { 43 | // Needed to use generics 44 | } 45 | 46 | internal Token(dynamic token) 47 | { 48 | _isAlpha = null; 49 | _isStop = null; 50 | _isPunct = null; 51 | _isDigit = null; 52 | _likeNum = null; 53 | _likeEMail = null; 54 | 55 | _pyToken = token; 56 | } 57 | 58 | internal dynamic PyObj => _pyToken; 59 | 60 | public string Text => ToClr.GetMember(_pyToken?.text, ref _text); 61 | public string Lemma => ToClr.GetMember(_pyToken?.lemma_, ref _lemma); 62 | public string PoS => ToClr.GetMember(_pyToken?.pos_, ref _pos); 63 | public string Tag => ToClr.GetMember(_pyToken?.tag_, ref _tag); 64 | public string Dep => ToClr.GetMember(_pyToken?.dep_, ref _dep); 65 | public string Shape => ToClr.GetMember(_pyToken?.shape_, ref _shape); 66 | public bool IsAlpha => ToClr.GetMember(_pyToken?.is_alpha, ref _isAlpha); 67 | public bool IsStop => ToClr.GetMember(_pyToken?.is_stop, ref _isStop); 68 | public bool IsPunct => ToClr.GetMember(_pyToken?.is_punct, ref _isPunct); 69 | public bool IsDigit => ToClr.GetMember(_pyToken?.is_digit, ref _isDigit); 70 | public bool LikeNum => ToClr.GetMember(_pyToken?.like_num, ref _likeNum); 71 | public bool LikeEMail => ToClr.GetMember(_pyToken?.like_email, ref _likeEMail); 72 | public bool HasVector => ToClr.GetMember(_pyToken?.has_vector, ref _hasVector); 73 | public double VectorNorm => ToClr.GetMember(_pyToken?.vector_norm, ref _vectorNorm); 74 | public bool IsOov => ToClr.GetMember(_pyToken?.is_oov, ref _isOov); 75 | public int I => ToClr.GetMember(_pyToken?.i, ref _i); 76 | 77 | public Token Head 78 | { 79 | get 80 | { 81 | if (_head != null) 82 | return _head; 83 | 84 | using (Py.GIL()) 85 | { 86 | var pyHeadIsSelf = new PyInt(_pyToken.head.__eq__(_pyToken)); 87 | var headIsSelf = pyHeadIsSelf.ToInt32() != 0; 88 | if (headIsSelf) 89 | _head = this; 90 | else 91 | _head = new Token(_pyToken.head); 92 | 93 | return 
_head; 94 | } 95 | } 96 | 97 | set 98 | { 99 | _head = value; 100 | } 101 | } 102 | 103 | public List Children => ToClr.GetListFromGeneratorMember(_pyToken?.children, ref _children); 104 | 105 | public double Similarity(Token token) 106 | { 107 | using (Py.GIL()) 108 | { 109 | dynamic similarityPy = _pyToken.similarity(token.PyObj); 110 | var similarityPyFloat = PyFloat.AsFloat(similarityPy); 111 | return similarityPyFloat.As(); 112 | } 113 | } 114 | 115 | public override string ToString() 116 | { 117 | return Text; 118 | } 119 | 120 | public XmlSchema GetSchema() 121 | { 122 | return null; 123 | } 124 | 125 | public void ReadXml(XmlReader reader) 126 | { 127 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text"); 128 | _text = reader.ReadElementContentAsString(); 129 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Lemma"); 130 | _lemma = reader.ReadElementContentAsString(); 131 | 132 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Pos"); 133 | _pos = reader.ReadElementContentAsString(); 134 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Tag"); 135 | _tag = reader.ReadElementContentAsString(); 136 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Dep"); 137 | _dep = reader.ReadElementContentAsString(); 138 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Shape"); 139 | _shape = reader.ReadElementContentAsString(); 140 | 141 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsAlpha"); 142 | _isAlpha = reader.ReadElementContentAsBoolean(); 143 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsStop"); 144 | _isStop = reader.ReadElementContentAsBoolean(); 145 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsPunct"); 146 | _isPunct = reader.ReadElementContentAsBoolean(); 147 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsDigit"); 148 | _isDigit = reader.ReadElementContentAsBoolean(); 149 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:LikeNum"); 150 | _likeNum = 
reader.ReadElementContentAsBoolean(); 151 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:LikeEMail"); 152 | _likeEMail = reader.ReadElementContentAsBoolean(); 153 | 154 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:HasVector"); 155 | _hasVector = reader.ReadElementContentAsBoolean(); 156 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:VectorNorm"); 157 | _vectorNorm = reader.ReadElementContentAsDouble(); 158 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsOov"); 159 | _isOov = reader.ReadElementContentAsBoolean(); 160 | 161 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:I"); 162 | _i = reader.ReadElementContentAsInt(); 163 | 164 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Head"); 165 | var headPosStr = reader.GetAttribute("Pos"); 166 | if (string.IsNullOrEmpty(headPosStr)) 167 | _headPos = -1; 168 | else 169 | _headPos = int.Parse(headPosStr); 170 | 171 | reader.Skip(); 172 | } 173 | 174 | public void WriteXml(XmlWriter writer) 175 | { 176 | // Using the property is important form the members to be loaded 177 | writer.WriteElementString("Text", Serialization.Namespace, Text); 178 | writer.WriteElementString("Lemma", Serialization.Namespace, Lemma); 179 | 180 | writer.WriteElementString("Pos", Serialization.Namespace, PoS); 181 | writer.WriteElementString("Tag", Serialization.Namespace, Tag); 182 | writer.WriteElementString("Dep", Serialization.Namespace, Dep); 183 | writer.WriteElementString("Shape", Serialization.Namespace, Shape); 184 | 185 | writer.WriteStartElement("IsAlpha", Serialization.Namespace); 186 | writer.WriteValue(IsAlpha); 187 | writer.WriteEndElement(); 188 | writer.WriteStartElement("IsStop", Serialization.Namespace); 189 | writer.WriteValue(IsStop); 190 | writer.WriteEndElement(); 191 | writer.WriteStartElement("IsPunct", Serialization.Namespace); 192 | writer.WriteValue(IsPunct); 193 | writer.WriteEndElement(); 194 | writer.WriteStartElement("IsDigit", Serialization.Namespace); 195 | 
writer.WriteValue(IsDigit); 196 | writer.WriteEndElement(); 197 | writer.WriteStartElement("LikeNum", Serialization.Namespace); 198 | writer.WriteValue(LikeNum); 199 | writer.WriteEndElement(); 200 | writer.WriteStartElement("LikeEMail", Serialization.Namespace); 201 | writer.WriteValue(LikeEMail); 202 | writer.WriteEndElement(); 203 | 204 | writer.WriteStartElement("HasVector", Serialization.Namespace); 205 | writer.WriteValue(HasVector); 206 | writer.WriteEndElement(); 207 | writer.WriteStartElement("VectorNorm", Serialization.Namespace); 208 | writer.WriteValue(VectorNorm); 209 | writer.WriteEndElement(); 210 | writer.WriteStartElement("IsOov", Serialization.Namespace); 211 | writer.WriteValue(IsOov); 212 | writer.WriteEndElement(); 213 | 214 | writer.WriteStartElement("I", Serialization.Namespace); 215 | writer.WriteValue(I); 216 | writer.WriteEndElement(); 217 | 218 | writer.WriteStartElement("Head", Serialization.Namespace); 219 | var head = Head; 220 | if (head == this) 221 | writer.WriteAttributeString("Pos", string.Empty); 222 | else 223 | writer.WriteAttributeString("Pos", head.I.ToString()); 224 | writer.WriteEndElement(); 225 | 226 | // This one was already commented 227 | //info.AddValue("Children", Children); 228 | } 229 | 230 | internal void RestoreHead(List tokens) 231 | { 232 | if (_headPos == -1) 233 | _head = this; 234 | else 235 | _head = tokens[_headPos]; 236 | } 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /SpaCyDotNet/api/Doc.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.IO; 5 | using System.Xml; 6 | using System.Xml.Schema; 7 | using System.Xml.Serialization; 8 | using Python.Runtime; 9 | using PythonNetUtils; 10 | 11 | namespace SpacyDotNet 12 | { 13 | public class Doc : IXmlSerializable 14 | { 15 | private string _text; 16 | 17 | private Vocab 
_vocab; 18 | 19 | private List _tokens; 20 | 21 | private List _sentences; 22 | private List _nounChunks; 23 | private List _ents; 24 | 25 | public Doc() 26 | { 27 | } 28 | 29 | public Doc(Vocab vocab) 30 | { 31 | _vocab = vocab; 32 | 33 | using (Py.GIL()) 34 | { 35 | dynamic spacy = Py.Import("spacy"); 36 | dynamic pyVocab = vocab.PyVocab; 37 | PyDoc = spacy.tokens.doc.Doc.__call__(pyVocab); 38 | } 39 | } 40 | 41 | internal Doc(dynamic doc) 42 | { 43 | PyDoc = doc; 44 | _vocab = null; 45 | } 46 | 47 | internal Doc(dynamic doc, string text) 48 | { 49 | PyDoc = doc; 50 | _vocab = null; 51 | _text = text; 52 | } 53 | 54 | internal dynamic PyDoc { get; set; } 55 | 56 | public string Text => ToClr.GetMember(PyDoc?.text, ref _text); 57 | public List Tokens => ToClr.GetListFromCollectionMember(PyDoc, ref _tokens); 58 | public List Sents => ToClr.GetListFromGeneratorMember(PyDoc?.sents, ref _sentences); 59 | public List NounChunks => ToClr.GetListFromGeneratorMember(PyDoc?.noun_chunks, ref _nounChunks); 60 | public List Ents => ToClr.GetListFromGeneratorMember(PyDoc?.ents, ref _ents); 61 | 62 | public Vocab Vocab 63 | { 64 | get 65 | { 66 | if (_vocab != null) 67 | return _vocab; 68 | 69 | using (Py.GIL()) 70 | { 71 | var vocab = PyDoc.vocab; 72 | _vocab = new Vocab(vocab); 73 | return _vocab; 74 | } 75 | } 76 | } 77 | 78 | public void ToDisk(string path) 79 | { 80 | if (Serialization.Selected == Serialization.Mode.Spacy) 81 | { 82 | using (Py.GIL()) 83 | { 84 | var pyPath = new PyString(path); 85 | PyDoc.to_disk(pyPath); 86 | } 87 | } 88 | else 89 | { 90 | using var stream = new FileStream(path, FileMode.Create); 91 | 92 | var settings = new XmlWriterSettings(); 93 | settings.Indent = true; 94 | using var writer = XmlWriter.Create(stream, settings); 95 | 96 | WriteXml(writer); 97 | } 98 | } 99 | 100 | public void FromDisk(string path) 101 | { 102 | if (Serialization.Selected == Serialization.Mode.Spacy) 103 | { 104 | using (Py.GIL()) 105 | { 106 | var pyPath = new 
PyString(path); 107 | PyDoc.from_disk(pyPath); 108 | } 109 | } 110 | else 111 | { 112 | using var stream = new FileStream(path, FileMode.Open, FileAccess.Read); 113 | 114 | var settings = new XmlReaderSettings(); 115 | settings.IgnoreComments = true; 116 | settings.IgnoreWhitespace = true; 117 | var reader = XmlReader.Create(stream, settings); 118 | 119 | var doc = new Doc(); 120 | doc.ReadXml(reader); 121 | Copy(doc); 122 | } 123 | } 124 | 125 | public byte[] ToBytes() 126 | { 127 | if (Serialization.Selected == Serialization.Mode.Spacy) 128 | { 129 | using (Py.GIL()) 130 | { 131 | return ToClr.GetBytes(PyDoc.to_bytes()); 132 | } 133 | } 134 | else 135 | { 136 | using var stream = new MemoryStream(); 137 | 138 | var settings = new XmlWriterSettings(); 139 | settings.Indent = true; 140 | using var writer = XmlWriter.Create(stream, settings); 141 | 142 | WriteXml(writer); 143 | writer.Flush(); 144 | return stream.ToArray(); 145 | } 146 | } 147 | 148 | public void FromBytes(byte[] bytes) 149 | { 150 | if (Serialization.Selected == Serialization.Mode.Spacy) 151 | { 152 | var pyBytes = ToPy.GetBytes(bytes); 153 | using (Py.GIL()) 154 | { 155 | PyDoc.from_bytes(pyBytes); 156 | } 157 | } 158 | else 159 | { 160 | var stream = new MemoryStream(bytes); 161 | 162 | var settings = new XmlReaderSettings(); 163 | settings.IgnoreComments = true; 164 | settings.IgnoreWhitespace = true; 165 | var reader = XmlReader.Create(stream, settings); 166 | 167 | var doc = new Doc(); 168 | doc.ReadXml(reader); 169 | Copy(doc); 170 | } 171 | } 172 | 173 | private void Copy(Doc doc) 174 | { 175 | // I'd rather copy Python object no matter the serialization mode 176 | // If set to DotNet, the variable will be initialized to null 177 | // disregarding its current value which might be a default object 178 | PyDoc = doc.PyDoc; 179 | 180 | _text = doc._text; 181 | _vocab = doc._vocab; 182 | _tokens = doc._tokens; 183 | _sentences = doc._sentences; 184 | _nounChunks = doc._nounChunks; 185 | _ents = 
doc._ents; 186 | } 187 | 188 | public XmlSchema GetSchema() 189 | { 190 | return null; 191 | } 192 | 193 | public void ReadXml(XmlReader reader) 194 | { 195 | var serializationMode = Serialization.Selected; 196 | reader.MoveToContent(); 197 | 198 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Doc"); 199 | reader.ReadStartElement(); 200 | 201 | if (serializationMode == Serialization.Mode.SpacyAndDotNet) 202 | { 203 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj"); 204 | var bytesB64 = reader.ReadElementContentAsString(); 205 | var bytes = Convert.FromBase64String(bytesB64); 206 | var pyBytes = ToPy.GetBytes(bytes); 207 | 208 | using (Py.GIL()) 209 | { 210 | dynamic spacy = Py.Import("spacy"); 211 | dynamic pyVocab = spacy.vocab.Vocab.__call__(); 212 | PyDoc = spacy.tokens.doc.Doc.__call__(pyVocab); 213 | PyDoc.from_bytes(pyBytes); 214 | _vocab = new Vocab(PyDoc.vocab); 215 | } 216 | } 217 | 218 | Debug.Assert(Serialization.Selected != Serialization.Mode.Spacy); 219 | 220 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text"); 221 | _text = reader.ReadElementContentAsString(); 222 | 223 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Vocab"); 224 | _vocab = new Vocab(null); 225 | _vocab.ReadXml(reader); 226 | 227 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Tokens"); 228 | _tokens = new List(); 229 | var isEmpty = reader.IsEmptyElement; 230 | reader.ReadStartElement(); 231 | 232 | if (!isEmpty) 233 | { 234 | while (reader.MoveToContent() != XmlNodeType.EndElement) 235 | { 236 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Token"); 237 | reader.ReadStartElement(); 238 | if (reader.NodeType != XmlNodeType.EndElement) 239 | { 240 | var token = new Token(); 241 | token.ReadXml(reader); 242 | _tokens.Add(token); 243 | reader.ReadEndElement(); 244 | } 245 | } 246 | 247 | reader.ReadEndElement(); 248 | } 249 | 250 | foreach (var token in _tokens) 251 | token.RestoreHead(_tokens); 252 | 253 | Debug.Assert(reader.Name == 
$"{Serialization.Prefix}:Sentences"); 254 | _sentences = new List(); 255 | isEmpty = reader.IsEmptyElement; 256 | reader.ReadStartElement(); 257 | 258 | if (!isEmpty) 259 | { 260 | while (reader.MoveToContent() != XmlNodeType.EndElement) 261 | { 262 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Sent"); 263 | reader.ReadStartElement(); 264 | if (reader.NodeType != XmlNodeType.EndElement) 265 | { 266 | var sent = new Span(); 267 | sent.ReadXml(reader); 268 | _sentences.Add(sent); 269 | reader.ReadEndElement(); 270 | } 271 | } 272 | 273 | reader.ReadEndElement(); 274 | } 275 | 276 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:NounChunks"); 277 | _nounChunks = new List(); 278 | isEmpty = reader.IsEmptyElement; 279 | reader.ReadStartElement(); 280 | 281 | if (!isEmpty) 282 | { 283 | while (reader.MoveToContent() != XmlNodeType.EndElement) 284 | { 285 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:NounChunk"); 286 | reader.ReadStartElement(); 287 | if (reader.NodeType != XmlNodeType.EndElement) 288 | { 289 | var nChunk = new Span(); 290 | nChunk.ReadXml(reader); 291 | _nounChunks.Add(nChunk); 292 | reader.ReadEndElement(); 293 | } 294 | } 295 | 296 | reader.ReadEndElement(); 297 | } 298 | 299 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Ents"); 300 | _ents = new List(); 301 | reader.ReadStartElement(); 302 | 303 | while (reader.MoveToContent() != XmlNodeType.EndElement) 304 | { 305 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Ent"); 306 | reader.ReadStartElement(); 307 | if (reader.NodeType != XmlNodeType.EndElement) 308 | { 309 | var ent = new Span(); 310 | ent.ReadXml(reader); 311 | _ents.Add(ent); 312 | reader.ReadEndElement(); 313 | } 314 | } 315 | 316 | reader.ReadEndElement(); 317 | } 318 | 319 | public void WriteXml(XmlWriter writer) 320 | { 321 | writer.WriteStartElement(Serialization.Prefix, "Doc", Serialization.Namespace); 322 | 323 | var serializationMode = Serialization.Selected; 324 | 325 | if (serializationMode 
== Serialization.Mode.SpacyAndDotNet) 326 | { 327 | using (Py.GIL()) 328 | { 329 | var pyObj = ToClr.GetBytes(PyDoc.to_bytes()); 330 | var pyObjB64 = Convert.ToBase64String(pyObj); 331 | writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64); 332 | } 333 | } 334 | 335 | Debug.Assert(serializationMode != Serialization.Mode.Spacy); 336 | 337 | // Using the property is important form the members to be loaded 338 | writer.WriteElementString("Text", Serialization.Namespace, Text); 339 | writer.WriteStartElement("Vocab", Serialization.Namespace); 340 | Vocab.WriteXml(writer); 341 | writer.WriteEndElement(); 342 | 343 | writer.WriteStartElement("Tokens", Serialization.Namespace); 344 | foreach (var token in Tokens) 345 | { 346 | writer.WriteStartElement("Token", Serialization.Namespace); 347 | token.WriteXml(writer); 348 | writer.WriteEndElement(); 349 | } 350 | 351 | writer.WriteEndElement(); 352 | 353 | writer.WriteStartElement("Sentences", Serialization.Namespace); 354 | foreach (var sent in Sents) 355 | { 356 | writer.WriteStartElement("Sent", Serialization.Namespace); 357 | sent.WriteXml(writer); 358 | writer.WriteEndElement(); 359 | } 360 | 361 | writer.WriteEndElement(); 362 | 363 | writer.WriteStartElement("NounChunks", Serialization.Namespace); 364 | foreach (var nounChunk in NounChunks) 365 | { 366 | writer.WriteStartElement("NounChunk", Serialization.Namespace); 367 | nounChunk.WriteXml(writer); 368 | writer.WriteEndElement(); 369 | } 370 | 371 | writer.WriteEndElement(); 372 | 373 | writer.WriteStartElement("Ents", Serialization.Namespace); 374 | foreach (var ent in Ents) 375 | { 376 | writer.WriteStartElement("Ent", Serialization.Namespace); 377 | ent.WriteXml(writer); 378 | writer.WriteEndElement(); 379 | } 380 | 381 | writer.WriteEndElement(); 382 | 383 | writer.WriteEndElement(); 384 | } 385 | } 386 | } 387 | --------------------------------------------------------------------------------