├── Output.png
├── cslogo.png
├── Test
├── py
│ ├── doc.out
│ ├── displacy.py
│ ├── linguistic_features.py
│ ├── Test.py
│ ├── spaCy101.py
│ └── serialization.py
└── cs
│ ├── DisplaCy.cs
│ ├── Test.csproj
│ ├── LinguisticFeatures.cs
│ ├── Program.cs
│ ├── Serialization.cs
│ ├── SpaCy101.cs
│ ├── Test.sln
│ └── ExampleES.cs
├── PythonNetUtils
├── PythonNetUtils.csproj
├── Utils.cs
├── ToPy.cs
├── PythonRt.cs
└── ToClr.cs
├── SpaCyDotNet
├── SpaCyDotNet.csproj
├── Serialization.cs
└── api
│ ├── Displacy.cs
│ ├── Spacy.cs
│ ├── StringStore.cs
│ ├── Span.cs
│ ├── Lang.cs
│ ├── Lexeme.cs
│ ├── Vocab.cs
│ ├── DocBin.cs
│ ├── Token.cs
│ └── Doc.cs
├── LICENSE
├── .gitignore
└── README.md
/Output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/HEAD/Output.png
--------------------------------------------------------------------------------
/cslogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/HEAD/cslogo.png
--------------------------------------------------------------------------------
/Test/py/doc.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/HEAD/Test/py/doc.out
--------------------------------------------------------------------------------
/Test/py/displacy.py:
--------------------------------------------------------------------------------
"""Parse one example sentence and pass it to displaCy with the "dep" style."""
import spacy
from spacy import displacy

MODEL_NAME = "en_core_web_sm"
SENTENCE = "Apple is looking at buying U.K. startup for $1 billion"

# Load the pipeline, run it over the sentence, then hand the Doc to displacy.serve.
nlp = spacy.load(MODEL_NAME)
doc = nlp(SENTENCE)
displacy.serve(doc, style="dep")
--------------------------------------------------------------------------------
/Test/py/linguistic_features.py:
--------------------------------------------------------------------------------
"""Print dependency information for every token of one example sentence."""
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    # One line per token: text, dependency label, head text, head POS, children.
    # list(...) materialises the children iterable directly instead of copying
    # it element by element through an identity comprehension.
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          list(token.children))
--------------------------------------------------------------------------------
/PythonNetUtils/PythonNetUtils.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net9.0
5 | enable
6 | disable
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/SpaCyDotNet/SpaCyDotNet.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net9.0
5 | AnyCPU;x64
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/SpaCyDotNet/Serialization.cs:
--------------------------------------------------------------------------------
namespace SpacyDotNet
{
    /// <summary>
    /// Static serialization settings shared across the SpacyDotNet wrapper types.
    /// </summary>
    public static class Serialization
    {
        /// <summary>
        /// Selectable serialization modes.
        /// </summary>
        public enum Mode
        {
            Spacy = 0,
            SpacyAndDotNet = 1,
            DotNet = 2
        }

        /// <summary>
        /// XML namespace URI; <c>Span.WriteXml</c> passes it to every element it writes.
        /// </summary>
        public static string Namespace = "https://github.com/AMArostegui/SpacyDotNet";

        /// <summary>
        /// XML prefix; <c>Span.ReadXml</c> expects element names shaped like <c>{Prefix}:Name</c>.
        /// </summary>
        public static string Prefix = "sdn";

        /// <summary>
        /// Currently selected mode. Starts out as <see cref="Mode.Spacy"/>.
        /// </summary>
        public static Mode Selected { get; set; } = Mode.Spacy;
    }
}
19 |
--------------------------------------------------------------------------------
/Test/cs/DisplaCy.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using SpacyDotNet;
5 |
namespace Test
{
    /// <summary>
    /// C# counterpart of Test/py/displacy.py.
    /// </summary>
    static class DisplaCy
    {
        /// <summary>
        /// Parses one sentence with the "en_core_web_sm" model and passes the
        /// resulting document to <c>Displacy.Serve</c> with the "dep" style.
        /// </summary>
        public static void Run()
        {
            const string sentence = "Apple is looking at buying U.K. startup for $1 billion";

            var pipeline = new Spacy().Load("en_core_web_sm");
            var parsed = pipeline.GetDocument(sentence);

            new Displacy().Serve(parsed, "dep");
        }
    }
}
21 |
--------------------------------------------------------------------------------
/Test/cs/Test.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net9.0
6 | AnyCPU;x64
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Displacy.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using Python.Runtime;
5 |
namespace SpacyDotNet
{
    /// <summary>
    /// Wrapper that forwards to the Python <c>spacy.displacy</c> module.
    /// </summary>
    public class Displacy
    {
        public Displacy()
        {
        }

        /// <summary>
        /// Calls <c>spacy.displacy.serve</c> with the Python object behind
        /// <paramref name="doc"/> and <paramref name="style"/> as a Python string.
        /// </summary>
        /// <param name="doc">Document whose <c>PyDoc</c> is passed to Python.</param>
        /// <param name="style">Style name, e.g. "dep".</param>
        public void Serve(Doc doc, string style)
        {
            // Every interaction with the interpreter happens while holding the GIL.
            using (Py.GIL())
            {
                dynamic spacyModule = Py.Import("spacy");
                spacyModule.displacy.serve(doc.PyDoc, new PyString(style));
            }
        }
    }
}
27 |
--------------------------------------------------------------------------------
/Test/py/Test.py:
--------------------------------------------------------------------------------
"""Inflate a hard-coded zlib-compressed byte string."""
import zlib

# The zlib stream, split into 16-byte rows for readability; adjacent bytes
# literals are concatenated, so the value is the same single byte string.
data = (
    b'\x78\x9c\x6b\x5f\x9a\x58\x52\x52\x54\x3c\xcd\xd1\xcb\xc7\xd7\xcf'
    b'\x7f\x59\x49\x7e\x76\x6a\x5e\xf1\x91\x09\xfb\x5d\x5d\x0a\xff\xef'
    b'\x10\x77\x8c\x67\x80\x80\xb5\x8c\x10\x9a\x89\x01\x15\x40\x85\x19'
    b'\x8e\x6e\xed\x50\x3f\x66\x95\x60\x9c\x02\xe5\xfb\x5c\xa8\x7d\xbf'
    b'\xe4\xc8\x93\x42\x74\xf5\x30\x30\xf1\x9c\x93\xf0\x9f\xa0\x44\x9d'
    b'\x18\x28\x7f\x01\x0e\xf3\xff\x43\xc1\xb2\xe2\x82\xc4\xe4\xd4\xe2'
    b'\x23\xcc\x8c\x8c\x0c\xcb\x73\x52\xf3\xd2\x4b\x32\x8a\x8f\xb0\x30'
    b'\x03\x55\x2c\x2f\x2e\x29\xca\xcc\x4b\x2f\x9e\xbc\x24\x27\xbf\x2c'
    b'\x75\xa1\xe7\xb2\xe4\xfc\xb4\xb4\xd4\xd4\x25\xc9\x89\x25\xc5\x13'
    b'\x1b\x56\x96\x16\xa7\x16\xc5\xa7\x24\x96\x24\x4e\x3c\xc2\xd8\x00'
    b'\x00\xb5\x98\x41\x87'
)
zlib.decompress(data)
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Spacy.cs:
--------------------------------------------------------------------------------
1 | using Python.Runtime;
2 | using PythonNetUtils;
3 | using System;
4 |
namespace SpacyDotNet
{
    /// <summary>
    /// Wrapper around the top-level Python <c>spacy</c> module.
    /// </summary>
    public class Spacy
    {
        /// <summary>
        /// Creates the wrapper; requires <c>PythonRt.IsInitialized</c> to be true.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The Python runtime has not been initialized yet.
        /// </exception>
        public Spacy()
        {
            if (PythonRt.IsInitialized)
                return;

            throw new InvalidOperationException("Initialize runtime before usage");
        }

        /// <summary>
        /// Calls <c>spacy.load</c> with <paramref name="model"/> and wraps the
        /// returned Python object in a <see cref="Lang"/>.
        /// </summary>
        /// <param name="model">Model name, e.g. "en_core_web_sm".</param>
        public Lang Load(string model)
        {
            using (Py.GIL())
            {
                dynamic spacyModule = Py.Import("spacy");
                var pipeline = spacyModule.load(new PyString(model));
                return new Lang(pipeline);
            }
        }
    }
}
29 |
--------------------------------------------------------------------------------
/Test/cs/LinguisticFeatures.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using SpacyDotNet;
4 |
namespace Test
{
    /// <summary>
    /// C# counterpart of Test/py/linguistic_features.py.
    /// </summary>
    static class LinguisticFeatures
    {
        /// <summary>
        /// Parses one sentence and prints, for every token, its text, dependency
        /// label, head text, head part of speech and the texts of its children.
        /// </summary>
        public static void Run()
        {
            var spacy = new Spacy();
            var nlp = spacy.Load("en_core_web_sm");

            var text = "Autonomous cars shift insurance liability toward manufacturers";
            var doc = nlp.GetDocument(text);

            foreach (var token in doc.Tokens)
            {
                // The list needs an explicit element type; a bare List does not compile.
                var children = new List<string>();
                token.Children.ForEach(c => children.Add(c.Text));

                // The head POS is included so the columns match the Python script,
                // which prints token.head.pos_ between the head text and the children.
                Console.WriteLine($"{token.Text} {token.Dep} {token.Head.Text} {token.Head.PoS} [{string.Join(", ", children)}]");
            }
        }
    }
}
26 |
--------------------------------------------------------------------------------
/PythonNetUtils/Utils.cs:
--------------------------------------------------------------------------------
1 | using System.Numerics;
2 |
namespace PythonNetUtils
{
    /// <summary>
    /// Small conversion helpers.
    /// </summary>
    public static class Utils
    {
        /// <summary>
        /// Converts a boxed integral value to a <see cref="BigInteger"/>.
        /// </summary>
        /// <param name="obj">
        /// A boxed <c>int</c>, <c>uint</c>, <c>long</c>, <c>ulong</c>, <c>short</c>,
        /// <c>ushort</c>, <c>byte</c> or <c>sbyte</c>, or a value that already is a
        /// <see cref="BigInteger"/> (returned unchanged).
        /// </param>
        /// <returns>The value as a <see cref="BigInteger"/>.</returns>
        /// <exception cref="InvalidCastException">
        /// <paramref name="obj"/> is null or of any other type.
        /// </exception>
        public static BigInteger AsBigInteger(this object obj)
        {
            return obj switch
            {
                // Pass-through: lets a BigInteger produced elsewhere be used as input again.
                BigInteger big => big,
                int i => new BigInteger(i),
                uint ui => new BigInteger(ui),
                long l => new BigInteger(l),
                ulong ul => new BigInteger(ul),
                short s => new BigInteger(s),
                ushort us => new BigInteger(us),
                byte b => new BigInteger(b),
                sbyte sb => new BigInteger(sb),
                // null never matches a type pattern, so it ends up here as well.
                _ => throw new InvalidCastException("Wrong datatype to convert to BigInteger")
            };
        }
    }
}
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Antonio Miras
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Test/py/spaCy101.py:
--------------------------------------------------------------------------------
"""Print token annotations, entities, vector data, similarities, string hashes and lexemes."""
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Annotations of every token.
for token in doc:
    annotations = (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                   token.shape_, token.is_alpha, token.is_stop)
    print(*annotations)

# Entities of the document.
print()
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# From here on the "en_core_web_md" model is used.
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")

# Vector information per token.
print()
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

# Similarity of every ordered pair of tokens (a token is also paired with itself).
tokens = nlp("dog cat banana")
print()
for token1 in tokens:
    for token2 in tokens:
        score = token1.similarity(token2)
        print(token1.text, token2.text, score)

# String store lookups in both directions.
doc = nlp("I love coffee")
strings = doc.vocab.strings
print()
print(strings["coffee"])  # 3197928453018144401
print(strings[3197928453018144401])  # 'coffee'

# Lexeme attributes for every word of the document.
print()
for word in doc:
    lexeme = doc.vocab[word.text]
    attributes = (lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
                  lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
    print(*attributes)
--------------------------------------------------------------------------------
/Test/py/serialization.py:
--------------------------------------------------------------------------------
"""Serialize a Doc to disk, to bytes and through DocBin, printing it after each restore."""
import spacy

from spacy.tokens import DocBin
from spacy.tokens import Doc
from spacy.vocab import Vocab

DOC_PATH = "doc.spacy"
DOCBIN_ATTRS = ["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"]


def print_doc(adoc):
    """Print one line of lexeme attributes for each word in ``adoc``."""
    for word in adoc:
        lexeme = adoc.vocab[word.text]
        attributes = (lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
                      lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
        print(*attributes)


text = "I love coffee"

# Base document produced by the pipeline.
nlp = spacy.load("en_core_web_sm")
doc_base = nlp(text)
print()
print_doc(doc_base)

# Write the base document to disk and keep a bytes copy of it.
doc_base.to_disk(DOC_PATH)
doc_base_bytes = doc_base.to_bytes()

# Same document, this time packed through a DocBin.
docbin_base = DocBin(attrs=DOCBIN_ATTRS, store_user_data=True)
docbin_base.add(doc_base)
docbin_base_bytes = docbin_base.to_bytes()

# Restore from disk into a document created over a new Vocab.
doc = Doc(Vocab())
doc.from_disk(DOC_PATH)
print()
print_doc(doc)

# Restore from the bytes copy into another new document.
doc = Doc(Vocab())
doc.from_bytes(doc_base_bytes)
print()
print_doc(doc)

# Restore through DocBin, reading the docs against the pipeline's vocab.
docbin = DocBin().from_bytes(docbin_base_bytes)
docs = list(docbin.get_docs(nlp.vocab))
print()
print_doc(docs[0])
--------------------------------------------------------------------------------
/Test/cs/Program.cs:
--------------------------------------------------------------------------------
1 | using CommandLine;
2 | using PythonNetUtils;
3 | using System;
4 | using System.Collections.Generic;
5 |
namespace Test
{
    /// <summary>
    /// Console entry point that runs the example scenarios.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Parses the command line into <see cref="CliOptions"/> and dispatches
        /// to <see cref="RunOptions"/> on success or <see cref="HandleParseError"/> on failure.
        /// </summary>
        static void Main(string[] args)
        {
            // The options type must be supplied as the type argument of ParseArguments.
            Parser.Default.ParseArguments<CliOptions>(args)
                .WithParsed(RunOptions)
                .WithNotParsed(HandleParseError);
        }

        /// <summary>
        /// Creates a <c>PythonRt</c> from the parsed options and runs the examples
        /// inside its <c>using</c> scope, so it is disposed afterwards.
        /// </summary>
        static void RunOptions(CliOptions cliOps)
        {
            using (new PythonRt(cliOps.Interpreter, cliOps.PathVirtualEnv))
            {
                SpaCy101.Run();
                LinguisticFeatures.Run();
                ExampleES.Run();
                Serialization.Run();
                //DisplaCy.Run();
            }
        }

        /// <summary>
        /// Reports a command-line parsing failure on the console.
        /// </summary>
        static void HandleParseError(IEnumerable<Error> errs)
        {
            // Both --interpreter and --venv are declared Required, so name both.
            Console.WriteLine("You need to specify the interpreter and the virtual environment path");
        }

        /// <summary>
        /// Command-line options; both are required.
        /// </summary>
        public class CliOptions
        {
            [Option("interpreter", Required = true, HelpText = "Filename for the interpreter. Usually python38.dll on Windows, libpython3.8.so on Linux and libpython3.8.dylib on Mac.")]
            public string Interpreter { get; set; }

            [Option("venv", Required = true, HelpText = "Set virtual environment path")]
            public string PathVirtualEnv { get; set; }
        }
    }
}
44 |
--------------------------------------------------------------------------------
/PythonNetUtils/ToPy.cs:
--------------------------------------------------------------------------------
1 | using Python.Runtime;
2 |
namespace PythonNetUtils
{
    /// <summary>
    /// Helpers that build Python objects from CLR values.
    /// </summary>
    public class ToPy
    {
        /// <summary>
        /// Builds a Python <c>bytes</c> object from a CLR byte array.
        /// </summary>
        /// <param name="bytes">Source byte array.</param>
        /// <returns>The result of calling Python's builtin <c>bytes()</c> on the converted array.</returns>
        public static dynamic GetBytes(byte[] bytes)
        {
            using (Py.GIL())
            {
                // Passing bytes.ToPython() straight to a Python API does not work:
                // the call fails with
                //   Python.Runtime.PythonException: 'TypeError : a bytes-like object is required, not 'Byte[]''
                // so the value is routed through the builtin bytes() function instead.
                // Taken from:
                // https://github.com/pythonnet/pythonnet/issues/1150
                var builtins = Py.Import("builtins");
                var toBytesFunc = builtins.GetAttr("bytes");
                return toBytesFunc.Invoke(bytes.ToPython());
            }
        }

        /// <summary>
        /// Builds a Python list from a CLR array. Only <c>string</c> elements are supported.
        /// </summary>
        /// <typeparam name="T">Element type of the array.</typeparam>
        /// <param name="list">Source array; null yields an empty Python list.</param>
        /// <returns>A <c>PyList</c> with one <c>PyString</c> per element.</returns>
        /// <exception cref="NotImplementedException">
        /// The array has at least one element and <typeparamref name="T"/> is not <c>string</c>.
        /// </exception>
        public static dynamic GetList<T>(T[] list)
        {
            using (Py.GIL())
            {
                var pyLst = new PyList();
                if (list == null)
                    return pyLst;

                foreach (var element in list)
                {
                    // Checked per element so an empty array of an unsupported
                    // type still produces an empty list rather than throwing.
                    if (typeof(T) != typeof(string))
                        throw new NotImplementedException();

                    pyLst.Append(new PyString((string)(object)element));
                }

                return pyLst;
            }
        }
    }
}
54 |
--------------------------------------------------------------------------------
/Test/cs/Serialization.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using SpacyDotNet;
3 |
namespace Test
{
    /// <summary>
    /// C# counterpart of Test/py/serialization.py.
    /// </summary>
    static class Serialization
    {
        /// <summary>
        /// Writes one console line of lexeme attributes for each token of <paramref name="adoc"/>.
        /// </summary>
        public static void PrintDoc(Doc adoc)
        {
            foreach (var word in adoc.Tokens)
            {
                var lex = adoc.Vocab[word.Text];
                Console.WriteLine($"{lex.Text} {lex.Orth} {lex.Shape} {lex.Prefix} {lex.Suffix} {lex.IsAlpha} {lex.IsDigit} {lex.IsTitle} {lex.Lang}");
            }
        }

        /// <summary>
        /// Serializes a document to disk, to bytes and through a DocBin, then
        /// restores it from each of them and prints every restored document.
        /// </summary>
        public static void Run()
        {
            const string docPath = "doc.spacy";
            const string sentence = "I love coffee";

            var spacy = new Spacy();

            // Base document produced by the pipeline
            var pipeline = spacy.Load("en_core_web_sm");
            var original = pipeline.GetDocument(sentence);
            Console.WriteLine("");
            PrintDoc(original);

            // Disk copy and bytes copy of the base document
            original.ToDisk(docPath);
            var originalBytes = original.ToBytes();

            // DocBin copy of the base document
            var attrs = new string[] { "ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE" };
            var packed = new DocBin(attrs: attrs, storeUserData: true);
            packed.Add(original);
            var packedBytes = packed.ToBytes();

            // Restore from disk into a document created over a new Vocab
            var restored = new Doc(new Vocab());
            restored.FromDisk(docPath);
            Console.WriteLine("");
            PrintDoc(restored);

            // Restore from bytes into another new document
            restored = new Doc(new Vocab());
            restored.FromBytes(originalBytes);
            Console.WriteLine("");
            PrintDoc(restored);

            // Restore through DocBin, reading the docs against the pipeline's Vocab
            var unpacked = new DocBin();
            unpacked.FromBytes(packedBytes);
            var restoredDocs = unpacked.GetDocs(pipeline.Vocab);
            Console.WriteLine("");
            PrintDoc(restoredDocs[0]);
        }
    }
}
59 |
--------------------------------------------------------------------------------
/Test/cs/SpaCy101.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using SpacyDotNet;
3 |
namespace Test
{
    /// <summary>
    /// C# counterpart of Test/py/spaCy101.py.
    /// </summary>
    static class SpaCy101
    {
        /// <summary>
        /// Prints token annotations, entities, vector data, pairwise similarities,
        /// string-store lookups and lexeme attributes for a few sample sentences.
        /// </summary>
        public static void Run()
        {
            var spacy = new Spacy();

            var nlp = spacy.Load("en_core_web_sm");
            var doc = nlp.GetDocument("Apple is looking at buying U.K. startup for $1 billion");

            foreach (Token token in doc.Tokens)
                Console.WriteLine($"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}");

            Console.WriteLine("");
            foreach (Span ent in doc.Ents)
                Console.WriteLine($"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}");

            nlp = spacy.Load("en_core_web_md");
            var tokens = nlp.GetDocument("dog cat banana afskfsd");

            Console.WriteLine("");
            foreach (Token token in tokens.Tokens)
            {
                // Fields are separated by single spaces only, like the Python
                // script's print(); no comma after the vector norm.
                Console.WriteLine($"{token.Text} {token.HasVector} {token.VectorNorm} {token.IsOov}");
            }

            tokens = nlp.GetDocument("dog cat banana");
            Console.WriteLine("");
            foreach (Token token1 in tokens.Tokens)
            {
                foreach (Token token2 in tokens.Tokens)
                    Console.WriteLine($"{token1.Text} {token2.Text} {token1.Similarity(token2)}");
            }

            // String store lookups in both directions: string -> hash, hash -> string.
            doc = nlp.GetDocument("I love coffee");
            Console.WriteLine("");
            Console.WriteLine(doc.Vocab.Strings["coffee"]);
            Console.WriteLine(doc.Vocab.Strings[3197928453018144401]);

            Console.WriteLine("");
            foreach (Token word in doc.Tokens)
            {
                var lexeme = doc.Vocab[word.Text];
                Console.WriteLine($@"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}");
            }
        }
    }
}
51 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/StringStore.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Numerics;
4 | using Python.Runtime;
5 | using PythonNetUtils;
6 |
namespace SpacyDotNet
{
    /// <summary>
    /// Wrapper over a Python string store object that caches lookups in both
    /// directions (string to hash and hash to string).
    /// </summary>
    public class StringStore
    {
        private dynamic _pyStringStore;

        // Results of earlier lookups, so each key reaches Python at most once.
        private Dictionary<string, BigInteger> _dictStrToNumber;
        private Dictionary<BigInteger, string> _dictNumberToStr;

        internal StringStore(dynamic stringStore)
        {
            _pyStringStore = stringStore;
            _dictStrToNumber = new Dictionary<string, BigInteger>();
            _dictNumberToStr = new Dictionary<BigInteger, string>();
        }

        /// <summary>
        /// Looks up <paramref name="key"/> in the store.
        /// </summary>
        /// <param name="key">
        /// A <c>string</c>, or an integral value accepted by <c>Utils.AsBigInteger</c>.
        /// </param>
        /// <returns>
        /// A boxed <see cref="BigInteger"/> for a string key; a <c>string</c> for a numeric key.
        /// </returns>
        /// <exception cref="InvalidCastException">
        /// Thrown by <c>AsBigInteger</c> when the key is neither a string nor a supported integral type.
        /// </exception>
        public object this[object key]
        {
            get
            {
                if (key is string keyStr)
                {
                    if (_dictStrToNumber.TryGetValue(keyStr, out var cachedHash))
                        return cachedHash;

                    BigInteger valHash;
                    using (Py.GIL())
                    {
                        var dynPyNumber = _pyStringStore.__getitem__(key);
                        var pyNumber = new PyInt(dynPyNumber);
                        // Goes through the decimal text form of the Python integer.
                        valHash = BigInteger.Parse(pyNumber.ToString());
                        _dictStrToNumber.Add(keyStr, valHash);
                    }

                    return valHash;
                }

                var keyHash = key.AsBigInteger();
                if (_dictNumberToStr.TryGetValue(keyHash, out var cachedStr))
                    return cachedStr;

                string valStr;
                using (Py.GIL())
                {
                    var dynPyStr = _pyStringStore.__getitem__(key);
                    var pyString = new PyString(dynPyStr);
                    valStr = pyString.ToString();
                    _dictNumberToStr.Add(keyHash, valStr);
                }

                return valStr;
            }
        }
    }
}
63 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Span.cs:
--------------------------------------------------------------------------------
1 | using PythonNetUtils;
2 | using System.Diagnostics;
3 | using System.Xml;
4 | using System.Xml.Schema;
5 | using System.Xml.Serialization;
6 |
namespace SpacyDotNet
{
    /// <summary>
    /// Wrapper over a Python span object. Member values are held in backing
    /// fields and can be written to and read from XML.
    /// </summary>
    public class Span : IXmlSerializable
    {
        private dynamic _pySpan;

        // Backing fields: passed by ref to ToClr.GetMember, or assigned by ReadXml.
        private string _text;
        private string _label;
        private int? _startChar;
        private int? _endChar;

        public Span()
        {
            // A parameterless constructor is needed to use the type with generics
        }

        internal Span(dynamic sentence)
        {
            // The nullable offsets are left at their default value (null).
            _pySpan = sentence;
        }

        public string Text
        {
            get { return ToClr.GetMember(_pySpan?.text, ref _text); }
        }

        public string Label
        {
            get { return ToClr.GetMember(_pySpan?.label_, ref _label); }
        }

        public int StartChar
        {
            get { return ToClr.GetMember(_pySpan?.start_char, ref _startChar); }
        }

        public int EndChar
        {
            get { return ToClr.GetMember(_pySpan?.end_char, ref _endChar); }
        }

        /// <summary>
        /// Always returns null.
        /// </summary>
        public XmlSchema GetSchema() => null;

        /// <summary>
        /// Reads Text, Label, StartChar and EndChar, in that order, from consecutive elements.
        /// </summary>
        public void ReadXml(XmlReader reader)
        {
            var prefix = Serialization.Prefix;

            Debug.Assert(reader.Name == $"{prefix}:Text");
            _text = reader.ReadElementContentAsString();

            Debug.Assert(reader.Name == $"{prefix}:Label");
            _label = reader.ReadElementContentAsString();

            Debug.Assert(reader.Name == $"{prefix}:StartChar");
            _startChar = reader.ReadElementContentAsInt();

            Debug.Assert(reader.Name == $"{prefix}:EndChar");
            _endChar = reader.ReadElementContentAsInt();
        }

        /// <summary>
        /// Writes Text, Label, StartChar and EndChar as elements in the serialization namespace.
        /// </summary>
        public void WriteXml(XmlWriter writer)
        {
            var ns = Serialization.Namespace;

            // Going through the properties (not the fields) is important for the members to be loaded
            writer.WriteElementString("Text", ns, Text);
            writer.WriteElementString("Label", ns, Label);

            writer.WriteStartElement("StartChar", ns);
            writer.WriteValue(StartChar);
            writer.WriteEndElement();

            writer.WriteStartElement("EndChar", ns);
            writer.WriteValue(EndChar);
            writer.WriteEndElement();
        }
    }
}
67 |
--------------------------------------------------------------------------------
/Test/cs/Test.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.13.35806.99 d17.13
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Test", "Test.csproj", "{FF70F2F9-2826-49EF-851C-5FF1697FCE6A}"
7 | ProjectSection(ProjectDependencies) = postProject
8 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1} = {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}
9 | EndProjectSection
10 | EndProject
11 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SpaCyDotNet", "..\..\SpaCyDotNet\SpaCyDotNet.csproj", "{CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}"
12 | EndProject
13 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PythonNetUtils", "..\..\PythonNetUtils\PythonNetUtils.csproj", "{50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}"
14 | EndProject
15 | Global
16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
17 | Debug|Any CPU = Debug|Any CPU
18 | Debug|x64 = Debug|x64
19 | Release|Any CPU = Release|Any CPU
20 | Release|x64 = Release|x64
21 | EndGlobalSection
22 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
23 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
24 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|Any CPU.Build.0 = Debug|Any CPU
25 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|x64.ActiveCfg = Debug|x64
26 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Debug|x64.Build.0 = Debug|x64
27 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|Any CPU.ActiveCfg = Release|Any CPU
28 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|Any CPU.Build.0 = Release|Any CPU
29 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|x64.ActiveCfg = Release|x64
30 | {FF70F2F9-2826-49EF-851C-5FF1697FCE6A}.Release|x64.Build.0 = Release|x64
31 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
32 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|Any CPU.Build.0 = Debug|Any CPU
33 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|x64.ActiveCfg = Debug|x64
34 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Debug|x64.Build.0 = Debug|x64
35 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|Any CPU.ActiveCfg = Release|Any CPU
36 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|Any CPU.Build.0 = Release|Any CPU
37 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|x64.ActiveCfg = Release|x64
38 | {CC448F6D-C356-45C7-A50E-20DBD7C8ECE1}.Release|x64.Build.0 = Release|x64
39 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
40 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|Any CPU.Build.0 = Debug|Any CPU
41 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|x64.ActiveCfg = Debug|Any CPU
42 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Debug|x64.Build.0 = Debug|Any CPU
43 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|Any CPU.ActiveCfg = Release|Any CPU
44 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|Any CPU.Build.0 = Release|Any CPU
45 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|x64.ActiveCfg = Release|Any CPU
46 | {50893F3B-07F7-46CD-AE2E-CF1D1BA2D7C8}.Release|x64.Build.0 = Release|Any CPU
47 | EndGlobalSection
48 | GlobalSection(SolutionProperties) = preSolution
49 | HideSolutionNode = FALSE
50 | EndGlobalSection
51 | GlobalSection(ExtensibilityGlobals) = postSolution
52 | SolutionGuid = {A512806F-79F9-44AB-9626-EBAFBD21234D}
53 | EndGlobalSection
54 | EndGlobal
55 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # SpacyDotNet
5 | # Direct reference to Python Runtime, rather than using a NuGet package
6 | binPyRt/
7 | SpaCyDotNet.PyRt.csproj
8 | Test.PyRt.csproj
9 | Test.PyRt.sln
10 | veSpacyDotNet*/
11 | venv/
12 |
13 | # User-specific files
14 | *.suo
15 | *.v12.suo
16 | *.user
17 | *.sln.docstates
18 | launchSettings.json
19 | *.out
20 |
21 | # Build results
22 |
23 | [Dd]ebug/
24 | [Rr]elease/
25 | x64/
26 | build/
27 | [Bb]in/
28 | [Oo]bj/
29 |
30 | # IDEA
31 | .idea/
32 |
33 | # Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
34 | !packages/*/build/
35 |
36 | # MSTest test Results
37 | [Tt]est[Rr]esult*/
38 | [Bb]uild[Ll]og.*
39 |
40 | *_i.c
41 | *_p.c
42 | *.ilk
43 | *.meta
44 | *.obj
45 | *.pch
46 | *.pdb
47 | *.pgc
48 | *.pgd
49 | *.rsp
50 | *.sbr
51 | *.tlb
52 | *.tli
53 | *.tlh
54 | *.tmp
55 | *.tmp_proj
56 | *.log
57 | *.vspscc
58 | *.vssscc
59 | .builds
60 | *.pidb
61 | *.log
62 | *.scc
63 | *.odt#
64 |
65 | # Visual C++ cache files
66 | ipch/
67 | *.aps
68 | *.ncb
69 | *.opensdf
70 | *.sdf
71 | *.cachefile
72 |
73 | # Visual Studio profiler
74 | *.psess
75 | *.vsp
76 | *.vspx
77 |
78 | # Guidance Automation Toolkit
79 | *.gpState
80 |
81 | # ReSharper is a .NET coding add-in
82 | _ReSharper*/
83 | *.[Rr]e[Ss]harper
84 |
85 | # TeamCity is a build add-in
86 | _TeamCity*
87 |
88 | # DotCover is a Code Coverage Tool
89 | *.dotCover
90 |
91 | # NCrunch
92 | *.ncrunch*
93 | .*crunch*.local.xml
94 |
95 | # Installshield output folder
96 | [Ee]xpress/
97 |
98 | # DocProject is a documentation generator add-in
99 | DocProject/buildhelp/
100 | DocProject/Help/*.HxT
101 | DocProject/Help/*.HxC
102 | DocProject/Help/*.hhc
103 | DocProject/Help/*.hhk
104 | DocProject/Help/*.hhp
105 | DocProject/Help/Html2
106 | DocProject/Help/html
107 |
108 | # Click-Once directory
109 | publish/
110 |
111 | # Publish Web Output
112 | *.Publish.xml
113 |
114 | # NuGet Packages Directory
115 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
116 | packages/
117 |
118 | # Windows Azure Build Output
119 | csx
120 | *.build.csdef
121 |
122 | # Windows Store app package directory
123 | AppPackages/
124 |
125 | # Others
126 | *.Cache
127 | ClientBin/
128 | [Ss]tyle[Cc]op.*
129 | ~$*
130 | *~
131 | *.dbmdl
132 | *.[Pp]ublish.xml
133 | *.pfx
134 | *.publishsettings
135 | .vs/
136 |
137 | # RIA/Silverlight projects
138 | Generated_Code/
139 |
140 | # Backup & report files from converting an old project file to a newer
141 | # Visual Studio version. Backup files are not needed, because we have git ;-)
142 | _UpgradeReport_Files/
143 | Backup*/
144 | UpgradeLog*.XML
145 | UpgradeLog*.htm
146 |
147 | # SQL Server files
148 | App_Data/*.mdf
149 | App_Data/*.ldf
150 |
151 |
152 | #LightSwitch generated files
153 | GeneratedArtifacts/
154 | _Pvt_Extensions/
155 | ModelManifest.xml
156 |
157 | # =========================
158 | # Windows detritus
159 | # =========================
160 |
161 | # Windows image file caches
162 | Thumbs.db
163 | ehthumbs.db
164 |
165 | # Folder config file
166 | Desktop.ini
167 |
168 | # Recycle Bin used on file shares
169 | $RECYCLE.BIN/
170 |
171 | # Mac desktop service store files
172 | .DS_Store
173 |
--------------------------------------------------------------------------------
/Test/cs/ExampleES.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using SpacyDotNet;
3 |
namespace Test
{
    /// <summary>
    /// Runs the "es_core_news_sm" pipeline over a Spanish paragraph and prints
    /// the pipeline names, tokens, tags, lemmas, sentences, noun chunks and entities.
    /// </summary>
    static class ExampleES
    {
        public static void Run()
        {
            var spacy = new Spacy();
            var nlp = spacy.Load("es_core_news_sm");

            var separator = "____________________________________________________________________________";
            var text = @"Cuando Sebastian Thrun empezó a trabajar en coches de conducción autónoma, en 2007, para ";
            text += "Google, muy poca gente fuera de la empresa le tomó en serio. “Podría contaros como CEOs muy ";
            text += "veteranos de las empresas automotrices más grandes de América me daban la mano para después ";
            text += "ignorarme porque no merecía la pena hablar conmigo”, comentaba Thrun, en una entrevista a Recode ";
            text += "a principios de semana";

            var doc = nlp.GetDocument(text);

            Console.WriteLine("Pipeline:");
            Console.WriteLine(string.Join(",", nlp.PipeNames));
            Console.WriteLine(separator);

            Console.WriteLine("Tokenization");
            var items = new System.Collections.Generic.List<string>();
            foreach (var token in doc.Tokens)
                items.Add(token.Text);
            WriteQuotedList(items);
            Console.WriteLine(separator);

            Console.WriteLine("Pos");
            items.Clear();
            foreach (var token in doc.Tokens)
                items.Add(token.PoS);
            WriteQuotedList(items);
            Console.WriteLine(separator);

            Console.WriteLine("PoS[0]:");
            var token0 = doc.Tokens[0];
            // In spaCy, pos_ is the coarse-grained tag and tag_ the fine-grained one.
            Console.WriteLine("Coarse-grained POS tag " + token0.PoS);
            Console.WriteLine("Fine-grained POS tag " + token0.Tag);
            Console.WriteLine("Word shape " + token0.Shape);
            Console.WriteLine("Alphabetic characters? " + token0.IsAlpha);
            Console.WriteLine("Punctuation mark? " + token0.IsPunct);
            Console.WriteLine("Digit? " + token0.IsDigit);
            Console.WriteLine("Like a number? " + token0.LikeNum);
            Console.WriteLine("Like an email address? " + token0.LikeEMail);
            Console.WriteLine(separator);

            Console.WriteLine("Lemmatization:");
            items.Clear();
            foreach (var token in doc.Tokens)
                items.Add(token.Lemma);
            WriteQuotedList(items);
            Console.WriteLine(separator);

            Console.WriteLine("Sentences:");
            items.Clear();
            foreach (var sentence in doc.Sents)
                items.Add(sentence.Text);
            WriteQuotedList(items);
            Console.WriteLine(separator);

            Console.WriteLine("Noun Phrases:");
            items.Clear();
            foreach (var nounChunk in doc.NounChunks)
                items.Add(nounChunk.Text);
            WriteQuotedList(items);
            Console.WriteLine(separator);

            Console.WriteLine("Entities (Named entities, phrases and concepts):");
            foreach (var entity in doc.Ents)
                Console.WriteLine("Entity: " + entity.Text + "\tLabel: " + entity.Label);
            Console.WriteLine(separator);
        }

        /// <summary>
        /// Writes the items as <c>['a', 'b', ...]</c> on one console line.
        /// Joining with a separator (instead of appending ", " after every item and
        /// erasing the tail with backspaces) prints <c>[]</c> for an empty list and
        /// keeps control characters out of redirected output.
        /// </summary>
        private static void WriteQuotedList(System.Collections.Generic.List<string> items)
        {
            var quoted = items.ConvertAll(item => "'" + item + "'");
            Console.WriteLine("[" + string.Join(", ", quoted) + "]");
        }
    }
}
80 |
--------------------------------------------------------------------------------
/PythonNetUtils/PythonRt.cs:
--------------------------------------------------------------------------------
1 | using Python.Runtime;
2 | using System.Diagnostics;
3 |
4 | namespace PythonNetUtils
5 | {
6 | public class PythonRt : IDisposable // Initializes the embedded Python.NET runtime for a virtual environment and shuts it down on Dispose
7 | {
8 | private static bool _initialized = false; // process-wide guard: set by the constructor, cleared by Dispose
9 |
10 | public PythonRt(string interpreter, string pathVirtualEnv) // Throws InvalidOperationException when a runtime is already initialized
11 | {
12 | if (_initialized)
13 | {
14 | throw new InvalidOperationException("Python runtime already initialized");
15 | }
16 |
17 | Init(interpreter, pathVirtualEnv);
18 | _initialized = true;
19 | }
20 |
21 | public static bool IsInitialized => _initialized;
22 | /// <summary>
23 | /// Python.NET project provides a WIKI to initialize the library using virtual environments. See:
24 | /// https://github.com/pythonnet/pythonnet/wiki/Using-Python.NET-with-Virtual-Environments
25 | /// Sadly, I couldn't make the code provided in the official wiki to properly work, so I created my own initialization
26 | /// I've experienced all problems below
27 | /// 1) Inability to locate python interpreter
28 | /// 2) Inability to load Python system libraries
29 | /// 3) Inability to load Python virtual env libraries (site-packages)
30 | /// This method aims to solve both 2) and 3) and is an ugly HACK
31 | /// Using the regular workflow everything is fine; activate virtual environment and run the CPython interpreter. Only Python.NET fails.
32 | /// Fixing Python.NET itself would be better but for now, I'm just going to copy sys.path
33 | /// </summary>
34 | /// <param name="interpreter">Value assigned to Runtime.PythonDLL (the Python shared library to load)</param>
35 | /// <param name="pathVirtualEnv">Path to virtual environment</param>
36 | public void Init(string interpreter, string pathVirtualEnv)
37 | {
38 | // See CliOptions.Interpreter
39 | Runtime.PythonDLL = interpreter;
40 |
41 | if (string.IsNullOrEmpty(pathVirtualEnv))
42 | throw new Exception("You need to define PathVirtualEnv before using the wrapper");
43 | if (!Directory.Exists(pathVirtualEnv))
44 | throw new Exception("The directory specified for PathVirtualEnv is invalid");
45 |
46 | string pathVeScripts; // folder holding the environment's python executable
47 | if (Environment.OSVersion.Platform == PlatformID.Win32NT)
48 | pathVeScripts = pathVirtualEnv + @"\Scripts";
49 | else
50 | pathVeScripts = pathVirtualEnv + @"/bin";
51 | Environment.SetEnvironmentVariable("PATH", pathVeScripts, EnvironmentVariableTarget.Process); // NOTE(review): replaces the whole process PATH instead of prepending; confirm this is intended
52 |
53 | var pythonPath = string.Empty;
54 | // Ask the environment's own interpreter for its sys.path, joined with the platform path separator
55 | using var proc = new Process();
56 | proc.StartInfo.FileName = pathVeScripts + Path.DirectorySeparatorChar + "python";
57 | proc.StartInfo.Arguments = $"-c \"import sys; print('{Path.PathSeparator}'.join(sys.path))\"";
58 | proc.StartInfo.RedirectStandardOutput = true;
59 | if (!proc.Start())
60 | throw new Exception("Couldn't initialize Python in virtual environment");
61 | // Drain the redirected stdout BEFORE waiting: waiting first can deadlock once the child fills the pipe buffer
62 | pythonPath = proc.StandardOutput.ReadToEnd();
63 | proc.WaitForExit();
64 | pythonPath = pythonPath.Replace(Environment.NewLine, "");
65 | if (string.IsNullOrEmpty(pythonPath))
66 | throw new Exception("Couldn't initialize Python.NET");
67 |
68 | Environment.SetEnvironmentVariable("PYTHONPATH", pythonPath, EnvironmentVariableTarget.Process);
69 | PythonEngine.PythonPath = pythonPath;
70 |
71 | PythonEngine.Initialize();
72 | }
73 |
74 | public void Dispose() // Shuts the Python engine down and clears the initialized flag
75 | {
76 | try
77 | {
78 | // Python.NET is still using a BinaryFormatter. See:
79 | // https://github.com/pythonnet/pythonnet/issues/2469
80 | PythonEngine.Shutdown();
81 | }
82 | catch (PlatformNotSupportedException) // deliberately ignored; see the BinaryFormatter note above
83 | {
84 | }
85 |
86 | _initialized = false;
87 | }
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Lang.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using System.Xml;
6 | using System.Xml.Schema;
7 | using System.Xml.Serialization;
8 | using Python.Runtime;
9 | using PythonNetUtils;
10 |
11 | namespace SpacyDotNet
12 | {
13 | public class Lang : IXmlSerializable // Wraps a Python-side spaCy pipeline object and exposes it to .NET callers
14 | {
15 | private dynamic _pyLang; // the wrapped Python object; every Python call below goes through it
16 |
17 | private List _pipeNames; // cache behind PipeNames; null until first read or ReadXml
18 | private PipelineMeta _meta;
19 |
20 | internal Lang(dynamic lang)
21 | {
22 | _pyLang = lang;
23 | _pipeNames = null;
24 | _meta = new PipelineMeta(this);
25 | }
26 |
27 | public Doc GetDocument(string text) // Calls the wrapped Python object on text (under the GIL) and wraps the result in a Doc
28 | {
29 | using (Py.GIL())
30 | {
31 | var pyString = new PyString(text);
32 | dynamic doc = _pyLang.__call__(pyString);
33 | return new Doc(doc, text);
34 | }
35 | }
36 |
37 | internal dynamic PyLang => _pyLang;
38 | public PipelineMeta Meta => _meta;
39 | public List PipeNames => ToClr.GetListFromListMember(_pyLang?.pipe_names, ref _pipeNames); // read from Python's pipe_names and cached in _pipeNames
40 | public Vocab Vocab => new Vocab(_pyLang.vocab); // builds a new Vocab wrapper on every access
41 |
42 | public XmlSchema GetSchema()
43 | {
44 | return null;
45 | }
46 |
47 | public void ReadXml(XmlReader reader) // Restores state from the PyObj (Base64) and PipeNames (comma-separated) elements, in that order
48 | {
49 | // PyObj holds the Base64 text produced by WriteXml
50 |
51 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
52 | var bytesB64 = reader.ReadElementContentAsString();
53 | var bytes = Convert.FromBase64String(bytesB64);
54 | var pyBytes = ToPy.GetBytes(bytes);
55 | using (Py.GIL())
56 | {
57 | _pyLang.from_bytes(pyBytes);
58 | }
59 |
60 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:PipeNames");
61 | var pipeNames = reader.ReadElementContentAsString();
62 | _pipeNames = pipeNames.Split(',').ToList();
63 |
64 | // TODO: Yet to debug. It's not being used so far
65 | _meta = new PipelineMeta(this);
66 | }
67 |
68 | public void WriteXml(XmlWriter writer) // Writes the PyObj and PipeNames elements that ReadXml consumes
69 | {
70 | using (Py.GIL())
71 | {
72 | var pyObj = ToClr.GetBytes(_pyLang.to_bytes());
73 | var pyObjB64 = Convert.ToBase64String(pyObj);
74 | writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64); // argument order is (localName, ns, value)
75 | }
76 |
77 | // Using the property is important for the members to be loaded
78 | writer.WriteElementString("PipeNames", Serialization.Namespace, string.Join(',', PipeNames));
79 | }
80 |
81 | public class PipelineMeta : Dictionary // Lazily filled cache over the Python object's meta mapping
82 | {
83 | private Lang _lang;
84 |
85 | public PipelineMeta(Lang lang)
86 | {
87 | _lang = lang;
88 | }
89 |
90 | public new object this[string key] // hides the base indexer so that misses can be fetched from Python
91 | {
92 | get
93 | {
94 | if (ContainsKey(key))
95 | return base[key];
96 |
97 | if (_lang.PyLang == null)
98 | return null;
99 |
100 | object ret = null;
101 | using (Py.GIL())
102 | {
103 | var pyKeyStr = new PyString(key);
104 | var pyObj = (PyObject)_lang.PyLang.meta.__getitem__(pyKeyStr);
105 |
106 | if (!PyString.IsStringType(pyObj)) // only string-valued entries are supported so far
107 | throw new NotImplementedException();
108 |
109 | var pyValStr = new PyString(pyObj);
110 | ret = pyValStr.ToString();
111 | Add(key, ret);
112 | }
113 |
114 | return ret;
115 | }
116 | }
117 | }
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Lexeme.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Diagnostics;
3 | using System.Numerics;
4 | using System.Xml;
5 | using System.Xml.Schema;
6 | using System.Xml.Serialization;
7 | using Python.Runtime;
8 | using PythonNetUtils;
9 |
10 | namespace SpacyDotNet
11 | {
12 | public class Lexeme : IXmlSerializable // Wraps a Python-side lexeme object and caches the attributes read from it
13 | {
14 | private dynamic _pyLexeme; // assigned only by the internal constructor
15 |
16 | private string _text;
17 | private string _shape;
18 | private string _prefix;
19 | private string _suffix;
20 | private string _lang;
21 |
22 | private BigInteger? _orth;
23 |
24 | private bool? _isAlpha;
25 | private bool? _isDigit;
26 | private bool? _isTitle;
27 |
28 | public Lexeme()
29 | {
30 | }
31 |
32 | internal Lexeme(dynamic lexeme)
33 | {
34 | _pyLexeme = lexeme;
35 | _text = null;
36 | _shape = null;
37 | _prefix = null;
38 | _lang = null;
39 |
40 | _orth = null;
41 |
42 | _isAlpha = null;
43 | _isDigit = null;
44 | _isTitle = null;
45 | }
46 | // Each property returns its cached field when set; otherwise it reads the Python attribute and caches it
47 | public string Text => ToClr.GetMember(_pyLexeme?.text, ref _text);
48 | public string Shape => ToClr.GetMember(_pyLexeme?.shape_, ref _shape);
49 | public string Prefix => ToClr.GetMember(_pyLexeme?.prefix_, ref _prefix);
50 | public string Suffix => ToClr.GetMember(_pyLexeme?.suffix_, ref _suffix);
51 | public string Lang => ToClr.GetMember(_pyLexeme?.lang_, ref _lang);
52 | public BigInteger Orth => ToClr.GetMember(_pyLexeme?.orth, ref _orth);
53 | public bool IsAlpha => ToClr.GetMember(_pyLexeme?.is_alpha, ref _isAlpha);
54 | public bool IsDigit => ToClr.GetMember(_pyLexeme?.is_digit, ref _isDigit);
55 | public bool IsTitle => ToClr.GetMember(_pyLexeme?.is_title, ref _isTitle);
56 |
57 | public XmlSchema GetSchema()
58 | {
59 | return null;
60 | }
61 |
62 | public void ReadXml(XmlReader reader) // Reads PyObj (Base64) and then every cached attribute, in the order WriteXml emits them
63 | {
64 | // TODO: Yet to debug. It's not being used so far
65 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
66 | var bytesB64 = reader.ReadElementContentAsString();
67 | var bytes = Convert.FromBase64String(bytesB64);
68 | var pyBytes = ToPy.GetBytes(bytes);
69 | // NOTE(review): _pyLexeme is still null after the parameterless constructor, so this call needs it set first
70 | using (Py.GIL())
71 | {
72 | _pyLexeme.from_bytes(pyBytes);
73 | }
74 |
75 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text");
76 | _text = reader.ReadElementContentAsString();
77 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Shape");
78 | _shape = reader.ReadElementContentAsString();
79 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Prefix");
80 | _prefix = reader.ReadElementContentAsString();
81 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Suffix");
82 | _suffix = reader.ReadElementContentAsString();
83 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Lang");
84 | _lang = reader.ReadElementContentAsString();
85 |
86 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:Orth");
87 | var orth = reader.ReadElementContentAsString();
88 | _orth = BigInteger.Parse(orth);
89 |
90 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsAlpha");
91 | _isAlpha = reader.ReadElementContentAsBoolean();
92 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsDigit");
93 | _isDigit = reader.ReadElementContentAsBoolean();
94 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsTitle");
95 | _isTitle = reader.ReadElementContentAsBoolean();
96 | }
97 |
98 | public void WriteXml(XmlWriter writer) // Writes PyObj as Base64 followed by every attribute, all in Serialization.Namespace
99 | {
100 | using (Py.GIL())
101 | {
102 | var pyObjB64 = Convert.ToBase64String(ToClr.GetBytes(_pyLexeme.to_bytes())); // ReadXml decodes this element from Base64
103 | writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64); // argument order is (localName, ns, value)
104 | }
105 |
106 | // Using the property is important for the members to be loaded
107 | writer.WriteElementString("Text", Serialization.Namespace, Text);
108 | writer.WriteElementString("Shape", Serialization.Namespace, Shape);
109 | writer.WriteElementString("Prefix", Serialization.Namespace, Prefix);
110 | writer.WriteElementString("Suffix", Serialization.Namespace, Suffix);
111 | writer.WriteElementString("Lang", Serialization.Namespace, Lang);
112 |
113 | writer.WriteElementString("Orth", Serialization.Namespace, Orth.ToString());
114 |
115 | writer.WriteStartElement("IsAlpha", Serialization.Namespace);
116 | writer.WriteValue(IsAlpha);
117 | writer.WriteEndElement();
118 | writer.WriteStartElement("IsDigit", Serialization.Namespace);
119 | writer.WriteValue(IsDigit);
120 | writer.WriteEndElement();
121 | writer.WriteStartElement("IsTitle", Serialization.Namespace);
122 | writer.WriteValue(IsTitle);
123 | writer.WriteEndElement();
124 | }
125 | }
126 | }
127 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Vocab.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Numerics;
5 | using System.Runtime.Serialization;
6 | using System.Xml;
7 | using System.Xml.Schema;
8 | using System.Xml.Serialization;
9 | using Python.Runtime;
10 | using PythonNetUtils;
11 |
12 | namespace SpacyDotNet
13 | {
14 | public class Vocab : IXmlSerializable // Wraps a Python-side spaCy Vocab object and caches the Lexeme wrappers it hands out
15 | {
16 | private Dictionary _dictStr2Lex = new Dictionary(); // Lexeme cache for string keys
17 | private Dictionary _dictLong2Lex = new Dictionary(); // Lexeme cache for BigInteger keys
18 | private StringStore _stringStore = null; // created on first access to Strings
19 |
20 | public Vocab() // Creates a fresh Python Vocab by calling spacy.vocab.Vocab with no arguments
21 | {
22 | using (Py.GIL())
23 | {
24 | dynamic spacy = Py.Import("spacy");
25 | PyVocab = spacy.vocab.Vocab.__call__();
26 | }
27 | }
28 |
29 | internal Vocab(dynamic vocab) // Wraps an existing Python vocab object
30 | {
31 | PyVocab = vocab;
32 | }
33 |
34 | internal dynamic PyVocab
35 | { get; set; }
36 |
37 | public Lexeme this[object key] // Accepts a string or a BigInteger key; any other type throws Exception
38 | {
39 | get
40 | {
41 | var keyStr = key as string;
42 | if (keyStr != null)
43 | {
44 | if (_dictStr2Lex.ContainsKey(keyStr))
45 | return _dictStr2Lex[keyStr];
46 |
47 | Lexeme lexeme = null; // stays null (and is not cached) when PyVocab is null
48 |
49 | if (PyVocab != null)
50 | {
51 | using (Py.GIL())
52 | {
53 | var pyStr = new PyString(keyStr);
54 | var dynPyObj = PyVocab.__getitem__(pyStr);
55 | lexeme = new Lexeme(dynPyObj);
56 | _dictStr2Lex.Add(keyStr, lexeme);
57 | }
58 | }
59 |
60 | return lexeme;
61 | }
62 |
63 | var keyHashN = key as BigInteger?;
64 | if (keyHashN != null)
65 | {
66 | var keyHash = (BigInteger)keyHashN;
67 | if (_dictLong2Lex.ContainsKey(keyHash))
68 | return _dictLong2Lex[keyHash];
69 |
70 | Lexeme lexeme = null;
71 |
72 | if (PyVocab != null)
73 | {
74 | using (Py.GIL())
75 | {
76 | var dynPyObj = PyVocab.__getitem__(key); // the original boxed key is passed to Python, not keyHash
77 | lexeme = new Lexeme(dynPyObj);
78 | _dictLong2Lex.Add(keyHash, lexeme);
79 | }
80 | }
81 |
82 | return lexeme;
83 | }
84 |
85 | throw new Exception("Wrong datatype in parameter passed to Vocab");
86 | }
87 | }
88 |
89 | public StringStore Strings // Lazily wraps the Python vocab's strings attribute and reuses the wrapper afterwards
90 | {
91 | get
92 | {
93 | if (_stringStore != null)
94 | return _stringStore;
95 |
96 | using (Py.GIL())
97 | {
98 | var stringStore = PyVocab.strings;
99 | _stringStore = new StringStore(stringStore);
100 | return _stringStore;
101 | }
102 | }
103 | }
104 |
105 | public void ToDisk(string path) // Delegates to Python's to_disk; only implemented for Serialization.Mode.Spacy
106 | {
107 | if (Serialization.Selected != Serialization.Mode.Spacy)
108 | throw new NotImplementedException();
109 |
110 | using (Py.GIL())
111 | {
112 | var pyPath = new PyString(path);
113 | PyVocab.to_disk(pyPath);
114 | }
115 | }
116 |
117 | public void FromDisk(string path) // Delegates to Python's from_disk; only implemented for Serialization.Mode.Spacy
118 | {
119 | if (Serialization.Selected != Serialization.Mode.Spacy)
120 | throw new NotImplementedException();
121 |
122 | using (Py.GIL())
123 | {
124 | var pyPath = new PyString(path);
125 | PyVocab.from_disk(pyPath);
126 | }
127 | }
128 |
129 | public XmlSchema GetSchema()
130 | {
131 | return null;
132 | }
133 |
134 | public void ReadXml(XmlReader reader) // In SpacyAndDotNet mode rebuilds PyVocab from the Base64 PyObj element; otherwise skips the element
135 | {
136 | var serializationMode = Serialization.Selected;
137 |
138 | if (serializationMode == Serialization.Mode.SpacyAndDotNet)
139 | {
140 | reader.ReadStartElement();
141 | Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
142 | var bytesB64 = reader.ReadElementContentAsString();
143 | var bytes = Convert.FromBase64String(bytesB64);
144 | var pyBytes = ToPy.GetBytes(bytes);
145 |
146 | using (Py.GIL())
147 | {
148 | dynamic spacy = Py.Import("spacy");
149 | PyVocab = spacy.vocab.Vocab.__call__(); // start from an empty vocab, then load the serialized state into it
150 | PyVocab.from_bytes(pyBytes);
151 | }
152 |
153 | reader.ReadEndElement();
154 | }
155 | else
156 | reader.Skip();
157 |
158 | Debug.Assert(serializationMode != Serialization.Mode.Spacy); // XML (de)serialization is not expected in pure Spacy mode
159 | }
160 |
161 | public void WriteXml(XmlWriter writer) // Writes the Base64 PyObj element in SpacyAndDotNet mode; writes nothing in other modes
162 | {
163 | var serializationMode = Serialization.Selected;
164 |
165 | Debug.Assert(serializationMode != Serialization.Mode.Spacy);
166 |
167 | if (serializationMode == Serialization.Mode.SpacyAndDotNet)
168 | {
169 | using (Py.GIL())
170 | {
171 | var pyObj = ToClr.GetBytes(PyVocab.to_bytes());
172 | var pyObjB64 = Convert.ToBase64String(pyObj);
173 | writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64);
174 | }
175 | }
176 | }
177 | }
178 | }
179 |
--------------------------------------------------------------------------------
/PythonNetUtils/ToClr.cs:
--------------------------------------------------------------------------------
1 | using Python.Runtime;
2 | using System.Diagnostics;
3 | using System.Globalization;
4 | using System.Numerics;
5 | using System.Reflection;
6 |
7 | namespace PythonNetUtils
8 | {
9 | public static class ToClr // Helpers that convert Python.NET objects into CLR values, lists and byte arrays
10 | {
11 | public static T GetMember(dynamic dynBoolPyObj, ref T member) // Returns the cached member when non-null; otherwise converts the Python value and caches it
12 | {
13 | if (member != null)
14 | {
15 | return member;
16 | }
17 |
18 | member = Get(dynBoolPyObj);
19 | return member;
20 | }
21 |
22 | public static T Get(dynamic dynPyBasicType) // Converts a Python basic value to T under the GIL; unsupported T throws NotImplementedException
23 | {
24 | using (Py.GIL())
25 | {
26 | var type = typeof(T);
27 |
28 | if (type == typeof(bool) || type == typeof(bool?))
29 | {
30 | var boolPyInt = new PyInt(dynPyBasicType); // booleans are read through PyInt and compared against 0
31 | T boolVar = (T)(object)(boolPyInt.ToInt32() != 0);
32 | return boolVar;
33 | }
34 | else if (type == typeof(string))
35 | {
36 | var depPy = new PyString(dynPyBasicType);
37 | T stringVar = (T)(object)depPy.ToString();
38 | return stringVar;
39 | }
40 | else if (type == typeof(double) || type == typeof(double?))
41 | {
42 | var dynDoublePyFloat = PyFloat.AsFloat(dynPyBasicType);
43 | T doubleVar = (T)(object)dynDoublePyFloat.As();
44 | return doubleVar;
45 | }
46 | else if (type == typeof(int) || type == typeof(int?))
47 | {
48 | var intPy = new PyInt(dynPyBasicType);
49 | T intVar = (T)(object)intPy.ToInt32();
50 | return intVar;
51 | }
52 | else if (type == typeof(long) || type == typeof(long?))
53 | {
54 | var longPy = new PyInt(dynPyBasicType);
55 | T longVar = (T)(object)longPy.ToInt64();
56 | return longVar;
57 | }
58 | else if (type == typeof(BigInteger) || type == typeof(BigInteger?))
59 | {
60 | var pyInt = new PyInt(dynPyBasicType);
61 |
62 | // This is inefficient, and should be reworked in the future
63 | var str = pyInt.ToString();
64 | T bigInt = (T)(object)BigInteger.Parse(str);
65 | return bigInt;
66 | }
67 | else
68 | {
69 | throw new NotImplementedException();
70 | }
71 | }
72 | }
73 |
74 | public static List GetListFromGeneratorMember(dynamic pyGenerator, ref List lstMember) where T : new() // Cached variant of GetListFromGenerator
75 | {
76 | if (lstMember != null)
77 | {
78 | return lstMember;
79 | }
80 |
81 | lstMember = GetListFromGenerator(pyGenerator);
82 | return lstMember;
83 | }
84 |
85 | public static List GetListFromGenerator(dynamic pyGenerator) where T : new() // Materializes the generator with Python's list() and wraps each element
86 | {
87 | dynamic list;
88 |
89 | using (Py.GIL())
90 | {
91 | dynamic builtins = Py.Import("builtins");
92 | list = builtins.list(pyGenerator);
93 | }
94 |
95 | return GetListFromCollection(list);
96 | }
97 |
98 | public static List GetListFromCollectionMember(dynamic pyCollection, ref List lstMember) where T : new() // Cached variant of GetListFromCollection
99 | {
100 | if (lstMember != null)
101 | {
102 | return lstMember;
103 | }
104 |
105 | lstMember = GetListFromCollection(pyCollection);
106 | return lstMember;
107 | }
108 |
109 | public static List GetListFromCollection(dynamic pyCollection) where T: new() // Wraps every element of an indexable Python collection in a new T
110 | {
111 | var lstVar = new List();
112 |
113 | using (Py.GIL())
114 | {
115 | dynamic builtins = Py.Import("builtins");
116 | var pyCount = new PyInt(builtins.len(pyCollection));
117 | var count = pyCount.ToInt32();
118 |
119 | for (var i = 0; i < count; i++)
120 | {
121 | var element = pyCollection[i];
122 | // Reflection is used so that non-public instance constructors taking the element can be reached too
123 | Binder binder = null;
124 | BindingFlags flags = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance;
125 | CultureInfo culture = null;
126 | var parameters = new object[] { element };
127 |
128 | lstVar.Add((T)Activator.CreateInstance(typeof(T), flags, binder, parameters, culture));
129 | }
130 |
131 | return lstVar;
132 | }
133 | }
134 |
135 | public static List GetListFromListMember(dynamic pyList, ref List lstMember) // Cached variant of GetListFromList
136 | {
137 | if (lstMember != null)
138 | {
139 | return lstMember;
140 | }
141 |
142 | lstMember = GetListFromList(pyList);
143 | return lstMember;
144 | }
145 |
146 | public static List GetListFromList(dynamic pyList) // Converts a Python list of basic values; only string elements are supported
147 | {
148 | var lstVar = new List();
149 |
150 | using (Py.GIL())
151 | {
152 | dynamic builtins = Py.Import("builtins");
153 | var pyCount = new PyInt(builtins.len(pyList));
154 | var count = pyCount.ToInt32();
155 |
156 | for (var i = 0; i < count; i++)
157 | {
158 | var element = pyList[i];
159 |
160 | object created = null;
161 | if (typeof(T) == typeof(string))
162 | {
163 | var pyObj = new PyString(element);
164 | created = pyObj.ToString();
165 | }
166 | else
167 | {
168 | Debug.Assert(false); // unsupported element type: asserts in debug builds, returns null otherwise
169 | return null;
170 | }
171 |
172 | lstVar.Add((T)created);
173 | }
174 |
175 | return lstVar;
176 | }
177 | }
178 |
179 | public static byte[] GetBytes(dynamic dpyBytes) // Copies the contents of a Python buffer object into a new byte array; does not take the GIL itself
180 | {
181 | var pyBytes = (PyObject)dpyBytes;
182 | using var pyBuff = pyBytes.GetBuffer(); // PyBuffer is disposable: release the exported buffer once the copy is done
183 |
184 | var buff = new byte[pyBuff.Length];
185 | pyBuff.Read(buff, 0, (int)pyBuff.Length, 0);
186 | return buff;
187 | }
188 | }
189 | }
190 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpacyDotNet
2 |
3 | SpacyDotNet is a .NET wrapper for the natural language library [spaCy](https://spacy.io/)
4 |
5 | ## Project scope and limitations
6 |
7 | This project is not meant to be a complete and exhaustive implementation of all spaCy features and [APIs](https://spacy.io/api). Although it should be enough for basic tasks, think of it as a starting point if you need to build a complex project using spaCy in .NET
8 |
9 | Most of the basic features in _Spacy101 section_ of the docs are available. All **Containers** classes are present (_Doc_, _DocBin_, _Token_, _Span_ and _Lexeme_) with their basic properties/methods running. Also _Vocab_ and _StringStore_ in a limited form.
10 |
11 | Nevertheless any developer should be ready to add the missing properties or classes in a straightforward manner.
12 |
13 | ## Requirements
14 |
15 | This project relies on [Python.NET](http://pythonnet.github.io/) to interop with spaCy, which is written in Python/Cython.
16 |
17 | It's been tested under **Windows 11** and **Ubuntu Linux 20.04**, using the following environment
18 |
19 | - .NET 9.0 / .NET Core 3.1
20 | - spaCy 3.8.5
21 | - Python 3.12
22 | - Python.NET: Latest official NuGet: [3.0.5](https://www.nuget.org/api/v2/package/pythonnet/3.0.5)
23 |
24 | but it might work under different conditions:
25 |
26 | - It was previously tested on
27 | - .NET Core 3.1
28 | - spaCy 3.0.5
29 | - Python 3.8
30 | - Python.NET release: [3.0.0-preview2021-04-03](https://www.nuget.org/packages/pythonnet/3.0.0-preview2021-04-03)
31 | - It should work with spaCy 2.3.5 and any other spaCy version that changes only its minor/patch version number
32 |
33 | Python.NET has been tested with Python releases 3.7 to 3.13
34 |
35 | ## Setup
36 |
37 | ### 1) Create a Python virtual environment and install spaCy
38 |
39 | It's advised to create a virtual environment to install spaCy. Depending on the host system this is done in different ways. The spaCy official [installation guide](https://spacy.io/usage) is fine
40 |
41 | To run the examples, we'll also need to install the corresponding language package (_es_core_news_sm_) as shown in the guide.
42 |
43 | ### 2) Check for Python shared library
44 |
45 | Python.NET makes use of Python as a shared library. Sadly, seems like the shared library is not copied with recent versions of _virtualenv_ and it's not even distributed in some flavours of Linux/Python >= 3.8
46 |
47 | While I don't understand the rationale behind those changes, we should check the following:
48 |
49 | **Windows**
50 |
51 | Check whether _python312.dll_ is located under the _\Scripts_ folder of the virtual environment. Otherwise, go to your main Python folder and copy all DLLs into it. In my case: _python3.dll_, _python312.dll_ and _vcruntime140.dll_
52 |
53 | **Linux**
54 |
55 | Check whether a libpython shared object is located under _/bin_ folder.
56 |
57 | If not, we first need to check if the shared object is present on our system. [find_libpython](https://pypi.org/project/find-libpython/) can help with this task.
58 |
59 | If library is nowhere to be found, it's likely that installing _python-dev_ package with the package manager of your favorite distribution will place the file in your system.
60 |
61 | Once we locate the library, drop it to the _bin_ folder. In my case, the file is named _libpython3.12.so.1.0_
62 |
63 | ## Usage
64 |
65 | SpaCyDotNet is built to be used as a library. However I provide an example project as a CLI program.
66 |
67 | ### 1) Compile and Build
68 |
69 | If using the CLI to run .NET, (Linux), we should simply browse to _Test/cs_ folder and compile the project with `dotnet build`. Under Visual Studio, just load _Test.sln_ solution
70 |
71 | ### 2) Run the project
72 |
73 | The program expects two parameters
74 |
75 | - **interpreter:** Name of Python shared library file. Usually _python312.dll_ on Windows, _libpython3.12.so_ on Linux and _libpython3.12.dylib_ on Mac
76 | - **venv:** Location of the virtual environment created with a compatible python and spaCy versions
77 |
78 | Run the example with `dotnet run --interpreter <name_of_interpreter> --venv <path_to_virtualenv>` or, if using Visual Studio, set the command line in _Project => Properties => Debug => Application arguments_
79 |
80 | In my case:
81 |
82 | **Linux**
83 |
84 | dotnet run --interpreter libpython3.12.so.1.0 --venv /home/user/Dev/venvSpaCyPy312
85 |
86 | **Windows**
87 |
88 | dotnet run --interpreter python312.dll --venv C:\Users\user\Dev\venvSpaCyPy312
89 |
90 | ## Code comparison
91 |
92 | I've tried to mimic spaCy API as much as possible, considering the different nature of both C# and Python languages
93 |
94 | ### C# SpacyDotNet code
95 |
96 | ```c#
97 | var nlp = spacy.Load("en_core_web_sm");
98 | var doc = nlp.GetDocument("Apple is looking at buying U.K. startup for $1 billion");
99 |
100 | foreach (Token token in doc.Tokens)
101 | Console.WriteLine($"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}");
102 |
103 | Console.WriteLine("");
104 | foreach (Span ent in doc.Ents)
105 | Console.WriteLine($"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}");
106 |
107 | nlp = spacy.Load("en_core_web_md");
108 | var tokens = nlp.GetDocument("dog cat banana afskfsd");
109 |
110 | Console.WriteLine("");
111 | foreach (Token token in tokens.Tokens)
112 | Console.WriteLine($"{token.Text} {token.HasVector} {token.VectorNorm}, {token.IsOov}");
113 |
114 | tokens = nlp.GetDocument("dog cat banana");
115 | Console.WriteLine("");
116 | foreach (Token token1 in tokens.Tokens)
117 | {
118 | foreach (Token token2 in tokens.Tokens)
119 | Console.WriteLine($"{token1.Text} {token2.Text} {token1.Similarity(token2) }");
120 | }
121 |
122 | doc = nlp.GetDocument("I love coffee");
123 | Console.WriteLine("");
124 | Console.WriteLine(doc.Vocab.Strings["coffee"]);
125 | Console.WriteLine(doc.Vocab.Strings[3197928453018144401]);
126 |
127 | Console.WriteLine("");
128 | foreach (Token word in doc.Tokens)
129 | {
130 | var lexeme = doc.Vocab[word.Text];
131 | Console.WriteLine($@"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix}
132 | {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}");
133 | }
134 | ```
135 | ### Python spaCy code
136 |
137 | ```python
138 | nlp = spacy.load("en_core_web_sm")
139 | doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
140 |
141 | for token in doc:
142 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
143 | token.shape_, token.is_alpha, token.is_stop)
144 |
145 | print("")
146 | for ent in doc.ents:
147 | print(ent.text, ent.start_char, ent.end_char, ent.label_)
148 |
149 | nlp = spacy.load("en_core_web_md")
150 | tokens = nlp("dog cat banana afskfsd")
151 |
152 | print("")
153 | for token in tokens:
154 | print(token.text, token.has_vector, token.vector_norm, token.is_oov)
155 |
156 | tokens = nlp("dog cat banana")
157 | print("")
158 | for token1 in tokens:
159 | for token2 in tokens:
160 | print(token1.text, token2.text, token1.similarity(token2))
161 |
162 | doc = nlp("I love coffee")
163 | print("")
164 | print(doc.vocab.strings["coffee"]) # 3197928453018144401
165 | print(doc.vocab.strings[3197928453018144401]) # 'coffee'
166 |
167 | print("")
168 | for word in doc:
169 | lexeme = doc.vocab[word.text]
170 | print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
171 | lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
172 | ```
173 | ### Output
174 |
175 | ![Output](Output.png)
176 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/DocBin.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.IO;
5 | using System.Xml;
6 | using System.Xml.Schema;
7 | using System.Xml.Serialization;
8 | using Python.Runtime;
9 | using PythonNetUtils;
10 |
11 | namespace SpacyDotNet
12 | {
13 | public class DocBin : IXmlSerializable
14 | {
15 | private dynamic _pyDocBin;
16 | private List _docs;
17 |
18 | public DocBin() // Creates the wrapped Python DocBin by calling spacy.tokens.DocBin with no arguments
19 | {
20 | // The GIL is held until the end of the constructor
21 | using var gil = Py.GIL();
22 | dynamic spacyModule = Py.Import("spacy");
23 | dynamic docBinType = spacyModule.tokens.DocBin;
24 | _pyDocBin = docBinType.__call__();
25 | }
26 |
27 | public DocBin(string[] attrs, bool storeUserData) // Creates the wrapped Python DocBin, passing the attribute names and the user-data flag positionally
28 | {
29 | // The GIL is held until the end of the constructor
30 | using var gil = Py.GIL();
31 |
32 | var attrList = new PyList();
33 | // A null array produces an empty Python list
34 | foreach (var name in attrs ?? new string[0])
35 | attrList.Append(new PyString(name));
36 |
37 | // The flag crosses to Python as the integer 1 or 0
38 | int flagValue = 0;
39 | if (storeUserData)
40 | flagValue = 1;
41 | var userDataFlag = new PyInt(flagValue);
42 |
43 | dynamic spacyModule = Py.Import("spacy");
44 | _pyDocBin = spacyModule.tokens.DocBin.__call__(attrList, userDataFlag);
45 | }
46 |
47 | public void Add(Doc doc) // Appends doc to the managed _docs list and to the wrapped Python DocBin
48 | {
49 | if (_docs == null) // the list is created on first use
50 | _docs = new List();
51 |
52 | _docs.Add(doc);
53 | // Mirror the addition on the Python side, under the GIL
54 | using (Py.GIL())
55 | {
56 | dynamic pyDoc = doc.PyDoc;
57 | _pyDocBin.add(pyDoc);
58 | }
59 | }
60 |
61 | public byte[] ToBytes() // Serializes this DocBin to bytes: Python's to_bytes in Spacy mode, the WriteXml output otherwise
62 | {
63 | // Spacy mode hands the whole job to the Python object and returns early
64 | if (Serialization.Selected == Serialization.Mode.Spacy)
65 | {
66 | using (Py.GIL())
67 | {
68 | return ToClr.GetBytes(_pyDocBin.to_bytes());
69 | }
70 | }
71 |
72 | // Remaining modes: render WriteXml into an in-memory buffer with indented output
73 | using var buffer = new MemoryStream();
74 | var xmlSettings = new XmlWriterSettings { Indent = true };
75 | using var xmlWriter = XmlWriter.Create(buffer, xmlSettings);
76 |
77 | WriteXml(xmlWriter);
78 |
79 | // Flush first so buffered XML reaches the stream before it is copied out
80 | xmlWriter.Flush();
81 | return buffer.ToArray();
82 | }
83 |
84 | public void FromBytes(byte[] bytes) // Loads state from bytes: Python's from_bytes in Spacy mode, XML read through a temporary DocBin otherwise
85 | {
86 | if (Serialization.Selected == Serialization.Mode.Spacy)
87 | {
88 | var pyObj = ToPy.GetBytes(bytes);
89 | using (Py.GIL())
90 | {
91 | _pyDocBin.from_bytes(pyObj);
92 | }
93 | }
94 | else
95 | {
96 | using var stream = new MemoryStream(bytes); // disposed on exit, matching FromDisk
97 |
98 | var settings = new XmlReaderSettings();
99 | settings.IgnoreComments = true;
100 | settings.IgnoreWhitespace = true;
101 | using var reader = XmlReader.Create(stream, settings); // disposed on exit, like the writers in ToBytes/ToDisk
102 |
103 | var docBin = new DocBin();
104 | docBin.ReadXml(reader);
105 | Copy(docBin); // adopt the state of the freshly deserialized instance
106 | }
107 | }
108 |
109 | public void ToDisk(string pathFile) // Saves this DocBin to pathFile: Python's to_disk in Spacy mode, the WriteXml output otherwise
110 | {
111 | // Every mode except Spacy writes indented XML into a newly created file
112 | if (Serialization.Selected != Serialization.Mode.Spacy)
113 | {
114 | using var fileStream = new FileStream(pathFile, FileMode.Create);
115 |
116 | var xmlSettings = new XmlWriterSettings { Indent = true };
117 | using var xmlWriter = XmlWriter.Create(fileStream, xmlSettings);
118 |
119 | WriteXml(xmlWriter);
120 | return;
121 | }
122 |
123 | // Spacy mode: let the Python object write the file itself, under the GIL
124 | using (Py.GIL())
125 | {
126 | var target = new PyString(pathFile);
127 | _pyDocBin.to_disk(target);
128 | }
129 | }
130 |
/// <summary>
/// Restores this DocBin from a file written by <see cref="ToDisk"/>.
/// In Spacy mode the Python object's <c>from_disk</c> does the work; otherwise the file
/// is parsed as XML into a temporary DocBin whose state is copied here.
/// </summary>
/// <param name="pathFile">Source file path.</param>
public void FromDisk(string pathFile)
{
    if (Serialization.Selected == Serialization.Mode.Spacy)
    {
        using (Py.GIL())
        {
            var pyPath = new PyString(pathFile);
            _pyDocBin.from_disk(pyPath);
        }

        return;
    }

    using var stream = new FileStream(pathFile, FileMode.Open, FileAccess.Read);

    var settings = new XmlReaderSettings
    {
        IgnoreComments = true,
        IgnoreWhitespace = true
    };

    // The reader is disposable as well; dispose it together with the stream
    using var reader = XmlReader.Create(stream, settings);

    var docBin = new DocBin();
    docBin.ReadXml(reader);
    Copy(docBin);
}
155 |
/// <summary>
/// Returns the documents of this DocBin. The Python generator from <c>get_docs</c>
/// (when a Python DocBin exists) and the managed <c>_docs</c> field are handed to
/// <c>ToClr.GetListFromGeneratorMember</c>, which produces the resulting list.
/// </summary>
/// <param name="vocab">Vocabulary whose Python object is passed to <c>get_docs</c>.</param>
public List<Doc> GetDocs(Vocab vocab) => ToClr.GetListFromGeneratorMember(_pyDocBin?.get_docs(vocab.PyVocab), ref _docs);
157 |
/// <summary>
/// Takes over the state of another DocBin. In SpacyAndDotNet mode the Python documents
/// are additionally re-extracted and attached, by position, to the managed documents.
/// </summary>
/// <param name="docBin">Instance whose documents and Python object are adopted.</param>
private void Copy(DocBin docBin)
{
    _docs = docBin._docs;

    // I'd rather copy Python object no matter the serialization mode
    // If set to DotNet, the variable will be initialized to null
    // disregarding its current value which might be a default object
    _pyDocBin = docBin._pyDocBin;

    if (Serialization.Selected != Serialization.Mode.SpacyAndDotNet)
        return;

    using (Py.GIL())
    {
        dynamic spacyModule = Py.Import("spacy");

        // Docs are materialized against a freshly created, empty Python Vocab
        dynamic freshVocab = spacyModule.vocab.Vocab.__call__();
        dynamic docGenerator = _pyDocBin.get_docs(freshVocab);

        dynamic pyBuiltins = Py.Import("builtins");
        dynamic materialized = pyBuiltins.list(docGenerator);

        var pyLength = new PyInt(pyBuiltins.len(materialized));
        var total = pyLength.ToInt32();

        // Pair every Python doc with the managed doc at the same index
        var index = 0;
        while (index < total)
        {
            dynamic current = materialized[index];
            _docs[index].PyDoc = current;
            _docs[index].Vocab.PyVocab = current.vocab;
            index++;
        }
    }
}
191 |
/// <summary>
/// IXmlSerializable schema hook; this type supplies no schema and always returns null.
/// </summary>
public XmlSchema GetSchema() => null;
196 |
/// <summary>
/// Reads a DocBin element: the optional base64 <c>PyObj</c> payload (SpacyAndDotNet mode only)
/// followed by the <c>Docs</c> list. The reader is left after the closing DocBin tag.
/// </summary>
/// <param name="reader">Reader positioned on, or before, the DocBin element.</param>
public void ReadXml(XmlReader reader)
{
    var serializationMode = Serialization.Selected;
    reader.MoveToContent();

    Debug.Assert(reader.Name == $"{Serialization.Prefix}:DocBin");
    reader.ReadStartElement();

    if (serializationMode == Serialization.Mode.SpacyAndDotNet)
    {
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
        var bytesB64 = reader.ReadElementContentAsString();
        var bytes = Convert.FromBase64String(bytesB64);
        var pyBytes = ToPy.GetBytes(bytes);

        using (Py.GIL())
        {
            dynamic spacy = Py.Import("spacy");
            _pyDocBin = spacy.tokens.DocBin.__call__();
            _pyDocBin.from_bytes(pyBytes);
        }
    }

    Debug.Assert(serializationMode != Serialization.Mode.Spacy);

    Debug.Assert(reader.Name == $"{Serialization.Prefix}:Docs");
    _docs = new List<Doc>();

    // An empty list is written as a self-closing element, which has no end tag to read
    var docsIsEmpty = reader.IsEmptyElement;
    reader.ReadStartElement();

    if (!docsIsEmpty)
    {
        while (reader.MoveToContent() != XmlNodeType.EndElement)
        {
            var doc = new Doc();
            doc.ReadXml(reader);
            _docs.Add(doc);

            // Doc.ReadXml can return with the reader still on its own closing tag.
            // Left there, it would be mistaken for the end of the list and every
            // following document would be silently dropped.
            if (reader.MoveToContent() == XmlNodeType.EndElement && reader.LocalName == "Doc")
                reader.ReadEndElement();
        }

        // Closing Docs tag
        reader.ReadEndElement();
    }

    // Closing DocBin tag: ReadXml is expected to consume the whole element
    if (reader.MoveToContent() == XmlNodeType.EndElement)
        reader.ReadEndElement();
}
238 |
/// <summary>
/// Writes this DocBin as XML: a DocBin element holding, in SpacyAndDotNet mode, the base64
/// encoded Python bytes (<c>PyObj</c>), followed by a <c>Docs</c> element with one entry per document.
/// </summary>
/// <param name="writer">Destination writer.</param>
public void WriteXml(XmlWriter writer)
{
    writer.WriteStartElement(Serialization.Prefix, "DocBin", Serialization.Namespace);

    var serializationMode = Serialization.Selected;

    if (serializationMode == Serialization.Mode.SpacyAndDotNet)
    {
        using (Py.GIL())
        {
            var pyObj = ToClr.GetBytes(_pyDocBin.to_bytes());
            var pyObjB64 = Convert.ToBase64String(pyObj);
            writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64);
        }
    }

    Debug.Assert(serializationMode != Serialization.Mode.Spacy);

    writer.WriteStartElement("Docs", Serialization.Namespace);

    // _docs stays null until the first Add; emit an empty list instead of failing
    if (_docs != null)
    {
        foreach (var doc in _docs)
            doc.WriteXml(writer);
    }

    writer.WriteEndElement();

    writer.WriteEndElement();
}
264 | }
265 | }
266 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Token.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Diagnostics;
3 | using System.Xml;
4 | using System.Xml.Schema;
5 | using System.Xml.Serialization;
6 | using Python.Runtime;
7 | using PythonNetUtils;
8 |
9 | namespace SpacyDotNet
10 | {
/// <summary>
/// Managed counterpart of a spaCy token. Each property hands the matching Python attribute
/// and a backing field to <c>ToClr.GetMember</c>; the backing fields are also what XML
/// (de)serialization reads and writes.
/// </summary>
public class Token : IXmlSerializable
{
    private dynamic _pyToken;

    private string _text;
    private string _lemma;

    private string _pos;
    private string _tag;
    private string _dep;
    private string _shape;

    private bool? _isAlpha;
    private bool? _isStop;
    private bool? _isPunct;
    private bool? _isDigit;
    private bool? _likeNum;
    private bool? _likeEMail;

    private bool? _hasVector;
    private double? _vectorNorm;
    private bool? _isOov;

    private int? _i;

    private Token _head;

    // Index of the head token as read from XML; -1 means the token is its own head
    private int _headPos;

    private List<Token> _children;

    public Token()
    {
        // Needed to use generics
    }

    /// <summary>Wraps a Python token object.</summary>
    internal Token(dynamic token)
    {
        // The nullable backing fields already default to null
        _pyToken = token;
    }

    internal dynamic PyObj => _pyToken;

    public string Text => ToClr.GetMember(_pyToken?.text, ref _text);
    public string Lemma => ToClr.GetMember(_pyToken?.lemma_, ref _lemma);
    public string PoS => ToClr.GetMember(_pyToken?.pos_, ref _pos);
    public string Tag => ToClr.GetMember(_pyToken?.tag_, ref _tag);
    public string Dep => ToClr.GetMember(_pyToken?.dep_, ref _dep);
    public string Shape => ToClr.GetMember(_pyToken?.shape_, ref _shape);
    public bool IsAlpha => ToClr.GetMember(_pyToken?.is_alpha, ref _isAlpha);
    public bool IsStop => ToClr.GetMember(_pyToken?.is_stop, ref _isStop);
    public bool IsPunct => ToClr.GetMember(_pyToken?.is_punct, ref _isPunct);
    public bool IsDigit => ToClr.GetMember(_pyToken?.is_digit, ref _isDigit);
    public bool LikeNum => ToClr.GetMember(_pyToken?.like_num, ref _likeNum);
    public bool LikeEMail => ToClr.GetMember(_pyToken?.like_email, ref _likeEMail);
    public bool HasVector => ToClr.GetMember(_pyToken?.has_vector, ref _hasVector);
    public double VectorNorm => ToClr.GetMember(_pyToken?.vector_norm, ref _vectorNorm);
    public bool IsOov => ToClr.GetMember(_pyToken?.is_oov, ref _isOov);
    public int I => ToClr.GetMember(_pyToken?.i, ref _i);

    /// <summary>
    /// Syntactic head of this token. Resolved from the Python token on first access and then
    /// remembered; a token that is its own head returns itself.
    /// </summary>
    public Token Head
    {
        get
        {
            if (_head != null)
                return _head;

            using (Py.GIL())
            {
                var pyHeadIsSelf = new PyInt(_pyToken.head.__eq__(_pyToken));
                var headIsSelf = pyHeadIsSelf.ToInt32() != 0;
                if (headIsSelf)
                    _head = this;
                else
                    _head = new Token(_pyToken.head);

                return _head;
            }
        }

        set
        {
            _head = value;
        }
    }

    public List<Token> Children => ToClr.GetListFromGeneratorMember(_pyToken?.children, ref _children);

    /// <summary>
    /// Calls the Python token's <c>similarity</c> with another token and returns the result as a double.
    /// </summary>
    /// <param name="token">Token to compare against.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="token"/> is null.</exception>
    public double Similarity(Token token)
    {
        if (token == null)
            throw new System.ArgumentNullException(nameof(token));

        using (Py.GIL())
        {
            dynamic similarityPy = _pyToken.similarity(token.PyObj);
            var similarityPyFloat = PyFloat.AsFloat(similarityPy);
            return similarityPyFloat.As<double>();
        }
    }

    public override string ToString()
    {
        return Text;
    }

    public XmlSchema GetSchema()
    {
        return null;
    }

    /// <summary>
    /// Reads the token's child elements in the fixed order used by <see cref="WriteXml"/>.
    /// The reader must be positioned on the first child (Text); it is left right after the Head element.
    /// </summary>
    public void ReadXml(XmlReader reader)
    {
        _text = ReadString(reader, "Text");
        _lemma = ReadString(reader, "Lemma");

        _pos = ReadString(reader, "Pos");
        _tag = ReadString(reader, "Tag");
        _dep = ReadString(reader, "Dep");
        _shape = ReadString(reader, "Shape");

        _isAlpha = ReadBoolean(reader, "IsAlpha");
        _isStop = ReadBoolean(reader, "IsStop");
        _isPunct = ReadBoolean(reader, "IsPunct");
        _isDigit = ReadBoolean(reader, "IsDigit");
        _likeNum = ReadBoolean(reader, "LikeNum");
        _likeEMail = ReadBoolean(reader, "LikeEMail");

        _hasVector = ReadBoolean(reader, "HasVector");
        AssertAt(reader, "VectorNorm");
        _vectorNorm = reader.ReadElementContentAsDouble();
        _isOov = ReadBoolean(reader, "IsOov");

        AssertAt(reader, "I");
        _i = reader.ReadElementContentAsInt();

        // The head is stored as an index only; RestoreHead resolves it once all tokens exist
        AssertAt(reader, "Head");
        var headPosStr = reader.GetAttribute("Pos");
        _headPos = string.IsNullOrEmpty(headPosStr) ? -1 : XmlConvert.ToInt32(headPosStr);

        reader.Skip();
    }

    /// <summary>
    /// Writes the token's members as child elements of the element currently open on the writer.
    /// </summary>
    public void WriteXml(XmlWriter writer)
    {
        // Using the properties is important for the members to be loaded
        writer.WriteElementString("Text", Serialization.Namespace, Text);
        writer.WriteElementString("Lemma", Serialization.Namespace, Lemma);

        writer.WriteElementString("Pos", Serialization.Namespace, PoS);
        writer.WriteElementString("Tag", Serialization.Namespace, Tag);
        writer.WriteElementString("Dep", Serialization.Namespace, Dep);
        writer.WriteElementString("Shape", Serialization.Namespace, Shape);

        WriteValueElement(writer, "IsAlpha", IsAlpha);
        WriteValueElement(writer, "IsStop", IsStop);
        WriteValueElement(writer, "IsPunct", IsPunct);
        WriteValueElement(writer, "IsDigit", IsDigit);
        WriteValueElement(writer, "LikeNum", LikeNum);
        WriteValueElement(writer, "LikeEMail", LikeEMail);

        WriteValueElement(writer, "HasVector", HasVector);
        WriteValueElement(writer, "VectorNorm", VectorNorm);
        WriteValueElement(writer, "IsOov", IsOov);

        WriteValueElement(writer, "I", I);

        // An empty Pos attribute marks a token that is its own head
        writer.WriteStartElement("Head", Serialization.Namespace);
        var head = Head;
        var headPos = ReferenceEquals(head, this) ? string.Empty : XmlConvert.ToString(head.I);
        writer.WriteAttributeString("Pos", headPos);
        writer.WriteEndElement();

        // Children are not written
    }

    /// <summary>
    /// Resolves the head index read by <see cref="ReadXml"/> against the given token list.
    /// </summary>
    /// <param name="tokens">Tokens indexed by the stored head position.</param>
    internal void RestoreHead(List<Token> tokens)
    {
        if (_headPos == -1)
            _head = this;
        else
            _head = tokens[_headPos];
    }

    // Debug-only check that the reader sits on the expected prefixed element
    [Conditional("DEBUG")]
    private static void AssertAt(XmlReader reader, string localName)
    {
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:{localName}");
    }

    // Reads the current element's content as a string
    private static string ReadString(XmlReader reader, string localName)
    {
        AssertAt(reader, localName);
        return reader.ReadElementContentAsString();
    }

    // Reads the current element's content as a boolean
    private static bool ReadBoolean(XmlReader reader, string localName)
    {
        AssertAt(reader, localName);
        return reader.ReadElementContentAsBoolean();
    }

    // Writes <localName>value</localName> for a boolean value
    private static void WriteValueElement(XmlWriter writer, string localName, bool value)
    {
        writer.WriteStartElement(localName, Serialization.Namespace);
        writer.WriteValue(value);
        writer.WriteEndElement();
    }

    // Writes <localName>value</localName> for a double value
    private static void WriteValueElement(XmlWriter writer, string localName, double value)
    {
        writer.WriteStartElement(localName, Serialization.Namespace);
        writer.WriteValue(value);
        writer.WriteEndElement();
    }

    // Writes <localName>value</localName> for an integer value
    private static void WriteValueElement(XmlWriter writer, string localName, int value)
    {
        writer.WriteStartElement(localName, Serialization.Namespace);
        writer.WriteValue(value);
        writer.WriteEndElement();
    }
}
238 | }
239 |
--------------------------------------------------------------------------------
/SpaCyDotNet/api/Doc.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.IO;
5 | using System.Xml;
6 | using System.Xml.Schema;
7 | using System.Xml.Serialization;
8 | using Python.Runtime;
9 | using PythonNetUtils;
10 |
11 | namespace SpacyDotNet
12 | {
/// <summary>
/// Managed counterpart of a spaCy <c>Doc</c>. Members are read from the wrapped Python object
/// through <c>ToClr</c> helpers together with a backing field; the backing fields are also what
/// XML (de)serialization reads and writes.
/// </summary>
public class Doc : IXmlSerializable
{
    private string _text;

    private Vocab _vocab;

    private List<Token> _tokens;

    private List<Span> _sentences;
    private List<Span> _nounChunks;
    private List<Span> _ents;

    public Doc()
    {
    }

    /// <summary>Creates a new Python <c>Doc</c> bound to the given vocabulary.</summary>
    public Doc(Vocab vocab)
    {
        _vocab = vocab;

        using (Py.GIL())
        {
            dynamic spacy = Py.Import("spacy");
            dynamic pyVocab = vocab.PyVocab;
            PyDoc = spacy.tokens.doc.Doc.__call__(pyVocab);
        }
    }

    /// <summary>Wraps an existing Python doc.</summary>
    internal Doc(dynamic doc)
    {
        PyDoc = doc;
        _vocab = null;
    }

    /// <summary>Wraps an existing Python doc whose text is already known.</summary>
    internal Doc(dynamic doc, string text)
    {
        PyDoc = doc;
        _vocab = null;
        _text = text;
    }

    internal dynamic PyDoc { get; set; }

    public string Text => ToClr.GetMember(PyDoc?.text, ref _text);
    public List<Token> Tokens => ToClr.GetListFromCollectionMember(PyDoc, ref _tokens);
    public List<Span> Sents => ToClr.GetListFromGeneratorMember(PyDoc?.sents, ref _sentences);
    public List<Span> NounChunks => ToClr.GetListFromGeneratorMember(PyDoc?.noun_chunks, ref _nounChunks);
    public List<Span> Ents => ToClr.GetListFromGeneratorMember(PyDoc?.ents, ref _ents);

    /// <summary>
    /// Vocabulary of this document; wrapped from the Python doc on first access and then remembered.
    /// </summary>
    public Vocab Vocab
    {
        get
        {
            if (_vocab != null)
                return _vocab;

            using (Py.GIL())
            {
                var vocab = PyDoc.vocab;
                _vocab = new Vocab(vocab);
                return _vocab;
            }
        }
    }

    /// <summary>
    /// Writes this document to a file: via Python <c>to_disk</c> in Spacy mode, as indented XML otherwise.
    /// </summary>
    /// <param name="path">Destination file path.</param>
    public void ToDisk(string path)
    {
        if (Serialization.Selected == Serialization.Mode.Spacy)
        {
            using (Py.GIL())
            {
                var pyPath = new PyString(path);
                PyDoc.to_disk(pyPath);
            }
        }
        else
        {
            using var stream = new FileStream(path, FileMode.Create);

            var settings = new XmlWriterSettings();
            settings.Indent = true;
            using var writer = XmlWriter.Create(stream, settings);

            WriteXml(writer);
        }
    }

    /// <summary>
    /// Restores this document from a file: via Python <c>from_disk</c> in Spacy mode, from XML otherwise.
    /// </summary>
    /// <param name="path">Source file path.</param>
    public void FromDisk(string path)
    {
        if (Serialization.Selected == Serialization.Mode.Spacy)
        {
            using (Py.GIL())
            {
                var pyPath = new PyString(path);
                PyDoc.from_disk(pyPath);
            }
        }
        else
        {
            using var stream = new FileStream(path, FileMode.Open, FileAccess.Read);
            using var reader = XmlReader.Create(stream, CreateReaderSettings());

            var doc = new Doc();
            doc.ReadXml(reader);
            Copy(doc);
        }
    }

    /// <summary>
    /// Serializes this document to bytes: Python <c>to_bytes</c> in Spacy mode, indented XML otherwise.
    /// </summary>
    public byte[] ToBytes()
    {
        if (Serialization.Selected == Serialization.Mode.Spacy)
        {
            using (Py.GIL())
            {
                return ToClr.GetBytes(PyDoc.to_bytes());
            }
        }
        else
        {
            using var stream = new MemoryStream();

            var settings = new XmlWriterSettings();
            settings.Indent = true;
            using var writer = XmlWriter.Create(stream, settings);

            WriteXml(writer);
            writer.Flush();
            return stream.ToArray();
        }
    }

    /// <summary>
    /// Restores this document from bytes: Python <c>from_bytes</c> in Spacy mode, XML otherwise.
    /// </summary>
    /// <param name="bytes">Serialized content.</param>
    public void FromBytes(byte[] bytes)
    {
        if (Serialization.Selected == Serialization.Mode.Spacy)
        {
            var pyBytes = ToPy.GetBytes(bytes);
            using (Py.GIL())
            {
                PyDoc.from_bytes(pyBytes);
            }
        }
        else
        {
            // Dispose both the stream and the reader once parsing is done
            using var stream = new MemoryStream(bytes);
            using var reader = XmlReader.Create(stream, CreateReaderSettings());

            var doc = new Doc();
            doc.ReadXml(reader);
            Copy(doc);
        }
    }

    // Reader settings shared by FromDisk and FromBytes
    private static XmlReaderSettings CreateReaderSettings()
    {
        return new XmlReaderSettings
        {
            IgnoreComments = true,
            IgnoreWhitespace = true
        };
    }

    private void Copy(Doc doc)
    {
        // I'd rather copy Python object no matter the serialization mode
        // If set to DotNet, the variable will be initialized to null
        // disregarding its current value which might be a default object
        PyDoc = doc.PyDoc;

        _text = doc._text;
        _vocab = doc._vocab;
        _tokens = doc._tokens;
        _sentences = doc._sentences;
        _nounChunks = doc._nounChunks;
        _ents = doc._ents;
    }

    public XmlSchema GetSchema()
    {
        return null;
    }

    /// <summary>
    /// Reads a Doc element: the optional base64 <c>PyObj</c> payload (SpacyAndDotNet mode only),
    /// then Text, Vocab, Tokens, Sentences, NounChunks and Ents. The reader is left after the
    /// closing Doc tag.
    /// </summary>
    /// <param name="reader">Reader positioned on, or before, the Doc element.</param>
    public void ReadXml(XmlReader reader)
    {
        var serializationMode = Serialization.Selected;
        reader.MoveToContent();

        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Doc");
        reader.ReadStartElement();

        if (serializationMode == Serialization.Mode.SpacyAndDotNet)
        {
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
            var bytesB64 = reader.ReadElementContentAsString();
            var bytes = Convert.FromBase64String(bytesB64);
            var pyBytes = ToPy.GetBytes(bytes);

            using (Py.GIL())
            {
                dynamic spacy = Py.Import("spacy");
                dynamic pyVocab = spacy.vocab.Vocab.__call__();
                PyDoc = spacy.tokens.doc.Doc.__call__(pyVocab);
                PyDoc.from_bytes(pyBytes);
                _vocab = new Vocab(PyDoc.vocab);
            }
        }

        Debug.Assert(serializationMode != Serialization.Mode.Spacy);

        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text");
        _text = reader.ReadElementContentAsString();

        // NOTE(review): this replaces the Vocab created from the Python doc a few lines above - confirm intended
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:Vocab");
        _vocab = new Vocab(null);
        _vocab.ReadXml(reader);

        _tokens = ReadItems(reader, "Tokens", "Token", () => new Token());

        // Heads are stored as indexes and can only be resolved once every token exists
        foreach (var token in _tokens)
            token.RestoreHead(_tokens);

        _sentences = ReadItems(reader, "Sentences", "Sent", () => new Span());
        _nounChunks = ReadItems(reader, "NounChunks", "NounChunk", () => new Span());

        // Ents goes through the same empty-element handling as the other lists; without it
        // the reader ended up in a different place depending on whether entities existed
        _ents = ReadItems(reader, "Ents", "Ent", () => new Span());

        // Closing Doc tag: ReadXml is expected to consume the whole element
        if (reader.MoveToContent() == XmlNodeType.EndElement && reader.LocalName == "Doc")
            reader.ReadEndElement();
    }

    /// <summary>
    /// Writes this document as a Doc element holding, in SpacyAndDotNet mode, the base64 encoded
    /// Python bytes (<c>PyObj</c>), followed by Text, Vocab, Tokens, Sentences, NounChunks and Ents.
    /// </summary>
    /// <param name="writer">Destination writer.</param>
    public void WriteXml(XmlWriter writer)
    {
        writer.WriteStartElement(Serialization.Prefix, "Doc", Serialization.Namespace);

        var serializationMode = Serialization.Selected;

        if (serializationMode == Serialization.Mode.SpacyAndDotNet)
        {
            using (Py.GIL())
            {
                var pyObj = ToClr.GetBytes(PyDoc.to_bytes());
                var pyObjB64 = Convert.ToBase64String(pyObj);
                writer.WriteElementString("PyObj", Serialization.Namespace, pyObjB64);
            }
        }

        Debug.Assert(serializationMode != Serialization.Mode.Spacy);

        // Using the properties is important for the members to be loaded
        writer.WriteElementString("Text", Serialization.Namespace, Text);
        writer.WriteStartElement("Vocab", Serialization.Namespace);
        Vocab.WriteXml(writer);
        writer.WriteEndElement();

        WriteItems(writer, "Tokens", "Token", Tokens);
        WriteItems(writer, "Sentences", "Sent", Sents);
        WriteItems(writer, "NounChunks", "NounChunk", NounChunks);
        WriteItems(writer, "Ents", "Ent", Ents);

        writer.WriteEndElement();
    }

    // Reads a container element whose children are item elements, each deserialized into a new T
    private static List<T> ReadItems<T>(XmlReader reader, string containerName, string itemName, Func<T> create)
        where T : IXmlSerializable
    {
        Debug.Assert(reader.Name == $"{Serialization.Prefix}:{containerName}");
        var items = new List<T>();

        // A self-closing container has neither children nor an end tag
        var isEmpty = reader.IsEmptyElement;
        reader.ReadStartElement();
        if (isEmpty)
            return items;

        while (reader.MoveToContent() != XmlNodeType.EndElement)
        {
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:{itemName}");
            reader.ReadStartElement();
            if (reader.NodeType != XmlNodeType.EndElement)
            {
                var item = create();
                item.ReadXml(reader);
                items.Add(item);
                reader.ReadEndElement();
            }
        }

        reader.ReadEndElement();
        return items;
    }

    // Writes a container element with one item element per entry
    private static void WriteItems<T>(XmlWriter writer, string containerName, string itemName, IEnumerable<T> items)
        where T : IXmlSerializable
    {
        writer.WriteStartElement(containerName, Serialization.Namespace);
        foreach (var item in items)
        {
            writer.WriteStartElement(itemName, Serialization.Namespace);
            item.WriteXml(writer);
            writer.WriteEndElement();
        }

        writer.WriteEndElement();
    }
}
386 | }
387 |
--------------------------------------------------------------------------------