├── pynlpl ├── mt │ ├── __init__.py │ └── wordalign.py ├── tests │ ├── __init__.py │ ├── evaluation_timbl │ │ ├── train │ │ ├── timbltest.sh │ │ ├── test │ │ └── test.IB1.O.gr.k1.out │ ├── test.sh │ ├── datatypes.py │ ├── statistics.py │ ├── formats.py │ ├── cql.py │ ├── evaluation.py │ ├── textprocessors.py │ ├── folia_benchmark.py │ └── search.py ├── tools │ ├── __init__.py │ ├── reflow.py │ ├── phrasetableserver.py │ ├── sonarlemmafreqlist.py │ ├── foliasplitcgnpostags.py │ ├── sampler.py │ ├── freqlist.py │ ├── sonar2folia.py │ ├── computepmi.py │ └── frogwrapper.py ├── clients │ ├── __init__.py │ ├── freeling.py │ └── frogclient.py ├── formats │ ├── __init__.py │ ├── cgn.py │ ├── timbl.py │ ├── taggerdata.py │ ├── moses.py │ ├── dutchsemcor.py │ ├── sonar.py │ ├── cql.py │ └── giza.py ├── lm │ ├── __init__.py │ ├── makesrilmcc │ ├── srilm.cc │ ├── client.py │ ├── server.py │ └── srilm.py ├── __init__.py ├── algorithms.py ├── common.py ├── fsa.py └── net.py ├── requirements.txt ├── AUTHORS ├── docs ├── pineapple.jpg ├── pynlpl_pres.pdf ├── pynlpl_pres2.pdf ├── common.rst ├── datatypes.rst ├── search.rst ├── evaluation.rst ├── lm.rst ├── _templates │ ├── fullclass.rst │ └── foliaelement.rst ├── formats.rst ├── textprocessors.rst ├── statistics.rst ├── index.rst ├── Makefile └── conf.py ├── .gitignore ├── MANIFEST.in ├── setup.cfg ├── .travis.yml ├── .readthedocs.yaml ├── setup.py └── README.rst /pynlpl/mt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pynlpl/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pynlpl/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml>=2.2 2 | httplib2>=0.6 3 | numpy 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | mvgompel = Maarten van Gompel 2 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/train: -------------------------------------------------------------------------------- 1 | cat cat 2 | dog dog 3 | rabbit rabbit 4 | -------------------------------------------------------------------------------- /docs/pineapple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proycon/pynlpl/HEAD/docs/pineapple.jpg -------------------------------------------------------------------------------- /docs/pynlpl_pres.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proycon/pynlpl/HEAD/docs/pynlpl_pres.pdf -------------------------------------------------------------------------------- /docs/pynlpl_pres2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proycon/pynlpl/HEAD/docs/pynlpl_pres2.pdf -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/timbltest.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | timbl -f train -t test +v+cm+cs 3 | -------------------------------------------------------------------------------- /pynlpl/clients/__init__.py: -------------------------------------------------------------------------------- 1 | """This packages contains clients for communicating with specific servers""" 2 | -------------------------------------------------------------------------------- /pynlpl/formats/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules for reading and/or writing specific file formats""" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore .pyc files 2 | *.pyc 3 | # Ignore generated dirs 4 | build/* 5 | docs/_autosummary/* 6 | docs/build/* 7 | -------------------------------------------------------------------------------- /pynlpl/lm/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules for Language Models, with a C++/Python module for SRILM by Sander Canisius""" 2 | -------------------------------------------------------------------------------- /docs/common.rst: -------------------------------------------------------------------------------- 1 | Common Functions 2 | ================================== 3 | 4 | .. automodule:: pynlpl.common 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /docs/datatypes.rst: -------------------------------------------------------------------------------- 1 | Data Types 2 | ================================== 3 | 4 | .. automodule:: pynlpl.datatypes 5 | :members: 6 | :undoc-members: 7 | 8 | -------------------------------------------------------------------------------- /docs/search.rst: -------------------------------------------------------------------------------- 1 | Search Algorithms 2 | ================================== 3 | 4 | .. automodule:: pynlpl.search 5 | :members: 6 | :undoc-members: 7 | 8 | -------------------------------------------------------------------------------- /docs/evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluation & Experiments 2 | ================================== 3 | 4 | .. 
automodule:: pynlpl.evaluation 5 | :members: 6 | :undoc-members: 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE 3 | include requirements.txt 4 | recursive-include pynlpl *.py 5 | include pynlpl/tests/test.sh 6 | include pynlpl/tests/evaluation_timbl/* 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_sphinx] 2 | source-dir = ../docs/ 3 | build-dir = ../docs/build 4 | all_files = 1 5 | 6 | [upload_sphinx] 7 | upload-dir = ../docs/build/html 8 | 9 | [easy_install] 10 | 11 | -------------------------------------------------------------------------------- /docs/lm.rst: -------------------------------------------------------------------------------- 1 | Language Models 2 | ================================== 3 | 4 | .. automodule:: pynlpl.lm.lm 5 | :members: 6 | :undoc-members: 7 | 8 | .. automodule:: pynlpl.lm.srilm 9 | :members: 10 | :undoc-members: 11 | 12 | .. automodule:: pynlpl.lm.server 13 | :members: 14 | :undoc-members: 15 | 16 | .. automodule:: pynlpl.lm.client 17 | :members: 18 | :undoc-members: 19 | 20 | 21 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/test: -------------------------------------------------------------------------------- 1 | cat cat 2 | cat cat 3 | cat cat 4 | cat cat 5 | cat cat 6 | dog cat 7 | dog cat 8 | dog cat 9 | cat dog 10 | cat dog 11 | rabbit dog 12 | dog dog 13 | dog dog 14 | dog dog 15 | dog rabbit 16 | dog rabbit 17 | rabbit rabbit 18 | rabbit rabbit 19 | rabbit rabbit 20 | rabbit rabbit 21 | rabbit rabbit 22 | rabbit rabbit 23 | rabbit rabbit 24 | rabbit rabbit 25 | rabbit rabbit 26 | rabbit rabbit 27 | rabbit rabbit 28 | -------------------------------------------------------------------------------- /pynlpl/__init__.py: -------------------------------------------------------------------------------- 1 | """PyNLPl, pronounced as "pineapple", is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl can be used for example the computation of n-grams, frequency lists and distributions, language models. There are also more complex data types, such as Priority Queues, and search algorithms, such as Beam Search. 2 | 3 | The library is divided into several packages and modules. It is designed for Python 2.6 and upwards. 
Including Python 3.""" 4 | 5 | VERSION = "1.2.9" 6 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/test.IB1.O.gr.k1.out: -------------------------------------------------------------------------------- 1 | cat cat cat 2 | cat cat cat 3 | cat cat cat 4 | cat cat cat 5 | cat cat cat 6 | dog cat dog 7 | dog cat dog 8 | dog cat dog 9 | cat dog cat 10 | cat dog cat 11 | rabbit dog rabbit 12 | dog dog dog 13 | dog dog dog 14 | dog dog dog 15 | dog rabbit dog 16 | dog rabbit dog 17 | rabbit rabbit rabbit 18 | rabbit rabbit rabbit 19 | rabbit rabbit rabbit 20 | rabbit rabbit rabbit 21 | rabbit rabbit rabbit 22 | rabbit rabbit rabbit 23 | rabbit rabbit rabbit 24 | rabbit rabbit rabbit 25 | rabbit rabbit rabbit 26 | rabbit rabbit rabbit 27 | rabbit rabbit rabbit 28 | -------------------------------------------------------------------------------- /pynlpl/tools/reflow.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf8 -*- 3 | 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | from __future__ import absolute_import 9 | 10 | import sys 11 | import io 12 | import getopt 13 | 14 | from pynlpl.textprocessors import ReflowText 15 | 16 | 17 | def main(): 18 | for filename in sys.argv[1:]: 19 | f = io.open(filename, 'r', encoding='utf-8') 20 | for line in ReflowText(f): 21 | print(line) 22 | f.close() 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # whitelist 2 | branches: 3 | only: 4 | - master 5 | notifications: 6 | irc: 7 | channels: 8 | - "irc.uvt.nl#gitlama" 9 | template: 10 | - "%{repository_slug}#%{build_number} %{message} --> %{build_url}" 11 | skip_join: true 12 | language: python 13 | dist: trusty 14 | python: 15 | - "2.7" 16 | - "3.4" 17 | - "3.5" 18 | before_install: 19 | - sudo apt-get update -qq 20 | - sudo apt-get install -y xmldiff 21 | - pip install -U setuptools 22 | install: 23 | - pip install -r requirements.txt 24 | - python setup.py install 25 | script: 26 | - bash pynlpl/tests/test.sh 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | # python: 21 | # install: 22 | # - requirements: docs/requirements.txt 23 | 24 | -------------------------------------------------------------------------------- /docs/_templates/fullclass.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. 
autoclass:: {{ objname }} 7 | :show-inheritance: 8 | :members: * 9 | 10 | {% block methods %} 11 | 12 | {% if methods %} 13 | .. rubric:: Method Summary 14 | 15 | .. autosummary:: 16 | {% for item in methods %} 17 | ~{{ name }}.{{ item }} 18 | {%- endfor %} 19 | {% endif %} 20 | {% endblock %} 21 | 22 | {% block attributes %} 23 | {% if attributes %} 24 | .. rubric:: Attributes 25 | 26 | .. autosummary:: 27 | {% for item in attributes %} 28 | ~{{ name }}.{{ item }} 29 | {%- endfor %} 30 | {% endif %} 31 | {% endblock %} 32 | 33 | .. rubric:: Method Details 34 | 35 | .. automethod:: __init__ 36 | 37 | {% for m in methods %} 38 | .. automethod:: {{ m }} 39 | {% endfor %} 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /pynlpl/tools/phrasetableserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ############################################################### 5 | # PyNLPl - Phrase Table Server 6 | # by Maarten van Gompel (proycon) 7 | # http://ilk.uvt.nl/~mvgompel 8 | # Induction for Linguistic Knowledge Research Group 9 | # Universiteit van Tilburg 10 | # 11 | # Licensed under GPLv3 12 | # 13 | ############################################################### 14 | 15 | 16 | import sys 17 | import os 18 | 19 | if __name__ == "__main__": 20 | sys.path.append(sys.path[0] + '/../..') 21 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 22 | 23 | from pynlpl.formats.moses import PhraseTable, PhraseTableServer 24 | 25 | 26 | 27 | 28 | if len(sys.argv) != 3: 29 | print >>sys.stderr,"Syntax: phrasetableserver.py phrasetable port" 30 | sys.exit(2) 31 | else: 32 | port = int(sys.argv[2]) 33 | PhraseTableServer(PhraseTable(sys.argv[1]), port) 34 | -------------------------------------------------------------------------------- /pynlpl/lm/makesrilmcc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # README!!! 4 | 5 | #First compile SRILM as follows: 6 | 7 | #In srilm/common/Makefile.machine.i686 (also for x86_64!) set: 8 | # ADDITIONAL_CFLAGS =-fPIC 9 | # ADDITIONAL_CXXFLAGS =-fPIC 10 | 11 | #And change GCC_FLAGS to GCC_FLAGS = -Wreturn-type -Wimplicit 12 | 13 | #Then compile: 14 | # make MACHINE_TYPE=i686 NO_TCL=X 15 | 16 | #Then edit the directories in this script and run ./makesrilm 17 | 18 | if [ -z $1 ]; then 19 | echo "Usage: ./makesrilm /path/to/srilm/ [pythonversion]" >&2 20 | exit 1; 21 | fi 22 | 23 | 24 | 25 | export SRILM=$1 #Default: /home/mvgompel/tmp/srilm5.10/ #(must be an absolute path!) 26 | export SRILMLIBS=$SRILM/lib/i686 27 | if [ -z $1 ]; then 28 | PYTHONVERSION=$2 29 | else 30 | PYTHONVERSION="2.7" 31 | fi 32 | g++ -fPIC -shared -I/usr/include/python$PYTHONVERSION -lpython$PYTHONVERSION -I$SRILM/src -I$SRILM/include -lboost_python srilm.cc $SRILMLIBS/liboolm.a $SRILMLIBS/libdstruct.a $SRILMLIBS/libmisc.a -o srilmcc.so 33 | 34 | -------------------------------------------------------------------------------- /docs/formats.rst: -------------------------------------------------------------------------------- 1 | Formats 2 | ================================== 3 | 4 | Corpus Gesproken Nederlands 5 | :::::::::::::::::::::::::::::: 6 | 7 | .. automodule:: pynlpl.formats.cgn 8 | :members: 9 | :undoc-members: 10 | 11 | FoLiA 12 | :::::::::::::::::::::::::::::: 13 | 14 | See folia_ : folia.html 15 | 16 | GIZA++ 17 | :::::::::::::::::::::::::::::: 18 | 19 | .. 
automodule:: pynlpl.formats.giza 20 | :members: 21 | :undoc-members: 22 | 23 | 24 | Moses 25 | :::::::::::::::::::::::::::::: 26 | 27 | .. automodule:: pynlpl.formats.moses 28 | :members: 29 | :undoc-members: 30 | 31 | 32 | SoNaR 33 | :::::::::::::::::::::::::::::: 34 | 35 | .. automodule:: pynlpl.formats.sonar 36 | :members: 37 | :undoc-members: 38 | 39 | 40 | 41 | Taggerdata 42 | :::::::::::::::::::::::::::::: 43 | 44 | .. automodule:: pynlpl.formats.taggerdata 45 | :members: 46 | :undoc-members: 47 | 48 | 49 | TiMBL 50 | :::::::::::::::::::::::::::::: 51 | 52 | .. automodule:: pynlpl.formats.timbl 53 | :members: 54 | :undoc-members: 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/_templates/foliaelement.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | :show-inheritance: 8 | :undoc-members: 9 | :special-members: 10 | 11 | {% block methods %} 12 | 13 | {% if methods %} 14 | .. rubric:: Method Summary 15 | 16 | .. autosummary:: 17 | {% for item in methods %} 18 | ~{{ name }}.{{ item }} 19 | {%- endfor %} 20 | {% for private_method in ['__iter__', '__len__', '__str__'] %} 21 | {% if private_method in members %} 22 | ~{{ name }}.{{ private_method }} 23 | {% endif %} 24 | {% endfor %} 25 | {% endif %} 26 | {% endblock %} 27 | 28 | {% block attributes %} 29 | {% if attributes %} 30 | .. rubric:: Class Attributes 31 | 32 | {% for item in attributes %} 33 | .. autoattribute:: {{ item }} 34 | {%- endfor %} 35 | {% endif %} 36 | {% endblock %} 37 | 38 | .. rubric:: Method Details 39 | 40 | .. automethod:: __init__ 41 | {% for m in methods %} 42 | .. automethod:: {{ m }} 43 | {% endfor %} 44 | {% for private_method in ['__iter__', '__len__', '__str__'] %} 45 | {% if private_method in members %} 46 | .. automethod:: {{ private_method }} 47 | {% endif %} 48 | {% endfor %} 49 | -------------------------------------------------------------------------------- /pynlpl/tools/sonarlemmafreqlist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | from __future__ import print_function, unicode_literals, division, absolute_import 6 | 7 | import sys 8 | import os 9 | 10 | if __name__ == "__main__": 11 | sys.path.append(sys.path[0] + '/../..') 12 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 
13 | 14 | from pynlpl.formats.sonar import CorpusFiles, Corpus 15 | from pynlpl.statistics import FrequencyList 16 | 17 | sonardir = sys.argv[1] 18 | 19 | freqlist = FrequencyList() 20 | lemmapos_freqlist = FrequencyList() 21 | poshead_freqlist = FrequencyList() 22 | pos_freqlist = FrequencyList() 23 | 24 | for i, doc in enumerate(Corpus(sonardir)): 25 | print("#" + str(i) + " Processing " + doc.filename,file=sys.stderr) 26 | for word, id, pos, lemma in doc: 27 | freqlist.count(word) 28 | if lemma and pos: 29 | poshead = pos.split('(')[0] 30 | lemmapos_freqlist.count(lemma+'.'+poshead) 31 | poshead_freqlist.count(poshead) 32 | pos_freqlist.count(pos) 33 | 34 | freqlist.save('sonarfreqlist.txt') 35 | lemmapos_freqlist.save('sonarlemmaposfreqlist.txt') 36 | poshead_freqlist.save('sonarposheadfreqlist.txt') 37 | pos_freqlist.save('sonarposfreqlist.txt') 38 | 39 | print(unicode(freqlist).encode('utf-8')) 40 | -------------------------------------------------------------------------------- /pynlpl/lm/srilm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | using namespace boost::python; 5 | 6 | #include "srilm/include/File.h" 7 | #include "srilm/include/Ngram.h" 8 | #include "srilm/include/Vocab.h" 9 | //#include 10 | #include "srilm/lm/src/NgramStatsLong.cc" 11 | 12 | class LanguageModel 13 | { 14 | private: 15 | Vocab vocab; 16 | Ngram model; 17 | 18 | public: 19 | LanguageModel(const std::string& filename, int order) : model(vocab, order) 20 | { 21 | File file(filename.c_str(), "r"); 22 | model.read(file); 23 | } 24 | 25 | Boolean exists(const std::string& word) { 26 | return (vocab.getIndex(word.c_str()) != Vocab_None); 27 | } 28 | 29 | LogP wordProb(const std::string& context1, const std::string& context2, const std::string& word) 30 | { 31 | /*VocabIndex contextindex1 = Vocab_None; 32 | VocabIndex contextindex2 = Vocab_None; 33 | if (context2 != "__") contextindex2 = vocab.getIndex(context2.c_str()); 34 | if (context1 != "__") contextindex1 = vocab.getIndex(context1.c_str());*/ 35 | 36 | const VocabIndex context[] = { 37 | (context2 == "__") ? Vocab_None : vocab.getIndex(context2.c_str()), 38 | (context1 == "__") ? Vocab_None : vocab.getIndex(context1.c_str()) 39 | }; 40 | 41 | //const VocabIndex context[] = { context2, context1 }; 42 | return model.wordProb(vocab.getIndex(word.c_str()), context); 43 | } 44 | }; 45 | 46 | 47 | BOOST_PYTHON_MODULE(srilmcc) 48 | { 49 | class_("LanguageModel", init()) 50 | .def("wordProb", &LanguageModel::wordProb) 51 | .def("exists", &LanguageModel::exists) 52 | ; 53 | } 54 | -------------------------------------------------------------------------------- /pynlpl/tools/foliasplitcgnpostags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | from __future__ import print_function, unicode_literals, division, absolute_import 6 | 7 | import glob 8 | import sys 9 | import os 10 | 11 | 12 | if __name__ == "__main__": 13 | sys.path.append(sys.path[0] + '/../..') 14 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 
15 | 16 | from pynlpl.formats import folia 17 | from pynlpl.formats import cgn 18 | import lxml.etree 19 | 20 | def process(target): 21 | print("Processing " + target) 22 | if os.path.isdir(target): 23 | print("Descending into directory " + target) 24 | for f in glob.glob(target + '/*'): 25 | process(f) 26 | elif os.path.isfile(target) and target[-4:] == '.xml': 27 | print("Loading " + target) 28 | try: 29 | doc = folia.Document(file=target) 30 | except lxml.etree.XMLSyntaxError: 31 | print("UNABLE TO LOAD " + target + " (XML SYNTAX ERROR!)",file=sys.stderr) 32 | return None 33 | changed = False 34 | for word in doc.words(): 35 | try: 36 | pos = word.annotation(folia.PosAnnotation) 37 | except folia.NoSuchAnnotation: 38 | continue 39 | try: 40 | word.replace( cgn.parse_cgn_postag(pos.cls) ) 41 | changed = True 42 | except cgn.InvalidTagException: 43 | print("WARNING: INVALID TAG " + pos.cls,file=sys.stderr) 44 | continue 45 | if changed: 46 | print("Saving...") 47 | doc.save() 48 | 49 | target = sys.argv[1] 50 | process(target) 51 | 52 | -------------------------------------------------------------------------------- /docs/textprocessors.rst: -------------------------------------------------------------------------------- 1 | Text Processors 2 | ================================== 3 | 4 | This module contains classes and functions for text processing. It is imported as follows:: 5 | 6 | import pynlpl.textprocessors 7 | 8 | Tokenisation 9 | ------------------ 10 | 11 | A very crude tokeniser is available in the form of the function ``pynlpl.textprocessors.crude_tokeniser(string)``. This will split punctuation characters from words and return a list of tokens. It however has no regard for abbreviations and end-of-sentence detection, which is functionality a more sophisticated tokeniser can provide:: 12 | 13 | tokens = pynlpl.textprocessors.crude_tokeniser("to be, or not to be.") 14 | 15 | This will result in:: 16 | 17 | tokens == ['to','be',',','or','not','to','be','.'] 18 | 19 | 20 | N-gram extraction 21 | ------------------ 22 | 23 | The extraction of n-grams is an elemental operation in Natural Language Processing. PyNLPl offers the ``Windower`` class to accomplish this task:: 24 | 25 | tokens = pynlpl.textprocessors.crude_tokeniser("to be or not to be") 26 | for trigram in Windower(tokens,3): 27 | print(trigram) 28 | 29 | The input to the Windower should be a list of words and a value for n. In addition, the windower can output extra symbols at the beginning of the input sequence and at the end of it. By default, this behaviour is enabled: the symbol inserted at the beginning is ``<begin>`` and the symbol appended at the end is ``<end>``. If this behaviour is unwanted you can suppress it by instantiating the Windower as follows:: 30 | 31 | Windower(tokens,3, None, None) 32 | 33 | The Windower is implemented as a Python generator and at each iteration yields a tuple of length n. 34 | 35 | 36 | ..
automodule:: pynlpl.textprocessors 37 | :members: 38 | :undoc-members: 39 | -------------------------------------------------------------------------------- /pynlpl/lm/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from __future__ import division 7 | from __future__ import absolute_import 8 | 9 | import socket 10 | 11 | class LMClient(object): 12 | 13 | def __init__(self,host= "localhost",port=12346,n = 0): 14 | self.BUFSIZE = 1024 15 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) #Create the socket 16 | self.socket.settimeout(120) 17 | assert isinstance(port,int) 18 | self.socket.connect((host, port)) #Connect to server 19 | assert isinstance(n,int) 20 | self.n = n 21 | 22 | def scoresentence(self, sentence): 23 | if self.n > 0: 24 | raise Exception("This client instance has been set to send only " + str(self.n) + "-grams") 25 | if isinstance(sentence,list) or isinstance(sentence,tuple): 26 | sentence = " ".join(sentence) 27 | self.socket.send(sentence+ "\r\n") 28 | return float(self.socket.recv(self.BUFSIZE).strip()) 29 | 30 | def __getitem__(self, ngram): 31 | if self.n == 0: 32 | raise Exception("This client has been set to send only full sentence, not n-grams") 33 | if isinstance(ngram,str) or isinstance(ngram,unicode): 34 | ngram = ngram.split(" ") 35 | if len(ngram) != self.n: 36 | raise Exception("This client instance has been set to send only " + str(self.n) + "-grams.") 37 | ngram = " ".join(ngram) 38 | if (sys.version < '3' and isinstance(ngram,unicode)) or( sys.version == '3' and isinstance(ngram,str)): 39 | ngram = ngram.encode('utf-8') 40 | self.socket.send(ngram + b"\r\n") 41 | return float(self.socket.recv(self.BUFSIZE).strip()) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /pynlpl/lm/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Language Models 6 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Generic Server for Language Models 11 | # 12 | #---------------------------------------------------------------- 13 | 14 | #No Python 3 support for twisted yet... 
15 | 16 | from twisted.internet import protocol, reactor 17 | from twisted.protocols import basic 18 | 19 | class LMSentenceProtocol(basic.LineReceiver): 20 | def lineReceived(self, sentence): 21 | try: 22 | score = self.factory.lm.scoresentence(sentence) 23 | except: 24 | score = 0.0 25 | self.sendLine(str(score)) 26 | 27 | class LMSentenceFactory(protocol.ServerFactory): 28 | protocol = LMSentenceProtocol 29 | 30 | def __init__(self, lm): 31 | self.lm = lm 32 | 33 | class LMNGramProtocol(basic.LineReceiver): 34 | def lineReceived(self, ngram): 35 | ngram = ngram.split(" ") 36 | try: 37 | score = self.factory.lm[ngram] 38 | except: 39 | score = 0.0 40 | self.sendLine(str(score)) 41 | 42 | class LMNGramFactory(protocol.ServerFactory): 43 | protocol = LMNGramProtocol 44 | 45 | def __init__(self, lm): 46 | self.lm = lm 47 | 48 | 49 | 50 | class LMServer: 51 | """Language Model Server""" 52 | def __init__(self, lm, port=12346, n=0): 53 | """n indicates the n-gram size, if set to 0 (which is default), the server will expect to only receive whole sentence, if set to a particular value, it will only expect n-grams of that value""" 54 | if n == 0: 55 | reactor.listenTCP(port, LMSentenceFactory(lm)) 56 | else: 57 | reactor.listenTCP(port, LMNGramFactory(lm)) 58 | reactor.run() 59 | 60 | -------------------------------------------------------------------------------- /pynlpl/tests/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -z "$1" ]; then 4 | PYTHON=$1 5 | else 6 | PYTHON=python 7 | fi 8 | 9 | if [ ! -z "$2" ]; then 10 | TESTDIR="$2" 11 | else 12 | TESTDIR=`dirname $0` 13 | fi 14 | cd $TESTDIR 15 | 16 | GOOD=1 17 | 18 | echo "Testing CGN">&2 19 | $PYTHON cgn.py 20 | if [ $? -ne 0 ]; then 21 | echo "Test failed!!!" >&2 22 | GOOD=0 23 | fi 24 | 25 | echo "Testing datatypes">&2 26 | $PYTHON datatypes.py 27 | if [ $? -ne 0 ]; then 28 | echo "Test failed!!!" >&2 29 | GOOD=0 30 | fi 31 | 32 | 33 | echo "Testing evaluation">&2 34 | $PYTHON evaluation.py 35 | if [ $? -ne 0 ]; then 36 | echo "Test failed!!!" >&2 37 | GOOD=0 38 | fi 39 | 40 | 41 | echo "Testing search">&2 42 | $PYTHON search.py 43 | if [ $? -ne 0 ]; then 44 | echo "Test failed!!!" >&2 45 | GOOD=0 46 | fi 47 | 48 | echo "Testing textprocessors">&2 49 | $PYTHON textprocessors.py 50 | if [ $? -ne 0 ]; then 51 | echo "Test failed!!!" >&2 52 | GOOD=0 53 | fi 54 | 55 | 56 | echo "Testing statistics">&2 57 | $PYTHON statistics.py 58 | if [ $? -ne 0 ]; then 59 | echo "Test failed!!!" >&2 60 | GOOD=0 61 | fi 62 | 63 | 64 | echo "Testing formats">&2 65 | $PYTHON formats.py 66 | if [ $? -ne 0 ]; then 67 | echo "Test failed!!!" >&2 68 | GOOD=0 69 | fi 70 | 71 | echo "Testing folia">&2 72 | $PYTHON folia.py 73 | if [ $? -ne 0 ]; then 74 | echo "Test failed!!!" >&2 75 | GOOD=0 76 | fi 77 | 78 | echo "Testing FQL">&2 79 | $PYTHON fql.py 80 | if [ $? -ne 0 ]; then 81 | echo "Test failed!!!" >&2 82 | GOOD=0 83 | fi 84 | 85 | echo "Testing CQL">&2 86 | $PYTHON cql.py 87 | if [ $? -ne 0 ]; then 88 | echo "Test failed!!!" >&2 89 | GOOD=0 90 | fi 91 | 92 | cd .. 93 | 94 | if [ $GOOD -eq 1 ]; then 95 | echo "Done, all tests passed!" >&2 96 | exit 0 97 | else 98 | echo "TESTS FAILED!!!!" >&2 99 | exit 1 100 | fi 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | 7 | import os 8 | import sys 9 | from setuptools import setup, find_packages 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | entry_points = {} 15 | if sys.version > '3': 16 | entry_points = { 'console_scripts': [ 17 | 'pynlpl-computepmi = pynlpl.tools.computepmi:main', 18 | 'pynlpl-sampler = pynlpl.tools.sampler:main', 19 | 'pynlpl-makefreqlist = pynlpl.tools.freqlist:main', 20 | ] 21 | } 22 | 23 | 24 | setup( 25 | name = "PyNLPl", 26 | version = "1.2.9", #edit version in __init__.py as well and ensure tests/folia.py FOLIARELEASE points to the right version and is not set to None! 27 | author = "Maarten van Gompel", 28 | author_email = "proycon@anaproy.nl", 29 | description = ("PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl contains modules for basic tasks, clients for interfacting with server, and modules for parsing several file formats common in NLP, most notably FoLiA."), 30 | license = "GPL", 31 | keywords = "nlp computational_linguistics search ngrams language_models linguistics toolkit", 32 | url = "https://github.com/proycon/pynlpl", 33 | packages=['pynlpl','pynlpl.clients','pynlpl.lm','pynlpl.formats','pynlpl.mt','pynlpl.tools','pynlpl.tests'], 34 | long_description=read('README.rst'), 35 | classifiers=[ 36 | "Development Status :: 5 - Production/Stable", 37 | "Topic :: Text Processing :: Linguistic", 38 | "Programming Language :: Python :: 2.7", 39 | "Programming Language :: Python :: 3", 40 | "Operating System :: POSIX", 41 | "Intended Audience :: Developers", 42 | "Intended Audience :: Science/Research", 43 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 44 | ], 45 | zip_safe=False, 46 | include_package_data=True, 47 | package_data = {'pynlpl': ['tests/test.sh', 'tests/evaluation_timbl/*'] }, 48 | install_requires=['lxml >= 2.2','httplib2 >= 0.6','rdflib'], 49 | entry_points = entry_points 50 | ) 51 | -------------------------------------------------------------------------------- /pynlpl/lm/srilm.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - SRILM Language Model 3 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 4 | # http://ilk.uvt.nl/~mvgompel 5 | # proycon AT anaproy DOT nl 6 | # 7 | # Adapted from code by Sander Canisius 8 | # 9 | # Licensed under GPLv3 10 | # 11 | # 12 | # This library enables using SRILM as language model 13 | # 14 | #---------------------------------------------------------------- 15 | 16 | from __future__ import print_function 17 | from __future__ import unicode_literals 18 | from __future__ import division 19 | from __future__ import absolute_import 20 | 21 | try: 22 | import srilmcc 23 | except ImportError: 24 | import warnings 25 | warnings.warn("srilmcc module is not compiled") 26 | srilmcc = None 27 | 28 | from pynlpl.textprocessors import Windower 29 | 30 | 31 | class SRILMException(Exception): 32 | """Base Exception for SRILM.""" 33 | 34 | 35 | class SRILM: 36 | def __init__(self, filename, n): 37 | if not srilmcc: 38 | raise SRILMException( 39 | "SRILM is not downloaded and compiled." 
40 | "Please follow the instructions in makesrilmcc") 41 | self.model = srilmcc.LanguageModel(filename, n) 42 | self.n = n 43 | 44 | def scoresentence(self, sentence, unknownwordprob=-12): 45 | score = 0 46 | for ngram in Windower(sentence, self.n, "", ""): 47 | try: 48 | score += self.logscore(ngram) 49 | except KeyError: 50 | score += unknownwordprob 51 | return 10**score 52 | 53 | def __getitem__(self, ngram): 54 | return 10**self.logscore(ngram) 55 | 56 | def __contains__(self, key): 57 | return self.model.exists( key ) 58 | 59 | def logscore(self, ngram): 60 | #Bug work-around 61 | #if "" in ngram or "_" in ngram or "__" in ngram: 62 | # print >> sys.stderr, "WARNING: Invalid word in n-gram! Ignoring", ngram 63 | # return -999.9 64 | 65 | if len(ngram) == self.n: 66 | if all( (self.model.exists(x) for x in ngram) ): 67 | #no phrases, basic trigram, compute directly 68 | return self.model.wordProb(*ngram) 69 | else: 70 | raise KeyError 71 | else: 72 | raise Exception("Not an " + str(self.n) + "-gram") 73 | -------------------------------------------------------------------------------- /pynlpl/algorithms.py: -------------------------------------------------------------------------------- 1 | 2 | ###############################################################9 3 | # PyNLPl - Algorithms 4 | # by Maarten van Gompel 5 | # Centre for Language Studies 6 | # Radboud University Nijmegen 7 | # http://www.github.com/proycon/pynlpl 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | ############################################################### 13 | 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | from __future__ import division 17 | from __future__ import absolute_import 18 | 19 | def sum_to_n(n, size, limit=None): #from http://stackoverflow.com/questions/2065553/python-get-all-numbers-that-add-up-to-a-number 20 | """Produce all lists of `size` positive integers in decreasing order 21 | that add up to `n`.""" 22 | if size == 1: 23 | yield [n] 24 | return 25 | if limit is None: 26 | limit = n 27 | start = (n + size - 1) // size 28 | stop = min(limit, n - size + 1) + 1 29 | for i in range(start, stop): 30 | for tail in sum_to_n(n - i, size - 1, i): 31 | yield [i] + tail 32 | 33 | 34 | def consecutivegaps(n, leftmargin = 0, rightmargin = 0): 35 | """Compute all possible single consecutive gaps in any sequence of the specified length. Returns 36 | (beginindex, length) tuples. Runs in O(n(n+1) / 2) time. 
Argument is the length of the sequence rather than the sequence itself""" 37 | begin = leftmargin 38 | while begin < n: 39 | length = (n - rightmargin) - begin 40 | while length > 0: 41 | yield (begin, length) 42 | length -= 1 43 | begin += 1 44 | 45 | def possiblesplits(n, minsplits=2, maxsplits=0): 46 | """Returns lists of (index,length) tuples, representing all possible splits of a sequence of length n.""" 47 | if not maxsplits: maxsplits = n 48 | for nrsplits in range(minsplits,maxsplits + 1): 49 | for split in sum_to_n(n,nrsplits): 50 | split_with_indices = [] 51 | begin = 0 52 | for length in split: 53 | split_with_indices.append( (begin, length) ) 54 | begin += length 55 | yield split_with_indices 56 | 57 | def bytesize(n): 58 | """Return the required size in bytes to encode the specified integer""" 59 | for i in range(1, 1000): 60 | if n < 2**(8*i): 61 | return i 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /pynlpl/tools/sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ############################################################### 5 | # PyNLPl - Sampler 6 | # by Maarten van Gompel (proycon) 7 | # http://ilk.uvt.nl/~mvgompel 8 | # Induction for Linguistic Knowledge Research Group 9 | # Universiteit van Tilburg 10 | # 11 | # Licensed under GPLv3 12 | # 13 | # This tool can be used to split a file (or multiple interdependent 14 | # files, such as a parallel corpus) into a train, test and development 15 | # set. 16 | # 17 | ############################################################### 18 | 19 | 20 | from __future__ import print_function 21 | from __future__ import unicode_literals 22 | from __future__ import division 23 | from __future__ import absolute_import 24 | 25 | import argparse 26 | import sys 27 | 28 | import random 29 | from pynlpl.evaluation import filesampler 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="Extracts random samples from datasets, supports multiple parallel datasets (such as parallel corpora), provided that corresponding data is on the same line.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 33 | parser.add_argument('-t','--testsetsize', help="Test set size (lines)", type=float, action='store',default=0) 34 | parser.add_argument('-d','--devsetsize', help="Development set size (lines)", type=float, action='store',default=0) 35 | parser.add_argument('-T','--trainsetsize', help="Training set size (lines), leave unassigned (0) to automatically use all of the remaining data", type=float, action='store',default=0) 36 | parser.add_argument('-S','--seed', help="Seed for random number generator", type=int, action='store',default=0) 37 | parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)") 38 | 39 | args = parser.parse_args() 40 | if args.seed: 41 | random.seed(args.seed) 42 | 43 | if args.testsetsize == 0: 44 | print("ERROR: Specify at least a testset size!",file=sys.stderr) 45 | sys.exit(2) 46 | 47 | try: 48 | if not args.files: 49 | print("ERROR: Specify at least one file!",file=sys.stderr) 50 | sys.exit(2) 51 | except: 52 | print("ERROR: Specify at least one file!",file=sys.stderr) 53 | sys.exit(2) 54 | 55 | filesampler(args.files, args.testsetsize, args.devsetsize, args.trainsetsize) 56 | 57 | if __name__ == '__main__': 58 | main() 59 | 
-------------------------------------------------------------------------------- /docs/statistics.rst: -------------------------------------------------------------------------------- 1 | Statistics and Information Theory 2 | ================================== 3 | 4 | This module contains classes and functions for statistics and information theory. It is imported as follows:: 5 | 6 | import pynlpl.statistics 7 | 8 | 9 | Generic functions 10 | ------------------------------------- 11 | 12 | Amongst others, the following generic statistical functions are available: 13 | 14 | * ``mean(list)`` - Computes the mean of a given list of numbers 15 | 16 | * ``median(list)`` - Computes the median of a given list of numbers 17 | 18 | * ``stddev(list)`` - Computes the standard deviation of a given list of numbers 19 | 20 | * ``normalize(list)`` - Normalizes a list of numbers so that the sum is 1.0. 21 | 22 | 23 | Frequency Lists and Distributions 24 | ------------------------------------- 25 | 26 | One of the most basic and widespread tasks in NLP is the creation of a frequency list. Counting is established by simply appending lists of tokens to the frequency list:: 27 | 28 | freqlist = pynlpl.statistics.FrequencyList() 29 | freqlist.append(['to','be','or','not','to','be']) 30 | 31 | Take care not to append strings rather than lists, unless you mean to create a frequency list over characters rather than words. You may want to use the ``pynlpl.textprocessors.crude_tokeniser`` function first:: 32 | 33 | freqlist.append(pynlpl.textprocessors.crude_tokeniser("to be or not to be")) 34 | 35 | The count can also be incremented explicitly for a single item:: 36 | 37 | freqlist.count('shakespeare') 38 | 39 | The FrequencyList offers dictionary-like access. For example, the following statement will be true for the frequency list just created:: 40 | 41 | freqlist['be'] == 2 42 | 43 | Normalised counts (pseudo-probabilities) can be obtained using the ``p()`` method:: 44 | 45 | freqlist.p('be') 46 | 47 | Normalised counts can also be obtained by instantiating a Distribution instance using the frequency list:: 48 | 49 | dist = pynlpl.statistics.Distribution(freqlist) 50 | 51 | This too offers a dictionary-like interface, where values are by definition normalised. The advantage of a Distribution class is that it offers information-theoretic methods such as ``entropy()``, ``maxentropy()``, ``perplexity()`` and ``poslog()``. 52 | 53 | A frequency list can be saved to file using the ``save(filename)`` method, and loaded back from file using the ``load(filename)`` method. The ``output()`` method is a generator yielding strings for each line of output, in ranked order. 54 | 55 | 56 | API Reference 57 | ---------------- 58 | 59 | 60 | ..
automodule:: pynlpl.statistics 61 | :members: 62 | :undoc-members: 63 | 64 | 65 | -------------------------------------------------------------------------------- /pynlpl/tests/datatypes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from __future__ import division 7 | from __future__ import absolute_import 8 | from pynlpl.common import u 9 | 10 | import os 11 | import sys 12 | import unittest 13 | 14 | 15 | from pynlpl.datatypes import PriorityQueue 16 | 17 | values = [3,6,6,1,8,2] 18 | mintomax = sorted(values) 19 | maxtomin = list(reversed(mintomax)) 20 | 21 | 22 | class PriorityQueueTest(unittest.TestCase): 23 | def test_append_minimized(self): 24 | """Minimized PriorityQueue""" 25 | global values 26 | pq = PriorityQueue(values, lambda x: x, True,0,False,False) 27 | result = list(iter(pq)) 28 | self.assertEqual(result, mintomax) 29 | 30 | def test_append_maximized(self): 31 | """Maximized PriorityQueue""" 32 | global values 33 | pq = PriorityQueue(values, lambda x: x, False,0,False,False) 34 | result = list(iter(pq)) 35 | self.assertEqual(result, maxtomin) 36 | 37 | def test_append_maximized_blockworse(self): 38 | """Maximized PriorityQueue (with blockworse)""" 39 | global values 40 | pq = PriorityQueue(values, lambda x: x, False,0,True,False) 41 | result = list(iter(pq)) 42 | self.assertEqual(result, [8,6,6,3]) 43 | 44 | def test_append_maximized_blockworse_blockequal(self): 45 | """Maximized PriorityQueue (with blockworse + blockequal)""" 46 | global values 47 | pq = PriorityQueue(values, lambda x: x, False,0,True,True) 48 | result = list(iter(pq)) 49 | self.assertEqual(result, [8,6,3]) 50 | 51 | def test_append_minimized_blockworse(self): 52 | """Minimized PriorityQueue (with blockworse)""" 53 | global values 54 | pq = PriorityQueue(values, lambda x: x, True,0,True,False) 55 | result = list(iter(pq)) 56 | self.assertEqual(result, [1,3]) 57 | 58 | 59 | def test_append_minimized_fixedlength(self): 60 | """Fixed-length priority queue (min)""" 61 | global values 62 | pq = PriorityQueue(values, lambda x: x, True,4, False,False) 63 | result = list(iter(pq)) 64 | self.assertEqual(result, mintomax[:4]) 65 | 66 | def test_append_maximized_fixedlength(self): 67 | """Fixed-length priority queue (max)""" 68 | global values 69 | pq = PriorityQueue(values, lambda x: x, False,4,False,False) 70 | result = list(iter(pq)) 71 | self.assertEqual(result, maxtomin[:4]) 72 | 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /pynlpl/tools/freqlist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ############################################################### 5 | # PyNLPl - Frequency List Generator 6 | # by Maarten van Gompel (proycon) 7 | # http://ilk.uvt.nl/~mvgompel 8 | # Induction for Linguistic Knowledge Research Group 9 | # Universiteit van Tilburg 10 | # 11 | # Licensed under GPLv3 12 | # 13 | ############################################################### 14 | 15 | 16 | from __future__ import print_function 17 | from __future__ import unicode_literals 18 | from __future__ import division 19 | from __future__ import absolute_import 20 | 21 | import argparse 22 | import sys 23 | import io 24 | 25 | from pynlpl.statistics import 
FrequencyList, Distribution 26 | from pynlpl.textprocessors import Windower, crude_tokenizer 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 30 | parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1) 31 | parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true") 32 | parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8') 33 | parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)") 34 | 35 | 36 | args = parser.parse_args() 37 | 38 | if not args.files: 39 | print("No files specified", file=sys.stderr) 40 | sys.exit(1) 41 | 42 | freqlist = FrequencyList(None, args.caseinsensitive) 43 | for filename in args.files: 44 | f = io.open(filename,'r',encoding=args.encoding) 45 | for line in f: 46 | if args.ngramsize > 1: 47 | freqlist.append(Windower(crude_tokenizer(line),args.ngramsize)) 48 | else: 49 | freqlist.append(crude_tokenizer(line)) 50 | 51 | f.close() 52 | 53 | dist = Distribution(freqlist) 54 | for type, count in freqlist: 55 | if isinstance(type,tuple) or isinstance(type,list): 56 | type = " ".join(type) 57 | s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) 58 | print(s) 59 | 60 | print("Tokens: ", freqlist.tokens(),file=sys.stderr) 61 | print("Types: ", len(freqlist),file=sys.stderr) 62 | print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr) 63 | print("Entropy: ", dist.entropy(),file=sys.stderr) 64 | 65 | if __name__ == '__main__': 66 | main() 67 | 68 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. PyNLPl documentation master file, created by 2 | sphinx-quickstart on Tue Jul 6 22:07:20 2010. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to PyNLPl's documentation! 7 | ================================== 8 | 9 | PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language 10 | Processing. It contains various modules useful for common, and less common, NLP 11 | tasks. PyNLPl can be used for basic tasks such as the extraction of n-grams and 12 | frequency lists, and to build simple language models. There are also more 13 | complex data types and algorithms. Moreover, there are parsers for file formats 14 | common in NLP (e.g. FoLiA/Giza/Moses/ARPA/Timbl/CQL). There are also clients to 15 | interface with various NLP specific servers. PyNLPl most notably features a 16 | very extensive library for working with FoLiA XML (Format for Linguistic 17 | Annotation). 18 | 19 | The library is divided into several packages and modules. It works on Python 20 | 2.7, as well as Python 3.
21 | 22 | The following modules are available: 23 | 24 | - ``pynlpl.datatypes`` - Extra datatypes (priority queues, patterns, tries) 25 | - ``pynlpl.evaluation`` - Evaluation & experiment classes (parameter search, wrapped 26 | progressive sampling, class evaluation (precision/recall/f-score/auc), sampler, confusion matrix, multithreaded experiment pool) 27 | - ``pynlpl.formats.cgn`` - Module for parsing CGN (Corpus Gesproken Nederlands) part-of-speech tags 28 | - ``pynlpl.formats.folia`` - Extensive library for reading and manipulating the 29 | documents in `FoLiA `_ format (Format for Linguistic Annotation). 30 | - ``pynlpl.formats.fql`` - Extensive library for the FoLiA Query Language (FQL), 31 | built on top of ``pynlpl.formats.folia``. FQL is currently documented `here 32 | `__. 33 | - ``pynlpl.formats.cql`` - Parser for the Corpus Query Language (CQL), as also used by 34 | Corpus Workbench and Sketch Engine. Contains a convertor to FQL. 35 | - ``pynlpl.formats.giza`` - Module for reading GIZA++ word alignment data 36 | - ``pynlpl.formats.moses`` - Module for reading Moses phrase-translation tables. 37 | - ``pynlpl.formats.sonar`` - Largely obsolete module for pre-releases of the 38 | SoNaR corpus, use ``pynlpl.formats.folia`` instead. 39 | - ``pynlpl.formats.timbl`` - Module for reading Timbl output (consider using 40 | `python-timbl `_ instead though) 41 | - ``pynlpl.lm.lm`` - Module for simple language model and reader for ARPA 42 | language model data as well (used by SRILM). 43 | - ``pynlpl.search`` - Various search algorithms (Breadth-first, depth-first, 44 | beam-search, hill climbing, A star, various variants of each) 45 | - ``pynlpl.statistics`` - Frequency lists, Levenshtein, common statistics and 46 | information theory functions 47 | - ``pynlpl.textprocessors`` - Simple tokeniser, n-gram extraction 48 | 49 | 50 | Contents: 51 | 52 | .. toctree:: 53 | :maxdepth: 3 54 | :glob: 55 | 56 | * 57 | 58 | Indices and tables 59 | ================== 60 | 61 | * :ref:`genindex` 62 | * :ref:`modindex` 63 | * :ref:`search` 64 | 65 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = $(VIRTUAL_ENV)/bin/sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* _autosummary/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | @echo 36 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 37 | 38 | dirhtml: 39 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 40 | @echo 41 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 42 | 43 | pickle: 44 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 45 | @echo 46 | @echo "Build finished; now you can process the pickle files." 47 | 48 | json: 49 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 50 | @echo 51 | @echo "Build finished; now you can process the JSON files." 52 | 53 | htmlhelp: 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in $(BUILDDIR)/htmlhelp." 58 | 59 | qthelp: 60 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 61 | @echo 62 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 63 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 64 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyNLPl.qhcp" 65 | @echo "To view the help file:" 66 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyNLPl.qhc" 67 | 68 | latex: 69 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 70 | @echo 71 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 72 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 73 | "run these through (pdf)latex." 74 | 75 | changes: 76 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 77 | @echo 78 | @echo "The overview file is in $(BUILDDIR)/changes." 79 | 80 | linkcheck: 81 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 82 | @echo 83 | @echo "Link check complete; look for any errors in the above output " \ 84 | "or in $(BUILDDIR)/linkcheck/output.txt." 85 | 86 | doctest: 87 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 88 | @echo "Testing of doctests in the sources finished, look at the " \ 89 | "results in $(BUILDDIR)/doctest/output.txt." 
90 | -------------------------------------------------------------------------------- /pynlpl/mt/wordalign.py: -------------------------------------------------------------------------------- 1 | from pynlpl.statistics import FrequencyList, Distribution 2 | 3 | 4 | class WordAlignment(object): 5 | 6 | def __init__(self, casesensitive = False): 7 | self.casesensitive = casesensitive 8 | 9 | def train(self, sourcefile, targetfile): 10 | sourcefile = open(sourcefile) 11 | targetfile = open(targetfile) 12 | 13 | self.sourcefreqlist = FrequencyList(None, self.casesensitive) 14 | self.targetfreqlist = FrequencyList(None, self.casesensitive) 15 | 16 | #frequency lists 17 | self.source2target = {} 18 | self.target2source = {} 19 | 20 | for sourceline, targetline in zip(sourcefile, targetfile): 21 | sourcetokens = sourceline.split() 22 | targettokens = targetline.split() 23 | 24 | self.sourcefreqlist.append(sourcetokens) 25 | self.targetfreqlist.append(targettokens) 26 | 27 | for sourcetoken in sourcetokens: 28 | if not sourcetoken in self.source2target: 29 | self.source2target[sourcetoken] = FrequencyList(targettokens,self.casesensitive) 30 | else: 31 | self.source2target[sourcetoken].append(targettokens) 32 | 33 | for targettoken in targettokens: 34 | if not targettoken in self.target2source: 35 | self.target2source[targettoken] = FrequencyList(sourcetokens,self.casesensitive) 36 | else: 37 | self.target2source[targettoken].append(sourcetokens) 38 | 39 | sourcefile.close() 40 | targetfile.close() 41 | 42 | def test(self, sourcefile, targetfile): 43 | sourcefile = open(sourcefile) 44 | targetfile = open(targetfile) 45 | 46 | 47 | #stage 2 48 | for sourceline, targetline in zip(sourcefile, targetfile): 49 | sourcetokens = sourceline.split() 50 | targettokens = targetline.split() 51 | 52 | S2Talignment = [] 53 | T2Salignment = [] 54 | 55 | for sourcetoken in sourcetokens: 56 | #which of the target-tokens is most frequent? 57 | besttoken = None 58 | bestscore = -1 59 | for i, targettoken in enumerate(targettokens): 60 | if targettoken in self.source2target[sourcetoken]: 61 | score = self.source2target[sourcetoken][targettoken] / float(self.targetfreqlist[targettoken]) 62 | if score > bestscore: 63 | bestscore = score #keep the normalised score (not the raw count) so later comparisons are consistent 64 | besttoken = i 65 | S2Talignment.append(besttoken) #TODO: multi-alignment? 66 | 67 | for targettoken in targettokens: 68 | besttoken = None 69 | bestscore = -1 70 | for i, sourcetoken in enumerate(sourcetokens): 71 | if sourcetoken in self.target2source[targettoken]: 72 | score = self.target2source[targettoken][sourcetoken] / float(self.sourcefreqlist[sourcetoken]) 73 | if score > bestscore: 74 | bestscore = score #keep the normalised score here as well 75 | besttoken = i 76 | T2Salignment.append(besttoken) #TODO: multi-alignment?
77 | 78 | yield sourcetokens, targettokens, S2Talignment, T2Salignment 79 | 80 | sourcefile.close() 81 | targetfile.close() 82 | 83 | -------------------------------------------------------------------------------- /pynlpl/tests/statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Test Units for Statistics and Information Theory 6 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | #---------------------------------------------------------------- 13 | from __future__ import print_function 14 | from __future__ import unicode_literals 15 | from __future__ import division 16 | from __future__ import absolute_import 17 | 18 | import sys 19 | import os 20 | import unittest 21 | 22 | from pynlpl.statistics import FrequencyList, HiddenMarkovModel 23 | from pynlpl.textprocessors import Windower 24 | 25 | 26 | sentences = ["This is a sentence .".split(' '),"Moreover , this sentence is a test .".split(' ')] 27 | 28 | class FrequencyListTest(unittest.TestCase): 29 | def test_freqlist_casesens(self): 30 | """Frequency List (case sensitive)""" 31 | global sentences 32 | f= FrequencyList() 33 | for sentence in sentences: 34 | f.append(sentence) 35 | self.assertTrue(( f['sentence'] == 2 and f['this'] == 1 and f['test'] == 1 )) 36 | 37 | def test_freqlist_caseinsens(self): 38 | """Frequency List (case insensitive)""" 39 | global sentences 40 | f= FrequencyList(None, False) 41 | for sentence in sentences: 42 | f.append(sentence) 43 | self.assertTrue(( f['sentence'] == 2 and f['this'] == 2 and f['Test'] == 1 )) 44 | 45 | def test_freqlist_tokencount(self): 46 | """Frequency List (count tokens)""" 47 | global sentences 48 | f= FrequencyList() 49 | for sentence in sentences: 50 | f.append(sentence) 51 | self.assertEqual(f.total,13) 52 | 53 | def test_freqlist_typecount(self): 54 | """Frequency List (count types)""" 55 | global sentences 56 | f= FrequencyList() 57 | for sentence in sentences: 58 | f.append(sentence) 59 | self.assertEqual(len(f),9) 60 | 61 | class BigramFrequencyListTest(unittest.TestCase): 62 | def test_freqlist_casesens(self): 63 | """Bigram Frequency List (case sensitive)""" 64 | global sentences 65 | f= FrequencyList() 66 | for sentence in sentences: 67 | f.append(Windower(sentence,2)) 68 | self.assertTrue(( f[('is','a')] == 2 and f[('This','is')] == 1)) 69 | 70 | def test_freqlist_caseinsens(self): 71 | """Bigram Frequency List (case insensitive)""" 72 | global sentences 73 | f= FrequencyList(None, False) 74 | for sentence in sentences: 75 | f.append(Windower(sentence,2)) 76 | self.assertTrue(( f[('is','a')] == 2 and f[('this','is')] == 1)) 77 | 78 | class HMMTest(unittest.TestCase): 79 | def test_viterbi(self): 80 | """Viterbi decode run on Hidden Markov Model""" 81 | hmm = HiddenMarkovModel('start') 82 | hmm.settransitions('start',{'rainy':0.6,'sunny':0.4}) 83 | hmm.settransitions('rainy',{'rainy':0.7,'sunny':0.3}) 84 | hmm.settransitions('sunny',{'rainy':0.4,'sunny':0.6}) 85 | hmm.setemission('rainy', {'walk': 0.1, 'shop': 0.4, 'clean': 0.5}) 86 | hmm.setemission('sunny', {'walk': 0.6, 'shop': 0.3, 'clean': 0.1}) 87 | observations = ['walk', 'shop', 'clean'] 88 | prob, path = hmm.viterbi(observations) 89 | self.assertEqual( path, ['sunny', 'rainy', 'rainy']) 90 | self.assertEqual( prob, 
0.01344) 91 | 92 | if __name__ == '__main__': 93 | unittest.main() 94 | -------------------------------------------------------------------------------- /pynlpl/tests/formats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | 5 | sys.path.append(sys.path[0] + '/../../') 6 | os.environ['PYTHONPATH'] = sys.path[0] + '/../../' 7 | from pynlpl.formats.timbl import TimblOutput 8 | if sys.version < '3': 9 | from StringIO import StringIO 10 | else: 11 | from io import StringIO 12 | 13 | class TimblTest(unittest.TestCase): 14 | 15 | def test1_simple(self): 16 | """Timbl - simple output""" 17 | s = StringIO("a b ? c\nc d ? e\n") 18 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(TimblOutput(s)): 19 | if i == 0: 20 | self.assertEqual(features,['a','b']) 21 | self.assertEqual(referenceclass,'?') 22 | self.assertEqual(predictedclass,'c') 23 | self.assertEqual(distribution,None) 24 | self.assertEqual(distance,None) 25 | elif i == 1: 26 | self.assertEqual(features,['c','d']) 27 | self.assertEqual(referenceclass,'?') 28 | self.assertEqual(predictedclass,'e') 29 | self.assertEqual(distribution,None) 30 | self.assertEqual(distance,None) 31 | 32 | 33 | def test2_db(self): 34 | """Timbl - Distribution output""" 35 | s = StringIO("a c ? c { c 1.00000, d 1.00000 }\na b ? c { c 1.00000 }\na d ? c { c 1.00000, e 1.00000 }") 36 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(TimblOutput(s)): 37 | if i == 0: 38 | self.assertEqual(features,['a','c']) 39 | self.assertEqual(referenceclass,'?') 40 | self.assertEqual(predictedclass,'c') 41 | self.assertEqual(distribution['c'], 0.5) 42 | self.assertEqual(distribution['d'], 0.5) 43 | self.assertEqual(distance,None) 44 | elif i == 1: 45 | self.assertEqual(features,['a','b']) 46 | self.assertEqual(referenceclass,'?') 47 | self.assertEqual(predictedclass,'c') 48 | self.assertEqual(distribution['c'], 1) 49 | self.assertEqual(distance,None) 50 | elif i == 2: 51 | self.assertEqual(features,['a','d']) 52 | self.assertEqual(referenceclass,'?') 53 | self.assertEqual(predictedclass,'c') 54 | self.assertEqual(distribution['c'], 0.5) 55 | self.assertEqual(distribution['e'], 0.5) 56 | self.assertEqual(distance,None) 57 | 58 | 59 | def test3_dbdi(self): 60 | """Timbl - Distribution + Distance output""" 61 | s = StringIO("a c ? c { c 1.00000, d 1.00000 } 1.0000000000000\na b ? c { c 1.00000 } 0.0000000000000\na d ? 
c { c 1.00000, e 1.00000 } 1.0000000000000") 62 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(TimblOutput(s)): 63 | if i == 0: 64 | self.assertEqual(features,['a','c']) 65 | self.assertEqual(referenceclass,'?') 66 | self.assertEqual(predictedclass,'c') 67 | self.assertEqual(distribution['c'], 0.5) 68 | self.assertEqual(distribution['d'], 0.5) 69 | self.assertEqual(distance,1.0) 70 | elif i == 1: 71 | self.assertEqual(features,['a','b']) 72 | self.assertEqual(referenceclass,'?') 73 | self.assertEqual(predictedclass,'c') 74 | self.assertEqual(distribution['c'], 1) 75 | self.assertEqual(distance,0.0) 76 | elif i == 2: 77 | self.assertEqual(features,['a','d']) 78 | self.assertEqual(referenceclass,'?') 79 | self.assertEqual(predictedclass,'c') 80 | self.assertEqual(distribution['c'], 0.5) 81 | self.assertEqual(distribution['e'], 0.5) 82 | self.assertEqual(distance,1.0) 83 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyNLPl - Python Natural Language Processing Library 2 | ===================================================== 3 | 4 | .. image:: https://travis-ci.org/proycon/pynlpl.svg?branch=master 5 | :target: https://travis-ci.org/proycon/pynlpl 6 | 7 | .. image:: http://readthedocs.org/projects/pynlpl/badge/?version=latest 8 | :target: http://pynlpl.readthedocs.io/en/latest/?badge=latest 9 | :alt: Documentation Status 10 | 11 | .. image:: http://applejack.science.ru.nl/lamabadge.php/pynlpl 12 | :target: http://applejack.science.ru.nl/languagemachines/ 13 | 14 | .. image:: https://zenodo.org/badge/759484.svg 15 | :target: https://zenodo.org/badge/latestdoi/759484 16 | 17 | PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language 18 | Processing. It contains various modules useful for common, and less common, NLP 19 | tasks. PyNLPl can be used for basic tasks such as the extraction of n-grams and 20 | frequency lists, and to build simple language models. There are also more 21 | complex data types and algorithms. Moreover, there are parsers for file formats 22 | common in NLP (e.g. FoLiA/Giza/Moses/ARPA/Timbl/CQL). There are also clients to 23 | interface with various NLP-specific servers. PyNLPl most notably features a 24 | very extensive library for working with FoLiA XML (Format for Linguistic 25 | Annotation). 26 | 27 | The library is divided into several packages and modules. It works on Python 28 | 2.7, as well as Python 3. 29 | 30 | The following modules are available: 31 | 32 | - ``pynlpl.datatypes`` - Extra datatypes (priority queues, patterns, tries) 33 | - ``pynlpl.evaluation`` - Evaluation & experiment classes (parameter search, wrapped 34 | progressive sampling, class evaluation (precision/recall/f-score/auc), sampler, confusion matrix, multithreaded experiment pool) 35 | - ``pynlpl.formats.cgn`` - Module for parsing CGN (Corpus Gesproken Nederlands) part-of-speech tags 36 | - ``pynlpl.formats.folia`` - Extensive library for reading and manipulating the 37 | documents in `FoLiA `_ format (Format for Linguistic Annotation). 38 | - ``pynlpl.formats.fql`` - Extensive library for the FoLiA Query Language (FQL), 39 | built on top of ``pynlpl.formats.folia``. FQL is currently documented `here 40 | `__. 41 | - ``pynlpl.formats.cql`` - Parser for the Corpus Query Language (CQL), as also used by 42 | Corpus Workbench and Sketch Engine. Contains a convertor to FQL.
43 | - ``pynlpl.formats.giza`` - Module for reading GIZA++ word alignment data 44 | - ``pynlpl.formats.moses`` - Module for reading Moses phrase-translation tables. 45 | - ``pynlpl.formats.sonar`` - Largely obsolete module for pre-releases of the 46 | SoNaR corpus, use ``pynlpl.formats.folia`` instead. 47 | - ``pynlpl.formats.timbl`` - Module for reading Timbl output (consider using 48 | `python-timbl `_ instead though) 49 | - ``pynlpl.lm.lm`` - Module for simple language models and a reader for ARPA 50 | language model data (as used by SRILM). 51 | - ``pynlpl.search`` - Various search algorithms (Breadth-first, depth-first, 52 | beam-search, hill climbing, A star, various variants of each) 53 | - ``pynlpl.statistics`` - Frequency lists, Levenshtein, common statistics and 54 | information theory functions 55 | - ``pynlpl.textprocessors`` - Simple tokeniser, n-gram extraction 56 | 57 | Installation 58 | -------------------- 59 | 60 | Download and install the latest stable version directly from the Python Package 61 | Index with ``pip install pynlpl`` (or ``pip3`` for Python 3 on most 62 | systems). For global installations prepend ``sudo``. 63 | 64 | Alternatively, clone this repository and run ``python setup.py install`` (or 65 | ``python3 setup.py install`` for Python 3 on most systems). Prepend ``sudo`` for 66 | global installations. 67 | 68 | This software may also be found in certain Linux distributions, such as 69 | the latest versions of Debian/Ubuntu, as ``python-pynlpl`` and ``python3-pynlpl``. 70 | PyNLPl is also included in our `LaMachine `_ distribution. 71 | 72 | Documentation 73 | -------------------- 74 | 75 | API Documentation can be found `here `__. 76 | 77 | 78 | -------------------------------------------------------------------------------- /pynlpl/tools/sonar2folia.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Conversion script for converting SoNaR/D-Coi from D-Coi XML to FoLiA XML 6 | # by Maarten van Gompel, ILK, Tilburg University 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | #---------------------------------------------------------------- 13 | 14 | # Usage: sonar2folia.py sonar-input-dir output-dir nr-of-threads 15 | 16 | from __future__ import print_function, unicode_literals, division, absolute_import 17 | 18 | import sys 19 | import os 20 | 21 | if __name__ == "__main__": 22 | sys.path.append(sys.path[0] + '/../..') 23 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..'
24 | 25 | import pynlpl.formats.folia as folia 26 | import pynlpl.formats.sonar as sonar 27 | from multiprocessing import Pool, Process 28 | import datetime 29 | import codecs 30 | 31 | 32 | def process(data): 33 | i, filename = data 34 | category = os.path.basename(os.path.dirname(filename)) 35 | progress = round((i+1) / float(len(index)) * 100,1) 36 | print("#" + str(i+1) + " " + filename + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + str(progress) + '%',file=sys.stderr) 37 | try: 38 | doc = folia.Document(file=filename) 39 | except Exception as e: 40 | print("ERROR loading " + filename + ":" + str(e),file=sys.stderr) 41 | return False 42 | filename = filename.replace(sonardir,'') 43 | if filename[0] == '/': 44 | filename = filename[1:] 45 | if filename[-4:] == '.pos': 46 | filename = filename[:-4] 47 | if filename[-4:] == '.tok': 48 | filename = filename[:-4] 49 | if filename[-4:] == '.ilk': 50 | filename = filename[:-4] 51 | #Load document prior to tokenisation 52 | try: 53 | pretokdoc = folia.Document(file=sonardir + '/' + filename) 54 | except: 55 | print("WARNING unable to load pretokdoc " + filename,file=sys.stderr) 56 | pretokdoc = None 57 | if pretokdoc: 58 | for p2 in pretokdoc.paragraphs(): 59 | try: 60 | p = doc[p2.id] 61 | except: 62 | print("ERROR: Paragraph " + p2.id + " not found. Tokenised and pre-tokenised versions out of sync?",file=sys.stderr) 63 | continue 64 | if p2.text: 65 | p.text = p2.text 66 | try: 67 | os.mkdir(foliadir + os.path.dirname(filename)) 68 | except: 69 | pass 70 | 71 | try: 72 | doc.save(foliadir + filename) 73 | except: 74 | print("ERROR saving " + foliadir + filename,file=sys.stderr) 75 | 76 | try: 77 | f = codecs.open(foliadir + filename.replace('.xml','.tok.txt'),'w','utf-8') 78 | f.write(unicode(doc)) 79 | f.close() 80 | except: 81 | print("ERROR saving " + foliadir + filename.replace('.xml','.tok.txt'),file=sys.stderr) 82 | 83 | 84 | sys.stdout.flush() 85 | sys.stderr.flush() 86 | return True 87 | 88 | def outputexists(filename, sonardir, foliadir): 89 | filename = filename.replace(sonardir,'') 90 | if filename[0] == '/': 91 | filename = filename[1:] 92 | if filename[-4:] == '.pos': 93 | filename = filename[:-4] 94 | if filename[-4:] == '.tok': 95 | filename = filename[:-4] 96 | if filename[-4:] == '.ilk': 97 | filename = filename[:-4] 98 | return os.path.exists(foliadir + filename) 99 | 100 | 101 | if __name__ == '__main__': 102 | sonardir = sys.argv[1] 103 | foliadir = sys.argv[2] 104 | threads = int(sys.argv[3]) 105 | if foliadir[-1] != '/': foliadir += '/' 106 | try: 107 | os.mkdir(foliadir[:-1]) 108 | except: 109 | pass 110 | 111 | print("Building index...") 112 | index = list(enumerate([ x for x in sonar.CorpusFiles(sonardir,'pos', "", lambda x: True, True) if not outputexists(x, sonardir, foliadir) ])) 113 | 114 | print("Processing...") 115 | p = Pool(threads) 116 | p.map(process, index ) 117 | 118 | -------------------------------------------------------------------------------- /pynlpl/formats/cgn.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - Corpus Gesproken Nederlands 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # Classes for reading CGN (still to be added). 
Most notably, contains a function for decoding 13 | # PoS features like "N(soort,ev,basis,onz,stan)" into a data structure. 14 | # 15 | ############################################################### 16 | 17 | from __future__ import print_function 18 | from __future__ import unicode_literals 19 | from __future__ import division 20 | from __future__ import absolute_import 21 | import sys 22 | if sys.version < '3': 23 | from codecs import getwriter 24 | stderr = getwriter('utf-8')(sys.stderr) 25 | stdout = getwriter('utf-8')(sys.stdout) 26 | else: 27 | stderr = sys.stderr 28 | stdout = sys.stdout 29 | 30 | from pynlpl.formats import folia 31 | from pynlpl.common import Enum 32 | 33 | 34 | class InvalidTagException(Exception): 35 | pass 36 | 37 | class InvalidFeatureException(Exception): 38 | pass 39 | 40 | subsets = { 41 | 'ntype': ['soort','eigen'], 42 | 'getal': ['ev','mv','getal',], 43 | 'genus': ['zijd','onz','masc','fem','genus'], 44 | 'naamval': ['stan','gen','dat','nomin','obl','bijz'], 45 | 'spectype': ['afgebr','afk','deeleigen','symb','vreemd','enof','meta','achter','comment','onverst'], 46 | 'conjtype': ['neven','onder'], 47 | 'vztype': ['init','versm','fin'], 48 | 'npagr': ['agr','evon','rest','evz','mv','agr3','evmo','rest3','evf'], 49 | 'lwtype': ['bep','onbep'], 50 | 'vwtype': ['pers','pr','refl','recip','bez','vb','vrag','betr','excl','aanw','onbep'], 51 | 'pdtype': ['adv-pron','pron','det','grad'], 52 | 'status': ['vol','red','nadr'], 53 | 'persoon': ['1','2','2v','2b','3','3p','3m','3v','3o','persoon'], 54 | 'positie': ['prenom','postnom', 'nom','vrij'], 55 | 'buiging': ['zonder','met-e','met-s'], 56 | 'getal-n' : ['zonder-v','mv-n','zonder-n'], 57 | 'graad' : ['basis','comp','sup','dim'], 58 | 'wvorm': ['pv','inf','vd','od'], 59 | 'pvtijd': ['tgw','verl','conj'], 60 | 'pvagr': ['ev','mv','met-t'], 61 | 'numtype': ['hoofd','rang'], 62 | 'dial': ['dial'], 63 | } 64 | constraints = { 65 | 'getal':['N','VNW'], 66 | 'npagr':['VNW','LID'], 67 | 'pvagr':['WW'], 68 | } 69 | 70 | def parse_cgn_postag(rawtag, raisefeatureexceptions = False): 71 | global subsets, constraints 72 | """decodes PoS features like "N(soort,ev,basis,onz,stan)" into a PosAnnotation data structure 73 | based on CGN tag overview compiled by Matje van de Camp""" 74 | 75 | 76 | begin = rawtag.find('(') 77 | if rawtag[-1] == ')' and begin > 0: 78 | tag = folia.PosAnnotation(None, cls=rawtag,set='http://ilk.uvt.nl/folia/sets/cgn') 79 | 80 | 81 | head = rawtag[0:begin] 82 | tag.append( folia.Feature, subset='head',cls=head) 83 | 84 | rawfeatures = rawtag[begin+1:-1].split(',') 85 | for rawfeature in rawfeatures: 86 | if rawfeature: 87 | found = False 88 | for subset, classes in subsets.items(): 89 | if rawfeature in classes: 90 | if subset in constraints: 91 | if not head in constraints[subset]: 92 | continue #constraint not met! 
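# At this point the raw feature value has been matched to a subset whose
# head-tag constraint (if any) is satisfied, so it is recorded as a
# folia.Feature on the tag and the search for this feature value stops.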
93 | found = True 94 | tag.append( folia.Feature, subset=subset,cls=rawfeature) 95 | break 96 | if not found: 97 | print("\t\tUnknown feature value: " + rawfeature + " in " + rawtag, file=stderr) 98 | if raisefeatureexceptions: 99 | raise InvalidFeatureException("Unknown feature value: " + rawfeature + " in " + rawtag) 100 | else: 101 | continue 102 | return tag 103 | else: 104 | raise InvalidTagException("Not a valid CGN tag") 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /pynlpl/clients/freeling.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - FreeLing Library 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Radboud University Nijmegen 6 | # 7 | # Licensed under GPLv3 8 | # 9 | # This is a Python library for on-the-fly communication with 10 | # a FreeLing server. Allowing on-the-fly lemmatisation and 11 | # PoS-tagging. It is recommended to pass your data on a 12 | # sentence-by-sentence basis to FreeLingClient.process() 13 | # 14 | # Make sure to start Freeling (analyzer) with the --server 15 | # and --flush flags !!!!! 16 | # 17 | ############################################################### 18 | 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | from __future__ import division 22 | from __future__ import absolute_import 23 | from pynlpl.common import u 24 | 25 | import socket 26 | import sys 27 | 28 | class FreeLingClient(object): 29 | def __init__(self, host, port, encoding='utf-8', timeout=120.0): 30 | """Initialise the client, set channel to the path and filename where the server's .in and .out pipes are (without extension)""" 31 | self.encoding = encoding 32 | self.BUFSIZE = 10240 33 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) 34 | self.socket.settimeout(timeout) 35 | self.socket.connect( (host,int(port)) ) 36 | self.encoding = encoding 37 | self.socket.sendall('RESET_STATS\0') 38 | r = self.socket.recv(self.BUFSIZE) 39 | if not r.strip('\0') == 'FL-SERVER-READY': 40 | raise Exception("Server not ready") 41 | 42 | 43 | def process(self, sourcewords, debug=False): 44 | """Process a list of words, passing it to the server and realigning the output with the original words""" 45 | 46 | if isinstance( sourcewords, list ) or isinstance( sourcewords, tuple ): 47 | sourcewords_s = " ".join(sourcewords) 48 | else: 49 | sourcewords_s = sourcewords 50 | sourcewords = sourcewords.split(' ') 51 | 52 | self.socket.sendall(sourcewords_s.encode(self.encoding) +'\n\0') 53 | if debug: print("Sent:",sourcewords_s.encode(self.encoding),file=sys.stderr) 54 | 55 | results = [] 56 | done = False 57 | while not done: 58 | data = b"" 59 | while not data: 60 | buffer = self.socket.recv(self.BUFSIZE) 61 | if debug: print("Buffer: ["+repr(buffer)+"]",file=sys.stderr) 62 | if buffer[-1] == '\0': 63 | data += buffer[:-1] 64 | done = True 65 | break 66 | else: 67 | data += buffer 68 | 69 | 70 | data = u(data,self.encoding) 71 | if debug: print("Received:",data,file=sys.stderr) 72 | 73 | for i, line in enumerate(data.strip(' \t\0\r\n').split('\n')): 74 | if not line.strip(): 75 | done = True 76 | break 77 | else: 78 | cols = line.split(" ") 79 | subwords = cols[0].lower().split("_") 80 | if len(cols) > 2: #this seems a bit odd? 
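# FreeLing may return a multiword expression as a single token with its parts
# joined by underscores; it is split into its component words here so that each
# part can later be realigned with the original input tokens in the alignment
# step below.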
81 | for word in subwords: #split multiword expressions 82 | results.append( (word, cols[1], cols[2], i, len(subwords) > 1 ) ) #word, lemma, pos, index, multiword? 83 | 84 | sourcewords = [ w.lower() for w in sourcewords ] 85 | 86 | alignment = [] 87 | for i, sourceword in enumerate(sourcewords): 88 | found = False 89 | best = 0 90 | distance = 999999 91 | for j, (targetword, lemma, pos, index, multiword) in enumerate(results): 92 | if sourceword == targetword and abs(i-j) < distance: 93 | found = True 94 | best = j 95 | distance = abs(i-j) 96 | 97 | if found: 98 | alignment.append(results[best]) 99 | else: 100 | alignment.append((None,None,None,None,False)) #no alignment found 101 | return alignment 102 | 103 | -------------------------------------------------------------------------------- /pynlpl/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ###############################################################9 5 | # PyNLPl - Common functions 6 | # by Maarten van Gompel 7 | # Centre for Language Studies 8 | # Radboud University Nijmegen 9 | # http://www.github.com/proycon/pynlpl 10 | # proycon AT anaproy DOT nl 11 | # 12 | # Licensed under GPLv3 13 | # 14 | # This contains very common functions and language extensions 15 | # 16 | ############################################################### 17 | 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | from __future__ import division 21 | from __future__ import absolute_import 22 | 23 | import datetime 24 | from sys import stderr, version 25 | 26 | ## From http://code.activestate.com/recipes/413486/ (r7) 27 | def Enum(*names): 28 | ##assert names, "Empty enums are not supported" # <- Don't like empty enums? Uncomment! 29 | 30 | class EnumClass(object): 31 | __slots__ = names 32 | def __iter__(self): return iter(constants) 33 | def __len__(self): return len(constants) 34 | def __getitem__(self, i): return constants[i] 35 | def __repr__(self): return 'Enum' + str(names) 36 | def __str__(self): return 'enum ' + str(constants) 37 | 38 | class EnumValue(object): 39 | __slots__ = ('__value') 40 | def __init__(self, value): self.__value = value 41 | Value = property(lambda self: self.__value) 42 | EnumType = property(lambda self: EnumType) 43 | def __hash__(self): return hash(self.__value) 44 | def __cmp__(self, other): 45 | # C fans might want to remove the following assertion 46 | # to make all enums comparable by ordinal value {;)) 47 | assert self.EnumType is other.EnumType, "Only values from the same enum are comparable" 48 | return cmp(self.__value, other.__value) 49 | def __invert__(self): return constants[maximum - self.__value] 50 | def __bool__(self): return bool(self.__value) 51 | def __nonzero__(self): return bool(self.__value) #Python 2.x 52 | def __repr__(self): return str(names[self.__value]) 53 | 54 | maximum = len(names) - 1 55 | constants = [None] * len(names) 56 | for i, each in enumerate(names): 57 | val = EnumValue(i) 58 | setattr(EnumClass, each, val) 59 | constants[i] = val 60 | constants = tuple(constants) 61 | EnumType = EnumClass() 62 | return EnumType 63 | 64 | 65 | def u(s, encoding = 'utf-8', errors='strict'): 66 | #ensure s is properly unicode.. 
wrapper for python 2.6/2.7, 67 | if version < '3': 68 | #ensure the object is unicode 69 | if isinstance(s, unicode): 70 | return s 71 | else: 72 | return unicode(s, encoding,errors=errors) 73 | else: 74 | #will work on byte arrays 75 | if isinstance(s, str): 76 | return s 77 | else: 78 | return str(s,encoding,errors=errors) 79 | 80 | def b(s): 81 | #ensure s is bytestring 82 | if version < '3': 83 | #ensure the object is unicode 84 | if isinstance(s, str): 85 | return s 86 | else: 87 | return s.encode('utf-8') 88 | else: 89 | #will work on byte arrays 90 | if isinstance(s, bytes): 91 | return s 92 | else: 93 | return s.encode('utf-8') 94 | 95 | def isstring(s): #Is this a proper string? 96 | return isinstance(s, str) or (version < '3' and isinstance(s, unicode)) 97 | 98 | def log(msg, **kwargs): 99 | """Generic log method. Will prepend timestamp. 100 | 101 | Keyword arguments: 102 | system - Name of the system/module 103 | indent - Integer denoting the desired level of indentation 104 | streams - List of streams to output to 105 | stream - Stream to output to (singleton version of streams) 106 | """ 107 | if 'debug' in kwargs: 108 | if 'currentdebug' in kwargs: 109 | if kwargs['currentdebug'] < kwargs['debug']: 110 | return False 111 | else: 112 | return False #no currentdebug passed, assuming no debug mode and thus skipping message 113 | 114 | s = "[" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "] " 115 | if 'system' in kwargs: 116 | s += "[" + system + "] " 117 | 118 | 119 | if 'indent' in kwargs: 120 | s += ("\t" * int(kwargs['indent'])) 121 | 122 | s += u(msg) 123 | 124 | if s[-1] != '\n': 125 | s += '\n' 126 | 127 | if 'streams' in kwargs: 128 | streams = kwargs['streams'] 129 | elif 'stream' in kwargs: 130 | streams = [kwargs['stream']] 131 | else: 132 | streams = [stderr] 133 | 134 | for stream in streams: 135 | stream.write(s) 136 | return s 137 | -------------------------------------------------------------------------------- /pynlpl/formats/timbl.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - Timbl Classifier Output Library 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Induction for Linguistic Knowledge Research Group 6 | # Universiteit van Tilburg 7 | # 8 | # Derived from code by Sander Canisius 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # This library offers a TimblOutput class for reading Timbl 13 | # classifier output. 
It supports full distributions (+v+db) and comment (#) 14 | # 15 | ############################################################### 16 | 17 | 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | from __future__ import division 21 | from __future__ import absolute_import 22 | import sys 23 | if sys.version < '3': 24 | from codecs import getwriter 25 | stderr = getwriter('utf-8')(sys.stderr) 26 | stdout = getwriter('utf-8')(sys.stdout) 27 | else: 28 | stderr = sys.stderr 29 | stdout = sys.stdout 30 | 31 | from pynlpl.statistics import Distribution 32 | 33 | 34 | class TimblOutput(object): 35 | """A class for reading Timbl classifier output, supports the +v+db option and ignores comments starting with #""" 36 | 37 | def __init__(self, stream, delimiter = ' ', ignorecolumns = [], ignorevalues = []): 38 | self.stream = stream 39 | self.delimiter = delimiter 40 | self.ignorecolumns = ignorecolumns #numbers, ignore the specified FEATURE columns: first column is 1 41 | self.ignorevalues = ignorevalues #Ignore columns with the following values 42 | 43 | def __iter__(self): 44 | # Note: distance parsing (+v+di) works only if distributions (+v+db) are also enabled! 45 | for line in self.stream: 46 | endfvec = None 47 | line = line.strip() 48 | if line and line[0] != '#': #ignore empty lines and comments 49 | segments = [ x for i, x in enumerate(line.split(self.delimiter)) if x not in self.ignorevalues and i+1 not in self.ignorecolumns ] 50 | 51 | #segments = [ x for x in line.split() if x != "^" and not (len(x) == 3 and x[0:2] == "n=") ] #obtain segments, and filter null fields and "n=?" feature (in fixed-feature configuration) 52 | 53 | 54 | if not endfvec: 55 | try: 56 | # Modified by Ruben. There are some cases where one of the features is a {, and then 57 | # the module is not able to obtain the distribution of scores and senses 58 | # We have to look for the last { in the vector, and due to there is no rindex method 59 | # we obtain the reverse and then apply index. 60 | aux=list(reversed(segments)).index("{") 61 | endfvec=len(segments)-aux-1 62 | #endfvec = segments.index("{") 63 | except ValueError: 64 | endfvec = None 65 | 66 | if endfvec and endfvec > 2: # only for +v+db 67 | try: 68 | enddistr = segments.index('}',endfvec) 69 | except ValueError: 70 | raise 71 | distribution = self.parseDistribution(segments, endfvec, enddistr) 72 | if len(segments) > enddistr + 1: 73 | distance = float(segments[-1]) 74 | else: 75 | distance = None 76 | else: 77 | endfvec = len(segments) 78 | distribution = None 79 | distance = None 80 | 81 | #features, referenceclass, predictedclass, distribution, distance 82 | yield segments[:endfvec - 2], segments[endfvec - 2], segments[endfvec - 1], distribution, distance 83 | 84 | 85 | def parseDistribution(self, instance, start,end= None): 86 | dist = {} 87 | i = start + 1 88 | 89 | if not end: 90 | end = len(instance) - 1 91 | 92 | while i < end: #instance[i] != "}": 93 | label = instance[i] 94 | try: 95 | score = float(instance[i+1].rstrip(",")) 96 | dist[label] = score 97 | except: 98 | print("ERROR: pynlpl.input.timbl.TimblOutput -- Could not fetch score for class '" + label + "', expected float, but found '"+instance[i+1].rstrip(",")+"'. Instance= " + " ".join(instance)+ ".. 
Attempting to compensate...",file=stderr) 99 | i = i - 1 100 | i += 2 101 | 102 | 103 | if not dist: 104 | print("ERROR: pynlpl.input.timbl.TimblOutput -- Did not find class distribution for ", instance,file=stderr) 105 | 106 | return Distribution(dist) 107 | -------------------------------------------------------------------------------- /pynlpl/fsa.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - Finite State Automata 3 | # by Maarten van Gompel 4 | # Centre for Language Studies 5 | # Radboud University Nijmegen 6 | # http://proycon.github.com/folia 7 | # http://www.github.com/proycon/pynlpl 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Partially based/inspired on code by Xiayun Sun (https://github.com/xysun/regex) 11 | # 12 | # Licensed under GPLv3 13 | # 14 | #---------------------------------------------------------------- 15 | from __future__ import print_function, unicode_literals, division, absolute_import 16 | import sys 17 | 18 | 19 | class State(object): 20 | def __init__(self, **kwargs): 21 | if 'epsilon' in kwargs: 22 | self.epsilon = kwargs['epsilon'] # epsilon-closure (lis of states) 23 | else: 24 | self.epsilon = [] # epsilon-closure 25 | if 'transitions' in kwargs: 26 | self.transitions = kwargs['transitions'] 27 | else: 28 | self.transitions = [] #(matchitem, matchfunction(value), state) 29 | if 'final' in kwargs: 30 | self.final = bool(kwargs['final']) # ending state 31 | else: 32 | self.final = False 33 | self.transitioned = None #will be a tuple (state, matchitem) indicating how this state was reached 34 | 35 | 36 | 37 | class NFA(object): 38 | """Non-deterministic finite state automaton. Can be used to model DFAs as well if your state transitions are not ambiguous and epsilon is empty.""" 39 | 40 | def __init__(self, initialstate): 41 | self.initialstate = initialstate 42 | 43 | def run(self, sequence, mustmatchall=False,debug=False): 44 | def add(state, states): 45 | """add state and recursively add epsilon transitions""" 46 | assert isinstance(state, State) 47 | if state in states: 48 | return 49 | states.add(state) 50 | for eps in state.epsilon: #recurse into epsilon transitions 51 | add(eps, states) 52 | 53 | current_states = set() 54 | add(self.initialstate, current_states) 55 | if debug: print("Starting run, current states: ", repr(current_states),file=sys.stderr) 56 | 57 | for offset, value in enumerate(sequence): 58 | if not current_states: break 59 | if debug: print("Value: ", repr(value),file=sys.stderr) 60 | next_states = set() 61 | for state in current_states: 62 | for matchitem, matchfunction, trans_state in state.transitions: 63 | if matchfunction(value): 64 | trans_state.transitioned = (state, matchitem) 65 | add(trans_state, next_states) 66 | 67 | current_states = next_states 68 | if debug: print("Current states: ", repr(current_states),file=sys.stderr) 69 | if not mustmatchall: 70 | for s in current_states: 71 | if s.final: 72 | if debug: print("Final state reached",file=sys.stderr) 73 | yield offset+1 74 | 75 | if mustmatchall: 76 | for s in current_states: 77 | if s.final: 78 | if debug: print("Final state reached",file=sys.stderr) 79 | yield offset+1 80 | 81 | 82 | def match(self, sequence): 83 | try: 84 | return next(self.run(sequence,True)) == len(sequence) 85 | except StopIteration: 86 | return False 87 | 88 | def find(self, sequence, debug=False): 89 | l = len(sequence) 90 | for i in range(0,l): 91 | for length in 
self.run(sequence[i:], False, debug): 92 | yield sequence[i:i+length] 93 | 94 | def __iter__(self): 95 | return iter(self._states(self.initialstate)) 96 | 97 | def _states(self, state, processedstates=[]): #pylint: disable=dangerous-default-value 98 | """Iterate over all states in no particular order""" 99 | processedstates.append(state) 100 | 101 | for nextstate in state.epsilon: 102 | if not nextstate in processedstates: 103 | self._states(nextstate, processedstates) 104 | 105 | for _, nextstate in state.transitions: 106 | if not nextstate in processedstates: 107 | self._states(nextstate, processedstates) 108 | 109 | return processedstates 110 | 111 | def __repr__(self): 112 | out = [] 113 | for state in self: 114 | staterep = repr(state) 115 | if state is self.initialstate: 116 | staterep += " (INITIAL)" 117 | for nextstate in state.epsilon: 118 | nextstaterep = repr(nextstate) 119 | if nextstate.final: 120 | nextstaterep += " (FINAL)" 121 | out.append( staterep + " -e-> " + nextstaterep ) 122 | for item, _, nextstate in state.transitions: 123 | nextstaterep = repr(nextstate) 124 | if nextstate.final: 125 | nextstaterep += " (FINAL)" 126 | out.append( staterep + " -(" + repr(item) + ")-> " + nextstaterep ) 127 | 128 | return "\n".join(out) 129 | -------------------------------------------------------------------------------- /pynlpl/tests/cql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Test Units for CQL using Finite State Automata 6 | # by Maarten van Gompel, Radboud University Nijmegen 7 | # proycon AT anaproy DOT nl 8 | # 9 | # Licensed under GPLv3 10 | #---------------------------------------------------------------- 11 | 12 | 13 | from __future__ import print_function 14 | from __future__ import unicode_literals 15 | from __future__ import division 16 | from __future__ import absolute_import 17 | import sys 18 | if sys.version < '3': 19 | from codecs import getwriter 20 | stderr = getwriter('utf-8')(sys.stderr) 21 | stdout = getwriter('utf-8')(sys.stdout) 22 | else: 23 | stderr = sys.stderr 24 | stdout = sys.stdout 25 | 26 | import sys 27 | import unittest 28 | from pynlpl.formats import cql 29 | 30 | tokens = [ 31 | { 32 | 'word': 'This', 33 | 'lemma': 'this', 34 | 'pos': 'det', 35 | }, 36 | { 37 | 'word': 'is', 38 | 'lemma': 'be', 39 | 'pos': 'v', 40 | }, 41 | { 42 | 'word': 'a', 43 | 'lemma': 'a', 44 | 'pos': 'det', 45 | }, 46 | { 47 | 'word': 'first', 48 | 'lemma': 'first', 49 | 'pos': 'a', 50 | }, 51 | { 52 | 'word': 'test', 53 | 'lemma': 'test', 54 | 'pos': 'n', 55 | }, 56 | { 57 | 'word': 'of', 58 | 'lemma': 'dit', 59 | 'pos': 'prep', 60 | }, 61 | { 62 | 'word': 'the', 63 | 'lemma': 'the', 64 | 'pos': 'det', 65 | }, 66 | { 67 | 'word': 'new', 68 | 'lemma': 'new', 69 | 'pos': 'a', 70 | }, 71 | { 72 | 'word': 'module', 73 | 'lemma': 'module', 74 | 'pos': 'n', 75 | }, 76 | { 77 | 'word': '.', 78 | 'lemma': '.', 79 | 'pos': 'punc', 80 | }, 81 | ] 82 | 83 | 84 | class Test1(unittest.TestCase): 85 | 86 | def test1(self): 87 | q = cql.Query("\"the\"") 88 | result = q(tokens) 89 | self.assertEqual(len(result),1) #one result 90 | self.assertEqual(len(result[0]),1) #result 1 consists of one word 91 | self.assertEqual(result[0][0]['word'],"the") 92 | 93 | def test2(self): 94 | q = cql.Query("[ pos = \"det\" ]") 95 | result = q(tokens) 96 | self.assertEqual(len(result),3) 97 | 
self.assertEqual(result[0][0]['word'],"This") 98 | self.assertEqual(result[1][0]['word'],"a") 99 | self.assertEqual(result[2][0]['word'],"the") 100 | 101 | def test3(self): 102 | q = cql.Query("[ pos = \"det\" ] [ pos = \"a\" ] [ pos = \"n\" ]") 103 | result = q(tokens) 104 | self.assertEqual(len(result),2) 105 | self.assertEqual(result[0][0]['word'],"a") 106 | self.assertEqual(result[0][1]['word'],"first") 107 | self.assertEqual(result[0][2]['word'],"test") 108 | self.assertEqual(result[1][0]['word'],"the") 109 | self.assertEqual(result[1][1]['word'],"new") 110 | self.assertEqual(result[1][2]['word'],"module") 111 | 112 | def test4(self): 113 | q = cql.Query("[ pos = \"det\" ] [ pos = \"a\" ]? [ pos = \"n\" ]") 114 | result = q(tokens) 115 | self.assertEqual(len(result),2) 116 | self.assertEqual(result[0][0]['word'],"a") 117 | self.assertEqual(result[0][1]['word'],"first") 118 | self.assertEqual(result[0][2]['word'],"test") 119 | self.assertEqual(result[1][0]['word'],"the") 120 | self.assertEqual(result[1][1]['word'],"new") 121 | self.assertEqual(result[1][2]['word'],"module") 122 | 123 | def test5(self): 124 | q = cql.Query("[ pos = \"det\" ] []? [ pos = \"n\" ]") 125 | result = q(tokens) 126 | self.assertEqual(len(result),2) 127 | self.assertEqual(result[0][0]['word'],"a") 128 | self.assertEqual(result[0][1]['word'],"first") 129 | self.assertEqual(result[0][2]['word'],"test") 130 | self.assertEqual(result[1][0]['word'],"the") 131 | self.assertEqual(result[1][1]['word'],"new") 132 | self.assertEqual(result[1][2]['word'],"module") 133 | 134 | def test6(self): 135 | q = cql.Query("[ pos = \"det\" ] []+ [ pos = \"n\" ]") 136 | result = q(tokens) 137 | self.assertEqual(len(result),2) 138 | self.assertEqual(result[0][0]['word'],"a") 139 | self.assertEqual(result[0][1]['word'],"first") 140 | self.assertEqual(result[0][2]['word'],"test") 141 | self.assertEqual(result[1][0]['word'],"the") 142 | self.assertEqual(result[1][1]['word'],"new") 143 | self.assertEqual(result[1][2]['word'],"module") 144 | 145 | def test7(self): 146 | q = cql.Query("[ pos = \"det\" ] []* [ pos = \"n\" ]") 147 | result = q(tokens) 148 | self.assertEqual(len(result),2) 149 | self.assertEqual(result[0][0]['word'],"a") 150 | self.assertEqual(result[0][1]['word'],"first") 151 | self.assertEqual(result[0][2]['word'],"test") 152 | self.assertEqual(result[1][0]['word'],"the") 153 | self.assertEqual(result[1][1]['word'],"new") 154 | self.assertEqual(result[1][2]['word'],"module") 155 | 156 | if __name__ == '__main__': 157 | unittest.main() 158 | -------------------------------------------------------------------------------- /pynlpl/formats/taggerdata.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - Read tagger data 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # 13 | ############################################################### 14 | 15 | from __future__ import print_function 16 | from __future__ import unicode_literals 17 | from __future__ import division 18 | from __future__ import absolute_import 19 | 20 | import io 21 | 22 | class Taggerdata(object): 23 | def __init__(self,filename, encoding = 'utf-8', mode ='r'): 24 | self.filename = filename 25 | self.encoding = encoding 26 | assert (mode == 'r' or mode == 'w') 27 | 
self.mode = mode 28 | self.reset() 29 | self.firstiter = True 30 | self.indexed = False 31 | self.writeindex = 0 32 | 33 | def __iter__(self): 34 | words = [] 35 | lemmas = [] 36 | postags = [] 37 | for line in self.f: 38 | line = line.strip() 39 | if self.firstiter: 40 | self.indexed = (line == "#0") 41 | self.firstiter = False 42 | if not line and not self.indexed: 43 | yield (words, lemmas, postags) 44 | words = [] 45 | lemmas = [] 46 | postags = [] 47 | elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit(): 48 | if line != "#0": 49 | yield (words, lemmas, postags) 50 | words = [] 51 | lemmas = [] 52 | postags = [] 53 | elif line: 54 | try: 55 | word, lemma, pos = line.split("\t") 56 | except: 57 | word = lemma = pos = "NONE" 58 | if word == "NONE": word = None 59 | if lemma == "NONE": lemma = None 60 | if pos == "NONE": pos = None 61 | words.append(word) 62 | lemmas.append(lemma) 63 | postags.append(pos) 64 | if words: 65 | yield (words, lemmas, postags) 66 | 67 | def next(self): 68 | words = [] 69 | lemmas = [] 70 | postags = [] 71 | while True: 72 | try: 73 | line = self.f.next().strip() 74 | except StopIteration: 75 | if words: 76 | return (words, lemmas, postags) 77 | else: 78 | raise 79 | if self.firstiter: 80 | self.indexed = (line == "#0") 81 | self.firstiter = False 82 | if not line and not self.indexed: 83 | return (words, lemmas, postags) 84 | elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit(): 85 | if line != "#0": 86 | return (words, lemmas, postags) 87 | elif line: 88 | try: 89 | word, lemma, pos = line.split("\t") 90 | except: 91 | word = lemma = pos = "NONE" 92 | if word == "NONE": word = None 93 | if lemma == "NONE": lemma = None 94 | if pos == "NONE": pos = None 95 | words.append(word) 96 | lemmas.append(lemma) 97 | postags.append(pos) 98 | 99 | def align(self, referencewords, datatuple): 100 | """align the reference sentence with the tagged data""" 101 | targetwords = [] 102 | for i, (word,lemma,postag) in enumerate(zip(datatuple[0],datatuple[1],datatuple[2])): 103 | if word: 104 | subwords = word.split("_") 105 | for w in subwords: #split multiword expressions 106 | targetwords.append( (w, lemma, postag, i, len(subwords) > 1 ) ) #word, lemma, pos, index, multiword? 
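# The alignment below is a greedy nearest-position match: each (lowercased)
# reference word is compared against the tagger's output words, and an
# identical word closest to the same position wins; reference words without
# any match are aligned to a (None, None, None, None, False) placeholder.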
107 | 108 | referencewords = [ w.lower() for w in referencewords ] 109 | alignment = [] 110 | for i, referenceword in enumerate(referencewords): 111 | found = False 112 | best = 0 113 | distance = 999999 114 | for j, (targetword, lemma, pos, index, multiword) in enumerate(targetwords): 115 | if referenceword == targetword and abs(i-j) < distance: 116 | found = True 117 | best = j 118 | distance = abs(i-j) 119 | 120 | if found: 121 | alignment.append(targetwords[best]) 122 | else: 123 | alignment.append((None,None,None,None,False)) #no alignment found 124 | 125 | return alignment 126 | 127 | def reset(self): 128 | self.f = io.open(self.filename,self.mode, encoding=self.encoding) 129 | 130 | 131 | def write(self, sentence): 132 | self.f.write("#" + str(self.writeindex)+"\n") 133 | for word, lemma, pos in sentence: 134 | if not word: word = "NONE" 135 | if not lemma: lemma = "NONE" 136 | if not pos: pos = "NONE" 137 | self.f.write( word + "\t" + lemma + "\t" + pos + "\n" ) 138 | self.writeindex += 1 139 | 140 | def close(self): 141 | self.f.close() 142 | 143 | -------------------------------------------------------------------------------- /pynlpl/net.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | #--------------------------------------------------------------- 4 | # PyNLPl - Network utilities 5 | # by Maarten van Gompel 6 | # Centre for Language Studies 7 | # Radboud University Nijmegen 8 | # http://www.github.com/proycon/pynlpl 9 | # proycon AT anaproy DOT nl 10 | # 11 | # Generic Server for Language Models 12 | # 13 | #---------------------------------------------------------------- 14 | 15 | from __future__ import print_function 16 | from __future__ import unicode_literals 17 | from __future__ import division 18 | from __future__ import absolute_import 19 | from pynlpl.common import u,b 20 | import sys 21 | if sys.version < '3': 22 | from codecs import getwriter 23 | stderr = getwriter('utf-8')(sys.stderr) 24 | stdout = getwriter('utf-8')(sys.stdout) 25 | else: 26 | stderr = sys.stderr 27 | stdout = sys.stdout 28 | from twisted.internet import protocol, reactor # will fail on Python 3 for now 29 | from twisted.protocols import basic 30 | import shlex 31 | 32 | 33 | 34 | class GWSNetProtocol(basic.LineReceiver): 35 | def connectionMade(self): 36 | print("Client connected", file=stderr) 37 | self.factory.connections += 1 38 | if self.factory.connections < 1: 39 | self.transport.loseConnection() 40 | else: 41 | self.sendLine(b("READY")) 42 | 43 | def lineReceived(self, line): 44 | try: 45 | if sys.version >= '3' and isinstance(line,bytes): 46 | print("Client in: " + str(line,'utf-8'),file=stderr) 47 | else: 48 | print("Client in: " + line,file=stderr) 49 | except UnicodeDecodeError: 50 | print("Client in: (unicodeerror)",file=stderr) 51 | if sys.version < '3': 52 | if isinstance(line,unicode): 53 | self.factory.processprotocol.transport.write(line.encode('utf-8')) 54 | else: 55 | self.factory.processprotocol.transport.write(line) 56 | self.factory.processprotocol.transport.write(b('\n')) 57 | else: 58 | self.factory.processprotocol.transport.write(b(line) + b('\n')) 59 | self.factory.processprotocol.currentclient = self 60 | 61 | def connectionLost(self, reason): 62 | self.factory.connections -= 1 63 | if self.factory.processprotocol.currentclient == self: 64 | self.factory.processprotocol.currentclient = None 65 | 66 | class GWSFactory(protocol.ServerFactory): 67 | protocol = GWSNetProtocol 68 | 69 | def __init__(self, 
processprotocol): 70 | self.connections = 0 71 | self.processprotocol = processprotocol 72 | 73 | 74 | class GWSProcessProtocol(protocol.ProcessProtocol): 75 | def __init__(self, printstderr=True, sendstderr= False, filterout = None, filtererr = None): 76 | self.currentclient = None 77 | self.printstderr = printstderr 78 | self.sendstderr = sendstderr 79 | if not filterout: 80 | self.filterout = lambda x: x 81 | else: 82 | self.filterout = filterout 83 | if not filtererr: 84 | self.filtererr = lambda x: x 85 | else: 86 | self.filtererr = filtererr 87 | 88 | def connectionMade(self): 89 | pass 90 | 91 | def outReceived(self, data): 92 | try: 93 | if sys.version >= '3' and isinstance(data,bytes): 94 | print("Process out " + str(data, 'utf-8'),file=stderr) 95 | else: 96 | print("Process out " + data,file=stderr) 97 | except UnicodeDecodeError: 98 | print("Process out (unicodeerror)",file=stderr) 99 | print("DEBUG:", repr(b(data).strip().split(b('\n')))) 100 | for line in b(data).strip().split(b('\n')): 101 | line = self.filterout(line.strip()) 102 | if self.currentclient and line: 103 | self.currentclient.sendLine(b(line)) 104 | 105 | def errReceived(self, data): 106 | try: 107 | if sys.version >= '3' and isinstance(data,bytes): 108 | print("Process err " + str(data,'utf-8'), file=sys.stderr) 109 | else: 110 | print("Process err " + data,file=stderr) 111 | except UnicodeDecodeError: 112 | print("Process out (unicodeerror)",file=stderr) 113 | if self.printstderr and data: 114 | print(data.strip(),file=stderr) 115 | for line in b(data).strip().split(b('\n')): 116 | line = self.filtererr(line.strip()) 117 | if self.sendstderr and self.currentclient and line: 118 | self.currentclient.sendLine(b(line)) 119 | 120 | 121 | def processExited(self, reason): 122 | print("Process exited",file=stderr) 123 | 124 | 125 | def processEnded(self, reason): 126 | print("Process ended",file=stderr) 127 | if self.currentclient: 128 | self.currentclient.transport.loseConnection() 129 | reactor.stop() 130 | 131 | 132 | class GenericWrapperServer: 133 | """Generic Server around a stdin/stdout based CLI tool. 
Only accepts one client at a time to prevent concurrency issues !!!!!""" 134 | def __init__(self, cmdline, port, printstderr= True, sendstderr= False, filterout = None, filtererr = None): 135 | gwsprocessprotocol = GWSProcessProtocol(printstderr, sendstderr, filterout, filtererr) 136 | cmdline = shlex.split(cmdline) 137 | reactor.spawnProcess(gwsprocessprotocol, cmdline[0], cmdline) 138 | 139 | gwsfactory = GWSFactory(gwsprocessprotocol) 140 | reactor.listenTCP(port, gwsfactory) 141 | reactor.run() 142 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Test Units for Evaluation 6 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | #------------------------------------------------------------- 13 | 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | from __future__ import division 17 | from __future__ import absolute_import 18 | from pynlpl.common import u 19 | 20 | import sys 21 | import os 22 | import unittest 23 | import random 24 | 25 | from pynlpl.evaluation import AbstractExperiment, WPSParamSearch, ExperimentPool, ClassEvaluation, OrdinalEvaluation 26 | 27 | class ParamExperiment(AbstractExperiment): 28 | def defaultparameters(self): 29 | return {'a':1,'b':1,'c':1} 30 | 31 | def run(self): 32 | self.result = 0 33 | for line in self.inputdata: 34 | self.result += int(line) * self.parameters['a'] * self.parameters['b'] - self.parameters['c'] 35 | 36 | def score(self): 37 | return self.result 38 | 39 | @staticmethod 40 | def sample(inputdata,n): 41 | n = int(n) 42 | if n > len(inputdata): 43 | return inputdata 44 | else: 45 | return random.sample(inputdata,int(n)) 46 | 47 | class PoolExperiment(AbstractExperiment): 48 | def start(self): 49 | self.startcommand('sleep',None,None,None,str(self.parameters['duration'])) 50 | print("STARTING: sleep " + str(self.parameters['duration'])) 51 | 52 | 53 | class WPSTest(unittest.TestCase): 54 | def test_wps(self): 55 | inputdata = [ 1,2,3,4,5,6 ] 56 | parameterscope = [ ('a',[2,4]), ('b',[2,5,8]), ('c',[3,6,9]) ] 57 | search = WPSParamSearch(ParamExperiment, inputdata, len(inputdata), parameterscope) 58 | solution = search.searchbest() 59 | self.assertEqual(solution, (('a', 4), ('b', 8), ('c', 3)) ) 60 | 61 | 62 | 63 | class ExperimentPoolTest(unittest.TestCase): 64 | def test_pool(self): 65 | pool = ExperimentPool(4) 66 | for i in range(0,15): 67 | pool.append( PoolExperiment(None, duration=random.randint(1,6)) ) 68 | for experiment in pool.run(): 69 | print("DONE: sleep " + str(experiment.parameters['duration'])) 70 | 71 | self.assertTrue(True) #if we got here, no exceptions were raised and it's okay 72 | 73 | class ClassEvaluationTest2(unittest.TestCase): 74 | def setUp(self): 75 | self.goals = ['sun','sun','rain','cloudy','sun','rain'] 76 | self.observations = ['cloudy','cloudy','cloudy','rain','sun','sun'] 77 | 78 | 79 | def test001(self): 80 | e = ClassEvaluation(self.goals, self.observations) 81 | print() 82 | print(e) 83 | print(e.confusionmatrix()) 84 | 85 | class OrdinalEvaluationTest(unittest.TestCase): 86 | def setUp(self): 87 | self.goals = [1,2,3,4,3,2] 88 | self.observations = [4,1,3,4,2,2] 89 | 90 | 
def test001(self): 91 | oe = OrdinalEvaluation(self.goals,self.observations) 92 | print(oe.mae()) 93 | print(oe.mae(2)) 94 | print(oe.rmse()) 95 | print(oe.rmse(4)) 96 | 97 | class ClassEvaluationTest(unittest.TestCase): 98 | def setUp(self): 99 | self.goals = ['cat','cat','cat','cat','cat','cat','cat','cat', 'dog', 'dog','dog','dog','dog','dog' ,'rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit'] 100 | self.observations = ['cat','cat','cat','cat','cat','dog','dog','dog', 'cat','cat','rabbit','dog','dog','dog' ,'rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','dog','dog'] 101 | 102 | 103 | def test001(self): 104 | """Class evaluation test -- (See also http://en.wikipedia.org/wiki/Confusion_matrix , using same data)""" 105 | e = ClassEvaluation(self.goals, self.observations) 106 | 107 | print 108 | print(e) 109 | print(e.confusionmatrix()) 110 | 111 | 112 | self.assertEqual(e.tp['cat'], 5) 113 | self.assertEqual(e.fp['cat'], 2) 114 | self.assertEqual(e.tn['cat'], 17) 115 | self.assertEqual(e.fn['cat'], 3) 116 | 117 | self.assertEqual(e.tp['rabbit'], 11) 118 | self.assertEqual(e.fp['rabbit'], 1) 119 | self.assertEqual(e.tn['rabbit'], 13) 120 | self.assertEqual(e.fn['rabbit'], 2) 121 | 122 | self.assertEqual(e.tp['dog'], 3) 123 | self.assertEqual(e.fp['dog'], 5) 124 | self.assertEqual(e.tn['dog'], 16) 125 | self.assertEqual(e.fn['dog'], 3) 126 | 127 | self.assertEqual( round(e.precision('cat'),6), 0.714286) 128 | self.assertEqual( round(e.precision('rabbit'),6), 0.916667) 129 | self.assertEqual( round(e.precision('dog'),6), 0.375000) 130 | 131 | self.assertEqual( round(e.recall('cat'),6), 0.625000) 132 | self.assertEqual( round(e.recall('rabbit'),6), 0.846154) 133 | self.assertEqual( round(e.recall('dog'),6),0.500000) 134 | 135 | self.assertEqual( round(e.fscore('cat'),6), 0.666667) 136 | self.assertEqual( round(e.fscore('rabbit'),6), 0.880000) 137 | self.assertEqual( round(e.fscore('dog'),6),0.428571) 138 | 139 | self.assertEqual( round(e.accuracy(),6), 0.703704) 140 | 141 | 142 | 143 | if __name__ == '__main__': 144 | unittest.main() 145 | 146 | 147 | -------------------------------------------------------------------------------- /pynlpl/tests/textprocessors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | #--------------------------------------------------------------- 6 | # PyNLPl - Test Units for Text Processors 7 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 8 | # http://ilk.uvt.nl/~mvgompel 9 | # proycon AT anaproy DOT nl 10 | # 11 | # Licensed under GPLv3 12 | # 13 | #---------------------------------------------------------------- 14 | 15 | from __future__ import print_function 16 | from __future__ import unicode_literals 17 | from __future__ import division 18 | from __future__ import absolute_import 19 | 20 | import sys 21 | import os 22 | import unittest 23 | 24 | from pynlpl.textprocessors import Windower, tokenise, strip_accents, calculate_overlap 25 | 26 | text = "This is a test .".split(" ") 27 | 28 | class WindowerTest(unittest.TestCase): 29 | def test_unigrams(self): 30 | """Windower (unigrams)""" 31 | global text 32 | result = list(iter(Windower(text,1))) 33 | self.assertEqual(result,[("This",),("is",),("a",),("test",),(".",)]) 34 | 35 | def test_bigrams(self): 36 | """Windower (bigrams)""" 37 | global text 38 | result = list(iter(Windower(text,2))) 
39 | self.assertEqual(result,[("","This"),("This","is"),("is","a"),("a","test"),("test","."),(".","")]) 40 | 41 | def test_trigrams(self): 42 | """Windower (trigrams)""" 43 | global text 44 | result = list(iter(Windower(text,3))) 45 | self.assertEqual(result,[('', '', 'This'), ('', 'This', 'is'), ('This', 'is', 'a'), ('is', 'a', 'test'), ('a', 'test', '.'), ('test', '.', ''), ('.', '', '')]) 46 | 47 | 48 | def test_trigrams_word(self): 49 | """Windower (trigrams) (on single word)""" 50 | global text 51 | result = list(iter(Windower(["hi"],3))) 52 | self.assertEqual(result,[('', '', 'hi'), ('', 'hi', ''), ('hi', '', '')]) 53 | 54 | 55 | 56 | 57 | class TokenizerTest(unittest.TestCase): 58 | def test_tokenize(self): 59 | """Tokeniser - One sentence""" 60 | self.assertEqual(tokenise("This is a test."),"This is a test .".split(" ")) 61 | 62 | def test_tokenize_sentences(self): 63 | """Tokeniser - Multiple sentences""" 64 | self.assertEqual(tokenise("This, is the first sentence! This is the second sentence."),"This , is the first sentence ! This is the second sentence .".split(" ")) 65 | 66 | def test_tokenize_noeos(self): 67 | """Tokeniser - Missing EOS Marker""" 68 | self.assertEqual(tokenise("This is a test"),"This is a test".split(" ")) 69 | 70 | def test_tokenize_url(self): 71 | """Tokeniser - URL""" 72 | global text 73 | self.assertEqual(tokenise("I go to http://www.google.com when I need to find something."),"I go to http://www.google.com when I need to find something .".split(" ")) 74 | 75 | def test_tokenize_mail(self): 76 | """Tokeniser - Mail""" 77 | global text 78 | self.assertEqual(tokenise("Write me at proycon@anaproy.nl."),"Write me at proycon@anaproy.nl .".split(" ")) 79 | 80 | def test_tokenize_numeric(self): 81 | """Tokeniser - numeric""" 82 | global text 83 | self.assertEqual(tokenise("I won € 300,000.00!"),"I won € 300,000.00 !".split(" ")) 84 | 85 | def test_tokenize_quotes(self): 86 | """Tokeniser - quotes""" 87 | global text 88 | self.assertEqual(tokenise("Hij zegt: \"Wat een lief baby'tje is dat!\""),"Hij zegt : \" Wat een lief baby'tje is dat ! 
\"".split(" ")) 89 | 90 | 91 | class StripAccentTest(unittest.TestCase): 92 | def test_strip_accents(self): 93 | """Strip Accents""" 94 | self.assertEqual(strip_accents("áàâãāĝŭçñßt"),"aaaaagucnt") 95 | 96 | class OverlapTest(unittest.TestCase): 97 | def test_overlap_subset(self): 98 | """Overlap - Subset""" 99 | h = [4,5,6,7] 100 | n = [5,6] 101 | self.assertEqual(calculate_overlap(h,n), [((5,6),0)]) 102 | 103 | def test_overlap_equal(self): 104 | """Overlap - Equal""" 105 | h = [4,5,6,7] 106 | n = [4,5,6,7] 107 | self.assertEqual(calculate_overlap(h,n), [((4,5,6,7),2)]) 108 | 109 | def test_overlap_none(self): 110 | """Overlap - None""" 111 | h = [4,5,6,7] 112 | n = [8,9,10] 113 | self.assertEqual(calculate_overlap(h,n), []) 114 | 115 | def test_overlap_leftpartial(self): 116 | """Overlap - Left partial""" 117 | h = [4,5,6,7] 118 | n = [1,2,3,4,5] 119 | self.assertEqual(calculate_overlap(h,n), [((4,5),-1)] ) 120 | 121 | def test_overlap_rightpartial(self): 122 | """Overlap - Right partial""" 123 | h = [4,5,6,7] 124 | n = [6,7,8,9] 125 | self.assertEqual(calculate_overlap(h,n), [((6,7),1)] ) 126 | 127 | def test_overlap_leftpartial2(self): 128 | """Overlap - Left partial (2)""" 129 | h = [1,2,3,4,5] 130 | n = [0,1,2] 131 | self.assertEqual(calculate_overlap(h,n), [((1,2),-1)] ) 132 | 133 | def test_overlap_rightpartial2(self): 134 | """Overlap - Right partial (2)""" 135 | h = [1,2,3,4,5] 136 | n = [4,5,6] 137 | self.assertEqual(calculate_overlap(h,n), [((4,5),1)] ) 138 | 139 | 140 | def test_overlap_leftfull(self): 141 | """Overlap - Left full""" 142 | h = [1,2,3,4,5] 143 | n = [1,2] 144 | self.assertEqual(calculate_overlap(h,n), [((1,2),-1)] ) 145 | 146 | def test_overlap_rightfull(self): 147 | """Overlap - Right full""" 148 | h = [1,2,3,4,5] 149 | n = [4,5] 150 | self.assertEqual(calculate_overlap(h,n), [((4,5),1)] ) 151 | 152 | 153 | if __name__ == '__main__': 154 | unittest.main() 155 | -------------------------------------------------------------------------------- /pynlpl/tools/computepmi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | from __future__ import print_function, unicode_literals, division, absolute_import 5 | 6 | import argparse 7 | import sys 8 | from math import log 9 | 10 | from collections import defaultdict 11 | 12 | def pmi(sentences1, sentences2,discount = 0): 13 | jointcount = len(sentences1 & sentences2) - discount 14 | if jointcount <= 0: return None 15 | return log( jointcount / (len(sentences1) * len(sentences2))), jointcount+discount 16 | 17 | def npmi(sentences1, sentences2,discount=0): 18 | jointcount = len(sentences1 & sentences2) - discount 19 | if jointcount <= 0: return None 20 | return log( jointcount / (len(sentences1) * len(sentences2))) / -log(jointcount), jointcount+discount 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser(description="Simple cooccurence computation", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | parser.add_argument('-f','--inputtext', type=str,help="Input file (plaintext, tokenised, utf-8, one sentence per line)", action='store',default="",required=True) 25 | parser.add_argument('-s','--sorted', help="Output sorted by co-occurrence score", action='store_true',default=False) 26 | parser.add_argument('-t','--threshold', help="Joined occurrence threshold, do not consider words occuring less than this", type=int, action='store',default=1) 27 | parser.add_argument('-a','--adjacency', help="Compute the adjacency fraction (how many 
co-occurrence are immediate bigrams)", action='store_true',default=False) 28 | parser.add_argument('-A','--discountadjacency', help="Do not take immediately adjacent fragments (bigrams) into account when computing mutual information (requires -a)", action='store_true',default=False) 29 | parser.add_argument('--pmi',help="Compute pointwise mutual information", action='store_true',default=False) 30 | parser.add_argument('--npmi',help="Compute normalised pointwise mutual information", action='store_true',default=False) 31 | parser.add_argument('--jaccard',help="Compute jaccard similarity coefficient", action='store_true',default=False) 32 | parser.add_argument('--dice',help="Compute dice coefficient", action='store_true',default=False) 33 | 34 | args = parser.parse_args() 35 | if not args.pmi and not args.npmi and not args.jaccard and not args.dice: 36 | args.pmi = True 37 | 38 | count = defaultdict(int) 39 | cooc = defaultdict(lambda: defaultdict(int)) 40 | adjacent = defaultdict(lambda: defaultdict(int)) 41 | total = 0 42 | 43 | f = open(args.inputtext,'r',encoding='utf-8') 44 | for i, line in enumerate(f): 45 | sentence = i + 1 46 | if sentence % 1000 == 0: print("Indexing @" + str(sentence),file=sys.stderr) 47 | if line: 48 | words = list(enumerate(line.split())) 49 | for pos, word in words: 50 | count[word] += 1 51 | total += 1 52 | for pos2, word2 in words: 53 | if pos2 > pos: 54 | cooc[word][word2] += 1 55 | if args.adjacency and pos2 == pos + len(word.split()): 56 | adjacent[word][word2] += 1 57 | f.close() 58 | 59 | 60 | l = len(cooc) 61 | output = [] 62 | for i, (word, coocdata) in enumerate(cooc.items()): 63 | print("Computing mutual information @" + str(i+1) + "/" + str(l) + ": \"" + word + "\" , co-occurs with " + str(len(coocdata)) + " words",file=sys.stderr) 64 | for word2, jointcount in coocdata.items(): 65 | if jointcount> args.threshold: 66 | if args.adjacency and word in adjacent and word2 in adjacent[word]: 67 | adjcount = adjacent[word][word2] 68 | else: 69 | adjcount = 0 70 | 71 | if args.discountadjacency: 72 | discount = adjcount 73 | else: 74 | discount = 0 75 | 76 | if args.pmi: 77 | score = log( ((jointcount-discount)/total) / ((count[word]/total) * (count[word2]/total))) 78 | elif args.npmi: 79 | score = log( ((jointcount-discount)/total) / ((count[word]/total) * (count[word2]/total))) / -log((jointcount-discount)/total) 80 | elif args.jaccard or args.dice: 81 | score = (jointcount-discount) / (count[word] + count[word2] - (jointcount - discount) ) 82 | if args.dice: 83 | score = 2*score / (1+score) 84 | 85 | if args.sorted: 86 | outputdata = (word,word2,score, jointcount, adjcount, adjcount / jointcount if args.adjacency else None) 87 | output.append(outputdata) 88 | else: 89 | if args.adjacency: 90 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount) + "\t" + str(adjcount) + "\t" + str(adjcount / jointcount)) 91 | else: 92 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount)) 93 | 94 | 95 | if args.sorted: 96 | print("Outputting " + str(len(output)) + " pairs",file=sys.stderr) 97 | if args.adjacency: 98 | print("#WORD\tWORD2\tSCORE\tJOINTCOUNT\tBIGRAMCOUNT\tBIGRAMRATIO") 99 | else: 100 | print("#WORD\tWORD2\tSCORE\tJOINTCOUNT\tBIGRAMCOUNT\tBIGRAMRATIO") 101 | if args.npmi: 102 | sign = 1 103 | else: 104 | sign = -1 105 | for word,word2,score,jointcount,adjcount, adjratio in sorted(output, key=lambda x: sign * x[2]): 106 | if args.adjacency: 107 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount) + "\t" 
+ str(adjcount) + "\t" + str(adjratio) ) 108 | else: 109 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount)) 110 | 111 | 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | 117 | -------------------------------------------------------------------------------- /pynlpl/clients/frogclient.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - Frog Client - Version 1.4.1 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Induction for Linguistic Knowledge Research Group 6 | # Universiteit van Tilburg 7 | # 8 | # Derived from code by Rogier Kraf 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # This is a Python library for on-the-fly communication with 13 | # a Frog/Tadpole Server. Allowing on-the-fly lemmatisation and 14 | # PoS-tagging. It is recommended to pass your data on a 15 | # sentence-by-sentence basis to FrogClient.process() 16 | # 17 | ############################################################### 18 | 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | from __future__ import division 22 | from __future__ import absolute_import 23 | from pynlpl.common import u 24 | 25 | import socket 26 | 27 | class FrogClient: 28 | def __init__(self,host="localhost",port=12345, server_encoding="utf-8", returnall=False, timeout=120.0): 29 | """Create a client connecting to a Frog or Tadpole server.""" 30 | self.BUFSIZE = 4096 31 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) 32 | self.socket.settimeout(timeout) 33 | self.socket.connect( (host,int(port)) ) 34 | self.server_encoding = server_encoding 35 | self.returnall = returnall 36 | 37 | 38 | 39 | 40 | def process(self,input_data, source_encoding="utf-8", return_unicode = True, oldfrog=False): 41 | """Receives input_data in the form of a str or unicode object, passes this to the server, with proper consideration for the encodings, and returns the Frog output as a list of tuples: (word,pos,lemma,morphology), each of these is a proper unicode object unless return_unicode is set to False, in which case raw strings will be returned. 
Return_unicode is no longer optional, it is fixed to True, parameter is still there only for backwards-compatibility.""" 42 | if isinstance(input_data, list) or isinstance(input_data, tuple): 43 | input_data = " ".join(input_data) 44 | 45 | 46 | 47 | input_data = u(input_data, source_encoding) #decode (or preferably do this in an earlier stage) 48 | input_data = input_data.strip(' \t\n') 49 | 50 | s = input_data.encode(self.server_encoding) +b'\r\n' 51 | if not oldfrog: s += b'EOT\r\n' 52 | self.socket.sendall(s) #send to socket in desired encoding 53 | output = [] 54 | 55 | done = False 56 | while not done: 57 | data = b"" 58 | while not data.endswith(b'\n'): 59 | moredata = self.socket.recv(self.BUFSIZE) 60 | if not moredata: break 61 | data += moredata 62 | 63 | 64 | data = u(data,self.server_encoding) 65 | 66 | 67 | for line in data.strip(' \t\r\n').split('\n'): 68 | if line == "READY": 69 | done = True 70 | break 71 | elif line: 72 | line = line.split('\t') #split on tab 73 | if len(line) > 4 and line[0].isdigit(): #first column is token number 74 | if line[0] == '1' and output: 75 | if self.returnall: 76 | output.append( (None,None,None,None, None,None,None, None) ) 77 | else: 78 | output.append( (None,None,None,None) ) 79 | fields = line[1:] 80 | parse1=parse2=ner=chunk="" 81 | word,lemma,morph,pos = fields[0:4] 82 | if len(fields) > 5: 83 | ner = fields[5] 84 | if len(fields) > 6: 85 | chunk = fields[6] 86 | if len(fields) >= 8: 87 | parse1 = fields[7] 88 | parse2 = fields[8] 89 | 90 | if len(fields) < 5: 91 | raise Exception("Can't process response line from Frog: ", repr(line), " got unexpected number of fields ", str(len(fields) + 1)) 92 | 93 | if self.returnall: 94 | output.append( (word,lemma,morph,pos,ner,chunk,parse1,parse2) ) 95 | else: 96 | output.append( (word,lemma,morph,pos) ) 97 | 98 | return output 99 | 100 | def process_aligned(self,input_data, source_encoding="utf-8", return_unicode = True): 101 | output = self.process(input_data, source_encoding, return_unicode) 102 | outputwords = [ x[0] for x in output ] 103 | inputwords = input_data.strip(' \t\n').split(' ') 104 | alignment = self.align(inputwords, outputwords) 105 | for i, _ in enumerate(inputwords): 106 | targetindex = alignment[i] 107 | if targetindex == None: 108 | if self.returnall: 109 | yield (None,None,None,None,None,None,None,None) 110 | else: 111 | yield (None,None,None,None) 112 | else: 113 | yield output[targetindex] 114 | 115 | def align(self,inputwords, outputwords): 116 | """For each inputword, provides the index of the outputword""" 117 | alignment = [] 118 | cursor = 0 119 | for inputword in inputwords: 120 | if len(outputwords) > cursor and outputwords[cursor] == inputword: 121 | alignment.append(cursor) 122 | cursor += 1 123 | elif len(outputwords) > cursor+1 and outputwords[cursor+1] == inputword: 124 | alignment.append(cursor+1) 125 | cursor += 2 126 | else: 127 | alignment.append(None) 128 | cursor += 1 129 | return alignment 130 | 131 | 132 | def __del__(self): 133 | self.socket.close() 134 | 135 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PyNLPl documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jul 6 22:07:20 2010. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 
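# (A minimal usage sketch for the FrogClient class in pynlpl/clients/frogclient.py above,
#  assuming a Frog server is already listening; host, port and the example sentence are
#  illustrative only:
#
#      from pynlpl.clients.frogclient import FrogClient
#      frogclient = FrogClient('localhost', 12345)
#      for word, lemma, morph, pos in frogclient.process("Dit is een zin ."):
#          print(word, lemma, morph, pos)
#
#  With the default returnall=False, process() returns a list with one
#  (word, lemma, morph, pos) tuple per token and a tuple of None values marking each
#  sentence boundary.)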
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.append(os.path.abspath('.')) 20 | 21 | sys.path.append(os.path.abspath('../../')) 22 | from pynlpl import VERSION 23 | 24 | # -- General configuration ----------------------------------------------------- 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon','sphinx.ext.autosummary'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'PyNLPl' 44 | copyright = u'2016, Maarten van Gompel' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = VERSION 52 | # The full version, including alpha/beta/rc tags. 53 | release = VERSION 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of documents that shouldn't be included in the build. 66 | #unused_docs = [] 67 | 68 | # List of directories, relative to source directory, that shouldn't be searched 69 | # for source files. 70 | exclude_trees = ['_build'] 71 | 72 | # The reST default role (used for this markup: `text`) to use for all documents. 73 | #default_role = None 74 | 75 | # If true, '()' will be appended to :func: etc. cross-reference text. 76 | #add_function_parentheses = True 77 | 78 | # If true, the current module name will be prepended to all description 79 | # unit titles (such as .. function::). 80 | #add_module_names = True 81 | 82 | # If true, sectionauthor and moduleauthor directives will be shown in the 83 | # output. They are ignored by default. 84 | #show_authors = False 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # A list of ignored prefixes for module index sorting. 90 | #modindex_common_prefix = [] 91 | 92 | 93 | # -- Options for HTML output --------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. Major themes that come with 96 | # Sphinx are currently 'default' and 'sphinxdoc'. 97 | html_theme = 'default' 98 | 99 | # Theme options are theme-specific and customize the look and feel of a theme 100 | # further. 
For a list of options available for each theme, see the 101 | # documentation. 102 | #html_theme_options = {} 103 | 104 | # Add any paths that contain custom themes here, relative to this directory. 105 | #html_theme_path = [] 106 | 107 | # The name for this set of Sphinx documents. If None, it defaults to 108 | # " v documentation". 109 | #html_title = None 110 | 111 | # A shorter title for the navigation bar. Default is the same as html_title. 112 | #html_short_title = None 113 | 114 | # The name of an image file (relative to this directory) to place at the top 115 | # of the sidebar. 116 | #html_logo = None 117 | 118 | # The name of an image file (within the static path) to use as favicon of the 119 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 120 | # pixels large. 121 | #html_favicon = None 122 | 123 | # Add any paths that contain custom static files (such as style sheets) here, 124 | # relative to this directory. They are copied after the builtin static files, 125 | # so a file named "default.css" will overwrite the builtin "default.css". 126 | # html_static_path = ['_static'] 127 | 128 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 129 | # using the given strftime format. 130 | #html_last_updated_fmt = '%b %d, %Y' 131 | 132 | # If true, SmartyPants will be used to convert quotes and dashes to 133 | # typographically correct entities. 134 | #html_use_smartypants = True 135 | 136 | # Custom sidebar templates, maps document names to template names. 137 | #html_sidebars = {} 138 | 139 | # Additional templates that should be rendered to pages, maps page names to 140 | # template names. 141 | #html_additional_pages = {} 142 | 143 | # If false, no module index is generated. 144 | #html_use_modindex = True 145 | 146 | # If false, no index is generated. 147 | #html_use_index = True 148 | 149 | # If true, the index is split into individual pages for each letter. 150 | #html_split_index = False 151 | 152 | # If true, links to the reST sources are added to the pages. 153 | #html_show_sourcelink = True 154 | 155 | # If true, an OpenSearch description file will be output, and all pages will 156 | # contain a tag referring to it. The value of this option must be the 157 | # base URL from which the finished HTML is served. 158 | #html_use_opensearch = '' 159 | 160 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 161 | #html_file_suffix = '' 162 | 163 | # Output file base name for HTML help builder. 164 | # htmlhelp_basename = 'pynlpl' 165 | 166 | 167 | # -- Options for LaTeX output -------------------------------------------------- 168 | 169 | # The paper size ('letter' or 'a4'). 170 | latex_paper_size = 'a4' 171 | 172 | # The font size ('10pt', '11pt' or '12pt'). 173 | #latex_font_size = '10pt' 174 | 175 | # Grouping the document tree into LaTeX files. List of tuples 176 | # (source start file, target name, title, author, documentclass [howto/manual]). 177 | latex_documents = [ 178 | ('index', 'pynlpl.tex', u'PyNLPl Documentation', 179 | u'Maarten van Gompel', 'manual'), 180 | ] 181 | 182 | # The name of an image file (relative to this directory) to place at the top of 183 | # the title page. 184 | #latex_logo = None 185 | 186 | # For "manual" documents, if this is true, then toplevel headings are parts, 187 | # not chapters. 188 | #latex_use_parts = False 189 | 190 | # Additional stuff for the LaTeX preamble. 191 | #latex_preamble = '' 192 | 193 | # Documents to append as an appendix to all manuals. 
194 | #latex_appendices = [] 195 | 196 | # If false, no module index is generated. 197 | #latex_use_modindex = True 198 | 199 | autosummary_generate = True 200 | -------------------------------------------------------------------------------- /pynlpl/tests/folia_benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function, unicode_literals, division, absolute_import 4 | 5 | from pynlpl.formats import folia, fql, cql 6 | import time 7 | import sys 8 | import os 9 | import glob 10 | try: 11 | from pympler import asizeof 12 | except ImportError: 13 | print("An extra dependency called pympler is required: install using pip install pympler (or other means)",file=sys.stderr) 14 | raise 15 | 16 | repetitions = 0 17 | 18 | def timeit(f): 19 | def f_timer(*args, **kwargs): 20 | if 'filename' in kwargs: 21 | label = "on file " + kwargs['filename'] 22 | elif 'dirname' in kwargs: 23 | label = "on directory " + kwargs['dirname'] 24 | elif 'doc' in kwargs: 25 | label = "on document " + kwargs['doc'].id 26 | else: 27 | label = "" 28 | print(f.__name__ + " -- " + f.__doc__ + " -- " + label + " ...", end="") 29 | times = [] 30 | for i in range(0, repetitions): 31 | start = time.time() 32 | try: 33 | result = f(*args, **kwargs) 34 | except Exception as e: 35 | print(" -- ERROR! -- ", e) 36 | return None 37 | times.append(time.time() - start) 38 | if times: 39 | d = round(sum(times) / len(times),4) 40 | print('took ' + str(d) + 's (averaged over ' + str(len(times)) + ' runs)') 41 | else: 42 | d = 0 43 | return result 44 | return f_timer 45 | 46 | 47 | @timeit 48 | def loadfile(**kwargs): 49 | """Loading file""" 50 | doc = folia.Document(file=kwargs['filename'],bypassleak=False) 51 | 52 | 53 | @timeit 54 | def savefile(**kwargs): #careful with SSDs 55 | """Saving file""" 56 | kwargs['doc'].save("/tmp/test.xml") 57 | 58 | @timeit 59 | def xml(**kwargs): 60 | """XML serialisation""" 61 | kwargs['doc'].xml() 62 | 63 | 64 | @timeit 65 | def json(**kwargs): 66 | """JSON serialisation""" 67 | kwargs['doc'].json() 68 | 69 | @timeit 70 | def text(**kwargs): 71 | """text serialisation""" 72 | kwargs['doc'].text() 73 | 74 | @timeit 75 | def countwords(**kwargs): 76 | """Counting words""" 77 | kwargs['doc'].count(folia.Word,None, True,[folia.AbstractAnnotationLayer]) 78 | 79 | @timeit 80 | def selectwords(**kwargs): 81 | """Selecting words""" 82 | for word in kwargs['doc'].words(): 83 | pass 84 | 85 | 86 | @timeit 87 | def selectwordsfql(**kwargs): 88 | """Selecting words using FQL""" 89 | query = fql.Query("SELECT w") 90 | for word in query(kwargs['doc']): 91 | pass 92 | 93 | @timeit 94 | def selectwordsfqlforp(**kwargs): 95 | """Selecting words in paragraphs using FQL""" 96 | query = fql.Query("SELECT w FOR p") 97 | for word in query(kwargs['doc']): 98 | pass 99 | 100 | @timeit 101 | def selectwordsfqlxml(**kwargs): 102 | """Selecting words using FQL (XML output)""" 103 | query = fql.Query("SELECT w FORMAT xml") 104 | for wordxml in query(kwargs['doc']): 105 | pass 106 | 107 | @timeit 108 | def selectwordsfqlwhere(**kwargs): 109 | """Selecting words using FQL (with WHERE clause)""" 110 | query = fql.Query("SELECT w WHERE text != \"blah\"") 111 | for word in query(kwargs['doc']): 112 | pass 113 | 114 | @timeit 115 | def editwordsfql(**kwargs): 116 | """Editing the text of words using FQL (with WHERE clause)""" 117 | query = fql.Query("EDIT w WITH text \"blah\"") 118 | for word in query(kwargs['doc']): 119 | pass 
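# (The @timeit-decorated benchmarks in this file accept keyword arguments only; main()
#  below calls them roughly as in this condensed sketch, where "/path/to/doc.folia.xml"
#  is a placeholder path:
#
#      doc = folia.Document(file="/path/to/doc.folia.xml")
#      selectwordsfql(doc=doc)   # runs the FQL "SELECT w" benchmark `repetitions` times
#  )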
120 | 121 | @timeit 122 | def nextwords(**kwargs): 123 | """Find neighbour of each word""" 124 | for word in kwargs['doc'].words(): 125 | word.next() 126 | 127 | @timeit 128 | def addelement(**kwargs): 129 | """Adding a simple annotation (desc) to each word""" 130 | for word in kwargs['doc'].words(): 131 | try: 132 | word.append(folia.Description, value="test") 133 | except folia.DuplicateAnnotationError: 134 | pass 135 | 136 | 137 | @timeit 138 | def ancestors(**kwargs): 139 | """Iterating over the ancestors of each word""" 140 | for word in kwargs['doc'].words(): 141 | for ancestor in word.ancestors(): 142 | pass 143 | 144 | @timeit 145 | def readerwords(**kwargs): 146 | """Iterating over words using Reader""" 147 | reader = folia.Reader(kwargs['filename'], folia.Word) 148 | for word in reader: 149 | pass 150 | 151 | def main(): 152 | global repetitions, target 153 | files = [] 154 | try: 155 | begin = 1 156 | if os.path.exists(sys.argv[1]): 157 | begin = 1 158 | selectedtests = "all" 159 | repetitions = 1 160 | else: 161 | selectedtests = sys.argv[1].split(',') 162 | if os.path.exists(sys.argv[2]): 163 | repetitions = 1 164 | begin = 2 165 | else: 166 | repetitions = int(sys.argv[2]) 167 | begin = 3 168 | filesordirs = sys.argv[begin:] 169 | except: 170 | print("Syntax: folia_benchmark [testfunctions [repetitions]] files-or-directories+",file=sys.stderr) 171 | print(" testfunctions is a comma separated list of function names, or the special keyword 'all'", file=sys.stderr) 172 | print(" directories are recursively searched for files with the extension folia.xml, +gz and +bz2 is supported too.", file=sys.stderr) 173 | sys.exit(2) 174 | 175 | 176 | for fd in filesordirs: 177 | if not os.path.exists(fd): 178 | raise Exception("No such file or directory" + fd) 179 | if os.path.isfile(fd): 180 | files.append(fd) 181 | elif os.path.isdir(fd): 182 | dirs = [fd] 183 | while dirs: 184 | dir = dirs.pop(0) 185 | for filename in glob.glob(dir + "/*"): 186 | if os.path.isdir(filename): 187 | dirs.append(filename) 188 | elif filename.endswith('.folia.xml') or filename.endswith('.folia.xml.gz') or filename.endswith('.folia.xml.bz2'): 189 | files.append(filename) 190 | 191 | 192 | for f in ('loadfile','loadfileleakbypass','readerwords'): 193 | if f in selectedtests or 'all' in selectedtests: 194 | for filename in files: 195 | globals()[f](filename=filename) 196 | 197 | 198 | for f in ('xml','text','json','countwords','selectwords','nextwords','ancestors','selectwordsfql','selectwordsfqlforp','selectwordsfqlxml','selectwordsfqlwhere','editwordsfql', 'addelement' ): 199 | if f in selectedtests or 'all' in selectedtests: 200 | for filename in files: 201 | doc = folia.Document(file=filename) 202 | globals()[f](doc=doc) 203 | 204 | for f in ('memtest',): 205 | if f in selectedtests or 'all' in selectedtests: 206 | for filename in files: 207 | doc = folia.Document(file=filename) 208 | print("memtest -- Memory test on document " + filename + " -- memory consumption estimated at " + str(round(asizeof.asizeof(doc) / 1024 / 1024,2)) + " MB" + " (filesize " + str(round(os.path.getsize(filename)/1024/1024,2)) + " MB)") 209 | 210 | 211 | 212 | if __name__ == '__main__': 213 | main() 214 | -------------------------------------------------------------------------------- /pynlpl/tests/search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | #--------------------------------------------------------------- 6 | # PyNLPl - 
Test Units for Search Algorithms 7 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 8 | # http://ilk.uvt.nl/~mvgompel 9 | # proycon AT anaproy DOT nl 10 | # 11 | # Licensed under GPLv3 12 | # 13 | #---------------------------------------------------------------- 14 | 15 | import sys 16 | import os 17 | import unittest 18 | 19 | sys.path.append(sys.path[0] + '/../../') 20 | os.environ['PYTHONPATH'] = sys.path[0] + '/../../' 21 | 22 | from pynlpl.search import AbstractSearchState, DepthFirstSearch, BreadthFirstSearch, IterativeDeepening, HillClimbingSearch, BeamSearch 23 | 24 | 25 | class ReorderSearchState(AbstractSearchState): 26 | def __init__(self, tokens, parent = None): 27 | self.tokens = tokens 28 | super(ReorderSearchState, self).__init__(parent) 29 | 30 | def expand(self): 31 | #Operator: Swap two consecutive pairs 32 | l = len(self.tokens) 33 | for i in range(0,l - 1): 34 | newtokens = self.tokens[:i] 35 | newtokens.append(self.tokens[i + 1]) 36 | newtokens.append(self.tokens[i]) 37 | if i+2 < l: 38 | newtokens += self.tokens[i+2:] 39 | yield ReorderSearchState(newtokens, self) 40 | 41 | def __hash__(self): 42 | return hash(str(self)) 43 | 44 | def __eq__(self, other): 45 | return str(self) == str(other) 46 | 47 | def __str__(self): 48 | return " ".join(self.tokens) 49 | 50 | class InformedReorderSearchState(ReorderSearchState): 51 | def __init__(self, tokens, goal = None, parent = None): 52 | self.tokens = tokens 53 | self.goal = goal 54 | super(ReorderSearchState, self).__init__(parent) 55 | 56 | def score(self): 57 | """Compute distortion""" 58 | totaldistortion = 0 59 | for i, token in enumerate(self.goal.tokens): 60 | tokendistortion = 9999999 61 | for j, token2 in enumerate(self.tokens): 62 | if token == token2 and abs(i - j) < tokendistortion: 63 | tokendistortion = abs(i - j) 64 | totaldistortion += tokendistortion 65 | return totaldistortion 66 | 67 | def expand(self): 68 | #Operator: Swap two consecutive pairs 69 | l = len(self.tokens) 70 | for i in range(0,l - 1): 71 | newtokens = self.tokens[:i] 72 | newtokens.append(self.tokens[i + 1]) 73 | newtokens.append(self.tokens[i]) 74 | if i+2 < l: 75 | newtokens += self.tokens[i+2:] 76 | yield InformedReorderSearchState(newtokens, self.goal, self) 77 | 78 | inputstate = ReorderSearchState("a This test . 
sentence is".split(' ')) 79 | goalstate = ReorderSearchState("This is a test sentence .".split(' ')) 80 | 81 | class DepthFirstSearchTest(unittest.TestCase): 82 | def test_solution(self): 83 | """Depth First Search""" 84 | global inputstate, goalstate 85 | search = DepthFirstSearch(inputstate ,graph=True, goal=goalstate) 86 | solution = search.searchfirst() 87 | #print "DFS:", search.traversalsize(), "nodes visited |", 88 | self.assertEqual(solution, goalstate) 89 | 90 | 91 | 92 | 93 | class BreadthFirstSearchTest(unittest.TestCase): 94 | def test_solution(self): 95 | """Breadth First Search""" 96 | global inputstate, goalstate 97 | search = BreadthFirstSearch(inputstate ,graph=True, goal=goalstate) 98 | solution = search.searchfirst() 99 | #print "BFS:", search.traversalsize(), "nodes visited |", 100 | self.assertEqual(solution, goalstate) 101 | 102 | 103 | class IterativeDeepeningTest(unittest.TestCase): 104 | def test_solution(self): 105 | """Iterative Deepening DFS""" 106 | global inputstate, goalstate 107 | search = IterativeDeepening(inputstate ,graph=True, goal=goalstate) 108 | solution = search.searchfirst() 109 | #print "It.Deep:", search.traversalsize(), "nodes visited |", 110 | self.assertEqual(solution, goalstate) 111 | 112 | 113 | 114 | informedinputstate = InformedReorderSearchState("a This test . sentence is".split(' '), goalstate) 115 | #making a simple language model 116 | 117 | class HillClimbingTest(unittest.TestCase): 118 | def test_solution(self): 119 | """Hill Climbing""" 120 | global informedinputstate 121 | search = HillClimbingSearch(informedinputstate, graph=True, minimize=True,debug=False) 122 | solution = search.searchbest() 123 | self.assertTrue(solution) #TODO: this is not a test! 124 | 125 | class BeamSearchTest(unittest.TestCase): 126 | def test_minimizeC1(self): 127 | """Beam Search needle-in-haystack problem (beam=2, minimize)""" 128 | #beamsize has been set to the minimum that yields the correct solution 129 | global informedinputstate, solution, goalstate 130 | search = BeamSearch(informedinputstate, beamsize=2, graph=True, minimize=True,debug=0, goal=goalstate) 131 | solution = search.searchbest() 132 | self.assertEqual( str(solution), str(goalstate) ) 133 | self.assertEqual( search.solutions, 1 ) 134 | 135 | 136 | def test_minimizeA1(self): 137 | """Beam Search optimisation problem A (beam=2, minimize)""" 138 | #beamsize has been set to the minimum that yields the correct solution 139 | global informedinputstate, solution, goalstate 140 | search = BeamSearch(informedinputstate, beamsize=2, graph=True, minimize=True,debug=0) 141 | solution = search.searchbest() 142 | self.assertEqual( str(solution), str(goalstate) ) 143 | self.assertTrue( search.solutions > 1 ) #everything is a solution 144 | 145 | 146 | def test_minimizeA2(self): 147 | """Beam Search optimisation problem A (beam=100, minimize)""" 148 | #if a small beamsize works, a very large one should too 149 | global informedinputstate, solution, goalstate 150 | search = BeamSearch(informedinputstate, beamsize=100, graph=True, minimize=True,debug=0) 151 | solution = search.searchbest() 152 | self.assertEqual( str(solution), str(goalstate) ) 153 | self.assertTrue( search.solutions > 1 ) #everything is a solution 154 | 155 | #def test_minimizeA3(self): 156 | # """Beam Search optimisation problem A (eager mode, beam=2, minimize)""" 157 | # #beamsize has been set to the minimum that yields the correct solution 158 | # global informedinputstate, solution, goalstate 159 | # search = 
BeamSearch(informedinputstate, beamsize=50, graph=True, minimize=True,eager=True,debug=2) 160 | # solution = search.searchbest() 161 | # self.assertEqual( str(solution), str(goalstate) ) 162 | 163 | 164 | def test_minimizeB1(self): 165 | """Beam Search optimisation problem (longer) (beam=3, minimize)""" 166 | #beamsize has been set to the minimum that yields the correct solution 167 | goalstate = InformedReorderSearchState("This is supposed to be a very long sentence .".split(' ')) 168 | informedinputstate = InformedReorderSearchState("a long very . sentence supposed to be This is".split(' '), goalstate) 169 | search = BeamSearch(informedinputstate, beamsize=3, graph=True, minimize=True,debug=False) 170 | solution = search.searchbest() 171 | self.assertEqual(str(solution),str(goalstate)) 172 | 173 | 174 | 175 | if __name__ == '__main__': 176 | unittest.main() 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /pynlpl/formats/moses.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - Moses formats 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Induction for Linguistic Knowledge Research Group 6 | # Universiteit van Tilburg 7 | # 8 | # Licensed under GPLv3 9 | # 10 | # This is a Python library classes and functions for 11 | # reading file-formats produced by Moses. Currently 12 | # contains only a class for reading a Moses PhraseTable. 13 | # (migrated to pynlpl from pbmbmt) 14 | # 15 | ############################################################### 16 | 17 | 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | from __future__ import division 21 | from __future__ import absolute_import 22 | 23 | from pynlpl.common import u 24 | 25 | import sys 26 | import bz2 27 | import gzip 28 | import datetime 29 | import socket 30 | import io 31 | 32 | try: 33 | from twisted.internet import protocol, reactor #No Python 3 support yet :( 34 | from twisted.protocols import basic 35 | twistedimported = True 36 | except: 37 | print("WARNING: Twisted could not be imported",file=sys.stderr) 38 | twistedimported = False 39 | 40 | 41 | class PhraseTable(object): 42 | def __init__(self,filename, quiet=False, reverse=False, delimiter="|||", score_column = 3, max_sourcen = 0,sourceencoder=None, targetencoder=None, scorefilter=None): 43 | """Load a phrase table from file into memory (memory intensive!)""" 44 | self.phrasetable = {} 45 | self.sourceencoder = sourceencoder 46 | self.targetencoder = targetencoder 47 | 48 | 49 | if filename.split(".")[-1] == "bz2": 50 | f = bz2.BZ2File(filename,'r') 51 | elif filename.split(".")[-1] == "gz": 52 | f = gzip.GzipFile(filename,'r') 53 | else: 54 | f = io.open(filename,'r',encoding='utf-8') 55 | linenum = 0 56 | prevsource = None 57 | targets = [] 58 | 59 | while True: 60 | if not quiet: 61 | linenum += 1 62 | if (linenum % 100000) == 0: 63 | print("Loading phrase-table: @%d" % linenum, "\t(" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ")",file=sys.stderr) 64 | line = u(f.readline()) 65 | if not line: 66 | break 67 | 68 | #split into (trimmed) segments 69 | segments = [ segment.strip() for segment in line.split(delimiter) ] 70 | 71 | if len(segments) < 3: 72 | print("Invalid line: ", line, file=sys.stderr) 73 | continue 74 | 75 | #Do we have a score associated? 
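                #(a typical Moses phrase-table line looks like
                #   source phrase ||| target phrase ||| 0.5 0.2 0.4 0.1
                # so with the default delimiter "|||" and score_column=3, the third
                # segment is split and converted into a tuple of float scores)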
76 | if score_column > 0 and len(segments) >= score_column: 77 | scores = tuple( ( float(x) for x in segments[score_column-1].strip().split() ) ) 78 | else: 79 | scores = tuple() 80 | 81 | #if align2_column > 0: 82 | # try: 83 | # null_alignments = segments[align2_column].count("()") 84 | # except: 85 | # null_alignments = 0 86 | #else: 87 | # null_alignments = 0 88 | 89 | if scorefilter: 90 | if not scorefilter(scores): continue 91 | 92 | if reverse: 93 | if max_sourcen > 0 and segments[1].count(' ') + 1 > max_sourcen: 94 | continue 95 | 96 | if self.sourceencoder: 97 | source = self.sourceencoder(segments[1]) #tuple(segments[1].split(" ")) 98 | else: 99 | source = segments[1] 100 | if self.targetencoder: 101 | target = self.targetencoder(segments[0]) #tuple(segments[0].split(" ")) 102 | else: 103 | target = segments[0] 104 | else: 105 | if max_sourcen > 0 and segments[0].count(' ') + 1 > max_sourcen: 106 | continue 107 | 108 | if self.sourceencoder: 109 | source = self.sourceencoder(segments[0]) #tuple(segments[0].split(" ")) 110 | else: 111 | source = segments[0] 112 | if self.targetencoder: 113 | target = self.targetencoder(segments[1]) #tuple(segments[1].split(" ")) 114 | else: 115 | target = segments[1] 116 | 117 | 118 | if prevsource and source != prevsource and targets: 119 | self.phrasetable[prevsource] = tuple(targets) 120 | targets = [] 121 | 122 | targets.append( (target,scores) ) 123 | prevsource = source 124 | 125 | #don't forget last one: 126 | if prevsource and targets: 127 | self.phrasetable[prevsource] = tuple(targets) 128 | 129 | f.close() 130 | 131 | 132 | def __contains__(self, phrase): 133 | """Query if a certain phrase exist in the phrase table""" 134 | if self.sourceencoder: phrase = self.sourceencoder(phrase) 135 | return (phrase in self.phrasetable) 136 | #d = self.phrasetable 137 | #for word in phrase: 138 | # if not word in d: 139 | # return False 140 | # d = d[word 141 | #return ("" in d) 142 | 143 | def __iter__(self): 144 | for phrase, targets in self.phrasetable.items(): 145 | yield phrase, targets 146 | 147 | def __len__(self): 148 | return len(self.phrasetable) 149 | 150 | def __bool__(self): 151 | return bool(self.phrasetable) 152 | 153 | def __getitem__(self, phrase): #same as translations 154 | """Return a list of (translation, scores) tuples""" 155 | if self.sourceencoder: phrase = self.sourceencoder(phrase) 156 | return self.phrasetable[phrase] 157 | 158 | 159 | #d = self.phrasetable 160 | #for word in phrase: 161 | # if not word in d: 162 | # raise KeyError 163 | # d = d[word] 164 | 165 | #if "" in d: 166 | # return d[""] 167 | #else: 168 | # raise KeyError 169 | 170 | if twistedimported: 171 | class PTProtocol(basic.LineReceiver): 172 | def lineReceived(self, phrase): 173 | try: 174 | for target,Pst,Pts,null_alignments in self.factory.phrasetable[phrase]: 175 | self.sendLine(target+"\t"+str(Pst)+"\t"+str(Pts)+"\t"+str(null_alignments)) 176 | except KeyError: 177 | self.sendLine("NOTFOUND") 178 | 179 | class PTFactory(protocol.ServerFactory): 180 | protocol = PTProtocol 181 | def __init__(self, phrasetable): 182 | self.phrasetable = phrasetable 183 | 184 | class PhraseTableServer(object): 185 | def __init__(self, phrasetable, port=65432): 186 | reactor.listenTCP(port, PTFactory(phrasetable)) 187 | reactor.run() 188 | 189 | 190 | 191 | 192 | class PhraseTableClient(object): 193 | 194 | def __init__(self,host= "localhost",port=65432): 195 | self.BUFSIZE = 4048 196 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) #Create the socket 197 | 
self.socket.settimeout(120) 198 | self.socket.connect((host, port)) #Connect to server 199 | self.lastresponse = "" 200 | self.lastquery = "" 201 | 202 | def __getitem__(self, phrase): 203 | solutions = [] 204 | if phrase != self.lastquery: 205 | self.socket.send(phrase+ "\r\n") 206 | 207 | data = b"" 208 | while not data or data[-1] != '\n': 209 | data += self.socket.recv(self.BUFSIZE) 210 | else: 211 | data = self.lastresponse 212 | 213 | data = u(data) 214 | 215 | for line in data.split('\n'): 216 | line = line.strip('\r\n') 217 | if line == "NOTFOUND": 218 | raise KeyError(phrase) 219 | elif line: 220 | fields = tuple(line.split("\t")) 221 | if len(fields) == 4: 222 | solutions.append( fields ) 223 | else: 224 | print >>sys.stderr,"PHRASETABLECLIENT WARNING: Unable to parse response line" 225 | 226 | self.lastresponse = data 227 | self.lastquery = phrase 228 | 229 | return solutions 230 | 231 | def __contains__(self, phrase): 232 | self.socket.send(phrase.encode('utf-8')+ b"\r\n")\ 233 | 234 | 235 | data = b"" 236 | while not data or data[-1] != '\n': 237 | data += self.socket.recv(self.BUFSIZE) 238 | 239 | data = u(data) 240 | 241 | for line in data.split('\n'): 242 | line = line.strip('\r\n') 243 | if line == "NOTFOUND": 244 | return False 245 | 246 | self.lastresponse = data 247 | self.lastquery = phrase 248 | 249 | return True 250 | 251 | -------------------------------------------------------------------------------- /pynlpl/formats/dutchsemcor.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - DutchSemCor 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # Modified by Ruben Izquierdo 13 | # We need also to store the TIMBL distance to the nearest neighboor 14 | # 15 | # Collection of formats for the DutchSemCor project 16 | # 17 | ############################################################### 18 | 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | from __future__ import division 22 | from __future__ import absolute_import 23 | from pynlpl.common import u 24 | import sys 25 | if sys.version < '3': 26 | from codecs import getwriter 27 | stderr = getwriter('utf-8')(sys.stderr) 28 | stdout = getwriter('utf-8')(sys.stdout) 29 | else: 30 | stderr = sys.stderr 31 | stdout = sys.stdout 32 | 33 | from pynlpl.formats.timbl import TimblOutput 34 | from pynlpl.statistics import Distribution 35 | import io 36 | 37 | 38 | class WSDSystemOutput(object): 39 | def __init__(self, filename = None): 40 | self.data = {} 41 | self.distances={} 42 | self.maxDistance=1 43 | if filename: 44 | self.load(filename) 45 | 46 | def append(self, word_id, senses,distance=0): 47 | # Commented by Ruben, there are some ID's that are repeated in all sonar test files... 48 | #assert (not word_id in self.data) 49 | if isinstance(senses, Distribution): 50 | self.data[word_id] = ( (x,y) for x,y in senses ) #PATCH UNDONE (#TODO: this is a patch, something's not right in Distribution?) 
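            # the TiMBL distance to the nearest neighbour is recorded per word_id;
            # maxDistance tracks the largest distance seen so far (see getMaxDistance below)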
51 | self.distances[word_id]=distance 52 | if distance > self.maxDistance: 53 | self.maxDistance=distance 54 | return 55 | else: 56 | assert isinstance(senses, list) and len(senses) >= 1 57 | 58 | self.distances[word_id]=distance 59 | if distance > self.maxDistance: 60 | self.maxDistance=distance 61 | 62 | 63 | if len(senses[0]) == 1: 64 | #not a (sense_id, confidence) tuple! compute equal confidence for all elements automatically: 65 | confidence = 1 / float(len(senses)) 66 | self.data[word_id] = [ (x,confidence) for x in senses ] 67 | else: 68 | fulldistr = True 69 | for sense, confidence in senses: 70 | if confidence == None: 71 | fulldistr = False 72 | break 73 | 74 | if fulldistr: 75 | self.data[word_id] = Distribution(senses) 76 | else: 77 | self.data[word_id] = senses 78 | 79 | 80 | def getMaxDistance(self): 81 | return self.maxDistance 82 | 83 | def __iter__(self): 84 | for word_id, senses in self.data.items(): 85 | yield word_id, senses,self.distances[word_id] 86 | 87 | def __len__(self): 88 | return len(self.data) 89 | 90 | def __getitem__(self, word_id): 91 | """Returns the sense distribution for the given word_id""" 92 | return self.data[word_id] 93 | 94 | def load(self, filename): 95 | f = io.open(filename,'r',encoding='utf-8') 96 | for line in f: 97 | fields = line.strip().split(" ") 98 | word_id = fields[0] 99 | if len(fields[1:]) == 1: 100 | #only one sense, no confidence expressed: 101 | self.append(word_id, [(fields[1],None)]) 102 | else: 103 | senses = [] 104 | distance=-1 105 | for i in range(1,len(fields),2): 106 | if i+1==len(fields): 107 | #The last field is the distance 108 | if fields[i][:4]=='+vdi': #Support for previous format of wsdout 109 | distance=float(fields[i][4:]) 110 | else: 111 | distance=float(fields[i]) 112 | else: 113 | if fields[i+1] == '?': fields[i+1] = None 114 | senses.append( (fields[i], fields[i+1]) ) 115 | self.append(word_id, senses,distance) 116 | 117 | f.close() 118 | 119 | def save(self, filename): 120 | f = io.open(filename,'w',encoding='utf-8') 121 | for word_id, senses,distance in self: 122 | f.write(word_id) 123 | for sense, confidence in senses: 124 | if confidence == None: confidence = "?" 125 | f.write(" " + str(sense) + " " + str(confidence)) 126 | if word_id in self.distances.keys(): 127 | f.write(' '+str(self.distances[word_id])) 128 | f.write("\n") 129 | f.close() 130 | 131 | def out(self, filename): 132 | for word_id, senses,distance in self: 133 | print(word_id,distance,end="") 134 | for sense, confidence in senses: 135 | if confidence == None: confidence = "?" 136 | print(" " + sense + " " + str(confidence),end="") 137 | print() 138 | 139 | def senses(self, bestonly=False): 140 | """Returns a list of all predicted senses""" 141 | l = [] 142 | for word_id, senses,distance in self: 143 | for sense, confidence in senses: 144 | if not sense in l: l.append(sense) 145 | if bestonly: 146 | break 147 | return l 148 | 149 | 150 | def loadfromtimbl(self, filename): 151 | timbloutput = TimblOutput(io.open(filename,'r',encoding='utf-8')) 152 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput): 153 | if distance != None: 154 | #distance='+vdi'+str(distance) 155 | distance=float(distance) 156 | if len(features) == 0: 157 | print("WARNING: Empty feature vector in " + filename + " (line " + str(i+1) + ") skipping!!",file=stderr) 158 | continue 159 | word_id = features[0] #note: this is an assumption that must be adhered to! 
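            # only instances for which TiMBL returned a class distribution are kept;
            # the (sense, confidence) distribution and the distance are stored via append()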
160 | if distribution: 161 | self.append(word_id, distribution,distance) 162 | 163 | def fromTimblToWsdout(self,fileTimbl,fileWsdout): 164 | timbloutput = TimblOutput(io.open(fileTimbl,'r',encoding='utf-8')) 165 | wsdoutfile = io.open(fileWsdout,'w',encoding='utf-8') 166 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput): 167 | if len(features) == 0: 168 | print("WARNING: Empty feature vector in " + fileTimbl + " (line " + str(i+1) + ") skipping!!",file=stderr) 169 | continue 170 | word_id = features[0] #note: this is an assumption that must be adhered to! 171 | if distribution: 172 | wsdoutfile.write(word_id+' ') 173 | for sense, confidence in distribution: 174 | if confidence== None: confidence='?' 175 | wsdoutfile.write(sense+' '+str(confidence)+' ') 176 | wsdoutfile.write(str(distance)+'\n') 177 | wsdoutfile.close() 178 | 179 | 180 | 181 | class DataSet(object): #for testsets/trainingsets 182 | def __init__(self, filename): 183 | self.sense = {} #word_id => (sense_id, lemma,pos) 184 | self.targetwords = {} #(lemma,pos) => [sense_id] 185 | f = io.open(filename,'r',encoding='utf-8') 186 | for line in f: 187 | if len(line) > 0 and line[0] != '#': 188 | fields = line.strip('\n').split('\t') 189 | word_id = fields[0] 190 | sense_id = fields[1] 191 | lemma = fields[2] 192 | pos = fields[3] 193 | self.sense[word_id] = (sense_id, lemma, pos) 194 | if not (lemma,pos) in self.targetwords: 195 | self.targetwords[(lemma,pos)] = [] 196 | if not sense_id in self.targetwords[(lemma,pos)]: 197 | self.targetwords[(lemma,pos)].append(sense_id) 198 | f.close() 199 | 200 | def __getitem__(self, word_id): 201 | return self.sense[self._sanitize(word_id)] 202 | 203 | def getsense(self, word_id): 204 | return self.sense[self._sanitize(word_id)][0] 205 | 206 | def getlemma(self, word_id): 207 | return self.sense[self._sanitize(word_id)][1] 208 | 209 | def getpos(self, word_id): 210 | return self.sense[self._sanitize(word_id)][2] 211 | 212 | def _sanitize(self, word_id): 213 | return u(word_id) 214 | 215 | def __contains__(self, word_id): 216 | return (self._sanitize(word_id) in self.sense) 217 | 218 | 219 | def __iter__(self): 220 | for word_id, (sense, lemma, pos) in self.sense.items(): 221 | yield (word_id, sense, lemma, pos) 222 | 223 | def senses(self, lemma, pos): 224 | return self.targetwords[(lemma,pos)] 225 | -------------------------------------------------------------------------------- /pynlpl/formats/sonar.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - Simple Read library for D-Coi/SoNaR format 3 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 4 | # http://ilk.uvt.nl/~mvgompel 5 | # proycon AT anaproy DOT nl 6 | # 7 | # Licensed under GPLv3 8 | # 9 | # This library facilitates parsing and reading corpora in 10 | # the SoNaR/D-Coi format. 
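# An illustrative (hedged) sketch of the kind of token markup the CorpusDocument class
# below extracts with its regular expressions: a <w> element whose xml:id follows the
# pattern "DOCID.p.N.s.N.w.N" (e.g. "WR-P-E-A-0000001.p.1.s.1.w.1"), with optional
# pos="..." and lemma="..." attributes and the word form as element text.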
11 | # 12 | #---------------------------------------------------------------- 13 | 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | from __future__ import division 17 | from __future__ import absolute_import 18 | 19 | 20 | import io 21 | import re 22 | import glob 23 | import os.path 24 | import sys 25 | 26 | from lxml import etree as ElementTree 27 | 28 | if sys.version < '3': 29 | from StringIO import StringIO 30 | else: 31 | from io import StringIO 32 | 33 | 34 | namespaces = { 35 | 'dcoi': "http://lands.let.ru.nl/projects/d-coi/ns/1.0", 36 | 'standalone':"http://ilk.uvt.nl/dutchsemcor-standalone", 37 | 'dsc':"http://ilk.uvt.nl/dutchsemcor", 38 | 'xml':"http://www.w3.org/XML/1998/namespace" 39 | } 40 | 41 | class CorpusDocument(object): 42 | """This class represent one document/text of the Corpus (read-only)""" 43 | 44 | def __init__(self, filename, encoding = 'iso-8859-15'): 45 | self.filename = filename 46 | self.id = os.path.basename(filename).split(".")[0] 47 | self.f = io.open(filename,'r', encoding=encoding) 48 | self.metadata = {} 49 | 50 | def _parseimdi(self,line): 51 | r = re.compile('(.*)') 52 | matches = r.findall(line) 53 | if matches: 54 | self.metadata['title'] = matches[0] 55 | if not 'date' in self.metadata: 56 | r = re.compile('(.*)') 57 | matches = r.findall(line) 58 | if matches: 59 | self.metadata['date'] = matches[0] 60 | 61 | 62 | def __iter__(self): 63 | """Iterate over all words, a four-tuple (word,id,pos,lemma), in the document""" 64 | 65 | r = re.compile('(.*)') 66 | for line in self.f.readlines(): 67 | matches = r.findall(line) 68 | for id, attribs, word in matches: 69 | pos = lemma = None 70 | m = re.findall('pos="([^"]+)"', attribs) 71 | if m: pos = m[0] 72 | 73 | m = re.findall('lemma="([^"]+)"', attribs) 74 | if m: lemma = m[0] 75 | 76 | yield word, id, pos, lemma 77 | if line.find('imdi:') != -1: 78 | self._parseimdi(line) 79 | 80 | def words(self): 81 | #alias 82 | return iter(self) 83 | 84 | 85 | def sentences(self): 86 | """Iterate over all sentences (sentence_id, sentence) in the document, sentence is a list of 4-tuples (word,id,pos,lemma)""" 87 | prevp = 0 88 | prevs = 0 89 | sentence = []; 90 | sentence_id = "" 91 | for word, id, pos, lemma in iter(self): 92 | try: 93 | doc_id, ptype, p, s, w = re.findall('([\w\d-]+)\.(p|head)\.(\d+)\.s\.(\d+)\.w\.(\d+)',id)[0] 94 | if ((p != prevp) or (s != prevs)) and sentence: 95 | yield sentence_id, sentence 96 | sentence = [] 97 | sentence_id = doc_id + '.' + ptype + '.' + str(p) + '.s.' + str(s) 98 | prevp = p 99 | except IndexError: 100 | doc_id, s, w = re.findall('([\w\d-]+)\.s\.(\d+)\.w\.(\d+)',id)[0] 101 | if s != prevs and sentence: 102 | yield sentence_id, sentence 103 | sentence = [] 104 | sentence_id = doc_id + '.s.' + str(s) 105 | sentence.append( (word,id,pos,lemma) ) 106 | prevs = s 107 | if sentence: 108 | yield sentence_id, sentence 109 | 110 | def paragraphs(self, with_id = False): 111 | """Extracts paragraphs, returns list of plain-text(!) paragraphs""" 112 | prevp = 0 113 | partext = [] 114 | for word, id, pos, lemma in iter(self): 115 | doc_id, ptype, p, s, w = re.findall('([\w\d-]+)\.(p|head)\.(\d+)\.s\.(\d+)\.w\.(\d+)',id)[0] 116 | if prevp != p and partext: 117 | yield ( doc_id + "." + ptype + "." + prevp , " ".join(partext) ) 118 | partext = [] 119 | partext.append(word) 120 | prevp = p 121 | if partext: 122 | yield (doc_id + "." + ptype + "." 
+ prevp, " ".join(partext) ) 123 | 124 | class Corpus: 125 | def __init__(self,corpusdir, extension = 'pos', restrict_to_collection = "", conditionf=lambda x: True, ignoreerrors=False): 126 | self.corpusdir = corpusdir 127 | self.extension = extension 128 | self.restrict_to_collection = restrict_to_collection 129 | self.conditionf = conditionf 130 | self.ignoreerrors = ignoreerrors 131 | 132 | def __iter__(self): 133 | if not self.restrict_to_collection: 134 | for f in glob.glob(self.corpusdir+"/*." + self.extension): 135 | if self.conditionf(f): 136 | try: 137 | yield CorpusDocument(f) 138 | except: 139 | print("Error, unable to parse " + f,file=sys.stderr) 140 | if not self.ignoreerrors: 141 | raise 142 | for d in glob.glob(self.corpusdir+"/*"): 143 | if (not self.restrict_to_collection or self.restrict_to_collection == os.path.basename(d)) and (os.path.isdir(d)): 144 | for f in glob.glob(d+ "/*." + self.extension): 145 | if self.conditionf(f): 146 | try: 147 | yield CorpusDocument(f) 148 | except: 149 | print("Error, unable to parse " + f,file=sys.stderr) 150 | if not self.ignoreerrors: 151 | raise 152 | 153 | 154 | ####################################################### 155 | 156 | def ns(namespace): 157 | """Resolves the namespace identifier to a full URL""" 158 | global namespaces 159 | return '{'+namespaces[namespace]+'}' 160 | 161 | 162 | class CorpusFiles(Corpus): 163 | def __iter__(self): 164 | if not self.restrict_to_collection: 165 | for f in glob.glob(self.corpusdir+"/*." + self.extension): 166 | if self.conditionf(f): 167 | yield f 168 | for d in glob.glob(self.corpusdir+"/*"): 169 | if (not self.restrict_to_collection or self.restrict_to_collection == os.path.basename(d)) and (os.path.isdir(d)): 170 | for f in glob.glob(d+ "/*." + self.extension): 171 | if self.conditionf(f): 172 | yield f 173 | 174 | 175 | class CorpusX(Corpus): 176 | def __iter__(self): 177 | if not self.restrict_to_collection: 178 | for f in glob.glob(self.corpusdir+"/*." + self.extension): 179 | if self.conditionf(f): 180 | try: 181 | yield CorpusDocumentX(f) 182 | except: 183 | print("Error, unable to parse " + f,file=sys.stderr) 184 | if not self.ignoreerrors: 185 | raise 186 | for d in glob.glob(self.corpusdir+"/*"): 187 | if (not self.restrict_to_collection or self.restrict_to_collection == os.path.basename(d)) and (os.path.isdir(d)): 188 | for f in glob.glob(d+ "/*." 
+ self.extension): 189 | if self.conditionf(f): 190 | try: 191 | yield CorpusDocumentX(f) 192 | except: 193 | print("Error, unable to parse " + f,file=sys.stderr) 194 | if not self.ignoreerrors: 195 | raise 196 | 197 | 198 | 199 | class CorpusDocumentX: 200 | """This class represent one document/text of the Corpus, loaded into memory at once and retaining the full structure""" 201 | 202 | def __init__(self, filename, tree = None, index=True ): 203 | global namespaces 204 | self.filename = filename 205 | if not tree: 206 | self.tree = ElementTree.parse(self.filename) 207 | self.committed = True 208 | elif isinstance(tree, ElementTree._Element): 209 | self.tree = tree 210 | self.committed = False 211 | 212 | #Grab root element and determine if we run inline or standalone 213 | self.root = self.xpath("/dcoi:DCOI") 214 | if self.root: 215 | self.root = self.root[0] 216 | self.inline = True 217 | else: 218 | raise Exception("Not in DCOI/SoNaR format!") 219 | #self.root = self.xpath("/standalone:text") 220 | #self.inline = False 221 | #if not self.root: 222 | # raise FormatError() 223 | 224 | #build an index 225 | self.index = {} 226 | if index: 227 | self._index(self.root) 228 | 229 | def _index(self,node): 230 | if ns('xml') + 'id' in node.attrib: 231 | self.index[node.attrib[ns('xml') + 'id']] = node 232 | for subnode in node: #TODO: can we do this with xpath instead? 233 | self._index(subnode) 234 | 235 | def validate(self, formats_dir="../formats/"): 236 | """checks if the document is valid""" 237 | #TODO: download XSD from web 238 | if self.inline: 239 | xmlschema = ElementTree.XMLSchema(ElementTree.parse(StringIO("\n".join(open(formats_dir+"dcoi-dsc.xsd").readlines())))) 240 | xmlschema.assertValid(self.tree) 241 | #return xmlschema.validate(self) 242 | else: 243 | xmlschema = ElementTree.XMLSchema(ElementTree.parse(StringIO("\n".join(open(formats_dir+"dutchsemcor-standalone.xsd").readlines())))) 244 | xmlschema.assertValid(self.tree) 245 | #return xmlschema.validate(self) 246 | 247 | def xpath(self, expression): 248 | """Executes an xpath expression using the correct namespaces""" 249 | global namespaces 250 | return self.tree.xpath(expression, namespaces=namespaces) 251 | 252 | 253 | def __exists__(self, id): 254 | return (id in self.index) 255 | 256 | def __getitem__(self, id): 257 | return self.index[id] 258 | 259 | 260 | def paragraphs(self, node=None): 261 | """iterate over paragraphs""" 262 | if node == None: node = self 263 | return node.xpath("//dcoi:p") 264 | 265 | def sentences(self, node=None): 266 | """iterate over sentences""" 267 | if node == None: node = self 268 | return node.xpath("//dcoi:s") 269 | 270 | def words(self,node=None): 271 | """iterate over words""" 272 | if node == None: node = self 273 | return node.xpath("//dcoi:w") 274 | 275 | def save(self, filename=None, encoding='iso-8859-15'): 276 | if not filename: filename = self.filename 277 | self.tree.write(filename, encoding=encoding, method='xml', pretty_print=True, xml_declaration=True) 278 | 279 | 280 | -------------------------------------------------------------------------------- /pynlpl/formats/cql.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - Corpus Query Language (CQL) 3 | # by Maarten van Gompel 4 | # Centre for Language Studies 5 | # Radboud University Nijmegen 6 | # http://proycon.github.com/folia 7 | # http://www.github.com/proycon/pynlpl 8 | # proycon AT anaproy DOT nl 9 | # 10 
| # Parser and interpreter for a basic subset of the Corpus Query Language 11 | # 12 | # Licensed under GPLv3 13 | # 14 | #---------------------------------------------------------------- 15 | 16 | from __future__ import print_function, unicode_literals, division, absolute_import 17 | 18 | from pynlpl.fsa import State, NFA 19 | import re 20 | import sys 21 | 22 | OPERATORS = ('=','!=') 23 | MAXINTERVAL = 99 24 | 25 | class SyntaxError(Exception): 26 | pass 27 | 28 | class ValueExpression(object): 29 | def __init__(self, values): 30 | self.values = values #disjunction 31 | 32 | @staticmethod 33 | def parse(s,i): 34 | values = "" 35 | assert s[i] == '"' 36 | i += 1 37 | while not (s[i] == '"' and s[i-1] != "\\"): 38 | values += s[i] 39 | i += 1 40 | values = values.split("|") 41 | return ValueExpression(values), i+1 42 | 43 | def __len__(self): 44 | return len(self.values) 45 | 46 | def __iter__(self): 47 | for x in self.values: 48 | yield x 49 | 50 | def __getitem__(self,index): 51 | return self.values[index] 52 | 53 | class AttributeExpression(object): 54 | def __init__(self, attribute, operator, valueexpression): 55 | self.attribute = attribute 56 | self.operator = operator 57 | self.valueexpr = valueexpression 58 | 59 | @staticmethod 60 | def parse(s,i): 61 | while s[i] == " ": 62 | i +=1 63 | if s[i] == '"': 64 | #no attribute and no operator, use defaults: 65 | attribute = "word" 66 | operator = "=" 67 | else: 68 | attribute = "" 69 | while s[i] not in (' ','!','>','<','='): 70 | attribute += s[i] 71 | i += 1 72 | if not attribute: 73 | raise SyntaxError("Expected attribute name, none found") 74 | operator = "" 75 | while s[i] in (' ','!','>','<','='): 76 | if s[i] != ' ': 77 | operator += s[i] 78 | i += 1 79 | if operator not in OPERATORS: 80 | raise SyntaxError("Expected operator, got '" + operator + "'") 81 | if s[i] != '"': 82 | raise SyntaxError("Expected start of value expression (doublequote) in position " + str(i) + ", got " + s[i]) 83 | valueexpr, i = ValueExpression.parse(s,i) 84 | return AttributeExpression(attribute,operator, valueexpr), i 85 | 86 | class TokenExpression(object): 87 | def __init__(self, attribexprs=[], interval=None): 88 | self.attribexprs = attribexprs 89 | self.interval = interval 90 | 91 | @staticmethod 92 | def parse(s,i): 93 | attribexprs = [] 94 | while s[i] == " ": 95 | i +=1 96 | if s[i] == '"': 97 | attribexpr,i = AttributeExpression.parse(s,i) 98 | attribexprs.append(attribexpr) 99 | elif s[i] == "[": 100 | i += 1 101 | while True: 102 | while s[i] == " ": 103 | i +=1 104 | if s[i] == "&": 105 | attribexpr,i = AttributeExpression.parse(s,i+1) 106 | attribexprs.append(attribexpr) 107 | elif s[i] == "]": 108 | i += 1 109 | break 110 | elif not attribexprs: 111 | attribexpr,i = AttributeExpression.parse(s,i) 112 | attribexprs.append(attribexpr) 113 | else: 114 | raise SyntaxError("Unexpected char whilst parsing token expression, position " + str(i) + ": " + s[i]) 115 | else: 116 | raise SyntaxError("Expected token expression starting with either \" or [, got: " + s[i]) 117 | 118 | if i == len(s): 119 | interval = None #end of query! 
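# A token expression may be followed by a quantifier: "{n}", "{n,m}" (or "{n-m}"), "?", "+" or "*".
# The branches below normalise each form to a (min, max) interval; MAXINTERVAL (99) stands in for an unbounded maximum.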
120 | elif s[i] == "{": 121 | #interval expression, find end: 122 | interval = None 123 | for j in range(i+1, len(s)): 124 | if s[j] == "}": 125 | interval = s[i+1:j] 126 | 127 | if interval is None: 128 | raise SyntaxError("Interval expression started but no end-brace found") 129 | 130 | i += len(interval) + 2 131 | 132 | try: 133 | if ',' in interval: 134 | interval = tuple(int(x) for x in interval.split(",")) 135 | if len(interval) != 2: 136 | raise SyntaxError("Invalid interval: " + interval) 137 | elif '-' in interval: #alternative 138 | interval = tuple(int(x) for x in interval.split("-")) 139 | if len(interval) != 2: 140 | raise SyntaxError("Invalid interval: " + interval) 141 | else: 142 | interval = (int(interval),int(interval)) 143 | except ValueError: 144 | raise SyntaxError("Invalid interval: " + interval) 145 | elif s[i] == "?": 146 | interval = (0,1) 147 | i += 1 148 | elif s[i] == "+": 149 | interval = (1,MAXINTERVAL) 150 | i += 1 151 | elif s[i] == "*": 152 | interval = (0,MAXINTERVAL) 153 | i += 1 154 | else: 155 | interval = None 156 | 157 | return TokenExpression(attribexprs,interval),i 158 | 159 | 160 | def __len__(self): 161 | return len(self.attribexprs) 162 | 163 | def __iter__(self): 164 | for x in self.attribexprs: 165 | yield x 166 | 167 | def __getitem__(self,index): 168 | return self.attribexprs[index] 169 | 170 | def nfa(self, nextstate): 171 | """Returns an initial state for an NFA""" 172 | if self.interval: 173 | mininterval, maxinterval = self.interval #pylint: disable=unpacking-non-sequence 174 | nextstate2 = nextstate 175 | for i in range(maxinterval): 176 | state = State(transitions=[(self,self.match, nextstate2)]) 177 | if i+1> mininterval: 178 | if nextstate is not nextstate2: state.transitions.append((self,self.match, nextstate)) 179 | if maxinterval == MAXINTERVAL: 180 | state.epsilon.append(state) 181 | break 182 | nextstate2 = state 183 | return state 184 | else: 185 | state = State(transitions=[(self,self.match, nextstate)]) 186 | return state 187 | 188 | 189 | def match(self, value): 190 | match = False 191 | for _, attribexpr in enumerate(self): 192 | annottype = attribexpr.attribute 193 | if annottype == 'text': annottype = 'word' 194 | if attribexpr.operator == "!=": 195 | negate = True 196 | elif attribexpr.operator == "=": 197 | negate = False 198 | else: 199 | raise Exception("Unexpected operator " + attribexpr.operator) 200 | 201 | if len(attribexpr.valueexpr) > 1: 202 | expr = re.compile("^(" + "|".join(attribexpr.valueexpr) + ")$") 203 | else: 204 | expr = re.compile("^" + attribexpr.valueexpr[0] + '$') 205 | match = (expr.match(value[annottype]) is not None) 206 | if negate: 207 | match = not match 208 | if not match: 209 | return False 210 | return True 211 | 212 | 213 | 214 | class Query(object): 215 | def __init__(self, s): 216 | self.tokenexprs = [] 217 | i = 0 218 | l = len(s) 219 | while i < l: 220 | if s[i] == " ": 221 | i += 1 222 | else: 223 | tokenexpr,i = TokenExpression.parse(s,i) 224 | self.tokenexprs.append(tokenexpr) 225 | 226 | def __len__(self): 227 | return len(self.tokenexprs) 228 | 229 | def __iter__(self): 230 | for x in self.tokenexprs: 231 | yield x 232 | 233 | def __getitem__(self,index): 234 | return self.tokenexprs[index] 235 | 236 | def nfa(self): 237 | """convert the expression into an NFA""" 238 | finalstate = State(final=True) 239 | nextstate = finalstate 240 | for tokenexpr in reversed(self): 241 | state = tokenexpr.nfa(nextstate) 242 | nextstate = state 243 | return NFA(state) 244 | 245 | 246 | def 
__call__(self, tokens, debug=False): 247 | """Execute the CQL expression, pass a list of tokens/annotations using keyword arguments: word, pos, lemma, etc""" 248 | 249 | if not tokens: 250 | raise Exception("Pass a list of tokens/annotation using keyword arguments! (word,pos,lemma, or others)") 251 | 252 | #convert the expression into an NFA 253 | nfa = self.nfa() 254 | if debug: 255 | print(repr(nfa), file=sys.stderr) 256 | 257 | return list(nfa.find(tokens,debug)) 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | def cql2fql(cq): 266 | fq = "SELECT FOR SPAN " 267 | if not isinstance(cq, Query): 268 | cq = Query(cq) 269 | 270 | for i, token in enumerate(cq): 271 | if i > 0: fq += " & " 272 | fq += "w" 273 | if token.interval: 274 | fq += " {" + str(token.interval[0]) + "," + str(token.interval[1])+ "} " 275 | else: 276 | fq += " " 277 | if token.attribexprs: 278 | fq += "WHERE " 279 | for j, attribexpr in enumerate(token): 280 | if j > 0: 281 | fq += " AND " 282 | fq += "(" 283 | if attribexpr.operator == "!=": 284 | operator = "NOTMATCHES" 285 | elif attribexpr.operator == "=": 286 | operator = "MATCHES" 287 | else: 288 | raise Exception("Invalid operator: " + attribexpr.operator) 289 | if attribexpr.attribute in ("word","text"): 290 | if len(attribexpr.valueexpr) > 1: 291 | fq += "text " + operator + " \"^(" + "|".join(attribexpr.valueexpr) + ")$\" " 292 | else: 293 | fq += "text " + operator + " \"^" + attribexpr.valueexpr[0] + "$\" " 294 | else: 295 | annottype = attribexpr.attribute 296 | if annottype == "tag": 297 | annottype = "pos" 298 | elif annottype == "lempos": 299 | raise Exception("lempos not supported in CQL to FQL conversion, use pos and lemma separately") 300 | fq += annottype + " HAS class " 301 | if len(attribexpr.valueexpr) > 1: 302 | fq += operator + " \"^(" + "|".join(attribexpr.valueexpr) + ")$\" " 303 | else: 304 | fq += operator + " \"^" + attribexpr.valueexpr[0] + "$\" " 305 | fq += ")" 306 | 307 | return fq 308 | -------------------------------------------------------------------------------- /pynlpl/formats/giza.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - WordAlignment Library for reading GIZA++ A3 files 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # In part using code by Sander Canisius 11 | # 12 | # Licensed under GPLv3 13 | # 14 | # 15 | # This library reads GIZA++ A3 files. It contains three classes over which 16 | # you can iterate to obtain (sourcewords,targetwords,alignment) pairs. 
17 | # 18 | # - WordAlignment - Reads target-source.A3.final files, in which each source word is aligned to one target word 19 | # - MultiWordAlignment - Reads source-target.A3.final files, in which each source word may be aligned to multiple target target words 20 | # - IntersectionAlignment - Computes the intersection between the above two alignments 21 | # 22 | # 23 | ############################################################### 24 | 25 | from __future__ import print_function 26 | from __future__ import unicode_literals 27 | from __future__ import division 28 | from __future__ import absolute_import 29 | 30 | from pynlpl.common import u 31 | 32 | import bz2 33 | import gzip 34 | import copy 35 | import io 36 | from sys import stderr 37 | 38 | class GizaSentenceAlignment(object): 39 | 40 | def __init__(self, sourceline, targetline, index): 41 | self.index = index 42 | self.alignment = [] 43 | if sourceline: 44 | self.source = self._parsesource(sourceline.strip()) 45 | else: 46 | self.source = [] 47 | self.target = targetline.strip().split(' ') 48 | 49 | def _parsesource(self, line): 50 | cleanline = "" 51 | 52 | inalignment = False 53 | begin = 0 54 | sourceindex = 0 55 | 56 | for i in range(0,len(line)): 57 | if line[i] == ' ' or i == len(line) - 1: 58 | if i == len(line) - 1: 59 | offset = 1 60 | else: 61 | offset = 0 62 | 63 | word = line[begin:i+offset] 64 | if word == '})': 65 | inalignment = False 66 | begin = i + 1 67 | continue 68 | elif word == "({": 69 | inalignment = True 70 | begin = i + 1 71 | continue 72 | if word.strip() and word != 'NULL': 73 | if not inalignment: 74 | sourceindex += 1 75 | if cleanline: cleanline += " " 76 | cleanline += word 77 | else: 78 | targetindex = int(word) 79 | self.alignment.append( (sourceindex-1, targetindex-1) ) 80 | begin = i + 1 81 | 82 | return cleanline.split(' ') 83 | 84 | 85 | def intersect(self,other): 86 | if other.target != self.source: 87 | print("GizaSentenceAlignment.intersect(): Mismatch between self.source and other.target: " + repr(self.source) + " -- vs -- " + repr(other.target),file=stderr) 88 | return None 89 | 90 | intersection = copy.copy(self) 91 | intersection.alignment = [] 92 | 93 | for sourceindex, targetindex in self.alignment: 94 | for targetindex2, sourceindex2 in other.alignment: 95 | if targetindex2 == targetindex and sourceindex2 == sourceindex: 96 | intersection.alignment.append( (sourceindex, targetindex) ) 97 | 98 | return intersection 99 | 100 | def __repr__(self): 101 | s = " ".join(self.source)+ " ||| " 102 | s += " ".join(self.target) + " ||| " 103 | for S,T in sorted(self.alignment): 104 | s += self.source[S] + "->" + self.target[T] + " ; " 105 | return s 106 | 107 | 108 | def getalignedtarget(self, index): 109 | """Returns target range only if source index aligns to a single consecutive range of target tokens.""" 110 | targetindices = [] 111 | target = None 112 | foundindex = -1 113 | for sourceindex, targetindex in self.alignment: 114 | if sourceindex == index: 115 | targetindices.append(targetindex) 116 | if len(targetindices) > 1: 117 | for i in range(1,len(targetindices)): 118 | if abs(targetindices[i] - targetindices[i-1]) != 1: 119 | break # not consecutive 120 | foundindex = (min(targetindices), max(targetindices)) 121 | target = ' '.join(self.target[min(targetindices):max(targetindices)+1]) 122 | elif targetindices: 123 | foundindex = targetindices[0] 124 | target = self.target[foundindex] 125 | 126 | return target, foundindex 127 | 128 | class GizaModel(object): 129 | def __init__(self, 
filename, encoding= 'utf-8'): 130 | if filename.split(".")[-1] == "bz2": 131 | self.f = bz2.BZ2File(filename,'r') 132 | elif filename.split(".")[-1] == "gz": 133 | self.f = gzip.GzipFile(filename,'r') 134 | else: 135 | self.f = io.open(filename,'r',encoding=encoding) 136 | self.nextlinebuffer = None 137 | 138 | 139 | def __iter__(self): 140 | self.f.seek(0) 141 | nextlinebuffer = u(next(self.f)) 142 | sentenceindex = 0 143 | 144 | done = False 145 | while not done: 146 | sentenceindex += 1 147 | line = nextlinebuffer 148 | if line[0] != '#': 149 | raise Exception("Error parsing GIZA++ Alignment at sentence " + str(sentenceindex) + ", expected new fragment, found: " + repr(line)) 150 | 151 | targetline = u(next(self.f)) 152 | sourceline = u(next(self.f)) 153 | 154 | yield GizaSentenceAlignment(sourceline, targetline, sentenceindex) 155 | 156 | try: 157 | nextlinebuffer = u(next(self.f)) 158 | except StopIteration: 159 | done = True 160 | 161 | 162 | def __del__(self): 163 | if self.f: self.f.close() 164 | 165 | 166 | #------------------ OLD ------------------- 167 | 168 | def parseAlignment(tokens): #by Sander Canisius 169 | assert tokens.pop(0) == "NULL" 170 | while tokens.pop(0) != "})": 171 | pass 172 | 173 | while tokens: 174 | word = tokens.pop(0) 175 | assert tokens.pop(0) == "({" 176 | positions = [] 177 | token = tokens.pop(0) 178 | while token != "})": 179 | positions.append(int(token)) 180 | token = tokens.pop(0) 181 | 182 | yield word, positions 183 | 184 | 185 | class WordAlignment: 186 | """Target to Source alignment: reads target-source.A3.final files, in which each source word is aligned to one target word""" 187 | 188 | def __init__(self,filename, encoding=False): 189 | """Open a target-source GIZA++ A3 file. The file may be bzip2 compressed. If an encoding is specified, proper unicode strings will be returned""" 190 | 191 | if filename.split(".")[-1] == "bz2": 192 | self.stream = bz2.BZ2File(filename,'r') 193 | else: 194 | self.stream = open(filename) 195 | self.encoding = encoding 196 | 197 | 198 | def __del__(self): 199 | self.stream.close() 200 | 201 | def __iter__(self): #by Sander Canisius 202 | line = self.stream.readline() 203 | while line: 204 | assert line.startswith("#") 205 | src = self.stream.readline().split() 206 | trg = [] 207 | alignment = [None for i in xrange(len(src))] 208 | 209 | for i, (targetWord, positions) in enumerate(parseAlignment(self.stream.readline().split())): 210 | 211 | trg.append(targetWord) 212 | 213 | for pos in positions: 214 | assert alignment[pos - 1] is None 215 | alignment[pos - 1] = i 216 | 217 | if self.encoding: 218 | yield [ u(w,self.encoding) for w in src ], [ u(w,self.encoding) for w in trg ], alignment 219 | else: 220 | yield src, trg, alignment 221 | 222 | line = self.stream.readline() 223 | 224 | 225 | def targetword(self, index, targetwords, alignment): 226 | """Return the aligned targetword for a specified index in the source words""" 227 | if alignment[index]: 228 | return targetwords[alignment[index]] 229 | else: 230 | return None 231 | 232 | def reset(self): 233 | self.stream.seek(0) 234 | 235 | class MultiWordAlignment: 236 | """Source to Target alignment: reads source-target.A3.final files, in which each source word may be aligned to multiple target words (adapted from code by Sander Canisius)""" 237 | 238 | def __init__(self,filename, encoding = False): 239 | """Load a target-source GIZA++ A3 file. The file may be bzip2 compressed. 
If an encoding is specified, proper unicode strings will be returned""" 240 | 241 | if filename.split(".")[-1] == "bz2": 242 | self.stream = bz2.BZ2File(filename,'r') 243 | else: 244 | self.stream = open(filename) 245 | self.encoding = encoding 246 | 247 | def __del__(self): 248 | self.stream.close() 249 | 250 | def __iter__(self): 251 | line = self.stream.readline() 252 | while line: 253 | assert line.startswith("#") 254 | trg = self.stream.readline().split() 255 | src = [] 256 | alignment = [] 257 | 258 | for i, (word, positions) in enumerate(parseAlignment(self.stream.readline().split())): 259 | src.append(word) 260 | alignment.append( [ p - 1 for p in positions ] ) 261 | 262 | 263 | if self.encoding: 264 | yield [ unicode(w,self.encoding) for w in src ], [ unicode(w,self.encoding) for w in trg ], alignment 265 | else: 266 | yield src, trg, alignment 267 | 268 | line = self.stream.readline() 269 | 270 | def targetword(self, index, targetwords, alignment): 271 | """Return the aligned targeword for a specified index in the source words. Multiple words are concatenated together with a space in between""" 272 | return " ".join(targetwords[alignment[index]]) 273 | 274 | def targetwords(self, index, targetwords, alignment): 275 | """Return the aligned targetwords for a specified index in the source words""" 276 | return [ targetwords[x] for x in alignment[index] ] 277 | 278 | def reset(self): 279 | self.stream.seek(0) 280 | 281 | 282 | class IntersectionAlignment: 283 | 284 | def __init__(self,source2target,target2source,encoding=False): 285 | self.s2t = MultiWordAlignment(source2target, encoding) 286 | self.t2s = WordAlignment(target2source, encoding) 287 | self.encoding = encoding 288 | 289 | def __iter__(self): 290 | for (src, trg, alignment), (revsrc, revtrg, revalignment) in zip(self.s2t,self.t2s): #will take unnecessary memory in Python 2.x, optimal in Python 3 291 | if src != revsrc or trg != revtrg: 292 | raise Exception("Files are not identical!") 293 | else: 294 | #keep only those alignments that are present in both 295 | intersection = [] 296 | for i, x in enumerate(alignment): 297 | if revalignment[i] in x: 298 | intersection.append(revalignment[i]) 299 | else: 300 | intersection.append(None) 301 | 302 | yield src, trg, intersection 303 | 304 | def reset(self): 305 | self.s2t.reset() 306 | self.t2s.reset() 307 | 308 | -------------------------------------------------------------------------------- /pynlpl/tools/frogwrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | #Frog Wrapper with XML input and FoLiA output support 6 | 7 | 8 | from __future__ import print_function, unicode_literals, division, absolute_import 9 | 10 | import getopt 11 | import lxml.etree 12 | import sys 13 | import os 14 | import codecs 15 | 16 | if __name__ == "__main__": 17 | sys.path.append(sys.path[0] + '/../..') 18 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 
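# The two lines above put the repository root (two directories up from tools/) on sys.path, and on PYTHONPATH for
# any child processes, so that running this script straight from a source checkout finds the local pynlpl package.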
19 | 20 | 21 | import pynlpl.formats.folia as folia 22 | from pynlpl.clients.frogclient import FrogClient 23 | 24 | def legacyout(i, word,lemma,morph,pos): 25 | if word: 26 | out = str(i + 1) + "\t" + word + "\t" + lemma + "\t" + morph + "\t" + pos 27 | print(out.encode('utf-8')) 28 | else: 29 | print() 30 | 31 | def usage(): 32 | print >>sys.stderr,"frogwrapper.py [options]" 33 | print >>sys.stderr,"------------------------------------------------------" 34 | print >>sys.stderr,"Input file:" 35 | print >>sys.stderr,"\t--txt=[file] Plaintext input" 36 | print >>sys.stderr,"\t--xml=[file] XML Input" 37 | print >>sys.stderr,"\t--folia=[file] FoLiA XML Input" 38 | print >>sys.stderr,"Frog settings:" 39 | print >>sys.stderr,"\t-p [port] Port the Frog server is running on" 40 | print >>sys.stderr,"Output type:" 41 | print >>sys.stderr,"\t--id=[ID] ID for outputted FoLiA XML Document" 42 | print >>sys.stderr,"\t--legacy Use legacy columned output instead of FoLiA" 43 | print >>sys.stderr,"\t-o Write output to input file (only works for --folia)" 44 | print >>sys.stderr,"XML Input:" 45 | print >>sys.stderr,"\t--selectsen=[expr] Use xpath expression to select sentences" 46 | print >>sys.stderr,"\t--selectpar=[expr] Use xpath expression to select paragraphs" 47 | print >>sys.stderr,"\t--idattrib=[attrb] Copy ID from this attribute" 48 | print >>sys.stderr,"Text Input:" 49 | print >>sys.stderr,"\t-N No structure" 50 | print >>sys.stderr,"\t-S One sentence per line (strict)" 51 | print >>sys.stderr,"\t-P One paragraph per line" 52 | print >>sys.stderr,"\t-I Value in first column (tab seperated) is ID!" 53 | print >>sys.stderr,"\t-E [encoding] Encoding of input file (default: utf-8)" 54 | 55 | try: 56 | opts, files = getopt.getopt(sys.argv[1:], "hSPINEp:o", ["txt=","xml=", "folia=","id=",'legacy','tok','selectsen=','selectpar=','idattrib=']) 57 | except getopt.GetoptError as err: 58 | # print help information and exit: 59 | print(str(err)) 60 | usage() 61 | sys.exit(1) 62 | 63 | 64 | textfile = xmlfile = foliafile = None 65 | foliaid = 'UNTITLED' 66 | legacy = None 67 | tok = False 68 | idinfirstcolumn = False 69 | encoding = 'utf-8' 70 | mode='s' 71 | xpathselect = '' 72 | idattrib='' 73 | port = None 74 | save = False 75 | 76 | for o, a in opts: 77 | if o == "-h": 78 | usage() 79 | sys.exit(0) 80 | elif o == "-I": 81 | idinfirstcolumn = True 82 | elif o == "-S": 83 | mode = 's' 84 | elif o == "-P": 85 | mode = 'p' 86 | elif o == "-p": 87 | port = int(a) 88 | elif o == "-N": 89 | mode = 'n' 90 | elif o == "-E": 91 | encoding = a 92 | elif o == "--selectsen": 93 | mode='s' 94 | xpathselect = a 95 | elif o == "--selectpar": 96 | mode='p' 97 | xpathselect = a 98 | elif o == "--idattrib": 99 | idattrib = a 100 | elif o == "--txt": 101 | textfile = a 102 | elif o == "--xml": 103 | xmlfile = a 104 | elif o == "--folia": 105 | foliafile = a 106 | elif o == "--id": 107 | foliaid = a #ID 108 | elif o == "-o": 109 | save = True 110 | elif o == "--legacy": 111 | legacy = True 112 | elif o == "--tok": 113 | tok = True 114 | else: 115 | print >>sys.stderr, "ERROR: Unknown option:",o 116 | sys.exit(1) 117 | 118 | if not port: 119 | print >> sys.stderr,"ERROR: No port specified to connect to Frog server" 120 | sys.exit(2) 121 | elif (not textfile and not xmlfile and not foliafile): 122 | print >> sys.stderr,"ERROR: Specify a file with either --txt, --xml or --folia" 123 | sys.exit(2) 124 | elif xmlfile and not xpathselect: 125 | print >> sys.stderr,"ERROR: You need to specify --selectsen or --selectpar when using 
--xml" 126 | sys.exit(2) 127 | 128 | frogclient = FrogClient('localhost',port) 129 | 130 | idmap = [] 131 | data = [] 132 | 133 | if textfile: 134 | f = codecs.open(textfile, 'r', encoding) 135 | for line in f.readlines(): 136 | if idinfirstcolumn: 137 | id, line = line.split('\t',1) 138 | idmap.append(id.strip()) 139 | else: 140 | idmap.append(None) 141 | data.append(line.strip()) 142 | f.close() 143 | 144 | if xmlfile: 145 | xmldoc = lxml.etree.parse(xmlfile) 146 | for node in xmldoc.xpath(xpathselect): 147 | if idattrib: 148 | if idattrib in node.attrib: 149 | idmap.append(node.attrib[idattrib]) 150 | else: 151 | print >>sys.stderr,"WARNING: Attribute " + idattrib + " not found on node!" 152 | idmap.append(None) 153 | else: 154 | idmap.append(None) 155 | data.append(node.text) 156 | 157 | if foliafile: 158 | foliadoc = folia.Document(file=foliafile) 159 | if not foliadoc.declared(folia.AnnotationType.TOKEN): 160 | foliadoc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 161 | if not foliadoc.declared(folia.AnnotationType.POS): 162 | foliadoc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 163 | if not foliadoc.declared(folia.AnnotationType.LEMMA): 164 | foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 165 | foliadoc.language('nld') 166 | text = foliadoc.data[-1] 167 | 168 | for p in foliadoc.paragraphs(): 169 | found_s = False 170 | for s in p.sentences(): 171 | found_w = False 172 | for w in s.words(): 173 | found_w = True 174 | found_s = True 175 | if found_w: 176 | #pass tokenised sentence 177 | words = s.words() 178 | response = frogclient.process(" ".join([unicode(w) for w in words])) 179 | for i, (word, lemma, morph, pos) in enumerate(response): 180 | if legacy: legacyout(i,word,lemma,morph,pos) 181 | if unicode(words[i]) == word: 182 | if lemma: 183 | words[i].append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 184 | if pos: 185 | words[i].append( folia.PosAnnotation(foliadoc, cls=pos) ) 186 | else: 187 | print >>sys.stderr,"WARNING: Out of sync after calling Frog! 
", i, word 188 | 189 | else: 190 | #pass untokenised sentence 191 | try: 192 | sentext = s.text() 193 | except folia.NoSuchText: 194 | continue 195 | response = frogclient.process(sentext) 196 | for i, (word, lemma, morph, pos) in enumerate(response): 197 | if legacy: legacyout(i,word,lemma,morph,pos) 198 | if word: 199 | w = folia.Word(foliadoc, text=word, generate_id_in=s) 200 | if lemma: 201 | w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 202 | if pos: 203 | w.append( folia.PosAnnotation(foliadoc, cls=pos) ) 204 | s.append(w) 205 | 206 | if not found_s: 207 | #pass paragraph 208 | try: 209 | partext = p.text() 210 | except folia.NoSuchText: 211 | continue 212 | 213 | s = folia.Sentence(foliadoc, generate_id_in=p) 214 | response = frogclient.process(partext) 215 | for i, (word, lemma, morph, pos) in enumerate(response): 216 | if (not word or i == len(response) - 1) and len(s) > 0: 217 | #gap or end of response: terminate sentence 218 | p.append(s) 219 | s = folia.Sentence(foliadoc, generate_id_in=p) 220 | elif word: 221 | w = folia.Word(foliadoc, text=word, generate_id_in=s) 222 | if lemma: 223 | w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 224 | if pos: 225 | w.append( folia.PosAnnotation(foliadoc, cls=pos) ) 226 | s.append(w) 227 | 228 | 229 | else: 230 | foliadoc = folia.Document(id=foliaid) 231 | foliadoc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 232 | foliadoc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 233 | foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 234 | foliadoc.language('nld') 235 | text = folia.Text(foliadoc, id=foliadoc.id + '.text.1') 236 | foliadoc.append(text) 237 | 238 | 239 | curid = None 240 | for (fragment, id) in zip(data,idmap): 241 | if mode == 's' or mode == 'n': 242 | if id: 243 | s = folia.Sentence(foliadoc, id=id) 244 | else: 245 | s = folia.Sentence(foliadoc, generate_id_in=text) 246 | elif mode == 'p': 247 | if id: 248 | p = folia.Paragraph(foliadoc, id=id) 249 | else: 250 | p = folia.Paragraph(foliadoc, generate_id_in=text) 251 | s = folia.Sentence(foliadoc, generate_id_in=p) 252 | 253 | curid = s.id 254 | response = frogclient.process(fragment) 255 | for i, (word, lemma, morph, pos) in enumerate(response): 256 | if legacy: 257 | legacyout(i,word,lemma,morph,pos) 258 | continue 259 | 260 | if word: 261 | w = folia.Word(foliadoc, text=word, generate_id_in=s) 262 | if lemma: 263 | w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 264 | if pos: 265 | w.append( folia.PosAnnotation(foliadoc, cls=pos) ) 266 | s.append(w) 267 | if (not word or i == len(response) - 1) and len(s) > 0: 268 | #gap or end of response: terminate sentence 269 | if mode == 'p': 270 | p.append(s) 271 | if (i == len(response) - 1): 272 | text.append(p) 273 | elif mode == 'n' or (mode == 's' and i == len(response) - 1): 274 | text.append(s) 275 | elif mode == 's': 276 | continue 277 | 278 | if i < len(response) - 1: #not done yet? 
279 | #create new sentence 280 | if mode == 'p': 281 | s = folia.Sentence(foliadoc, generate_id_in=p) 282 | elif mode == 'n' and id: 283 | #no id for this unforeseen sentence, make something up 284 | s = folia.Sentence(foliadoc, id=curid+'.X') 285 | print("WARNING: Sentence found that was not in original",file=sys.stderr) 286 | 287 | if not legacy: 288 | print(foliadoc.xmlstring()) 289 | if save and foliafile: 290 | foliadoc.save() 291 | --------------------------------------------------------------------------------
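The GIZA++ reader above is easiest to follow with a tiny worked example. The sketch below is not part of the repository and the toy sentence pair is invented purely for illustration, but it exercises the GizaSentenceAlignment class from pynlpl/formats/giza.py as defined: in an A3-style source line, each source word is followed by the 1-based target positions it aligns to, enclosed between "({" and "})", and the parser converts these to 0-based (source, target) index pairs.

from pynlpl.formats.giza import GizaSentenceAlignment

# Hand-written A3-style alignment for the toy pair "the cat" -> "de kat"
sourceline = "NULL ({ }) the ({ 1 }) cat ({ 2 })"
targetline = "de kat"

a = GizaSentenceAlignment(sourceline, targetline, 1)
print(a.source)               # ['the', 'cat']
print(a.target)               # ['de', 'kat']
print(a.alignment)            # [(0, 0), (1, 1)]  (0-based source/target index pairs)
print(a.getalignedtarget(1))  # ('kat', 1)

In normal use one would not construct these objects by hand but iterate over a GizaModel instance, which reads an A3.final file (optionally gzip- or bzip2-compressed) and yields one GizaSentenceAlignment per sentence pair.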