├── pynlpl ├── mt │ ├── __init__.py │ └── wordalign.py ├── tests │ ├── __init__.py │ ├── evaluation_timbl │ │ ├── train │ │ ├── timbltest.sh │ │ ├── test │ │ └── test.IB1.O.gr.k1.out │ ├── test.sh │ ├── datatypes.py │ ├── statistics.py │ ├── formats.py │ ├── cql.py │ ├── evaluation.py │ ├── textprocessors.py │ ├── folia_benchmark.py │ └── search.py ├── tools │ ├── __init__.py │ ├── reflow.py │ ├── phrasetableserver.py │ ├── sonarlemmafreqlist.py │ ├── foliasplitcgnpostags.py │ ├── sampler.py │ ├── freqlist.py │ ├── sonar2folia.py │ ├── computepmi.py │ └── frogwrapper.py ├── clients │ ├── __init__.py │ ├── freeling.py │ └── frogclient.py ├── formats │ ├── __init__.py │ ├── cgn.py │ ├── timbl.py │ ├── taggerdata.py │ ├── moses.py │ ├── dutchsemcor.py │ ├── sonar.py │ ├── cql.py │ └── giza.py ├── lm │ ├── __init__.py │ ├── makesrilmcc │ ├── srilm.cc │ ├── client.py │ ├── server.py │ └── srilm.py ├── __init__.py ├── algorithms.py ├── common.py ├── fsa.py └── net.py ├── requirements.txt ├── AUTHORS ├── docs ├── pineapple.jpg ├── pynlpl_pres.pdf ├── pynlpl_pres2.pdf ├── common.rst ├── datatypes.rst ├── search.rst ├── evaluation.rst ├── lm.rst ├── _templates │ ├── fullclass.rst │ └── foliaelement.rst ├── formats.rst ├── textprocessors.rst ├── statistics.rst ├── index.rst ├── Makefile └── conf.py ├── .gitignore ├── MANIFEST.in ├── setup.cfg ├── .travis.yml ├── .readthedocs.yaml ├── setup.py └── README.rst /pynlpl/mt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pynlpl/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pynlpl/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml>=2.2 2 | httplib2>=0.6 3 | numpy 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | mvgompel = Maarten van Gompel 2 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/train: -------------------------------------------------------------------------------- 1 | cat cat 2 | dog dog 3 | rabbit rabbit 4 | -------------------------------------------------------------------------------- /docs/pineapple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proycon/pynlpl/HEAD/docs/pineapple.jpg -------------------------------------------------------------------------------- /docs/pynlpl_pres.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proycon/pynlpl/HEAD/docs/pynlpl_pres.pdf -------------------------------------------------------------------------------- /docs/pynlpl_pres2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/proycon/pynlpl/HEAD/docs/pynlpl_pres2.pdf -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/timbltest.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | timbl -f train -t test +v+cm+cs 3 | -------------------------------------------------------------------------------- /pynlpl/clients/__init__.py: -------------------------------------------------------------------------------- 1 | """This packages contains clients for communicating with specific servers""" 2 | -------------------------------------------------------------------------------- /pynlpl/formats/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules for reading and/or writing specific file formats""" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore .pyc files 2 | *.pyc 3 | # Ignore generated dirs 4 | build/* 5 | docs/_autosummary/* 6 | docs/build/* 7 | -------------------------------------------------------------------------------- /pynlpl/lm/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules for Language Models, with a C++/Python module for SRILM by Sander Canisius""" 2 | -------------------------------------------------------------------------------- /docs/common.rst: -------------------------------------------------------------------------------- 1 | Common Functions 2 | ================================== 3 | 4 | .. automodule:: pynlpl.common 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /docs/datatypes.rst: -------------------------------------------------------------------------------- 1 | Data Types 2 | ================================== 3 | 4 | .. automodule:: pynlpl.datatypes 5 | :members: 6 | :undoc-members: 7 | 8 | -------------------------------------------------------------------------------- /docs/search.rst: -------------------------------------------------------------------------------- 1 | Search Algorithms 2 | ================================== 3 | 4 | .. automodule:: pynlpl.search 5 | :members: 6 | :undoc-members: 7 | 8 | -------------------------------------------------------------------------------- /docs/evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluation & Experiments 2 | ================================== 3 | 4 | .. 
automodule:: pynlpl.evaluation 5 | :members: 6 | :undoc-members: 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE 3 | include requirements.txt 4 | recursive-include pynlpl *.py 5 | include pynlpl/tests/test.sh 6 | include pynlpl/tests/evaluation_timbl/* 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_sphinx] 2 | source-dir = ../docs/ 3 | build-dir = ../docs/build 4 | all_files = 1 5 | 6 | [upload_sphinx] 7 | upload-dir = ../docs/build/html 8 | 9 | [easy_install] 10 | 11 | -------------------------------------------------------------------------------- /docs/lm.rst: -------------------------------------------------------------------------------- 1 | Language Models 2 | ================================== 3 | 4 | .. automodule:: pynlpl.lm.lm 5 | :members: 6 | :undoc-members: 7 | 8 | .. automodule:: pynlpl.lm.srilm 9 | :members: 10 | :undoc-members: 11 | 12 | .. automodule:: pynlpl.lm.server 13 | :members: 14 | :undoc-members: 15 | 16 | .. automodule:: pynlpl.lm.client 17 | :members: 18 | :undoc-members: 19 | 20 | 21 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/test: -------------------------------------------------------------------------------- 1 | cat cat 2 | cat cat 3 | cat cat 4 | cat cat 5 | cat cat 6 | dog cat 7 | dog cat 8 | dog cat 9 | cat dog 10 | cat dog 11 | rabbit dog 12 | dog dog 13 | dog dog 14 | dog dog 15 | dog rabbit 16 | dog rabbit 17 | rabbit rabbit 18 | rabbit rabbit 19 | rabbit rabbit 20 | rabbit rabbit 21 | rabbit rabbit 22 | rabbit rabbit 23 | rabbit rabbit 24 | rabbit rabbit 25 | rabbit rabbit 26 | rabbit rabbit 27 | rabbit rabbit 28 | -------------------------------------------------------------------------------- /pynlpl/__init__.py: -------------------------------------------------------------------------------- 1 | """PyNLPl, pronounced as "pineapple", is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl can be used for example the computation of n-grams, frequency lists and distributions, language models. There are also more complex data types, such as Priority Queues, and search algorithms, such as Beam Search. 2 | 3 | The library is divided into several packages and modules. It is designed for Python 2.6 and upwards. 
Including Python 3.""" 4 | 5 | VERSION = "1.2.9" 6 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation_timbl/test.IB1.O.gr.k1.out: -------------------------------------------------------------------------------- 1 | cat cat cat 2 | cat cat cat 3 | cat cat cat 4 | cat cat cat 5 | cat cat cat 6 | dog cat dog 7 | dog cat dog 8 | dog cat dog 9 | cat dog cat 10 | cat dog cat 11 | rabbit dog rabbit 12 | dog dog dog 13 | dog dog dog 14 | dog dog dog 15 | dog rabbit dog 16 | dog rabbit dog 17 | rabbit rabbit rabbit 18 | rabbit rabbit rabbit 19 | rabbit rabbit rabbit 20 | rabbit rabbit rabbit 21 | rabbit rabbit rabbit 22 | rabbit rabbit rabbit 23 | rabbit rabbit rabbit 24 | rabbit rabbit rabbit 25 | rabbit rabbit rabbit 26 | rabbit rabbit rabbit 27 | rabbit rabbit rabbit 28 | -------------------------------------------------------------------------------- /pynlpl/tools/reflow.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf8 -*- 3 | 4 | 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | from __future__ import division 8 | from __future__ import absolute_import 9 | 10 | import sys 11 | import io 12 | import getopt 13 | 14 | from pynlpl.textprocessors import ReflowText 15 | 16 | 17 | def main(): 18 | for filename in sys.argv[1:]: 19 | f = io.open(filename, 'r', encoding='utf-8') 20 | for line in ReflowText(f): 21 | print(line) 22 | f.close() 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # whitelist 2 | branches: 3 | only: 4 | - master 5 | notifications: 6 | irc: 7 | channels: 8 | - "irc.uvt.nl#gitlama" 9 | template: 10 | - "%{repository_slug}#%{build_number} %{message} --> %{build_url}" 11 | skip_join: true 12 | language: python 13 | dist: trusty 14 | python: 15 | - "2.7" 16 | - "3.4" 17 | - "3.5" 18 | before_install: 19 | - sudo apt-get update -qq 20 | - sudo apt-get install -y xmldiff 21 | - pip install -U setuptools 22 | install: 23 | - pip install -r requirements.txt 24 | - python setup.py install 25 | script: 26 | - bash pynlpl/tests/test.sh 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | # python: 21 | # install: 22 | # - requirements: docs/requirements.txt 23 | 24 | -------------------------------------------------------------------------------- /docs/_templates/fullclass.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. 
autoclass:: {{ objname }} 7 | :show-inheritance: 8 | :members: * 9 | 10 | {% block methods %} 11 | 12 | {% if methods %} 13 | .. rubric:: Method Summary 14 | 15 | .. autosummary:: 16 | {% for item in methods %} 17 | ~{{ name }}.{{ item }} 18 | {%- endfor %} 19 | {% endif %} 20 | {% endblock %} 21 | 22 | {% block attributes %} 23 | {% if attributes %} 24 | .. rubric:: Attributes 25 | 26 | .. autosummary:: 27 | {% for item in attributes %} 28 | ~{{ name }}.{{ item }} 29 | {%- endfor %} 30 | {% endif %} 31 | {% endblock %} 32 | 33 | .. rubric:: Method Details 34 | 35 | .. automethod:: __init__ 36 | 37 | {% for m in methods %} 38 | .. automethod:: {{ m }} 39 | {% endfor %} 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /pynlpl/tools/phrasetableserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ############################################################### 5 | # PyNLPl - Phrase Table Server 6 | # by Maarten van Gompel (proycon) 7 | # http://ilk.uvt.nl/~mvgompel 8 | # Induction for Linguistic Knowledge Research Group 9 | # Universiteit van Tilburg 10 | # 11 | # Licensed under GPLv3 12 | # 13 | ############################################################### 14 | 15 | 16 | import sys 17 | import os 18 | 19 | if __name__ == "__main__": 20 | sys.path.append(sys.path[0] + '/../..') 21 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 22 | 23 | from pynlpl.formats.moses import PhraseTable, PhraseTableServer 24 | 25 | 26 | 27 | 28 | if len(sys.argv) != 3: 29 | print >>sys.stderr,"Syntax: phrasetableserver.py phrasetable port" 30 | sys.exit(2) 31 | else: 32 | port = int(sys.argv[2]) 33 | PhraseTableServer(PhraseTable(sys.argv[1]), port) 34 | -------------------------------------------------------------------------------- /pynlpl/lm/makesrilmcc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # README!!! 4 | 5 | #First compile SRILM as follows: 6 | 7 | #In srilm/common/Makefile.machine.i686 (also for x86_64!) set: 8 | # ADDITIONAL_CFLAGS =-fPIC 9 | # ADDITIONAL_CXXFLAGS =-fPIC 10 | 11 | #And change GCC_FLAGS to GCC_FLAGS = -Wreturn-type -Wimplicit 12 | 13 | #Then compile: 14 | # make MACHINE_TYPE=i686 NO_TCL=X 15 | 16 | #Then edit the directories in this script and run ./makesrilm 17 | 18 | if [ -z $1 ]; then 19 | echo "Usage: ./makesrilm /path/to/srilm/ [pythonversion]" >&2 20 | exit 1; 21 | fi 22 | 23 | 24 | 25 | export SRILM=$1 #Default: /home/mvgompel/tmp/srilm5.10/ #(must be an absolute path!) 26 | export SRILMLIBS=$SRILM/lib/i686 27 | if [ -z $1 ]; then 28 | PYTHONVERSION=$2 29 | else 30 | PYTHONVERSION="2.7" 31 | fi 32 | g++ -fPIC -shared -I/usr/include/python$PYTHONVERSION -lpython$PYTHONVERSION -I$SRILM/src -I$SRILM/include -lboost_python srilm.cc $SRILMLIBS/liboolm.a $SRILMLIBS/libdstruct.a $SRILMLIBS/libmisc.a -o srilmcc.so 33 | 34 | -------------------------------------------------------------------------------- /docs/formats.rst: -------------------------------------------------------------------------------- 1 | Formats 2 | ================================== 3 | 4 | Corpus Gesproken Nederlands 5 | :::::::::::::::::::::::::::::: 6 | 7 | .. automodule:: pynlpl.formats.cgn 8 | :members: 9 | :undoc-members: 10 | 11 | FoLiA 12 | :::::::::::::::::::::::::::::: 13 | 14 | See folia_ : folia.html 15 | 16 | GIZA++ 17 | :::::::::::::::::::::::::::::: 18 | 19 | .. 
automodule:: pynlpl.formats.giza 20 | :members: 21 | :undoc-members: 22 | 23 | 24 | Moses 25 | :::::::::::::::::::::::::::::: 26 | 27 | .. automodule:: pynlpl.formats.moses 28 | :members: 29 | :undoc-members: 30 | 31 | 32 | SoNaR 33 | :::::::::::::::::::::::::::::: 34 | 35 | .. automodule:: pynlpl.formats.sonar 36 | :members: 37 | :undoc-members: 38 | 39 | 40 | 41 | Taggerdata 42 | :::::::::::::::::::::::::::::: 43 | 44 | .. automodule:: pynlpl.formats.taggerdata 45 | :members: 46 | :undoc-members: 47 | 48 | 49 | TiMBL 50 | :::::::::::::::::::::::::::::: 51 | 52 | .. automodule:: pynlpl.formats.timbl 53 | :members: 54 | :undoc-members: 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/_templates/foliaelement.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | :show-inheritance: 8 | :undoc-members: 9 | :special-members: 10 | 11 | {% block methods %} 12 | 13 | {% if methods %} 14 | .. rubric:: Method Summary 15 | 16 | .. autosummary:: 17 | {% for item in methods %} 18 | ~{{ name }}.{{ item }} 19 | {%- endfor %} 20 | {% for private_method in ['__iter__', '__len__', '__str__'] %} 21 | {% if private_method in members %} 22 | ~{{ name }}.{{ private_method }} 23 | {% endif %} 24 | {% endfor %} 25 | {% endif %} 26 | {% endblock %} 27 | 28 | {% block attributes %} 29 | {% if attributes %} 30 | .. rubric:: Class Attributes 31 | 32 | {% for item in attributes %} 33 | .. autoattribute:: {{ item }} 34 | {%- endfor %} 35 | {% endif %} 36 | {% endblock %} 37 | 38 | .. rubric:: Method Details 39 | 40 | .. automethod:: __init__ 41 | {% for m in methods %} 42 | .. automethod:: {{ m }} 43 | {% endfor %} 44 | {% for private_method in ['__iter__', '__len__', '__str__'] %} 45 | {% if private_method in members %} 46 | .. automethod:: {{ private_method }} 47 | {% endif %} 48 | {% endfor %} 49 | -------------------------------------------------------------------------------- /pynlpl/tools/sonarlemmafreqlist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | from __future__ import print_function, unicode_literals, division, absolute_import 6 | 7 | import sys 8 | import os 9 | 10 | if __name__ == "__main__": 11 | sys.path.append(sys.path[0] + '/../..') 12 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 
13 | 14 | from pynlpl.formats.sonar import CorpusFiles, Corpus 15 | from pynlpl.statistics import FrequencyList 16 | 17 | sonardir = sys.argv[1] 18 | 19 | freqlist = FrequencyList() 20 | lemmapos_freqlist = FrequencyList() 21 | poshead_freqlist = FrequencyList() 22 | pos_freqlist = FrequencyList() 23 | 24 | for i, doc in enumerate(Corpus(sonardir)): 25 | print("#" + str(i) + " Processing " + doc.filename,file=sys.stderr) 26 | for word, id, pos, lemma in doc: 27 | freqlist.count(word) 28 | if lemma and pos: 29 | poshead = pos.split('(')[0] 30 | lemmapos_freqlist.count(lemma+'.'+poshead) 31 | poshead_freqlist.count(poshead) 32 | pos_freqlist.count(pos) 33 | 34 | freqlist.save('sonarfreqlist.txt') 35 | lemmapos_freqlist.save('sonarlemmaposfreqlist.txt') 36 | poshead_freqlist.save('sonarposheadfreqlist.txt') 37 | pos_freqlist.save('sonarposfreqlist.txt') 38 | 39 | print(unicode(freqlist).encode('utf-8')) 40 | -------------------------------------------------------------------------------- /pynlpl/lm/srilm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | using namespace boost::python; 5 | 6 | #include "srilm/include/File.h" 7 | #include "srilm/include/Ngram.h" 8 | #include "srilm/include/Vocab.h" 9 | //#include 10 | #include "srilm/lm/src/NgramStatsLong.cc" 11 | 12 | class LanguageModel 13 | { 14 | private: 15 | Vocab vocab; 16 | Ngram model; 17 | 18 | public: 19 | LanguageModel(const std::string& filename, int order) : model(vocab, order) 20 | { 21 | File file(filename.c_str(), "r"); 22 | model.read(file); 23 | } 24 | 25 | Boolean exists(const std::string& word) { 26 | return (vocab.getIndex(word.c_str()) != Vocab_None); 27 | } 28 | 29 | LogP wordProb(const std::string& context1, const std::string& context2, const std::string& word) 30 | { 31 | /*VocabIndex contextindex1 = Vocab_None; 32 | VocabIndex contextindex2 = Vocab_None; 33 | if (context2 != "__") contextindex2 = vocab.getIndex(context2.c_str()); 34 | if (context1 != "__") contextindex1 = vocab.getIndex(context1.c_str());*/ 35 | 36 | const VocabIndex context[] = { 37 | (context2 == "__") ? Vocab_None : vocab.getIndex(context2.c_str()), 38 | (context1 == "__") ? Vocab_None : vocab.getIndex(context1.c_str()) 39 | }; 40 | 41 | //const VocabIndex context[] = { context2, context1 }; 42 | return model.wordProb(vocab.getIndex(word.c_str()), context); 43 | } 44 | }; 45 | 46 | 47 | BOOST_PYTHON_MODULE(srilmcc) 48 | { 49 | class_("LanguageModel", init()) 50 | .def("wordProb", &LanguageModel::wordProb) 51 | .def("exists", &LanguageModel::exists) 52 | ; 53 | } 54 | -------------------------------------------------------------------------------- /pynlpl/tools/foliasplitcgnpostags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | from __future__ import print_function, unicode_literals, division, absolute_import 6 | 7 | import glob 8 | import sys 9 | import os 10 | 11 | 12 | if __name__ == "__main__": 13 | sys.path.append(sys.path[0] + '/../..') 14 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 
15 | 16 | from pynlpl.formats import folia 17 | from pynlpl.formats import cgn 18 | import lxml.etree 19 | 20 | def process(target): 21 | print("Processing " + target) 22 | if os.path.isdir(target): 23 | print("Descending into directory " + target) 24 | for f in glob.glob(target + '/*'): 25 | process(f) 26 | elif os.path.isfile(target) and target[-4:] == '.xml': 27 | print("Loading " + target) 28 | try: 29 | doc = folia.Document(file=target) 30 | except lxml.etree.XMLSyntaxError: 31 | print("UNABLE TO LOAD " + target + " (XML SYNTAX ERROR!)",file=sys.stderr) 32 | return None 33 | changed = False 34 | for word in doc.words(): 35 | try: 36 | pos = word.annotation(folia.PosAnnotation) 37 | except folia.NoSuchAnnotation: 38 | continue 39 | try: 40 | word.replace( cgn.parse_cgn_postag(pos.cls) ) 41 | changed = True 42 | except cgn.InvalidTagException: 43 | print("WARNING: INVALID TAG " + pos.cls,file=sys.stderr) 44 | continue 45 | if changed: 46 | print("Saving...") 47 | doc.save() 48 | 49 | target = sys.argv[1] 50 | process(target) 51 | 52 | -------------------------------------------------------------------------------- /docs/textprocessors.rst: -------------------------------------------------------------------------------- 1 | Text Processors 2 | ================================== 3 | 4 | This module contains classes and functions for text processing. It is imported as follows:: 5 | 6 | import pynlpl.textprocessors 7 | 8 | Tokenisation 9 | ------------------ 10 | 11 | A very crude tokeniser is available in the form of the function ``pynlpl.textprocessors.crude_tokeniser(string)``. This will split punctuation characters from words and return a list of tokens. It however has no regard for abbreviations and end-of-sentence detection, which is functionality a more sophisticated tokeniser can provide:: 12 | 13 | tokens = pynlpl.textprocessors.crude_tokeniser("to be, or not to be.") 14 | 15 | This will result in:: 16 | 17 | tokens == ['to','be',',','or','not','to','be','.'] 18 | 19 | 20 | N-gram extraction 21 | ------------------ 22 | 23 | The extraction of n-grams is an elemental operation in Natural Language Processing. PyNLPl offers the ``Windower`` class to accomplish this task:: 24 | 25 | tokens = pynlpl.textprocessors.crude_tokeniser("to be or not to be") 26 | for trigram in Windower(tokens,3): 27 | print(trigram) 28 | 29 | The input to the Windower should be a list of words and a value for n. In addition, the windower can output extra symbols at the beginning of the input sequence and at the end of it. By default, this behaviour is enabled: the symbol inserted at the beginning is ``<begin>`` and the symbol appended at the end is ``<end>``. If this behaviour is unwanted you can suppress it by instantiating the Windower as follows:: 30 | 31 | Windower(tokens,3, None, None) 32 | 33 | The Windower is implemented as a Python generator and at each iteration yields a tuple of length n. 34 | 35 | 36 | ..
automodule:: pynlpl.textprocessors 37 | :members: 38 | :undoc-members: 39 | -------------------------------------------------------------------------------- /pynlpl/lm/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from __future__ import division 7 | from __future__ import absolute_import 8 | 9 | import socket 10 | 11 | class LMClient(object): 12 | 13 | def __init__(self,host= "localhost",port=12346,n = 0): 14 | self.BUFSIZE = 1024 15 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) #Create the socket 16 | self.socket.settimeout(120) 17 | assert isinstance(port,int) 18 | self.socket.connect((host, port)) #Connect to server 19 | assert isinstance(n,int) 20 | self.n = n 21 | 22 | def scoresentence(self, sentence): 23 | if self.n > 0: 24 | raise Exception("This client instance has been set to send only " + str(self.n) + "-grams") 25 | if isinstance(sentence,list) or isinstance(sentence,tuple): 26 | sentence = " ".join(sentence) 27 | self.socket.send(sentence+ "\r\n") 28 | return float(self.socket.recv(self.BUFSIZE).strip()) 29 | 30 | def __getitem__(self, ngram): 31 | if self.n == 0: 32 | raise Exception("This client has been set to send only full sentence, not n-grams") 33 | if isinstance(ngram,str) or isinstance(ngram,unicode): 34 | ngram = ngram.split(" ") 35 | if len(ngram) != self.n: 36 | raise Exception("This client instance has been set to send only " + str(self.n) + "-grams.") 37 | ngram = " ".join(ngram) 38 | if (sys.version < '3' and isinstance(ngram,unicode)) or( sys.version == '3' and isinstance(ngram,str)): 39 | ngram = ngram.encode('utf-8') 40 | self.socket.send(ngram + b"\r\n") 41 | return float(self.socket.recv(self.BUFSIZE).strip()) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /pynlpl/lm/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Language Models 6 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Generic Server for Language Models 11 | # 12 | #---------------------------------------------------------------- 13 | 14 | #No Python 3 support for twisted yet... 
15 | 16 | from twisted.internet import protocol, reactor 17 | from twisted.protocols import basic 18 | 19 | class LMSentenceProtocol(basic.LineReceiver): 20 | def lineReceived(self, sentence): 21 | try: 22 | score = self.factory.lm.scoresentence(sentence) 23 | except: 24 | score = 0.0 25 | self.sendLine(str(score)) 26 | 27 | class LMSentenceFactory(protocol.ServerFactory): 28 | protocol = LMSentenceProtocol 29 | 30 | def __init__(self, lm): 31 | self.lm = lm 32 | 33 | class LMNGramProtocol(basic.LineReceiver): 34 | def lineReceived(self, ngram): 35 | ngram = ngram.split(" ") 36 | try: 37 | score = self.factory.lm[ngram] 38 | except: 39 | score = 0.0 40 | self.sendLine(str(score)) 41 | 42 | class LMNGramFactory(protocol.ServerFactory): 43 | protocol = LMNGramProtocol 44 | 45 | def __init__(self, lm): 46 | self.lm = lm 47 | 48 | 49 | 50 | class LMServer: 51 | """Language Model Server""" 52 | def __init__(self, lm, port=12346, n=0): 53 | """n indicates the n-gram size, if set to 0 (which is default), the server will expect to only receive whole sentence, if set to a particular value, it will only expect n-grams of that value""" 54 | if n == 0: 55 | reactor.listenTCP(port, LMSentenceFactory(lm)) 56 | else: 57 | reactor.listenTCP(port, LMNGramFactory(lm)) 58 | reactor.run() 59 | 60 | -------------------------------------------------------------------------------- /pynlpl/tests/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -z "$1" ]; then 4 | PYTHON=$1 5 | else 6 | PYTHON=python 7 | fi 8 | 9 | if [ ! -z "$2" ]; then 10 | TESTDIR="$2" 11 | else 12 | TESTDIR=`dirname $0` 13 | fi 14 | cd $TESTDIR 15 | 16 | GOOD=1 17 | 18 | echo "Testing CGN">&2 19 | $PYTHON cgn.py 20 | if [ $? -ne 0 ]; then 21 | echo "Test failed!!!" >&2 22 | GOOD=0 23 | fi 24 | 25 | echo "Testing datatypes">&2 26 | $PYTHON datatypes.py 27 | if [ $? -ne 0 ]; then 28 | echo "Test failed!!!" >&2 29 | GOOD=0 30 | fi 31 | 32 | 33 | echo "Testing evaluation">&2 34 | $PYTHON evaluation.py 35 | if [ $? -ne 0 ]; then 36 | echo "Test failed!!!" >&2 37 | GOOD=0 38 | fi 39 | 40 | 41 | echo "Testing search">&2 42 | $PYTHON search.py 43 | if [ $? -ne 0 ]; then 44 | echo "Test failed!!!" >&2 45 | GOOD=0 46 | fi 47 | 48 | echo "Testing textprocessors">&2 49 | $PYTHON textprocessors.py 50 | if [ $? -ne 0 ]; then 51 | echo "Test failed!!!" >&2 52 | GOOD=0 53 | fi 54 | 55 | 56 | echo "Testing statistics">&2 57 | $PYTHON statistics.py 58 | if [ $? -ne 0 ]; then 59 | echo "Test failed!!!" >&2 60 | GOOD=0 61 | fi 62 | 63 | 64 | echo "Testing formats">&2 65 | $PYTHON formats.py 66 | if [ $? -ne 0 ]; then 67 | echo "Test failed!!!" >&2 68 | GOOD=0 69 | fi 70 | 71 | echo "Testing folia">&2 72 | $PYTHON folia.py 73 | if [ $? -ne 0 ]; then 74 | echo "Test failed!!!" >&2 75 | GOOD=0 76 | fi 77 | 78 | echo "Testing FQL">&2 79 | $PYTHON fql.py 80 | if [ $? -ne 0 ]; then 81 | echo "Test failed!!!" >&2 82 | GOOD=0 83 | fi 84 | 85 | echo "Testing CQL">&2 86 | $PYTHON cql.py 87 | if [ $? -ne 0 ]; then 88 | echo "Test failed!!!" >&2 89 | GOOD=0 90 | fi 91 | 92 | cd .. 93 | 94 | if [ $GOOD -eq 1 ]; then 95 | echo "Done, all tests passed!" >&2 96 | exit 0 97 | else 98 | echo "TESTS FAILED!!!!" >&2 99 | exit 1 100 | fi 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | 7 | import os 8 | import sys 9 | from setuptools import setup, find_packages 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | entry_points = {} 15 | if sys.version > '3': 16 | entry_points = { 'console_scripts': [ 17 | 'pynlpl-computepmi = pynlpl.tools.computepmi:main', 18 | 'pynlpl-sampler = pynlpl.tools.sampler:main', 19 | 'pynlpl-makefreqlist = pynlpl.tools.freqlist:main', 20 | ] 21 | } 22 | 23 | 24 | setup( 25 | name = "PyNLPl", 26 | version = "1.2.9", #edit version in __init__.py as well and ensure tests/folia.py FOLIARELEASE points to the right version and is not set to None! 27 | author = "Maarten van Gompel", 28 | author_email = "proycon@anaproy.nl", 29 | description = ("PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl contains modules for basic tasks, clients for interfacting with server, and modules for parsing several file formats common in NLP, most notably FoLiA."), 30 | license = "GPL", 31 | keywords = "nlp computational_linguistics search ngrams language_models linguistics toolkit", 32 | url = "https://github.com/proycon/pynlpl", 33 | packages=['pynlpl','pynlpl.clients','pynlpl.lm','pynlpl.formats','pynlpl.mt','pynlpl.tools','pynlpl.tests'], 34 | long_description=read('README.rst'), 35 | classifiers=[ 36 | "Development Status :: 5 - Production/Stable", 37 | "Topic :: Text Processing :: Linguistic", 38 | "Programming Language :: Python :: 2.7", 39 | "Programming Language :: Python :: 3", 40 | "Operating System :: POSIX", 41 | "Intended Audience :: Developers", 42 | "Intended Audience :: Science/Research", 43 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 44 | ], 45 | zip_safe=False, 46 | include_package_data=True, 47 | package_data = {'pynlpl': ['tests/test.sh', 'tests/evaluation_timbl/*'] }, 48 | install_requires=['lxml >= 2.2','httplib2 >= 0.6','rdflib'], 49 | entry_points = entry_points 50 | ) 51 | -------------------------------------------------------------------------------- /pynlpl/lm/srilm.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - SRILM Language Model 3 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 4 | # http://ilk.uvt.nl/~mvgompel 5 | # proycon AT anaproy DOT nl 6 | # 7 | # Adapted from code by Sander Canisius 8 | # 9 | # Licensed under GPLv3 10 | # 11 | # 12 | # This library enables using SRILM as language model 13 | # 14 | #---------------------------------------------------------------- 15 | 16 | from __future__ import print_function 17 | from __future__ import unicode_literals 18 | from __future__ import division 19 | from __future__ import absolute_import 20 | 21 | try: 22 | import srilmcc 23 | except ImportError: 24 | import warnings 25 | warnings.warn("srilmcc module is not compiled") 26 | srilmcc = None 27 | 28 | from pynlpl.textprocessors import Windower 29 | 30 | 31 | class SRILMException(Exception): 32 | """Base Exception for SRILM.""" 33 | 34 | 35 | class SRILM: 36 | def __init__(self, filename, n): 37 | if not srilmcc: 38 | raise SRILMException( 39 | "SRILM is not downloaded and compiled." 
40 | "Please follow the instructions in makesrilmcc") 41 | self.model = srilmcc.LanguageModel(filename, n) 42 | self.n = n 43 | 44 | def scoresentence(self, sentence, unknownwordprob=-12): 45 | score = 0 46 | for ngram in Windower(sentence, self.n, "", ""): 47 | try: 48 | score += self.logscore(ngram) 49 | except KeyError: 50 | score += unknownwordprob 51 | return 10**score 52 | 53 | def __getitem__(self, ngram): 54 | return 10**self.logscore(ngram) 55 | 56 | def __contains__(self, key): 57 | return self.model.exists( key ) 58 | 59 | def logscore(self, ngram): 60 | #Bug work-around 61 | #if "" in ngram or "_" in ngram or "__" in ngram: 62 | # print >> sys.stderr, "WARNING: Invalid word in n-gram! Ignoring", ngram 63 | # return -999.9 64 | 65 | if len(ngram) == self.n: 66 | if all( (self.model.exists(x) for x in ngram) ): 67 | #no phrases, basic trigram, compute directly 68 | return self.model.wordProb(*ngram) 69 | else: 70 | raise KeyError 71 | else: 72 | raise Exception("Not an " + str(self.n) + "-gram") 73 | -------------------------------------------------------------------------------- /pynlpl/algorithms.py: -------------------------------------------------------------------------------- 1 | 2 | ###############################################################9 3 | # PyNLPl - Algorithms 4 | # by Maarten van Gompel 5 | # Centre for Language Studies 6 | # Radboud University Nijmegen 7 | # http://www.github.com/proycon/pynlpl 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | ############################################################### 13 | 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | from __future__ import division 17 | from __future__ import absolute_import 18 | 19 | def sum_to_n(n, size, limit=None): #from http://stackoverflow.com/questions/2065553/python-get-all-numbers-that-add-up-to-a-number 20 | """Produce all lists of `size` positive integers in decreasing order 21 | that add up to `n`.""" 22 | if size == 1: 23 | yield [n] 24 | return 25 | if limit is None: 26 | limit = n 27 | start = (n + size - 1) // size 28 | stop = min(limit, n - size + 1) + 1 29 | for i in range(start, stop): 30 | for tail in sum_to_n(n - i, size - 1, i): 31 | yield [i] + tail 32 | 33 | 34 | def consecutivegaps(n, leftmargin = 0, rightmargin = 0): 35 | """Compute all possible single consecutive gaps in any sequence of the specified length. Returns 36 | (beginindex, length) tuples. Runs in O(n(n+1) / 2) time. 
Argument is the length of the sequence rather than the sequence itself""" 37 | begin = leftmargin 38 | while begin < n: 39 | length = (n - rightmargin) - begin 40 | while length > 0: 41 | yield (begin, length) 42 | length -= 1 43 | begin += 1 44 | 45 | def possiblesplits(n, minsplits=2, maxsplits=0): 46 | """Returns lists of (index,length) tuples, representing all possible splits of a sequence of length n.""" 47 | if not maxsplits: maxsplits = n 48 | for nrsplits in range(minsplits,maxsplits + 1): 49 | for split in sum_to_n(n,nrsplits): 50 | split_with_indices = [] 51 | begin = 0 52 | for length in split: 53 | split_with_indices.append( (begin, length) ) 54 | begin += length 55 | yield split_with_indices 56 | 57 | def bytesize(n): 58 | """Return the required size in bytes to encode the specified integer""" 59 | for i in range(1, 1000): 60 | if n < 2**(8*i): 61 | return i 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /pynlpl/tools/sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ############################################################### 5 | # PyNLPl - Sampler 6 | # by Maarten van Gompel (proycon) 7 | # http://ilk.uvt.nl/~mvgompel 8 | # Induction for Linguistic Knowledge Research Group 9 | # Universiteit van Tilburg 10 | # 11 | # Licensed under GPLv3 12 | # 13 | # This tool can be used to split a file (or multiple interdependent 14 | # files, such as a parallel corpus) into a train, test and development 15 | # set. 16 | # 17 | ############################################################### 18 | 19 | 20 | from __future__ import print_function 21 | from __future__ import unicode_literals 22 | from __future__ import division 23 | from __future__ import absolute_import 24 | 25 | import argparse 26 | import sys 27 | 28 | import random 29 | from pynlpl.evaluation import filesampler 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="Extracts random samples from datasets, supports multiple parallel datasets (such as parallel corpora), provided that corresponding data is on the same line.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 33 | parser.add_argument('-t','--testsetsize', help="Test set size (lines)", type=float, action='store',default=0) 34 | parser.add_argument('-d','--devsetsize', help="Development set size (lines)", type=float, action='store',default=0) 35 | parser.add_argument('-T','--trainsetsize', help="Training set size (lines), leave unassigned (0) to automatically use all of the remaining data", type=float, action='store',default=0) 36 | parser.add_argument('-S','--seed', help="Seed for random number generator", type=int, action='store',default=0) 37 | parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)") 38 | 39 | args = parser.parse_args() 40 | if args.seed: 41 | random.seed(args.seed) 42 | 43 | if args.testsetsize == 0: 44 | print("ERROR: Specify at least a testset size!",file=sys.stderr) 45 | sys.exit(2) 46 | 47 | try: 48 | if not args.files: 49 | print("ERROR: Specify at least one file!",file=sys.stderr) 50 | sys.exit(2) 51 | except: 52 | print("ERROR: Specify at least one file!",file=sys.stderr) 53 | sys.exit(2) 54 | 55 | filesampler(args.files, args.testsetsize, args.devsetsize, args.trainsetsize) 56 | 57 | if __name__ == '__main__': 58 | main() 59 | 
-------------------------------------------------------------------------------- /docs/statistics.rst: -------------------------------------------------------------------------------- 1 | Statistics and Information Theory 2 | ================================== 3 | 4 | This module contains classes and functions for statistics and information theory. It is imported as follows:: 5 | 6 | import pynlpl.statistics 7 | 8 | 9 | Generic functions 10 | ------------------------------------- 11 | 12 | Amongst others, the following generic statistical functions are available: 13 | 14 | * ``mean(list)`` - Computes the mean of a given list of numbers 15 | 16 | * ``median(list)`` - Computes the median of a given list of numbers 17 | 18 | * ``stddev(list)`` - Computes the standard deviation of a given list of numbers 19 | 20 | * ``normalize(list)`` - Normalizes a list of numbers so that the sum is 1.0. 21 | 22 | 23 | Frequency Lists and Distributions 24 | ------------------------------------- 25 | 26 | One of the most basic and widespread tasks in NLP is the creation of a frequency list. Counting is established by simply appending lists of tokens to the frequency list:: 27 | 28 | freqlist = pynlpl.statistics.FrequencyList() 29 | freqlist.append(['to','be','or','not','to','be']) 30 | 31 | Take care not to append strings rather than lists, unless you mean to create a frequency list over characters rather than words. You may want to use the ``pynlpl.textprocessors.crude_tokeniser`` function first:: 32 | 33 | freqlist.append(pynlpl.textprocessors.crude_tokeniser("to be or not to be")) 34 | 35 | The count can also be incremented explicitly for a single item:: 36 | 37 | freqlist.count('shakespeare') 38 | 39 | The FrequencyList offers dictionary-like access. For example, the following statement will be true for the frequency list just created:: 40 | 41 | freqlist['be'] == 2 42 | 43 | Normalised counts (pseudo-probabilities) can be obtained using the ``p()`` method:: 44 | 45 | freqlist.p('be') 46 | 47 | Normalised counts can also be obtained by instantiating a Distribution instance using the frequency list:: 48 | 49 | dist = pynlpl.statistics.Distribution(freqlist) 50 | 51 | This too offers a dictionary-like interface, where values are by definition normalised. The advantage of a Distribution class is that it offers information-theoretic methods such as ``entropy()``, ``maxentropy()``, ``perplexity()`` and ``poslog()``. 52 | 53 | A frequency list can be saved to file using the ``save(filename)`` method, and loaded back from file using the ``load(filename)`` method. The ``output()`` method is a generator yielding strings for each line of output, in ranked order. 54 | 55 | 56 | API Reference 57 | ---------------- 58 | 59 | 60 | ..
automodule:: pynlpl.statistics 61 | :members: 62 | :undoc-members: 63 | 64 | 65 | -------------------------------------------------------------------------------- /pynlpl/tests/datatypes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | from __future__ import division 7 | from __future__ import absolute_import 8 | from pynlpl.common import u 9 | 10 | import os 11 | import sys 12 | import unittest 13 | 14 | 15 | from pynlpl.datatypes import PriorityQueue 16 | 17 | values = [3,6,6,1,8,2] 18 | mintomax = sorted(values) 19 | maxtomin = list(reversed(mintomax)) 20 | 21 | 22 | class PriorityQueueTest(unittest.TestCase): 23 | def test_append_minimized(self): 24 | """Minimized PriorityQueue""" 25 | global values 26 | pq = PriorityQueue(values, lambda x: x, True,0,False,False) 27 | result = list(iter(pq)) 28 | self.assertEqual(result, mintomax) 29 | 30 | def test_append_maximized(self): 31 | """Maximized PriorityQueue""" 32 | global values 33 | pq = PriorityQueue(values, lambda x: x, False,0,False,False) 34 | result = list(iter(pq)) 35 | self.assertEqual(result, maxtomin) 36 | 37 | def test_append_maximized_blockworse(self): 38 | """Maximized PriorityQueue (with blockworse)""" 39 | global values 40 | pq = PriorityQueue(values, lambda x: x, False,0,True,False) 41 | result = list(iter(pq)) 42 | self.assertEqual(result, [8,6,6,3]) 43 | 44 | def test_append_maximized_blockworse_blockequal(self): 45 | """Maximized PriorityQueue (with blockworse + blockequal)""" 46 | global values 47 | pq = PriorityQueue(values, lambda x: x, False,0,True,True) 48 | result = list(iter(pq)) 49 | self.assertEqual(result, [8,6,3]) 50 | 51 | def test_append_minimized_blockworse(self): 52 | """Minimized PriorityQueue (with blockworse)""" 53 | global values 54 | pq = PriorityQueue(values, lambda x: x, True,0,True,False) 55 | result = list(iter(pq)) 56 | self.assertEqual(result, [1,3]) 57 | 58 | 59 | def test_append_minimized_fixedlength(self): 60 | """Fixed-length priority queue (min)""" 61 | global values 62 | pq = PriorityQueue(values, lambda x: x, True,4, False,False) 63 | result = list(iter(pq)) 64 | self.assertEqual(result, mintomax[:4]) 65 | 66 | def test_append_maximized_fixedlength(self): 67 | """Fixed-length priority queue (max)""" 68 | global values 69 | pq = PriorityQueue(values, lambda x: x, False,4,False,False) 70 | result = list(iter(pq)) 71 | self.assertEqual(result, maxtomin[:4]) 72 | 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /pynlpl/tools/freqlist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ############################################################### 5 | # PyNLPl - Frequency List Generator 6 | # by Maarten van Gompel (proycon) 7 | # http://ilk.uvt.nl/~mvgompel 8 | # Induction for Linguistic Knowledge Research Group 9 | # Universiteit van Tilburg 10 | # 11 | # Licensed under GPLv3 12 | # 13 | ############################################################### 14 | 15 | 16 | from __future__ import print_function 17 | from __future__ import unicode_literals 18 | from __future__ import division 19 | from __future__ import absolute_import 20 | 21 | import argparse 22 | import sys 23 | import io 24 | 25 | from pynlpl.statistics import 
FrequencyList, Distribution 26 | from pynlpl.textprocessors import Windower, crude_tokenizer 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 30 | parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1) 31 | parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true") 32 | parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8') 33 | parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)") 34 | 35 | 36 | args = parser.parse_args() 37 | 38 | if not args.files: 39 | print("No files specified", file=sys.stderr) 40 | sys.exit(1) 41 | 42 | freqlist = FrequencyList(None, args.caseinsensitive) 43 | for filename in args.files: 44 | f = io.open(filename,'r',encoding=args.encoding) 45 | for line in f: 46 | if args.ngramsize > 1: 47 | freqlist.append(Windower(crude_tokenizer(line),args.ngramsize)) 48 | else: 49 | freqlist.append(crude_tokenizer(line)) 50 | 51 | f.close() 52 | 53 | dist = Distribution(freqlist) 54 | for type, count in freqlist: 55 | if isinstance(type,tuple) or isinstance(type,list): 56 | type = " ".join(type) 57 | s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type)) 58 | print(s) 59 | 60 | print("Tokens: ", freqlist.tokens(),file=sys.stderr) 61 | print("Types: ", len(freqlist),file=sys.stderr) 62 | print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr) 63 | print("Entropy: ", dist.entropy(),file=sys.stderr) 64 | 65 | if __name__ == '__main__': 66 | main() 67 | 68 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. PyNLPl documentation master file, created by 2 | sphinx-quickstart on Tue Jul 6 22:07:20 2010. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to PyNLPl's documentation! 7 | ================================== 8 | 9 | PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language 10 | Processing. It contains various modules useful for common, and less common, NLP 11 | tasks. PyNLPl can be used for basic tasks such as the extraction of n-grams and 12 | frequency lists, and to build simple language models. There are also more 13 | complex data types and algorithms. Moreover, there are parsers for file formats 14 | common in NLP (e.g. FoLiA/Giza/Moses/ARPA/Timbl/CQL). There are also clients to 15 | interface with various NLP specific servers. PyNLPl most notably features a 16 | very extensive library for working with FoLiA XML (Format for Linguistic 17 | Annotation). 18 | 19 | The library is divided into several packages and modules. It works on Python 20 | 2.7, as well as Python 3.
21 | 22 | The following modules are available: 23 | 24 | - ``pynlpl.datatypes`` - Extra datatypes (priority queues, patterns, tries) 25 | - ``pynlpl.evaluation`` - Evaluation & experiment classes (parameter search, wrapped 26 | progressive sampling, class evaluation (precision/recall/f-score/auc), sampler, confusion matrix, multithreaded experiment pool) 27 | - ``pynlpl.formats.cgn`` - Module for parsing CGN (Corpus Gesproken Nederlands) part-of-speech tags 28 | - ``pynlpl.formats.folia`` - Extensive library for reading and manipulating the 29 | documents in `FoLiA `_ format (Format for Linguistic Annotation). 30 | - ``pynlpl.formats.fql`` - Extensive library for the FoLiA Query Language (FQL), 31 | built on top of ``pynlpl.formats.folia``. FQL is currently documented `here 32 | `__. 33 | - ``pynlpl.formats.cql`` - Parser for the Corpus Query Language (CQL), as also used by 34 | Corpus Workbench and Sketch Engine. Contains a convertor to FQL. 35 | - ``pynlpl.formats.giza`` - Module for reading GIZA++ word alignment data 36 | - ``pynlpl.formats.moses`` - Module for reading Moses phrase-translation tables. 37 | - ``pynlpl.formats.sonar`` - Largely obsolete module for pre-releases of the 38 | SoNaR corpus, use ``pynlpl.formats.folia`` instead. 39 | - ``pynlpl.formats.timbl`` - Module for reading Timbl output (consider using 40 | `python-timbl `_ instead though) 41 | - ``pynlpl.lm.lm`` - Module for simple language model and reader for ARPA 42 | language model data as well (used by SRILM). 43 | - ``pynlpl.search`` - Various search algorithms (Breadth-first, depth-first, 44 | beam-search, hill climbing, A star, various variants of each) 45 | - ``pynlpl.statistics`` - Frequency lists, Levenshtein, common statistics and 46 | information theory functions 47 | - ``pynlpl.textprocessors`` - Simple tokeniser, n-gram extraction 48 | 49 | 50 | Contents: 51 | 52 | .. toctree:: 53 | :maxdepth: 3 54 | :glob: 55 | 56 | * 57 | 58 | Indices and tables 59 | ================== 60 | 61 | * :ref:`genindex` 62 | * :ref:`modindex` 63 | * :ref:`search` 64 | 65 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = $(VIRTUAL_ENV)/bin/sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* _autosummary/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | @echo 36 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 37 | 38 | dirhtml: 39 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 40 | @echo 41 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 42 | 43 | pickle: 44 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 45 | @echo 46 | @echo "Build finished; now you can process the pickle files." 47 | 48 | json: 49 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 50 | @echo 51 | @echo "Build finished; now you can process the JSON files." 52 | 53 | htmlhelp: 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in $(BUILDDIR)/htmlhelp." 58 | 59 | qthelp: 60 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 61 | @echo 62 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 63 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 64 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyNLPl.qhcp" 65 | @echo "To view the help file:" 66 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyNLPl.qhc" 67 | 68 | latex: 69 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 70 | @echo 71 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 72 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 73 | "run these through (pdf)latex." 74 | 75 | changes: 76 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 77 | @echo 78 | @echo "The overview file is in $(BUILDDIR)/changes." 79 | 80 | linkcheck: 81 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 82 | @echo 83 | @echo "Link check complete; look for any errors in the above output " \ 84 | "or in $(BUILDDIR)/linkcheck/output.txt." 85 | 86 | doctest: 87 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 88 | @echo "Testing of doctests in the sources finished, look at the " \ 89 | "results in $(BUILDDIR)/doctest/output.txt." 
90 | -------------------------------------------------------------------------------- /pynlpl/mt/wordalign.py: -------------------------------------------------------------------------------- 1 | from pynlpl.statistics import FrequencyList, Distribution 2 | 3 | 4 | class WordAlignment(object): 5 | 6 | def __init__(self, casesensitive = False): 7 | self.casesensitive = casesensitive 8 | 9 | def train(self, sourcefile, targetfile): 10 | sourcefile = open(sourcefile) 11 | targetfile = open(targetfile) 12 | 13 | self.sourcefreqlist = FrequencyList(None, self.casesensitive) 14 | self.targetfreqlist = FrequencyList(None, self.casesensitive) 15 | 16 | #frequency lists 17 | self.source2target = {} 18 | self.target2source = {} 19 | 20 | for sourceline, targetline in zip(sourcefile, targetfile): 21 | sourcetokens = sourceline.split() 22 | targettokens = targetline.split() 23 | 24 | self.sourcefreqlist.append(sourcetokens) 25 | self.targetfreqlist.append(targettokens) 26 | 27 | for sourcetoken in sourcetokens: 28 | if not sourcetoken in self.source2target: 29 | self.source2target[sourcetoken] = FrequencyList(targettokens,self.casesensitive) 30 | else: 31 | self.source2target[sourcetoken].append(targettokens) 32 | 33 | for targettoken in targettokens: 34 | if not targettoken in self.target2source: 35 | self.target2source[targettoken] = FrequencyList(sourcetokens,self.casesensitive) 36 | else: 37 | self.target2source[targettoken].append(sourcetokens) 38 | 39 | sourcefile.close() 40 | targetfile.close() 41 | 42 | def test(self, sourcefile, targetfile): 43 | sourcefile = open(sourcefile) 44 | targetfile = open(targetfile) 45 | 46 | 47 | #stage 2 48 | for sourceline, targetline in zip(sourcefile, targetfile): 49 | sourcetokens = sourceline.split() 50 | targettokens = targetline.split() 51 | 52 | S2Talignment = [] 53 | T2Salignment = [] 54 | 55 | for sourcetoken in sourcetokens: 56 | #which of the target-tokens is most frequent? 57 | besttoken = None 58 | bestscore = -1 59 | for i, targettoken in enumerate(targettokens): 60 | if targettoken in self.source2target[sourcetoken]: 61 | score = self.source2target[sourcetoken][targettoken] / float(self.targetfreqlist[targettoken]) 62 | if score > bestscore: 63 | bestscore = score #keep the normalised score (not the raw count) so later comparisons are consistent 64 | besttoken = i 65 | S2Talignment.append(besttoken) #TODO: multi-alignment? 66 | 67 | for targettoken in targettokens: 68 | besttoken = None 69 | bestscore = -1 70 | for i, sourcetoken in enumerate(sourcetokens): 71 | if sourcetoken in self.target2source[targettoken]: 72 | score = self.target2source[targettoken][sourcetoken] / float(self.sourcefreqlist[sourcetoken]) 73 | if score > bestscore: 74 | bestscore = score #keep the normalised score here as well 75 | besttoken = i 76 | T2Salignment.append(besttoken) #TODO: multi-alignment?
77 | 78 | yield sourcetokens, targettokens, S2Talignment, T2Salignment 79 | 80 | sourcefile.close() 81 | targetfile.close() 82 | 83 | -------------------------------------------------------------------------------- /pynlpl/tests/statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Test Units for Statistics and Information Theory 6 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | #---------------------------------------------------------------- 13 | from __future__ import print_function 14 | from __future__ import unicode_literals 15 | from __future__ import division 16 | from __future__ import absolute_import 17 | 18 | import sys 19 | import os 20 | import unittest 21 | 22 | from pynlpl.statistics import FrequencyList, HiddenMarkovModel 23 | from pynlpl.textprocessors import Windower 24 | 25 | 26 | sentences = ["This is a sentence .".split(' '),"Moreover , this sentence is a test .".split(' ')] 27 | 28 | class FrequencyListTest(unittest.TestCase): 29 | def test_freqlist_casesens(self): 30 | """Frequency List (case sensitive)""" 31 | global sentences 32 | f= FrequencyList() 33 | for sentence in sentences: 34 | f.append(sentence) 35 | self.assertTrue(( f['sentence'] == 2 and f['this'] == 1 and f['test'] == 1 )) 36 | 37 | def test_freqlist_caseinsens(self): 38 | """Frequency List (case insensitive)""" 39 | global sentences 40 | f= FrequencyList(None, False) 41 | for sentence in sentences: 42 | f.append(sentence) 43 | self.assertTrue(( f['sentence'] == 2 and f['this'] == 2 and f['Test'] == 1 )) 44 | 45 | def test_freqlist_tokencount(self): 46 | """Frequency List (count tokens)""" 47 | global sentences 48 | f= FrequencyList() 49 | for sentence in sentences: 50 | f.append(sentence) 51 | self.assertEqual(f.total,13) 52 | 53 | def test_freqlist_typecount(self): 54 | """Frequency List (count types)""" 55 | global sentences 56 | f= FrequencyList() 57 | for sentence in sentences: 58 | f.append(sentence) 59 | self.assertEqual(len(f),9) 60 | 61 | class BigramFrequencyListTest(unittest.TestCase): 62 | def test_freqlist_casesens(self): 63 | """Bigram Frequency List (case sensitive)""" 64 | global sentences 65 | f= FrequencyList() 66 | for sentence in sentences: 67 | f.append(Windower(sentence,2)) 68 | self.assertTrue(( f[('is','a')] == 2 and f[('This','is')] == 1)) 69 | 70 | def test_freqlist_caseinsens(self): 71 | """Bigram Frequency List (case insensitive)""" 72 | global sentences 73 | f= FrequencyList(None, False) 74 | for sentence in sentences: 75 | f.append(Windower(sentence,2)) 76 | self.assertTrue(( f[('is','a')] == 2 and f[('this','is')] == 1)) 77 | 78 | class HMMTest(unittest.TestCase): 79 | def test_viterbi(self): 80 | """Viterbi decode run on Hidden Markov Model""" 81 | hmm = HiddenMarkovModel('start') 82 | hmm.settransitions('start',{'rainy':0.6,'sunny':0.4}) 83 | hmm.settransitions('rainy',{'rainy':0.7,'sunny':0.3}) 84 | hmm.settransitions('sunny',{'rainy':0.4,'sunny':0.6}) 85 | hmm.setemission('rainy', {'walk': 0.1, 'shop': 0.4, 'clean': 0.5}) 86 | hmm.setemission('sunny', {'walk': 0.6, 'shop': 0.3, 'clean': 0.1}) 87 | observations = ['walk', 'shop', 'clean'] 88 | prob, path = hmm.viterbi(observations) 89 | self.assertEqual( path, ['sunny', 'rainy', 'rainy']) 90 | self.assertEqual( prob, 
0.01344) 91 | 92 | if __name__ == '__main__': 93 | unittest.main() 94 | -------------------------------------------------------------------------------- /pynlpl/tests/formats.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import unittest 4 | 5 | sys.path.append(sys.path[0] + '/../../') 6 | os.environ['PYTHONPATH'] = sys.path[0] + '/../../' 7 | from pynlpl.formats.timbl import TimblOutput 8 | if sys.version < '3': 9 | from StringIO import StringIO 10 | else: 11 | from io import StringIO 12 | 13 | class TimblTest(unittest.TestCase): 14 | 15 | def test1_simple(self): 16 | """Timbl - simple output""" 17 | s = StringIO("a b ? c\nc d ? e\n") 18 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(TimblOutput(s)): 19 | if i == 0: 20 | self.assertEqual(features,['a','b']) 21 | self.assertEqual(referenceclass,'?') 22 | self.assertEqual(predictedclass,'c') 23 | self.assertEqual(distribution,None) 24 | self.assertEqual(distance,None) 25 | elif i == 1: 26 | self.assertEqual(features,['c','d']) 27 | self.assertEqual(referenceclass,'?') 28 | self.assertEqual(predictedclass,'e') 29 | self.assertEqual(distribution,None) 30 | self.assertEqual(distance,None) 31 | 32 | 33 | def test2_db(self): 34 | """Timbl - Distribution output""" 35 | s = StringIO("a c ? c { c 1.00000, d 1.00000 }\na b ? c { c 1.00000 }\na d ? c { c 1.00000, e 1.00000 }") 36 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(TimblOutput(s)): 37 | if i == 0: 38 | self.assertEqual(features,['a','c']) 39 | self.assertEqual(referenceclass,'?') 40 | self.assertEqual(predictedclass,'c') 41 | self.assertEqual(distribution['c'], 0.5) 42 | self.assertEqual(distribution['d'], 0.5) 43 | self.assertEqual(distance,None) 44 | elif i == 1: 45 | self.assertEqual(features,['a','b']) 46 | self.assertEqual(referenceclass,'?') 47 | self.assertEqual(predictedclass,'c') 48 | self.assertEqual(distribution['c'], 1) 49 | self.assertEqual(distance,None) 50 | elif i == 2: 51 | self.assertEqual(features,['a','d']) 52 | self.assertEqual(referenceclass,'?') 53 | self.assertEqual(predictedclass,'c') 54 | self.assertEqual(distribution['c'], 0.5) 55 | self.assertEqual(distribution['e'], 0.5) 56 | self.assertEqual(distance,None) 57 | 58 | 59 | def test3_dbdi(self): 60 | """Timbl - Distribution + Distance output""" 61 | s = StringIO("a c ? c { c 1.00000, d 1.00000 } 1.0000000000000\na b ? c { c 1.00000 } 0.0000000000000\na d ? 
c { c 1.00000, e 1.00000 } 1.0000000000000") 62 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(TimblOutput(s)): 63 | if i == 0: 64 | self.assertEqual(features,['a','c']) 65 | self.assertEqual(referenceclass,'?') 66 | self.assertEqual(predictedclass,'c') 67 | self.assertEqual(distribution['c'], 0.5) 68 | self.assertEqual(distribution['d'], 0.5) 69 | self.assertEqual(distance,1.0) 70 | elif i == 1: 71 | self.assertEqual(features,['a','b']) 72 | self.assertEqual(referenceclass,'?') 73 | self.assertEqual(predictedclass,'c') 74 | self.assertEqual(distribution['c'], 1) 75 | self.assertEqual(distance,0.0) 76 | elif i == 2: 77 | self.assertEqual(features,['a','d']) 78 | self.assertEqual(referenceclass,'?') 79 | self.assertEqual(predictedclass,'c') 80 | self.assertEqual(distribution['c'], 0.5) 81 | self.assertEqual(distribution['e'], 0.5) 82 | self.assertEqual(distance,1.0) 83 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyNLPl - Python Natural Language Processing Library 2 | ===================================================== 3 | 4 | .. image:: https://travis-ci.org/proycon/pynlpl.svg?branch=master 5 | :target: https://travis-ci.org/proycon/pynlpl 6 | 7 | .. image:: http://readthedocs.org/projects/pynlpl/badge/?version=latest 8 | :target: http://pynlpl.readthedocs.io/en/latest/?badge=latest 9 | :alt: Documentation Status 10 | 11 | .. image:: http://applejack.science.ru.nl/lamabadge.php/pynlpl 12 | :target: http://applejack.science.ru.nl/languagemachines/ 13 | 14 | .. image:: https://zenodo.org/badge/759484.svg 15 | :target: https://zenodo.org/badge/latestdoi/759484 16 | 17 | PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language 18 | Processing. It contains various modules useful for common, and less common, NLP 19 | tasks. PyNLPl can be used for basic tasks such as the extraction of n-grams and 20 | frequency lists, and to build simple language models. There are also more 21 | complex data types and algorithms. Moreover, there are parsers for file formats 22 | common in NLP (e.g. FoLiA/Giza/Moses/ARPA/Timbl/CQL). There are also clients to 23 | interface with various NLP-specific servers. PyNLPl most notably features a 24 | very extensive library for working with FoLiA XML (Format for Linguistic 25 | Annotation). 26 | 27 | The library is divided into several packages and modules. It works on Python 28 | 2.7, as well as Python 3. 29 | 30 | The following modules are available: 31 | 32 | - ``pynlpl.datatypes`` - Extra datatypes (priority queues, patterns, tries) 33 | - ``pynlpl.evaluation`` - Evaluation & experiment classes (parameter search, wrapped 34 | progressive sampling, class evaluation (precision/recall/f-score/auc), sampler, confusion matrix, multithreaded experiment pool) 35 | - ``pynlpl.formats.cgn`` - Module for parsing CGN (Corpus Gesproken Nederlands) part-of-speech tags 36 | - ``pynlpl.formats.folia`` - Extensive library for reading and manipulating the 37 | documents in `FoLiA `_ format (Format for Linguistic Annotation). 38 | - ``pynlpl.formats.fql`` - Extensive library for the FoLiA Query Language (FQL), 39 | built on top of ``pynlpl.formats.folia``. FQL is currently documented `here 40 | `__. 41 | - ``pynlpl.formats.cql`` - Parser for the Corpus Query Language (CQL), as also used by 42 | Corpus Workbench and Sketch Engine. Contains a convertor to FQL.
43 | - ``pynlpl.formats.giza`` - Module for reading GIZA++ word alignment data 44 | - ``pynlpl.formats.moses`` - Module for reading Moses phrase-translation tables. 45 | - ``pynlpl.formats.sonar`` - Largely obsolete module for pre-releases of the 46 | SoNaR corpus, use ``pynlpl.formats.folia`` instead. 47 | - ``pynlpl.formats.timbl`` - Module for reading Timbl output (consider using 48 | `python-timbl `_ instead though) 49 | - ``pynlpl.lm.lm`` - Module for simple language models and a reader for ARPA 50 | language model data (as used by SRILM). 51 | - ``pynlpl.search`` - Various search algorithms (Breadth-first, depth-first, 52 | beam-search, hill climbing, A star, various variants of each) 53 | - ``pynlpl.statistics`` - Frequency lists, Levenshtein, common statistics and 54 | information theory functions 55 | - ``pynlpl.textprocessors`` - Simple tokeniser, n-gram extraction 56 | 57 | Installation 58 | -------------------- 59 | 60 | Download and install the latest stable version directly from the Python Package 61 | Index with ``pip install pynlpl`` (or ``pip3`` for Python 3 on most 62 | systems). For global installations prepend ``sudo``. 63 | 64 | Alternatively, clone this repository and run ``python setup.py install`` (or 65 | ``python3 setup.py install`` for Python 3 on most systems). Prepend ``sudo`` for 66 | global installations. 67 | 68 | This software may also be found in certain Linux distributions, such as 69 | the latest versions of Debian/Ubuntu, as ``python-pynlpl`` and ``python3-pynlpl``. 70 | PyNLPl is also included in our `LaMachine `_ distribution. 71 | 72 | Documentation 73 | -------------------- 74 | 75 | API Documentation can be found `here `__. 76 | 77 | 78 | -------------------------------------------------------------------------------- /pynlpl/tools/sonar2folia.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Conversion script for converting SoNaR/D-Coi from D-Coi XML to FoLiA XML 6 | # by Maarten van Gompel, ILK, Tilburg University 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | #---------------------------------------------------------------- 13 | 14 | # Usage: sonar2folia.py sonar-input-dir output-dir nr-of-threads 15 | 16 | from __future__ import print_function, unicode_literals, division, absolute_import 17 | 18 | import sys 19 | import os 20 | 21 | if __name__ == "__main__": 22 | sys.path.append(sys.path[0] + '/../..') 23 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..'
24 | 25 | import pynlpl.formats.folia as folia 26 | import pynlpl.formats.sonar as sonar 27 | from multiprocessing import Pool, Process 28 | import datetime 29 | import codecs 30 | 31 | 32 | def process(data): 33 | i, filename = data 34 | category = os.path.basename(os.path.dirname(filename)) 35 | progress = round((i+1) / float(len(index)) * 100,1) 36 | print("#" + str(i+1) + " " + filename + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + str(progress) + '%',file=sys.stderr) 37 | try: 38 | doc = folia.Document(file=filename) 39 | except Exception as e: 40 | print("ERROR loading " + filename + ":" + str(e),file=sys.stderr) 41 | return False 42 | filename = filename.replace(sonardir,'') 43 | if filename[0] == '/': 44 | filename = filename[1:] 45 | if filename[-4:] == '.pos': 46 | filename = filename[:-4] 47 | if filename[-4:] == '.tok': 48 | filename = filename[:-4] 49 | if filename[-4:] == '.ilk': 50 | filename = filename[:-4] 51 | #Load document prior to tokenisation 52 | try: 53 | pretokdoc = folia.Document(file=sonardir + '/' + filename) 54 | except: 55 | print("WARNING unable to load pretokdoc " + filename,file=sys.stderr) 56 | pretokdoc = None 57 | if pretokdoc: 58 | for p2 in pretokdoc.paragraphs(): 59 | try: 60 | p = doc[p2.id] 61 | except: 62 | print("ERROR: Paragraph " + p2.id + " not found. Tokenised and pre-tokenised versions out of sync?",file=sys.stderr) 63 | continue 64 | if p2.text: 65 | p.text = p2.text 66 | try: 67 | os.mkdir(foliadir + os.path.dirname(filename)) 68 | except: 69 | pass 70 | 71 | try: 72 | doc.save(foliadir + filename) 73 | except: 74 | print("ERROR saving " + foliadir + filename,file=sys.stderr) 75 | 76 | try: 77 | f = codecs.open(foliadir + filename.replace('.xml','.tok.txt'),'w','utf-8') 78 | f.write(unicode(doc)) 79 | f.close() 80 | except: 81 | print("ERROR saving " + foliadir + filename.replace('.xml','.tok.txt'),file=sys.stderr) 82 | 83 | 84 | sys.stdout.flush() 85 | sys.stderr.flush() 86 | return True 87 | 88 | def outputexists(filename, sonardir, foliadir): 89 | filename = filename.replace(sonardir,'') 90 | if filename[0] == '/': 91 | filename = filename[1:] 92 | if filename[-4:] == '.pos': 93 | filename = filename[:-4] 94 | if filename[-4:] == '.tok': 95 | filename = filename[:-4] 96 | if filename[-4:] == '.ilk': 97 | filename = filename[:-4] 98 | return os.path.exists(foliadir + filename) 99 | 100 | 101 | if __name__ == '__main__': 102 | sonardir = sys.argv[1] 103 | foliadir = sys.argv[2] 104 | threads = int(sys.argv[3]) 105 | if foliadir[-1] != '/': foliadir += '/' 106 | try: 107 | os.mkdir(foliadir[:-1]) 108 | except: 109 | pass 110 | 111 | print("Building index...") 112 | index = list(enumerate([ x for x in sonar.CorpusFiles(sonardir,'pos', "", lambda x: True, True) if not outputexists(x, sonardir, foliadir) ])) 113 | 114 | print("Processing...") 115 | p = Pool(threads) 116 | p.map(process, index ) 117 | 118 | -------------------------------------------------------------------------------- /pynlpl/formats/cgn.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - Corpus Gesproken Nederlands 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # Classes for reading CGN (still to be added). 
Most notably, contains a function for decoding 13 | # PoS features like "N(soort,ev,basis,onz,stan)" into a data structure. 14 | # 15 | ############################################################### 16 | 17 | from __future__ import print_function 18 | from __future__ import unicode_literals 19 | from __future__ import division 20 | from __future__ import absolute_import 21 | import sys 22 | if sys.version < '3': 23 | from codecs import getwriter 24 | stderr = getwriter('utf-8')(sys.stderr) 25 | stdout = getwriter('utf-8')(sys.stdout) 26 | else: 27 | stderr = sys.stderr 28 | stdout = sys.stdout 29 | 30 | from pynlpl.formats import folia 31 | from pynlpl.common import Enum 32 | 33 | 34 | class InvalidTagException(Exception): 35 | pass 36 | 37 | class InvalidFeatureException(Exception): 38 | pass 39 | 40 | subsets = { 41 | 'ntype': ['soort','eigen'], 42 | 'getal': ['ev','mv','getal',], 43 | 'genus': ['zijd','onz','masc','fem','genus'], 44 | 'naamval': ['stan','gen','dat','nomin','obl','bijz'], 45 | 'spectype': ['afgebr','afk','deeleigen','symb','vreemd','enof','meta','achter','comment','onverst'], 46 | 'conjtype': ['neven','onder'], 47 | 'vztype': ['init','versm','fin'], 48 | 'npagr': ['agr','evon','rest','evz','mv','agr3','evmo','rest3','evf'], 49 | 'lwtype': ['bep','onbep'], 50 | 'vwtype': ['pers','pr','refl','recip','bez','vb','vrag','betr','excl','aanw','onbep'], 51 | 'pdtype': ['adv-pron','pron','det','grad'], 52 | 'status': ['vol','red','nadr'], 53 | 'persoon': ['1','2','2v','2b','3','3p','3m','3v','3o','persoon'], 54 | 'positie': ['prenom','postnom', 'nom','vrij'], 55 | 'buiging': ['zonder','met-e','met-s'], 56 | 'getal-n' : ['zonder-v','mv-n','zonder-n'], 57 | 'graad' : ['basis','comp','sup','dim'], 58 | 'wvorm': ['pv','inf','vd','od'], 59 | 'pvtijd': ['tgw','verl','conj'], 60 | 'pvagr': ['ev','mv','met-t'], 61 | 'numtype': ['hoofd','rang'], 62 | 'dial': ['dial'], 63 | } 64 | constraints = { 65 | 'getal':['N','VNW'], 66 | 'npagr':['VNW','LID'], 67 | 'pvagr':['WW'], 68 | } 69 | 70 | def parse_cgn_postag(rawtag, raisefeatureexceptions = False): 71 | global subsets, constraints 72 | """decodes PoS features like "N(soort,ev,basis,onz,stan)" into a PosAnnotation data structure 73 | based on CGN tag overview compiled by Matje van de Camp""" 74 | 75 | 76 | begin = rawtag.find('(') 77 | if rawtag[-1] == ')' and begin > 0: 78 | tag = folia.PosAnnotation(None, cls=rawtag,set='http://ilk.uvt.nl/folia/sets/cgn') 79 | 80 | 81 | head = rawtag[0:begin] 82 | tag.append( folia.Feature, subset='head',cls=head) 83 | 84 | rawfeatures = rawtag[begin+1:-1].split(',') 85 | for rawfeature in rawfeatures: 86 | if rawfeature: 87 | found = False 88 | for subset, classes in subsets.items(): 89 | if rawfeature in classes: 90 | if subset in constraints: 91 | if not head in constraints[subset]: 92 | continue #constraint not met! 
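# At this point the raw feature value has been matched to a subset whose
# head-tag constraint (if any) is satisfied, so it is recorded as a
# folia.Feature on the tag and the search for this feature value stops.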
93 | found = True 94 | tag.append( folia.Feature, subset=subset,cls=rawfeature) 95 | break 96 | if not found: 97 | print("\t\tUnknown feature value: " + rawfeature + " in " + rawtag, file=stderr) 98 | if raisefeatureexceptions: 99 | raise InvalidFeatureException("Unknown feature value: " + rawfeature + " in " + rawtag) 100 | else: 101 | continue 102 | return tag 103 | else: 104 | raise InvalidTagException("Not a valid CGN tag") 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /pynlpl/clients/freeling.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - FreeLing Library 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Radboud University Nijmegen 6 | # 7 | # Licensed under GPLv3 8 | # 9 | # This is a Python library for on-the-fly communication with 10 | # a FreeLing server. Allowing on-the-fly lemmatisation and 11 | # PoS-tagging. It is recommended to pass your data on a 12 | # sentence-by-sentence basis to FreeLingClient.process() 13 | # 14 | # Make sure to start Freeling (analyzer) with the --server 15 | # and --flush flags !!!!! 16 | # 17 | ############################################################### 18 | 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | from __future__ import division 22 | from __future__ import absolute_import 23 | from pynlpl.common import u 24 | 25 | import socket 26 | import sys 27 | 28 | class FreeLingClient(object): 29 | def __init__(self, host, port, encoding='utf-8', timeout=120.0): 30 | """Initialise the client, set channel to the path and filename where the server's .in and .out pipes are (without extension)""" 31 | self.encoding = encoding 32 | self.BUFSIZE = 10240 33 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) 34 | self.socket.settimeout(timeout) 35 | self.socket.connect( (host,int(port)) ) 36 | self.encoding = encoding 37 | self.socket.sendall('RESET_STATS\0') 38 | r = self.socket.recv(self.BUFSIZE) 39 | if not r.strip('\0') == 'FL-SERVER-READY': 40 | raise Exception("Server not ready") 41 | 42 | 43 | def process(self, sourcewords, debug=False): 44 | """Process a list of words, passing it to the server and realigning the output with the original words""" 45 | 46 | if isinstance( sourcewords, list ) or isinstance( sourcewords, tuple ): 47 | sourcewords_s = " ".join(sourcewords) 48 | else: 49 | sourcewords_s = sourcewords 50 | sourcewords = sourcewords.split(' ') 51 | 52 | self.socket.sendall(sourcewords_s.encode(self.encoding) +'\n\0') 53 | if debug: print("Sent:",sourcewords_s.encode(self.encoding),file=sys.stderr) 54 | 55 | results = [] 56 | done = False 57 | while not done: 58 | data = b"" 59 | while not data: 60 | buffer = self.socket.recv(self.BUFSIZE) 61 | if debug: print("Buffer: ["+repr(buffer)+"]",file=sys.stderr) 62 | if buffer[-1] == '\0': 63 | data += buffer[:-1] 64 | done = True 65 | break 66 | else: 67 | data += buffer 68 | 69 | 70 | data = u(data,self.encoding) 71 | if debug: print("Received:",data,file=sys.stderr) 72 | 73 | for i, line in enumerate(data.strip(' \t\0\r\n').split('\n')): 74 | if not line.strip(): 75 | done = True 76 | break 77 | else: 78 | cols = line.split(" ") 79 | subwords = cols[0].lower().split("_") 80 | if len(cols) > 2: #this seems a bit odd? 
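# FreeLing may return a multiword expression as a single token with its parts
# joined by underscores; it is split into its component words here so that each
# part can later be realigned with the original input tokens in the alignment
# step below.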
81 | for word in subwords: #split multiword expressions 82 | results.append( (word, cols[1], cols[2], i, len(subwords) > 1 ) ) #word, lemma, pos, index, multiword? 83 | 84 | sourcewords = [ w.lower() for w in sourcewords ] 85 | 86 | alignment = [] 87 | for i, sourceword in enumerate(sourcewords): 88 | found = False 89 | best = 0 90 | distance = 999999 91 | for j, (targetword, lemma, pos, index, multiword) in enumerate(results): 92 | if sourceword == targetword and abs(i-j) < distance: 93 | found = True 94 | best = j 95 | distance = abs(i-j) 96 | 97 | if found: 98 | alignment.append(results[best]) 99 | else: 100 | alignment.append((None,None,None,None,False)) #no alignment found 101 | return alignment 102 | 103 | -------------------------------------------------------------------------------- /pynlpl/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | ###############################################################9 5 | # PyNLPl - Common functions 6 | # by Maarten van Gompel 7 | # Centre for Language Studies 8 | # Radboud University Nijmegen 9 | # http://www.github.com/proycon/pynlpl 10 | # proycon AT anaproy DOT nl 11 | # 12 | # Licensed under GPLv3 13 | # 14 | # This contains very common functions and language extensions 15 | # 16 | ############################################################### 17 | 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | from __future__ import division 21 | from __future__ import absolute_import 22 | 23 | import datetime 24 | from sys import stderr, version 25 | 26 | ## From http://code.activestate.com/recipes/413486/ (r7) 27 | def Enum(*names): 28 | ##assert names, "Empty enums are not supported" # <- Don't like empty enums? Uncomment! 29 | 30 | class EnumClass(object): 31 | __slots__ = names 32 | def __iter__(self): return iter(constants) 33 | def __len__(self): return len(constants) 34 | def __getitem__(self, i): return constants[i] 35 | def __repr__(self): return 'Enum' + str(names) 36 | def __str__(self): return 'enum ' + str(constants) 37 | 38 | class EnumValue(object): 39 | __slots__ = ('__value') 40 | def __init__(self, value): self.__value = value 41 | Value = property(lambda self: self.__value) 42 | EnumType = property(lambda self: EnumType) 43 | def __hash__(self): return hash(self.__value) 44 | def __cmp__(self, other): 45 | # C fans might want to remove the following assertion 46 | # to make all enums comparable by ordinal value {;)) 47 | assert self.EnumType is other.EnumType, "Only values from the same enum are comparable" 48 | return cmp(self.__value, other.__value) 49 | def __invert__(self): return constants[maximum - self.__value] 50 | def __bool__(self): return bool(self.__value) 51 | def __nonzero__(self): return bool(self.__value) #Python 2.x 52 | def __repr__(self): return str(names[self.__value]) 53 | 54 | maximum = len(names) - 1 55 | constants = [None] * len(names) 56 | for i, each in enumerate(names): 57 | val = EnumValue(i) 58 | setattr(EnumClass, each, val) 59 | constants[i] = val 60 | constants = tuple(constants) 61 | EnumType = EnumClass() 62 | return EnumType 63 | 64 | 65 | def u(s, encoding = 'utf-8', errors='strict'): 66 | #ensure s is properly unicode.. 
wrapper for python 2.6/2.7, 67 | if version < '3': 68 | #ensure the object is unicode 69 | if isinstance(s, unicode): 70 | return s 71 | else: 72 | return unicode(s, encoding,errors=errors) 73 | else: 74 | #will work on byte arrays 75 | if isinstance(s, str): 76 | return s 77 | else: 78 | return str(s,encoding,errors=errors) 79 | 80 | def b(s): 81 | #ensure s is bytestring 82 | if version < '3': 83 | #ensure the object is unicode 84 | if isinstance(s, str): 85 | return s 86 | else: 87 | return s.encode('utf-8') 88 | else: 89 | #will work on byte arrays 90 | if isinstance(s, bytes): 91 | return s 92 | else: 93 | return s.encode('utf-8') 94 | 95 | def isstring(s): #Is this a proper string? 96 | return isinstance(s, str) or (version < '3' and isinstance(s, unicode)) 97 | 98 | def log(msg, **kwargs): 99 | """Generic log method. Will prepend timestamp. 100 | 101 | Keyword arguments: 102 | system - Name of the system/module 103 | indent - Integer denoting the desired level of indentation 104 | streams - List of streams to output to 105 | stream - Stream to output to (singleton version of streams) 106 | """ 107 | if 'debug' in kwargs: 108 | if 'currentdebug' in kwargs: 109 | if kwargs['currentdebug'] < kwargs['debug']: 110 | return False 111 | else: 112 | return False #no currentdebug passed, assuming no debug mode and thus skipping message 113 | 114 | s = "[" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "] " 115 | if 'system' in kwargs: 116 | s += "[" + system + "] " 117 | 118 | 119 | if 'indent' in kwargs: 120 | s += ("\t" * int(kwargs['indent'])) 121 | 122 | s += u(msg) 123 | 124 | if s[-1] != '\n': 125 | s += '\n' 126 | 127 | if 'streams' in kwargs: 128 | streams = kwargs['streams'] 129 | elif 'stream' in kwargs: 130 | streams = [kwargs['stream']] 131 | else: 132 | streams = [stderr] 133 | 134 | for stream in streams: 135 | stream.write(s) 136 | return s 137 | -------------------------------------------------------------------------------- /pynlpl/formats/timbl.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - Timbl Classifier Output Library 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Induction for Linguistic Knowledge Research Group 6 | # Universiteit van Tilburg 7 | # 8 | # Derived from code by Sander Canisius 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # This library offers a TimblOutput class for reading Timbl 13 | # classifier output. 
It supports full distributions (+v+db) and comment (#) 14 | # 15 | ############################################################### 16 | 17 | 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | from __future__ import division 21 | from __future__ import absolute_import 22 | import sys 23 | if sys.version < '3': 24 | from codecs import getwriter 25 | stderr = getwriter('utf-8')(sys.stderr) 26 | stdout = getwriter('utf-8')(sys.stdout) 27 | else: 28 | stderr = sys.stderr 29 | stdout = sys.stdout 30 | 31 | from pynlpl.statistics import Distribution 32 | 33 | 34 | class TimblOutput(object): 35 | """A class for reading Timbl classifier output, supports the +v+db option and ignores comments starting with #""" 36 | 37 | def __init__(self, stream, delimiter = ' ', ignorecolumns = [], ignorevalues = []): 38 | self.stream = stream 39 | self.delimiter = delimiter 40 | self.ignorecolumns = ignorecolumns #numbers, ignore the specified FEATURE columns: first column is 1 41 | self.ignorevalues = ignorevalues #Ignore columns with the following values 42 | 43 | def __iter__(self): 44 | # Note: distance parsing (+v+di) works only if distributions (+v+db) are also enabled! 45 | for line in self.stream: 46 | endfvec = None 47 | line = line.strip() 48 | if line and line[0] != '#': #ignore empty lines and comments 49 | segments = [ x for i, x in enumerate(line.split(self.delimiter)) if x not in self.ignorevalues and i+1 not in self.ignorecolumns ] 50 | 51 | #segments = [ x for x in line.split() if x != "^" and not (len(x) == 3 and x[0:2] == "n=") ] #obtain segments, and filter null fields and "n=?" feature (in fixed-feature configuration) 52 | 53 | 54 | if not endfvec: 55 | try: 56 | # Modified by Ruben. There are some cases where one of the features is a {, and then 57 | # the module is not able to obtain the distribution of scores and senses 58 | # We have to look for the last { in the vector, and due to there is no rindex method 59 | # we obtain the reverse and then apply index. 60 | aux=list(reversed(segments)).index("{") 61 | endfvec=len(segments)-aux-1 62 | #endfvec = segments.index("{") 63 | except ValueError: 64 | endfvec = None 65 | 66 | if endfvec and endfvec > 2: # only for +v+db 67 | try: 68 | enddistr = segments.index('}',endfvec) 69 | except ValueError: 70 | raise 71 | distribution = self.parseDistribution(segments, endfvec, enddistr) 72 | if len(segments) > enddistr + 1: 73 | distance = float(segments[-1]) 74 | else: 75 | distance = None 76 | else: 77 | endfvec = len(segments) 78 | distribution = None 79 | distance = None 80 | 81 | #features, referenceclass, predictedclass, distribution, distance 82 | yield segments[:endfvec - 2], segments[endfvec - 2], segments[endfvec - 1], distribution, distance 83 | 84 | 85 | def parseDistribution(self, instance, start,end= None): 86 | dist = {} 87 | i = start + 1 88 | 89 | if not end: 90 | end = len(instance) - 1 91 | 92 | while i < end: #instance[i] != "}": 93 | label = instance[i] 94 | try: 95 | score = float(instance[i+1].rstrip(",")) 96 | dist[label] = score 97 | except: 98 | print("ERROR: pynlpl.input.timbl.TimblOutput -- Could not fetch score for class '" + label + "', expected float, but found '"+instance[i+1].rstrip(",")+"'. Instance= " + " ".join(instance)+ ".. 
Attempting to compensate...",file=stderr) 99 | i = i - 1 100 | i += 2 101 | 102 | 103 | if not dist: 104 | print("ERROR: pynlpl.input.timbl.TimblOutput -- Did not find class distribution for ", instance,file=stderr) 105 | 106 | return Distribution(dist) 107 | -------------------------------------------------------------------------------- /pynlpl/fsa.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - Finite State Automata 3 | # by Maarten van Gompel 4 | # Centre for Language Studies 5 | # Radboud University Nijmegen 6 | # http://proycon.github.com/folia 7 | # http://www.github.com/proycon/pynlpl 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Partially based/inspired on code by Xiayun Sun (https://github.com/xysun/regex) 11 | # 12 | # Licensed under GPLv3 13 | # 14 | #---------------------------------------------------------------- 15 | from __future__ import print_function, unicode_literals, division, absolute_import 16 | import sys 17 | 18 | 19 | class State(object): 20 | def __init__(self, **kwargs): 21 | if 'epsilon' in kwargs: 22 | self.epsilon = kwargs['epsilon'] # epsilon-closure (lis of states) 23 | else: 24 | self.epsilon = [] # epsilon-closure 25 | if 'transitions' in kwargs: 26 | self.transitions = kwargs['transitions'] 27 | else: 28 | self.transitions = [] #(matchitem, matchfunction(value), state) 29 | if 'final' in kwargs: 30 | self.final = bool(kwargs['final']) # ending state 31 | else: 32 | self.final = False 33 | self.transitioned = None #will be a tuple (state, matchitem) indicating how this state was reached 34 | 35 | 36 | 37 | class NFA(object): 38 | """Non-deterministic finite state automaton. Can be used to model DFAs as well if your state transitions are not ambiguous and epsilon is empty.""" 39 | 40 | def __init__(self, initialstate): 41 | self.initialstate = initialstate 42 | 43 | def run(self, sequence, mustmatchall=False,debug=False): 44 | def add(state, states): 45 | """add state and recursively add epsilon transitions""" 46 | assert isinstance(state, State) 47 | if state in states: 48 | return 49 | states.add(state) 50 | for eps in state.epsilon: #recurse into epsilon transitions 51 | add(eps, states) 52 | 53 | current_states = set() 54 | add(self.initialstate, current_states) 55 | if debug: print("Starting run, current states: ", repr(current_states),file=sys.stderr) 56 | 57 | for offset, value in enumerate(sequence): 58 | if not current_states: break 59 | if debug: print("Value: ", repr(value),file=sys.stderr) 60 | next_states = set() 61 | for state in current_states: 62 | for matchitem, matchfunction, trans_state in state.transitions: 63 | if matchfunction(value): 64 | trans_state.transitioned = (state, matchitem) 65 | add(trans_state, next_states) 66 | 67 | current_states = next_states 68 | if debug: print("Current states: ", repr(current_states),file=sys.stderr) 69 | if not mustmatchall: 70 | for s in current_states: 71 | if s.final: 72 | if debug: print("Final state reached",file=sys.stderr) 73 | yield offset+1 74 | 75 | if mustmatchall: 76 | for s in current_states: 77 | if s.final: 78 | if debug: print("Final state reached",file=sys.stderr) 79 | yield offset+1 80 | 81 | 82 | def match(self, sequence): 83 | try: 84 | return next(self.run(sequence,True)) == len(sequence) 85 | except StopIteration: 86 | return False 87 | 88 | def find(self, sequence, debug=False): 89 | l = len(sequence) 90 | for i in range(0,l): 91 | for length in 
self.run(sequence[i:], False, debug): 92 | yield sequence[i:i+length] 93 | 94 | def __iter__(self): 95 | return iter(self._states(self.initialstate)) 96 | 97 | def _states(self, state, processedstates=[]): #pylint: disable=dangerous-default-value 98 | """Iterate over all states in no particular order""" 99 | processedstates.append(state) 100 | 101 | for nextstate in state.epsilon: 102 | if not nextstate in processedstates: 103 | self._states(nextstate, processedstates) 104 | 105 | for _, nextstate in state.transitions: 106 | if not nextstate in processedstates: 107 | self._states(nextstate, processedstates) 108 | 109 | return processedstates 110 | 111 | def __repr__(self): 112 | out = [] 113 | for state in self: 114 | staterep = repr(state) 115 | if state is self.initialstate: 116 | staterep += " (INITIAL)" 117 | for nextstate in state.epsilon: 118 | nextstaterep = repr(nextstate) 119 | if nextstate.final: 120 | nextstaterep += " (FINAL)" 121 | out.append( staterep + " -e-> " + nextstaterep ) 122 | for item, _, nextstate in state.transitions: 123 | nextstaterep = repr(nextstate) 124 | if nextstate.final: 125 | nextstaterep += " (FINAL)" 126 | out.append( staterep + " -(" + repr(item) + ")-> " + nextstaterep ) 127 | 128 | return "\n".join(out) 129 | -------------------------------------------------------------------------------- /pynlpl/tests/cql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Test Units for CQL using Finite State Automata 6 | # by Maarten van Gompel, Radboud University Nijmegen 7 | # proycon AT anaproy DOT nl 8 | # 9 | # Licensed under GPLv3 10 | #---------------------------------------------------------------- 11 | 12 | 13 | from __future__ import print_function 14 | from __future__ import unicode_literals 15 | from __future__ import division 16 | from __future__ import absolute_import 17 | import sys 18 | if sys.version < '3': 19 | from codecs import getwriter 20 | stderr = getwriter('utf-8')(sys.stderr) 21 | stdout = getwriter('utf-8')(sys.stdout) 22 | else: 23 | stderr = sys.stderr 24 | stdout = sys.stdout 25 | 26 | import sys 27 | import unittest 28 | from pynlpl.formats import cql 29 | 30 | tokens = [ 31 | { 32 | 'word': 'This', 33 | 'lemma': 'this', 34 | 'pos': 'det', 35 | }, 36 | { 37 | 'word': 'is', 38 | 'lemma': 'be', 39 | 'pos': 'v', 40 | }, 41 | { 42 | 'word': 'a', 43 | 'lemma': 'a', 44 | 'pos': 'det', 45 | }, 46 | { 47 | 'word': 'first', 48 | 'lemma': 'first', 49 | 'pos': 'a', 50 | }, 51 | { 52 | 'word': 'test', 53 | 'lemma': 'test', 54 | 'pos': 'n', 55 | }, 56 | { 57 | 'word': 'of', 58 | 'lemma': 'dit', 59 | 'pos': 'prep', 60 | }, 61 | { 62 | 'word': 'the', 63 | 'lemma': 'the', 64 | 'pos': 'det', 65 | }, 66 | { 67 | 'word': 'new', 68 | 'lemma': 'new', 69 | 'pos': 'a', 70 | }, 71 | { 72 | 'word': 'module', 73 | 'lemma': 'module', 74 | 'pos': 'n', 75 | }, 76 | { 77 | 'word': '.', 78 | 'lemma': '.', 79 | 'pos': 'punc', 80 | }, 81 | ] 82 | 83 | 84 | class Test1(unittest.TestCase): 85 | 86 | def test1(self): 87 | q = cql.Query("\"the\"") 88 | result = q(tokens) 89 | self.assertEqual(len(result),1) #one result 90 | self.assertEqual(len(result[0]),1) #result 1 consists of one word 91 | self.assertEqual(result[0][0]['word'],"the") 92 | 93 | def test2(self): 94 | q = cql.Query("[ pos = \"det\" ]") 95 | result = q(tokens) 96 | self.assertEqual(len(result),3) 97 | 
self.assertEqual(result[0][0]['word'],"This") 98 | self.assertEqual(result[1][0]['word'],"a") 99 | self.assertEqual(result[2][0]['word'],"the") 100 | 101 | def test3(self): 102 | q = cql.Query("[ pos = \"det\" ] [ pos = \"a\" ] [ pos = \"n\" ]") 103 | result = q(tokens) 104 | self.assertEqual(len(result),2) 105 | self.assertEqual(result[0][0]['word'],"a") 106 | self.assertEqual(result[0][1]['word'],"first") 107 | self.assertEqual(result[0][2]['word'],"test") 108 | self.assertEqual(result[1][0]['word'],"the") 109 | self.assertEqual(result[1][1]['word'],"new") 110 | self.assertEqual(result[1][2]['word'],"module") 111 | 112 | def test4(self): 113 | q = cql.Query("[ pos = \"det\" ] [ pos = \"a\" ]? [ pos = \"n\" ]") 114 | result = q(tokens) 115 | self.assertEqual(len(result),2) 116 | self.assertEqual(result[0][0]['word'],"a") 117 | self.assertEqual(result[0][1]['word'],"first") 118 | self.assertEqual(result[0][2]['word'],"test") 119 | self.assertEqual(result[1][0]['word'],"the") 120 | self.assertEqual(result[1][1]['word'],"new") 121 | self.assertEqual(result[1][2]['word'],"module") 122 | 123 | def test5(self): 124 | q = cql.Query("[ pos = \"det\" ] []? [ pos = \"n\" ]") 125 | result = q(tokens) 126 | self.assertEqual(len(result),2) 127 | self.assertEqual(result[0][0]['word'],"a") 128 | self.assertEqual(result[0][1]['word'],"first") 129 | self.assertEqual(result[0][2]['word'],"test") 130 | self.assertEqual(result[1][0]['word'],"the") 131 | self.assertEqual(result[1][1]['word'],"new") 132 | self.assertEqual(result[1][2]['word'],"module") 133 | 134 | def test6(self): 135 | q = cql.Query("[ pos = \"det\" ] []+ [ pos = \"n\" ]") 136 | result = q(tokens) 137 | self.assertEqual(len(result),2) 138 | self.assertEqual(result[0][0]['word'],"a") 139 | self.assertEqual(result[0][1]['word'],"first") 140 | self.assertEqual(result[0][2]['word'],"test") 141 | self.assertEqual(result[1][0]['word'],"the") 142 | self.assertEqual(result[1][1]['word'],"new") 143 | self.assertEqual(result[1][2]['word'],"module") 144 | 145 | def test7(self): 146 | q = cql.Query("[ pos = \"det\" ] []* [ pos = \"n\" ]") 147 | result = q(tokens) 148 | self.assertEqual(len(result),2) 149 | self.assertEqual(result[0][0]['word'],"a") 150 | self.assertEqual(result[0][1]['word'],"first") 151 | self.assertEqual(result[0][2]['word'],"test") 152 | self.assertEqual(result[1][0]['word'],"the") 153 | self.assertEqual(result[1][1]['word'],"new") 154 | self.assertEqual(result[1][2]['word'],"module") 155 | 156 | if __name__ == '__main__': 157 | unittest.main() 158 | -------------------------------------------------------------------------------- /pynlpl/formats/taggerdata.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - Read tagger data 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # 13 | ############################################################### 14 | 15 | from __future__ import print_function 16 | from __future__ import unicode_literals 17 | from __future__ import division 18 | from __future__ import absolute_import 19 | 20 | import io 21 | 22 | class Taggerdata(object): 23 | def __init__(self,filename, encoding = 'utf-8', mode ='r'): 24 | self.filename = filename 25 | self.encoding = encoding 26 | assert (mode == 'r' or mode == 'w') 27 | 
self.mode = mode 28 | self.reset() 29 | self.firstiter = True 30 | self.indexed = False 31 | self.writeindex = 0 32 | 33 | def __iter__(self): 34 | words = [] 35 | lemmas = [] 36 | postags = [] 37 | for line in self.f: 38 | line = line.strip() 39 | if self.firstiter: 40 | self.indexed = (line == "#0") 41 | self.firstiter = False 42 | if not line and not self.indexed: 43 | yield (words, lemmas, postags) 44 | words = [] 45 | lemmas = [] 46 | postags = [] 47 | elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit(): 48 | if line != "#0": 49 | yield (words, lemmas, postags) 50 | words = [] 51 | lemmas = [] 52 | postags = [] 53 | elif line: 54 | try: 55 | word, lemma, pos = line.split("\t") 56 | except: 57 | word = lemma = pos = "NONE" 58 | if word == "NONE": word = None 59 | if lemma == "NONE": lemma = None 60 | if pos == "NONE": pos = None 61 | words.append(word) 62 | lemmas.append(lemma) 63 | postags.append(pos) 64 | if words: 65 | yield (words, lemmas, postags) 66 | 67 | def next(self): 68 | words = [] 69 | lemmas = [] 70 | postags = [] 71 | while True: 72 | try: 73 | line = self.f.next().strip() 74 | except StopIteration: 75 | if words: 76 | return (words, lemmas, postags) 77 | else: 78 | raise 79 | if self.firstiter: 80 | self.indexed = (line == "#0") 81 | self.firstiter = False 82 | if not line and not self.indexed: 83 | return (words, lemmas, postags) 84 | elif self.indexed and len(line) > 1 and line[0] == '#' and line[1:].isdigit(): 85 | if line != "#0": 86 | return (words, lemmas, postags) 87 | elif line: 88 | try: 89 | word, lemma, pos = line.split("\t") 90 | except: 91 | word = lemma = pos = "NONE" 92 | if word == "NONE": word = None 93 | if lemma == "NONE": lemma = None 94 | if pos == "NONE": pos = None 95 | words.append(word) 96 | lemmas.append(lemma) 97 | postags.append(pos) 98 | 99 | def align(self, referencewords, datatuple): 100 | """align the reference sentence with the tagged data""" 101 | targetwords = [] 102 | for i, (word,lemma,postag) in enumerate(zip(datatuple[0],datatuple[1],datatuple[2])): 103 | if word: 104 | subwords = word.split("_") 105 | for w in subwords: #split multiword expressions 106 | targetwords.append( (w, lemma, postag, i, len(subwords) > 1 ) ) #word, lemma, pos, index, multiword? 
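# The alignment below is a greedy nearest-position match: each (lowercased)
# reference word is compared against the tagger's output words, and an
# identical word closest to the same position wins; reference words without
# any match are aligned to a (None, None, None, None, False) placeholder.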
107 | 108 | referencewords = [ w.lower() for w in referencewords ] 109 | alignment = [] 110 | for i, referenceword in enumerate(referencewords): 111 | found = False 112 | best = 0 113 | distance = 999999 114 | for j, (targetword, lemma, pos, index, multiword) in enumerate(targetwords): 115 | if referenceword == targetword and abs(i-j) < distance: 116 | found = True 117 | best = j 118 | distance = abs(i-j) 119 | 120 | if found: 121 | alignment.append(targetwords[best]) 122 | else: 123 | alignment.append((None,None,None,None,False)) #no alignment found 124 | 125 | return alignment 126 | 127 | def reset(self): 128 | self.f = io.open(self.filename,self.mode, encoding=self.encoding) 129 | 130 | 131 | def write(self, sentence): 132 | self.f.write("#" + str(self.writeindex)+"\n") 133 | for word, lemma, pos in sentence: 134 | if not word: word = "NONE" 135 | if not lemma: lemma = "NONE" 136 | if not pos: pos = "NONE" 137 | self.f.write( word + "\t" + lemma + "\t" + pos + "\n" ) 138 | self.writeindex += 1 139 | 140 | def close(self): 141 | self.f.close() 142 | 143 | -------------------------------------------------------------------------------- /pynlpl/net.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | #--------------------------------------------------------------- 4 | # PyNLPl - Network utilities 5 | # by Maarten van Gompel 6 | # Centre for Language Studies 7 | # Radboud University Nijmegen 8 | # http://www.github.com/proycon/pynlpl 9 | # proycon AT anaproy DOT nl 10 | # 11 | # Generic Server for Language Models 12 | # 13 | #---------------------------------------------------------------- 14 | 15 | from __future__ import print_function 16 | from __future__ import unicode_literals 17 | from __future__ import division 18 | from __future__ import absolute_import 19 | from pynlpl.common import u,b 20 | import sys 21 | if sys.version < '3': 22 | from codecs import getwriter 23 | stderr = getwriter('utf-8')(sys.stderr) 24 | stdout = getwriter('utf-8')(sys.stdout) 25 | else: 26 | stderr = sys.stderr 27 | stdout = sys.stdout 28 | from twisted.internet import protocol, reactor # will fail on Python 3 for now 29 | from twisted.protocols import basic 30 | import shlex 31 | 32 | 33 | 34 | class GWSNetProtocol(basic.LineReceiver): 35 | def connectionMade(self): 36 | print("Client connected", file=stderr) 37 | self.factory.connections += 1 38 | if self.factory.connections < 1: 39 | self.transport.loseConnection() 40 | else: 41 | self.sendLine(b("READY")) 42 | 43 | def lineReceived(self, line): 44 | try: 45 | if sys.version >= '3' and isinstance(line,bytes): 46 | print("Client in: " + str(line,'utf-8'),file=stderr) 47 | else: 48 | print("Client in: " + line,file=stderr) 49 | except UnicodeDecodeError: 50 | print("Client in: (unicodeerror)",file=stderr) 51 | if sys.version < '3': 52 | if isinstance(line,unicode): 53 | self.factory.processprotocol.transport.write(line.encode('utf-8')) 54 | else: 55 | self.factory.processprotocol.transport.write(line) 56 | self.factory.processprotocol.transport.write(b('\n')) 57 | else: 58 | self.factory.processprotocol.transport.write(b(line) + b('\n')) 59 | self.factory.processprotocol.currentclient = self 60 | 61 | def connectionLost(self, reason): 62 | self.factory.connections -= 1 63 | if self.factory.processprotocol.currentclient == self: 64 | self.factory.processprotocol.currentclient = None 65 | 66 | class GWSFactory(protocol.ServerFactory): 67 | protocol = GWSNetProtocol 68 | 69 | def __init__(self, 
processprotocol): 70 | self.connections = 0 71 | self.processprotocol = processprotocol 72 | 73 | 74 | class GWSProcessProtocol(protocol.ProcessProtocol): 75 | def __init__(self, printstderr=True, sendstderr= False, filterout = None, filtererr = None): 76 | self.currentclient = None 77 | self.printstderr = printstderr 78 | self.sendstderr = sendstderr 79 | if not filterout: 80 | self.filterout = lambda x: x 81 | else: 82 | self.filterout = filterout 83 | if not filtererr: 84 | self.filtererr = lambda x: x 85 | else: 86 | self.filtererr = filtererr 87 | 88 | def connectionMade(self): 89 | pass 90 | 91 | def outReceived(self, data): 92 | try: 93 | if sys.version >= '3' and isinstance(data,bytes): 94 | print("Process out " + str(data, 'utf-8'),file=stderr) 95 | else: 96 | print("Process out " + data,file=stderr) 97 | except UnicodeDecodeError: 98 | print("Process out (unicodeerror)",file=stderr) 99 | print("DEBUG:", repr(b(data).strip().split(b('\n')))) 100 | for line in b(data).strip().split(b('\n')): 101 | line = self.filterout(line.strip()) 102 | if self.currentclient and line: 103 | self.currentclient.sendLine(b(line)) 104 | 105 | def errReceived(self, data): 106 | try: 107 | if sys.version >= '3' and isinstance(data,bytes): 108 | print("Process err " + str(data,'utf-8'), file=sys.stderr) 109 | else: 110 | print("Process err " + data,file=stderr) 111 | except UnicodeDecodeError: 112 | print("Process out (unicodeerror)",file=stderr) 113 | if self.printstderr and data: 114 | print(data.strip(),file=stderr) 115 | for line in b(data).strip().split(b('\n')): 116 | line = self.filtererr(line.strip()) 117 | if self.sendstderr and self.currentclient and line: 118 | self.currentclient.sendLine(b(line)) 119 | 120 | 121 | def processExited(self, reason): 122 | print("Process exited",file=stderr) 123 | 124 | 125 | def processEnded(self, reason): 126 | print("Process ended",file=stderr) 127 | if self.currentclient: 128 | self.currentclient.transport.loseConnection() 129 | reactor.stop() 130 | 131 | 132 | class GenericWrapperServer: 133 | """Generic Server around a stdin/stdout based CLI tool. 
Only accepts one client at a time to prevent concurrency issues !!!!!""" 134 | def __init__(self, cmdline, port, printstderr= True, sendstderr= False, filterout = None, filtererr = None): 135 | gwsprocessprotocol = GWSProcessProtocol(printstderr, sendstderr, filterout, filtererr) 136 | cmdline = shlex.split(cmdline) 137 | reactor.spawnProcess(gwsprocessprotocol, cmdline[0], cmdline) 138 | 139 | gwsfactory = GWSFactory(gwsprocessprotocol) 140 | reactor.listenTCP(port, gwsfactory) 141 | reactor.run() 142 | -------------------------------------------------------------------------------- /pynlpl/tests/evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | #--------------------------------------------------------------- 5 | # PyNLPl - Test Units for Evaluation 6 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 7 | # http://ilk.uvt.nl/~mvgompel 8 | # proycon AT anaproy DOT nl 9 | # 10 | # Licensed under GPLv3 11 | # 12 | #------------------------------------------------------------- 13 | 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | from __future__ import division 17 | from __future__ import absolute_import 18 | from pynlpl.common import u 19 | 20 | import sys 21 | import os 22 | import unittest 23 | import random 24 | 25 | from pynlpl.evaluation import AbstractExperiment, WPSParamSearch, ExperimentPool, ClassEvaluation, OrdinalEvaluation 26 | 27 | class ParamExperiment(AbstractExperiment): 28 | def defaultparameters(self): 29 | return {'a':1,'b':1,'c':1} 30 | 31 | def run(self): 32 | self.result = 0 33 | for line in self.inputdata: 34 | self.result += int(line) * self.parameters['a'] * self.parameters['b'] - self.parameters['c'] 35 | 36 | def score(self): 37 | return self.result 38 | 39 | @staticmethod 40 | def sample(inputdata,n): 41 | n = int(n) 42 | if n > len(inputdata): 43 | return inputdata 44 | else: 45 | return random.sample(inputdata,int(n)) 46 | 47 | class PoolExperiment(AbstractExperiment): 48 | def start(self): 49 | self.startcommand('sleep',None,None,None,str(self.parameters['duration'])) 50 | print("STARTING: sleep " + str(self.parameters['duration'])) 51 | 52 | 53 | class WPSTest(unittest.TestCase): 54 | def test_wps(self): 55 | inputdata = [ 1,2,3,4,5,6 ] 56 | parameterscope = [ ('a',[2,4]), ('b',[2,5,8]), ('c',[3,6,9]) ] 57 | search = WPSParamSearch(ParamExperiment, inputdata, len(inputdata), parameterscope) 58 | solution = search.searchbest() 59 | self.assertEqual(solution, (('a', 4), ('b', 8), ('c', 3)) ) 60 | 61 | 62 | 63 | class ExperimentPoolTest(unittest.TestCase): 64 | def test_pool(self): 65 | pool = ExperimentPool(4) 66 | for i in range(0,15): 67 | pool.append( PoolExperiment(None, duration=random.randint(1,6)) ) 68 | for experiment in pool.run(): 69 | print("DONE: sleep " + str(experiment.parameters['duration'])) 70 | 71 | self.assertTrue(True) #if we got here, no exceptions were raised and it's okay 72 | 73 | class ClassEvaluationTest2(unittest.TestCase): 74 | def setUp(self): 75 | self.goals = ['sun','sun','rain','cloudy','sun','rain'] 76 | self.observations = ['cloudy','cloudy','cloudy','rain','sun','sun'] 77 | 78 | 79 | def test001(self): 80 | e = ClassEvaluation(self.goals, self.observations) 81 | print() 82 | print(e) 83 | print(e.confusionmatrix()) 84 | 85 | class OrdinalEvaluationTest(unittest.TestCase): 86 | def setUp(self): 87 | self.goals = [1,2,3,4,3,2] 88 | self.observations = [4,1,3,4,2,2] 89 | 90 | 
def test001(self): 91 | oe = OrdinalEvaluation(self.goals,self.observations) 92 | print(oe.mae()) 93 | print(oe.mae(2)) 94 | print(oe.rmse()) 95 | print(oe.rmse(4)) 96 | 97 | class ClassEvaluationTest(unittest.TestCase): 98 | def setUp(self): 99 | self.goals = ['cat','cat','cat','cat','cat','cat','cat','cat', 'dog', 'dog','dog','dog','dog','dog' ,'rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit'] 100 | self.observations = ['cat','cat','cat','cat','cat','dog','dog','dog', 'cat','cat','rabbit','dog','dog','dog' ,'rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','rabbit','dog','dog'] 101 | 102 | 103 | def test001(self): 104 | """Class evaluation test -- (See also http://en.wikipedia.org/wiki/Confusion_matrix , using same data)""" 105 | e = ClassEvaluation(self.goals, self.observations) 106 | 107 | print 108 | print(e) 109 | print(e.confusionmatrix()) 110 | 111 | 112 | self.assertEqual(e.tp['cat'], 5) 113 | self.assertEqual(e.fp['cat'], 2) 114 | self.assertEqual(e.tn['cat'], 17) 115 | self.assertEqual(e.fn['cat'], 3) 116 | 117 | self.assertEqual(e.tp['rabbit'], 11) 118 | self.assertEqual(e.fp['rabbit'], 1) 119 | self.assertEqual(e.tn['rabbit'], 13) 120 | self.assertEqual(e.fn['rabbit'], 2) 121 | 122 | self.assertEqual(e.tp['dog'], 3) 123 | self.assertEqual(e.fp['dog'], 5) 124 | self.assertEqual(e.tn['dog'], 16) 125 | self.assertEqual(e.fn['dog'], 3) 126 | 127 | self.assertEqual( round(e.precision('cat'),6), 0.714286) 128 | self.assertEqual( round(e.precision('rabbit'),6), 0.916667) 129 | self.assertEqual( round(e.precision('dog'),6), 0.375000) 130 | 131 | self.assertEqual( round(e.recall('cat'),6), 0.625000) 132 | self.assertEqual( round(e.recall('rabbit'),6), 0.846154) 133 | self.assertEqual( round(e.recall('dog'),6),0.500000) 134 | 135 | self.assertEqual( round(e.fscore('cat'),6), 0.666667) 136 | self.assertEqual( round(e.fscore('rabbit'),6), 0.880000) 137 | self.assertEqual( round(e.fscore('dog'),6),0.428571) 138 | 139 | self.assertEqual( round(e.accuracy(),6), 0.703704) 140 | 141 | 142 | 143 | if __name__ == '__main__': 144 | unittest.main() 145 | 146 | 147 | -------------------------------------------------------------------------------- /pynlpl/tests/textprocessors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | #--------------------------------------------------------------- 6 | # PyNLPl - Test Units for Text Processors 7 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 8 | # http://ilk.uvt.nl/~mvgompel 9 | # proycon AT anaproy DOT nl 10 | # 11 | # Licensed under GPLv3 12 | # 13 | #---------------------------------------------------------------- 14 | 15 | from __future__ import print_function 16 | from __future__ import unicode_literals 17 | from __future__ import division 18 | from __future__ import absolute_import 19 | 20 | import sys 21 | import os 22 | import unittest 23 | 24 | from pynlpl.textprocessors import Windower, tokenise, strip_accents, calculate_overlap 25 | 26 | text = "This is a test .".split(" ") 27 | 28 | class WindowerTest(unittest.TestCase): 29 | def test_unigrams(self): 30 | """Windower (unigrams)""" 31 | global text 32 | result = list(iter(Windower(text,1))) 33 | self.assertEqual(result,[("This",),("is",),("a",),("test",),(".",)]) 34 | 35 | def test_bigrams(self): 36 | """Windower (bigrams)""" 37 | global text 38 | result = list(iter(Windower(text,2))) 
39 | self.assertEqual(result,[("","This"),("This","is"),("is","a"),("a","test"),("test","."),(".","")]) 40 | 41 | def test_trigrams(self): 42 | """Windower (trigrams)""" 43 | global text 44 | result = list(iter(Windower(text,3))) 45 | self.assertEqual(result,[('', '', 'This'), ('', 'This', 'is'), ('This', 'is', 'a'), ('is', 'a', 'test'), ('a', 'test', '.'), ('test', '.', ''), ('.', '', '')]) 46 | 47 | 48 | def test_trigrams_word(self): 49 | """Windower (trigrams) (on single word)""" 50 | global text 51 | result = list(iter(Windower(["hi"],3))) 52 | self.assertEqual(result,[('', '', 'hi'), ('', 'hi', ''), ('hi', '', '')]) 53 | 54 | 55 | 56 | 57 | class TokenizerTest(unittest.TestCase): 58 | def test_tokenize(self): 59 | """Tokeniser - One sentence""" 60 | self.assertEqual(tokenise("This is a test."),"This is a test .".split(" ")) 61 | 62 | def test_tokenize_sentences(self): 63 | """Tokeniser - Multiple sentences""" 64 | self.assertEqual(tokenise("This, is the first sentence! This is the second sentence."),"This , is the first sentence ! This is the second sentence .".split(" ")) 65 | 66 | def test_tokenize_noeos(self): 67 | """Tokeniser - Missing EOS Marker""" 68 | self.assertEqual(tokenise("This is a test"),"This is a test".split(" ")) 69 | 70 | def test_tokenize_url(self): 71 | """Tokeniser - URL""" 72 | global text 73 | self.assertEqual(tokenise("I go to http://www.google.com when I need to find something."),"I go to http://www.google.com when I need to find something .".split(" ")) 74 | 75 | def test_tokenize_mail(self): 76 | """Tokeniser - Mail""" 77 | global text 78 | self.assertEqual(tokenise("Write me at proycon@anaproy.nl."),"Write me at proycon@anaproy.nl .".split(" ")) 79 | 80 | def test_tokenize_numeric(self): 81 | """Tokeniser - numeric""" 82 | global text 83 | self.assertEqual(tokenise("I won € 300,000.00!"),"I won € 300,000.00 !".split(" ")) 84 | 85 | def test_tokenize_quotes(self): 86 | """Tokeniser - quotes""" 87 | global text 88 | self.assertEqual(tokenise("Hij zegt: \"Wat een lief baby'tje is dat!\""),"Hij zegt : \" Wat een lief baby'tje is dat ! 
\"".split(" ")) 89 | 90 | 91 | class StripAccentTest(unittest.TestCase): 92 | def test_strip_accents(self): 93 | """Strip Accents""" 94 | self.assertEqual(strip_accents("áàâãāĝŭçñßt"),"aaaaagucnt") 95 | 96 | class OverlapTest(unittest.TestCase): 97 | def test_overlap_subset(self): 98 | """Overlap - Subset""" 99 | h = [4,5,6,7] 100 | n = [5,6] 101 | self.assertEqual(calculate_overlap(h,n), [((5,6),0)]) 102 | 103 | def test_overlap_equal(self): 104 | """Overlap - Equal""" 105 | h = [4,5,6,7] 106 | n = [4,5,6,7] 107 | self.assertEqual(calculate_overlap(h,n), [((4,5,6,7),2)]) 108 | 109 | def test_overlap_none(self): 110 | """Overlap - None""" 111 | h = [4,5,6,7] 112 | n = [8,9,10] 113 | self.assertEqual(calculate_overlap(h,n), []) 114 | 115 | def test_overlap_leftpartial(self): 116 | """Overlap - Left partial""" 117 | h = [4,5,6,7] 118 | n = [1,2,3,4,5] 119 | self.assertEqual(calculate_overlap(h,n), [((4,5),-1)] ) 120 | 121 | def test_overlap_rightpartial(self): 122 | """Overlap - Right partial""" 123 | h = [4,5,6,7] 124 | n = [6,7,8,9] 125 | self.assertEqual(calculate_overlap(h,n), [((6,7),1)] ) 126 | 127 | def test_overlap_leftpartial2(self): 128 | """Overlap - Left partial (2)""" 129 | h = [1,2,3,4,5] 130 | n = [0,1,2] 131 | self.assertEqual(calculate_overlap(h,n), [((1,2),-1)] ) 132 | 133 | def test_overlap_rightpartial2(self): 134 | """Overlap - Right partial (2)""" 135 | h = [1,2,3,4,5] 136 | n = [4,5,6] 137 | self.assertEqual(calculate_overlap(h,n), [((4,5),1)] ) 138 | 139 | 140 | def test_overlap_leftfull(self): 141 | """Overlap - Left full""" 142 | h = [1,2,3,4,5] 143 | n = [1,2] 144 | self.assertEqual(calculate_overlap(h,n), [((1,2),-1)] ) 145 | 146 | def test_overlap_rightfull(self): 147 | """Overlap - Right full""" 148 | h = [1,2,3,4,5] 149 | n = [4,5] 150 | self.assertEqual(calculate_overlap(h,n), [((4,5),1)] ) 151 | 152 | 153 | if __name__ == '__main__': 154 | unittest.main() 155 | -------------------------------------------------------------------------------- /pynlpl/tools/computepmi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | from __future__ import print_function, unicode_literals, division, absolute_import 5 | 6 | import argparse 7 | import sys 8 | from math import log 9 | 10 | from collections import defaultdict 11 | 12 | def pmi(sentences1, sentences2,discount = 0): 13 | jointcount = len(sentences1 & sentences2) - discount 14 | if jointcount <= 0: return None 15 | return log( jointcount / (len(sentences1) * len(sentences2))), jointcount+discount 16 | 17 | def npmi(sentences1, sentences2,discount=0): 18 | jointcount = len(sentences1 & sentences2) - discount 19 | if jointcount <= 0: return None 20 | return log( jointcount / (len(sentences1) * len(sentences2))) / -log(jointcount), jointcount+discount 21 | 22 | def main(): 23 | parser = argparse.ArgumentParser(description="Simple cooccurence computation", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 24 | parser.add_argument('-f','--inputtext', type=str,help="Input file (plaintext, tokenised, utf-8, one sentence per line)", action='store',default="",required=True) 25 | parser.add_argument('-s','--sorted', help="Output sorted by co-occurrence score", action='store_true',default=False) 26 | parser.add_argument('-t','--threshold', help="Joined occurrence threshold, do not consider words occuring less than this", type=int, action='store',default=1) 27 | parser.add_argument('-a','--adjacency', help="Compute the adjacency fraction (how many 
co-occurrence are immediate bigrams)", action='store_true',default=False) 28 | parser.add_argument('-A','--discountadjacency', help="Do not take immediately adjacent fragments (bigrams) into account when computing mutual information (requires -a)", action='store_true',default=False) 29 | parser.add_argument('--pmi',help="Compute pointwise mutual information", action='store_true',default=False) 30 | parser.add_argument('--npmi',help="Compute normalised pointwise mutual information", action='store_true',default=False) 31 | parser.add_argument('--jaccard',help="Compute jaccard similarity coefficient", action='store_true',default=False) 32 | parser.add_argument('--dice',help="Compute dice coefficient", action='store_true',default=False) 33 | 34 | args = parser.parse_args() 35 | if not args.pmi and not args.npmi and not args.jaccard and not args.dice: 36 | args.pmi = True 37 | 38 | count = defaultdict(int) 39 | cooc = defaultdict(lambda: defaultdict(int)) 40 | adjacent = defaultdict(lambda: defaultdict(int)) 41 | total = 0 42 | 43 | f = open(args.inputtext,'r',encoding='utf-8') 44 | for i, line in enumerate(f): 45 | sentence = i + 1 46 | if sentence % 1000 == 0: print("Indexing @" + str(sentence),file=sys.stderr) 47 | if line: 48 | words = list(enumerate(line.split())) 49 | for pos, word in words: 50 | count[word] += 1 51 | total += 1 52 | for pos2, word2 in words: 53 | if pos2 > pos: 54 | cooc[word][word2] += 1 55 | if args.adjacency and pos2 == pos + len(word.split()): 56 | adjacent[word][word2] += 1 57 | f.close() 58 | 59 | 60 | l = len(cooc) 61 | output = [] 62 | for i, (word, coocdata) in enumerate(cooc.items()): 63 | print("Computing mutual information @" + str(i+1) + "/" + str(l) + ": \"" + word + "\" , co-occurs with " + str(len(coocdata)) + " words",file=sys.stderr) 64 | for word2, jointcount in coocdata.items(): 65 | if jointcount> args.threshold: 66 | if args.adjacency and word in adjacent and word2 in adjacent[word]: 67 | adjcount = adjacent[word][word2] 68 | else: 69 | adjcount = 0 70 | 71 | if args.discountadjacency: 72 | discount = adjcount 73 | else: 74 | discount = 0 75 | 76 | if args.pmi: 77 | score = log( ((jointcount-discount)/total) / ((count[word]/total) * (count[word2]/total))) 78 | elif args.npmi: 79 | score = log( ((jointcount-discount)/total) / ((count[word]/total) * (count[word2]/total))) / -log((jointcount-discount)/total) 80 | elif args.jaccard or args.dice: 81 | score = (jointcount-discount) / (count[word] + count[word2] - (jointcount - discount) ) 82 | if args.dice: 83 | score = 2*score / (1+score) 84 | 85 | if args.sorted: 86 | outputdata = (word,word2,score, jointcount, adjcount, adjcount / jointcount if args.adjacency else None) 87 | output.append(outputdata) 88 | else: 89 | if args.adjacency: 90 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount) + "\t" + str(adjcount) + "\t" + str(adjcount / jointcount)) 91 | else: 92 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount)) 93 | 94 | 95 | if args.sorted: 96 | print("Outputting " + str(len(output)) + " pairs",file=sys.stderr) 97 | if args.adjacency: 98 | print("#WORD\tWORD2\tSCORE\tJOINTCOUNT\tBIGRAMCOUNT\tBIGRAMRATIO") 99 | else: 100 | print("#WORD\tWORD2\tSCORE\tJOINTCOUNT\tBIGRAMCOUNT\tBIGRAMRATIO") 101 | if args.npmi: 102 | sign = 1 103 | else: 104 | sign = -1 105 | for word,word2,score,jointcount,adjcount, adjratio in sorted(output, key=lambda x: sign * x[2]): 106 | if args.adjacency: 107 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount) + "\t" 
+ str(adjcount) + "\t" + str(adjratio) ) 108 | else: 109 | print(word + "\t" + word2 + "\t" + str(score) + "\t" + str(jointcount)) 110 | 111 | 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | 117 | -------------------------------------------------------------------------------- /pynlpl/clients/frogclient.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - Frog Client - Version 1.4.1 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Induction for Linguistic Knowledge Research Group 6 | # Universiteit van Tilburg 7 | # 8 | # Derived from code by Rogier Kraf 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # This is a Python library for on-the-fly communication with 13 | # a Frog/Tadpole Server. Allowing on-the-fly lemmatisation and 14 | # PoS-tagging. It is recommended to pass your data on a 15 | # sentence-by-sentence basis to FrogClient.process() 16 | # 17 | ############################################################### 18 | 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | from __future__ import division 22 | from __future__ import absolute_import 23 | from pynlpl.common import u 24 | 25 | import socket 26 | 27 | class FrogClient: 28 | def __init__(self,host="localhost",port=12345, server_encoding="utf-8", returnall=False, timeout=120.0): 29 | """Create a client connecting to a Frog or Tadpole server.""" 30 | self.BUFSIZE = 4096 31 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) 32 | self.socket.settimeout(timeout) 33 | self.socket.connect( (host,int(port)) ) 34 | self.server_encoding = server_encoding 35 | self.returnall = returnall 36 | 37 | 38 | 39 | 40 | def process(self,input_data, source_encoding="utf-8", return_unicode = True, oldfrog=False): 41 | """Receives input_data in the form of a str or unicode object, passes this to the server, with proper consideration for the encodings, and returns the Frog output as a list of tuples: (word,pos,lemma,morphology), each of these is a proper unicode object unless return_unicode is set to False, in which case raw strings will be returned. 
Return_unicode is no longer optional, it is fixed to True, parameter is still there only for backwards-compatibility.""" 42 | if isinstance(input_data, list) or isinstance(input_data, tuple): 43 | input_data = " ".join(input_data) 44 | 45 | 46 | 47 | input_data = u(input_data, source_encoding) #decode (or preferably do this in an earlier stage) 48 | input_data = input_data.strip(' \t\n') 49 | 50 | s = input_data.encode(self.server_encoding) +b'\r\n' 51 | if not oldfrog: s += b'EOT\r\n' 52 | self.socket.sendall(s) #send to socket in desired encoding 53 | output = [] 54 | 55 | done = False 56 | while not done: 57 | data = b"" 58 | while not data.endswith(b'\n'): 59 | moredata = self.socket.recv(self.BUFSIZE) 60 | if not moredata: break 61 | data += moredata 62 | 63 | 64 | data = u(data,self.server_encoding) 65 | 66 | 67 | for line in data.strip(' \t\r\n').split('\n'): 68 | if line == "READY": 69 | done = True 70 | break 71 | elif line: 72 | line = line.split('\t') #split on tab 73 | if len(line) > 4 and line[0].isdigit(): #first column is token number 74 | if line[0] == '1' and output: 75 | if self.returnall: 76 | output.append( (None,None,None,None, None,None,None, None) ) 77 | else: 78 | output.append( (None,None,None,None) ) 79 | fields = line[1:] 80 | parse1=parse2=ner=chunk="" 81 | word,lemma,morph,pos = fields[0:4] 82 | if len(fields) > 5: 83 | ner = fields[5] 84 | if len(fields) > 6: 85 | chunk = fields[6] 86 | if len(fields) >= 8: 87 | parse1 = fields[7] 88 | parse2 = fields[8] 89 | 90 | if len(fields) < 5: 91 | raise Exception("Can't process response line from Frog: ", repr(line), " got unexpected number of fields ", str(len(fields) + 1)) 92 | 93 | if self.returnall: 94 | output.append( (word,lemma,morph,pos,ner,chunk,parse1,parse2) ) 95 | else: 96 | output.append( (word,lemma,morph,pos) ) 97 | 98 | return output 99 | 100 | def process_aligned(self,input_data, source_encoding="utf-8", return_unicode = True): 101 | output = self.process(input_data, source_encoding, return_unicode) 102 | outputwords = [ x[0] for x in output ] 103 | inputwords = input_data.strip(' \t\n').split(' ') 104 | alignment = self.align(inputwords, outputwords) 105 | for i, _ in enumerate(inputwords): 106 | targetindex = alignment[i] 107 | if targetindex == None: 108 | if self.returnall: 109 | yield (None,None,None,None,None,None,None,None) 110 | else: 111 | yield (None,None,None,None) 112 | else: 113 | yield output[targetindex] 114 | 115 | def align(self,inputwords, outputwords): 116 | """For each inputword, provides the index of the outputword""" 117 | alignment = [] 118 | cursor = 0 119 | for inputword in inputwords: 120 | if len(outputwords) > cursor and outputwords[cursor] == inputword: 121 | alignment.append(cursor) 122 | cursor += 1 123 | elif len(outputwords) > cursor+1 and outputwords[cursor+1] == inputword: 124 | alignment.append(cursor+1) 125 | cursor += 2 126 | else: 127 | alignment.append(None) 128 | cursor += 1 129 | return alignment 130 | 131 | 132 | def __del__(self): 133 | self.socket.close() 134 | 135 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PyNLPl documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jul 6 22:07:20 2010. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 
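# (A minimal usage sketch for the FrogClient class in pynlpl/clients/frogclient.py above,
#  assuming a Frog server is already listening; host, port and the example sentence are
#  illustrative only:
#
#      from pynlpl.clients.frogclient import FrogClient
#      frogclient = FrogClient('localhost', 12345)
#      for word, lemma, morph, pos in frogclient.process("Dit is een zin ."):
#          print(word, lemma, morph, pos)
#
#  With the default returnall=False, process() returns a list with one
#  (word, lemma, morph, pos) tuple per token and a tuple of None values marking each
#  sentence boundary.)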
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.append(os.path.abspath('.')) 20 | 21 | sys.path.append(os.path.abspath('../../')) 22 | from pynlpl import VERSION 23 | 24 | # -- General configuration ----------------------------------------------------- 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon','sphinx.ext.autosummary'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'PyNLPl' 44 | copyright = u'2016, Maarten van Gompel' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = VERSION 52 | # The full version, including alpha/beta/rc tags. 53 | release = VERSION 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of documents that shouldn't be included in the build. 66 | #unused_docs = [] 67 | 68 | # List of directories, relative to source directory, that shouldn't be searched 69 | # for source files. 70 | exclude_trees = ['_build'] 71 | 72 | # The reST default role (used for this markup: `text`) to use for all documents. 73 | #default_role = None 74 | 75 | # If true, '()' will be appended to :func: etc. cross-reference text. 76 | #add_function_parentheses = True 77 | 78 | # If true, the current module name will be prepended to all description 79 | # unit titles (such as .. function::). 80 | #add_module_names = True 81 | 82 | # If true, sectionauthor and moduleauthor directives will be shown in the 83 | # output. They are ignored by default. 84 | #show_authors = False 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # A list of ignored prefixes for module index sorting. 90 | #modindex_common_prefix = [] 91 | 92 | 93 | # -- Options for HTML output --------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. Major themes that come with 96 | # Sphinx are currently 'default' and 'sphinxdoc'. 97 | html_theme = 'default' 98 | 99 | # Theme options are theme-specific and customize the look and feel of a theme 100 | # further. 
For a list of options available for each theme, see the 101 | # documentation. 102 | #html_theme_options = {} 103 | 104 | # Add any paths that contain custom themes here, relative to this directory. 105 | #html_theme_path = [] 106 | 107 | # The name for this set of Sphinx documents. If None, it defaults to 108 | # " v documentation". 109 | #html_title = None 110 | 111 | # A shorter title for the navigation bar. Default is the same as html_title. 112 | #html_short_title = None 113 | 114 | # The name of an image file (relative to this directory) to place at the top 115 | # of the sidebar. 116 | #html_logo = None 117 | 118 | # The name of an image file (within the static path) to use as favicon of the 119 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 120 | # pixels large. 121 | #html_favicon = None 122 | 123 | # Add any paths that contain custom static files (such as style sheets) here, 124 | # relative to this directory. They are copied after the builtin static files, 125 | # so a file named "default.css" will overwrite the builtin "default.css". 126 | # html_static_path = ['_static'] 127 | 128 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 129 | # using the given strftime format. 130 | #html_last_updated_fmt = '%b %d, %Y' 131 | 132 | # If true, SmartyPants will be used to convert quotes and dashes to 133 | # typographically correct entities. 134 | #html_use_smartypants = True 135 | 136 | # Custom sidebar templates, maps document names to template names. 137 | #html_sidebars = {} 138 | 139 | # Additional templates that should be rendered to pages, maps page names to 140 | # template names. 141 | #html_additional_pages = {} 142 | 143 | # If false, no module index is generated. 144 | #html_use_modindex = True 145 | 146 | # If false, no index is generated. 147 | #html_use_index = True 148 | 149 | # If true, the index is split into individual pages for each letter. 150 | #html_split_index = False 151 | 152 | # If true, links to the reST sources are added to the pages. 153 | #html_show_sourcelink = True 154 | 155 | # If true, an OpenSearch description file will be output, and all pages will 156 | # contain a tag referring to it. The value of this option must be the 157 | # base URL from which the finished HTML is served. 158 | #html_use_opensearch = '' 159 | 160 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 161 | #html_file_suffix = '' 162 | 163 | # Output file base name for HTML help builder. 164 | # htmlhelp_basename = 'pynlpl' 165 | 166 | 167 | # -- Options for LaTeX output -------------------------------------------------- 168 | 169 | # The paper size ('letter' or 'a4'). 170 | latex_paper_size = 'a4' 171 | 172 | # The font size ('10pt', '11pt' or '12pt'). 173 | #latex_font_size = '10pt' 174 | 175 | # Grouping the document tree into LaTeX files. List of tuples 176 | # (source start file, target name, title, author, documentclass [howto/manual]). 177 | latex_documents = [ 178 | ('index', 'pynlpl.tex', u'PyNLPl Documentation', 179 | u'Maarten van Gompel', 'manual'), 180 | ] 181 | 182 | # The name of an image file (relative to this directory) to place at the top of 183 | # the title page. 184 | #latex_logo = None 185 | 186 | # For "manual" documents, if this is true, then toplevel headings are parts, 187 | # not chapters. 188 | #latex_use_parts = False 189 | 190 | # Additional stuff for the LaTeX preamble. 191 | #latex_preamble = '' 192 | 193 | # Documents to append as an appendix to all manuals. 
194 | #latex_appendices = [] 195 | 196 | # If false, no module index is generated. 197 | #latex_use_modindex = True 198 | 199 | autosummary_generate = True 200 | -------------------------------------------------------------------------------- /pynlpl/tests/folia_benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function, unicode_literals, division, absolute_import 4 | 5 | from pynlpl.formats import folia, fql, cql 6 | import time 7 | import sys 8 | import os 9 | import glob 10 | try: 11 | from pympler import asizeof 12 | except ImportError: 13 | print("An extra dependency called pympler is required: install using pip install pympler (or other means)",file=sys.stderr) 14 | raise 15 | 16 | repetitions = 0 17 | 18 | def timeit(f): 19 | def f_timer(*args, **kwargs): 20 | if 'filename' in kwargs: 21 | label = "on file " + kwargs['filename'] 22 | elif 'dirname' in kwargs: 23 | label = "on directory " + kwargs['dirname'] 24 | elif 'doc' in kwargs: 25 | label = "on document " + kwargs['doc'].id 26 | else: 27 | label = "" 28 | print(f.__name__ + " -- " + f.__doc__ + " -- " + label + " ...", end="") 29 | times = [] 30 | for i in range(0, repetitions): 31 | start = time.time() 32 | try: 33 | result = f(*args, **kwargs) 34 | except Exception as e: 35 | print(" -- ERROR! -- ", e) 36 | return None 37 | times.append(time.time() - start) 38 | if times: 39 | d = round(sum(times) / len(times),4) 40 | print('took ' + str(d) + 's (averaged over ' + str(len(times)) + ' runs)') 41 | else: 42 | d = 0 43 | return result 44 | return f_timer 45 | 46 | 47 | @timeit 48 | def loadfile(**kwargs): 49 | """Loading file""" 50 | doc = folia.Document(file=kwargs['filename'],bypassleak=False) 51 | 52 | 53 | @timeit 54 | def savefile(**kwargs): #careful with SSDs 55 | """Saving file""" 56 | kwargs['doc'].save("/tmp/test.xml") 57 | 58 | @timeit 59 | def xml(**kwargs): 60 | """XML serialisation""" 61 | kwargs['doc'].xml() 62 | 63 | 64 | @timeit 65 | def json(**kwargs): 66 | """JSON serialisation""" 67 | kwargs['doc'].json() 68 | 69 | @timeit 70 | def text(**kwargs): 71 | """text serialisation""" 72 | kwargs['doc'].text() 73 | 74 | @timeit 75 | def countwords(**kwargs): 76 | """Counting words""" 77 | kwargs['doc'].count(folia.Word,None, True,[folia.AbstractAnnotationLayer]) 78 | 79 | @timeit 80 | def selectwords(**kwargs): 81 | """Selecting words""" 82 | for word in kwargs['doc'].words(): 83 | pass 84 | 85 | 86 | @timeit 87 | def selectwordsfql(**kwargs): 88 | """Selecting words using FQL""" 89 | query = fql.Query("SELECT w") 90 | for word in query(kwargs['doc']): 91 | pass 92 | 93 | @timeit 94 | def selectwordsfqlforp(**kwargs): 95 | """Selecting words in paragraphs using FQL""" 96 | query = fql.Query("SELECT w FOR p") 97 | for word in query(kwargs['doc']): 98 | pass 99 | 100 | @timeit 101 | def selectwordsfqlxml(**kwargs): 102 | """Selecting words using FQL (XML output)""" 103 | query = fql.Query("SELECT w FORMAT xml") 104 | for wordxml in query(kwargs['doc']): 105 | pass 106 | 107 | @timeit 108 | def selectwordsfqlwhere(**kwargs): 109 | """Selecting words using FQL (with WHERE clause)""" 110 | query = fql.Query("SELECT w WHERE text != \"blah\"") 111 | for word in query(kwargs['doc']): 112 | pass 113 | 114 | @timeit 115 | def editwordsfql(**kwargs): 116 | """Editing the text of words using FQL (with WHERE clause)""" 117 | query = fql.Query("EDIT w WITH text \"blah\"") 118 | for word in query(kwargs['doc']): 119 | pass 
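# (The @timeit-decorated benchmarks in this file accept keyword arguments only; main()
#  below calls them roughly as in this condensed sketch, where "/path/to/doc.folia.xml"
#  is a placeholder path:
#
#      doc = folia.Document(file="/path/to/doc.folia.xml")
#      selectwordsfql(doc=doc)   # runs the FQL "SELECT w" benchmark `repetitions` times
#  )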
120 | 121 | @timeit 122 | def nextwords(**kwargs): 123 | """Find neighbour of each word""" 124 | for word in kwargs['doc'].words(): 125 | word.next() 126 | 127 | @timeit 128 | def addelement(**kwargs): 129 | """Adding a simple annotation (desc) to each word""" 130 | for word in kwargs['doc'].words(): 131 | try: 132 | word.append(folia.Description, value="test") 133 | except folia.DuplicateAnnotationError: 134 | pass 135 | 136 | 137 | @timeit 138 | def ancestors(**kwargs): 139 | """Iterating over the ancestors of each word""" 140 | for word in kwargs['doc'].words(): 141 | for ancestor in word.ancestors(): 142 | pass 143 | 144 | @timeit 145 | def readerwords(**kwargs): 146 | """Iterating over words using Reader""" 147 | reader = folia.Reader(kwargs['filename'], folia.Word) 148 | for word in reader: 149 | pass 150 | 151 | def main(): 152 | global repetitions, target 153 | files = [] 154 | try: 155 | begin = 1 156 | if os.path.exists(sys.argv[1]): 157 | begin = 1 158 | selectedtests = "all" 159 | repetitions = 1 160 | else: 161 | selectedtests = sys.argv[1].split(',') 162 | if os.path.exists(sys.argv[2]): 163 | repetitions = 1 164 | begin = 2 165 | else: 166 | repetitions = int(sys.argv[2]) 167 | begin = 3 168 | filesordirs = sys.argv[begin:] 169 | except: 170 | print("Syntax: folia_benchmark [testfunctions [repetitions]] files-or-directories+",file=sys.stderr) 171 | print(" testfunctions is a comma separated list of function names, or the special keyword 'all'", file=sys.stderr) 172 | print(" directories are recursively searched for files with the extension folia.xml, +gz and +bz2 is supported too.", file=sys.stderr) 173 | sys.exit(2) 174 | 175 | 176 | for fd in filesordirs: 177 | if not os.path.exists(fd): 178 | raise Exception("No such file or directory" + fd) 179 | if os.path.isfile(fd): 180 | files.append(fd) 181 | elif os.path.isdir(fd): 182 | dirs = [fd] 183 | while dirs: 184 | dir = dirs.pop(0) 185 | for filename in glob.glob(dir + "/*"): 186 | if os.path.isdir(filename): 187 | dirs.append(filename) 188 | elif filename.endswith('.folia.xml') or filename.endswith('.folia.xml.gz') or filename.endswith('.folia.xml.bz2'): 189 | files.append(filename) 190 | 191 | 192 | for f in ('loadfile','loadfileleakbypass','readerwords'): 193 | if f in selectedtests or 'all' in selectedtests: 194 | for filename in files: 195 | globals()[f](filename=filename) 196 | 197 | 198 | for f in ('xml','text','json','countwords','selectwords','nextwords','ancestors','selectwordsfql','selectwordsfqlforp','selectwordsfqlxml','selectwordsfqlwhere','editwordsfql', 'addelement' ): 199 | if f in selectedtests or 'all' in selectedtests: 200 | for filename in files: 201 | doc = folia.Document(file=filename) 202 | globals()[f](doc=doc) 203 | 204 | for f in ('memtest',): 205 | if f in selectedtests or 'all' in selectedtests: 206 | for filename in files: 207 | doc = folia.Document(file=filename) 208 | print("memtest -- Memory test on document " + filename + " -- memory consumption estimated at " + str(round(asizeof.asizeof(doc) / 1024 / 1024,2)) + " MB" + " (filesize " + str(round(os.path.getsize(filename)/1024/1024,2)) + " MB)") 209 | 210 | 211 | 212 | if __name__ == '__main__': 213 | main() 214 | -------------------------------------------------------------------------------- /pynlpl/tests/search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | #--------------------------------------------------------------- 6 | # PyNLPl - 
Test Units for Search Algorithms 7 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 8 | # http://ilk.uvt.nl/~mvgompel 9 | # proycon AT anaproy DOT nl 10 | # 11 | # Licensed under GPLv3 12 | # 13 | #---------------------------------------------------------------- 14 | 15 | import sys 16 | import os 17 | import unittest 18 | 19 | sys.path.append(sys.path[0] + '/../../') 20 | os.environ['PYTHONPATH'] = sys.path[0] + '/../../' 21 | 22 | from pynlpl.search import AbstractSearchState, DepthFirstSearch, BreadthFirstSearch, IterativeDeepening, HillClimbingSearch, BeamSearch 23 | 24 | 25 | class ReorderSearchState(AbstractSearchState): 26 | def __init__(self, tokens, parent = None): 27 | self.tokens = tokens 28 | super(ReorderSearchState, self).__init__(parent) 29 | 30 | def expand(self): 31 | #Operator: Swap two consecutive pairs 32 | l = len(self.tokens) 33 | for i in range(0,l - 1): 34 | newtokens = self.tokens[:i] 35 | newtokens.append(self.tokens[i + 1]) 36 | newtokens.append(self.tokens[i]) 37 | if i+2 < l: 38 | newtokens += self.tokens[i+2:] 39 | yield ReorderSearchState(newtokens, self) 40 | 41 | def __hash__(self): 42 | return hash(str(self)) 43 | 44 | def __eq__(self, other): 45 | return str(self) == str(other) 46 | 47 | def __str__(self): 48 | return " ".join(self.tokens) 49 | 50 | class InformedReorderSearchState(ReorderSearchState): 51 | def __init__(self, tokens, goal = None, parent = None): 52 | self.tokens = tokens 53 | self.goal = goal 54 | super(ReorderSearchState, self).__init__(parent) 55 | 56 | def score(self): 57 | """Compute distortion""" 58 | totaldistortion = 0 59 | for i, token in enumerate(self.goal.tokens): 60 | tokendistortion = 9999999 61 | for j, token2 in enumerate(self.tokens): 62 | if token == token2 and abs(i - j) < tokendistortion: 63 | tokendistortion = abs(i - j) 64 | totaldistortion += tokendistortion 65 | return totaldistortion 66 | 67 | def expand(self): 68 | #Operator: Swap two consecutive pairs 69 | l = len(self.tokens) 70 | for i in range(0,l - 1): 71 | newtokens = self.tokens[:i] 72 | newtokens.append(self.tokens[i + 1]) 73 | newtokens.append(self.tokens[i]) 74 | if i+2 < l: 75 | newtokens += self.tokens[i+2:] 76 | yield InformedReorderSearchState(newtokens, self.goal, self) 77 | 78 | inputstate = ReorderSearchState("a This test . 
sentence is".split(' ')) 79 | goalstate = ReorderSearchState("This is a test sentence .".split(' ')) 80 | 81 | class DepthFirstSearchTest(unittest.TestCase): 82 | def test_solution(self): 83 | """Depth First Search""" 84 | global inputstate, goalstate 85 | search = DepthFirstSearch(inputstate ,graph=True, goal=goalstate) 86 | solution = search.searchfirst() 87 | #print "DFS:", search.traversalsize(), "nodes visited |", 88 | self.assertEqual(solution, goalstate) 89 | 90 | 91 | 92 | 93 | class BreadthFirstSearchTest(unittest.TestCase): 94 | def test_solution(self): 95 | """Breadth First Search""" 96 | global inputstate, goalstate 97 | search = BreadthFirstSearch(inputstate ,graph=True, goal=goalstate) 98 | solution = search.searchfirst() 99 | #print "BFS:", search.traversalsize(), "nodes visited |", 100 | self.assertEqual(solution, goalstate) 101 | 102 | 103 | class IterativeDeepeningTest(unittest.TestCase): 104 | def test_solution(self): 105 | """Iterative Deepening DFS""" 106 | global inputstate, goalstate 107 | search = IterativeDeepening(inputstate ,graph=True, goal=goalstate) 108 | solution = search.searchfirst() 109 | #print "It.Deep:", search.traversalsize(), "nodes visited |", 110 | self.assertEqual(solution, goalstate) 111 | 112 | 113 | 114 | informedinputstate = InformedReorderSearchState("a This test . sentence is".split(' '), goalstate) 115 | #making a simple language model 116 | 117 | class HillClimbingTest(unittest.TestCase): 118 | def test_solution(self): 119 | """Hill Climbing""" 120 | global informedinputstate 121 | search = HillClimbingSearch(informedinputstate, graph=True, minimize=True,debug=False) 122 | solution = search.searchbest() 123 | self.assertTrue(solution) #TODO: this is not a test! 124 | 125 | class BeamSearchTest(unittest.TestCase): 126 | def test_minimizeC1(self): 127 | """Beam Search needle-in-haystack problem (beam=2, minimize)""" 128 | #beamsize has been set to the minimum that yields the correct solution 129 | global informedinputstate, solution, goalstate 130 | search = BeamSearch(informedinputstate, beamsize=2, graph=True, minimize=True,debug=0, goal=goalstate) 131 | solution = search.searchbest() 132 | self.assertEqual( str(solution), str(goalstate) ) 133 | self.assertEqual( search.solutions, 1 ) 134 | 135 | 136 | def test_minimizeA1(self): 137 | """Beam Search optimisation problem A (beam=2, minimize)""" 138 | #beamsize has been set to the minimum that yields the correct solution 139 | global informedinputstate, solution, goalstate 140 | search = BeamSearch(informedinputstate, beamsize=2, graph=True, minimize=True,debug=0) 141 | solution = search.searchbest() 142 | self.assertEqual( str(solution), str(goalstate) ) 143 | self.assertTrue( search.solutions > 1 ) #everything is a solution 144 | 145 | 146 | def test_minimizeA2(self): 147 | """Beam Search optimisation problem A (beam=100, minimize)""" 148 | #if a small beamsize works, a very large one should too 149 | global informedinputstate, solution, goalstate 150 | search = BeamSearch(informedinputstate, beamsize=100, graph=True, minimize=True,debug=0) 151 | solution = search.searchbest() 152 | self.assertEqual( str(solution), str(goalstate) ) 153 | self.assertTrue( search.solutions > 1 ) #everything is a solution 154 | 155 | #def test_minimizeA3(self): 156 | # """Beam Search optimisation problem A (eager mode, beam=2, minimize)""" 157 | # #beamsize has been set to the minimum that yields the correct solution 158 | # global informedinputstate, solution, goalstate 159 | # search = 
BeamSearch(informedinputstate, beamsize=50, graph=True, minimize=True,eager=True,debug=2) 160 | # solution = search.searchbest() 161 | # self.assertEqual( str(solution), str(goalstate) ) 162 | 163 | 164 | def test_minimizeB1(self): 165 | """Beam Search optimisation problem (longer) (beam=3, minimize)""" 166 | #beamsize has been set to the minimum that yields the correct solution 167 | goalstate = InformedReorderSearchState("This is supposed to be a very long sentence .".split(' ')) 168 | informedinputstate = InformedReorderSearchState("a long very . sentence supposed to be This is".split(' '), goalstate) 169 | search = BeamSearch(informedinputstate, beamsize=3, graph=True, minimize=True,debug=False) 170 | solution = search.searchbest() 171 | self.assertEqual(str(solution),str(goalstate)) 172 | 173 | 174 | 175 | if __name__ == '__main__': 176 | unittest.main() 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /pynlpl/formats/moses.py: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # PyNLPl - Moses formats 3 | # by Maarten van Gompel (proycon) 4 | # http://ilk.uvt.nl/~mvgompel 5 | # Induction for Linguistic Knowledge Research Group 6 | # Universiteit van Tilburg 7 | # 8 | # Licensed under GPLv3 9 | # 10 | # This is a Python library classes and functions for 11 | # reading file-formats produced by Moses. Currently 12 | # contains only a class for reading a Moses PhraseTable. 13 | # (migrated to pynlpl from pbmbmt) 14 | # 15 | ############################################################### 16 | 17 | 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | from __future__ import division 21 | from __future__ import absolute_import 22 | 23 | from pynlpl.common import u 24 | 25 | import sys 26 | import bz2 27 | import gzip 28 | import datetime 29 | import socket 30 | import io 31 | 32 | try: 33 | from twisted.internet import protocol, reactor #No Python 3 support yet :( 34 | from twisted.protocols import basic 35 | twistedimported = True 36 | except: 37 | print("WARNING: Twisted could not be imported",file=sys.stderr) 38 | twistedimported = False 39 | 40 | 41 | class PhraseTable(object): 42 | def __init__(self,filename, quiet=False, reverse=False, delimiter="|||", score_column = 3, max_sourcen = 0,sourceencoder=None, targetencoder=None, scorefilter=None): 43 | """Load a phrase table from file into memory (memory intensive!)""" 44 | self.phrasetable = {} 45 | self.sourceencoder = sourceencoder 46 | self.targetencoder = targetencoder 47 | 48 | 49 | if filename.split(".")[-1] == "bz2": 50 | f = bz2.BZ2File(filename,'r') 51 | elif filename.split(".")[-1] == "gz": 52 | f = gzip.GzipFile(filename,'r') 53 | else: 54 | f = io.open(filename,'r',encoding='utf-8') 55 | linenum = 0 56 | prevsource = None 57 | targets = [] 58 | 59 | while True: 60 | if not quiet: 61 | linenum += 1 62 | if (linenum % 100000) == 0: 63 | print("Loading phrase-table: @%d" % linenum, "\t(" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ")",file=sys.stderr) 64 | line = u(f.readline()) 65 | if not line: 66 | break 67 | 68 | #split into (trimmed) segments 69 | segments = [ segment.strip() for segment in line.split(delimiter) ] 70 | 71 | if len(segments) < 3: 72 | print("Invalid line: ", line, file=sys.stderr) 73 | continue 74 | 75 | #Do we have a score associated? 
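                #(a typical Moses phrase-table line looks like
                #   source phrase ||| target phrase ||| 0.5 0.2 0.4 0.1
                # so with the default delimiter "|||" and score_column=3, the third
                # segment is split and converted into a tuple of float scores)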
76 | if score_column > 0 and len(segments) >= score_column: 77 | scores = tuple( ( float(x) for x in segments[score_column-1].strip().split() ) ) 78 | else: 79 | scores = tuple() 80 | 81 | #if align2_column > 0: 82 | # try: 83 | # null_alignments = segments[align2_column].count("()") 84 | # except: 85 | # null_alignments = 0 86 | #else: 87 | # null_alignments = 0 88 | 89 | if scorefilter: 90 | if not scorefilter(scores): continue 91 | 92 | if reverse: 93 | if max_sourcen > 0 and segments[1].count(' ') + 1 > max_sourcen: 94 | continue 95 | 96 | if self.sourceencoder: 97 | source = self.sourceencoder(segments[1]) #tuple(segments[1].split(" ")) 98 | else: 99 | source = segments[1] 100 | if self.targetencoder: 101 | target = self.targetencoder(segments[0]) #tuple(segments[0].split(" ")) 102 | else: 103 | target = segments[0] 104 | else: 105 | if max_sourcen > 0 and segments[0].count(' ') + 1 > max_sourcen: 106 | continue 107 | 108 | if self.sourceencoder: 109 | source = self.sourceencoder(segments[0]) #tuple(segments[0].split(" ")) 110 | else: 111 | source = segments[0] 112 | if self.targetencoder: 113 | target = self.targetencoder(segments[1]) #tuple(segments[1].split(" ")) 114 | else: 115 | target = segments[1] 116 | 117 | 118 | if prevsource and source != prevsource and targets: 119 | self.phrasetable[prevsource] = tuple(targets) 120 | targets = [] 121 | 122 | targets.append( (target,scores) ) 123 | prevsource = source 124 | 125 | #don't forget last one: 126 | if prevsource and targets: 127 | self.phrasetable[prevsource] = tuple(targets) 128 | 129 | f.close() 130 | 131 | 132 | def __contains__(self, phrase): 133 | """Query if a certain phrase exist in the phrase table""" 134 | if self.sourceencoder: phrase = self.sourceencoder(phrase) 135 | return (phrase in self.phrasetable) 136 | #d = self.phrasetable 137 | #for word in phrase: 138 | # if not word in d: 139 | # return False 140 | # d = d[word 141 | #return ("" in d) 142 | 143 | def __iter__(self): 144 | for phrase, targets in self.phrasetable.items(): 145 | yield phrase, targets 146 | 147 | def __len__(self): 148 | return len(self.phrasetable) 149 | 150 | def __bool__(self): 151 | return bool(self.phrasetable) 152 | 153 | def __getitem__(self, phrase): #same as translations 154 | """Return a list of (translation, scores) tuples""" 155 | if self.sourceencoder: phrase = self.sourceencoder(phrase) 156 | return self.phrasetable[phrase] 157 | 158 | 159 | #d = self.phrasetable 160 | #for word in phrase: 161 | # if not word in d: 162 | # raise KeyError 163 | # d = d[word] 164 | 165 | #if "" in d: 166 | # return d[""] 167 | #else: 168 | # raise KeyError 169 | 170 | if twistedimported: 171 | class PTProtocol(basic.LineReceiver): 172 | def lineReceived(self, phrase): 173 | try: 174 | for target,Pst,Pts,null_alignments in self.factory.phrasetable[phrase]: 175 | self.sendLine(target+"\t"+str(Pst)+"\t"+str(Pts)+"\t"+str(null_alignments)) 176 | except KeyError: 177 | self.sendLine("NOTFOUND") 178 | 179 | class PTFactory(protocol.ServerFactory): 180 | protocol = PTProtocol 181 | def __init__(self, phrasetable): 182 | self.phrasetable = phrasetable 183 | 184 | class PhraseTableServer(object): 185 | def __init__(self, phrasetable, port=65432): 186 | reactor.listenTCP(port, PTFactory(phrasetable)) 187 | reactor.run() 188 | 189 | 190 | 191 | 192 | class PhraseTableClient(object): 193 | 194 | def __init__(self,host= "localhost",port=65432): 195 | self.BUFSIZE = 4048 196 | self.socket = socket.socket(socket.AF_INET,socket.SOCK_STREAM) #Create the socket 197 | 
self.socket.settimeout(120) 198 | self.socket.connect((host, port)) #Connect to server 199 | self.lastresponse = "" 200 | self.lastquery = "" 201 | 202 | def __getitem__(self, phrase): 203 | solutions = [] 204 | if phrase != self.lastquery: 205 | self.socket.send(phrase+ "\r\n") 206 | 207 | data = b"" 208 | while not data or data[-1] != '\n': 209 | data += self.socket.recv(self.BUFSIZE) 210 | else: 211 | data = self.lastresponse 212 | 213 | data = u(data) 214 | 215 | for line in data.split('\n'): 216 | line = line.strip('\r\n') 217 | if line == "NOTFOUND": 218 | raise KeyError(phrase) 219 | elif line: 220 | fields = tuple(line.split("\t")) 221 | if len(fields) == 4: 222 | solutions.append( fields ) 223 | else: 224 | print >>sys.stderr,"PHRASETABLECLIENT WARNING: Unable to parse response line" 225 | 226 | self.lastresponse = data 227 | self.lastquery = phrase 228 | 229 | return solutions 230 | 231 | def __contains__(self, phrase): 232 | self.socket.send(phrase.encode('utf-8')+ b"\r\n")\ 233 | 234 | 235 | data = b"" 236 | while not data or data[-1] != '\n': 237 | data += self.socket.recv(self.BUFSIZE) 238 | 239 | data = u(data) 240 | 241 | for line in data.split('\n'): 242 | line = line.strip('\r\n') 243 | if line == "NOTFOUND": 244 | return False 245 | 246 | self.lastresponse = data 247 | self.lastquery = phrase 248 | 249 | return True 250 | 251 | -------------------------------------------------------------------------------- /pynlpl/formats/dutchsemcor.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - DutchSemCor 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # Licensed under GPLv3 11 | # 12 | # Modified by Ruben Izquierdo 13 | # We need also to store the TIMBL distance to the nearest neighboor 14 | # 15 | # Collection of formats for the DutchSemCor project 16 | # 17 | ############################################################### 18 | 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | from __future__ import division 22 | from __future__ import absolute_import 23 | from pynlpl.common import u 24 | import sys 25 | if sys.version < '3': 26 | from codecs import getwriter 27 | stderr = getwriter('utf-8')(sys.stderr) 28 | stdout = getwriter('utf-8')(sys.stdout) 29 | else: 30 | stderr = sys.stderr 31 | stdout = sys.stdout 32 | 33 | from pynlpl.formats.timbl import TimblOutput 34 | from pynlpl.statistics import Distribution 35 | import io 36 | 37 | 38 | class WSDSystemOutput(object): 39 | def __init__(self, filename = None): 40 | self.data = {} 41 | self.distances={} 42 | self.maxDistance=1 43 | if filename: 44 | self.load(filename) 45 | 46 | def append(self, word_id, senses,distance=0): 47 | # Commented by Ruben, there are some ID's that are repeated in all sonar test files... 48 | #assert (not word_id in self.data) 49 | if isinstance(senses, Distribution): 50 | self.data[word_id] = ( (x,y) for x,y in senses ) #PATCH UNDONE (#TODO: this is a patch, something's not right in Distribution?) 
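            # the TiMBL distance to the nearest neighbour is recorded per word_id;
            # maxDistance tracks the largest distance seen so far (see getMaxDistance below)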
51 | self.distances[word_id]=distance 52 | if distance > self.maxDistance: 53 | self.maxDistance=distance 54 | return 55 | else: 56 | assert isinstance(senses, list) and len(senses) >= 1 57 | 58 | self.distances[word_id]=distance 59 | if distance > self.maxDistance: 60 | self.maxDistance=distance 61 | 62 | 63 | if len(senses[0]) == 1: 64 | #not a (sense_id, confidence) tuple! compute equal confidence for all elements automatically: 65 | confidence = 1 / float(len(senses)) 66 | self.data[word_id] = [ (x,confidence) for x in senses ] 67 | else: 68 | fulldistr = True 69 | for sense, confidence in senses: 70 | if confidence == None: 71 | fulldistr = False 72 | break 73 | 74 | if fulldistr: 75 | self.data[word_id] = Distribution(senses) 76 | else: 77 | self.data[word_id] = senses 78 | 79 | 80 | def getMaxDistance(self): 81 | return self.maxDistance 82 | 83 | def __iter__(self): 84 | for word_id, senses in self.data.items(): 85 | yield word_id, senses,self.distances[word_id] 86 | 87 | def __len__(self): 88 | return len(self.data) 89 | 90 | def __getitem__(self, word_id): 91 | """Returns the sense distribution for the given word_id""" 92 | return self.data[word_id] 93 | 94 | def load(self, filename): 95 | f = io.open(filename,'r',encoding='utf-8') 96 | for line in f: 97 | fields = line.strip().split(" ") 98 | word_id = fields[0] 99 | if len(fields[1:]) == 1: 100 | #only one sense, no confidence expressed: 101 | self.append(word_id, [(fields[1],None)]) 102 | else: 103 | senses = [] 104 | distance=-1 105 | for i in range(1,len(fields),2): 106 | if i+1==len(fields): 107 | #The last field is the distance 108 | if fields[i][:4]=='+vdi': #Support for previous format of wsdout 109 | distance=float(fields[i][4:]) 110 | else: 111 | distance=float(fields[i]) 112 | else: 113 | if fields[i+1] == '?': fields[i+1] = None 114 | senses.append( (fields[i], fields[i+1]) ) 115 | self.append(word_id, senses,distance) 116 | 117 | f.close() 118 | 119 | def save(self, filename): 120 | f = io.open(filename,'w',encoding='utf-8') 121 | for word_id, senses,distance in self: 122 | f.write(word_id) 123 | for sense, confidence in senses: 124 | if confidence == None: confidence = "?" 125 | f.write(" " + str(sense) + " " + str(confidence)) 126 | if word_id in self.distances.keys(): 127 | f.write(' '+str(self.distances[word_id])) 128 | f.write("\n") 129 | f.close() 130 | 131 | def out(self, filename): 132 | for word_id, senses,distance in self: 133 | print(word_id,distance,end="") 134 | for sense, confidence in senses: 135 | if confidence == None: confidence = "?" 136 | print(" " + sense + " " + str(confidence),end="") 137 | print() 138 | 139 | def senses(self, bestonly=False): 140 | """Returns a list of all predicted senses""" 141 | l = [] 142 | for word_id, senses,distance in self: 143 | for sense, confidence in senses: 144 | if not sense in l: l.append(sense) 145 | if bestonly: 146 | break 147 | return l 148 | 149 | 150 | def loadfromtimbl(self, filename): 151 | timbloutput = TimblOutput(io.open(filename,'r',encoding='utf-8')) 152 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput): 153 | if distance != None: 154 | #distance='+vdi'+str(distance) 155 | distance=float(distance) 156 | if len(features) == 0: 157 | print("WARNING: Empty feature vector in " + filename + " (line " + str(i+1) + ") skipping!!",file=stderr) 158 | continue 159 | word_id = features[0] #note: this is an assumption that must be adhered to! 
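            # only instances for which TiMBL returned a class distribution are kept;
            # the (sense, confidence) distribution and the distance are stored via append()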
160 | if distribution: 161 | self.append(word_id, distribution,distance) 162 | 163 | def fromTimblToWsdout(self,fileTimbl,fileWsdout): 164 | timbloutput = TimblOutput(io.open(fileTimbl,'r',encoding='utf-8')) 165 | wsdoutfile = io.open(fileWsdout,'w',encoding='utf-8') 166 | for i, (features, referenceclass, predictedclass, distribution, distance) in enumerate(timbloutput): 167 | if len(features) == 0: 168 | print("WARNING: Empty feature vector in " + fileTimbl + " (line " + str(i+1) + ") skipping!!",file=stderr) 169 | continue 170 | word_id = features[0] #note: this is an assumption that must be adhered to! 171 | if distribution: 172 | wsdoutfile.write(word_id+' ') 173 | for sense, confidence in distribution: 174 | if confidence== None: confidence='?' 175 | wsdoutfile.write(sense+' '+str(confidence)+' ') 176 | wsdoutfile.write(str(distance)+'\n') 177 | wsdoutfile.close() 178 | 179 | 180 | 181 | class DataSet(object): #for testsets/trainingsets 182 | def __init__(self, filename): 183 | self.sense = {} #word_id => (sense_id, lemma,pos) 184 | self.targetwords = {} #(lemma,pos) => [sense_id] 185 | f = io.open(filename,'r',encoding='utf-8') 186 | for line in f: 187 | if len(line) > 0 and line[0] != '#': 188 | fields = line.strip('\n').split('\t') 189 | word_id = fields[0] 190 | sense_id = fields[1] 191 | lemma = fields[2] 192 | pos = fields[3] 193 | self.sense[word_id] = (sense_id, lemma, pos) 194 | if not (lemma,pos) in self.targetwords: 195 | self.targetwords[(lemma,pos)] = [] 196 | if not sense_id in self.targetwords[(lemma,pos)]: 197 | self.targetwords[(lemma,pos)].append(sense_id) 198 | f.close() 199 | 200 | def __getitem__(self, word_id): 201 | return self.sense[self._sanitize(word_id)] 202 | 203 | def getsense(self, word_id): 204 | return self.sense[self._sanitize(word_id)][0] 205 | 206 | def getlemma(self, word_id): 207 | return self.sense[self._sanitize(word_id)][1] 208 | 209 | def getpos(self, word_id): 210 | return self.sense[self._sanitize(word_id)][2] 211 | 212 | def _sanitize(self, word_id): 213 | return u(word_id) 214 | 215 | def __contains__(self, word_id): 216 | return (self._sanitize(word_id) in self.sense) 217 | 218 | 219 | def __iter__(self): 220 | for word_id, (sense, lemma, pos) in self.sense.items(): 221 | yield (word_id, sense, lemma, pos) 222 | 223 | def senses(self, lemma, pos): 224 | return self.targetwords[(lemma,pos)] 225 | -------------------------------------------------------------------------------- /pynlpl/formats/sonar.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - Simple Read library for D-Coi/SoNaR format 3 | # by Maarten van Gompel, ILK, Universiteit van Tilburg 4 | # http://ilk.uvt.nl/~mvgompel 5 | # proycon AT anaproy DOT nl 6 | # 7 | # Licensed under GPLv3 8 | # 9 | # This library facilitates parsing and reading corpora in 10 | # the SoNaR/D-Coi format. 
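# An illustrative (hedged) sketch of the kind of token markup the CorpusDocument class
# below extracts with its regular expressions: a <w> element whose xml:id follows the
# pattern "DOCID.p.N.s.N.w.N" (e.g. "WR-P-E-A-0000001.p.1.s.1.w.1"), with optional
# pos="..." and lemma="..." attributes and the word form as element text.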
11 | # 12 | #---------------------------------------------------------------- 13 | 14 | from __future__ import print_function 15 | from __future__ import unicode_literals 16 | from __future__ import division 17 | from __future__ import absolute_import 18 | 19 | 20 | import io 21 | import re 22 | import glob 23 | import os.path 24 | import sys 25 | 26 | from lxml import etree as ElementTree 27 | 28 | if sys.version < '3': 29 | from StringIO import StringIO 30 | else: 31 | from io import StringIO 32 | 33 | 34 | namespaces = { 35 | 'dcoi': "http://lands.let.ru.nl/projects/d-coi/ns/1.0", 36 | 'standalone':"http://ilk.uvt.nl/dutchsemcor-standalone", 37 | 'dsc':"http://ilk.uvt.nl/dutchsemcor", 38 | 'xml':"http://www.w3.org/XML/1998/namespace" 39 | } 40 | 41 | class CorpusDocument(object): 42 | """This class represent one document/text of the Corpus (read-only)""" 43 | 44 | def __init__(self, filename, encoding = 'iso-8859-15'): 45 | self.filename = filename 46 | self.id = os.path.basename(filename).split(".")[0] 47 | self.f = io.open(filename,'r', encoding=encoding) 48 | self.metadata = {} 49 | 50 | def _parseimdi(self,line): 51 | r = re.compile('(.*)') 52 | matches = r.findall(line) 53 | if matches: 54 | self.metadata['title'] = matches[0] 55 | if not 'date' in self.metadata: 56 | r = re.compile('(.*)') 57 | matches = r.findall(line) 58 | if matches: 59 | self.metadata['date'] = matches[0] 60 | 61 | 62 | def __iter__(self): 63 | """Iterate over all words, a four-tuple (word,id,pos,lemma), in the document""" 64 | 65 | r = re.compile('(.*)') 66 | for line in self.f.readlines(): 67 | matches = r.findall(line) 68 | for id, attribs, word in matches: 69 | pos = lemma = None 70 | m = re.findall('pos="([^"]+)"', attribs) 71 | if m: pos = m[0] 72 | 73 | m = re.findall('lemma="([^"]+)"', attribs) 74 | if m: lemma = m[0] 75 | 76 | yield word, id, pos, lemma 77 | if line.find('imdi:') != -1: 78 | self._parseimdi(line) 79 | 80 | def words(self): 81 | #alias 82 | return iter(self) 83 | 84 | 85 | def sentences(self): 86 | """Iterate over all sentences (sentence_id, sentence) in the document, sentence is a list of 4-tuples (word,id,pos,lemma)""" 87 | prevp = 0 88 | prevs = 0 89 | sentence = []; 90 | sentence_id = "" 91 | for word, id, pos, lemma in iter(self): 92 | try: 93 | doc_id, ptype, p, s, w = re.findall('([\w\d-]+)\.(p|head)\.(\d+)\.s\.(\d+)\.w\.(\d+)',id)[0] 94 | if ((p != prevp) or (s != prevs)) and sentence: 95 | yield sentence_id, sentence 96 | sentence = [] 97 | sentence_id = doc_id + '.' + ptype + '.' + str(p) + '.s.' + str(s) 98 | prevp = p 99 | except IndexError: 100 | doc_id, s, w = re.findall('([\w\d-]+)\.s\.(\d+)\.w\.(\d+)',id)[0] 101 | if s != prevs and sentence: 102 | yield sentence_id, sentence 103 | sentence = [] 104 | sentence_id = doc_id + '.s.' + str(s) 105 | sentence.append( (word,id,pos,lemma) ) 106 | prevs = s 107 | if sentence: 108 | yield sentence_id, sentence 109 | 110 | def paragraphs(self, with_id = False): 111 | """Extracts paragraphs, returns list of plain-text(!) paragraphs""" 112 | prevp = 0 113 | partext = [] 114 | for word, id, pos, lemma in iter(self): 115 | doc_id, ptype, p, s, w = re.findall('([\w\d-]+)\.(p|head)\.(\d+)\.s\.(\d+)\.w\.(\d+)',id)[0] 116 | if prevp != p and partext: 117 | yield ( doc_id + "." + ptype + "." + prevp , " ".join(partext) ) 118 | partext = [] 119 | partext.append(word) 120 | prevp = p 121 | if partext: 122 | yield (doc_id + "." + ptype + "." 
+ prevp, " ".join(partext) ) 123 | 124 | class Corpus: 125 | def __init__(self,corpusdir, extension = 'pos', restrict_to_collection = "", conditionf=lambda x: True, ignoreerrors=False): 126 | self.corpusdir = corpusdir 127 | self.extension = extension 128 | self.restrict_to_collection = restrict_to_collection 129 | self.conditionf = conditionf 130 | self.ignoreerrors = ignoreerrors 131 | 132 | def __iter__(self): 133 | if not self.restrict_to_collection: 134 | for f in glob.glob(self.corpusdir+"/*." + self.extension): 135 | if self.conditionf(f): 136 | try: 137 | yield CorpusDocument(f) 138 | except: 139 | print("Error, unable to parse " + f,file=sys.stderr) 140 | if not self.ignoreerrors: 141 | raise 142 | for d in glob.glob(self.corpusdir+"/*"): 143 | if (not self.restrict_to_collection or self.restrict_to_collection == os.path.basename(d)) and (os.path.isdir(d)): 144 | for f in glob.glob(d+ "/*." + self.extension): 145 | if self.conditionf(f): 146 | try: 147 | yield CorpusDocument(f) 148 | except: 149 | print("Error, unable to parse " + f,file=sys.stderr) 150 | if not self.ignoreerrors: 151 | raise 152 | 153 | 154 | ####################################################### 155 | 156 | def ns(namespace): 157 | """Resolves the namespace identifier to a full URL""" 158 | global namespaces 159 | return '{'+namespaces[namespace]+'}' 160 | 161 | 162 | class CorpusFiles(Corpus): 163 | def __iter__(self): 164 | if not self.restrict_to_collection: 165 | for f in glob.glob(self.corpusdir+"/*." + self.extension): 166 | if self.conditionf(f): 167 | yield f 168 | for d in glob.glob(self.corpusdir+"/*"): 169 | if (not self.restrict_to_collection or self.restrict_to_collection == os.path.basename(d)) and (os.path.isdir(d)): 170 | for f in glob.glob(d+ "/*." + self.extension): 171 | if self.conditionf(f): 172 | yield f 173 | 174 | 175 | class CorpusX(Corpus): 176 | def __iter__(self): 177 | if not self.restrict_to_collection: 178 | for f in glob.glob(self.corpusdir+"/*." + self.extension): 179 | if self.conditionf(f): 180 | try: 181 | yield CorpusDocumentX(f) 182 | except: 183 | print("Error, unable to parse " + f,file=sys.stderr) 184 | if not self.ignoreerrors: 185 | raise 186 | for d in glob.glob(self.corpusdir+"/*"): 187 | if (not self.restrict_to_collection or self.restrict_to_collection == os.path.basename(d)) and (os.path.isdir(d)): 188 | for f in glob.glob(d+ "/*." 
+ self.extension): 189 | if self.conditionf(f): 190 | try: 191 | yield CorpusDocumentX(f) 192 | except: 193 | print("Error, unable to parse " + f,file=sys.stderr) 194 | if not self.ignoreerrors: 195 | raise 196 | 197 | 198 | 199 | class CorpusDocumentX: 200 | """This class represent one document/text of the Corpus, loaded into memory at once and retaining the full structure""" 201 | 202 | def __init__(self, filename, tree = None, index=True ): 203 | global namespaces 204 | self.filename = filename 205 | if not tree: 206 | self.tree = ElementTree.parse(self.filename) 207 | self.committed = True 208 | elif isinstance(tree, ElementTree._Element): 209 | self.tree = tree 210 | self.committed = False 211 | 212 | #Grab root element and determine if we run inline or standalone 213 | self.root = self.xpath("/dcoi:DCOI") 214 | if self.root: 215 | self.root = self.root[0] 216 | self.inline = True 217 | else: 218 | raise Exception("Not in DCOI/SoNaR format!") 219 | #self.root = self.xpath("/standalone:text") 220 | #self.inline = False 221 | #if not self.root: 222 | # raise FormatError() 223 | 224 | #build an index 225 | self.index = {} 226 | if index: 227 | self._index(self.root) 228 | 229 | def _index(self,node): 230 | if ns('xml') + 'id' in node.attrib: 231 | self.index[node.attrib[ns('xml') + 'id']] = node 232 | for subnode in node: #TODO: can we do this with xpath instead? 233 | self._index(subnode) 234 | 235 | def validate(self, formats_dir="../formats/"): 236 | """checks if the document is valid""" 237 | #TODO: download XSD from web 238 | if self.inline: 239 | xmlschema = ElementTree.XMLSchema(ElementTree.parse(StringIO("\n".join(open(formats_dir+"dcoi-dsc.xsd").readlines())))) 240 | xmlschema.assertValid(self.tree) 241 | #return xmlschema.validate(self) 242 | else: 243 | xmlschema = ElementTree.XMLSchema(ElementTree.parse(StringIO("\n".join(open(formats_dir+"dutchsemcor-standalone.xsd").readlines())))) 244 | xmlschema.assertValid(self.tree) 245 | #return xmlschema.validate(self) 246 | 247 | def xpath(self, expression): 248 | """Executes an xpath expression using the correct namespaces""" 249 | global namespaces 250 | return self.tree.xpath(expression, namespaces=namespaces) 251 | 252 | 253 | def __exists__(self, id): 254 | return (id in self.index) 255 | 256 | def __getitem__(self, id): 257 | return self.index[id] 258 | 259 | 260 | def paragraphs(self, node=None): 261 | """iterate over paragraphs""" 262 | if node == None: node = self 263 | return node.xpath("//dcoi:p") 264 | 265 | def sentences(self, node=None): 266 | """iterate over sentences""" 267 | if node == None: node = self 268 | return node.xpath("//dcoi:s") 269 | 270 | def words(self,node=None): 271 | """iterate over words""" 272 | if node == None: node = self 273 | return node.xpath("//dcoi:w") 274 | 275 | def save(self, filename=None, encoding='iso-8859-15'): 276 | if not filename: filename = self.filename 277 | self.tree.write(filename, encoding=encoding, method='xml', pretty_print=True, xml_declaration=True) 278 | 279 | 280 | -------------------------------------------------------------------------------- /pynlpl/formats/cql.py: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------- 2 | # PyNLPl - Corpus Query Language (CQL) 3 | # by Maarten van Gompel 4 | # Centre for Language Studies 5 | # Radboud University Nijmegen 6 | # http://proycon.github.com/folia 7 | # http://www.github.com/proycon/pynlpl 8 | # proycon AT anaproy DOT nl 9 | # 10 
| # Parser and interpreter for a basic subset of the Corpus Query Language 11 | # 12 | # Licensed under GPLv3 13 | # 14 | #---------------------------------------------------------------- 15 | 16 | from __future__ import print_function, unicode_literals, division, absolute_import 17 | 18 | from pynlpl.fsa import State, NFA 19 | import re 20 | import sys 21 | 22 | OPERATORS = ('=','!=') 23 | MAXINTERVAL = 99 24 | 25 | class SyntaxError(Exception): 26 | pass 27 | 28 | class ValueExpression(object): 29 | def __init__(self, values): 30 | self.values = values #disjunction 31 | 32 | @staticmethod 33 | def parse(s,i): 34 | values = "" 35 | assert s[i] == '"' 36 | i += 1 37 | while not (s[i] == '"' and s[i-1] != "\\"): 38 | values += s[i] 39 | i += 1 40 | values = values.split("|") 41 | return ValueExpression(values), i+1 42 | 43 | def __len__(self): 44 | return len(self.values) 45 | 46 | def __iter__(self): 47 | for x in self.values: 48 | yield x 49 | 50 | def __getitem__(self,index): 51 | return self.values[index] 52 | 53 | class AttributeExpression(object): 54 | def __init__(self, attribute, operator, valueexpression): 55 | self.attribute = attribute 56 | self.operator = operator 57 | self.valueexpr = valueexpression 58 | 59 | @staticmethod 60 | def parse(s,i): 61 | while s[i] == " ": 62 | i +=1 63 | if s[i] == '"': 64 | #no attribute and no operator, use defaults: 65 | attribute = "word" 66 | operator = "=" 67 | else: 68 | attribute = "" 69 | while s[i] not in (' ','!','>','<','='): 70 | attribute += s[i] 71 | i += 1 72 | if not attribute: 73 | raise SyntaxError("Expected attribute name, none found") 74 | operator = "" 75 | while s[i] in (' ','!','>','<','='): 76 | if s[i] != ' ': 77 | operator += s[i] 78 | i += 1 79 | if operator not in OPERATORS: 80 | raise SyntaxError("Expected operator, got '" + operator + "'") 81 | if s[i] != '"': 82 | raise SyntaxError("Expected start of value expression (doublequote) in position " + str(i) + ", got " + s[i]) 83 | valueexpr, i = ValueExpression.parse(s,i) 84 | return AttributeExpression(attribute,operator, valueexpr), i 85 | 86 | class TokenExpression(object): 87 | def __init__(self, attribexprs=[], interval=None): 88 | self.attribexprs = attribexprs 89 | self.interval = interval 90 | 91 | @staticmethod 92 | def parse(s,i): 93 | attribexprs = [] 94 | while s[i] == " ": 95 | i +=1 96 | if s[i] == '"': 97 | attribexpr,i = AttributeExpression.parse(s,i) 98 | attribexprs.append(attribexpr) 99 | elif s[i] == "[": 100 | i += 1 101 | while True: 102 | while s[i] == " ": 103 | i +=1 104 | if s[i] == "&": 105 | attribexpr,i = AttributeExpression.parse(s,i+1) 106 | attribexprs.append(attribexpr) 107 | elif s[i] == "]": 108 | i += 1 109 | break 110 | elif not attribexprs: 111 | attribexpr,i = AttributeExpression.parse(s,i) 112 | attribexprs.append(attribexpr) 113 | else: 114 | raise SyntaxError("Unexpected char whilst parsing token expression, position " + str(i) + ": " + s[i]) 115 | else: 116 | raise SyntaxError("Expected token expression starting with either \" or [, got: " + s[i]) 117 | 118 | if i == len(s): 119 | interval = None #end of query! 
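# A token expression may be followed by a quantifier: "{n}", "{n,m}" (or "{n-m}"), "?", "+" or "*".
# The branches below normalise each form to a (min, max) interval; MAXINTERVAL (99) stands in for an unbounded maximum.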
120 | elif s[i] == "{": 121 | #interval expression, find end: 122 | interval = None 123 | for j in range(i+1, len(s)): 124 | if s[j] == "}": 125 | interval = s[i+1:j] 126 | 127 | if interval is None: 128 | raise SyntaxError("Interval expression started but no end-brace found") 129 | 130 | i += len(interval) + 2 131 | 132 | try: 133 | if ',' in interval: 134 | interval = tuple(int(x) for x in interval.split(",")) 135 | if len(interval) != 2: 136 | raise SyntaxError("Invalid interval: " + interval) 137 | elif '-' in interval: #alternative 138 | interval = tuple(int(x) for x in interval.split("-")) 139 | if len(interval) != 2: 140 | raise SyntaxError("Invalid interval: " + interval) 141 | else: 142 | interval = (int(interval),int(interval)) 143 | except ValueError: 144 | raise SyntaxError("Invalid interval: " + interval) 145 | elif s[i] == "?": 146 | interval = (0,1) 147 | i += 1 148 | elif s[i] == "+": 149 | interval = (1,MAXINTERVAL) 150 | i += 1 151 | elif s[i] == "*": 152 | interval = (0,MAXINTERVAL) 153 | i += 1 154 | else: 155 | interval = None 156 | 157 | return TokenExpression(attribexprs,interval),i 158 | 159 | 160 | def __len__(self): 161 | return len(self.attribexprs) 162 | 163 | def __iter__(self): 164 | for x in self.attribexprs: 165 | yield x 166 | 167 | def __getitem__(self,index): 168 | return self.attribexprs[index] 169 | 170 | def nfa(self, nextstate): 171 | """Returns an initial state for an NFA""" 172 | if self.interval: 173 | mininterval, maxinterval = self.interval #pylint: disable=unpacking-non-sequence 174 | nextstate2 = nextstate 175 | for i in range(maxinterval): 176 | state = State(transitions=[(self,self.match, nextstate2)]) 177 | if i+1> mininterval: 178 | if nextstate is not nextstate2: state.transitions.append((self,self.match, nextstate)) 179 | if maxinterval == MAXINTERVAL: 180 | state.epsilon.append(state) 181 | break 182 | nextstate2 = state 183 | return state 184 | else: 185 | state = State(transitions=[(self,self.match, nextstate)]) 186 | return state 187 | 188 | 189 | def match(self, value): 190 | match = False 191 | for _, attribexpr in enumerate(self): 192 | annottype = attribexpr.attribute 193 | if annottype == 'text': annottype = 'word' 194 | if attribexpr.operator == "!=": 195 | negate = True 196 | elif attribexpr.operator == "=": 197 | negate = False 198 | else: 199 | raise Exception("Unexpected operator " + attribexpr.operator) 200 | 201 | if len(attribexpr.valueexpr) > 1: 202 | expr = re.compile("^(" + "|".join(attribexpr.valueexpr) + ")$") 203 | else: 204 | expr = re.compile("^" + attribexpr.valueexpr[0] + '$') 205 | match = (expr.match(value[annottype]) is not None) 206 | if negate: 207 | match = not match 208 | if not match: 209 | return False 210 | return True 211 | 212 | 213 | 214 | class Query(object): 215 | def __init__(self, s): 216 | self.tokenexprs = [] 217 | i = 0 218 | l = len(s) 219 | while i < l: 220 | if s[i] == " ": 221 | i += 1 222 | else: 223 | tokenexpr,i = TokenExpression.parse(s,i) 224 | self.tokenexprs.append(tokenexpr) 225 | 226 | def __len__(self): 227 | return len(self.tokenexprs) 228 | 229 | def __iter__(self): 230 | for x in self.tokenexprs: 231 | yield x 232 | 233 | def __getitem__(self,index): 234 | return self.tokenexprs[index] 235 | 236 | def nfa(self): 237 | """convert the expression into an NFA""" 238 | finalstate = State(final=True) 239 | nextstate = finalstate 240 | for tokenexpr in reversed(self): 241 | state = tokenexpr.nfa(nextstate) 242 | nextstate = state 243 | return NFA(state) 244 | 245 | 246 | def 
__call__(self, tokens, debug=False): 247 | """Execute the CQL expression, pass a list of tokens/annotations using keyword arguments: word, pos, lemma, etc""" 248 | 249 | if not tokens: 250 | raise Exception("Pass a list of tokens/annotation using keyword arguments! (word,pos,lemma, or others)") 251 | 252 | #convert the expression into an NFA 253 | nfa = self.nfa() 254 | if debug: 255 | print(repr(nfa), file=sys.stderr) 256 | 257 | return list(nfa.find(tokens,debug)) 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | def cql2fql(cq): 266 | fq = "SELECT FOR SPAN " 267 | if not isinstance(cq, Query): 268 | cq = Query(cq) 269 | 270 | for i, token in enumerate(cq): 271 | if i > 0: fq += " & " 272 | fq += "w" 273 | if token.interval: 274 | fq += " {" + str(token.interval[0]) + "," + str(token.interval[1])+ "} " 275 | else: 276 | fq += " " 277 | if token.attribexprs: 278 | fq += "WHERE " 279 | for j, attribexpr in enumerate(token): 280 | if j > 0: 281 | fq += " AND " 282 | fq += "(" 283 | if attribexpr.operator == "!=": 284 | operator = "NOTMATCHES" 285 | elif attribexpr.operator == "=": 286 | operator = "MATCHES" 287 | else: 288 | raise Exception("Invalid operator: " + attribexpr.operator) 289 | if attribexpr.attribute in ("word","text"): 290 | if len(attribexpr.valueexpr) > 1: 291 | fq += "text " + operator + " \"^(" + "|".join(attribexpr.valueexpr) + ")$\" " 292 | else: 293 | fq += "text " + operator + " \"^" + attribexpr.valueexpr[0] + "$\" " 294 | else: 295 | annottype = attribexpr.attribute 296 | if annottype == "tag": 297 | annottype = "pos" 298 | elif annottype == "lempos": 299 | raise Exception("lempos not supported in CQL to FQL conversion, use pos and lemma separately") 300 | fq += annottype + " HAS class " 301 | if len(attribexpr.valueexpr) > 1: 302 | fq += operator + " \"^(" + "|".join(attribexpr.valueexpr) + ")$\" " 303 | else: 304 | fq += operator + " \"^" + attribexpr.valueexpr[0] + "$\" " 305 | fq += ")" 306 | 307 | return fq 308 | -------------------------------------------------------------------------------- /pynlpl/formats/giza.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | ############################################################### 4 | # PyNLPl - WordAlignment Library for reading GIZA++ A3 files 5 | # by Maarten van Gompel (proycon) 6 | # http://ilk.uvt.nl/~mvgompel 7 | # Induction for Linguistic Knowledge Research Group 8 | # Universiteit van Tilburg 9 | # 10 | # In part using code by Sander Canisius 11 | # 12 | # Licensed under GPLv3 13 | # 14 | # 15 | # This library reads GIZA++ A3 files. It contains three classes over which 16 | # you can iterate to obtain (sourcewords,targetwords,alignment) pairs. 
17 | # 18 | # - WordAlignment - Reads target-source.A3.final files, in which each source word is aligned to one target word 19 | # - MultiWordAlignment - Reads source-target.A3.final files, in which each source word may be aligned to multiple target target words 20 | # - IntersectionAlignment - Computes the intersection between the above two alignments 21 | # 22 | # 23 | ############################################################### 24 | 25 | from __future__ import print_function 26 | from __future__ import unicode_literals 27 | from __future__ import division 28 | from __future__ import absolute_import 29 | 30 | from pynlpl.common import u 31 | 32 | import bz2 33 | import gzip 34 | import copy 35 | import io 36 | from sys import stderr 37 | 38 | class GizaSentenceAlignment(object): 39 | 40 | def __init__(self, sourceline, targetline, index): 41 | self.index = index 42 | self.alignment = [] 43 | if sourceline: 44 | self.source = self._parsesource(sourceline.strip()) 45 | else: 46 | self.source = [] 47 | self.target = targetline.strip().split(' ') 48 | 49 | def _parsesource(self, line): 50 | cleanline = "" 51 | 52 | inalignment = False 53 | begin = 0 54 | sourceindex = 0 55 | 56 | for i in range(0,len(line)): 57 | if line[i] == ' ' or i == len(line) - 1: 58 | if i == len(line) - 1: 59 | offset = 1 60 | else: 61 | offset = 0 62 | 63 | word = line[begin:i+offset] 64 | if word == '})': 65 | inalignment = False 66 | begin = i + 1 67 | continue 68 | elif word == "({": 69 | inalignment = True 70 | begin = i + 1 71 | continue 72 | if word.strip() and word != 'NULL': 73 | if not inalignment: 74 | sourceindex += 1 75 | if cleanline: cleanline += " " 76 | cleanline += word 77 | else: 78 | targetindex = int(word) 79 | self.alignment.append( (sourceindex-1, targetindex-1) ) 80 | begin = i + 1 81 | 82 | return cleanline.split(' ') 83 | 84 | 85 | def intersect(self,other): 86 | if other.target != self.source: 87 | print("GizaSentenceAlignment.intersect(): Mismatch between self.source and other.target: " + repr(self.source) + " -- vs -- " + repr(other.target),file=stderr) 88 | return None 89 | 90 | intersection = copy.copy(self) 91 | intersection.alignment = [] 92 | 93 | for sourceindex, targetindex in self.alignment: 94 | for targetindex2, sourceindex2 in other.alignment: 95 | if targetindex2 == targetindex and sourceindex2 == sourceindex: 96 | intersection.alignment.append( (sourceindex, targetindex) ) 97 | 98 | return intersection 99 | 100 | def __repr__(self): 101 | s = " ".join(self.source)+ " ||| " 102 | s += " ".join(self.target) + " ||| " 103 | for S,T in sorted(self.alignment): 104 | s += self.source[S] + "->" + self.target[T] + " ; " 105 | return s 106 | 107 | 108 | def getalignedtarget(self, index): 109 | """Returns target range only if source index aligns to a single consecutive range of target tokens.""" 110 | targetindices = [] 111 | target = None 112 | foundindex = -1 113 | for sourceindex, targetindex in self.alignment: 114 | if sourceindex == index: 115 | targetindices.append(targetindex) 116 | if len(targetindices) > 1: 117 | for i in range(1,len(targetindices)): 118 | if abs(targetindices[i] - targetindices[i-1]) != 1: 119 | break # not consecutive 120 | foundindex = (min(targetindices), max(targetindices)) 121 | target = ' '.join(self.target[min(targetindices):max(targetindices)+1]) 122 | elif targetindices: 123 | foundindex = targetindices[0] 124 | target = self.target[foundindex] 125 | 126 | return target, foundindex 127 | 128 | class GizaModel(object): 129 | def __init__(self, 
filename, encoding= 'utf-8'): 130 | if filename.split(".")[-1] == "bz2": 131 | self.f = bz2.BZ2File(filename,'r') 132 | elif filename.split(".")[-1] == "gz": 133 | self.f = gzip.GzipFile(filename,'r') 134 | else: 135 | self.f = io.open(filename,'r',encoding=encoding) 136 | self.nextlinebuffer = None 137 | 138 | 139 | def __iter__(self): 140 | self.f.seek(0) 141 | nextlinebuffer = u(next(self.f)) 142 | sentenceindex = 0 143 | 144 | done = False 145 | while not done: 146 | sentenceindex += 1 147 | line = nextlinebuffer 148 | if line[0] != '#': 149 | raise Exception("Error parsing GIZA++ Alignment at sentence " + str(sentenceindex) + ", expected new fragment, found: " + repr(line)) 150 | 151 | targetline = u(next(self.f)) 152 | sourceline = u(next(self.f)) 153 | 154 | yield GizaSentenceAlignment(sourceline, targetline, sentenceindex) 155 | 156 | try: 157 | nextlinebuffer = u(next(self.f)) 158 | except StopIteration: 159 | done = True 160 | 161 | 162 | def __del__(self): 163 | if self.f: self.f.close() 164 | 165 | 166 | #------------------ OLD ------------------- 167 | 168 | def parseAlignment(tokens): #by Sander Canisius 169 | assert tokens.pop(0) == "NULL" 170 | while tokens.pop(0) != "})": 171 | pass 172 | 173 | while tokens: 174 | word = tokens.pop(0) 175 | assert tokens.pop(0) == "({" 176 | positions = [] 177 | token = tokens.pop(0) 178 | while token != "})": 179 | positions.append(int(token)) 180 | token = tokens.pop(0) 181 | 182 | yield word, positions 183 | 184 | 185 | class WordAlignment: 186 | """Target to Source alignment: reads target-source.A3.final files, in which each source word is aligned to one target word""" 187 | 188 | def __init__(self,filename, encoding=False): 189 | """Open a target-source GIZA++ A3 file. The file may be bzip2 compressed. If an encoding is specified, proper unicode strings will be returned""" 190 | 191 | if filename.split(".")[-1] == "bz2": 192 | self.stream = bz2.BZ2File(filename,'r') 193 | else: 194 | self.stream = open(filename) 195 | self.encoding = encoding 196 | 197 | 198 | def __del__(self): 199 | self.stream.close() 200 | 201 | def __iter__(self): #by Sander Canisius 202 | line = self.stream.readline() 203 | while line: 204 | assert line.startswith("#") 205 | src = self.stream.readline().split() 206 | trg = [] 207 | alignment = [None for i in xrange(len(src))] 208 | 209 | for i, (targetWord, positions) in enumerate(parseAlignment(self.stream.readline().split())): 210 | 211 | trg.append(targetWord) 212 | 213 | for pos in positions: 214 | assert alignment[pos - 1] is None 215 | alignment[pos - 1] = i 216 | 217 | if self.encoding: 218 | yield [ u(w,self.encoding) for w in src ], [ u(w,self.encoding) for w in trg ], alignment 219 | else: 220 | yield src, trg, alignment 221 | 222 | line = self.stream.readline() 223 | 224 | 225 | def targetword(self, index, targetwords, alignment): 226 | """Return the aligned targetword for a specified index in the source words""" 227 | if alignment[index]: 228 | return targetwords[alignment[index]] 229 | else: 230 | return None 231 | 232 | def reset(self): 233 | self.stream.seek(0) 234 | 235 | class MultiWordAlignment: 236 | """Source to Target alignment: reads source-target.A3.final files, in which each source word may be aligned to multiple target words (adapted from code by Sander Canisius)""" 237 | 238 | def __init__(self,filename, encoding = False): 239 | """Load a target-source GIZA++ A3 file. The file may be bzip2 compressed. 
If an encoding is specified, proper unicode strings will be returned""" 240 | 241 | if filename.split(".")[-1] == "bz2": 242 | self.stream = bz2.BZ2File(filename,'r') 243 | else: 244 | self.stream = open(filename) 245 | self.encoding = encoding 246 | 247 | def __del__(self): 248 | self.stream.close() 249 | 250 | def __iter__(self): 251 | line = self.stream.readline() 252 | while line: 253 | assert line.startswith("#") 254 | trg = self.stream.readline().split() 255 | src = [] 256 | alignment = [] 257 | 258 | for i, (word, positions) in enumerate(parseAlignment(self.stream.readline().split())): 259 | src.append(word) 260 | alignment.append( [ p - 1 for p in positions ] ) 261 | 262 | 263 | if self.encoding: 264 | yield [ unicode(w,self.encoding) for w in src ], [ unicode(w,self.encoding) for w in trg ], alignment 265 | else: 266 | yield src, trg, alignment 267 | 268 | line = self.stream.readline() 269 | 270 | def targetword(self, index, targetwords, alignment): 271 | """Return the aligned targeword for a specified index in the source words. Multiple words are concatenated together with a space in between""" 272 | return " ".join(targetwords[alignment[index]]) 273 | 274 | def targetwords(self, index, targetwords, alignment): 275 | """Return the aligned targetwords for a specified index in the source words""" 276 | return [ targetwords[x] for x in alignment[index] ] 277 | 278 | def reset(self): 279 | self.stream.seek(0) 280 | 281 | 282 | class IntersectionAlignment: 283 | 284 | def __init__(self,source2target,target2source,encoding=False): 285 | self.s2t = MultiWordAlignment(source2target, encoding) 286 | self.t2s = WordAlignment(target2source, encoding) 287 | self.encoding = encoding 288 | 289 | def __iter__(self): 290 | for (src, trg, alignment), (revsrc, revtrg, revalignment) in zip(self.s2t,self.t2s): #will take unnecessary memory in Python 2.x, optimal in Python 3 291 | if src != revsrc or trg != revtrg: 292 | raise Exception("Files are not identical!") 293 | else: 294 | #keep only those alignments that are present in both 295 | intersection = [] 296 | for i, x in enumerate(alignment): 297 | if revalignment[i] in x: 298 | intersection.append(revalignment[i]) 299 | else: 300 | intersection.append(None) 301 | 302 | yield src, trg, intersection 303 | 304 | def reset(self): 305 | self.s2t.reset() 306 | self.t2s.reset() 307 | 308 | -------------------------------------------------------------------------------- /pynlpl/tools/frogwrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | 5 | #Frog Wrapper with XML input and FoLiA output support 6 | 7 | 8 | from __future__ import print_function, unicode_literals, division, absolute_import 9 | 10 | import getopt 11 | import lxml.etree 12 | import sys 13 | import os 14 | import codecs 15 | 16 | if __name__ == "__main__": 17 | sys.path.append(sys.path[0] + '/../..') 18 | os.environ['PYTHONPATH'] = sys.path[0] + '/../..' 
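# The two lines above put the repository root (two directories up from tools/) on sys.path, and on PYTHONPATH for
# any child processes, so that running this script straight from a source checkout finds the local pynlpl package.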
19 | 20 | 21 | import pynlpl.formats.folia as folia 22 | from pynlpl.clients.frogclient import FrogClient 23 | 24 | def legacyout(i, word,lemma,morph,pos): 25 | if word: 26 | out = str(i + 1) + "\t" + word + "\t" + lemma + "\t" + morph + "\t" + pos 27 | print(out.encode('utf-8')) 28 | else: 29 | print() 30 | 31 | def usage(): 32 | print >>sys.stderr,"frogwrapper.py [options]" 33 | print >>sys.stderr,"------------------------------------------------------" 34 | print >>sys.stderr,"Input file:" 35 | print >>sys.stderr,"\t--txt=[file] Plaintext input" 36 | print >>sys.stderr,"\t--xml=[file] XML Input" 37 | print >>sys.stderr,"\t--folia=[file] FoLiA XML Input" 38 | print >>sys.stderr,"Frog settings:" 39 | print >>sys.stderr,"\t-p [port] Port the Frog server is running on" 40 | print >>sys.stderr,"Output type:" 41 | print >>sys.stderr,"\t--id=[ID] ID for outputted FoLiA XML Document" 42 | print >>sys.stderr,"\t--legacy Use legacy columned output instead of FoLiA" 43 | print >>sys.stderr,"\t-o Write output to input file (only works for --folia)" 44 | print >>sys.stderr,"XML Input:" 45 | print >>sys.stderr,"\t--selectsen=[expr] Use xpath expression to select sentences" 46 | print >>sys.stderr,"\t--selectpar=[expr] Use xpath expression to select paragraphs" 47 | print >>sys.stderr,"\t--idattrib=[attrb] Copy ID from this attribute" 48 | print >>sys.stderr,"Text Input:" 49 | print >>sys.stderr,"\t-N No structure" 50 | print >>sys.stderr,"\t-S One sentence per line (strict)" 51 | print >>sys.stderr,"\t-P One paragraph per line" 52 | print >>sys.stderr,"\t-I Value in first column (tab seperated) is ID!" 53 | print >>sys.stderr,"\t-E [encoding] Encoding of input file (default: utf-8)" 54 | 55 | try: 56 | opts, files = getopt.getopt(sys.argv[1:], "hSPINEp:o", ["txt=","xml=", "folia=","id=",'legacy','tok','selectsen=','selectpar=','idattrib=']) 57 | except getopt.GetoptError as err: 58 | # print help information and exit: 59 | print(str(err)) 60 | usage() 61 | sys.exit(1) 62 | 63 | 64 | textfile = xmlfile = foliafile = None 65 | foliaid = 'UNTITLED' 66 | legacy = None 67 | tok = False 68 | idinfirstcolumn = False 69 | encoding = 'utf-8' 70 | mode='s' 71 | xpathselect = '' 72 | idattrib='' 73 | port = None 74 | save = False 75 | 76 | for o, a in opts: 77 | if o == "-h": 78 | usage() 79 | sys.exit(0) 80 | elif o == "-I": 81 | idinfirstcolumn = True 82 | elif o == "-S": 83 | mode = 's' 84 | elif o == "-P": 85 | mode = 'p' 86 | elif o == "-p": 87 | port = int(a) 88 | elif o == "-N": 89 | mode = 'n' 90 | elif o == "-E": 91 | encoding = a 92 | elif o == "--selectsen": 93 | mode='s' 94 | xpathselect = a 95 | elif o == "--selectpar": 96 | mode='p' 97 | xpathselect = a 98 | elif o == "--idattrib": 99 | idattrib = a 100 | elif o == "--txt": 101 | textfile = a 102 | elif o == "--xml": 103 | xmlfile = a 104 | elif o == "--folia": 105 | foliafile = a 106 | elif o == "--id": 107 | foliaid = a #ID 108 | elif o == "-o": 109 | save = True 110 | elif o == "--legacy": 111 | legacy = True 112 | elif o == "--tok": 113 | tok = True 114 | else: 115 | print >>sys.stderr, "ERROR: Unknown option:",o 116 | sys.exit(1) 117 | 118 | if not port: 119 | print >> sys.stderr,"ERROR: No port specified to connect to Frog server" 120 | sys.exit(2) 121 | elif (not textfile and not xmlfile and not foliafile): 122 | print >> sys.stderr,"ERROR: Specify a file with either --txt, --xml or --folia" 123 | sys.exit(2) 124 | elif xmlfile and not xpathselect: 125 | print >> sys.stderr,"ERROR: You need to specify --selectsen or --selectpar when using 
--xml" 126 | sys.exit(2) 127 | 128 | frogclient = FrogClient('localhost',port) 129 | 130 | idmap = [] 131 | data = [] 132 | 133 | if textfile: 134 | f = codecs.open(textfile, 'r', encoding) 135 | for line in f.readlines(): 136 | if idinfirstcolumn: 137 | id, line = line.split('\t',1) 138 | idmap.append(id.strip()) 139 | else: 140 | idmap.append(None) 141 | data.append(line.strip()) 142 | f.close() 143 | 144 | if xmlfile: 145 | xmldoc = lxml.etree.parse(xmlfile) 146 | for node in xmldoc.xpath(xpathselect): 147 | if idattrib: 148 | if idattrib in node.attrib: 149 | idmap.append(node.attrib[idattrib]) 150 | else: 151 | print >>sys.stderr,"WARNING: Attribute " + idattrib + " not found on node!" 152 | idmap.append(None) 153 | else: 154 | idmap.append(None) 155 | data.append(node.text) 156 | 157 | if foliafile: 158 | foliadoc = folia.Document(file=foliafile) 159 | if not foliadoc.declared(folia.AnnotationType.TOKEN): 160 | foliadoc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 161 | if not foliadoc.declared(folia.AnnotationType.POS): 162 | foliadoc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 163 | if not foliadoc.declared(folia.AnnotationType.LEMMA): 164 | foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 165 | foliadoc.language('nld') 166 | text = foliadoc.data[-1] 167 | 168 | for p in foliadoc.paragraphs(): 169 | found_s = False 170 | for s in p.sentences(): 171 | found_w = False 172 | for w in s.words(): 173 | found_w = True 174 | found_s = True 175 | if found_w: 176 | #pass tokenised sentence 177 | words = s.words() 178 | response = frogclient.process(" ".join([unicode(w) for w in words])) 179 | for i, (word, lemma, morph, pos) in enumerate(response): 180 | if legacy: legacyout(i,word,lemma,morph,pos) 181 | if unicode(words[i]) == word: 182 | if lemma: 183 | words[i].append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 184 | if pos: 185 | words[i].append( folia.PosAnnotation(foliadoc, cls=pos) ) 186 | else: 187 | print >>sys.stderr,"WARNING: Out of sync after calling Frog! 
", i, word 188 | 189 | else: 190 | #pass untokenised sentence 191 | try: 192 | sentext = s.text() 193 | except folia.NoSuchText: 194 | continue 195 | response = frogclient.process(sentext) 196 | for i, (word, lemma, morph, pos) in enumerate(response): 197 | if legacy: legacyout(i,word,lemma,morph,pos) 198 | if word: 199 | w = folia.Word(foliadoc, text=word, generate_id_in=s) 200 | if lemma: 201 | w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 202 | if pos: 203 | w.append( folia.PosAnnotation(foliadoc, cls=pos) ) 204 | s.append(w) 205 | 206 | if not found_s: 207 | #pass paragraph 208 | try: 209 | partext = p.text() 210 | except folia.NoSuchText: 211 | continue 212 | 213 | s = folia.Sentence(foliadoc, generate_id_in=p) 214 | response = frogclient.process(partext) 215 | for i, (word, lemma, morph, pos) in enumerate(response): 216 | if (not word or i == len(response) - 1) and len(s) > 0: 217 | #gap or end of response: terminate sentence 218 | p.append(s) 219 | s = folia.Sentence(foliadoc, generate_id_in=p) 220 | elif word: 221 | w = folia.Word(foliadoc, text=word, generate_id_in=s) 222 | if lemma: 223 | w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 224 | if pos: 225 | w.append( folia.PosAnnotation(foliadoc, cls=pos) ) 226 | s.append(w) 227 | 228 | 229 | else: 230 | foliadoc = folia.Document(id=foliaid) 231 | foliadoc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 232 | foliadoc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 233 | foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO) 234 | foliadoc.language('nld') 235 | text = folia.Text(foliadoc, id=foliadoc.id + '.text.1') 236 | foliadoc.append(text) 237 | 238 | 239 | curid = None 240 | for (fragment, id) in zip(data,idmap): 241 | if mode == 's' or mode == 'n': 242 | if id: 243 | s = folia.Sentence(foliadoc, id=id) 244 | else: 245 | s = folia.Sentence(foliadoc, generate_id_in=text) 246 | elif mode == 'p': 247 | if id: 248 | p = folia.Paragraph(foliadoc, id=id) 249 | else: 250 | p = folia.Paragraph(foliadoc, generate_id_in=text) 251 | s = folia.Sentence(foliadoc, generate_id_in=p) 252 | 253 | curid = s.id 254 | response = frogclient.process(fragment) 255 | for i, (word, lemma, morph, pos) in enumerate(response): 256 | if legacy: 257 | legacyout(i,word,lemma,morph,pos) 258 | continue 259 | 260 | if word: 261 | w = folia.Word(foliadoc, text=word, generate_id_in=s) 262 | if lemma: 263 | w.append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) 264 | if pos: 265 | w.append( folia.PosAnnotation(foliadoc, cls=pos) ) 266 | s.append(w) 267 | if (not word or i == len(response) - 1) and len(s) > 0: 268 | #gap or end of response: terminate sentence 269 | if mode == 'p': 270 | p.append(s) 271 | if (i == len(response) - 1): 272 | text.append(p) 273 | elif mode == 'n' or (mode == 's' and i == len(response) - 1): 274 | text.append(s) 275 | elif mode == 's': 276 | continue 277 | 278 | if i < len(response) - 1: #not done yet? 
279 | #create new sentence 280 | if mode == 'p': 281 | s = folia.Sentence(foliadoc, generate_id_in=p) 282 | elif mode == 'n' and id: 283 | #no id for this unforeseen sentence, make something up 284 | s = folia.Sentence(foliadoc, id=curid+'.X') 285 | print("WARNING: Sentence found that was not in original",file=sys.stderr) 286 | 287 | if not legacy: 288 | print(foliadoc.xmlstring()) 289 | if save and foliafile: 290 | foliadoc.save() 291 | --------------------------------------------------------------------------------
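The GIZA++ reader above is easiest to follow with a tiny worked example. The sketch below is not part of the repository and the toy sentence pair is invented purely for illustration, but it exercises the GizaSentenceAlignment class from pynlpl/formats/giza.py as defined: in an A3-style source line, each source word is followed by the 1-based target positions it aligns to, enclosed between "({" and "})", and the parser converts these to 0-based (source, target) index pairs.

from pynlpl.formats.giza import GizaSentenceAlignment

# Hand-written A3-style alignment for the toy pair "the cat" -> "de kat"
sourceline = "NULL ({ }) the ({ 1 }) cat ({ 2 })"
targetline = "de kat"

a = GizaSentenceAlignment(sourceline, targetline, 1)
print(a.source)               # ['the', 'cat']
print(a.target)               # ['de', 'kat']
print(a.alignment)            # [(0, 0), (1, 1)]  (0-based source/target index pairs)
print(a.getalignedtarget(1))  # ('kat', 1)

In normal use one would not construct these objects by hand but iterate over a GizaModel instance, which reads an A3.final file (optionally gzip- or bzip2-compressed) and yields one GizaSentenceAlignment per sentence pair.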