├── ChangeLog
├── LICENSE.txt
├── Makefile
├── README.rst
├── bin
│   ├── dl-conv.py
│   ├── dl-ner.py
│   ├── dl-pos.py
│   ├── dl-sentiwords.py
│   ├── dl-words-pca.py
│   ├── dl-words.py
│   ├── knn.py
│   ├── mwe.py
│   ├── senna-tag.py
│   ├── ssyevr.py
│   ├── toIOB.py
│   └── tweet-tokenize.py
├── deepnl
│   ├── ChangeLog
│   ├── HPCA.h
│   ├── HPCA_impl.cpp
│   ├── WordsTrainer.cpp
│   ├── WordsTrainer.h
│   ├── __init__.py
│   ├── classifier.cpp
│   ├── classifier.pyx
│   ├── corpus.py
│   ├── embeddings.py
│   ├── extractors.cpp
│   ├── extractors.pxd
│   ├── extractors.pyx
│   ├── hpca.cpp
│   ├── hpca.pyx
│   ├── math.cpp
│   ├── math.pxd
│   ├── math.pyx
│   ├── ner_tagger.py
│   ├── network.cpp
│   ├── network.pxd
│   ├── network.pyx
│   ├── networkconv.cpp
│   ├── networkconv.pxd
│   ├── networkconv.pyx
│   ├── networkseq.cpp
│   ├── networkseq.pxd
│   ├── networkseq.pyx
│   ├── pos_tagger.py
│   ├── reader.py
│   ├── sentiwords.cpp
│   ├── sentiwords.pyx
│   ├── tagger.cpp
│   ├── tagger.pxd
│   ├── tagger.pyx
│   ├── trainer.cpp
│   ├── trainer.pxd
│   ├── trainer.pyx
│   ├── trainerconv.cpp
│   ├── trainerconv.pyx
│   ├── utils.py
│   ├── word_dictionary.py
│   ├── words.cpp
│   ├── words.h
│   ├── words.pxd
│   └── words.pyx
├── docs
│   ├── index.rst
│   ├── intro.rst
│   ├── network.rst
│   ├── scripts.rst
│   └── utils.rst
└── setup.py

/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | CYTHON_FILES = $(wildcard deepnl/*.pyx)
3 | EXT_FILES = $(CYTHON_FILES:.pyx=.cpp)
4 | 
5 | all: #$(EXT_FILES)
6 | 	python setup.py build
7 | 
8 | %.cpp: %.pyx
9 | 	cython $< --cplus
10 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ************************************************************
2 | ``deepnl`` --- Deep Learning for Natural Language Processing
3 | ************************************************************
4 | 
5 | ``deepnl`` is a Python library for Natural Language Processing tasks based on
6 | a Deep Learning neural network architecture.
7 | 
8 | The library currently provides tools for performing part-of-speech tagging,
9 | Named Entity tagging and Semantic Role Labeling.
10 | 
11 | ``deepnl`` also provides code for creating *word embeddings* from text, using
12 | either the Language Model approach of [Collobert11]_ or Hellinger PCA,
13 | as in [Lebret14]_.
14 | 
15 | It can also create *sentiment-specific word embeddings* from a corpus of
16 | annotated Tweets.
17 | 
18 | If you use ``deepnl``, please cite [Attardi]_ in your publications.
19 | 
20 | **WARNING**. The file format for models changed in version 1.3.14.
21 | Models trained with earlier versions must be retrained for use with later versions.
22 | 
23 | Installation
24 | ============
25 | 
26 | Download the code or clone the repository on your machine with::
27 | 
28 |   $ git clone https://github.com/attardi/deepnl.git
29 | 
30 | Ensure that you have the dependencies listed below, then proceed with the build process.
31 | 
32 | Dependencies
33 | ------------
34 | 
35 | ``deepnl`` requires numpy_ and Eigen_.
36 | 
37 | A C++ compiler is also needed to compile the C++ extensions it uses,
38 | which are generated with Cython_.
39 | The generated ``.cpp`` files are already provided with ``deepnl``, but you
40 | will need Cython_ if you want to develop or modify the C++ extensions.
41 | 
42 | Build
43 | -----
44 | 
45 | To compile the library, run::
46 | 
47 |   $ python2 setup.py build
48 | 
49 | This will invoke the C++ compiler to compile the code on your platform.
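As an optional sanity check (this is only a sketch: the platform-specific name of
the build directory varies by system and Python version), you can verify that the
compiled extensions are importable directly from the build tree, which is the same
trick the ``bin`` scripts use:

.. code-block:: bash

    $ python2 -c "import sys, glob; sys.path.insert(0, glob.glob('build/lib.*')[0]); import deepnl.network; print('ok')"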
50 | 
51 | You can run the scripts directly from the ``bin`` directory, or you can
52 | install them by calling::
53 | 
54 |   $ sudo python setup.py install
55 | 
56 | If Cython gets invoked and raises an error, force an update of the file
57 | timestamps with::
58 | 
59 |   $ touch deepnl/*.cpp
60 | 
61 | Basic usage
62 | ===========
63 | 
64 | ``deepnl`` can be used either as a Python library or through command-line scripts.
65 | 
66 | Library usage
67 | -------------
68 | 
69 | You can use ``deepnl`` as a library in Python code as follows, where
70 | ``filename`` is the name of the file containing the model produced by training:
71 | 
72 | .. code-block:: python
73 | 
74 |     >>> from deepnl.tagger import Tagger
75 |     >>> tagger = Tagger.load(open(filename))
76 |     >>> sent = 'The quick brown fox jumped over the lazy dog .'
77 |     >>> tagger.tag_sequence(sent.split(), return_tokens=True)
78 |     [[(u'The', u'DT'), (u'quick', u'JJ'), (u'brown', u'JJ'), (u'fox', u'NN'), (u'jumped', u'VBD'), (u'over', u'IN'), (u'the', u'DT'), (u'lazy', u'JJ'), (u'dog', u'NN'), (u'.', '.')]]
79 | 
80 | Class ``Tagger`` is a generic interface for sequence taggers and provides a
81 | method ``tag_sequence`` for tagging a sentence.
82 | A sentence is represented as a list of tokens.
83 | 
84 | Class ``Tagger`` can be used directly for performing POS tagging.
85 | Two specializations are provided: ``NerTagger``, for Named Entity tagging, and
86 | ``SrlTagger``, for Semantic Role Labeling.
87 | 
88 | The output of ``tag_sequence`` is normally a list of tuples, representing
89 | tokens with their associated tags. In the case of POS tagging, the tags are
90 | just the POS tags of each token; in the case of ``NerTagger`` the tags use the
91 | ``IOB`` notation for representing subsequences, while in the case of
92 | ``SrlTagger`` the output is more complex.
93 | 
94 | 
95 | Standalone scripts
96 | ------------------
97 | 
98 | ``deepnl`` provides scripts for tagging text or training new models.
99 | 
100 | They are located in the ``bin`` subdirectory of the directory where you downloaded the code.
101 | If you did not install them, you can invoke them directly from there.
102 | 
103 | Call them with option ``-h`` or ``--help`` to obtain details on their usage.
104 | 
105 | The scripts expect tokenized input, one token per line, with an empty line to
106 | separate sentences.
107 | 
108 | When training, the token attributes are supplied in TSV (tab-separated values) format.
109 | Here is an example of POS tagging, using a previously trained model from file ``pos.dnn``:
110 | 
111 | .. code-block:: bash
112 | 
113 |     $ dl-pos.py pos.dnn
114 |     The
115 |     quick
116 |     brown
117 |     fox
118 |     jumped
119 |     over
120 |     the
121 |     lazy
122 |     dog
123 |     .
124 | 
125 |     The	DT
126 |     quick	JJ
127 |     brown	JJ
128 |     fox	NN
129 |     jumped	VBD
130 |     over	IN
131 |     the	DT
132 |     lazy	JJ
133 |     dog	NN
134 |     .	.
135 | 
136 | Word Embeddings
137 | ===============
138 | 
139 | The command ``dl-words.py`` creates word embeddings by training a language
140 | model on a plain text corpus, which must be properly tokenized.
141 | 
142 | The command ``dl-words-pca.py`` creates word embeddings from a plain
143 | text corpus, using the technique of Hellinger
144 | PCA.
145 | 
146 | The command ``dl-sentiwords.py`` creates *sentiment-specific word
147 | embeddings* from a corpus of annotated Tweets.
148 | 
149 | 
150 | Benchmarks
151 | ==========
152 | 
153 | The NER tagger replicates the performance of SENNA_ in the CoNLL 2003 benchmark.
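The benchmark setup below reuses the embeddings and vocabulary distributed with
SENNA. Alternatively, you can train your own embeddings with ``dl-words.py``; the
following is only an illustrative sketch, where ``corpus.txt`` stands for a
tokenized plain text corpus and ``vocab.txt`` for an existing vocabulary file with
one word per line (see ``dl-words.py --help`` for the full set of options):

.. code-block:: bash

    bin/dl-words.py --train corpus.txt \
        --vocab vocab.txt --vectors vectors.txt \
        -s 50 -w 5 -e 100 --threads 4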
154 | 
155 | The CoNLL-2003 shared task data can be downloaded from
156 | http://www.cnts.ua.ac.be/conll2003/ner/.
157 | 
158 | The train and test data must be cleaned and converted to the more recent IOB2
159 | notation, by calling:
160 | 
161 | .. code-block:: bash
162 | 
163 |     sed '/-DOCSTART-/,+1d' train | bin/toIOB.py | cut -f 1,2,4 > train.iob
164 |     sed '/-DOCSTART-/,+1d' testa | bin/toIOB.py | cut -f 1,2,4 > testa.iob
165 |     sed '/-DOCSTART-/,+1d' testb | bin/toIOB.py | cut -f 1,2,4 > testb.iob
166 |     cat train.iob testa.iob > train+dev.iob
167 | 
168 | Assuming that the SENNA distribution is in directory ``senna``, the embeddings
169 | and vocabulary from SENNA can be used:
170 | 
171 | .. code-block:: bash
172 | 
173 |     cp -p senna/embeddings/embeddings.txt vectors.txt
174 |     cp -p senna/hash/words.lst vocab.txt
175 | 
176 | The gazetteers from SENNA can be used to produce a single entity list as follows:
177 | 
178 | .. code-block:: bash
179 | 
180 |     iconv -f ISO-8859-1 -t UTF-8 < senna/hash/ner.loc.lst | awk '{printf "LOC\t%s\n", $0}' > eng.list
181 |     iconv -f ISO-8859-1 -t UTF-8 < senna/hash/ner.misc.lst | awk '{printf "MISC\t%s\n", $0}' >> eng.list
182 |     iconv -f ISO-8859-1 -t UTF-8 < senna/hash/ner.org.lst | awk '{printf "ORG\t%s\n", $0}' >> eng.list
183 |     iconv -f ISO-8859-1 -t UTF-8 < senna/hash/ner.per.lst | awk '{printf "PER\t%s\n", $0}' >> eng.list
184 | 
185 | You also need the list of suffixes:
186 | 
187 | .. code-block:: bash
188 | 
189 |     cp -p senna/hash/suffix.lst suffix.lst
190 | 
191 | The tagger can then be trained as follows:
192 | 
193 | .. code-block:: bash
194 | 
195 |     bin/dl-ner.py ner.dnn -t train+dev.iob \
196 |         --vocab vocab.txt --vectors vectors.txt \
197 |         --caps --suffix --suffixes suffix.lst --gazetteer eng.list \
198 |         -e 40 --variant senna \
199 |         -l 0.01 -w 5 -n 300 -v
200 | 
201 | The benchmark can be run as:
202 | 
203 | .. code-block:: bash
204 | 
205 |     bin/dl-ner.py ner.dnn < testb.iob > testb.out.iob
206 | 
207 | The results I achieved are::
208 | 
209 |   processed 46435 tokens with 5648 phrases; found: 5640 phrases; correct: 5031.
210 |   accuracy: 97.62%; precision: 89.20%; recall: 89.08%; FB1: 89.14
211 |   LOC: precision: 93.30%; recall: 91.01%; FB1: 92.14
212 |   MISC: precision: 78.24%; recall: 77.35%; FB1: 77.79
213 |   ORG: precision: 84.59%; recall: 87.24%; FB1: 85.89
214 |   PER: precision: 94.71%; recall: 94.06%; FB1: 94.38
215 | 
216 | Writing Extensions
217 | ==================
218 | 
219 | You can modify the code or extend it by adding new modules to the directory ``deepnl``.
220 | To compile the extensions, use the same build process described above; you will also need Cython_ installed.
221 | The compiler will issue warnings about NumPy of this kind::
222 | 
223 |   /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:15:2: warning: #warning "Using deprecated NumPy API, disable it by " "#defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
224 |   #warning "Using deprecated NumPy API, disable it by "
225 | 
226 | You can simply disregard them: they cannot be avoided until Cython_ is updated to use the newer NumPy API.
227 | 
228 | Credits
229 | =======
230 | 
231 | Erick Fonseca developed ``nlpnet``, a similar library available at
232 | https://github.com/erickrf/nlpnet, which provided inspiration for ``deepnl``.
233 | 
234 | References
235 | ==========
236 | 
237 | .. [Attardi] Giuseppe Attardi. 2015. DeepNL: a Deep Learning NLP
238 | pipeline.
Workshop on Vector Space Modeling for NLP, NAACL 2015, 239 | Denver, Colorado (June 5, 2015). 240 | 241 | .. [Collobert11] Ronan Collobert, J. Weston, L. Bottou, M. Karlen, K. Kavukcuoglu and P. Kuksa. 242 | Natural Language Processing (Almost) from Scratch. *Journal of Machine 243 | Learning Research*, 12:2493-2537, 2011. 244 | 245 | .. [Lebret14] Rémi Lebret and Ronan Collobert. 2014. Word Embeddings through Hellinger PCA. *EACL 2014*: 482. 246 | 247 | .. _numpy: http://www.numpy.org 248 | .. _Eigen: http://eigen.tuxfamily.org/ 249 | .. _Cython: http://cython.org 250 | .. _SENNA: http://ronan.collobert.com/senna/ 251 | -------------------------------------------------------------------------------- /bin/dl-conv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Train and use a convolutional neural network classifier. 6 | 7 | Author: Giuseppe Attardi 8 | """ 9 | 10 | from __future__ import print_function 11 | import logging 12 | import numpy as np 13 | import argparse 14 | from ConfigParser import ConfigParser 15 | 16 | # allow executing from anywhere without installing the package 17 | import sys 18 | import os 19 | import distutils.util 20 | builddir = os.path.dirname(os.path.realpath(__file__)) + '/../build/lib.' 21 | libdir = builddir + distutils.util.get_platform() + '-' + '.'.join(map(str, sys.version_info[:2])) 22 | #sys.path.append(libdir) 23 | sys.path.insert(0,libdir) 24 | 25 | # local 26 | from deepnl.corpus import * 27 | from deepnl.extractors import * 28 | from deepnl.networkconv import ConvolutionalNetwork 29 | from deepnl.trainerconv import ConvTrainer 30 | from deepnl.reader import ClassifyReader 31 | from deepnl.classifier import Classifier 32 | 33 | # ---------------------------------------------------------------------- 34 | # Auxiliary functions 35 | 36 | def create_trainer(args, converter, labels): 37 | """ 38 | Creates or loads a neural network according to the specified args. 39 | :param labels: list of labels. 40 | """ 41 | 42 | logger = logging.getLogger("Logger") 43 | 44 | if args.load: 45 | logger.info("Loading provided network...") 46 | trainer = ConvTrainer.load(args.load) 47 | # change learning rate 48 | trainer.learning_rate = args.learning_rate 49 | trainer.threads = args.threads 50 | else: 51 | logger.info('Creating new network...') 52 | # sum the number of features in all extractors' tables 53 | feat_size = converter.size() 54 | pool_size = args.window * 2 + 1 55 | nn = ConvolutionalNetwork(feat_size * pool_size, args.hidden, 56 | args.hidden2, len(labels), pool_size) 57 | options = { 58 | 'learning_rate': args.learning_rate, 59 | 'eps': args.eps, 60 | 'verbose': args.verbose, 61 | 'left_context': args.window, 62 | 'right_context': args.window 63 | } 64 | trainer = ConvTrainer(nn, converter, labels, options) 65 | 66 | trainer.saver = saver(args.model, args.vectors, args.variant) 67 | 68 | logger.info("... 
with the following parameters:") 69 | logger.info(trainer.nn.description()) 70 | 71 | return trainer 72 | 73 | def saver(model_file, vectors_file, variant): 74 | """Function for saving model periodically""" 75 | def save(trainer): 76 | # save embeddings also separately 77 | if vectors_file: 78 | trainer.save_vectors(vectors_file, variant) 79 | if model_file: 80 | with open(model_file, 'wb') as file: 81 | trainer.classifier.save(file) 82 | return save 83 | 84 | # ---------------------------------------------------------------------- 85 | 86 | def main(): 87 | 88 | # set the seed for replicability 89 | np.random.seed(42) # DEBUG 90 | 91 | defaults = {} 92 | 93 | parser = argparse.ArgumentParser(description="Convolutional network classifier.") 94 | 95 | parser.add_argument('-c', '--config', dest='config_file', 96 | help='Specify config file', metavar='FILE') 97 | 98 | # args, remaining_argv = parser.parse_known_args() 99 | 100 | # if args.config_file: 101 | # config = ConfigParser.SafeConfigParser() 102 | # config.read([args.config_file]) 103 | # defaults = dict(config.items('Defaults')) 104 | 105 | # parser.set_defaults(**defaults) 106 | 107 | parser.add_argument('model', type=str, 108 | help='Model file to train/use.') 109 | 110 | # input format 111 | format = parser.add_argument_group('Format') 112 | 113 | format.add_argument('--label-field', type=int, default=1, 114 | help='Field containing label (default %(default)s).') 115 | format.add_argument('--text-field', type=int, default=2, 116 | help='Field containing text (default %(default)s).') 117 | 118 | # training options 119 | train = parser.add_argument_group('Train') 120 | 121 | train.add_argument('-t', '--train', type=str, default=None, 122 | help='File with annotated data for training.') 123 | 124 | train.add_argument('-w', '--window', type=int, default=5, 125 | help='Size of the word window (default %(default)s)') 126 | train.add_argument('-s', '--embeddings-size', type=int, default=50, 127 | help='Number of features per word (default %(default)s)', 128 | dest='embeddings_size') 129 | train.add_argument('-e', '--epochs', type=int, default=100, 130 | help='Number of training epochs (default %(default)s)', 131 | dest='iterations') 132 | train.add_argument('-l', '--learning_rate', type=float, default=0.001, 133 | help='Learning rate for network weights (default %(default)s)', 134 | dest='learning_rate') 135 | train.add_argument('--eps', type=float, default=1e-6, 136 | help='Epsilon value for AdaGrad (default %(default)s)') 137 | train.add_argument('-n', '--hidden', type=int, default=200, 138 | help='Number of hidden neurons (default %(default)s)') 139 | train.add_argument('-n2', '--hidden2', type=int, default=200, 140 | help='Number of hidden neurons (default %(default)s)') 141 | 142 | # Extractors: 143 | extractors = parser.add_argument_group('Extractors') 144 | extractors.add_argument('--caps', const=5, nargs='?', type=int, default=None, 145 | help='Include capitalization features. Optionally, supply the number of features (default %(default)s)') 146 | extractors.add_argument('--suffix', const=5, nargs='?', type=int, default=None, 147 | help='Include suffix features. Optionally, supply the number of features (default %(default)s)') 148 | extractors.add_argument('--suffixes', type=str, default='', 149 | help='Load suffixes from this file') 150 | extractors.add_argument('--prefix', const=0, nargs='?', type=int, default=None, 151 | help='Include prefix features. 
Optionally, '\ 152 | 'supply the number of features (default %(default)s)') 153 | extractors.add_argument('--prefixes', type=str, default='', 154 | help='Load prefixes from this file') 155 | # Embeddings 156 | embeddings = parser.add_argument_group('Embeddings') 157 | embeddings.add_argument('--vocab', type=str, default=None, 158 | help='Vocabulary file, either read or created') 159 | embeddings.add_argument('--vectors', type=str, default=None, 160 | help='Embeddings file, either read or created') 161 | embeddings.add_argument('--min-occurr', type=int, default=3, 162 | help='Minimum occurrences for inclusion in vocabulary', 163 | dest='minOccurr') 164 | embeddings.add_argument('--load', type=str, default=None, 165 | help='Load previously saved model') 166 | embeddings.add_argument('--variant', type=str, default=None, 167 | help='Either "senna" (default), "polyglot" or "word2vec".') 168 | 169 | # common 170 | parser.add_argument('--threads', type=int, default=1, 171 | help='Number of threads (default %(default)s)') 172 | parser.add_argument('-v', '--verbose', help='Verbose mode', 173 | action='store_true') 174 | 175 | # Use this for obtaining defaults from config file: 176 | #args = arguments.get_args() 177 | args = parser.parse_args() 178 | 179 | log_format = '%(message)s' 180 | log_level = logging.DEBUG if args.verbose else logging.INFO 181 | logging.basicConfig(format=log_format, level=log_level) 182 | logger = logging.getLogger("Logger") 183 | 184 | config = ConfigParser() 185 | if args.config_file: 186 | config.read(args.config_file) 187 | 188 | # merge args with config 189 | 190 | if args.train: 191 | reader = ClassifyReader(text_field=args.text_field, label_field=args.label_field) 192 | # a generator (can be iterated several times) 193 | sentences = reader.read(args.train) 194 | 195 | if args.vocab and os.path.exists(args.vocab): 196 | if args.vectors and os.path.exists(args.vectors): 197 | # use supplied embeddings 198 | embeddings = Embeddings(vectors=args.vectors, vocab_file=args.vocab, 199 | variant=args.variant) 200 | else: 201 | # create random embeddings 202 | embeddings = Embeddings(args.embeddings_size, vocab_file=args.vocab, 203 | variant=args.variant) 204 | # collect words from the corpus 205 | # build vocabulary 206 | vocab, bigrams, trigrams = reader.create_vocabulary(sentences, 207 | #size=args.vocab_size, 208 | min_occurrences=args.minOccurr) 209 | # add them to the given vocabulary 210 | embeddings.merge(vocab) 211 | logger.info("Overriding vocabulary in %s" % args.vocab) 212 | embeddings.save_vocabulary(args.vocab) 213 | 214 | elif args.variant == 'word2vec': 215 | if os.path.exists(args.vectors): 216 | embeddings = Embeddings(vectors=args.vectors, 217 | variant=args.variant) 218 | vocab, bigrams, trigrams = reader.create_vocabulary(sentences, 219 | #args.vocab_size, 220 | min_occurrences=args.minOccurr) 221 | embeddings.merge(vocab) 222 | else: 223 | vocab, bigrams, trigrams = reader.create_vocabulary(sentences, 224 | #args.vocab_size, 225 | min_occurrences=args.minOccurr) 226 | embeddings = Embeddings(vocab=vocab, 227 | variant=args.variant) 228 | if args.vocab: 229 | logger.info("Saving vocabulary in %s" % args.vocab) 230 | embeddings.save_vocabulary(args.vocab) 231 | 232 | elif not args.vocab_size: 233 | logger.error("Missing parameter --vocab-size") 234 | return 235 | else: 236 | # build vocabulary and tag set 237 | vocab, bigrams, trigrams = reader.create_vocabulary(sentences, 238 | #args.vocab_size, 239 | min_occurrences=args.minOccurr) 240 | 
logger.info("Creating word embeddings") 241 | embeddings = Embeddings(args.embeddings_size, vocab=vocab, 242 | variant=args.variant) 243 | if args.vocab: 244 | logger.info("Saving vocabulary in %s" % args.vocab) 245 | embeddings.save_vocabulary(args.vocab) 246 | 247 | converter = Converter() 248 | converter.add(embeddings) 249 | 250 | if args.caps: 251 | logger.info("Creating capitalization features...") 252 | converter.add(CapsExtractor(args.caps)) 253 | 254 | if ((args.suffixes and not os.path.exists(args.suffixes)) or 255 | (args.prefixes and not os.path.exists(args.prefixes))): 256 | # collect the forms once 257 | words = (tok for sent in sentences for tok in sent) 258 | 259 | if args.suffix: 260 | if os.path.exists(args.suffixes): 261 | logger.info("Loading suffix list...") 262 | extractor = SuffixExtractor(args.suffix, args.suffixes) 263 | converter.add(extractor) 264 | else: 265 | logger.info("Creating suffix list...") 266 | extractor = SuffixExtractor(args.suffix, None, words) 267 | converter.add(extractor) 268 | if args.suffixes: 269 | logger.info("Saving suffix list to: %s", args.suffixes) 270 | extractor.write(args.suffixes) 271 | 272 | if args.prefix: 273 | if os.path.exists(args.prefixes): 274 | logger.info("Loading prefix list...") 275 | extractor = PrefixExtractor(args.prefix, args.prefixes) 276 | converter.add(extractor) 277 | else: 278 | logger.info("Creating prefix list...") 279 | extractor = PrefixExtractor(args.prefix, None, words) 280 | converter.add(extractor) 281 | if args.prefixes: 282 | logger.info("Saving prefix list to: %s", args.prefixes) 283 | extractor.write(args.prefixes) 284 | 285 | # labels from all examples 286 | examples = [converter.convert(example) for example in sentences] 287 | # assign index to labels 288 | sent_labels = reader.polarities 289 | labels_index = {} 290 | labels = [] 291 | for i,c in enumerate(set(sent_labels)): 292 | labels_index[c] = i 293 | labels.append(c) 294 | trainer = create_trainer(args, converter, labels) 295 | logger.info("Starting training with %d examples" % len(examples)) 296 | 297 | report_frequency = max(args.iterations / 200, 1) 298 | report_frequency = 1 # DEBUG 299 | labels_ids = [labels_index[label] for label in sent_labels] 300 | trainer.train(examples, labels_ids, args.iterations, report_frequency, 301 | args.threads) 302 | 303 | logger.info("Saving trained model ...") 304 | trainer.saver(trainer) 305 | logger.info("... to %s" % args.model) 306 | 307 | else: 308 | # predict 309 | with open(args.model) as file: 310 | classifier = Classifier.load(file) 311 | reader = ClassifyReader(text_field=args.text_field, label_field=args.label_field) 312 | 313 | for example in reader: 314 | words = example[reader.text_field].split() 315 | example[reader.label_field] = classifier.predict(words) 316 | print('\t'.join(example).encode('utf-8')) 317 | 318 | # ---------------------------------------------------------------------- 319 | 320 | if __name__ == '__main__': 321 | main() 322 | -------------------------------------------------------------------------------- /bin/dl-sentiwords.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Learn sentiment-specific word embeddings from tweets. 
6 | 7 | Author: Giuseppe Attardi 8 | """ 9 | 10 | import logging 11 | import numpy as np 12 | import argparse 13 | from ConfigParser import ConfigParser 14 | from itertools import chain 15 | 16 | # allow executing from anywhere without installing the package 17 | import sys 18 | import os 19 | import distutils.util 20 | builddir = os.path.dirname(os.path.realpath(__file__)) + '/../build/lib.' 21 | libdir = builddir + distutils.util.get_platform() + '-' + '.'.join(map(str, sys.version_info[:2])) 22 | #sys.path.append(libdir) 23 | sys.path.insert(0, libdir) 24 | 25 | # local 26 | from deepnl import * 27 | from deepnl.extractors import * 28 | from deepnl.reader import TweetReader 29 | from deepnl.network import Network 30 | from deepnl.sentiwords import SentimentTrainer 31 | 32 | # ---------------------------------------------------------------------- 33 | # Auxiliary functions 34 | 35 | def create_trainer(args, converter): 36 | """ 37 | Creates or loads a neural network according to the specified args. 38 | """ 39 | 40 | logger = logging.getLogger("Logger") 41 | 42 | if args.load: 43 | logger.info("Loading provided network...") 44 | trainer = SentimentTrainer.load(args.load) 45 | # change learning rate 46 | trainer.learning_rate = args.learning_rate 47 | else: 48 | logger.info('Creating new network...') 49 | # sum the number of features in all extractors' tables 50 | input_size = converter.size() * (args.window * 2 + 1) 51 | nn = Network(input_size, args.hidden, 2) 52 | options = { 53 | 'learning_rate': args.learning_rate, 54 | 'eps': args.eps, 55 | 'ro': args.ro, 56 | 'verbose': args.verbose, 57 | 'left_context': args.window, 58 | 'right_context': args.window, 59 | 'ngram_size': args.ngrams, 60 | 'alpha': args.alpha 61 | } 62 | trainer = SentimentTrainer(nn, converter, options) 63 | 64 | trainer.saver = saver(args.model, args.vectors) 65 | 66 | logger.info("... 
with the following parameters:") 67 | logger.info(trainer.nn.description()) 68 | 69 | return trainer 70 | 71 | def saver(model_file, vectors_file): 72 | """Function for saving model periodically""" 73 | def save(trainer): 74 | # save embeddings also separately 75 | if vectors_file: 76 | trainer.save_vectors(vectors_file) 77 | if model_file: 78 | trainer.save(model_file) 79 | return save 80 | 81 | # ---------------------------------------------------------------------- 82 | 83 | if __name__ == '__main__': 84 | 85 | # set the seed for replicability 86 | np.random.seed(42) 87 | 88 | defaults = {} 89 | 90 | parser = argparse.ArgumentParser(description="Learn word embeddings.") 91 | 92 | parser.add_argument('-c', '--config', dest='config_file', 93 | help='Specify config file', metavar='FILE') 94 | 95 | # args, remaining_argv = parser.parse_known_args() 96 | 97 | # if args.config_file: 98 | # config = ConfigParser.SafeConfigParser() 99 | # config.read([args.config_file]) 100 | # defaults = dict(config.items('Defaults')) 101 | 102 | # parser.set_defaults(**defaults) 103 | 104 | parser.add_argument('-w', '--window', type=int, default=5, 105 | help='Size of the word window (default %(default)s)', 106 | dest='window') 107 | parser.add_argument('-s', '--embeddings-size', type=int, default=50, 108 | help='Number of features per word (default %(default)s)', 109 | dest='embeddings_size') 110 | parser.add_argument('-e', '--epochs', type=int, default=100, 111 | help='Number of training epochs (default %(default)s)', 112 | dest='iterations') 113 | parser.add_argument('-l', '--learning-rate', type=float, default=0.001, 114 | help='Learning rate for network weights (default %(default)s)', 115 | dest='learning_rate') 116 | parser.add_argument('--eps', type=float, default=1e-8, 117 | help='Epsilon value for AdaGrad (default %(default)s)') 118 | parser.add_argument('--ro', type=float, default=0.95, 119 | help='Ro value for AdaDelta (default %(default)s)') 120 | parser.add_argument('-n', '--hidden', type=int, default=200, 121 | help='Number of hidden neurons (default %(default)s)') 122 | parser.add_argument('--ngrams', type=int, default=2, 123 | help='Length of ngrams (default %(default)s)') 124 | parser.add_argument('--textField', type=int, default=3, 125 | help='field containing text (default %(default)s)') 126 | parser.add_argument('--tagField', type=int, default=2, 127 | help='field containing polarity (default %(default)s)') 128 | parser.add_argument('--alpha', type=float, default=0.5, 129 | help='Relative weight of normal wrt sentiment score (default %(default)s)') 130 | parser.add_argument('train', type=str, 131 | help='File with text corpus for training.') 132 | parser.add_argument('--model', type=str, default=None, 133 | help='File where to save the model') 134 | parser.add_argument('--vocab', type=str, required=True, 135 | help='Vocabulary file, either read and updated or created') 136 | parser.add_argument('--min-occurr', type=int, default=3, 137 | help='Minimum occurrences for inclusion in vocabulary (default %(default)s', 138 | dest='minOccurr') 139 | parser.add_argument('--vocab-size', type=int, default=0, 140 | help='Maximum size of vocabulary from corpus (default %(default)s)') 141 | parser.add_argument('--vectors', type=str, required=True, 142 | help='Embeddings file, either read and updated or created') 143 | parser.add_argument('--load', type=str, default=None, 144 | help='Load previously saved model') 145 | parser.add_argument('--threads', type=int, default=1, 146 | help='Number of threads 
(default %(default)s)') 147 | parser.add_argument('--variant', type=str, default=None, 148 | help='Either "senna" (default), "polyglot" or "word2vec".') 149 | parser.add_argument('-v', '--verbose', help='Verbose mode', 150 | action='store_true') 151 | 152 | args = parser.parse_args() 153 | 154 | log_format = '%(message)s' 155 | log_level = logging.DEBUG if args.verbose else logging.INFO 156 | logging.basicConfig(format=log_format, level=log_level) 157 | logger = logging.getLogger("Logger") 158 | 159 | config = ConfigParser() 160 | if args.config_file: 161 | config.read(args.config_file) 162 | 163 | # merge args with config 164 | 165 | reader = TweetReader(text_field=args.textField, label_field=args.tagField, ngrams=args.ngrams) 166 | reader.read(args.train) 167 | vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences, 168 | args.vocab_size, 169 | min_occurrences=args.minOccurr) 170 | if args.variant == 'word2vec' and os.path.exists(args.vectors): 171 | embeddings = Embeddings(vectors=args.vectors, variant=args.variant) 172 | embeddings.merge(vocab) 173 | logger.info("Saving vocabulary in %s" % args.vocab) 174 | embeddings.save_vocabulary(args.vocab) 175 | elif os.path.exists(args.vocab): 176 | # start with the given vocabulary 177 | base_vocab = reader.load_vocabulary(args.vocab) 178 | if os.path.exists(args.vectors): 179 | # load embeddings 180 | embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab, 181 | variant=args.variant) 182 | else: 183 | # create embeddings 184 | embeddings = Embeddings(args.embeddings_size, vocab=base_vocab, 185 | variant=args.variant) 186 | # add the ngrams from the corpus 187 | embeddings.merge(vocab) 188 | logger.info("Overriding vocabulary in %s" % args.vocab) 189 | embeddings.save_vocabulary(args.vocab) 190 | else: 191 | embeddings = Embeddings(args.embeddings_size, vocab=vocab, 192 | variant=args.variant) 193 | logger.info("Saving vocabulary in %s" % args.vocab) 194 | embeddings.save_vocabulary(args.vocab) 195 | 196 | # Assume bigrams are prefix of trigrams, or else we should put a terminator 197 | # on trie 198 | trie = {} 199 | for b in chain(bigrams, trigrams): 200 | tmp = trie 201 | for w in b: 202 | tmp = tmp.setdefault(embeddings.dict[w], {}) 203 | 204 | converter = Converter() 205 | converter.add(embeddings) 206 | 207 | trainer = create_trainer(args, converter) 208 | 209 | report_intervals = max(args.iterations / 200, 1) 210 | report_intervals = 10000 # DEBUG 211 | 212 | logger.info("Starting training") 213 | 214 | # a generator expression (can be iterated several times) 215 | # It caches converted sentences, avoiding repeated conversions 216 | converted_sentences = converter.generator(reader.sentences, cache=True) 217 | trainer.train(converted_sentences, reader.polarities, trie, 218 | args.iterations, report_intervals) 219 | 220 | logger.info("Overriding vectors to %s" % args.vectors) 221 | embeddings.save_vectors(args.vectors, args.variant) 222 | if args.model: 223 | logger.info("Saving trained model to %s" % args.model) 224 | trainer.save(args.model) 225 | -------------------------------------------------------------------------------- /bin/dl-words-pca.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Learn word embeddings from plain text using Hellinger PCA. 6 | 7 | See 8 | Lebret, Rémi, and Ronan Collobert. "Word Embeddings through Hellinger PCA." EACL 2014 (2014): 482. 
9 | 10 | Author: Giuseppe Attardi 11 | """ 12 | 13 | import logging 14 | import numpy as np 15 | import argparse 16 | from ConfigParser import ConfigParser 17 | 18 | # profiling 19 | # import yappi 20 | # cProfile 21 | # import pstats, cProfile 22 | # import pyximport 23 | # pyximport.install() 24 | 25 | # allow executing from anywhere without installing the package 26 | import sys 27 | import os 28 | import distutils.util 29 | builddir = os.path.dirname(os.path.realpath(__file__)) + '/../build/lib.' 30 | libdir = builddir + distutils.util.get_platform() + '-' + '.'.join(map(str, sys.version_info[:2])) 31 | sys.path.append(libdir) 32 | 33 | # local 34 | from deepnl.embeddings import Plain 35 | import deepnl.hpca as hpca 36 | 37 | # ---------------------------------------------------------------------- 38 | 39 | def main(): 40 | 41 | defaults = {} 42 | 43 | parser = argparse.ArgumentParser(description="Learn word embeddings.") 44 | 45 | parser.add_argument('-c', '--config', dest='config_file', 46 | help='Specify config file', metavar='FILE') 47 | 48 | # args, remaining_argv = parser.parse_known_args() 49 | 50 | # if args.config_file: 51 | # config = ConfigParser.SafeConfigParser() 52 | # config.read([args.config_file]) 53 | # defaults = dict(config.items('Defaults')) 54 | 55 | # parser.set_defaults(**defaults) 56 | 57 | parser.add_argument('-w', '--window', type=int, default=5, 58 | help='Size of the word window (default %(default)s)') 59 | parser.add_argument('-s', '--embeddings-size', type=int, default=50, 60 | help='Number of features per word (default %(default)s)', 61 | dest='embeddings_size') 62 | parser.add_argument('--ngrams', type=int, default=1, 63 | help='Size of ngrams (default %(default)s)') 64 | parser.add_argument('--train', type=str, required=True, 65 | help='File with text corpus for training.') 66 | parser.add_argument('-o', '--output', type=str, 67 | help='File where to save the model, for further training') 68 | parser.add_argument('--vocab', type=str, required=True, 69 | help='Vocabulary file') 70 | parser.add_argument('--context-words', type=int, default=10000, 71 | help='Number of context words (the first N from vocabulary)') 72 | parser.add_argument('--context-size', type=int, default=1, 73 | help='Number of context words') 74 | parser.add_argument('--vectors', type=str, required=True, 75 | help='Embeddings file, either read and updated or created') 76 | parser.add_argument('--threads', type=int, default=1, 77 | help='Number of threads (default %(default)s)') 78 | parser.add_argument('--variant', type=str, default=None, 79 | help='Either "senna" (default), "polyglot" or "word2vec".') 80 | parser.add_argument('--covariance', action='store_true', 81 | help='Use PCA algorithm on covariance matrix.') 82 | parser.add_argument('-v', '--verbose', action='store_true', 83 | help='Verbose mode') 84 | 85 | args = parser.parse_args() 86 | 87 | log_format = '%(message)s' 88 | log_level = logging.DEBUG if args.verbose else logging.INFO 89 | logging.basicConfig(format=log_format, level=log_level) 90 | logger = logging.getLogger("Logger") 91 | 92 | config = ConfigParser() 93 | if args.config_file: 94 | config.read(args.config_file) 95 | 96 | # merge args with config 97 | 98 | if not os.path.exists(args.vocab): 99 | logger.error("Missing vocabulary: " + args.vocab) 100 | return 101 | 102 | logger.info("Building co-occurrence matrix") 103 | rootmat = hpca.cooccurrences(args.train, args.vocab, args.context_words, 104 | args.context_size) 105 | logger.info("Perform PCA") 106 | vectors = 
hpca.fit(rootmat, args.embeddings_size, args.covariance) 107 | 108 | logger.info("Saving vectors ...") 109 | Plain.write_vectors(args.vectors, vectors) 110 | logger.info("... to %s" % args.vectors) 111 | 112 | if args.output: 113 | logger.info("Saving trained model ...") 114 | trainer.save(args.output) 115 | logger.info("... to %s" % args.output) 116 | 117 | # ---------------------------------------------------------------------- 118 | 119 | profile = None #'yappi' 120 | 121 | if __name__ == '__main__': 122 | if profile == 'yappi': 123 | yappi.start() 124 | main() 125 | yappi.get_func_stats().print_all() 126 | elif profile == 'cprofile': 127 | cProfile.runctx("main()", globals(), locals(), "Profile.prof") 128 | s = pstats.Stats("Profile.prof") 129 | s.strip_dirs().sort_stats("time").print_stats() 130 | else: 131 | main() 132 | -------------------------------------------------------------------------------- /bin/dl-words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Learn word embeddings from plain text. 6 | 7 | Author: Giuseppe Attardi 8 | """ 9 | 10 | import logging 11 | import numpy as np 12 | import argparse 13 | from ConfigParser import ConfigParser 14 | 15 | # profiling 16 | # import yappi 17 | 18 | # allow executing from anywhere without installing the package 19 | import sys 20 | import os 21 | import distutils.util 22 | builddir = os.path.dirname(os.path.realpath(__file__)) + '/../build/lib.' 23 | libdir = builddir + distutils.util.get_platform() + '-' + '.'.join(map(str, sys.version_info[:2])) 24 | sys.path.append(libdir) 25 | 26 | # local 27 | from deepnl.extractors import * 28 | from deepnl.reader import TextReader 29 | from deepnl.network import Network 30 | from deepnl.words import LmTrainer 31 | 32 | # ---------------------------------------------------------------------- 33 | # Auxiliary functions 34 | 35 | 36 | def create_trainer(args, converter): 37 | """ 38 | Creates or loads a neural network according to the specified args. 39 | """ 40 | 41 | logger = logging.getLogger("Logger") 42 | 43 | if args.load: 44 | logger.info("Loading provided network...") 45 | trainer = LmTrainer.load(args.load) 46 | trainer.learning_rate = args.learning_rate 47 | else: 48 | logger.info('Creating new network...') 49 | # sum the number of features in all extractors' tables 50 | input_size = converter.size() * (args.windows * 2 + 1) 51 | nn = LmNetwork(input_size, args.hidden, 1) 52 | options = { 53 | 'learning_rate': args.learning_rate, 54 | 'eps': args.eps, 55 | 'ro': args.ro, 56 | 'verbose': args.verbose, 57 | 'left_context': args.window, 58 | 'right_context': args.window, 59 | 'ngram_size': args.ngrams 60 | } 61 | trainer = LmTrainer(nn, converter, options) 62 | 63 | trainer.saver = saver(args.output, args.vectors) 64 | 65 | logger.info("... 
with the following parameters:") 66 | logger.info(trainer.nn.description()) 67 | 68 | return trainer 69 | 70 | 71 | def saver(model_file, vectors_file): 72 | """Function for saving model periodically""" 73 | def save(trainer): 74 | # save embeddings also separately 75 | if vectors_file: 76 | trainer.save_vectors(vectors_file) 77 | trainer.save(model_file) 78 | return save 79 | 80 | # ---------------------------------------------------------------------- 81 | 82 | 83 | def main(): 84 | 85 | # set the seed for replicability 86 | np.random.seed(42) 87 | 88 | defaults = {} 89 | 90 | parser = argparse.ArgumentParser(description="Learn word embeddings.") 91 | 92 | parser.add_argument('-c', '--config', dest='config_file', 93 | help='Specify config file', metavar='FILE') 94 | 95 | # args, remaining_argv = parser.parse_known_args() 96 | 97 | # if args.config_file: 98 | # config = ConfigParser.SafeConfigParser() 99 | # config.read([args.config_file]) 100 | # defaults = dict(config.items('Defaults')) 101 | 102 | # parser.set_defaults(**defaults) 103 | 104 | parser.add_argument('-w', '--window', type=int, default=5, 105 | help='Size of the word window (default %(default)s)', 106 | dest='window') 107 | parser.add_argument('-s', '--embeddings-size', type=int, default=50, 108 | help='Number of features per word (default %(default)s)', 109 | dest='embeddings_size') 110 | parser.add_argument('-e', '--epochs', type=int, default=100, 111 | help='Number of training epochs (default %(default)s)', 112 | dest='iterations') 113 | parser.add_argument('-l', '--learning-rate', type=float, default=0.001, 114 | help='Learning rate for network weights (default %(default)s)', 115 | dest='learning_rate') 116 | parser.add_argument('--eps', type=float, default=1e-8, 117 | help='Epsilon value for AdaGrad (default %(default)s)') 118 | parser.add_argument('--ro', type=float, default=0.95, 119 | help='Ro value for AdaDelta (default %(default)s)') 120 | parser.add_argument('-n', '--hidden', type=int, default=200, 121 | help='Number of hidden neurons (default %(default)s)') 122 | parser.add_argument('--ngrams', type=int, default=1, 123 | help='Size of ngrams (default %(default)s)') 124 | parser.add_argument('--train', type=str, default=None, 125 | help='File with text corpus for training.', required=True) 126 | parser.add_argument('-o', '--output', type=str, 127 | help='File where to save model, for further training') 128 | parser.add_argument('--vocab', type=str, required=True, 129 | help='Vocabulary file') 130 | parser.add_argument('--vectors', required=True, 131 | help='Embeddings file, either read and updated or created') 132 | parser.add_argument('--load', type=str, 133 | help='Load previously saved model') 134 | parser.add_argument('--threads', type=int, default=1, 135 | help='Number of threads (default %(default)s)') 136 | parser.add_argument('--words', type=int, default=0, 137 | help='Number of words in corpus') 138 | parser.add_argument('--variant', type=str, default=None, 139 | help='Either "senna" (default), "polyglot" or "word2vec".') 140 | parser.add_argument('-v', '--verbose', help='Verbose mode', 141 | action='store_true') 142 | 143 | args = parser.parse_args() 144 | 145 | log_format = '%(message)s' 146 | log_level = logging.DEBUG if args.verbose else logging.INFO 147 | logging.basicConfig(format=log_format, level=log_level) 148 | logger = logging.getLogger("Logger") 149 | 150 | config = ConfigParser() 151 | if args.config_file: 152 | config.read(args.config_file) 153 | 154 | # merge args with config 155 | 156 | 
if not os.path.exists(args.vocab): 157 | logger.error("Missing vocabulary: " + args.vocab) 158 | return 159 | 160 | embeddings = Embeddings(args.embeddings_size, args.vocab, args.vectors, 161 | variant=args.variant) 162 | 163 | logger.info("Read data") 164 | converter = Converter() 165 | converter.add(embeddings) 166 | 167 | trainer = create_trainer(args, converter) 168 | 169 | report_intervals = max(args.iterations / 200, 1) 170 | report_intervals = 10000 # DEBUG 171 | 172 | logger.info("Starting training") 173 | 174 | reader = TextReader() 175 | # a generator (can be iterated several times) 176 | sentences = reader.read(args.train) 177 | converted_sentences = converter.generator(sentences) 178 | 179 | trainer.train(converted_sentences, args.iterations, report_intervals, 180 | args.threads, epoch_pairs=args.words) 181 | 182 | logger.info("Saving vectors ...") 183 | trainer.save_vectors(args.vectors) 184 | logger.info("... to %s" % args.vectors) 185 | 186 | if args.output: 187 | logger.info("Saving trained model ...") 188 | trainer.save(args.output) 189 | logger.info("... to %s" % args.output) 190 | 191 | # ---------------------------------------------------------------------- 192 | 193 | profile = False 194 | 195 | if __name__ == '__main__': 196 | # if profile: 197 | # #yappi.start() # done after thread creation 198 | # main() 199 | # yappi.get_func_stats().print_all() 200 | # yappi.get_thread_stats().print_all() 201 | # else: 202 | main() 203 | -------------------------------------------------------------------------------- /bin/knn.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ 4 | Show the knn words in the embeddings to a given word. 5 | 6 | Usage: 7 | ./knn.py embeddings vocabulary 8 | 9 | Options: 10 | -h, --help : display this help and exit 11 | """ 12 | ## Required 13 | # 14 | # sudo apt-get install build-essential python-dev python-numpy python-setuptools python-scipy libatlas-dev libatlas-base-dev libatlas3gf-base 15 | # sudo apt-get remove libopenblas-base 16 | # sudo pip install --upgrade nose 17 | # sudo pip install -U scikit-learn 18 | 19 | from __future__ import print_function 20 | import sys 21 | from optparse import OptionParser 22 | from math import sqrt 23 | from operator import itemgetter 24 | import re 25 | # clustering 26 | from scipy.cluster.vq import kmeans, whiten, vq 27 | from sklearn.cluster import dbscan 28 | import numpy as np 29 | 30 | # Number of neighbors to return. 31 | top = 10 32 | # min number in cluster 33 | min_core = 3 34 | # Cluster representatives 35 | representatives = 5 36 | 37 | # Normalize digits by replacing them with # 38 | DIGITS = re.compile("[0-9]", re.UNICODE) 39 | 40 | def case_normalizer(word, dictionary): 41 | """ In case the word is not available in the vocabulary, 42 | we can try multiple case normalizing procedures. 
43 | We consider the best substitute to be the one with the lowest index, 44 | which is equivalent to the most frequent alternative.""" 45 | w = word 46 | lower = (dictionary.get(w.lower(), 1e12), w.lower()) 47 | upper = (dictionary.get(w.upper(), 1e12), w.upper()) 48 | title = (dictionary.get(w.title(), 1e12), w.title()) 49 | results = [lower, upper, title] 50 | results.sort() 51 | index, w = results[0] 52 | if index != 1e12: 53 | return w 54 | return word 55 | 56 | def normalize(word, word_id): 57 | """ Find the closest alternative in case the word is OOV.""" 58 | if not word in word_id: 59 | word = DIGITS.sub("0", word) 60 | if not word in word_id: 61 | word = case_normalizer(word, word_id) 62 | 63 | if not word in word_id: 64 | return None 65 | return word 66 | 67 | def l2_nearest(embeddings, e, k): 68 | """Sort vectors according to their Euclidean distance from e 69 | and return the k closest. 70 | Returns list of (index, distance^2) 71 | """ 72 | 73 | distances = ((embeddings - e) ** 2).sum(axis=1) # ** 0.5 74 | sorted_distances = sorted(enumerate(distances), key=itemgetter(1)) 75 | return sorted_distances[1:k] 76 | 77 | def knn(embeddings, id_word, word_id): 78 | """Show closest k words""" 79 | input = sys.stdin 80 | while True: 81 | word = input.readline() 82 | if not word: break 83 | word = word.strip().decode('utf-8') 84 | word = normalize(word, word_id) 85 | if not word: 86 | print("OOV word") 87 | continue 88 | # numpy version 89 | i = 0 90 | for index, distance2 in l2_nearest(embeddings, embeddings[word_id[word]], top+1): 91 | print('%i\t%s\t%f' % (i, id_word[index].encode('utf-8'), sqrt(distance2))) 92 | i += 1 93 | 94 | def Kmeans(file, vocabfile, k): 95 | np.random.seed((1000,2000)) 96 | whitened = whiten(embeddings) 97 | codebook, distortion = kmeans(whitened, k) 98 | clusters = [l2_nearest(embeddings, c, representatives+1) for c in codebook] 99 | # output 100 | print(len(codebook), distortion) 101 | for centroid in codebook: 102 | print(' '.join([str(x) for x in centroid])) 103 | print() 104 | for cluster in clusters: 105 | print(' '.join([id_word[i] for i, d in cluster]).encode('utf-8')) 106 | print() 107 | # assign clusters to words 108 | codes, _ = vq(embeddings, codebook) 109 | for w, c in zip(word_id.keys(), codes): 110 | print(w, c) 111 | 112 | def Dbscan(embeddings, id_word, word_id, eps, min_size): 113 | coreSamples, labels = dbscan(embeddings, eps, min_size) 114 | # group clusters 115 | clusters = {} 116 | for i, label in enumerate(labels): 117 | if label not in clusters: 118 | clusters[label] = [] 119 | clusters[label].append(id_word[i].encode('utf-8')) 120 | # output 121 | print(len(clusters) - 1) 122 | for c in clusters.iterkeys(): 123 | if c < 0: continue # -1 is noise 124 | print(' '.join([str(x) for x in embeddings[int(c)]])) 125 | print() 126 | # show clusters 127 | for c, words in clusters.iteritems(): 128 | print(c, ' '.join(words)) 129 | 130 | def readClusters(clusterfile): 131 | cfile = open(clusterfile) 132 | k = cfile.readline().split()[0] 133 | clusters = [] 134 | for i in range(int(k)): 135 | vector = [float(x) for x in cfile.readline().split()] 136 | clusters.append(vector) 137 | return clusters 138 | 139 | def annotate(embeddings, id_word, word_id, clusterfile, col = 0): 140 | clusters = readClusters(clusterfile) 141 | for line in sys.stdin: 142 | line = line.strip().decode('utf-8') 143 | if not line: 144 | print() 145 | continue 146 | attrs = line.split('\t') 147 | # detect which column to use 148 | if not col: 149 | if attrs[8] == '_': 150 | 
col = 8 # PHEAD 151 | else: 152 | col = 9 # PDEPREL 153 | # get vector for token 154 | token = attrs[1] # form 155 | token = normalize(token, word_id) 156 | if not token: 157 | token = attrs[2] # try lemma 158 | token = normalize(token, word_id) 159 | if token: 160 | id = word_id[token] 161 | else: 162 | id = 0 # word_id[''] 163 | e = embeddings[id] 164 | # find cluster 165 | min = 1e12 166 | for i, cluster in enumerate(clusters): 167 | d2 = ((cluster - e) ** 2).sum() 168 | if d2 < min: 169 | min = d2 170 | c = i 171 | attrs[col] = 'C%i' % (c) 172 | print('\t'.join(attrs).encode('utf-8')) 173 | 174 | def loadVocab(vocab_file): 175 | vocab = [] 176 | with open(vocab_file, 'rb') as file: 177 | for line in file: 178 | vocab.append(line.strip().decode('utf-8')) 179 | return vocab 180 | 181 | def loadEmbeddings(filename, vocab_file=None): 182 | vocab = [] 183 | if vocab_file: 184 | vocab = loadVocab(filename) 185 | with open(filename, 'rb') as file: 186 | vectors = np.array([[float(value) for value in line.split()] 187 | for line in file]) 188 | else: 189 | # read both from same file in word2vec format 190 | vectors = [] 191 | with open(filename, 'rb') as file: 192 | len, size = file.readline().strip().split() 193 | for line in file: 194 | items = line.split() 195 | vocab.append(items[0].decode('utf-8')) 196 | vectors.append([float(value) for value in items[1:]]) 197 | vectors = np.array(vectors) 198 | 199 | return vectors, vocab 200 | 201 | def main(): 202 | usage = """usage: %prog [options] embeddings [vocabulary] 203 | Show knn of words typed on stdin.""" 204 | parser = OptionParser(usage=usage) 205 | parser.add_option("-d", "--dbscan", 206 | action="store", type="float", default=0.0, 207 | help="Create clusters of distance EPS to stdout using dbscan", 208 | metavar="EPS") 209 | parser.add_option("-k", "--kmeans", 210 | action="store", type="int", default=0, 211 | help="Create N clusters to stdout using kmeans", 212 | metavar="N") 213 | parser.add_option("-a", "--annotate", metavar="FILE", 214 | help="Annotate CoNLL-X input with clusters from FILE") 215 | parser.add_option("-c", "--col", 216 | action="store", type="int", default=0, 217 | help="Column where to put cluster annotation", 218 | metavar="C") 219 | parser.add_option("-g", "--group", metavar="FILE", 220 | help="Show clusters from FILE") 221 | parser.add_option("-f", "--format", type="string", default="plain", 222 | help="Embeddings file format: plain (default), word2vec") 223 | options, args = parser.parse_args() 224 | if len(args) == 0: 225 | parser.error("incorrect number of arguments") 226 | 227 | file = args[0] 228 | if len(args) == 2: 229 | vocab_file = args[1] 230 | else: 231 | vocab_file = None 232 | 233 | if options.format.lower() == 'word2vec': 234 | embeddings, id_word = loadEmbeddings(file) 235 | elif options.format.lower() == 'word2embeddings': 236 | embeddings = np.load(file).get_word_embeddings() 237 | id_word = ['', '', '', ''] 238 | id_word.extend(loadVocab(vocab_file)) 239 | else: 240 | embeddings, id_word = loadEmbeddings(file, vocab_file) 241 | 242 | # Map words to indices 243 | word_id = { v:i for i,v in enumerate(id_word)} 244 | 245 | if options.kmeans: 246 | Kmeans(embeddings, id_word, word_id, options.kmeans) 247 | elif options.dbscan: 248 | Dbscan(embeddings, id_word, word_id, options.dbscan, min_core) 249 | elif options.group: 250 | # print(the clusters 251 | codebook = readClusters(options.group) 252 | codes, _ = vq(embeddings, np.array(codebook)) 253 | groups = {} 254 | for i,c in enumerate(codes): 255 | 
if c not in groups: 256 | groups[c] = [] 257 | groups[c].append(id_word[i]) 258 | for c,members in groups.iteritems(): 259 | print(c, ' '.join(members).encode('utf-8')) 260 | elif options.annotate: 261 | anontate(embeddings, id_word, word_id, options.annotate, options.col) 262 | else: 263 | knn(embeddings, id_word, word_id) 264 | 265 | if __name__ == '__main__': 266 | main() 267 | -------------------------------------------------------------------------------- /bin/mwe.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ 4 | Check if phrase is a MWE. 5 | 6 | Usage: 7 | ./mwe.py embeddings vocabulary 8 | 9 | Options: 10 | -h, --help : display this help and exit 11 | """ 12 | 13 | from __future__ import print_function 14 | import sys 15 | from optparse import OptionParser 16 | import cPickle as pickle 17 | from operator import itemgetter 18 | import re 19 | import numpy as np 20 | 21 | # Number of neighbors to return. 22 | top = 5 23 | 24 | # Normalize digits by replacing them with # 25 | DIGITS = re.compile("[0-9]", re.UNICODE) 26 | 27 | def case_normalizer(word, dictionary): 28 | """ In case the word is not available in the vocabulary, 29 | we can try multiple case normalizing procedures. 30 | We consider the best substitute to be the one with the lowest index, 31 | which is equivalent to the most frequent alternative.""" 32 | w = word 33 | lower = (dictionary.get(w.lower(), 1e12), w.lower()) 34 | upper = (dictionary.get(w.upper(), 1e12), w.upper()) 35 | title = (dictionary.get(w.title(), 1e12), w.title()) 36 | results = [lower, upper, title] 37 | results.sort() 38 | index, w = results[0] 39 | if index != 1e12: 40 | return w 41 | return word 42 | 43 | def normalize(word, word_id): 44 | """ Find the closest alternative in case the word is OOV.""" 45 | if not word in word_id: 46 | word = DIGITS.sub("0", word) 47 | if not word in word_id: 48 | word = case_normalizer(word, word_id) 49 | 50 | if not word in word_id: 51 | return None 52 | return word 53 | 54 | def l2_nearest(embeddings, e, k): 55 | """Sort vectors according to their Euclidean distance from e 56 | and return the k closest. 
57 | Returns list of (index, distance^2) 58 | """ 59 | 60 | distances = ((embeddings - e) ** 2).sum(axis=1) # ** 0.5 61 | sorted_distances = sorted(enumerate(distances), key=itemgetter(1)) 62 | return sorted_distances[1:k] 63 | 64 | def variant(word, id, embeddings): 65 | # FIXME: should use POS 66 | if len(word) > 3: 67 | return l2_nearest(embeddings, embeddings[id], top+1) 68 | else: 69 | return [(id,0)] 70 | 71 | def closest(ngram, word_id, id_word, embeddings): 72 | for i,word in enumerate(ngram): 73 | for index, distance2 in variant(word, word_id[word], embeddings): 74 | yield [w if n!=i else id_word[index] for n, w in enumerate(ngram)] 75 | 76 | def show(embeddings, word_id, id_word, counts): 77 | """Show closest k phrases""" 78 | input = sys.stdin 79 | while True: 80 | words = input.readline() 81 | if not words: break 82 | words = words.strip().decode('utf-8').split() 83 | words = [normalize(word, word_id) for word in words] 84 | if not all(words): 85 | print("OOV word") 86 | continue 87 | phrase = ' '.join(words) 88 | freq = counts.get(phrase, 0) 89 | print(phrase.encode('utf-8'), freq) 90 | for ngram in closest(words, word_id, id_word, embeddings): 91 | phrase = ' '.join(ngram) 92 | freq = counts.get(phrase, 0) 93 | print(phrase.encode('utf-8'), freq) 94 | 95 | def loadVocab(vocab_file): 96 | vocab = [] 97 | with open(vocab_file, 'rb') as file: 98 | for line in file: 99 | vocab.append(line.strip().decode('utf-8')) 100 | return vocab 101 | 102 | def loadEmbeddings(filename, vocab_file=None): 103 | vocab = [] 104 | if vocab_file: 105 | vocab = loadVocab(vocab_file) 106 | with open(filename, 'rb') as file: 107 | vectors = np.array([[float(value) for value in line.split()] 108 | for line in file]) 109 | else: 110 | # read both from same file in word2vec format 111 | vectors = [] 112 | with open(filename, 'rb') as file: 113 | len, size = file.readline().strip().split() 114 | for line in file: 115 | items = line.split() 116 | vocab.append(items[0].decode('utf-8')) 117 | vectors.append([float(value) for value in items[1:]]) 118 | vectors = np.array(vectors) 119 | 120 | return vectors, vocab 121 | 122 | def PolyglotLoad(filename): 123 | """ 124 | Load the feature matrix used by word2embeddings. 
125 | """ 126 | vectors = [] 127 | with open(filename, 'rb') as f: 128 | for line in f: 129 | items = line.split() 130 | word = unicode(items[0], 'utf-8') 131 | vectors.append([float(x) for x in items[1:]]) 132 | return np.array(vectors) 133 | 134 | def main(): 135 | usage = """usage: %prog [options] embeddings [vocabulary] 136 | Show knn of variant of phrase typed on stdin.""" 137 | parser = OptionParser(usage=usage) 138 | parser.add_option("-f", "--format", type="string", default="plain", 139 | help="Embedding file format: plain (default), word2vec") 140 | parser.add_option("-c", "--counts", type="string", 141 | help="Ngram frequencysfile") 142 | options, args = parser.parse_args() 143 | if len(args) == 0: 144 | parser.error("incorrect number of arguments") 145 | 146 | file = args[0] 147 | if len(args) == 2: 148 | vocab_file = args[1] 149 | else: 150 | vocab_file = None 151 | 152 | if options.format.lower() == 'word2vec': 153 | embeddings, id_word = loadEmbeddings(file) 154 | elif options.format.lower() == 'word2embeddings': 155 | embeddings = np.load(file).get_word_embeddings() 156 | id_word = ['', '', '', ''] 157 | id_word.extend(loadVocab(vocab_file)) 158 | else: 159 | embeddings, id_word = loadEmbeddings(file, vocab_file) 160 | 161 | counts = {} 162 | if options.counts: 163 | with open(options.counts) as file: 164 | for line in file: 165 | ngram, freq = line.strip().split() 166 | ngram = re.sub('_', ' ', ngram) 167 | counts[ngram] = int(freq) 168 | 169 | # Map words to indices 170 | word_id = { v:i for i,v in enumerate(id_word)} 171 | 172 | show(embeddings, word_id, id_word, counts) 173 | 174 | if __name__ == '__main__': 175 | main() 176 | -------------------------------------------------------------------------------- /bin/senna-tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This script will run a POS or SRL tagger on the input data and print the results 6 | to stdout. 
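A typical invocation (the model directory below is only a placeholder) reads one token per line from standard input, with a blank line separating sentences:

    senna-tag.py pos /path/to/model/data < input.tsv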
7 | """ 8 | 9 | from __future__ import print_function 10 | import argparse 11 | import logging 12 | import ipdb 13 | 14 | # Attardi: allow executing from anywhere without installing package 15 | import sys 16 | import os 17 | srcdir = os.path.dirname(os.path.realpath(__file__)) + '/../' 18 | sys.path.append(srcdir + 'build/lib.linux-x86_64-2.7') 19 | 20 | import nlpnet 21 | import nlpnet.utils as utils 22 | from nlpnet.taggers import Tagger 23 | from nlpnet.network import Network 24 | from nlpnet.pos.pos_reader import POSReader 25 | from nlpnet.attributes import Suffix 26 | from nlpnet.metadata import Metadata 27 | from nlpnet.word_dictionary import WordDictionary 28 | import numpy as np 29 | 30 | senna_dump = "pos.dump" 31 | 32 | def load_features(file): 33 | row, col = file.readline().split() 34 | row = int(row) 35 | col = int(col) 36 | a = np.ndarray((row, col)) 37 | words = [None] * row 38 | for i in range(row): 39 | values = file.readline().split() 40 | words[i] = values[0] 41 | for j, val in enumerate(values[1:]): 42 | a[i,j] = float(val) 43 | return (words, a) 44 | 45 | def load_weights(file): 46 | row, col = file.readline().split() 47 | row = int(row) 48 | col = int(col) 49 | a = np.ndarray((row, col)) 50 | for i in range(row): 51 | line = file.readline() 52 | for j, val in enumerate(line.split()): 53 | a[i,j] = float(val) 54 | return a 55 | 56 | def load_bias(file): 57 | col = file.readline() 58 | col = int(col) 59 | a = np.zeros(col) 60 | line = file.readline() 61 | for j, val in enumerate(line.split()): 62 | a[j] = float(val) 63 | return a 64 | 65 | def load_network(): 66 | """ 67 | Loads the network from the default file and returns it. 68 | """ 69 | file = open(senna_dump) 70 | words, type_features = load_features(file) 71 | word_dict = WordDictionary(None, wordlist=words, variant='senna') 72 | tables = [type_features] 73 | 74 | # PADDING, allcaps, hascap, initcap, nocaps 75 | caps, caps_features = load_features(file) 76 | tables.append(caps_features) 77 | 78 | suff, suffix_features = load_features(file) 79 | tables.append(suffix_features) 80 | 81 | hidden_weights = load_weights(file) # (hidden_size, input_size) 82 | hidden_bias = load_bias(file) 83 | output_weights = load_weights(file) # (output_size, hidden_size) 84 | output_bias = load_bias(file) 85 | 86 | transition0 = load_bias(file) 87 | transitions = load_weights(file).T 88 | transitions = np.vstack((transitions, transition0)) 89 | 90 | word_window_size = 5 91 | input_size = hidden_weights.shape[1] 92 | hidden_size = hidden_weights.shape[0] 93 | output_size = output_bias.shape[0] 94 | 95 | nn = Network(word_window_size, input_size, hidden_size, output_size, 96 | hidden_weights, hidden_bias, output_weights, output_bias) 97 | nn.feature_tables = tables 98 | nn.transitions = transitions 99 | 100 | return nn, word_dict, suff 101 | 102 | class SennaPOSTagger(Tagger): 103 | """A POSTagger loads the models and performs POS tagging on text.""" 104 | 105 | def _load_data(self): 106 | """Loads data for POS from SENNA dump""" 107 | md = Metadata.load_from_file('pos') 108 | self.nn, word_dict, suff = load_network() 109 | self.reader = POSReader() 110 | self.reader.word_dict = word_dict 111 | self.reader.create_converter(md) 112 | self.itd = self.reader.get_inverse_tag_dictionary() 113 | self.nn.padding_left = self.reader.converter.get_padding_left() 114 | self.nn.padding_right = self.reader.converter.get_padding_right() 115 | self.nn.pre_padding = np.array([self.nn.padding_left] * 2) 116 | self.nn.pos_padding = 
np.array([self.nn.padding_right] * 2) 117 | Suffix.codes = {} 118 | for i, s in enumerate(suff): 119 | Suffix.codes[s] = i 120 | Suffix.other = Suffix.codes['NOSUFFIX'] 121 | 122 | def tag(self, text=None): 123 | """ 124 | Tags the given text. 125 | 126 | :param text: a string or unicode object. Strings assumed to be utf-8 127 | :returns: a list of lists (sentences with tokens). 128 | Each sentence has (token, tag) tuples. 129 | """ 130 | result = [] 131 | if text: 132 | tokens = utils.tokenize(text, clean=False) 133 | for sent in tokens: 134 | tags = self.tag_tokens(sent) 135 | result.append(zip(sent, tags)) 136 | else: 137 | # read tsv from stdin 138 | sent = [] 139 | for line in sys.stdin: 140 | line = line.decode('utf-8').strip() 141 | if line: 142 | sent.append(line.split()[0]) 143 | else: 144 | #ipdb.set_trace() 145 | tags = self.tag_tokens(sent) 146 | result.append(zip(sent, tags)) 147 | sent = [] 148 | 149 | return result 150 | 151 | def tag_tokens(self, tokens): 152 | """ 153 | Tags a given list of tokens. 154 | 155 | Tokens should be produced with the nlpnet tokenizer in order to 156 | match the entries in the vocabulary. If you have non-tokenized text, 157 | use POSTagger.tag(text). 158 | 159 | :param tokens: a list of strings 160 | :returns: a list of strings (the tags) 161 | """ 162 | converter = self.reader.converter 163 | # do not use clean_text. Attardi 164 | #converted_tokens = np.array([converter.convert(utils.clean_text(token, False)) 165 | converted_tokens = converter.convert(tokens) 166 | answer = self.nn.tag_sentence(converted_tokens) 167 | tags = [self.itd[tag] for tag in answer] 168 | return tags 169 | 170 | def process_input(task): 171 | """ 172 | This function reads input from stdin and processes sentences. 173 | 174 | :param task: either 'pos' or 'ner' 175 | """ 176 | task_lower = task.lower() 177 | if task_lower == 'pos': 178 | tagger = SennaPOSTagger() 179 | elif task_lower == 'ner': 180 | tagger = nlpnet.taggers.NERTagger() 181 | else: 182 | raise ValueError('Unknown task: %s' % task) 183 | 184 | result = tagger.tag() 185 | _print_tagged(result, task) 186 | 187 | def _print_tagged(tagged_sents, task): 188 | """ 189 | Prints the tagged text to stdout. 190 | 191 | :param tagged_sents: sentences tagged according to any of nlpnet taggers. 
192 | :param task: the tagging task (either 'pos' or 'ner') 193 | """ 194 | if task == 'pos': 195 | _print_tagged_pos(tagged_sents) 196 | elif task == 'ner': 197 | _print_tagged_ner(tagged_sents) 198 | else: 199 | raise ValueError('Unknown task: %s' % task) 200 | 201 | def _print_tagged_pos(tagged_sents): 202 | """Prints one sentence per line as token_tag""" 203 | # for sent in tagged_sents: 204 | # s = ' '.join('_'.join(item) for item in sent) 205 | # print(s) 206 | 207 | # print in tsv 208 | for sent in tagged_sents: 209 | for token in sent: 210 | print('\t'.join([item.encode('utf-8') for item in token])) 211 | print() 212 | 213 | def _print_tagged_ner(tagged_sents): 214 | """Prints one token per line as token\ttag""" 215 | for sent in tagged_sents: 216 | for tok, tag in sent: 217 | print(tok[0] + '\t' + tag)  # tok is (form, POS) 218 | print() 219 | 220 | if __name__ == '__main__': 221 | 222 | parser = argparse.ArgumentParser() 223 | parser.add_argument('task', help='Task for which the network should be used.', 224 | type=str, choices=['pos', 'ner']) 225 | parser.add_argument('data', help='Directory containing trained models.', type=str) 226 | parser.add_argument('-v', help='Verbose mode', action='store_true', dest='verbose') 227 | args = parser.parse_args() 228 | 229 | nlpnet.set_data_dir(args.data) 230 | 231 | #interactive_running(args.task) 232 | process_input(args.task) 233 | 234 | -------------------------------------------------------------------------------- /bin/ssyevr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import print_function 4 | import numpy as np 5 | from scipy.linalg.lapack import ssyevr 6 | 7 | A = np.array([[ 0.67, -0.20, 0.19, -1.06, 0.46], 8 | [-0.20, 3.82, -0.13, 1.06, -0.48], 9 | [ 0.19, -0.13, 3.27, 0.11, 1.10], 10 | [-1.06, 1.06, 0.11, 5.86, -0.98], 11 | [ 0.46, -0.48, 1.10, -0.98, 3.54] 12 | ]) 13 | 14 | n = np.linalg.norm(A, axis=1) 15 | 16 | print(A) 17 | 18 | w,z,info = ssyevr(A, range='I', il=3, overwrite_a=1) 19 | 20 | print(w) 21 | print(z) 22 | # z = (5 x 3) 23 | print(A.dot(z)) 24 | 25 | 26 | -------------------------------------------------------------------------------- /bin/toIOB.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Upgrade to new IOB convention: Inside, Outside, Begin.
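For example, a sentence tagged O I-PER I-PER O I-LOC under the old convention becomes O B-PER I-PER O B-LOC: an I- tag that follows an O tag (or begins a sentence) is rewritten as B-, while I- tags continuing a chunk are left unchanged (the entity types here are only illustrative).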
4 | """ 5 | # O I -> O B 6 | # I B -> I B 7 | # I I -> I I 8 | # I O -> I O 9 | 10 | from __future__ import print_function 11 | import sys 12 | import getopt 13 | 14 | 15 | def usage(): 16 | print('usage:', sys.argv[0], '[-hr] < inFile ') 17 | print(' -r revert to old convention.') 18 | sys.exit() 19 | 20 | try: 21 | opts, args = getopt.getopt(sys.argv[1:], 'hr') 22 | except getopt.GetoptError: 23 | usage() 24 | 25 | reverse = False 26 | 27 | for opt, arg in opts: 28 | if opt == '-h': 29 | usage() 30 | if opt == '-r': 31 | reverse = True 32 | 33 | 34 | def main(): 35 | previous = None 36 | for line in sys.stdin: 37 | if line == '\n': 38 | print('\t'.join(previous)) 39 | print() 40 | previous = None 41 | continue 42 | words = line.split() 43 | word = words[0] 44 | tag = words[-1] 45 | if reverse: 46 | if tag[0] == 'B' and (previous == None or previous[-1] == 'O'): 47 | words[-1] = 'I' + tag[1:] 48 | else: 49 | if tag[0] == 'I' and (previous == None or previous[-1] == 'O'): 50 | words[-1] = 'B' + tag[1:] 51 | if previous: 52 | print('\t'.join(previous)) 53 | previous = words 54 | if previous: # leftover 55 | print('\t'.join(previous)) 56 | 57 | main() 58 | -------------------------------------------------------------------------------- /bin/tweet-tokenize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Tokenize a Twitter corpus in CoNLL 2013 format. 5 | 6 | Usage: 7 | tweet-tokenize.py [options] < CoNLL2013-file 8 | 9 | Options: 10 | 11 | -h print this help message 12 | -l language select corpus language (default english) 13 | 14 | """ 15 | 16 | from __future__ import print_function 17 | import os 18 | import sys 19 | import getopt 20 | 21 | # Tanl directory 22 | tanl = '/project/piqasso/QA/Tanl/' 23 | 24 | # Where to find data 25 | data = tanl + 'data/' 26 | 27 | # import Tanl modules 28 | 29 | sys.path.append(tanl + 'bin/') 30 | 31 | from SentenceSplitter import * 32 | from Tokenizer_it import * 33 | 34 | ### CLI INTERFACE ############################################################ 35 | 36 | def show_help(): 37 | print(__doc__, end='') 38 | 39 | def show_usage(scriptname): 40 | print('Usage: %s [options] [file]' % scriptname, file=sys.stderr) 41 | 42 | def show_suggestion(scriptname): 43 | print('Try \'%s --help\' for more information.'
% scriptname, file=sys.stderr) 44 | 45 | def main(): 46 | scriptname = os.path.basename(sys.argv[0]) 47 | 48 | try: 49 | long_opts = ['language=', 'help', 'usage'] 50 | opts, args = getopt.gnu_getopt(sys.argv[1:], 'l:h', long_opts) 51 | except getopt.GetoptError: 52 | show_usage(scriptname) 53 | show_suggestion(scriptname) 54 | sys.exit(1) 55 | 56 | lang = 'english' 57 | 58 | for opt, arg in opts: 59 | if opt in ('-l', '--language'): 60 | lang = arg 61 | elif opt in ('-h', '--help'): 62 | show_help() 63 | return 64 | elif opt == '--usage': 65 | show_usage(scriptname) 66 | return 67 | 68 | # Tanl modules 69 | splitterModel = data + 'split/sentence/' + lang + '.punkt' 70 | if not os.path.exists(splitterModel): 71 | print("No such model:" + splitterModel) 72 | return 73 | t0 = SentenceSplitter(splitterModel) 74 | t1 = Tokenizer() 75 | 76 | # Field containing tweets 77 | text_field = 3 78 | 79 | for line in sys.stdin: 80 | fields = line.split('\t') 81 | p0 = t0.pipe([fields[text_field]]) # SentenceSplitter 82 | p1 = t1.pipe(p0) # Tokenizer 83 | tokens = [] 84 | for t in p1: 85 | form = t['FORM'] 86 | if form != '\n': 87 | tokens.append(form) 88 | fields[text_field] = ' '.join(tokens) 89 | print('\t'.join(fields)) 90 | 91 | if __name__ == '__main__': 92 | main() 93 | -------------------------------------------------------------------------------- /deepnl/ChangeLog: -------------------------------------------------------------------------------- 1 | 2014-04-22 Giuseppe Attardi 2 | 3 | * network.pyx (_tag_sentence): use concatenate instead of vstack, 4 | since it is twice faster. 5 | (create_new): set seed for replicability. 6 | (_backpropagate): completely rewritten, following closely the paper. 7 | (run): introduced layer2, according to paper. 8 | 9 | 2014-04-20 Giuseppe Attardi 10 | 11 | * attributes.py (Suffix.get_suffix): fixed handling of padding 12 | 13 | 2014-04-19 Giuseppe Attardi 14 | 15 | * network.pyx (create_new): initialize transitions here. 16 | 17 | 2014-04-18 Giuseppe Attardi 18 | 19 | * network.pyx (train): terminate when both error increases and 20 | accuracy decreases. 21 | 22 | 2014-04-15 Giuseppe Attardi 23 | 24 | * network.pyx (_calculate_all_scores): use logsumexp instead of 25 | computing log(sum(exp())) 26 | (_calculate_gradients_sll): use logsumexp also for computing 27 | net_gradients. 28 | (_calculate_all_scores): no need for np.longdouble(scores) since we 29 | now use logsumexp. 30 | 31 | 2014-04-14 Giuseppe Attardi 32 | 33 | * network.pyx (train): normalize error. 34 | 35 | 2014-04-13 Giuseppe Attardi 36 | 37 | * taggers.py (load_network): associate list of files to gaz_features_ner. 38 | 39 | * arguments.py (get_args): option --lt is not just for SRL. 40 | 41 | * network.pyx (tag_sentence): eliminated unused argument logprob. 42 | (_viterbi): dropped unused param allow_repeats. 43 | (Network): renamed train_hits to validation_hits, train_items to 44 | validation_items 45 | (_validate): perform validation on held out sentences. 46 | 47 | 2014-04-12 Giuseppe Attardi 48 | 49 | * network.pyx (train): dump model whenever accuracy improves. 50 | Eliminated termination on dropping accuracy. 51 | Replaced print with logging. 52 | (create_new): use uniform distribution with variance = 1/sqrt(fanin) 53 | 54 | 2014-04-10 Giuseppe Attardi 55 | 56 | * attributes.py (Suffix.get_suffix): check for PADDING. 57 | (get_capitalization): check for PADDING. 58 | 59 | * word_dictionary.py (isNumber): use regexp.
60 | (WordDictionary.__getitem__): replace digits with '0' 61 | 62 | 2014-04-07 Giuseppe Attardi 63 | 64 | * attributes.py (TokenConverter.convert_sentence): aded to deal 65 | with context in conversion. 66 | 67 | * reader.py (TextReader.codify_sentences): use convert(sent). 68 | 69 | * taggers.py (POSTagger.tag_tokens): drop use of clean_text. 70 | 71 | * config.py (set_data_dir): added NER files. 72 | 73 | * attributes.py (get_capitalization): added upper. 74 | 75 | 2014-03-10 Giuseppe Attardi 76 | 77 | * taggers.py (NERTagger): added. 78 | 79 | * ner/ner_reader.py (NerReader): added. 80 | 81 | * pos/pos_reader.py (POSReader.get_inverse_tag_dictionary): 82 | removed: it is inherited from TaggerReader 83 | 84 | 2013-12-15 Giuseppe Attardi 85 | 86 | * networklm.pyx (train): save language model at every 100th batch. 87 | 88 | -------------------------------------------------------------------------------- /deepnl/HPCA.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef deepnl_HPCA_H 3 | #define deepnl_HPCA_H 4 | 5 | #include "Python.h" 6 | 7 | namespace hpca { 8 | 9 | bool matrix_sqrt(float* m, int rowstart, int rowend, int cols); 10 | 11 | bool distance_matrix(float* m, int rowstart, int rowend, int rows, int cols, 12 | float* dm); 13 | 14 | PyObject* cooccurrence_matrix(char* corpus, char* vocabFile, unsigned top, 15 | unsigned window); 16 | 17 | void hellinger_matrix(float* dm, float* cooccur, int rows, int cols, int lines); 18 | 19 | extern bool verbose; 20 | } 21 | 22 | #endif // deepnl_HPCA_H 23 | -------------------------------------------------------------------------------- /deepnl/HPCA_impl.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "Python.h" 3 | 4 | #include "numpy/ndarraytypes.h" 5 | #include "numpy/ndarrayobject.h" 6 | 7 | #include 8 | #include // strtok_r() 9 | #include // bzero() 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | 18 | namespace hpca { 19 | 20 | bool verbose = true; 21 | 22 | typedef unordered_map Dict; 23 | 24 | template 25 | struct ring 26 | { 27 | ring(unsigned size) : 28 | buffer(size), 29 | _start(0), 30 | _end(0) 31 | { } 32 | 33 | void clear() { 34 | _start = _end = 0; 35 | } 36 | 37 | struct iterator { 38 | ring& r; 39 | T* curr; 40 | 41 | iterator(ring& r, T* curr) : 42 | r(r), 43 | curr(curr) 44 | {} 45 | 46 | T& operator *() { return *curr; } 47 | 48 | iterator& operator++() { 49 | if (++curr == &*r.buffer.end()) 50 | curr = &*r.buffer.begin(); 51 | if (curr == r._end) 52 | curr = 0; // signals end 53 | return *this; 54 | } 55 | 56 | //! Increment operator (postfix). 57 | iterator operator ++ (int) { 58 | iterator tmp = *this; 59 | ++*this; 60 | return tmp; 61 | } 62 | 63 | bool operator ==(const iterator& it) const { 64 | return curr == it.curr; 65 | } 66 | 67 | bool operator !=(const iterator& it) const { 68 | return curr != it.curr; 69 | } 70 | 71 | }; 72 | 73 | void add(T x) { 74 | if (empty()) 75 | _start = _end = &*buffer.begin(); 76 | else if (_end == _start) { 77 | if (++_start == &*buffer.end()) 78 | _start = &*buffer.begin(); 79 | } 80 | *_end = x; 81 | if (++_end == &*buffer.end()) 82 | _end = &*buffer.begin(); 83 | } 84 | 85 | bool empty() { return _start == 0; } 86 | 87 | unsigned size() { 88 | return (empty()) 89 | ? 0 90 | : (_start < _end) ? 
_end - _start : buffer.size() - (_start - _end); 91 | } 92 | 93 | iterator begin() { return iterator(*this, _start); } 94 | iterator end() { return iterator(*this, 0); } 95 | 96 | vector buffer; 97 | T* _start; ///< virtual beginning 98 | T* _end; ///< virtual end (one besides last element) 99 | }; 100 | 101 | static int MAX_LINE_LEN = 1 << 14; 102 | /** 103 | * load list of words from file 104 | */ 105 | void load_list(char const* file, Dict& dict) 106 | { 107 | char line[MAX_LINE_LEN]; 108 | ifstream ifs(file); 109 | if (!ifs) { 110 | cerr << "No such file: " << file << endl; 111 | return; 112 | } 113 | while (ifs.getline(line, MAX_LINE_LEN)) 114 | dict[line] = dict.size() - 1; // lhs is evaluated first 115 | } 116 | 117 | #define P(i, j) (*(p + (i) * top + (j))) 118 | 119 | /** 120 | * Build cooccurrence matix @param p from corpus of sentences in file @param 121 | * corpus, using vocabulary from file @c vocabFile, of which the top @param 122 | * top words are used as context words. 123 | */ 124 | PyObject* cooccurrence_matrix(char* corpus, char* vocabFile, unsigned top, 125 | unsigned window) 126 | { 127 | // read vocabulary 128 | Dict vocab; 129 | load_list(vocabFile, vocab); 130 | 131 | unsigned words = vocab.size(); 132 | 133 | // allocate numpy array 134 | const npy_intp dims[2] = {words, top}; 135 | PyObject* npyp = PyArray_ZEROS(2, (npy_intp*)dims, NPY_FLOAT32, NPY_CORDER); 136 | //Py_INCREF(npyp); 137 | float* p = (float*)PyArray_DATA(npyp); 138 | 139 | // read sentences 140 | char sentence[MAX_LINE_LEN]; 141 | ifstream ifs(corpus); 142 | if (!ifs) { 143 | cerr << "No such file: " << corpus << endl; 144 | return npyp; 145 | } 146 | ring context(window); 147 | 148 | int sentCount = 0; 149 | while (true) { 150 | if (!ifs.getline(sentence, MAX_LINE_LEN)) { 151 | if (ifs.rdstate() & ifstream::failbit) { // too long line 152 | ifs.clear(); 153 | ifs.ignore(numeric_limits::max(), '\n'); 154 | if (ifs.rdstate()) 155 | break; 156 | if (verbose) 157 | cerr << "\nLong line: " << sentCount << endl; 158 | sentCount++; 159 | continue; 160 | } else 161 | break; 162 | } 163 | context.clear(); 164 | char* next = sentence; 165 | char* tok = strtok_r(0, " ", &next); 166 | // count how many times a context word w in D appears after a vocabulary 167 | // word T in V, in a window of context-size: C(T, w) 168 | // context words are the first top in the vocabulary. 
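// Illustration (made-up tokens): with window = 2, if "the" is among the top
// context words, then reaching "the" while scanning "... black cat the ..."
// increments C(black, the) and C(cat, the), i.e. one count for each
// in-vocabulary word still held in the context ring buffer.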
169 | # if 1//def CONTEXT_SIZE 170 | while ((tok = strtok_r(0, " ", &next))) { 171 | Dict::const_iterator w = vocab.find(tok); 172 | if (w == vocab.end()) { 173 | context.add(-1); // OOV 174 | continue; 175 | } 176 | // w in T 177 | if (w->second < top) { // w in D 178 | for (int T : context) { 179 | if (T >= 0) 180 | P(T, w->second)++; // p[T, w] = n(w, T) = C(T, w) 181 | } 182 | } 183 | context.add(w->second); 184 | } 185 | # else 186 | char* prev = strtok_r(0, " ", &next); 187 | while ((tok = strtok_r(0, " ", &next))) { 188 | Dict::const_iterator w = vocab.find(tok); 189 | if (w != vocab.end()) { 190 | Dict::const_iterator T = vocab.find(prev); 191 | if (T != vocab.end() && w->second < top) 192 | P(T->second, w->second)++; // p[T, w] = n(w, T) 193 | } 194 | prev = tok; 195 | } 196 | # endif 197 | sentCount++; 198 | if (verbose) { 199 | if (sentCount % 100000 == 0) { 200 | cerr << '+'; 201 | cerr.flush(); 202 | } else if (sentCount % 10000 == 0) { 203 | cerr << '.'; 204 | cerr.flush(); 205 | } 206 | } 207 | } 208 | if (verbose) { 209 | cerr << endl; 210 | cerr << "Sentences: " << sentCount << endl; 211 | } 212 | // normalize counts and apply sqrt() 213 | if (verbose) 214 | cerr << "Normalize frequencies" << endl; 215 | for (unsigned j = 0; j < top; j++) { 216 | float nT = 0; // Sum_w C(T, w) != C(T) (includes OOV w) 217 | for (unsigned i = 0; i < words; i++) 218 | nT += P(i, j); // p[i, j] sum by column 219 | if (nT == 0.0) // better doing a single test here 220 | nT = 1; // avoid zero division 221 | for (unsigned i = 0; i < words; i++) 222 | P(i, j) = sqrt(P(i, j) / nT); // p[i, j] 223 | } 224 | //Py_DECREF(npyp); DEBUG 225 | return npyp; 226 | } 227 | 228 | } // namespace hpca 229 | 230 | // needed to get PyArray_DescrFromType to work 231 | int dummy = _import_array(); 232 | -------------------------------------------------------------------------------- /deepnl/WordsTrainer.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "WordsTrainer.h" 3 | 4 | double minError = 1.e-5; 5 | 6 | double WordsTrainer::train_pair() 7 | { 8 | nn.run(input_pos, hidden_pos, output_pos); 9 | nn.run(input_neg, hidden_neg, output_neg); 10 | 11 | // hinge loss 12 | double score_pos = output_pos[0]; 13 | double score_neg = output_neg[0]; 14 | double error = std::max(0.0, 1.0 - score_pos + score_neg); 15 | if (error < minError) 16 | return error; 17 | 18 | // (output_size) x (hidden_size) = (output_size, hidden_size) 19 | // (1) x (hidden_size) = (1, hidden_size) 20 | grads.output_weights.row(0) += hidden_pos - hidden_neg; 21 | 22 | // layer 2 23 | // (hidden_size) = (hidden_size) * (1, hidden_size) 24 | // hidden_pos = hardtanhe(hidden_pos) * nn.output_weights[0] 25 | hidden_pos = hardtanhe(hidden_pos).transpose().cwiseProduct(nn.output_weights.row(0)); 26 | 27 | // hidden_neg = hardtanhe(nn.hidden_values) * (- nn.output_weights[0]) 28 | hidden_neg = hardtanhe(hidden_neg).transpose().cwiseProduct(- nn.output_weights.row(0)); 29 | 30 | // (hidden_size, input_size) = (hidden_size) x (input_size) 31 | grads.hidden_weights += hidden_pos * input_pos.transpose() + 32 | hidden_neg * input_neg.transpose(); 33 | grads.hidden_bias += hidden_pos + hidden_neg; 34 | 35 | // input gradients 36 | // These are not accumulated, since their update is immediate. 
37 | // (input_size) = (1, hidden_size) x (hidden_size, input_size) 38 | grads.input_pos = hidden_pos.transpose() * nn.hidden_weights; 39 | grads.input_neg = hidden_neg.transpose() * nn.hidden_weights; 40 | 41 | return error; 42 | } 43 | 44 | void WordsTrainer::update_embeddings(double LR_0, int token_pos, int token_neg) 45 | { 46 | int middle = window_size/2; 47 | int start = 0; 48 | for (int i = 0; i < window_size; i++) { 49 | int end = start + table.cols(); 50 | if (i == middle) { 51 | // this is the middle position. 52 | // apply negative and positive deltas to different tokens 53 | table.row(token_pos) += LR_0 * grads.input_pos.segment(start, end); 54 | table.row(token_neg) += LR_0 * grads.input_neg.segment(start, end); 55 | } else { 56 | // this is not the middle position. both deltas apply. 57 | int token = example(i, 0); 58 | table.row(token) += LR_0 * grads.input_pos.segment(start, end) 59 | + LR_0 * grads.input_neg.segment(start, end); 60 | } 61 | start = end; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /deepnl/WordsTrainer.h: -------------------------------------------------------------------------------- 1 | 2 | #include // DEBUG 3 | 4 | // Eigen library 5 | #include 6 | 7 | typedef Eigen::VectorXd Vector; 8 | typedef Eigen::Matrix Matrix; 9 | typedef Eigen::Matrix Matrix1i; 10 | 11 | using Eigen::Map; 12 | 13 | inline void hardtanh(Vector const& xc) 14 | { 15 | Vector& x = const_cast(xc); 16 | for (unsigned i = 0; i < x.size(); i++) 17 | if (x(i) < -1.0) 18 | x(i) = -1.0; 19 | else if (x(i) > 1) 20 | x(i) = 1.0; 21 | } 22 | 23 | /** 24 | * Converse of hardtanh: computes derivative of input, given the output, i.e, 25 | * given y = hardtanh(x), compute hardtanh'(x). 26 | */ 27 | inline Vector& hardtanhe(Vector const& cy) 28 | { 29 | Vector& y = const_cast(cy); 30 | for (unsigned i = 0; i < y.size(); i++) 31 | if (y(i) == -1.0 || y(i) == 1.0) 32 | y(i) = 0.0; 33 | else 34 | y(i) = 1.0; 35 | return y; 36 | } 37 | 38 | struct Parameters 39 | { 40 | Map hidden_weights; // input weights 41 | Map hidden_bias; // input bias 42 | Map output_weights; // output weights 43 | Map output_bias; // output bias 44 | 45 | Parameters(int numInput, int numHidden, int numOutput, 46 | double* hidden_weights, double* hidden_bias, 47 | double* output_weights, double* output_bias) : 48 | hidden_weights(hidden_weights, numHidden, numInput), 49 | hidden_bias(hidden_bias, numHidden), 50 | output_weights(output_weights, numOutput, numHidden), 51 | output_bias(output_bias, numOutput) 52 | { } 53 | 54 | }; 55 | 56 | struct Network : public Parameters 57 | { 58 | Network(int numInput, int numHidden, int numOutput, 59 | double* hidden_weights, double* hidden_bias, 60 | double* output_weights, double* output_bias) : 61 | Parameters(numInput, numHidden, numOutput, 62 | hidden_weights, hidden_bias, 63 | output_weights, output_bias) 64 | { } 65 | 66 | void run(const Vector& input, Vector const& hidden, Vector const& output) { 67 | // We must pass const and then cast it away, a hack according to: 68 | // http://eigen.tuxfamily.org/dox/TopicFunctionTakingEigenTypes.html 69 | const_cast(hidden).noalias() = hidden_weights * input + hidden_bias; 70 | hardtanh(hidden); // first layer 71 | const_cast(output).noalias() = output_weights * hidden + output_bias; 72 | } 73 | }; 74 | 75 | struct LmGradients : public Parameters 76 | { 77 | Map input_pos; // positive input variables 78 | Map input_neg; // negative input variables 79 | 80 | LmGradients(int numInput, int 
numHidden, int numOutput, 81 | double* hiddenWeights, double* hiddenBias, 82 | double* outpuWeights, double* outputBias, 83 | double* input_pos, double* input_neg) : 84 | Parameters(numInput, numHidden, numOutput, 85 | hiddenWeights, hiddenBias, 86 | outpuWeights, outputBias), 87 | input_pos(input_pos, numInput), 88 | input_neg(input_neg, numInput) 89 | { } 90 | }; 91 | 92 | class WordsTrainer 93 | { 94 | public: 95 | 96 | WordsTrainer(int numInput, int numHidden, int numOutput, 97 | double* hidden_weights, double* hidden_bias, 98 | double* output_weights, double* output_bias, 99 | double* input_pos, double* input_neg, 100 | double* grads_hidden_weights, double* grads_hidden_bias, 101 | double* grads_output_weights, double* grads_output_bias, 102 | double* grads_input_pos, double* grads_input_neg, 103 | int* example, int window_size, 104 | double* table, int table_rows, int table_cols) : 105 | nn(numInput, numHidden, numOutput, 106 | hidden_weights, hidden_bias, 107 | output_weights, output_bias), 108 | input_pos(input_pos, numInput), 109 | input_neg(input_neg, numInput), 110 | hidden_pos(numHidden), 111 | hidden_neg(numHidden), 112 | output_pos(numOutput), 113 | output_neg(numOutput), 114 | grads(numInput, numHidden, numOutput, 115 | grads_hidden_weights, grads_hidden_bias, 116 | grads_output_weights, grads_output_bias, 117 | grads_input_pos, grads_input_neg), 118 | example(example, window_size, 1), 119 | window_size(window_size), 120 | table(table, table_rows, table_cols) 121 | { } 122 | 123 | // input from: input_pos, input_neg, output to: output_pos, output_neg 124 | double train_pair(); 125 | 126 | void update_embeddings(double LR_0, 127 | int token_pos, 128 | int token_neg); 129 | 130 | Network nn; 131 | Map input_pos, input_neg; // shared with python 132 | Vector hidden_pos, hidden_neg; 133 | Vector output_pos, output_neg; 134 | LmGradients grads; 135 | Map example; 136 | int window_size; 137 | Map table; // word vectors 138 | }; 139 | -------------------------------------------------------------------------------- /deepnl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/attardi/deepnl/e1ad450f2768a084f44b128de313f19c2f15100f/deepnl/__init__.py -------------------------------------------------------------------------------- /deepnl/classifier.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language = c++ 3 | 4 | """ 5 | Classifier exploiting a neural network. 6 | """ 7 | 8 | # standard 9 | import numpy as np 10 | from numpy import int32 as INT 11 | cimport numpy as np 12 | import cPickle as pickle 13 | 14 | # local 15 | from network cimport * 16 | from extractors import Converter 17 | from extractors cimport Converter 18 | from utils import import_class 19 | 20 | # ---------------------------------------------------------------------- 21 | 22 | cdef class Classifier(object): 23 | """ 24 | Classifier using a neural network. 25 | """ 26 | 27 | cdef Converter converter 28 | cdef list labels 29 | cdef readonly Network nn 30 | cdef np.ndarray pre_padding, post_padding 31 | 32 | def __init__(self, Converter converter, list labels, int_t left_context, int_t right_context, 33 | Network nn): 34 | """ 35 | :param converter: the Converter object that extracts features and 36 | converts them to weights. 37 | :param labels: list of labels. 38 | :param left_context: size of left context window. 
39 | :param right_context: size of right context window. 40 | :param nn: network to be used. 41 | """ 42 | self.converter = converter 43 | self.labels = labels 44 | self.nn = nn # dependency injection 45 | cdef np.ndarray[int_t] padding_left = converter.get_padding_left() 46 | cdef np.ndarray[int_t] padding_right = converter.get_padding_right() 47 | self.pre_padding = np.array(left_context * [padding_left], dtype=INT) 48 | self.post_padding = np.array(right_context * [padding_right], dtype=INT) 49 | 50 | cpdef predict(self, list tokens): 51 | """ 52 | Classify a list of tokens. 53 | 54 | :param tokens: a list of tokens, each a list of attributes. 55 | :returns: the predicted class label 56 | """ 57 | cdef np.ndarray[int_t,ndim=2] converted = self.converter.convert(tokens) 58 | # add padding to the sentence 59 | cdef np.ndarray[int_t,ndim=2] padded_sentence = \ 60 | np.concatenate((self.pre_padding, converted, self.post_padding)) 61 | 62 | # allocate variables 63 | vars = self.nn.variables(len(padded_sentence)) 64 | # lookup layer 65 | self.converter.lookup(padded_sentence, vars.input) 66 | output = self.nn.forward(vars) 67 | return self.labels[np.argmax(vars.output)] 68 | 69 | def save(self, file): 70 | """ 71 | Saves the classifier to a file. 72 | """ 73 | netClass = type(self.nn) # fully qualified name 74 | pickle.dump(netClass.__module__+'.'+netClass.__name__, file) 75 | self.nn.save(file) 76 | pickle.dump(self.labels, file) 77 | pickle.dump((len(self.pre_padding), len(self.post_padding)), file) 78 | self.converter.save(file) 79 | 80 | @classmethod 81 | def load(cls, file): 82 | """ 83 | Loads the classifier from a file. 84 | """ 85 | classname = pickle.load(file) 86 | klass = import_class(classname) 87 | nn = klass.load(file) 88 | labels = pickle.load(file) 89 | (left_context, right_context) = pickle.load(file) 90 | converter = Converter() 91 | converter.load(file) 92 | 93 | return cls(converter, labels, left_context, right_context, nn=nn) 94 | -------------------------------------------------------------------------------- /deepnl/corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/env python 2 | # -*- coding: utf-8 -*- 3 | #cython: embedsignature=True 4 | 5 | """ 6 | Classes for reading/writing various types of corpora. 7 | """ 8 | 9 | # standard 10 | from __future__ import print_function 11 | import sys 12 | import codecs 13 | 14 | class ConllReader(object): 15 | """ 16 | An iterator over sentences read from a file in CoNLL TSV format. 17 | If the input is from a file, it can be iterated several times. 18 | """ 19 | def __init__(self, filename=None): 20 | self.filename = filename 21 | 22 | def __iter__(self): 23 | if self.filename: 24 | file = codecs.open(self.filename, 'r', 'utf-8', errors='ignore') 25 | else: 26 | file = codecs.getreader('utf-8')(sys.stdin) 27 | sent = [] 28 | for line in file: 29 | line = line.strip() 30 | if line: 31 | sent.append(line.split('\t')) 32 | else: 33 | yield sent 34 | sent = [] 35 | if sent: # just in case 36 | yield sent 37 | if self.filename: 38 | file.close() 39 | 40 | def count(self): 41 | """ 42 | :return: the number of sentences. 
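The count is obtained by scanning the file in 1 MB chunks and counting the blank-line separators ('\n\n') between sentences, rather than parsing the tokens.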
43 | """ 44 | empty_lines = 0 45 | buf_size = 1024 * 1024 46 | file = open(self.filename) 47 | read_f = file.read # loop optimization 48 | while True: 49 | buf = read_f(buf_size) 50 | if not buf: 51 | break 52 | # FIXME: this fails if \n\n is split between two buffers 53 | empty_lines += buf.count('\n\n') # empty lines 54 | if self.filename: 55 | file.close() 56 | return empty_lines 57 | 58 | # ---------------------------------------------------------------------- 59 | 60 | class ConllWriter(object): 61 | """ 62 | Prints one token per line as tokentag, 63 | with sentence separated by empty line. 64 | """ 65 | 66 | @classmethod 67 | def write(cls, sent): 68 | """ 69 | Prints a sentence to stdout in TSV format 70 | 71 | :param sent: the sentence to write. 72 | """ 73 | for token in sent: 74 | print('\t'.join([item.encode('utf-8') for item in token])) 75 | print() 76 | 77 | # ---------------------------------------------------------------------- 78 | 79 | class SrlWriter(object): 80 | 81 | @classmethod 82 | def write(cls, sent): 83 | """ 84 | :param sent: must be of type SRLAnnotatedSentence 85 | """ 86 | print(' '.join(sent.tokens).encode('utf-8')) 87 | for predicate, arg_structure in sent.arg_structures: 88 | print(predicate.encode('utf-8')) 89 | for label in arg_structure: 90 | argument = ' '.join(arg_structure[label]) 91 | line = '\t%s: %s' % (label, argument) 92 | print(line.encode('utf-8')) 93 | print() 94 | 95 | # ---------------------------------------------------------------------- 96 | 97 | class TsvReader(object): 98 | """ 99 | An iterator over examples read from a file in TSV format. 100 | If the input is from a file, it can be iterated several times. 101 | """ 102 | def __init__(self, filename=None): 103 | self.filename = filename 104 | 105 | def __iter__(self): 106 | if self.filename: 107 | file = codecs.open(self.filename, 'r', 'utf-8', errors='ignore') 108 | else: 109 | file = codecs.getreader('utf-8')(sys.stdin) 110 | for line in file: 111 | line = line.strip() 112 | if line: 113 | yield line.split('\t') 114 | if self.filename: 115 | file.close() 116 | -------------------------------------------------------------------------------- /deepnl/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Load word embeddings from different representations. 5 | """ 6 | 7 | from __future__ import print_function 8 | import os 9 | import numpy as np 10 | import logging 11 | from itertools import izip 12 | 13 | # local 14 | from word_dictionary import WordDictionary 15 | 16 | # ---------------------------------------------------------------------- 17 | 18 | class Plain(object): 19 | 20 | @classmethod 21 | def read_vectors(cls, filename): 22 | """ 23 | Read an embedding from a plain text file with one vector per 24 | line, values separated by whitespace. 25 | """ 26 | with open(filename, 'rb') as file: 27 | matrix = np.array([[float(value) for value in line.split()] 28 | for line in file]) 29 | return matrix 30 | 31 | @classmethod 32 | def read_vocabulary(cls, filename): 33 | """ 34 | Read a vocabulary file containing one word per line. 35 | Return a list of words. 36 | """ 37 | words = [] 38 | with open(filename, 'rb') as f: 39 | for line in f: 40 | word = unicode(line.strip(), 'utf-8') 41 | if word: 42 | words.append(word) 43 | return words 44 | 45 | @classmethod 46 | def write_vocabulary(cls, vocab, filename): 47 | """ 48 | Write a vocabulary to a file containing one word per line. 
49 | """ 50 | with open(filename, 'wb') as f: 51 | for word in vocab: 52 | print(word.encode('utf-8'), file=f) 53 | 54 | @classmethod 55 | def write_vectors(cls, filename, matrix): 56 | """ 57 | Write embedding vectors to a plain text file with one vector per 58 | line, values separated by whitespace. 59 | """ 60 | with open(filename, 'wb') as file: 61 | for row in matrix: 62 | print(' '.join(["%f" % x for x in row]), file=file) 63 | 64 | # ---------------------------------------------------------------------- 65 | 66 | class Senna(object): 67 | 68 | @classmethod 69 | def read_vocabulary(cls, filename): 70 | """ 71 | Read the vocabulary file used by SENNA. 72 | It has one word per line, all lower case except for the special words 73 | PADDING and UNKNOWN. 74 | 75 | """ 76 | return Plain.vocabulary(filename) 77 | 78 | # ---------------------------------------------------------------------- 79 | 80 | class Word2Embeddings(object): 81 | 82 | @classmethod 83 | def read_vocabulary(cls, filename): 84 | """ 85 | Read the vocabulary used with word2embeddings. 86 | It is the same as a plain text vocabulary, except the embeddings for 87 | the rare/unknown word are the first two items (before any word in the file). 88 | """ 89 | return Plain.vocabulary(filename, 'polyglot') 90 | 91 | @classmethod 92 | def read_vectors(cls, filename): 93 | """ 94 | Load the feature matrix used by word2embeddings. 95 | """ 96 | import cPickle as pickle 97 | 98 | with open(filename, 'rb') as f: 99 | model = pickle.load(f) 100 | return model.get_word_embeddings() 101 | 102 | # ---------------------------------------------------------------------- 103 | 104 | class Word2Vec(object): 105 | 106 | @classmethod 107 | def load(cls, filename): 108 | """ 109 | Load words and vectors from a file in word2vec format. 110 | """ 111 | words = [] 112 | vectors = [] 113 | with open(filename, 'rb') as f: 114 | len, size = f.readline().split() 115 | for line in f: 116 | items = line.split() 117 | word = unicode(items[0], 'utf-8') 118 | words.append(word) 119 | vectors.append([float(x) for x in items[1:]]) 120 | # vectors for the special symbols, not present in words, will be 121 | # created later 122 | return np.array(vectors), words 123 | 124 | @classmethod 125 | def save(cls, filename, words, vectors): 126 | """ 127 | Save words and vectors to a file in word2vec format. 128 | :param vectors: is a Numpy array 129 | """ 130 | with open(filename, 'wb') as f: 131 | print(len(words), vectors.shape[1], file=f) 132 | for word, vector in izip(words, vectors): 133 | print(word.encode('UTF-8'), ' '.join('%f' % w for w in vector), file=f) 134 | 135 | # ---------------------------------------------------------------------- 136 | 137 | def generate_vectors(num_vectors, num_features, min_value=-0.1, max_value=0.1): 138 | """ 139 | Generates vectors of real numbers, to be used as word features. 140 | Vectors are initialized randomly with values in the interval [min_value, max_value] 141 | :return: a 2-dim numpy array. 142 | """ 143 | # set the seed for replicability 144 | #np.random.seed(42) # DEBUG 145 | 146 | table = np.random.uniform(min_value, max_value, (num_vectors, num_features)) 147 | logging.debug("Generated %d feature vectors with %d features each." 
% 148 | (num_vectors, num_features)) 149 | 150 | return table 151 | -------------------------------------------------------------------------------- /deepnl/extractors.pxd: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language=c++ 3 | 4 | """ 5 | Feature extractors. 6 | """ 7 | 8 | cimport numpy as np 9 | 10 | from network cimport float_t, int_t 11 | from cpython cimport bool 12 | 13 | # ---------------------------------------------------------------------- 14 | 15 | cdef class Iterable: 16 | """ 17 | ABC for classes that provide the __iter__() method. 18 | """ 19 | 20 | # ---------------------------------------------------------------------- 21 | 22 | cdef class Converter(Iterable): 23 | """ 24 | Interface to the extractors. 25 | Extracts features from a sentence and converts them into a list of feature 26 | vectors in feature space. 27 | """ 28 | 29 | cdef readonly list extractors 30 | cdef readonly list fields 31 | 32 | cdef np.ndarray[int_t] get_padding_left(self) 33 | cdef np.ndarray[int_t] get_padding_right(self) 34 | 35 | cpdef int_t size(self) 36 | 37 | cpdef np.ndarray[int_t,ndim=2] convert(self, list sent) 38 | 39 | cpdef np.ndarray[float_t] lookup(self, 40 | np.ndarray[int_t,ndim=2] sentence, 41 | np.ndarray out=*) 42 | 43 | cpdef adaGradInit(self, float_t adaEps) 44 | 45 | cpdef update(self, np.ndarray[float_t] grads, np.ndarray[int_t,ndim=2] sentence, 46 | float_t learning_rate) 47 | 48 | cdef class Extractor(object): 49 | 50 | cdef readonly dict dict 51 | cdef readonly np.ndarray table 52 | cdef readonly np.ndarray adaGrads 53 | 54 | cpdef int_t size(self) 55 | 56 | cpdef adaGradInit(self, float_t adaEps) 57 | 58 | cpdef int_t get_padding_left(self) 59 | cpdef int_t get_padding_right(self) 60 | 61 | cpdef extract(self, list tokens, int_t field) 62 | 63 | cdef class Embeddings(Extractor): 64 | pass 65 | 66 | cdef class CapsExtractor(Extractor): 67 | pass 68 | 69 | cdef class AffixExtractor(Extractor): 70 | cdef bool lowcase 71 | 72 | cdef class SuffixExtractor(AffixExtractor): 73 | pass 74 | 75 | cdef class PrefixExtractor(AffixExtractor): 76 | pass 77 | 78 | cdef class GazetteerExtractor(Extractor): 79 | cdef bool lowcase 80 | cdef bool noaccents 81 | 82 | cdef class AttributeExtractor(Extractor): 83 | pass 84 | -------------------------------------------------------------------------------- /deepnl/hpca.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language = c++ 3 | # distutils: sources = deepnl/HPCA_impl.cpp 4 | # cython: profile=False 5 | 6 | """ 7 | Learn word embeddings from plain text using Hellinger PCA. 8 | 9 | See 10 | Lebret, Rémi, and Ronan Collobert. "Word Embeddings through Hellinger PCA." EACL 2014 (2014): 482. 
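A rough sketch of the intended pipeline (file names, sizes and the dtype conversion are illustrative assumptions, not fixed by this module):

    dm = cooccurrences('corpus.txt', 'vocab.txt', 10000, 5)  # top=10000 context words, window=5
    # cooccurrence_matrix already applies the Hellinger sqrt normalization and returns float32;
    # fit() works on double precision arrays, hence the cast
    embeddings = fit(dm.astype(float), 50)                   # reduce to 50 dimensions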
11 | 12 | Author: Giuseppe Attardi 13 | """ 14 | 15 | import sys # DEBUG 16 | import numpy as np 17 | cimport numpy as np 18 | 19 | # for method decorations 20 | cimport cython 21 | 22 | import threading 23 | 24 | from scipy.linalg.lapack import ssyevr 25 | from numpy.linalg import svd 26 | 27 | import logging 28 | logger = logging.getLogger(__name__) 29 | 30 | from network cimport float_t 31 | 32 | # ---------------------------------------------------------------------- 33 | 34 | cdef extern from "HPCA.h" namespace "hpca" nogil: 35 | object cooccurrence_matrix(char* corpus, char* vocabFile, int top, int window) 36 | 37 | # ---------------------------------------------------------------------- 38 | 39 | def cooccurrences(char* corpus, char* vocabFile, unsigned top, unsigned window): 40 | """ 41 | Compute the cooccurrence matrix on a corpus of sentences, using as 42 | vocabulary the list of words present in :param vocabFile: and as 43 | context words the first :param top: words in the vocabulary. 44 | :param corpus: file containing text. 45 | """ 46 | return cooccurrence_matrix(corpus, vocabFile, top, window) 47 | 48 | cpdef np.ndarray[float_t,ndim=2] fit(np.ndarray[float_t,ndim=2] dm, 49 | int n_components, bint covariance=False): 50 | """ 51 | Compute SVD on :param dm:. 52 | :param covariance: use covariance matrix. 53 | :return: the representation of dm reduced to :param n_components: dimensions. 54 | """ 55 | 56 | cdef np.ndarray[float_t,ndim=2] cov 57 | cdef int cols = dm.shape[1] 58 | 59 | if covariance: 60 | # use lapack SSYEVR 61 | # ask just for the largest eigenvalues 62 | # (cols x rows) (rows x cols) = (cols x cols) 63 | cov = dm.T.dot(dm) 64 | w, z, info = ssyevr(cov, range='I', il=cols-n_components+1, overwrite_a=1) 65 | return dm.dot(z) 66 | else: 67 | # use lapack _gesdd 68 | u, s, v = svd(dm, full_matrices=False) 69 | # v = (cols x cols), rows are eigenvectors 70 | # v.T = (cols x cols), cols are eigenvectors 71 | # (rows x cols) (cols x comp) = (rows x comp) 72 | return dm.dot(v.T[:,:n_components]) 73 | # alternative using scipy.sparse.linalg.svds 74 | # u, s, v = svds(dm, n_components) 75 | -------------------------------------------------------------------------------- /deepnl/math.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | cimport numpy as np 4 | 5 | from network cimport float_t, int_t 6 | 7 | # FIXHIM: no overloading in Cython 8 | cdef np.ndarray[float_t] softmax(np.ndarray[float_t] a, np.ndarray out=*) 9 | cdef np.ndarray[float_t, ndim=2] softmax2d(np.ndarray[float_t, ndim=2] a, int_t axis=*, np.ndarray out=*) 10 | 11 | # FIXHIM: no overloading in Cython 12 | cdef float_t logsumexp(np.ndarray[float_t] a) 13 | cdef np.ndarray[float_t] logsumexp2d(np.ndarray[float_t, ndim=2] a, int_t axis=*) 14 | 15 | cdef np.ndarray[float_t] tanh(np.ndarray[float_t] weights, 16 | np.ndarray out=*) 17 | cdef np.ndarray[float_t] tanhe(np.ndarray[float_t] y, 18 | np.ndarray out=*) 19 | 20 | cdef np.ndarray[float_t] hardtanh(np.ndarray[float_t] weights, 21 | np.ndarray out=*) 22 | 23 | cdef np.ndarray[float_t] hardtanhd(np.ndarray[float_t] y, 24 | np.ndarray out=*) 25 | cdef np.ndarray[float_t, ndim=2] hardtanhd2d(np.ndarray[float_t, ndim=2] y, 26 | np.ndarray out=*) 27 | 28 | cdef np.ndarray[float_t] hardtanh_back(np.ndarray[float_t] y, 29 | np.ndarray[float_t] grads, 30 | np.ndarray[float_t] grads_in) 31 | 32 | cdef np.ndarray[float_t, ndim=2] hardtanh_back2d(np.ndarray[float_t, ndim=2] y, 33 | 
np.ndarray[float_t, ndim=2] grads, 34 | np.ndarray[float_t, ndim=2] grads_in) 35 | 36 | cdef np.ndarray[float_t] hardtanhe(np.ndarray[float_t] y, 37 | np.ndarray out=*) 38 | cdef np.ndarray[float_t, ndim=2] hardtanhe2d(np.ndarray[float_t, ndim=2] y, 39 | np.ndarray out=*) 40 | 41 | -------------------------------------------------------------------------------- /deepnl/math.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c++ 2 | 3 | cimport numpy as np 4 | import numpy as np 5 | 6 | # ---------------------------------------------------------------------- 7 | # Math functions 8 | 9 | cdef np.ndarray[float_t] softmax(np.ndarray[float_t] a, np.ndarray out=None): 10 | """Compute the ratio of exp(a) to the sum of exponentials. 11 | 12 | Parameters 13 | ---------- 14 | a : array_like 15 | Input array. 16 | out : array_like, optional 17 | Alternative output array in which to place the result. 18 | 19 | Returns 20 | ------- 21 | res : ndarray 22 | The result, ``np.exp(a)/(np.sum(np.exp(a), axis))`` calculated in a numerically stable way. 23 | """ 24 | if out is None: 25 | out = np.empty_like(a) 26 | np.exp(a - a.max(), out) 27 | out /= np.sum(out) 28 | return out 29 | 30 | cdef np.ndarray[float_t, ndim=2] softmax2d(np.ndarray[float_t, ndim=2] a, int_t axis=0, np.ndarray out=None): 31 | """Compute the ratio of exp(a) to the sum of exponentials along the axis. 32 | 33 | Parameters 34 | ---------- 35 | a : array_like 36 | Input array. 37 | axis : int, optional 38 | Axis over which the sum is taken. By default `axis` is 0, 39 | out : array_like, optional 40 | Alternative output array in which to place the result. 41 | 42 | Returns 43 | ------- 44 | res : ndarray 45 | The result, ``np.exp(a)/(np.sum(np.exp(a), axis))`` calculated in a numerically stable way. 46 | """ 47 | if out is None: 48 | out = np.empty_like(a) 49 | np.exp(a - a.max(axis), out) 50 | out /= np.sum(out, axis) 51 | return out 52 | 53 | cdef float_t logsumexp(np.ndarray[float_t] a): 54 | """Compute the log of the sum of exponentials of input elements. 55 | like: scipy.misc.logsumexp 56 | 57 | Parameters 58 | ---------- 59 | a : array_like 60 | Input array. 61 | 62 | Returns 63 | ------- 64 | res : ndarray 65 | The result, ``np.log(np.sum(np.exp(a)))`` calculated in a numerically 66 | more stable way. 67 | """ 68 | a_max = a.max() 69 | return np.log(np.sum(np.exp(a - a_max))) + a_max 70 | 71 | cdef np.ndarray[float_t] logsumexp2d(np.ndarray[float_t,ndim=2] a, int_t axis=0): 72 | """Compute the log of the sum of exponentials of input elements. 73 | like: scipy.misc.logsumexp 74 | 75 | Parameters 76 | ---------- 77 | a : array_like 78 | Input array. 79 | axis : int 80 | Axis over which the sum is taken. 81 | 82 | Returns 83 | ------- 84 | res : ndarray 85 | The result, ``np.log(np.sum(np.exp(a)))`` calculated in a numerically 86 | more stable way. 87 | """ 88 | a = np.rollaxis(a, axis) 89 | a_max = a.max(axis=0) 90 | return np.log(np.sum(np.exp(a - a_max), axis=0)) + a_max 91 | 92 | cdef np.ndarray[float_t] tanh(np.ndarray[float_t] weights, 93 | np.ndarray out=None): 94 | """Hyperbolic tangent. 95 | out : array_like, optional 96 | Alternative output array in which to place the result. 97 | """ 98 | if out is None: 99 | out = np.empty_like(weights) 100 | np.tanh(weights, out) 101 | return out 102 | 103 | cdef np.ndarray[float_t] tanhe(np.ndarray[float_t] y, 104 | np.ndarray out=None): 105 | """Hyperbolic tangent. 
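More precisely, given y = tanh(x) this computes the derivative tanh'(x) = 1 - y**2.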
106 | out : array_like, optional 107 | Alternative output array in which to place the result. 108 | """ 109 | if out is None: 110 | out = np.empty_like(y) 111 | out[:] = 1 - y**2 112 | return out 113 | 114 | cdef np.ndarray[float_t] hardtanh(np.ndarray[float_t] y, 115 | np.ndarray out=None): 116 | """Hard hyperbolic tangent. 117 | out : array_like, optional 118 | Alternative output array in which to place the result. 119 | """ 120 | it = np.nditer([y, out], 121 | op_flags = [['readonly'], 122 | ['writeonly', 'allocate', 'no_broadcast']]) 123 | for w, o in it: 124 | if w < -1: 125 | o[...] = -1 126 | elif w > 1: 127 | o[...] = 1 128 | else: 129 | o[...] = w 130 | return it.operands[1] 131 | 132 | cdef np.ndarray[float_t] hardtanhd(np.ndarray[float_t] y, 133 | np.ndarray out=None): 134 | """derivative of hardtanh 135 | out : array_like, optional 136 | Alternative output array in which to place the result. 137 | """ 138 | it = np.nditer([y, out], 139 | op_flags = [['readonly'], 140 | ['writeonly', 'allocate', 'no_broadcast']]) 141 | for w, o in it: 142 | if -1.0 <= w <= 1.0: 143 | o[...] = 1.0 144 | else: 145 | o[...] = 0.0 146 | return it.operands[1] 147 | 148 | cdef np.ndarray[float_t, ndim=2] hardtanhd2d(np.ndarray[float_t, ndim=2] y, 149 | np.ndarray out=None): 150 | """derivative of hardtanh 151 | out : array_like, optional 152 | Alternative output array in which to place the result. 153 | """ 154 | it = np.nditer([y, out], 155 | op_flags = [['readonly'], 156 | ['writeonly', 'allocate', 'no_broadcast']]) 157 | for w, o in it: 158 | if -1.0 <= w <= 1.0: 159 | o[...] = 1.0 160 | else: 161 | o[...] = 0.0 162 | return it.operands[1] 163 | 164 | cdef np.ndarray[float_t] hardtanh_back(np.ndarray[float_t] y, 165 | np.ndarray[float_t] grads, 166 | np.ndarray[float_t] grads_in): 167 | """backward of hardtanh in terms of y = hardtanh(x) 168 | Propagates the output gradients to the input, by multiplying with the 169 | derivative p hardtanh. 170 | grads: gradients of output. 171 | grads_in : output array in which to place the result. 172 | """ 173 | it = np.nditer([y, grads, grads_in], 174 | op_flags = [['readonly'], ['readonly'], ['writeonly']]) 175 | for w, g, o in it: 176 | if w == -1.0 or w == 1.0: 177 | o[...] = 0.0 178 | else: 179 | o[...] = g[...] 180 | return grads_in 181 | 182 | cdef np.ndarray[float_t, ndim=2] hardtanh_back2d(np.ndarray[float_t, ndim=2] y, 183 | np.ndarray[float_t, ndim=2] grads_out, 184 | np.ndarray[float_t, ndim=2] grads_in): 185 | """derivative of hardtanh in terms of y = hardtanh(x) 186 | Propagates the output gradients to the input, by multiplying with the 187 | derivative of hardtanh. 188 | grads_out: gradients of output. 189 | grads_in: array in which to place the result. 190 | """ 191 | it = np.nditer([y, grads_out, grads_in], 192 | op_flags = [['readonly'], ['readonly'], ['writeonly']]) 193 | for w, gout, gin in it: 194 | if w == -1.0 or w == 1.0: 195 | gin[...] = 0.0 196 | else: 197 | gin[...] = gout[...] 198 | return grads_in 199 | 200 | cdef np.ndarray[float_t] hardtanhe(np.ndarray[float_t] y, 201 | np.ndarray out=None): 202 | """derivative of hardtanh in terms of y = hardtanh(x) 203 | out : array_like, optional 204 | Alternative output array in which to place the result. 205 | """ 206 | it = np.nditer([y, out], 207 | op_flags = [['readonly'], 208 | ['writeonly', 'allocate', 'no_broadcast']]) 209 | for w, o in it: 210 | if w == -1.0 or w == 1.0: 211 | o[...] = 0.0 212 | else: 213 | o[...] 
= 1.0 214 | return it.operands[1] 215 | 216 | cdef np.ndarray[float_t, ndim=2] hardtanhe2d(np.ndarray[float_t, ndim=2] y, 217 | np.ndarray out=None): 218 | """derivative of hardtanh in terms of y = hardtanh(x) 219 | out: array_like, optional 220 | Alternative output array in which to place the result. 221 | """ 222 | it = np.nditer([y, out], 223 | op_flags = [['readonly'], 224 | ['writeonly', 'allocate', 'no_broadcast']]) 225 | for w, o in it: 226 | if w == -1.0 or w == 1.0: 227 | o[...] = 0.0 228 | else: 229 | o[...] = 1.0 230 | return it.operands[1] 231 | -------------------------------------------------------------------------------- /deepnl/ner_tagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | NER tagger exploiting a deep neural network. 5 | """ 6 | 7 | # standard 8 | import sys 9 | from itertools import izip 10 | 11 | # local 12 | from tagger import Tagger 13 | from reader import TaggerReader 14 | from corpus import * 15 | 16 | # ---------------------------------------------------------------------- 17 | 18 | class ToIOBES(object): 19 | """Convert from IOB to IOBES notation: 20 | Begin 21 | Inside 22 | Outside 23 | Single 24 | End 25 | """ 26 | 27 | def __init__(self, iterable, tagField): 28 | self.iterable = iterable 29 | self.tagField = tagField 30 | 31 | def __iter__(self): 32 | # tokens are lists [form, ..., tag] 33 | for sent in self.iterable: 34 | l = len(sent) 35 | for i, tok in enumerate(sent): 36 | if i+1 == l or sent[i+1][self.tagField][0] != 'I': 37 | if tok[self.tagField][0] == 'B': 38 | tok[self.tagField] = 'S'+tok[self.tagField][1:] 39 | elif tok[self.tagField][0] == 'I': 40 | tok[self.tagField] = 'E'+tok[self.tagField][1:] 41 | yield sent 42 | 43 | class NerReader(TaggerReader): 44 | """ 45 | This class reads data from a CoNLL03 corpus and turns it into a format 46 | readable by the neural network for the NER tagging task. 47 | """ 48 | 49 | def read(self, filename): 50 | """ 51 | :param filename: the name of a file in CoNLL TSV format. 52 | """ 53 | return ToIOBES(ConllReader(filename), self.tagField) 54 | 55 | # ---------------------------------------------------------------------- 56 | 57 | class NerTagger(Tagger): 58 | """Performs NER tagging on sentences.""" 59 | 60 | def tag(self, sent, tagField=-1): 61 | tags = self.toIOB(super(NerTagger, self).tag(sent)) 62 | for tok,tag in izip(sent, tags): 63 | tok[tagField] = tag 64 | return sent 65 | 66 | def toIOB(self, tags): 67 | """ 68 | Convert back from IOBES to IOB notation. 69 | """ 70 | res = [] 71 | for tag in tags: 72 | if tag[0] == 'S': 73 | res.append('B'+tag[1:]) 74 | elif tag[0] == 'E': 75 | res.append('I'+tag[1:]) 76 | else: 77 | res.append(tag) 78 | return res 79 | 80 | -------------------------------------------------------------------------------- /deepnl/network.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | cimport numpy as np 4 | 5 | # Use double floats 6 | ctypedef double float_t 7 | # Use 32bit int 8 | ctypedef int int_t 9 | # dtype('int32') 10 | from numpy import int32 as INT 11 | 12 | cdef class Variables(object): 13 | """Visible and hidden variables. 14 | Unique to thread""" 15 | 16 | cdef public np.ndarray input, hidden, output 17 | 18 | cdef class Parameters(object): 19 | """ 20 | Network parameters: weights and biases. 21 | Shared by threads. 
22 | """ 23 | 24 | cdef public np.ndarray hidden_weights, hidden_bias 25 | cdef public np.ndarray output_weights, output_bias 26 | 27 | cdef copy(self, Parameters p) 28 | # cpdef since it is called with super 29 | cpdef update(self, Gradients grads, float_t learning_rate, 30 | Parameters ada=*) 31 | 32 | cdef class Gradients(Parameters): 33 | 34 | # gradients for input variables 35 | cdef public np.ndarray input 36 | 37 | cdef class Network(object): 38 | 39 | cdef public Parameters p 40 | 41 | # sizes (public for loading) 42 | cdef public int input_size, hidden_size, output_size 43 | 44 | # function to save periodically 45 | cdef public object saver 46 | 47 | cdef variables(self, int slen=*) 48 | cdef gradients(self, int slen=*) 49 | cdef parameters(self) 50 | 51 | cpdef forward(self, Variables vars) 52 | 53 | cdef float_t backpropagate(self, int y, Variables vars, Gradients grads) 54 | 55 | # cpdef since used with super 56 | cpdef update(self, Gradients grads, float_t learning_rate, Parameters ada=*) 57 | -------------------------------------------------------------------------------- /deepnl/network.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language = c++ 3 | # cython: embedsignature=True 4 | # cython: profile=True 5 | 6 | """ 7 | A neural network for NLP tagging tasks. 8 | """ 9 | 10 | # standard 11 | from __future__ import print_function 12 | import logging 13 | import sys # DEBUG 14 | 15 | import numpy as np 16 | cimport numpy as np 17 | 18 | # for decorations 19 | cimport cython 20 | 21 | # local 22 | from math cimport * 23 | 24 | # ---------------------------------------------------------------------- 25 | 26 | cdef class Variables(object): 27 | 28 | #cdef public np.ndarray input, hidden, output 29 | 30 | def __init__(self, input_size=0, hidden_size=0, output_size=0): 31 | self.input = np.empty(input_size) if input_size else None 32 | self.hidden = np.empty(hidden_size) if hidden_size else None 33 | self.output = np.empty(output_size) if output_size else None 34 | 35 | # ---------------------------------------------------------------------- 36 | 37 | cdef class Parameters(object): 38 | """ 39 | Network parameters: weights and biases. 40 | Parameters are shared among threads in ASGD. 41 | """ 42 | 43 | def __init__(self, int input_size, int hidden_size, int output_size): 44 | self.output_weights = np.zeros((output_size, hidden_size)) 45 | self.output_bias = np.zeros(output_size) 46 | self.hidden_weights = np.zeros((hidden_size, input_size)) 47 | self.hidden_bias = np.zeros(hidden_size) 48 | 49 | def clear(self, val=0.0): 50 | self.output_weights[:,:] = val 51 | self.output_bias[:] = val 52 | self.hidden_weights[:,:] = val 53 | self.hidden_bias[:] = val 54 | 55 | def initialize(self, int input_size, int hidden_size, int output_size): 56 | """ 57 | Creates the weight matrices with random values. 58 | """ 59 | # We must pass sizes since one cannot get shape from Cython ndarray 60 | 61 | # Note : optimal initialization of weights may depend on the 62 | # activation function used (among other things). 63 | # For example, results presented in [Xavier10] suggest that you 64 | # should use 4 times larger initial weights for sigmoid() 65 | # compared to tanh(). 
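        # Illustrative example (assuming, say, a 5-token window of
        # 50-dimensional embeddings, i.e. input_size = 250): the
        # [Bottou-88] bound used below gives high = 2.38 / sqrt(250) ~= 0.15,
        # so the hidden weights start out roughly uniform in [-0.15, 0.15].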
66 | 67 | # set the seed for replicability 68 | #np.random.seed(42) # DEBUG 69 | 70 | high = 2.38 / np.sqrt(input_size) # [Bottou-88] 71 | self.hidden_weights = np.random.uniform(-high, high, (hidden_size, input_size)) 72 | self.hidden_bias = np.random.uniform(-high, high, (hidden_size)) 73 | 74 | high = 2.38 / np.sqrt(hidden_size) # [Bottou-88] 75 | self.output_weights = np.random.uniform(-high, high, (output_size, hidden_size)) 76 | self.output_bias = np.random.uniform(-high, high, (output_size)) 77 | 78 | cdef copy(self, Parameters p): 79 | """Used in ASGD""" 80 | self.output_weights[:,:] = p.output_weights 81 | self.output_bias[:] = p.output_bias 82 | self.hidden_weights[:,:] = p.hidden_weights 83 | self.hidden_bias[:] = p.hidden_bias 84 | 85 | def addSquare(self, Gradients grads): 86 | """For adaGrad""" 87 | self.output_weights += grads.output_weights * grads.output_weights 88 | self.output_bias += grads.output_bias * grads.output_bias 89 | self.hidden_weights += grads.hidden_weights * grads.hidden_weights 90 | self.hidden_bias += grads.hidden_bias * grads.hidden_bias 91 | 92 | cpdef update(self, Gradients grads, float_t learning_rate, 93 | Parameters ada=None): 94 | """ 95 | Adjust the weights. 96 | :param ada: cumulative square gradients for performing AdaGrad. 97 | AdaGrad: G_t, where G(i,i)_t = G(i,i)_t-1 + grad(i)^2 98 | * i.e. we cumulate the square of gradients in G for parameter p: 99 | * G += g^2 100 | * p -= LR * g / sqrt(G + eps) 101 | 102 | Consider using AdaDelta instead: 103 | http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf 104 | 105 | """ 106 | if ada: 107 | # print('ada', ada.hidden_weights[:5,:5], ada.hidden_weights[-5:,-5:], file=sys.stderr) # DEBUG 108 | ada.addSquare(grads) 109 | self.output_weights += learning_rate * grads.output_weights / np.sqrt(ada.output_weights) 110 | self.output_bias += learning_rate * grads.output_bias / np.sqrt(ada.output_bias) 111 | 112 | # print('ada', ada.hidden_weights[:5,:5], ada.hidden_weights[-5:,-5:], file=sys.stderr) # DEBUG 113 | # print('uhw', learning_rate, adaEps, self.hidden_weights[:5,:5], grads.hidden_weights[:5,:5], grads.hidden_weights[-5:,-5:], file=sys.stderr) # DEBUG 114 | 115 | self.hidden_weights += learning_rate * grads.hidden_weights / np.sqrt(ada.hidden_weights) 116 | self.hidden_bias += learning_rate * grads.hidden_bias / np.sqrt(ada.hidden_bias) 117 | else: 118 | # divide by the fan-in 119 | self.output_weights += grads.output_weights * learning_rate / 100 # DEBUG / self.hidden_size 120 | self.output_bias += grads.output_bias * learning_rate / 100 # DEBUG self.hidden_size 121 | # print('uhw', learning_rate, self.hidden_weights[:2,:2], grads.hidden_weights[:2,:2], file=sys.stderr) # DEBUG 122 | self.hidden_weights += grads.hidden_weights * learning_rate / 100 # DEBUG self.input_size 123 | self.hidden_bias += grads.hidden_bias * learning_rate / 100 # DEBUG self.input_size 124 | 125 | def save(self, file): 126 | """ 127 | Saves the parameters to a file. 128 | It saves the weights and biases. 129 | """ 130 | np.savez(file, hidden_weights=self.hidden_weights, 131 | output_weights=self.output_weights, 132 | hidden_bias=self.hidden_bias, output_bias=self.output_bias) 133 | 134 | @classmethod 135 | def load(cls, file): 136 | """ 137 | Loads the neural network from a file. 138 | It loads weights, biases and sizes. 
139 | """ 140 | data = np.load(file) 141 | 142 | p = cls.__new__(cls) 143 | p.hidden_weights = data['hidden_weights'] 144 | p.output_weights = data['output_weights'] 145 | p.hidden_bias = data['hidden_bias'] 146 | p.output_bias = data['output_bias'] 147 | 148 | return p 149 | 150 | # ---------------------------------------------------------------------- 151 | 152 | cdef class Gradients(Parameters): 153 | """ 154 | Gradients for all network Parameters, plus input gradients. 155 | """ 156 | 157 | # gradients for input variables 158 | #cdef public np.ndarray input 159 | 160 | def __init__(self, int input_size, int hidden_size, int output_size): 161 | super(Gradients, self).__init__(input_size, hidden_size, output_size) 162 | self.input = np.zeros(input_size) 163 | 164 | def clear(self, val=0.0): 165 | self.output_weights.fill(val) 166 | self.output_bias.fill(val) 167 | self.hidden_weights.fill(val) 168 | self.hidden_bias.fill(val) 169 | self.input.fill(val) 170 | 171 | # ---------------------------------------------------------------------- 172 | 173 | cdef class Network(object): 174 | """ 175 | Basic neural network. 176 | Parameters are the weights of the various layers. 177 | """ 178 | # FIXME: make Parameters an array of Layers, each one with its 179 | # forward/backward/update methods. 180 | 181 | def __init__(self, int input_size, int hidden_size, int output_size, 182 | p=None): 183 | """ 184 | :param input_size: number of input variables 185 | :param hidden_size: number of hidden variables 186 | :param output_size: number of output variables 187 | :param p: the parameters to use. 188 | 189 | For replicability, the seed should have been set, e.g. to 190 | np.random.seed(42) 191 | """ 192 | self.input_size = input_size 193 | self.hidden_size = hidden_size 194 | self.output_size = output_size 195 | 196 | if not p: 197 | p = Parameters(input_size, hidden_size, output_size) 198 | # initialize parameters to random values 199 | p.initialize(input_size, hidden_size, output_size) 200 | self.p = p 201 | 202 | # saver fuction 203 | self.saver = lambda nn: None 204 | 205 | def description(self): 206 | """ 207 | Returns a textual description of the network. 208 | """ 209 | desc = """ 210 | Input layer size: %d 211 | Hidden layer size: %d 212 | Output size: %d 213 | """ % (self.input_size, self.hidden_size, self.output_size) 214 | 215 | return desc 216 | 217 | cdef variables(self, int slen=1): 218 | """Allocate variables. 219 | :param slen: sequence length (for sequence or convolutional networks) 220 | """ 221 | return Variables(self.input_size * slen, self.hidden_size * slen, self.output_size * slen) 222 | 223 | cdef gradients(self, int slen=1): 224 | """Allocate variables. 225 | :param slen: sequence length (for sequence or convolutional networks) 226 | """ 227 | return Gradients(self.input_size, self.hidden_size, self.output_size) 228 | 229 | cdef parameters(self): 230 | """Allocate network parameters. 231 | """ 232 | return Parameters(self.input_size, self.hidden_size, self.output_size) 233 | 234 | cpdef forward(self, Variables vars): 235 | """ 236 | Runs the network on the given variables: hidden and visible. 237 | """ 238 | # (hidden_size, input_size) . 
input_size = hidden_size 239 | self.p.hidden_weights.dot(vars.input, vars.hidden) 240 | vars.hidden += self.p.hidden_bias 241 | hardtanh(vars.hidden, vars.hidden) 242 | self.p.output_weights.dot(vars.hidden, vars.output) 243 | vars.output += self.p.output_bias 244 | 245 | cdef float_t backpropagate(self, int y, Variables vars, Gradients grads): 246 | """ 247 | Cost is the hinge loss. 248 | Compute the gradients of the cost for each layer. 249 | :param y: the correct outcome. 250 | :param vars: the network variables. 251 | :param grads: were to store the gradients. 252 | :return: the hinge loss. 253 | """ 254 | 255 | # Multiclass hinge loss (Crammer&Singer): 256 | # hl(x, y) = max(0, 1 + max_t!=y f(x)[t] - f(x)[y]) 257 | # Hinge loss is 0 if the score of the correct label exceeds the score 258 | # of every other label by a margin of at least 1. 259 | # m = argmax_t!=y f(x)[t] 260 | # dhl / df [y] = -1 if f(x)[m] - f(x)[y] > 1, else 0 261 | # dhl / df [t] = +1 if f(x)[t] - f(x)[y] > 1, else 0 262 | cdef float_t fx_y = vars.output[y] 263 | cdef float_t fx_m = np.NINF # negative infinity 264 | cdef int i 265 | cdef float_t v 266 | for i, v in enumerate(vars.output): 267 | if i == y: 268 | continue 269 | if v > fx_m: 270 | fx_m = v 271 | cdef float_t hinge_loss = max(0.0, 1 + fx_m - fx_y) 272 | 273 | if hinge_loss == 0.0: 274 | return hinge_loss 275 | cdef Parameters p = self.p 276 | 277 | # minimizing C(f_4) 278 | # f_4 = W_2 f_3 + b_2 279 | # dC / db_2 = dC / df_4 280 | # negative gradient: 281 | grads.output_bias[:] = np.where(vars.output - fx_y > -1, -1, 0) # -1 282 | grads.output_bias[y] = +1 283 | 284 | # dC / dW_2 = dC / df_4 f_3 285 | # (output_size) x (hidden_size) = (output_size, hidden_size) 286 | np.outer(grads.output_bias, vars.hidden, grads.output_weights) 287 | # dC / df_3 = dC / df_4 * W_2 288 | # (output_size) * (output_size, hidden_size) = (hidden_size) 289 | grads.output_bias.dot(p.output_weights, grads.hidden_bias) # temporary 290 | 291 | # f_3 = hardtanh(f_2) 292 | # dC / df_2 = dC / df_3 * hardtanh'(f_2) 293 | hardtanh_back(vars.hidden, grads.hidden_bias, grads.hidden_bias) 294 | 295 | # f_2 = W_1 f_1 + b_1 296 | # dC / db_1 = dC / df_2 297 | 298 | # dC / dW_1 = dC / df_2 * f_1 299 | # (hidden_size) x (input_size) = (hidden_size, input_size) 300 | np.outer(grads.hidden_bias, vars.input, grads.hidden_weights) 301 | 302 | # dC / df_1 = dC / df_2 * W_1 303 | # (hidden_size) * (hidden_size, input_size) = (input_size) 304 | grads.hidden_bias.dot(p.hidden_weights, grads.input) 305 | 306 | # Lookup layer 307 | # f_1 = W_0 f_0 308 | # dC / dW_0 = dC / df_1 * W_0 309 | 310 | return hinge_loss 311 | 312 | cpdef update(self, Gradients grads, float_t learning_rate, 313 | Parameters ada=None): 314 | self.p.update(grads, learning_rate, ada) 315 | 316 | def save(self, file): 317 | """ 318 | Saves the neural network to a file. 319 | It saves the parameters. 320 | """ 321 | self.p.save(file) 322 | 323 | @classmethod 324 | def load(cls, file): 325 | """ 326 | Loads the neural network from a file. 327 | It loads weights, biases and sizes. 
328 | """ 329 | nn = cls.__new__(cls) 330 | nn.p = Parameters.load(file) 331 | nn.input_size = nn.p.hidden_weights.shape[1] 332 | nn.hidden_size = nn.p.hidden_weights.shape[0] 333 | nn.output_size = nn.p.output_weights.shape[0] 334 | 335 | return nn 336 | -------------------------------------------------------------------------------- /deepnl/networkconv.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | from network cimport * 4 | 5 | cdef class ConvVariables(Variables): 6 | """Visible and hidden variables. 7 | Unique to thread""" 8 | 9 | cdef public np.ndarray hidden2 10 | # convolution layer 11 | cdef readonly np.ndarray conv 12 | 13 | cdef class ConvParameters(Parameters): 14 | """ 15 | Network parameters: weights and biases. 16 | Shared by threads. 17 | """ 18 | 19 | # the second hidden layer 20 | cdef public np.ndarray hidden2_weights, hidden2_bias 21 | 22 | cpdef update(self, Gradients grads, float_t learning_rate, Parameters ada=*) 23 | 24 | cdef class ConvGradients(Gradients): 25 | 26 | cdef public np.ndarray hidden2_weights, hidden2_bias 27 | cdef readonly np.ndarray conv 28 | 29 | cdef class ConvolutionalNetwork(Network): 30 | 31 | cdef public int hidden2_size 32 | cdef public int pool_size 33 | 34 | cdef np.ndarray[float_t] predict(self, list tokens) 35 | 36 | cdef float_t backpropagate(self, int y, Variables vars, Gradients grads) 37 | -------------------------------------------------------------------------------- /deepnl/networkseq.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | from network cimport * 4 | 5 | cdef class SeqParameters(Parameters): 6 | 7 | # transitions 8 | cdef public np.ndarray transitions 9 | 10 | cdef class SeqGradients(Gradients): 11 | 12 | # gradients for output variables 13 | cdef public np.ndarray output 14 | # gradients for hidden variables 15 | cdef public np.ndarray hidden 16 | cdef public np.ndarray transitions 17 | 18 | cdef class SequenceNetwork(Network): 19 | 20 | cdef public np.ndarray input_sequence 21 | # FIXME: put in SeqVariables 22 | cdef public np.ndarray hidden_sequence 23 | 24 | # FIXME: clash with method in Network 25 | cdef float_t backpropagateSeq(self, sent_tags, scores, SeqGradients grads, float_t skipErr) 26 | 27 | cdef _backpropagate(self, SeqGradients grads) 28 | 29 | cdef np.ndarray[float_t,ndim=2] _calculate_delta(self, scores) 30 | 31 | cdef float_t _calculate_gradients_sll(self, np.ndarray[int_t] tags, 32 | SeqGradients grads, 33 | np.ndarray[float_t,ndim=2] scores, 34 | float_t skipErr) 35 | 36 | cdef float_t _calculate_gradients_wll(self, np.ndarray[int_t] tags, 37 | SeqGradients grads, 38 | np.ndarray[float_t,ndim=2] scores) 39 | 40 | cpdef np.ndarray[int_t] _viterbi(self, 41 | np.ndarray[float_t,ndim=2] scores) 42 | -------------------------------------------------------------------------------- /deepnl/pos_tagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | POS tagger exploiting a deep neural network. 
5 | """ 6 | 7 | # standard 8 | import sys 9 | from __future__ import print_function 10 | 11 | # local 12 | from network import Network 13 | from tagger import Tagger 14 | from reader import PosReader 15 | from corpus import * 16 | 17 | # ---------------------------------------------------------------------- 18 | 19 | class PosTagger(Tagger): 20 | """A PosTagger loads the model and performs POS tagging on text.""" 21 | 22 | def tag(self, file=sys.stdout): 23 | """ 24 | :param filename: the file from which to read, stdin if missing. 25 | """ 26 | reader = PosReader(file) 27 | writer = ConllWriter() 28 | for sent in reader: 29 | print(writer.write(self.tag_sequence(sent))) 30 | 31 | -------------------------------------------------------------------------------- /deepnl/reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/env python 2 | # -*- coding: utf-8 -*- 3 | #cython: embedsignature=True 4 | 5 | """ 6 | Classes for reading various types of corpora. 7 | """ 8 | 9 | # standard 10 | import os 11 | import logging 12 | import numpy as np 13 | from collections import Counter 14 | import gzip 15 | 16 | # local 17 | from corpus import * 18 | from embeddings import Plain 19 | 20 | class Reader(object): 21 | """ 22 | Abstract class for corpus readers. 23 | """ 24 | 25 | # force class to be abstract 26 | #__metaclass__ = abc.ABCMeta 27 | 28 | def create_vocabulary(self, sentences, size, min_occurrences=3): 29 | """ 30 | Create vocabulary from sentences. 31 | :param sentences: an iterable on sentences. 32 | :param size: size of the vocabulary 33 | :param min_occurrences: Minimum number of times that a token must 34 | appear in the text in order to be included in the dictionary. 35 | Sentence tokens are lists [form, ..., tag] 36 | """ 37 | c = Counter() 38 | for sent in sentences: 39 | for token, in sent: 40 | c[token] += 1 41 | common = c.most_common(size) 42 | words = [w for w, n in common if n >= min_occurrences] 43 | return words 44 | 45 | def load_vocabulary(self, filename): 46 | return Plain.read_vocabulary(filename) 47 | 48 | # ---------------------------------------------------------------------- 49 | 50 | class TextReader(Reader): 51 | """ 52 | Reads sentences from tokenized text file. 53 | """ 54 | 55 | def __init__(self, variant=None): 56 | """ 57 | :param sentences: A list of lists of tokens. 58 | """ 59 | super(TextReader, self).__init__() 60 | self.variant = variant 61 | 62 | def read(self, filename=None): 63 | """ 64 | :param filename: name of the file from where sentences are read. 65 | The file should have one sentence per line, with tokens 66 | separated by white spaces. 67 | :return: an iterable over sentences, which can be iterated over several 68 | times. 69 | """ 70 | class iterable(object): 71 | def __iter__(self): 72 | if not filename: 73 | file = sys.stdin 74 | elif filename.endswith('.gz'): 75 | file = gzip.GzipFile(filename, 'rb') 76 | else: 77 | file = open(filename, 'rb') 78 | for line in file: 79 | sent = unicode(line, 'utf-8').split() 80 | if sent: 81 | yield sent 82 | file.close() 83 | 84 | return iterable() 85 | 86 | def sent_count(self): 87 | return len(self.sentences) 88 | 89 | # ---------------------------------------------------------------------- 90 | 91 | class TaggerReader(ConllReader): 92 | """ 93 | Abstract class extending TextReader with useful functions 94 | for tagging tasks. 
95 | """ 96 | 97 | # force class to be abstract 98 | #__metaclass__ = abc.ABCMeta 99 | 100 | def __init__(self, formField=0, tagField=-1): 101 | """ 102 | :param formField: the position of the form field in tokens 103 | :param tagField: the position of the tag field in tokens 104 | """ 105 | super(TaggerReader, self).__init__() 106 | # self.sentence_count = len(sentences) if sentences else 0 107 | self.formField = formField # field containing form 108 | self.tagField = tagField # field containing tag 109 | 110 | def read(self, filename): 111 | """ 112 | :return: an iterator on sentences. 113 | """ 114 | return ConllReader(filename) 115 | 116 | # def sent_count(self): 117 | # return self.sentence_count 118 | 119 | def create_vocabulary(self, sentences, size, min_occurrences=3): 120 | """ 121 | Create vocabulary and tag set from sentences. 122 | :param sentences: an iterable on sentences. 123 | :param size: size of the vocabulary 124 | :param min_occurrences: Minimum number of times that a token must 125 | appear in the text in order to be included in the dictionary. 126 | Sentence tokens are lists [form, ..., tag] 127 | """ 128 | c = Counter() 129 | tags = set() 130 | for sent in sentences: 131 | for token in sent: 132 | c[token[self.formField]] += 1 133 | tags.add(token[self.tagField]) 134 | common = c.most_common(size) # common is a list of pairs 135 | words = [w for w, n in common if n >= min_occurrences] 136 | return words, tags 137 | 138 | def create_tagset(self, sentences): 139 | """ 140 | Create tag set from sentences. 141 | :param sentences: an iterable over sentences. 142 | """ 143 | tags = set() 144 | for sent in sentences: 145 | for token in sent: 146 | tags.add(token[self.tagField]) 147 | return tags 148 | 149 | # ---------------------------------------------------------------------- 150 | 151 | class PosReader(TaggerReader): 152 | """ 153 | This class reads data from a POS corpus and turns it into a representation 154 | for use by the neural network for the POS tagging task. 155 | """ 156 | 157 | def __init__(self, formField=0, tagField=-1): 158 | self.rare_tag = None 159 | super(PosReader, self).__init__(formField=0, tagField=-1) 160 | 161 | # ---------------------------------------------------------------------- 162 | 163 | # other polarities will be given 0. 164 | default_polarities = { 'positive': 1, 'negative': -1 } 165 | 166 | class TweetReader(Reader): 167 | """ 168 | Reader for tweets in SemEval 2013 format, one tweet per line consisting of: 169 | SID UID polarity tokenized text 170 | 264183816548130816 15140428 positive Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :) 171 | """ 172 | 173 | def __init__(self, text_field=3, label_field=2, ngrams=1, variant=None): 174 | """ 175 | :param ngrams: the length of ngrams to consider. 176 | :param variant: whether to use native, or SENNA or Polyglot conventions. 
177 | """ 178 | super(TweetReader, self).__init__() 179 | self.text_field = text_field 180 | self.label_field = label_field 181 | self.ngrams = ngrams 182 | self.variant = variant 183 | self.sentences = [] 184 | self.polarities = [] 185 | 186 | def __iter__(self): 187 | for tweet in TsvReader(): # stdin 188 | yield tweet 189 | 190 | def read(self, filename=None, polarities=default_polarities): 191 | """ 192 | Builds a list of sentences and a list corresponding polarities [-1, 0, 1] 193 | """ 194 | for tweet in TsvReader(filename): 195 | if len(tweet) <= self.text_field: 196 | # drop empty tweets 197 | continue 198 | self.sentences.append(tweet[self.text_field].split()) 199 | polarity = polarities.get(tweet[self.label_field], 0) 200 | self.polarities.append(polarity) 201 | return self.sentences 202 | 203 | def acceptable(self, token): 204 | """Simple criteron to accept a token as part of a phrase, rejecting 205 | punctuations or common short words. 206 | """ 207 | return len(token) > 2 208 | 209 | # discount to avoid phrases with very infrequent words 210 | delta = 1 211 | 212 | def create_vocabulary(self, tweets, size=None, min_occurrences=3, threshold=0.1): 213 | """ 214 | Generates a list of all ngrams from the given tweets. 215 | 216 | :param tweets: an iterable on tweets. 217 | :param size: Max number of tokens to be included in the dictionary. 218 | :param min_occurrences: Minimum number of times that a token must 219 | appear in the text in order to be included in the dictionary. 220 | :param threshold: minimum bigram score. 221 | :return: list of ngrams (joined by '_'), list of bigrams 222 | and list of trigrams. 223 | """ 224 | 225 | # Use PMI-like score for selecting collocations: 226 | # score(x, y) = (count(x,y) - delta) / count(x)count(y) 227 | # @see Mikolov et al. 2013. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013 228 | # unigrams 229 | unigramCount = Counter(token for tweet in tweets for token in tweet) 230 | ngrams = [u for u,c in unigramCount.iteritems() if c >= min_occurrences] 231 | # bigrams 232 | bigramCount = Counter() 233 | trigramCount = Counter() 234 | for tweet in tweets: 235 | for a,b,c in zip(tweet[:-1], tweet[1:], tweet[2:]): 236 | if unigramCount[a] >= min_occurrences and unigramCount[b] >= min_occurrences: 237 | bigramCount.update([(a, b)]) 238 | if unigramCount[c] >= min_occurrences: 239 | trigramCount.update([(a, b, c)]) 240 | if len(tweet) > 1 and unigramCount[tweet[-2]] >= min_occurrences and unigramCount[tweet[-1]] >= min_occurrences: 241 | bigramCount.update([(tweet[-2], tweet[-1])]) 242 | bigrams = [] 243 | for b, c in bigramCount.iteritems(): 244 | if (float(c) - TweetReader.delta) / (unigramCount[b[0]] * unigramCount[b[1]]) > threshold: 245 | ngrams.append(b[0] + '_' + b[1]) 246 | bigrams.append(b) 247 | trigrams = [] 248 | for b, c in trigramCount.iteritems(): 249 | if (float(c) - TweetReader.delta) / (unigramCount[b[0]] * unigramCount[b[1]]) > threshold/2 \ 250 | and (float(c) - TweetReader.delta) / (unigramCount[b[1]] * unigramCount[b[2]]) > threshold/2: 251 | ngrams.append(b[0] + '_' + b[1] + '_' + b[2]) 252 | trigrams.append(b) 253 | # FIXME: repeat for multigrams 254 | return ngrams, bigrams, trigrams 255 | 256 | # ---------------------------------------------------------------------- 257 | 258 | class ClassifyReader(TweetReader): 259 | """ 260 | Variant of TweetReader with multiple labels. 261 | """ 262 | 263 | def read(self, filename=None): 264 | """ 265 | Builds a list of sentences. 
266 | """ 267 | for tweet in TsvReader(filename): 268 | self.sentences.append(tweet[self.text_field].split()) 269 | self.polarities.append(tweet[self.label_field]) 270 | return self.sentences 271 | -------------------------------------------------------------------------------- /deepnl/sentiwords.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language = c++ 3 | 4 | """ 5 | Train a sentiment specific language model. 6 | """ 7 | 8 | from __future__ import print_function 9 | import numpy as np 10 | from numpy import int32 as INT 11 | import sys # DEBUG 12 | 13 | # for decorations 14 | cimport cython 15 | 16 | # local 17 | from math cimport * 18 | from words cimport * 19 | from extractors cimport Iterable 20 | 21 | # ---------------------------------------------------------------------- 22 | 23 | 24 | cdef class SentGradients(Gradients): 25 | 26 | cdef public np.ndarray hidden_pos 27 | cdef public np.ndarray hidden_neg 28 | #cdef public np.ndarray input_pos: as Gradients.input 29 | cdef public np.ndarray input_neg 30 | 31 | 32 | def __init__(self, int_t input_size, int_t hidden_size, int_t output_size): 33 | super(SentGradients, self).__init__(input_size, hidden_size, output_size) 34 | # gradients for positive hidden variables 35 | self.hidden_pos = np.zeros(hidden_size) 36 | # gradients for negative hidden variables 37 | self.hidden_neg = np.zeros(hidden_size) 38 | # gradients for negative input variables 39 | self.input_neg = np.zeros(input_size) 40 | 41 | 42 | def clear(self): 43 | super(SentGradients, self).clear() 44 | self.hidden_pos.fill(0.0) 45 | self.hidden_neg.fill(0.0) 46 | self.input_neg.fill(0.0) 47 | 48 | # ---------------------------------------------------------------------- 49 | 50 | def itertrie(trie, sent, start, depth=0): 51 | """iterate through all ngrams that occur in :param sent: starting at 52 | position :param start:""" 53 | yield 1 # unigram 54 | tr = trie 55 | for cur in xrange(start, len(sent)): 56 | tok = sent[cur][0] 57 | if tok in tr: # part of ngram 58 | tr = tr[tok] 59 | else: 60 | break 61 | if cur > start+1: 62 | yield cur-start # ngram 63 | 64 | # ---------------------------------------------------------------------- 65 | 66 | 67 | cdef class SentimentTrainer(LmTrainer): 68 | """ 69 | A neural network for sentiment specific language model, aimed 70 | at inducing sentiment-specific word representations. 71 | @see Tang et al. 2014. Learning Sentiment-SpecificWord Embedding for Twitter Sentiment Classification. 72 | http://aclweb.org/anthology/P14-1146 73 | """ 74 | 75 | # polarities of each tweet 76 | cdef list polarities 77 | 78 | # alpha parameter: relative weight of standard and sentiment errors. 79 | cdef double alpha 80 | 81 | cdef RandomPool random_pool 82 | 83 | 84 | def __init__(self, nn, converter, options): 85 | """ 86 | Initializes a new neural network initialized for training. 87 | :param options: provides 88 | :param hidden_size: default 20 89 | :param ngrams: size of ngrams to extract 90 | :param alpha: default 0.5 91 | """ 92 | super(SentimentTrainer, self).__init__(nn, converter, options) 93 | 94 | self.alpha = options.get('alpha', 0.5) 95 | 96 | 97 | @cython.boundscheck(False) 98 | cdef _train_pair_s(self, np.ndarray[int_t,ndim=2] example, Gradients grads, 99 | int_t polarity, int_t size=1): 100 | """ 101 | Trains the network with a pair of positive/negative examples. 102 | The negative one is randomly generated. 103 | :param example: the positive example, i.e. 
a list of a list of token IDs 104 | :param grads: the computed gradients are accumulated here, except for 105 | inputs, which are updated immediately. 106 | :param polarity: 1 for positive, -1 for negative sentences. 107 | :param size: size of ngram to generate for replacing window center 108 | """ 109 | 110 | # a token is a list of feature IDs. 111 | # token[0] is the list with the WordDictionary index of the word, 112 | cdef int_t left_context = len(self.pre_padding) 113 | cdef np.ndarray[int_t] middle_token = example[left_context] 114 | cdef np.ndarray[int_t] variant 115 | 116 | # ensure to generate a different word 117 | while True: 118 | variant = self.random_pool.next() 119 | if variant[0] != middle_token[0]: 120 | break 121 | 122 | cdef Network nn = self.nn 123 | cdef Parameters p = nn.p 124 | # FIXME: avoid allocation 125 | vars_pos = nn.variables() 126 | self.converter.lookup(example, vars_pos.input) 127 | nn.forward(vars_pos) 128 | 129 | vars_neg = nn.variables() 130 | cdef np.ndarray[int_t] negative_token = np.array(variant, dtype=INT) 131 | #print('pos', self.converter.extractors[0].sentence(example).encode('utf-8'), file=sys.stderr) # DEBUG 132 | #print(vars_pos.input[128:132], file=sys.stderr) # DEBUG 133 | example[left_context] = negative_token 134 | self.converter.lookup(example, vars_neg.input) 135 | #print('neg', self.converter.extractors[0].sentence(example).encode('utf-8'), file=sys.stderr) # DEBUG 136 | #print(file=sys.stderr, vars_neg.input[128:132]) # DEBUG 137 | nn.forward(vars_neg) 138 | 139 | # hinge loss 140 | cdef float_t errorCW = max(0, 1 - vars_pos.output[0] + vars_neg.output[0]) 141 | cdef float_t errorUS = max(0, 1 - polarity * vars_pos.output[1] + polarity * vars_neg.output[1]) 142 | cdef float_t error = self.alpha * errorCW + (1 - self.alpha) * errorUS 143 | #if error > 2: print(file=sys.stderr, 'error', errorCW, errorUS, error) # DEBUG 144 | self.error += error 145 | self.avg_error.add(error) # moving average 146 | self.total_pairs += 1 147 | if error == 0: 148 | self.skips += 1 149 | return error, variant 150 | 151 | # Compute the gradients 152 | 153 | # negative gradient for the positive example is +1, for the negative one is -1 154 | # (remember the network still has the values of the negative example) 155 | 156 | # @see A.8 in Collobert et al. 2011. 
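        # In other words: for the pairwise hinge
        #   errorCW = max(0, 1 - score(pos) + score(neg))
        # the loss gradients are d/d score(pos) = -1 and d/d score(neg) = +1
        # whenever the margin is violated.  The arrays below hold the
        # negated gradients (+1 for the positive example, -1 for the
        # negative one), since _update() adds them to the parameters.
        # The same +1/-1 pattern is used for the sentiment hinge errorUS.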
157 | cdef np.ndarray[float_t] grads_pos_score = np.array([0.0, 0.0]) 158 | cdef np.ndarray[float_t] grads_neg_score = np.array([0.0, 0.0]) 159 | if (errorCW > 0): 160 | grads_pos_score[0] = 1.0 161 | grads_neg_score[0] = -1.0 162 | if (errorUS > 0): 163 | grads_pos_score[1] = 1.0 164 | grads_neg_score[1] = -1.0 165 | 166 | # Summary: 167 | # grads.output_bias = grads_score 168 | # grads.output_weights = grads_score.T * hidden_values 169 | # grads_hidden = activationError(hidden_values) * grads_score.T.dot(output_weights) 170 | # grads.hidden_bias = grads_hidden 171 | # grads.hidden_weights = grads_hidden.T * input_values 172 | # grads.input = grads_hidden.dot(hidden_weights) 173 | 174 | # Output layer 175 | # CHECKME: summing they cancel each other: 176 | grads.output_bias = grads_pos_score + grads_neg_score 177 | # (2, hidden_size) = (2) x (hidden_size) 178 | grads.output_weights = np.outer(grads_pos_score, vars_pos.hidden) +\ 179 | np.outer(grads_neg_score, vars_neg.hidden) 180 | #print('gow', grads.output_weights[0,128:132], file=sys.stderr) # DEBUG 181 | 182 | # Hidden layer 183 | # (hidden_size) = (2) * (2, hidden_size) 184 | grads.hidden_pos = hardtanhe(vars_pos.hidden) * grads_pos_score.dot(p.output_weights) 185 | grads.hidden_neg = hardtanhe(vars_neg.hidden) * grads_neg_score.dot(p.output_weights) 186 | 187 | # Input layer 188 | # (hidden_size) x (input_size) = (hidden_size, input_size) 189 | grads.hidden_weights = np.outer(grads.hidden_pos, vars_pos.input) +\ 190 | np.outer(grads.hidden_neg, vars_neg.input) 191 | 192 | #print('ghw', grads.hidden_weights[0,128:132], file=sys.stderr) # DEBUG 193 | grads.hidden_bias = grads.hidden_pos + grads.hidden_neg 194 | 195 | # Lookup layer 196 | # (hidden_size) x (hidden_size, input_size) = (input_size) 197 | grads.hidden_pos.dot(p.hidden_weights, grads.input) 198 | grads.hidden_neg.dot(p.hidden_weights, grads.input_neg) 199 | 200 | return error, variant 201 | 202 | 203 | cdef _update(self, Gradients grads, float_t remaining, 204 | np.ndarray[int_t,ndim=2] example, 205 | np.ndarray[int_t] middle_token, 206 | np.ndarray[int_t] negative_token): 207 | """ 208 | Update the weights along the gradients :param grads: 209 | """ 210 | # FIXME: use adagrad 211 | #cdef float_t LR_0 = max(0.001, self.learning_rate * remaining) 212 | cdef float_t LR_0 = self.learning_rate 213 | cdef float_t LR_1 = max(0.001, self.learning_rate / self.nn.input_size * remaining) 214 | cdef float_t LR_2 = max(0.001, self.learning_rate / self.nn.hidden_size * remaining) 215 | 216 | cdef Network nn = self.nn 217 | cdef Parameters p = nn.p 218 | cdef int_t left_context = len(self.pre_padding) 219 | 220 | p.output_weights += LR_2 * grads.output_weights 221 | p.output_bias += LR_2 * grads.output_bias 222 | 223 | p.hidden_weights += LR_1 * grads.hidden_weights 224 | p.hidden_bias += LR_1 * grads.hidden_bias 225 | 226 | # tokens where changes apply 227 | cdef np.ndarray tokens = np.vstack((example, negative_token)) 228 | # both changes apply to all tokens except the middle 229 | cdef np.ndarray[float_t] deltas = grads.input + grads.input_neg 230 | cdef int_t features = self.converter.size() 231 | cdef int_t start = left_context * features 232 | cdef int_t end = start + features 233 | deltas[start:end] = grads.input[start:end] # positive token 234 | # add the change to the negative token 235 | deltas = np.concatenate((deltas, grads.input_neg[start:end])) # negative token 236 | 237 | self.converter.update(deltas, tokens, self.learning_rate) 238 | 239 | 240 | def train(self, Iterable 
sentences, list polarities, trie, 241 | int_t epochs, int_t report_freq): 242 | """ 243 | Trains the sentiment language model on the given sentences. 244 | :param sentences: an iterable on a list of token features for each sentence 245 | :param iterations: number of train iterations 246 | :param polarities: the polarity of each sentence, +-1. 247 | :param trie: of ngrams 248 | """ 249 | # FIXME: parallelize using ASGD. 250 | 251 | # prepare for AdaGrad 252 | if self.adaEps: 253 | self.converter.adaGradInit(self.adaEps) 254 | 255 | # generate 1000 random indices at a time to save time 256 | # (generating 1000 integers at once takes about ten times the time for a single one) 257 | # FIXME: nonsense to create random features besides ID 258 | feature_tables = [e.table for e in self.converter.extractors] 259 | self.random_pool = RandomPool([x.shape[0] for x in feature_tables]) 260 | self.total_pairs = 0 261 | 262 | # how often to save model 263 | cdef int_t save_period = 1000 * 1000 # FIXME 264 | 265 | cdef float_t all_cases = float(sum([len(sen) for sen in sentences]) * epochs * self.ngram_size) 266 | 267 | cdef int_t epoch, num, pos 268 | cdef float_t remaining 269 | 270 | cdef int_t left_context = len(self.pre_padding) 271 | cdef int_t right_context = len(self.post_padding) 272 | cdef int_t window_size = left_context + 1 + right_context 273 | # FIXME: might use len(self.example) instead of window_size? 274 | cdef np.ndarray window = np.empty((window_size, 1), dtype=INT) 275 | cdef np.ndarray token, neg_token 276 | cdef int_t size = 1 277 | 278 | grads = SentGradients(self.nn.input_size, self.nn.hidden_size, self.nn.output_size) 279 | 280 | for epoch in xrange(epochs): 281 | self.error = 0.0 282 | self.epoch_items = 0 283 | self.epoch_hits = 0 284 | self.skips = 0 285 | epoch_pairs = 0 286 | # update LR by fan-in 287 | # decrease linearly by remaining 288 | remaining = 1.0 - (self.total_pairs / all_cases) 289 | 290 | for num, sentence in enumerate(sentences): 291 | if polarities[num] == 0: 292 | # skip neutral sentences 293 | continue 294 | #print(self.converter.extractors[0].sentence(sentence).encode('utf-8'), file=sys.stderr) # DEBUG 295 | for pos in xrange(len(sentence)): 296 | # for any word or ngram at sentence[pos:pos+size] 297 | for size in itertrie(trie, sentence, pos): 298 | # FIXME: avoid overlaps like 0-3, 1-2 299 | # extract a window of tokens around the given position 300 | token = self._extract_window(window, sentence, pos, size) 301 | error, neg_token = self._train_pair_s(window, grads, polarities[num], size) 302 | self.total_items += 1 303 | if error > self.skipErr: 304 | self.error += error 305 | self._update(grads, remaining, window, token, neg_token) 306 | else: 307 | self.epoch_hits += 1 308 | epoch_pairs += 1 309 | 310 | if report_freq > 0 and \ 311 | (self.total_pairs and 312 | self.total_pairs % report_freq == 0): 313 | self._progress_report(epoch, self.total_pairs, num) 314 | # periodically save language model 315 | if save_period and self.total_pairs % save_period == 0: 316 | self.saver(self) 317 | self._epoch_report(epoch + 1) 318 | -------------------------------------------------------------------------------- /deepnl/tagger.pxd: -------------------------------------------------------------------------------- 1 | """ 2 | Sequence Tagger. 
3 | """ 4 | # distutils: language=c++ 5 | 6 | cimport numpy as np 7 | from extractors cimport Converter 8 | from networkseq cimport SeqGradients, SequenceNetwork 9 | from network cimport float_t, int_t 10 | from cpython cimport bool 11 | 12 | cdef class Tagger(object): 13 | 14 | # feature extractor 15 | cdef public Converter converter 16 | 17 | cdef readonly dict tag_index # tag ids 18 | cdef readonly list tags # list of tags 19 | cdef public nn # cython crashes with SequenceNetwork 20 | 21 | # padding stuff 22 | cdef public np.ndarray pre_padding, post_padding 23 | 24 | cpdef list tag(self, list tokens) 25 | 26 | cpdef np.ndarray[float_t,ndim=2] _tag_sequence(self, 27 | np.ndarray sentence, 28 | bool train=*) 29 | 30 | -------------------------------------------------------------------------------- /deepnl/tagger.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language = c++ 3 | # cython: profile=False 4 | 5 | """ 6 | Sequence tagger exploiting a neural network. 7 | """ 8 | 9 | # standard 10 | from __future__ import print_function 11 | import numpy as np 12 | import cPickle as pickle 13 | import sys # DEBUG 14 | 15 | # local 16 | from network cimport * 17 | import network 18 | from networkseq import SequenceNetwork 19 | from numpy import int32 as INT 20 | 21 | # ---------------------------------------------------------------------- 22 | 23 | cdef class Tagger(object): 24 | """ 25 | Abstract base class for sliding window sequence taggers. 26 | """ 27 | 28 | # cdef dict tag_index # tag ids 29 | # cdef list tags # list of tags 30 | 31 | def __init__(self, nn, Converter converter, tag_index, 32 | int left_context, int right_context): 33 | """ 34 | :param nn: network to be used. 35 | :param converter: the Converter object that extracts features and 36 | converts them to weights. 37 | :param tag_index: index of tags. 38 | :param left_context: size of left context window. 39 | :param right_context: size of right context window. 40 | """ 41 | self.nn = nn # dependency injection 42 | self.converter = converter 43 | self.tag_index = tag_index 44 | self.tags = sorted(tag_index, key=tag_index.get) 45 | cdef np.ndarray[int_t] padding_left = converter.get_padding_left() 46 | cdef np.ndarray[int_t] padding_right = converter.get_padding_right() 47 | self.pre_padding = np.array(left_context * [padding_left], dtype=INT) 48 | self.post_padding = np.array(right_context * [padding_right], dtype=INT) 49 | 50 | cpdef list tag(self, list tokens): 51 | """ 52 | Tags a given list of tokens. 53 | 54 | Tokens should be produced by a compatible tokenizer in order to 55 | match the entries in the vocabulary. 56 | 57 | :param tokens: a list of tokens, each a list of attributes. 58 | :returns: the list of tags for each token. 59 | """ 60 | cdef np.ndarray[int_t,ndim=2] seq = self.converter.convert(tokens) 61 | # add padding 62 | seq = np.concatenate((self.pre_padding, seq, self.post_padding)) 63 | 64 | cdef np.ndarray[float_t,ndim=2] scores = self._tag_sequence(seq) 65 | # computes full score, combining ftheta and A (if SLL) 66 | answer = self.nn._viterbi(scores) 67 | return [self.tags[tag] for tag in answer] 68 | 69 | cpdef np.ndarray[float_t,ndim=2] _tag_sequence(self, 70 | np.ndarray sentence, 71 | bool train=False): 72 | """ 73 | Runs the network for each element in the sentence and returns 74 | the scores for all possibile tag sequences. 75 | 76 | :param sentence: an array, where each row encodes a token. 
77 | Sentence includes padding. 78 | :param tags: the correct tags (needed when training). 79 | :return: an array of size (len(sentence), output_size) with the 80 | scores for all tags for each token.. 81 | """ 82 | nn = self.nn 83 | cdef int window_size = len(self.pre_padding) + 1 + len(self.post_padding) 84 | cdef slen = len(sentence) - window_size + 1 # without padding 85 | 86 | # scores[t, i] = ftheta_i,t = score for i-th tag, t-th word 87 | cdef np.ndarray[float_t,ndim=2] scores = np.empty((slen, nn.output_size)) 88 | 89 | # container for network variables 90 | #vars = nn.variables() 91 | vars = network.Variables() # empty fields, filled below 92 | 93 | if train: 94 | # we must keep the whole history 95 | nn.input_sequence = np.empty((slen, nn.input_size)) 96 | # hidden_values at each token in the correct path 97 | nn.hidden_sequence = np.empty((slen, nn.hidden_size)) 98 | else: 99 | # we can discard intermediate values 100 | vars.input = np.empty(nn.input_size) 101 | vars.hidden = np.empty(nn.hidden_size) 102 | 103 | # print(sentence[:,:3], file=sys.stderr) # DEBUG 104 | # #print(self.converter.extractors[0].sentence(sentence[:4]), file=sys.stderr) # DEBUG 105 | # print('hweights', nn.p.hidden_weights[:4,:4], file=sys.stderr) # DEBUG 106 | # print('hbias', nn.p.hidden_bias[:4], file=sys.stderr) # DEBUG 107 | 108 | # lookup the whole sentence at once 109 | # number of features in a window 110 | cdef int token_size = nn.input_size / window_size 111 | cdef np.ndarray sentence_features = np.empty(len(sentence) * token_size) 112 | self.converter.lookup(sentence, sentence_features) 113 | 114 | # run through all windows in the sentence 115 | cdef int i, start 116 | for i in xrange(slen): 117 | start = i * token_size 118 | vars.input = sentence_features[start: start+nn.input_size] 119 | if train: 120 | nn.input_sequence[i,:] = vars.input 121 | vars.hidden = nn.hidden_sequence[i] 122 | vars.output = scores[i] 123 | nn.forward(vars) 124 | # DEBUG 125 | # if train: 126 | # # print('window:', self.converter.extractors[0].sentence(window), file=sys.stderr) 127 | # # print('sent:', window[:4], window[-4:], file=sys.stderr) 128 | # print('input', vars.input[:4], vars.input[-4:], file=sys.stderr) 129 | # #print('iw', self.nn.p.hidden_weights[0,:4], self.nn.p.hidden_weights[-1,-4:], file=sys.stderr) 130 | # print('hidden', vars.hidden[:4], vars.hidden[-4:], file=sys.stderr) 131 | # print('output', vars.output[:4], vars.output[-4:], file=sys.stderr) 132 | 133 | return scores 134 | 135 | def save(self, file): 136 | """ 137 | Saves the tagger to a file. 138 | """ 139 | self.nn.save(file) 140 | pickle.dump(self.tag_index, file) 141 | pickle.dump((len(self.pre_padding), len(self.post_padding)), file) 142 | self.converter.save(file) 143 | 144 | @classmethod 145 | def load(cls, file): 146 | """ 147 | Loads the tagger from a file. 
148 | """ 149 | nn = SequenceNetwork.load(file) 150 | tag_index = pickle.load(file) 151 | left_context, right_context = pickle.load(file) 152 | converter = Converter() 153 | converter.load(file) 154 | 155 | return cls(nn, converter, tag_index, left_context, right_context) 156 | 157 | -------------------------------------------------------------------------------- /deepnl/trainer.pxd: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | 3 | from cpython cimport bool 4 | cimport numpy as np 5 | 6 | # local 7 | from network cimport Network, Parameters, Gradients, float_t, int_t 8 | from extractors cimport Converter 9 | from networkseq cimport SeqGradients 10 | 11 | cdef class MovingAverage(object): 12 | 13 | cdef float_t mean 14 | cdef float_t variance 15 | cdef unsigned count 16 | 17 | cdef add(self, float_t v) 18 | 19 | cdef class Trainer(object): 20 | 21 | # public to enable loading 22 | cdef public Network nn 23 | # feature extractor 24 | cdef public Converter converter 25 | cdef public np.ndarray pre_padding, post_padding 26 | cdef public object saver 27 | cdef int total_items, epoch_items, epoch_hits, skips 28 | # data for statistics 29 | cdef float_t error, accuracy 30 | cdef readonly MovingAverage avg_error 31 | 32 | # options 33 | cdef public bool verbose 34 | 35 | # size of ngrams 36 | cdef public int ngram_size 37 | 38 | # training parameters 39 | cdef public float_t learning_rate 40 | cdef float_t adaEps 41 | cdef float_t adaRo 42 | cdef float_t l1_decay 43 | cdef float_t l2_decay 44 | cdef float_t momentum 45 | cdef float_t skipErr 46 | cdef Parameters ada 47 | 48 | cdef float_t _validate(self, list sentences, labels, int idx) 49 | 50 | cpdef update(self, Gradients grads, np.ndarray[int_t,ndim=2] sentence) 51 | 52 | cdef class TaggerTrainer(Trainer): 53 | 54 | cdef dict tags_dict 55 | #cdef Tagger tagger # FIXHIM: crashes Cython compiler 56 | cdef readonly object tagger 57 | 58 | 59 | -------------------------------------------------------------------------------- /deepnl/trainerconv.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language = c++ 3 | # cython: profile=False 4 | 5 | """ 6 | Train a DL Convolutional Neural Network. 7 | """ 8 | 9 | import sys 10 | import numpy as np 11 | from itertools import izip 12 | 13 | # local 14 | from networkconv cimport * 15 | from trainer cimport Trainer 16 | from extractors cimport Converter 17 | from classifier import Classifier 18 | 19 | # for decorations 20 | cimport cython 21 | 22 | cdef class ConvTrainer(Trainer): 23 | """ 24 | Trainer for a convolutional network. 25 | """ 26 | 27 | cdef readonly classifier 28 | 29 | def __init__(self, nn, Converter converter, list labels, dict options): 30 | """ 31 | :param labels: list of labels. 32 | """ 33 | super(ConvTrainer, self).__init__(nn, converter, options) 34 | left_context = options.get('left_context', 2) 35 | right_context = options.get('right_context', 2) 36 | self.classifier = Classifier(converter, labels, left_context, right_context, nn) 37 | 38 | def _train_epoch(self, list sentences, list labels): 39 | """ 40 | Trains for one epoch with all examples. 41 | :param sentences: a list of 2-dim numpy arrays, where each array 42 | encodes a sentence. Each array row represents a token through the 43 | indices to its features. 44 | :param labels: a list of id of labels for each corresponding sentence. 
45 | """ 46 | 47 | self.error = 0 48 | self.epoch_items = 0 49 | self.epoch_hits = 0 50 | self.skips = 0 51 | 52 | # shuffle data 53 | # get the random number generator state in order to shuffle 54 | # sentences and their tags in the same order 55 | random_state = np.random.get_state() 56 | np.random.shuffle(sentences) 57 | np.random.set_state(random_state) 58 | np.random.shuffle(labels) 59 | 60 | # keep last 2% for validation 61 | validation = int(len(sentences) * 0.98) 62 | 63 | nn = self.nn 64 | cdef ConvGradients grads 65 | cdef int_t i = 0, slen 66 | for i in xrange(validation): 67 | sent = sentences[i] 68 | label = labels[i] 69 | slen = len(sent) 70 | # add padding 71 | sent = np.concatenate((self.pre_padding, sent, self.post_padding)) 72 | vars = nn.variables(len(sent)) # allocate variables 73 | self.converter.lookup(sent, vars.input) 74 | nn.forward(vars) 75 | grads = nn.gradients(slen) # allocate gradients 76 | loss = nn.backpropagate(label, vars, grads) 77 | if loss > 0.0: 78 | self.error += loss 79 | self.update(grads, sent) 80 | # # DEBUG. verify 81 | # label1 = np.argmax(vars.output) 82 | # nn.forward(vars) # DEBUG 83 | # grads.clear() # allocate gradients 84 | # loss2 = nn.backpropagate(label, vars, grads) # DEBUG 85 | # if loss2 > 0.5: # loss: # DEBUG 86 | # label2 = np.argmax(vars.output) 87 | # self.update(grads, sent) 88 | # # check again 89 | # nn.forward(vars) # DEBUG 90 | # label3 = np.argmax(vars.output) 91 | # if label != label3: 92 | # grads.clear() # allocate gradients 93 | # loss3 = nn.backpropagate(label, vars, grads) 94 | # print('NOFIX', i, label, label1, label2, label3, loss, loss2, loss3, file=sys.stderr) 95 | else: 96 | self.epoch_hits += 1 97 | self.epoch_items += 1 98 | # progress report 99 | i += 1 100 | if self.verbose: 101 | if i%1000 == 0: 102 | sys.stderr.write('+') 103 | sys.stderr.flush() 104 | elif i%100 == 0: 105 | sys.stderr.write('.') 106 | sys.stderr.flush() 107 | if i == validation: 108 | break 109 | if self.verbose: 110 | sys.stderr.write('\n') 111 | 112 | self.accuracy = self._validate(sentences, labels, validation) 113 | 114 | @cython.boundscheck(False) 115 | cdef float_t _validate(self, list sentences, labels, int_t idx): 116 | """Perform validation on held out data and estimate accuracy 117 | :param idx: index of first sentence in validation set. 118 | """ 119 | cdef int_t count = 0 120 | cdef int_t hits = 0 121 | 122 | cdef int_t i, label, pred 123 | cdef np.ndarray[int_t,ndim=2] sent 124 | cdef Variables vars 125 | 126 | for i in xrange(idx, len(sentences)): 127 | sent = sentences[i] 128 | label = labels[i] 129 | # add padding 130 | sent = np.concatenate((self.pre_padding, sent, self.post_padding)) 131 | vars = self.nn.variables(len(sent)) # allocate variables 132 | self.converter.lookup(sent, vars.input) 133 | self.nn.forward(vars) 134 | pred = np.argmax(vars.output) 135 | if pred == label: 136 | hits += 1 137 | count += 1 138 | return float(hits) / count if count else 1.0 139 | 140 | # def save(self, file): inherited 141 | 142 | @classmethod 143 | def load(cls, file): 144 | """ 145 | Resume training from previous dump. 146 | """ 147 | # use __new__() to skip initialiazation 148 | trainer = ConvTrainer.__new__(cls) # CHECKME: ConvTrainer is redundant? 
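        # Fields are read back in the order presumably written by the
        # inherited save(): padding and ngram size first, then the
        # convolutional network, then the feature converter.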
149 | trainer.pre_padding, trainer.post_padding, trainer.ngram_size = np.load(file) 150 | trainer.nn = ConvolutionalNetwork.load(file) # different from super 151 | trainer.converter = Converter() 152 | trainer.converter.load(file) 153 | return trainer 154 | -------------------------------------------------------------------------------- /deepnl/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Utility functions 5 | """ 6 | 7 | import re 8 | import logging 9 | import numpy as np 10 | from itertools import islice 11 | 12 | def tokenize(text, sent_splitter, tokenizer, clean=True): 13 | """ 14 | Returns a list of lists of the tokens in text, separated by sentences. 15 | Each line break in the text starts a new list. 16 | 17 | :param sent_splitter: a sentence splitter sucg as 18 | nltk.tokenize.regexp.punkt 19 | :param tokenzier: a tokenizer, such as 20 | nltk.tokenize.regexp.RegexpTokenizer 21 | :param clean: If True, performs some cleaning action on the text, such as replacing 22 | all digits for 9 (by calling :func:`clean_text`) 23 | """ 24 | ret = [] 25 | 26 | if type(text) != unicode: 27 | text = unicode(text, 'utf-8') 28 | 29 | if clean: 30 | text = clean_text(text, correct=True) 31 | 32 | text = _clitic_regexp.sub(r' -\1', text) 33 | 34 | 35 | # the sentence tokenizer doesn't consider line breaks as sentence delimiters, so 36 | # we split them manually where there are two consecutive line breaks. 37 | sentences = [] 38 | lines = text.split('\n\n') 39 | for line in lines: 40 | sentences.extend(sent_tokenizer.tokenize(line, realign_boundaries=True)) 41 | 42 | for p in sentences: 43 | if p.strip() == '': 44 | continue 45 | 46 | new_sent = _tokenizer.tokenize(p) 47 | ret.append(new_sent) 48 | 49 | return ret 50 | 51 | def clean_text(text, correct=True): 52 | """ 53 | Apply some transformations to the text, such as 54 | replacing digits for 9 and simplifying quotation marks. 55 | 56 | :param correct: If True, tries to correct punctuation misspellings. 57 | """ 58 | 59 | # replaces different kinds of quotation marks with " 60 | # take care not to remove apostrophes 61 | text = re.sub(ur"(?u)(^|\W)[‘’′`']", r'\1"', text) 62 | text = re.sub(ur"(?u)[‘’`′'](\W|$)", r'"\1', text) 63 | text = re.sub(ur'(?u)[«»“”]', '"', text) 64 | 65 | if correct: 66 | # tries to fix mistyped tokens (common in Wikipedia-pt) as ,, '' .. 67 | text = re.sub(r'(?' % (self.freq, super(Trie, self).__repr__()) 204 | 205 | def add(self, ngram, lowcase=True, noaccents=True): 206 | """Insert the ngram :param ngram: into the trie.""" 207 | curr = self 208 | for tok in ngram: 209 | if lowcase: 210 | tok = tok.lower() 211 | if noaccents: 212 | tok = strip_accents(tok) 213 | curr = curr.setdefault(tok, Trie()) 214 | curr.freq += 1 215 | 216 | def prune(self, occurr): 217 | """prune ngrams that occurr less than :param occurr:""" 218 | for key, curr in self.items(): 219 | if len(curr) == 0: # final ngram 220 | if curr.freq < occurr: 221 | del self[key] 222 | else: 223 | curr.prune(occurr) 224 | # prune dead branch 225 | if len(curr) == 0: 226 | del self[key] 227 | 228 | def iter(self, sent, start=0, lowcase=True, noaccents=True): 229 | """iterate through all ngrams that occur in :param sent: starting at 230 | position :param start: 231 | :param lowcase: compare lower case tokens. 232 | :param noaccents: compare disregarding accents. 
233 | """ 234 | trie = self 235 | for cur in xrange(start, len(sent)): 236 | tok = sent[cur] 237 | if lowcase: 238 | tok = tok.lower() 239 | if noaccents: 240 | tok = strip_accents(tok) 241 | if tok in trie: # part of ngram 242 | trie = trie[tok] 243 | if trie.freq: 244 | yield cur+1 # ngram end 245 | else: 246 | break 247 | 248 | def __iter__(self): 249 | """Iterate through the ngrams stored in the trie""" 250 | for key, trie in self.iteritems(): 251 | if trie.freq: # terminal node 252 | yield [key] 253 | for rest in trie: 254 | yield [key] + rest 255 | 256 | -------------------------------------------------------------------------------- /deepnl/word_dictionary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from collections import Counter, OrderedDict 4 | import cPickle as pickle 5 | import re 6 | from numpy import int32 as INT 7 | 8 | num = re.compile('[+\-]?([0-9][,.]?)+$') 9 | 10 | def isNumber(key): 11 | return num.match(key) 12 | 13 | class WordDictionary(dict): 14 | """ 15 | Class to store words and their corresponding indices in 16 | the network lookup table. Also deals with padding and 17 | maps rare words to a special index. 18 | """ 19 | 20 | def __init__(self, sentences, size=None, minimum_occurrences=None, 21 | wordlist=None, variant='senna'): 22 | """ 23 | Fills a dictionary (to be used for indexing) with the most 24 | common words in the given text. 25 | 26 | :param sentences: an iterable on lists of tokens 27 | (each token represented as a string). 28 | :param size: Maximum number of token indices 29 | (not including paddings, rare, etc.). 30 | :param minimum_occurrences: The minimum number of occurrences a token 31 | must have in order to be included. 32 | :param wordlist: Use this list of words to build the dictionary. 33 | Overrides sentences if not None and ignores maximum size. 34 | :param variant: either 'polyglot', 'word2vec', or 'senna' conventions, 35 | i.e. keep case, use different padding tokens. 36 | """ 37 | self.variant = variant 38 | if variant: 39 | self.variant = variant.lower() 40 | if self.variant == 'polyglot': 41 | padding_left = '' 42 | padding_right = '' 43 | rare = '' 44 | elif self.variant == 'word2vec': 45 | padding_left = '' 46 | padding_right = '' 47 | rare = '' 48 | elif self.variant == 'senna': 49 | # SENNA conventions 50 | padding_left = 'PADDING' 51 | padding_right = 'PADDING' 52 | rare = 'UNKNOWN' 53 | 54 | if self.variant: 55 | self.special_symbols = set((rare, 56 | padding_left, 57 | padding_right)) 58 | else: 59 | self.special_symbols = set() 60 | 61 | if wordlist is None: 62 | # work with the supplied sentences. extract frequencies. 
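            # In short: tokens are counted after normalization, kept only
            # if they occur at least minimum_occurrences times, then
            # trimmed to the `size` most frequent ones.  For example, with
            # minimum_occurrences=2, counts {'the': 5, 'dog': 2, 'ran': 1}
            # yield the vocabulary ['the', 'dog'].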
63 | 64 | # gets frequency count 65 | c = self._get_frequency_count(sentences) 66 | 67 | if minimum_occurrences is None: 68 | minimum_occurrences = 1 69 | 70 | words = [key for key, number in c.most_common() 71 | if number >= minimum_occurrences] 72 | 73 | if size is not None and size < len(words): 74 | words = words[:size] 75 | 76 | else: 77 | # Keep the order and eliminate duplicates 78 | #words = list(OrderedDict.fromkeys(self.normalize(w) for w in wordlist)) 79 | words = list(OrderedDict.fromkeys(wordlist)) 80 | 81 | # trim to the maximum size 82 | if size is None: 83 | size = len(words) 84 | else: 85 | size = min(size, len(words)) 86 | words = words[:size] 87 | 88 | # build the inverse index 89 | self.words = [0] * len(words) # inverse index 90 | for idx, word in enumerate(words): 91 | super(WordDictionary, self).__setitem__(word, INT(idx)) 92 | self.words[idx] = word # words should be already normalized 93 | 94 | # if the given words include one of the rare or padding symbols, 95 | # don't replace it 96 | for symbol in self.special_symbols: 97 | if super(WordDictionary, self).get(symbol) is None: # might be 0 98 | self[symbol] = len(self) 99 | 100 | # save the indices of the special symbols 101 | if self.variant: 102 | self.padding_left = super(WordDictionary, self).get(padding_left) 103 | self.padding_right = super(WordDictionary, self).get(padding_right) 104 | self.rare = super(WordDictionary, self).get(rare) 105 | else: 106 | # there is no corresponding string in dictionary 107 | self.padding_left = INT(len(self)) 108 | self.padding_right = self.padding_left 109 | self.rare = INT(self.padding_left + 1) 110 | 111 | def size(self): 112 | """ 113 | :return: the number of words in the dictionary, excluding special symbols. 114 | """ 115 | return len(self) - len(self.special_symbols) 116 | 117 | def save(self, file): 118 | """ 119 | Saves the word dictionary to the given file as a list of words. 120 | Special words (paddings and rare) are also included. 121 | """ 122 | pickle.dump(self.variant, file) 123 | pickle.dump(self.words, file) 124 | pickle.dump((self.rare, 125 | self.padding_left, 126 | self.padding_right), file) 127 | 128 | @classmethod 129 | def load(cls, file): 130 | o = WordDictionary.__new__(cls) 131 | o.variant = pickle.load(file) 132 | o.words = pickle.load(file) 133 | o.rare, o.padding_left, o.padding_right = pickle.load(file) 134 | for i,x in enumerate(o.words): 135 | # FIXME: this assumes normalized words in file 136 | super(WordDictionary, o).__setitem__(x, INT(i)) 137 | return o 138 | 139 | def _get_frequency_count(self, sentences): 140 | """ 141 | Returns a token counter for normalized tokens in :param sentences:. 142 | 143 | :param sentences: an iterable on lists of tokens. 144 | """ 145 | return Counter(self.normalize(t) for sent in sentences for t in sent) 146 | 147 | def update_tokens(self, tokens, size=None, minimum_occurrences=1, freqs=None): 148 | """ 149 | Updates the dictionary, adding tokens until :param size: is reached. 150 | 151 | :param freqs: a dictionary providing a token count. 
152 | """ 153 | if freqs is None: 154 | freqs = self._get_frequency_count([tokens]) 155 | 156 | if size is None or size == 0: 157 | # size None or 0 means no size limit 158 | size = len(freqs) 159 | 160 | increment = size - self.size() 161 | if increment <= 0: 162 | return 163 | 164 | # tokens not present in the dictionary and above minimum frequency 165 | new_tokens = [token for token in freqs 166 | if token not in self and freqs[token] >= minimum_occurrences] 167 | # order the words from the most frequent to the least 168 | new_tokens.sort(key=lambda x: freqs[x], reverse=True) 169 | 170 | for token in new_tokens: 171 | self[token] = len(self) 172 | increment -= 1 173 | if increment == 0: 174 | break 175 | 176 | def normalize(self, word): 177 | """ 178 | Normalize word, converting digits to 0 and lowercasing (when variant is 'senna'). 179 | """ 180 | if self.variant == 'senna': 181 | # senna converts numbers to '0' 182 | if isNumber(word): 183 | word = '0' 184 | else: 185 | word = word.lower() 186 | return re.sub('[0-9]', '0', word) 187 | if self.variant: 188 | # replace all digits by '0' 189 | return re.sub('[0-9]', '0', word) 190 | return word 191 | 192 | def __contains__(self, key): 193 | """ 194 | Overrides the "in" operator. Case insensitive when variant is 'senna'. 195 | """ 196 | # deal with symbols in original case, e.g. PADDING, UNKNOWN. 197 | return super(WordDictionary, self).__contains__(key) or \ 198 | super(WordDictionary, self).__contains__(self.normalize(key)) 199 | 200 | def __getitem__(self, key): 201 | """ 202 | Overrides the [] read operator. 203 | 204 | Two differences from the original: 205 | 1) if the key is not present, it returns the value for the UNKNOWN key. 206 | 2) match is attempted also with normalized key. 207 | """ 208 | # deal with symbols in original case, e.g. PADDING, UNKNOWN. 209 | return super(WordDictionary, self).get(key) or \ 210 | super(WordDictionary, self).get(self.normalize(key), self.rare) 211 | 212 | def get(self, key): 213 | """ 214 | Overrides the dictionary get method, so when given a word without an entry, 215 | it returns the value for the UNKNOWN key. 216 | Note that it is NOT possible to supply a default value as in the dict class. 217 | """ 218 | return self.__getitem__(key) 219 | 220 | def __setitem__(self, key, value): 221 | """ 222 | Replaces the [] write operator. 223 | We store INT values. 224 | 225 | Words are normalized before insertion. 226 | """ 227 | # deal with symbols in original case, e.g. PADDING, UNKNOWN. 228 | if not super(WordDictionary, self).__contains__(key): 229 | key = self.normalize(key) 230 | if not super(WordDictionary, self).__contains__(key): 231 | self.words.append(key) 232 | super(WordDictionary, self).__setitem__(key, INT(value)) 233 | 234 | def add(self, word): 235 | if word not in self: # invokes __contains__() 236 | self[word] = len(self) 237 | 238 | def get_words(self, indices): 239 | """ 240 | Returns the words represented by a sequence of indices. 241 | Notice that this might not return the original sentence, 242 | since the index is not injective: two words might have the same index 243 | e.g. numbers '11' and '22' are mapped to '00' 244 | 245 | """ 246 | return (self.words[i] if i < len(self.words) else '' for i in indices) 247 | 248 | def get_indices(self, words): 249 | """ 250 | Returns the indices corresponding to a sequence of tokens. 
251 | """ 252 | return (self[w] for w in words) 253 | 254 | class NgramDictionary(WordDictionary): 255 | """ 256 | Class to store ngrams and their corresponding indices in 257 | the network lookup table. 258 | """ 259 | def __init__(self, ngrams, size=None, minimum_occurrences=None, variant=None): 260 | """ 261 | Fills a dictionary (to be used for indexing) with the most 262 | common ngrams. 263 | 264 | :param ngrams: a list of lists of ngrams 265 | :param size: Maximum number of ngram indices 266 | (not including paddings, rare, etc.). 267 | :param minimum_occurrences: The minimum number of occurrences an ngram must 268 | have in order to be included. 269 | :param variant: either 'polyglot' or 'senna' conventions, i.e. keep upper case, use different padding tokens. 270 | """ 271 | super(NgramDictionary, self).__init__(self, ngrams, size, minimum_occurrences, 272 | variant=variant) 273 | -------------------------------------------------------------------------------- /deepnl/words.h: -------------------------------------------------------------------------------- 1 | 2 | // Eigen library 3 | #include 4 | 5 | typedef Eigen::VectorXd Vector; 6 | typedef Eigen::MatrixXd Matrix; 7 | 8 | struct Parameters 9 | { 10 | Matrix wIn; // input weights 11 | Vector bIn; // input bias 12 | Matrix wOut; // output weights 13 | Vector bOut; // output bias 14 | 15 | Parameters() { } 16 | 17 | Parameters(unsigned numInput, unsigned numHidden, unsigned numOutput) : 18 | wIn(numInput, numHidden), 19 | bIn(numHidden), 20 | wOut(numHidden, numOutput), 21 | bOut(numOutput) 22 | { 23 | clear(); 24 | } 25 | 26 | virtual void clear() { 27 | wIn.setZero(); 28 | bIn.setZero(); 29 | wOut.setZero(); 30 | bOut.setZero(); 31 | } 32 | 33 | }; 34 | 35 | class Network 36 | { 37 | Vector hidden, output; ///< hidden and output variables 38 | unsigned numInput; ///< number of input values 39 | unsigned numHidden; ///< number of hidden variables 40 | unsigned numOutput; ///< number of output variables 41 | Parameters* p; ///< parameters 42 | 43 | void run(const Vector& input) { 44 | hidden.noalias() = input.transpose() * p->wIn + p->bIn.transpose(); 45 | tanh(hidden); // first layer 46 | output.noalias() = hidden.transpose() * p->wOut + p->bOut.transpose(); 47 | } 48 | }; 49 | 50 | class Trainer 51 | { 52 | public: 53 | 54 | void init(PyObject* nn); 55 | 56 | void train_pair(double* pos_input_values_0, double* negative_token_0); 57 | }; 58 | 59 | -------------------------------------------------------------------------------- /deepnl/words.pxd: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # distutils: language=c++ 3 | 4 | """ 5 | Train a Language model 6 | """ 7 | 8 | # for decorations 9 | cimport cython 10 | 11 | # local 12 | from network cimport * 13 | from trainer cimport * 14 | 15 | cdef class RandomPool: 16 | 17 | cdef np.ndarray pool 18 | cdef dims 19 | cdef int_t current 20 | cdef int_t size 21 | 22 | cdef class LmGradients(Gradients): 23 | 24 | cdef public np.ndarray input_neg 25 | 26 | cdef class LmTrainer(Trainer): 27 | """ 28 | Learn word representations. 29 | """ 30 | 31 | cdef list feature_tables 32 | 33 | # data for statistics during training. 
34 | cdef int_t total_pairs 35 | 36 | cdef np.ndarray[int_t] _extract_window(self, 37 | np.ndarray[int_t,ndim=2] window, 38 | np.ndarray[int_t,ndim=2] sentence, 39 | int_t position, int_t size=*) 40 | 41 | cdef _update_weights(self, worker, LmGradients grads, float remaining) 42 | 43 | cdef _update_embeddings(self, 44 | np.ndarray[float_t] grads_input_pos, 45 | np.ndarray[float_t] grads_input_neg, 46 | float_t remaining, 47 | np.ndarray[int_t,ndim=2] example, 48 | np.ndarray[int_t] token_pos, 49 | np.ndarray[int_t] token_neg) 50 | 51 | # ---------------------------------------------------------------------- 52 | 53 | cdef extern from "WordsTrainer.h": # namespace "DeepNL": 54 | cdef cppclass WordsTrainer: 55 | WordsTrainer(int, int, int, 56 | double*, double*, double*, double*, 57 | double*, double*, double*, double*, 58 | double*, double*, double*, double*, 59 | int*, int, 60 | double*, int, int) except + 61 | double train_pair() nogil 62 | double update_embeddings(double, int, int) nogil 63 | 64 | cdef class LmWorker(LmTrainer): 65 | """ 66 | Worker thread for learning word representations. 67 | """ 68 | 69 | # local storage 70 | cdef Variables vars_pos 71 | cdef Variables vars_neg 72 | cdef LmGradients grads 73 | cdef np.ndarray example 74 | 75 | # pool of random numbers (used for efficiency) 76 | cdef public RandomPool random_pool 77 | 78 | cdef WordsTrainer* trainer 79 | 80 | cdef _train_batch(self, sentences, float remaining) 81 | 82 | cdef float _train_step(self, example, pos_token, neg_token, 83 | float remaining) 84 | 85 | cdef float _train_pair(self, Variables vars_pos, Variables vars_neg, 86 | LmGradients grads) 87 | 88 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. py:module:: nlpnet 2 | 3 | ================================================================== 4 | :mod:`nlpnet` --- Natural Language Processing with neural networks 5 | ================================================================== 6 | 7 | :mod:`nlpnet` is a Python library for Natural Language Processing tasks based on neural networks. 8 | Currently, it performs part-of-speech tagging and semantic role labeling. It may be used as a Python 9 | library or through its standalone scripts. Most of the architecture is language independent, 10 | but some functions were especially tailored for working with Portuguese. 11 | 12 | This system was inspired by SENNA_, but has some conceptual and practical differences. 13 | If you use :mod:`nlpnet`, please cite one or both of the articles below, according to your needs (POS or 14 | SRL): 15 | 16 | .. _SENNA: http://ronan.collobert.com/senna/ 17 | 18 | * Fonseca, E. R. and Rosa, J.L.G. *A Two-Step Convolutional Neural Network Approach for Semantic 19 | Role Labeling*. Proceedings of the 2013 International Joint Conference on Neural Networks, 2013. 20 | p. 2955-2961 [`PDF `_] 21 | 22 | * Fonseca, E. R. and Rosa, J.L.G. *Mac-Morpho Revisited: Towards Robust Part-of-Speech Tagging*. 23 | Proceedings of the 9th Brazilian Symposium in Information and Human Language Technology, 2013. p. 24 | 98-107 [`PDF `_] 25 | 26 | Contents 27 | -------- 28 | 29 | .. 
toctree:: 30 | :maxdepth: 2 31 | 32 | intro 33 | scripts 34 | utils 35 | network 36 | 37 | -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Introduction 3 | ============ 4 | 5 | This document covers the basics for installing and using :mod:`deepnl`. 6 | 7 | Installation 8 | ------------ 9 | 10 | :mod:`deepnl` can be downloaded from the Python package index at https://pypi.python.org/pypi/deepnl/ or installed with 11 | 12 | .. code-block:: bash 13 | 14 | pip install deepnl 15 | 16 | See the `Dependencies`_ section below for additional installation requirements. 17 | 18 | Dependencies 19 | ~~~~~~~~~~~~ 20 | 21 | :mod:`deepnl` requires numpy_. 22 | 23 | For development use you will also need Cython_, which is used to generate the C++ 24 | extensions that make the library run faster. 25 | 26 | For a simple installation you don't need it, since the generated ``.cpp`` files are already provided with :mod:`deepnl`, but you will need a C++ compiler. 27 | 28 | .. _numpy: http://www.numpy.org 29 | .. _Cython: http://cython.org 30 | .. _setuptools: http://pythonhosted.org/setuptools/ 31 | 32 | Brief explanation 33 | ----------------- 34 | 35 | Here is a brief explanation of how things work in the internals of 36 | :mod:`deepnl` (*you don't need to know it to use this library*). 37 | For additional details on the technique, refer to the articles on the index page or to those describing the SENNA system. 38 | 39 | Two types of neural networks are available: a common MLP (multilayer 40 | perceptron) and a convolutional one. 41 | The former is used for training the POS and NER taggers. 42 | Basically, the common MLP examines 43 | word windows, outputs a score for assigning each tag to each word, and then determines 44 | the tags using the Viterbi algorithm (which essentially picks the best combination of network 45 | scores and tag transition scores). 46 | 47 | During training, adjustments are made to the network connections, word representations and 48 | the tag transition scores. Their learning rates may be set separately, although the best 49 | results seem to arise when all three have the same value. 50 | 51 | The convolutional network can be used to train a Semantic Role Labeler (SRL). 52 | In order to output a score for each word, it examines the whole sentence. It does so by picking a word window at a time and forwarding it to a convolution layer. 53 | This layer stores in each of its neurons the largest value found so far. 54 | After all words have been examined, the convolution layer forwards its output like a usual MLP network. 55 | Then, it works like the previous model: the network outputs scores for each word/tag combination, 56 | and a Viterbi search is performed. 57 | 58 | In the convolution layer, the values found by each neuron may come from different words, i.e., each neuron stores 59 | its maximum independently from the others. This is particularly complex during training, because 60 | neurons must backpropagate their error only to the word window that yielded their stored value. 61 | 62 | One doesn't need to worry about the details concerning the neural networks 63 | when using the standalone scripts provided in the ``bin`` directory: 64 | - ``dl-words.py``, 65 | - ``dl-sentiwords.py``, 66 | - ``dl-ner.py``, 67 | - ``dl-pos.py``. 68 | 69 | However, they are available to play with in the :ref:`network` module.
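To make the Viterbi step described above more concrete, here is a small, self-contained sketch of the decoding idea. It is only an illustration: the function and array names (``viterbi_decode``, ``scores``, ``transitions``) are hypothetical and do not correspond to the actual Cython internals of :mod:`deepnl`, which also handle training and the updates of the transition scores.

.. code-block:: python

    import numpy as np

    def viterbi_decode(scores, transitions):
        """Pick the best-scoring tag sequence for one sentence.

        scores: (num_tokens, num_tags) array of per-token tag scores
            produced by the network.
        transitions: (num_tags, num_tags) array where transitions[i, j]
            is the score of moving from tag i to tag j.
        """
        num_tokens, num_tags = scores.shape
        # delta[t, j] = best score of any tag path ending with tag j at token t
        delta = np.empty((num_tokens, num_tags))
        backpointers = np.zeros((num_tokens, num_tags), dtype=int)
        delta[0] = scores[0]
        for t in range(1, num_tokens):
            # candidate[i, j] = score of being in tag i at t-1 and moving to tag j
            candidate = delta[t - 1][:, None] + transitions
            backpointers[t] = candidate.argmax(axis=0)
            delta[t] = scores[t] + candidate.max(axis=0)
        # follow the backpointers from the best final tag
        best_path = [int(delta[-1].argmax())]
        for t in range(num_tokens - 1, 0, -1):
            best_path.append(int(backpointers[t, best_path[-1]]))
        return best_path[::-1]

During training the transition scores themselves are adjusted as well, which is why their learning rate can be set separately from that of the network weights.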
70 | 71 | Basic usage 72 | ----------- 73 | 74 | :mod:`deepnl` can be used either as a Python library or through its standalone scripts. The basic library API is explained below. 75 | See also :ref:`scripts`. 76 | 77 | Library usage 78 | ~~~~~~~~~~~~~ 79 | 80 | You can use :mod:`deepnl` as a library in Python code as follows: 81 | 82 | .. code-block:: python 83 | 84 | >>> import deepnl 85 | >>> tagger = deepnl.PosTagger('modelFile') 86 | >>> tagger.tag('O rato roeu a roupa do rei de Roma.') 87 | [[(u'O', u'ART'), (u'rato', u'N'), (u'roeu', u'V'), (u'a', u'ART'), (u'roupa', u'N'), (u'do', u'PREP+ART'), (u'rei', u'N'), (u'de', u'PREP'), (u'Roma', u'NPROP'), (u'.', 'PU')]] 88 | 89 | In the example above, ``'modelFile'`` must be the path to the file containing 90 | the trained POS model. 91 | 92 | The currently available taggers are: 93 | - ``PosTagger`` and 94 | - ``NerTagger``. 95 | 96 | Both taggers expect sentences consisting of a list of tokens. 97 | 98 | The output is printed in TSV format, with one token per line, and each line 99 | contains the word and its tag separated by a tab: 100 | 101 | ``word<TAB>tag`` 102 | 103 | with an empty line separating sentences. 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/network.rst: -------------------------------------------------------------------------------- 1 | .. _network: 2 | 3 | ======== 4 | Networks 5 | ======== 6 | 7 | This document describes the neural networks used by :mod:`nlpnet`. They are written in Cython and make heavy use of ``numpy`` in order to attain very fast performance. 8 | 9 | Note that these classes are somewhat low-level and don't deal with words and sentences explicitly. Instead, only vectorial representations are used. This approach is explained in the papers linked in the root page of this documentation. 10 | 11 | .. :module:: nlpnet.network 12 | 13 | Module :mod:`nlpnet.network` 14 | ============================ 15 | 16 | This module includes the actual neural networks. There are two classes of networks currently used: :class:`nlpnet.network.Network` for POS and :class:`nlpnet.network.ConvolutionalNetwork` for SRL. 17 | 18 | .. :class::`nlpnet.network.Network` 19 | 20 | Class :class:`nlpnet.network.Network` 21 | ------------------------------------- 22 | 23 | .. autoclass:: nlpnet.network.Network 24 | :members: create_new, description, run, tag_sentence, train, save, load_from_file 25 | 26 | 27 | 28 | .. :class::`nlpnet.network.ConvolutionalNetwork` 29 | 30 | Class :class:`nlpnet.network.ConvolutionalNetwork` 31 | -------------------------------------------------- 32 | 33 | .. autoclass:: nlpnet.network.ConvolutionalNetwork 34 | :members: create_new, description, run, tag_sentence, train, save, load_from_file 35 | -------------------------------------------------------------------------------- /docs/scripts.rst: -------------------------------------------------------------------------------- 1 | .. _scripts: 2 | 3 | ================== 4 | Standalone Scripts 5 | ================== 6 | 7 | :mod:`nlpnet` includes standalone scripts that may be called from the command line. They are 8 | copied to the `scripts` subdirectory of your Python installation, which can be included 9 | in the system PATH variable. There are three such scripts: 10 | 11 | **nlpnet-train** 12 | Script to train a new model or further train an existing one. 13 | 14 | **nlpnet-test** 15 | Script to measure the performance of a model against a gold data set. 16 | 17 | **nlpnet-tag** 18 | Script to call a model and tag some given text.
19 | 20 | Each of them is explained below. 21 | 22 | .. contents:: 23 | :local: 24 | :depth: 1 25 | 26 | 27 | nlpnet-tag 28 | ========== 29 | 30 | This is the simplest :mod:`nlpnet` script. It simply runs the system for a given text input. 31 | It should be called with the following syntax: 32 | 33 | .. code-block:: bash 34 | 35 | $ nlpnet-tag.py TASK DATA_DIRECTORY 36 | 37 | Where ``TASK`` is either ``pos`` or ``srl`` and ``DATA_DIRECTORY`` is the directory with the 38 | trained models. It also has the following command-line options: 39 | 40 | -v Verbose mode 41 | --no-repeat Forces the classification step to avoid repeated argument labels (SRL only). 42 | 43 | For example: 44 | 45 | .. code-block:: bash 46 | 47 | $ nlpnet-tag.py pos /path/to/nlpnet-data/ 48 | O rato roeu a roupa do rei de Roma. 49 | O_ART rato_N roeu_V a_ART roupa_N do_PREP+ART rei_N de_PREP Roma_NPROP ._PU 50 | 51 | Or with semantic role labeling: 52 | 53 | .. code-block:: bash 54 | 55 | $ nlpnet-tag.py srl /path/to/nlpnet-data/ 56 | O rato roeu a roupa do rei de Roma. 57 | O rato roeu a roupa do rei de Roma . 58 | roeu 59 | A1: a roupa do rei de Roma 60 | A0: O rato 61 | V: roeu 62 | 63 | The first line was typed by the user, and the second one is the result of tokenization. 64 | 65 | 66 | 67 | nlpnet-train 68 | ============ 69 | 70 | There are a lot of training parameters that can be supplied to :mod:`nlpnet`. Some of them depend 71 | on the task that the network is being trained for, since the network can be a simple MLP 72 | for POS tagging or a convolutional network for SRL. 73 | 74 | General Options 75 | --------------- 76 | 77 | These options can be used in either POS or SRL training; an example invocation is shown after the list. 78 | 79 | -w NUMBER The size of the word window. For SRL, the supplied model used 3, and for POS, 5. It is important to have a reasonably large window in POS so the tagger can analyze the context. 80 | -n NUMBER Number of hidden neurons. 81 | -f NUMBER Generates feature vectors randomly with the given number of dimensions for words. Ignore it if you supply pre-initialized representations. 82 | --load_features Loads the feature vectors representing words. The file containing the data must be set in config.py and be in the data/ directory. Nlpnet uses numpy files for storing representations as 2-dimensional arrays. 83 | -e NUMBER Number of epochs to train the network. 84 | -l NUMBER The learning rate for network weights. 85 | --lf NUMBER The learning rate for features (including extra features like the ones from ``--caps``). 86 | --lt NUMBER The learning rate for the tag transition scores. 87 | --caps NUMBER Include capitalization as a feature. If a number is given, it determines the number of features (default 5). 88 | --suffix NUMBER Same as ``--caps``, but for suffixes. It will search a file named suffixes.txt in the data/ directory, and read each line as a suffix. 89 | -a NUMBER Stop training when the network achieves this accuracy. Useful to avoid divergence when the learning rate is high. 90 | -v Verbose mode; it will output more information about what is happening internally. 91 | --load_network Loads a previously saved network. The file name must be set in config.py and be in the data/ directory. 92 | --task TASK Task to train for. It must be either ``srl`` or ``pos``. 93 | --data DIRECTORY The directory containing the model files. If a new model is being trained, everything is saved to that directory. 94 | --gold FILE A file containing the gold data used for training.
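For instance, a POS training run might combine several of the general options along these lines (the file and directory names, as well as the numeric values, are placeholders; run the script with ``-h`` to check the exact options accepted by your version):

.. code-block:: bash

    $ nlpnet-train.py --task pos --gold pos-train.txt --data pos-model/ \
          -w 5 -n 100 -e 15 -l 0.001 --lf 0.01 --lt 0.01 --caps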
95 | 96 | Data files must be in the format used by :mod:`nlpnet`. A POS file must have one sentence per line, each sentence containing tokens in the format ``token_tag`` and separated by whitespace. SRL files must be in the `CoNLL format`_. 97 | 98 | .. _`CoNLL format`: https://ufal.mff.cuni.cz/conll2009-st/task-description.html#Dataformat 99 | 100 | 101 | SRL 102 | --- 103 | 104 | -c NUMBER Number of neurons in the convolution layer. 105 | --pos NUMBER Uses POS as a feature. Currently, it must read the tags from the training data. Works the same as ``--caps``. 106 | --chunk NUMBER Uses syntactic chunks as a feature. Same as --pos. 107 | --use_lemma Reads word lemmas instead of surface forms. It needs to read them from the training data. 108 | --id Train for argument boundary identification only. 109 | --class Train for classification of previously identified arguments only (if neither this nor ``--id`` is supplied, a network that does both in a single step is trained). 110 | --pred Train for predicate recognition only. 111 | --max_dist NUMBER The maximum distance (to predicates and target words) to have its own feature vector. Any distance greater than this will be mapped to a single vector. 112 | --target_features NUMBER Number of features for vectors representing distance to the target word. 113 | --pred_features NUMBER Same as ``--target_features`` for the predicate. 114 | 115 | 116 | nlpnet-test 117 | =========== 118 | 119 | This script is much simpler. It evaluates the system performance against a gold standard. 120 | 121 | General options 122 | --------------- 123 | 124 | The arguments below are valid for both tasks. 125 | 126 | --task TASK Task for which the network should be used. Either ``pos`` or ``srl``. 127 | -v Verbose mode 128 | --gold FILE File with gold standard data 129 | --data DIRECTORY Directory with trained models 130 | 131 | POS 132 | --- 133 | 134 | --oov FILE Analyze performance on the words described in the given file. 135 | 136 | The ``--oov`` option requires a UTF-8 file containing one word per line. This option 137 | is not limited to OOV (out-of-vocabulary) words; it accepts any word list you 138 | want to evaluate. 139 | 140 | SRL 141 | --- 142 | 143 | SRL evaluation is performed in different ways, depending on whether it is aimed at 144 | argument identification, classification, predicate detection or all of them. 145 | In the future, there may be a more standardized version for this test. 146 | 147 | --id Evaluate only argument identification (SRL only). The script will output the score. 148 | --class Evaluate only argument classification (SRL only). The script will output the score. 149 | --preds Evaluate only predicate identification (SRL only). The script will output the score. 150 | --2steps Execute SRL with two separate steps. The script will output the results in CoNLL format. 151 | --no-repeat Forces the classification step to avoid repeated argument labels (two-step SRL only) 152 | --auto-pred Determines SRL predicates automatically. Only used when evaluating the full process (identification + classification) 153 | 154 | The CoNLL output can be evaluated against a gold file using the official SRL eval script (see http://www.lsi.upc.edu/~srlconll/soft.html). 155 | 156 | 157 | -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Utility Functions 3 | ================= 4 | 5 | ..
py:module:: nlpnet.utils 6 | 7 | Module :mod:`nlpnet.utils` 8 | ========================== 9 | 10 | This module includes some general utility functions. Most of the functions are specific for the 11 | internals of :mod:`nlpnet`, but the following ones can be interesting for other purposes. 12 | 13 | .. autofunction:: nlpnet.utils.clean_text 14 | .. autofunction:: nlpnet.utils.tokenize 15 | .. autofunction:: nlpnet.utils.contract 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | from setuptools import setup, Extension 4 | except ImportError: 5 | from distutils.core import setup, Extension 6 | from Cython.Build import cythonize 7 | 8 | import numpy as np 9 | import glob 10 | 11 | def readme(): 12 | with open('README.rst') as f: 13 | text = f.read() 14 | return text 15 | 16 | extensions = [ 17 | Extension('deepnl/words', 18 | sources=["deepnl/words.pyx", "deepnl/WordsTrainer.cpp"], 19 | include_dirs=[np.get_include(), 20 | "/usr/include/eigen3"], 21 | language="c++", 22 | extra_compile_args=["-fopenmp"]), 23 | Extension('deepnl/hpca', 24 | sources=["deepnl/hpca.pyx", "deepnl/HPCA_impl.cpp"], 25 | include_dirs=[np.get_include(), 26 | "/usr/include/eigen3"], 27 | language="c++", 28 | extra_compile_args=["-std=c++11"], 29 | extra_link_args=["-fopenmp"]), 30 | Extension('deepnl/*', 31 | sources=['deepnl/*.pyx'], 32 | include_dirs=[np.get_include(), 33 | "/usr/include/eigen3"], 34 | language="c++", 35 | extra_compile_args=["-fopenmp"]), 36 | ] 37 | 38 | setup( 39 | name = "deepnl", 40 | 41 | description = "Deep Learning for NLP tasks", 42 | author = "Giuseppe Attardi ", 43 | author_email = "attardi@di.unipi.it", 44 | url = "https://github.com/attardi/deepnl", 45 | 46 | license = "GNU GPL", 47 | version = "1.3.18", 48 | 49 | platforms = "any", 50 | 51 | keywords = " Deep learning " 52 | " Neural network " 53 | " Natural language processing ", 54 | 55 | requires = ["numpy (>= 1.9)"], 56 | 57 | packages = ["deepnl"], 58 | 59 | ext_modules = cythonize( 60 | extensions, 61 | language="c++", 62 | nthreads=4), 63 | scripts = glob.glob("bin/*.py"), 64 | 65 | classifiers = [ 66 | "Development Status :: 3 - Alpha", 67 | "Environment :: Console", 68 | "Intended Audience :: Science/Research", 69 | "License :: OSI Approved :: GNU General Public License", 70 | "Operating System :: OS Independent", 71 | "Programming Language :: Python :: 2.6", 72 | "Programming Language :: Python :: 2.7", 73 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 74 | "Topic :: Scientific/Engineering :: Information Analysis", 75 | "Topic :: Text Processing :: Linguistic", 76 | ], 77 | 78 | long_description = readme() 79 | ) 80 | --------------------------------------------------------------------------------