├── python-client ├── __init__.py ├── sklext │ ├── __init__.py │ ├── test │ │ ├── __init__.py │ │ ├── test_mutual_information.py │ │ ├── test_term_estimators.py │ │ └── test_term_weight_transformer.py │ ├── cond_prob.py │ ├── mutual_information.py │ ├── term_weighting.py │ └── term_estimators.py ├── es_text_analytics │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── test │ │ │ ├── __init__.py │ │ │ ├── test_aviskorpus.py │ │ │ ├── test_newsgroups.py │ │ │ ├── test_ndt_dataset.py │ │ │ └── test_dataset.py │ │ ├── elasticsearch_dataset.py │ │ ├── newsgroups.py │ │ ├── wiki_infobox.py │ │ ├── ndt_dataset.py │ │ ├── dataset.py │ │ └── aviskorpus.py │ ├── test │ │ ├── __init__.py │ │ ├── test_no_tokenizer.py │ │ ├── test_single_doc_sigterms.py │ │ ├── test_np_extractor.py │ │ ├── test_decompounder.py │ │ ├── test_tagger.py │ │ └── test_term_weight_provider.py │ ├── tokenizer.py │ ├── single_doc_sigterms.py │ ├── np_extractor.py │ ├── kera.py │ ├── lemmatizer.py │ ├── wordnet_centrality.py │ ├── decompounder.py │ ├── tagger.py │ └── term_weight_provider.py ├── requirements.txt ├── bin │ ├── run_singledoc_sig_terms.py │ ├── build-all-models.bat │ ├── build-all-models.sh │ ├── build_pyLDAvis.py │ ├── NOB_kera.py │ ├── wordcounts_from_dataset.py │ ├── index_dataset.py │ ├── build_no_tagger.py │ ├── build_LDA_kera_from_wiki.py │ ├── corpus2lemmatizedtext.py │ └── build-wiki-topicmodel.py ├── setup.py └── run_models.sh ├── .env ├── .gitignore ├── provision ├── elasticsearch.yml └── neo4j-server.properties ├── .travis.yml ├── environment.yml ├── spark-jobs └── ng-wc.py ├── readme.md ├── Vagrantfile └── fabfile.py /python-client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | source activate cta-env 2 | -------------------------------------------------------------------------------- /python-client/sklext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/sklext/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | models/ 3 | *.pyc 4 | .idea/ 5 | !python-client/es_text_analytics/data 6 | 7 | .vagrant/ 8 | notebooks/ 9 | -------------------------------------------------------------------------------- /provision/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | marvel.agent.enabled: false 2 | 
index.number_of_shards: 1 3 | index.number_of_replicas: 0 4 | http.cors.enabled: true 5 | -------------------------------------------------------------------------------- /python-client/requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch 2 | requests 3 | psutil 4 | textblob 5 | nltk 6 | gensim 7 | uniseg 8 | git+git://github.com/comperiosearch/python-elasticsearch-runner -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_aviskorpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comperiosearch/comperio-text-analytics/master/python-client/es_text_analytics/data/test/test_aviskorpus.py -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | 5 | install: "pip install -r python-client/requirements.txt" 6 | script: nosetests 7 | virtualenv: 8 | system_site_packages: true 9 | before_install: 10 | - sudo apt-get install -qq python-numpy python-scipy -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: cta-env 2 | dependencies: 3 | - python 4 | - pip 5 | - ipython 6 | - ipython-notebook 7 | - matplotlib 8 | - scikit-learn 9 | - requests 10 | - gensim 11 | - nltk 12 | - nose 13 | - pip: 14 | - textblob 15 | - elasticsearch 16 | - psutil 17 | - py4j 18 | -------------------------------------------------------------------------------- /python-client/bin/run_singledoc_sig_terms.py: -------------------------------------------------------------------------------- 1 | __author__ = 'cvig' 2 | from es_text_analytics import single_doc_sigterms 3 | 4 | from elasticsearch import Elasticsearch 5 | es = Elasticsearch() 6 | 7 | sdt = single_doc_sigterms.SingleDocSigTerms(es, 'wiki', 'doc', 'article', None) 8 | print sdt.by_doc_id_idf(178472 , 20) -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/__init__.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_runner.runner import ElasticsearchRunner 2 | 3 | es_runner = ElasticsearchRunner() 4 | 5 | 6 | def setup(): 7 | es_runner.install() 8 | es_runner.run() 9 | es_runner.wait_for_green() 10 | 11 | 12 | def teardown(): 13 | if es_runner and es_runner.is_running(): 14 | es_runner.stop() 15 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_no_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from es_text_analytics.tokenizer import NOTokenizer 5 | 6 | 7 | class TestNOTokenizer(TestCase): 8 | def test_tokenize(self): 9 | tokenizer = NOTokenizer() 10 | self.assertEqual(['Dette', 'er', u'vårt', 'hus', '.'], 11 | tokenizer.tokenize(u'Dette er vårt hus.')) -------------------------------------------------------------------------------- /python-client/bin/build-all-models.bat: -------------------------------------------------------------------------------- 1 | :: Builds all the models for Norwegian NLP functionality and places them in the default locations 2 | 3 | 
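:: Note: %~dp0 expands to the directory of this script (python-client\bin), so the
:: line below puts the python-client package root on PYTHONPATH and the models are
:: written to ..\..\models, i.e. a models directory at the repository root.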
set PYTHONPATH=%PYTHONPATH%;%~dp0\.. 4 | 5 | mkdir %~dp0\..\..\models 6 | 7 | python %~dp0\build_no_tagger.py -m %~dp0\..\..\models\nob-tagger-default-model --features simple --language nob 8 | python %~dp0\build_no_tagger.py -m %~dp0\..\..\models\nno-tagger-default-model --features simple --language nno 9 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/tokenizer.py: -------------------------------------------------------------------------------- 1 | from textblob.base import BaseTokenizer 2 | from uniseg import wordbreak 3 | 4 | # TextBlob compatible tokenizer for Norwegian. 5 | # Simple implementation. Tokenizes according to Unicode Appendix 29 (UAX#29). 6 | 7 | 8 | class NOTokenizer(BaseTokenizer): 9 | def tokenize(self, text): 10 | return list(self.itokenize(text)) 11 | 12 | def itokenize(self, text, *args, **kwargs): 13 | return (token for token in wordbreak.words(text) if token != ' ') 14 | -------------------------------------------------------------------------------- /python-client/bin/build-all-models.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Builds all the models for Norwegian NLP functionality and places them in the default locations 4 | 5 | SELF_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 6 | 7 | mkdir -p ${SELF_DIR}\..\..\models 8 | 9 | python ${SELF_DIR}\build_no_tagger.py -m ${SELF_DIR}\..\..\models\nob-tagger-default-model --features simple --language nob 10 | python ${SELF_DIR}\build_no_tagger.py -m ${SELF_DIR}\..\..\models\nno-tagger-default-model --features simple --language nno -------------------------------------------------------------------------------- /python-client/sklext/cond_prob.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from numpy import array 3 | 4 | from sklext.term_estimators import joint_estimator_point, marginal_estimator 5 | 6 | 7 | def conditional_probabilities(X, y, ratio=False): 8 | p_t_c = joint_estimator_point(X, y, smoothing=True) 9 | p_t = marginal_estimator(X, smoothing=True) 10 | 11 | p_t.shape = 2,1 12 | 13 | m = p_t_c / p_t 14 | 15 | if ratio: 16 | p_c = marginal_estimator(y, smoothing=True) 17 | 18 | m = m / p_c 19 | 20 | return array(numpy.max(m, axis=1)).flatten() -------------------------------------------------------------------------------- /python-client/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='es_text_analytics', 5 | version='0.1', 6 | packages=['es_text_analytics', 'es_text_analytics.test'], 7 | url='https://bitbucket.org/comperio/comperio-text-analytics', 8 | license='For internal use only.', 9 | author='Andre Lynum', 10 | author_email='andre.lynum@comperiosearch.com', 11 | description='es text analytics.', 12 | install_requires=['elasticsearch', 'requests', 'psutil', 'textblob', 'nltk', 'gensim', 'uniseg'], 13 | dependency_links=['git+ssh://git@github.com/comperiosearch/python-elasticsearch-runner'] 14 | 15 | ) 16 | -------------------------------------------------------------------------------- /python-client/run_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wikifile=/data/no/nowiki-latest-pages-articles.xml.bz2 3 | datadir=/data/no/ 4 | vocab=/data/no/nowiki_lsi_10.vocab 5 | for n_topics in 100 200 400 1000 2000; do 6 | python 
bin/build-wiki-topicmodel.py --model-id nowiki --model-type lsi -d $wikifile --data-dir $datadir --vocab $vocab --n-topics $n_topics 7 | done 8 | exit 9 | for n_topics in 50 100 250 500 1000; do 10 | python bin/build-wiki-topicmodel.py --model-id nowiki --model-type lda -d $wikifile --data-dir $datadir --vocab $vocab --n-topics $n_topics 11 | done 12 | 13 | 14 | for window in 50 100 250; do 15 | for size in 500 1000, do 16 | python bin/build-wiki-topicmodel.py --model-id nowiki --model-type word2vec --w2v-window $window --w2v-size $size -d $wikifile --data-dir $datadir --vocab $vocab 17 | done 18 | done 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_newsgroups.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from es_text_analytics.data.newsgroups import parse 4 | 5 | 6 | class TestNewsgroups(TestCase): 7 | def test_parse(self): 8 | self.assertEqual(parse('From: ba\nSubject: foo\nThe\nmessage.\n--\nSig\n'), 9 | {'raw': 'From: ba\nSubject: foo\nThe\nmessage.\n--\nSig\n', 10 | 'msg': 'The\nmessage.', 11 | 'from': 'ba', 12 | 'subject': 'foo', 13 | 'sig': 'Sig\n'}) 14 | self.assertEqual(parse('Subject: foo\nFrom: ba\nThe\nmessage.\n--\nSig\n'), 15 | {'raw': 'Subject: foo\nFrom: ba\nThe\nmessage.\n--\nSig\n', 16 | 'msg': 'The\nmessage.', 17 | 'from': 'ba', 18 | 'subject': 'foo', 19 | 'sig': 'Sig\n'}) 20 | -------------------------------------------------------------------------------- /spark-jobs/ng-wc.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from argparse import ArgumentParser 3 | 4 | from pyspark import SparkContext 5 | 6 | # Basic sample Spark job for testing 7 | # To run pass the 20 Newsgroups JSON formatted corpus file as the -f argument, set 8 | # the SPARK_HOME environment variable and run with spark-submit or pyspark.as 9 | 10 | # To run and edit within PyCharm add SPARK_HOME/python and SPARK_HOME/python/lib/py4j-x.x.x.x-src.zip 11 | # to the interpreter paths in addition to setting SPARK_HOME in the run configuration. 12 | 13 | def main(): 14 | parser = ArgumentParser() 15 | parser.add_argument('-f', '--filename') 16 | opts = parser.parse_args() 17 | 18 | fn = opts.filename 19 | 20 | if not fn: 21 | sys.exit(1) 22 | 23 | sc = SparkContext(appName='ng-wc') 24 | 25 | rdd = sc.textFile(fn) 26 | 27 | n = rdd.count() 28 | 29 | print 'The 20 Newsgroups corpus has %d articles.' 
% n 30 | 31 | 32 | if __name__ == '__main__': 33 | main() -------------------------------------------------------------------------------- /python-client/bin/build_pyLDAvis.py: -------------------------------------------------------------------------------- 1 | from gensim.corpora import Dictionary 2 | from gensim.models.ldamodel import LdaModel 3 | import gensim 4 | from gensim import corpora 5 | import pyLDAvis.gensim 6 | 7 | 8 | def main(): 9 | file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250' 10 | mod = LdaModel.load(file) 11 | dict = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab' 12 | vocab = Dictionary.load(dict) 13 | corpfile = 'f:/projects/comperio-text-analytics/models/topicmodel/mojo_lda_100.corp' 14 | corpus = gensim.corpora.MmCorpus(corpfile) 15 | 16 | print mod.show_topic(0) 17 | print mod.id2word 18 | mod.id2word = vocab 19 | 20 | print mod.show_topic(0) 21 | 22 | pydavis = pyLDAvis.gensim.prepare(mod, corpus, vocab) 23 | pyLDAvis.save_html(pydavis, 'pydavis_250_v2_3passes.html') 24 | pyLDAvis.show(pydavis) 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /python-client/bin/NOB_kera.py: -------------------------------------------------------------------------------- 1 | __author__ = 'cvig' 2 | from es_text_analytics.tagger import NOBTagger, install_hunpos 3 | from es_text_analytics.np_extractor import NONPExtractor 4 | from es_text_analytics.kera import extract_keywords 5 | from nltk.tokenize import sent_tokenize 6 | import re 7 | import unicodedata 8 | 9 | def fast_tokenize(str): 10 | return [token.lower() for token in re.findall('[^\W\d_]+', re.sub(ur'[\00a0\n-]', ' ', str), re.MULTILINE|re.UNICODE)] 11 | 12 | def unicode_tokenize(str): 13 | normalized = unicodedata.normalize('NFKC', str) 14 | return normalized.encode('utf-8').lower().split() 15 | #return [token.lower() for token in re.findall('[^\W\d_]+', re.sub('[\n-]', ' ', normalized), re.MULTILINE|re.UNICODE)] 16 | 17 | class NOB_kera(): 18 | def __init__(self): 19 | self.tagger = NOBTagger() 20 | self.chunker = NONPExtractor(tagger=self.tagger, keep_index=True) 21 | 22 | def extract_keywords(self, from_text): 23 | return extract_keywords(from_text, fast_tokenize, sent_tokenize, self.tagger, self.chunker) 24 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_ndt_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from es_text_analytics.data.ndt_dataset import filelist, normalize 5 | 6 | 7 | class TestNDTDatasetHelpers(TestCase): 8 | def test_filelist(self): 9 | files = filelist() 10 | 11 | self.assertEqual(2, len(files)) 12 | self.assertTrue('ndt_1-0_nno.conll' in files) 13 | self.assertTrue('ndt_1-0_nob.conll' in files) 14 | 15 | files = filelist(lang='nob', sections=['parliament']) 16 | 17 | self.assertEqual(1, len(files)) 18 | self.assertTrue('parliament_ndt_1-0_nob.conll' in files) 19 | 20 | def test_normalize(self): 21 | doc = [[1, 'Eg', 'eg', 'pron'], 22 | [2, 'var', 'vere', 'verb'], 23 | [3, u'på', u'på', 'prep'], 24 | [4, 'bibeltime', 'bibeltime', 'subst'], 25 | [5, '.', '$.', 'clb']] 26 | 27 | result = normalize(doc) 28 | self.assertEqual(1, len(result)) 29 | self.assertTrue('content' in result) 30 | self.assertTrue(u'Eg var på bibeltime .' 
in result.values()) 31 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/elasticsearch_dataset.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import tarfile 4 | 5 | from es_text_analytics.data.dataset import Dataset 6 | from elasticsearch.client import Elasticsearch 7 | from elasticsearch.helpers import scan 8 | 9 | """ 10 | Elasticsearch as data source 11 | 12 | """ 13 | 14 | 15 | class ElasticsearchDataset(Dataset): 16 | """ 17 | Class encapsulating using Elasticsearch as datasource. Uses scan/scroll API via the es-py helpers scan. 18 | """ 19 | 20 | def __init__(self, read_index, read_doc_type, index='new_index', doc_type='doc', query=None, dataset_path=None, normalize_func=None): 21 | super(ElasticsearchDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path, normalize_func=normalize_func) 22 | self.dataset_fn = 'elastics' 23 | self.read_index = read_index 24 | self.read_doc_type = read_doc_type 25 | self.query = query 26 | 27 | def _iterator(self): 28 | es = Elasticsearch(timeout=60) 29 | return scan(es, scroll=u'10m', query=self.query, 30 | index=self.read_index, doc_type=self.read_doc_type) 31 | -------------------------------------------------------------------------------- /python-client/sklext/test/test_mutual_information.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from numpy import array 4 | from numpy.ma.testutils import assert_array_approx_equal 5 | from scipy.sparse.csr import csr_matrix 6 | 7 | from sklext.mutual_information import mutual_information, pointwise_mutual_information 8 | 9 | 10 | class TestMutualInformation(TestCase): 11 | def test_mutual_information(self): 12 | X = array([[0, 1], 13 | [1, 0], 14 | [1, 1]]) 15 | y = array([[0, 1], 16 | [1, 0], 17 | [1, 0]]) 18 | 19 | assert_array_approx_equal(mutual_information(X, y), [-0.37489, -0.605939], decimal=3) 20 | assert_array_approx_equal(mutual_information(csr_matrix(X), csr_matrix(y)), [-0.37489, -0.605939], decimal=3) 21 | 22 | def test_pointwise_mutual_information(self): 23 | X = array([[0, 1], 24 | [1, 0], 25 | [1, 1]]) 26 | y = array([[0, 1], 27 | [1, 0], 28 | [1, 0]]) 29 | 30 | assert_array_approx_equal(pointwise_mutual_information(X, y), [0.1178, 0.1178], decimal=3) 31 | assert_array_approx_equal(pointwise_mutual_information(csr_matrix(X), csr_matrix(y)), 32 | [0.1178, 0.1178], decimal=3) 33 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from StringIO import StringIO 3 | from unittest import TestCase 4 | 5 | from es_text_analytics.data.dataset import fn_from_url, parse_conll 6 | 7 | NDT_CONLL_SAMPLE = """ 8 | 1 Nokre nokon det 9 | 2 refleksjonar refleksjon subst 10 | 3 | $| clb 11 | 12 | 1 Eg eg pron 13 | 2 var vere verb 14 | 3 på på prep 15 | 4 bibeltime bibeltime subst 16 | 5 . $. 
clb 17 | 18 | """ 19 | 20 | 21 | class TestDataset(TestCase): 22 | def test_fn_from_url(self): 23 | self.assertEqual(fn_from_url('http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz'), '20news-18828.tar.gz') 24 | 25 | def test_parse_conll(self): 26 | result = list(parse_conll(StringIO(NDT_CONLL_SAMPLE))) 27 | 28 | self.assertEqual(2, len(result)) 29 | self.assertEqual([[1, 'Nokre', 'nokon', 'det'], 30 | [2, 'refleksjonar', 'refleksjon', 'subst'], 31 | [3, '|', '$|', 'clb']], 32 | result[0]) 33 | self.assertEqual([[1, 'Eg', 'eg', 'pron'], 34 | [2, 'var', 'vere', 'verb'], 35 | [3, u'på', u'på', 'prep'], 36 | [4, 'bibeltime', 'bibeltime', 'subst'], 37 | [5, '.', '$.', 'clb']], 38 | result[1]) 39 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_single_doc_sigterms.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from elasticsearch import Elasticsearch 4 | from elasticsearch.client import IndicesClient 5 | 6 | from es_text_analytics.single_doc_sigterms import SingleDocSigTerms 7 | from es_text_analytics.test import es_runner 8 | 9 | 10 | class TestSingleDocSigTerms(TestCase): 11 | def setUp(self): 12 | super(TestSingleDocSigTerms, self).setUp() 13 | 14 | self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port]) 15 | self.ic = IndicesClient(self.es) 16 | self.index = 'single_doc_sigterms_test' 17 | self.doc_type = 'test-doc' 18 | self.field = 'text' 19 | 20 | if self.ic.exists(self.index): 21 | self.ic.delete(self.index) 22 | 23 | self.ic.create(self.index) 24 | self.es.create(self.index, self.doc_type, {self.field: 'foo ba knark foo knirk knark foo'}, id='doc_1') 25 | 26 | def test_tf_for_doc_id(self): 27 | sigterms = SingleDocSigTerms(self.es, self.index, self.doc_type, self.field, None) 28 | 29 | resp = dict(sigterms.tf_for_doc_id('doc_1')) 30 | self.assertEquals(4, len(resp)) 31 | self.assertEquals(3, resp['foo']) 32 | self.assertEquals(2, resp['knark']) 33 | self.assertEquals(1, resp['ba']) 34 | self.assertEquals(1, resp['knirk']) 35 | -------------------------------------------------------------------------------- /python-client/sklext/mutual_information.py: -------------------------------------------------------------------------------- 1 | from math import log, e 2 | 3 | import numpy 4 | from numpy import array, zeros 5 | 6 | from sklext.term_estimators import marginal_estimator, joint_estimator_point, joint_estimator_full 7 | 8 | 9 | def mutual_information(X, y): 10 | num_terms = X.shape[1] 11 | num_classes = y.shape[1] 12 | 13 | p_c = marginal_estimator(y, smoothing=True) 14 | p_t = marginal_estimator(X, smoothing=True) 15 | 16 | p_t_c = joint_estimator_full(X, y, smoothing=True) 17 | 18 | ig = zeros((num_terms)) 19 | 20 | for i in xrange(num_terms): 21 | for j in xrange(num_classes): 22 | ig[i] += p_t_c[0][i, j] * log(p_t_c[0][i, j] / (p_t[i] * p_c[j])) 23 | ig[i] += p_t_c[1][i, j] * log(p_t_c[1][i, j] / (p_t[i] * (1 - p_c[j]))) 24 | ig[i] += p_t_c[2][i, j] * log(p_t_c[2][i, j] / ((1 - p_t[i]) * p_c[j])) 25 | ig[i] += p_t_c[3][i, j] * log(p_t_c[3][i, j] / ((1 - p_t[i]) * (1 - p_c[j]))) 26 | 27 | return ig 28 | 29 | 30 | def pointwise_mutual_information(X, y, normalize=False, k_weight=None, positive=None): 31 | p_c = marginal_estimator(y, smoothing=True) 32 | p_t = marginal_estimator(X, smoothing=True) 33 | 34 | p_t.shape = 2, 1 35 | p_c.shape = 1, 2 36 | 37 | p_t_c = joint_estimator_point(X, y, smoothing=True) 
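    # p_t_c is the smoothed joint estimate P(t, c). Below it is optionally raised to
    # the power k (the PMI^k variant), divided by the outer product p_t * p_c to give
    # PMI, and then optionally normalized by -log P(t, c) (NPMI) or restricted to
    # positive associations via the 'cutoff' (PPMI) and 'exp' options.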
38 | 39 | if k_weight: 40 | p_t_c = p_t_c**k_weight 41 | 42 | m = numpy.log(array(p_t_c) / (p_t * p_c)) 43 | 44 | if normalize: 45 | m = m / -numpy.log(p_t_c) 46 | 47 | if positive is 'cutoff': 48 | m[m < .0] = .0 49 | 50 | if positive is 'exp': 51 | m = e**m 52 | 53 | return array(numpy.max(m, axis=1)).flatten() 54 | -------------------------------------------------------------------------------- /python-client/sklext/term_weighting.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import spdiags 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | 4 | from sklext.cond_prob import conditional_probabilities 5 | from sklext.mutual_information import mutual_information, pointwise_mutual_information 6 | 7 | 8 | class TermWeightTransformer(BaseEstimator, TransformerMixin): 9 | def __init__(self, method='mi', pmi_k=2): 10 | self.method = method 11 | self.pmi_k = pmi_k 12 | 13 | self._weights = None 14 | 15 | def fit(self, X, y): 16 | if self.method is 'mi': 17 | self._weights = mutual_information(X, y) 18 | elif self.method is 'pmi': 19 | self._weights = pointwise_mutual_information(X, y, normalize=False) 20 | elif self.method is 'npmi': 21 | self._weights = pointwise_mutual_information(X, y, normalize=True) 22 | elif self.method is 'ppmi_exp': 23 | self._weights = pointwise_mutual_information(X, y, normalize=True, positive='exp') 24 | elif self.method is 'pmi_k': 25 | self._weights = pointwise_mutual_information(X, y, normalize=True, k_weight=self.pmi_k) 26 | elif self.method is 'ppmi': 27 | self._weights = pointwise_mutual_information(X, y, normalize=False, positive='cutoff') 28 | elif self.method is 'cp_raw': 29 | self._weights = conditional_probabilities(X, y, ratio=False) 30 | elif self.method is 'cp_ratio': 31 | self._weights = conditional_probabilities(X, y, ratio=True) 32 | else: 33 | raise ValueError 34 | 35 | return self 36 | 37 | def transform(self, X, y=None): 38 | p = len(self._weights) 39 | w_diag = spdiags(self._weights, 0, p, p) 40 | 41 | return X * w_diag 42 | -------------------------------------------------------------------------------- /python-client/sklext/test/test_term_estimators.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from numpy import array 4 | from numpy.ma.testutils import assert_array_approx_equal 5 | from scipy.sparse import csr_matrix 6 | 7 | from sklext.term_estimators import joint_estimator_point, joint_estimator_full 8 | 9 | 10 | class TestTermEstimators(TestCase): 11 | def test_joint_estmator_point(self): 12 | X = array([[0, 1], 13 | [1, 0], 14 | [1, 1]]) 15 | y = array([[0, 1], 16 | [1, 0], 17 | [1, 0]]) 18 | 19 | assert_array_approx_equal(joint_estimator_point(X, y), [[.5, 0], [.25, .25]]) 20 | assert_array_approx_equal(joint_estimator_point(csr_matrix(X), csr_matrix(y)), [[.5, 0], [.25, .25]]) 21 | 22 | def test_joint_estimator_full(self): 23 | X = array([[0, 1], 24 | [1, 0], 25 | [1, 1]]) 26 | y = array([[0, 1], 27 | [1, 0], 28 | [1, 0]]) 29 | 30 | assert_array_approx_equal(joint_estimator_full(X, y), 31 | [[[.1667, .0], [.0833, .0833]], 32 | [[.0 , .1667], [.0833, .0833]], 33 | [[.0 , .0833], [.0833, .0]], 34 | [[.0833, .0], [.0, .0833]]], 35 | decimal=3) 36 | assert_array_approx_equal(joint_estimator_full(csr_matrix(X), csr_matrix(y)), 37 | [[[.1667, .0], [.0833, .0833]], 38 | [[.0 , .1667], [.0833, .0833]], 39 | [[.0 , .0833], [.0833, .0]], 40 | [[.0833, .0], [.0, .0833]]], 41 | decimal=3) 
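The estimators and the TermWeightTransformer above can be combined as in the following minimal sketch (illustrative only, not part of the repository); it reuses the toy document-term and class-indicator matrices from the tests with the tested 'mi' weighting scheme:

```python
from numpy import array

from sklext.term_weighting import TermWeightTransformer

# Binary document-term matrix (3 documents x 2 terms) and class-indicator
# matrix (3 documents x 2 classes), mirroring the test fixtures above.
X = array([[0, 1],
           [1, 0],
           [1, 1]])
y = array([[0, 1],
           [1, 0],
           [1, 0]])

# Fit mutual-information weights on (X, y), then rescale the term columns of X.
transformer = TermWeightTransformer(method='mi')
X_weighted = transformer.fit(X, y).transform(X)
```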
-------------------------------------------------------------------------------- /python-client/es_text_analytics/data/newsgroups.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sklearn import datasets 4 | from sklearn.datasets import twenty_newsgroups 5 | 6 | from es_text_analytics.data.dataset import Dataset 7 | 8 | """ 9 | The 20 Newsgroups dataset is a standardized dataset with Newsgroup messages. 10 | 11 | http://qwone.com/~jason/20Newsgroups/ 12 | """ 13 | 14 | NEWSGROUPS_ARCHIVE_URL = 'http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz' 15 | 16 | 17 | def iterator(dataset_fn): 18 | """ 19 | Provides an iterator of parsed documents from the 20 Newsgroups dataset. 20 | 21 | :param dataset_fn: Path to Newsgroups dataset archive file. 22 | :type dataset_fn: unicode|str 23 | :rtype : generator 24 | """ 25 | ng = datasets.fetch_20newsgroups() 26 | 27 | for article, group, target, filename in zip(ng['data'], [ng['target_names'][x] for x in ng['target']], 28 | ng['target'], ng['filenames']): 29 | article = twenty_newsgroups.strip_newsgroup_header(article) 30 | article = twenty_newsgroups.strip_newsgroup_footer(article) 31 | article = twenty_newsgroups.strip_newsgroup_quoting(article) 32 | doc_id = os.path.basename(filename) 33 | 34 | yield {'doc_id': doc_id, 'article': article, 'group': group, 'target': target, 'filename': filename} 35 | 36 | 37 | class NewsgroupsDataset(Dataset): 38 | """ 39 | Class encapsulating the Newsgroups dataset and the information needed to retrieve and index it. 40 | 41 | Currently only downloads and index the dataset in Elasticsearch. 42 | """ 43 | 44 | def __init__(self, index='newsgroups', doc_type='message', dataset_path=None): 45 | super(NewsgroupsDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path) 46 | 47 | 48 | def _iterator(self): 49 | return iterator(self.dataset_fn) 50 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_np_extractor.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from es_text_analytics.np_extractor import NONPExtractor 5 | 6 | 7 | class TestNONPExtractor(TestCase): 8 | def test_tag(self): 9 | extractor = NONPExtractor() 10 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 11 | (u'er', 'VERB'), 12 | (u'vårt', 'DET'), 13 | (u'hus', 'SUBST'), 14 | (u'.', 'PUNKT')]), 15 | [u'hus']) 16 | 17 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 18 | (u'er', 'VERB'), 19 | (u'vårt', 'DET'), 20 | (u'fine', 'ADJ'), 21 | (u'hus', 'SUBST'), 22 | (u'.', 'PUNKT')]), 23 | [[u'fine', u'hus']]) 24 | extractor = NONPExtractor(keep_index=True) 25 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 26 | (u'er', 'VERB'), 27 | (u'vårt', 'DET'), 28 | (u'hus', 'SUBST'), 29 | (u'.', 'PUNKT')]), 30 | [(u'hus', 3)]) 31 | 32 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 33 | (u'er', 'VERB'), 34 | (u'vårt', 'DET'), 35 | (u'fine', 'ADJ'), 36 | (u'hus', 'SUBST'), 37 | (u'.', 'PUNKT')]), 38 | [([u'fine', u'hus'], 3)]) 39 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/single_doc_sigterms.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | 4 | class SingleDocSigTerms: 5 | def __init__(self, es, index, doc_type, 
field, term_weight_provider): 6 | self.es = es 7 | self.index = index 8 | self.doc_type = doc_type 9 | self.field = field 10 | self.term_weight_provider = term_weight_provider 11 | 12 | def tf_for_doc_id(self, doc_id): 13 | resp = self.es.termvectors(index=self.index, doc_type=self.doc_type, id=doc_id, fields=[self.field]) 14 | 15 | if resp['found']: 16 | return [(term, val['term_freq']) for term, val in resp['term_vectors'][self.field]['terms'].items()] 17 | 18 | def by_doc_id_idf(self, doc_id, n=5): 19 | resp = self.es.termvectors(index=self.index, doc_type=self.doc_type, id=doc_id, fields=[self.field], dfs=True, 20 | term_statistics=True, positions=False, offsets=False) 21 | if resp['found']: 22 | termstats=[] 23 | total_doc_term_frequency = sum([val['term_freq'] for term, val in resp['term_vectors'][self.field]['terms'].items()]) 24 | doc_count = resp['term_vectors'][self.field]['field_statistics']['sum_ttf'] 25 | for term, val in resp['term_vectors'][self.field]['terms'].items(): 26 | doc_freq= val['ttf'] 27 | term_doc_freq= val['term_freq'] 28 | term_total_ratio = doc_freq / float(doc_count) 29 | doc_ration = term_doc_freq / float(total_doc_term_frequency) 30 | termstats.append((term, doc_ration/float(term_total_ratio))) 31 | return sorted(termstats, key=itemgetter(1), reverse=True)[0:n] 32 | 33 | def by_doc_id(self, doc_id, n=5): 34 | term_freqs = self.tf_for_doc_id(doc_id) 35 | 36 | if self.term_weight_provider: 37 | weights = self.term_weight_provider[(term for term, _ in term_freqs)] 38 | 39 | term_freqs = [(term, freq*weights[term]) for term, freq in term_freqs] 40 | 41 | return sorted(term_freqs, key=itemgetter(1), reverse=True)[0:n] 42 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/wiki_infobox.py: -------------------------------------------------------------------------------- 1 | __author__ = 'cvig' 2 | #!/usr/bin/env python 3 | # parts of this borrowed from https://github.com/scraperwiki/wikipedia-infobox-tool/blob/master/get_data.py 4 | 5 | import re 6 | 7 | 8 | def clean_data(data): 9 | # Strip square brackets. 10 | data = re.sub('[\[\]]', '', data) 11 | # Strip all HTML tags. 12 | data = re.sub('<[^<]+?>', ' ', data) 13 | data = re.sub('(?i)\{\{cite .*\}\}', '', data) 14 | data = re.sub(' ', '', data) 15 | return data 16 | 17 | 18 | def parse_tags(data): 19 | data = re.sub('(?i)\{\{url\|([^\n]*)\}\}', '\g<1>', data) 20 | data = re.sub('\[\[(.*)\|.*\]\]', '\g<1>', data) 21 | data = re.sub('(?i)\{\{convert\|(.*?)\|(.*?)((\}\})|(\|.*\}\}))', '\g<1> \g<2>', data) 22 | data = re.sub('(?i)\{\{convinfobox\|(.*?)\|(.*?)((\}\})|(\|.*\}\}))', '\g<1> \g<2>', data) 23 | data = re.sub('(?i)\{\{nowrap\|(.*)\}\}', '\g<1>', data) 24 | return data 25 | 26 | 27 | def scrape_infobox(content): 28 | # Remove HTML comment tags. 
29 | content = re.sub('', ' ', content) 30 | 31 | box_occurences = re.split('{{infoboks[^\n}]*\n', content.lower()) 32 | 33 | if len(box_occurences) < 2: 34 | return None 35 | 36 | data = {} 37 | 38 | for box_occurence in box_occurences[1:]: 39 | 40 | infobox_end = re.search('\n[^\n{]*\}\}[^\n{]*\n', box_occurence) 41 | 42 | if infobox_end is None: 43 | return None 44 | 45 | box_occurence = box_occurence[:infobox_end.start():] 46 | box_occurence = re.split('\n[^|\n]*\|', box_occurence) 47 | 48 | for item in box_occurence: 49 | item = parse_tags(item) 50 | item = clean_data(item) 51 | if '=' in item: 52 | pair = item.split('=', 1) 53 | field = pair[0].strip() 54 | field = re.sub('\W', '_', field) 55 | value = pair[1].strip() 56 | field = field.lower().strip() 57 | if len(field) < 20: 58 | if value != '': 59 | data[field] = value 60 | return data 61 | 62 | return data 63 | 64 | -------------------------------------------------------------------------------- /python-client/bin/wordcounts_from_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import sys 4 | 5 | from gensim.corpora import Dictionary 6 | from textblob import TextBlob 7 | 8 | from es_text_analytics.data import newsgroups 9 | from es_text_analytics.data.dataset import download_file, default_dataset_path 10 | from es_text_analytics.data.ndt_dataset import NDTDataset 11 | from es_text_analytics.tokenizer import NOTokenizer 12 | 13 | """ 14 | Generates wordcounts from a dataset. 15 | 16 | Stores the counts in a Gensim Dictionary text file with id, word and count as tab separated fields. 17 | """ 18 | 19 | 20 | NO_TOKENIZER = NOTokenizer() 21 | 22 | def preprocess_ng(doc): 23 | return [w.lower() for w in TextBlob(doc['msg']).words] 24 | 25 | 26 | def preprocess_ndt(doc): 27 | return [w.lower() for w in TextBlob(doc['content'], tokenizer=NO_TOKENIZER).words] 28 | 29 | 30 | def main(): 31 | parser = ArgumentParser() 32 | parser.add_argument('-d', '--dataset') 33 | parser.add_argument('-p', '--dataset-path', default=default_dataset_path()) 34 | parser.add_argument('-o', '--output') 35 | opts = parser.parse_args() 36 | 37 | dataset_name = opts.dataset 38 | dataset_path = opts.dataset_path 39 | out_fn = opts.output 40 | 41 | if not out_fn: 42 | logging.error('--output argument required ...') 43 | parser.print_usage() 44 | sys.exit(1) 45 | 46 | if not dataset_name: 47 | logging.error('--dataset argument required ...') 48 | parser.print_usage() 49 | sys.exit(1) 50 | 51 | if dataset_name == 'newsgroups': 52 | corpus = (preprocess_ng(doc) for doc 53 | in newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path))) 54 | if dataset_name == 'ndt': 55 | dataset = NDTDataset(dataset_path=dataset_path) 56 | dataset.install() 57 | 58 | corpus = (preprocess_ndt(doc) for doc in dataset) 59 | else: 60 | logging.error('Unknown dataset %s ...' 
% dataset_name) 61 | sys.exit(1) 62 | 63 | d = Dictionary(corpus) 64 | d.save_as_text(out_fn, sort_by_word=False) 65 | 66 | 67 | if __name__ == '__main__': 68 | logging.basicConfig(level=logging.INFO) 69 | main() 70 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_decompounder.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from es_text_analytics.decompounder import NOBDecompounder, decompound_inner, flatten_inner, flatten 4 | 5 | 6 | class TestNOBDecompounder(TestCase): 7 | def setUp(self): 8 | super(TestNOBDecompounder, self).setUp() 9 | 10 | self.fullform_index = {'ba': [{'pos': 'SUBST'}], 'bork': [{'pos': 'SUBST'}], 11 | 'borkbork': [{'pos': 'SUBST'}], 'boing': [{'pos': 'PRON'}]} 12 | 13 | 14 | def test_decompound(self): 15 | decompounder = NOBDecompounder(fullform_index=self.fullform_index, min_match=1) 16 | self.assertEqual(['ba', 'bork'], decompounder.decompound('babork')) 17 | self.assertEqual(['ba', 'borkbork'], decompounder.decompound('baborkbork')) 18 | self.assertEqual(['ba', 'ba'], decompounder.decompound('BaBa')) 19 | self.assertEqual(None, decompounder.decompound('BaBaa')) 20 | 21 | def test_decompound_no_prons(self): 22 | decompounder = NOBDecompounder(fullform_index=self.fullform_index, min_match=1) 23 | 24 | self.assertEqual(None, decompounder.decompound('baboing')) 25 | 26 | 27 | class TestDecompounderHelpers(TestCase): 28 | def setUp(self): 29 | super(TestDecompounderHelpers, self).setUp() 30 | 31 | self.fullform_index = {'ba': [{'pos': 'SUBST'}], 'bork': [{'pos': 'SUBST'}], 'borkbork': [{'pos': 'SUBST'}]} 32 | 33 | def test_decompund_inner(self): 34 | self.assertEqual([['ba', ['ba']]], decompound_inner('baba', self.fullform_index, start=0, min_match=1)) 35 | self.assertEqual([['ba']], decompound_inner('baba', self.fullform_index, start=2, min_match=1)) 36 | self.assertEqual([], decompound_inner('baba', self.fullform_index, start=1, min_match=1)) 37 | 38 | def test_flatten_inner(self): 39 | self.assertEqual([['ba', 'ba']], flatten_inner(['ba', ['ba']])) 40 | self.assertEqual([['ba']], flatten_inner(['ba'])) 41 | self.assertEqual([['ba', 'ba'], ['ba', 'foo']], flatten_inner(['ba', ['ba'], ['foo']])) 42 | 43 | def test_flatten(self): 44 | self.assertEqual([['ba', 'ba'], ['ba'], ['ba', 'ba'], ['ba', 'foo']], 45 | flatten([['ba', ['ba']], ['ba'], ['ba', ['ba'], ['foo']]])) -------------------------------------------------------------------------------- /python-client/sklext/term_estimators.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | 3 | import numpy 4 | from numpy import array, sum, zeros 5 | from scipy.sparse import issparse 6 | 7 | 8 | def add_smoothing(m, amount=10 ** -12): 9 | m = m.astype(numpy.float) 10 | m[m == 0] = amount 11 | 12 | return m 13 | 14 | 15 | def marginal_estimator(X, axis=0, smoothing=False): 16 | N = X.shape[axis] 17 | 18 | if issparse(X): 19 | counts = array((X > 0).sum(axis=axis)) 20 | else: 21 | counts = array(sum(X > 0, axis=axis)) 22 | 23 | if smoothing: 24 | add_smoothing(counts) 25 | 26 | p = counts.flatten() / float(N) 27 | 28 | return p 29 | 30 | 31 | def joint_estimator_point(X, y, smoothing=False): 32 | counts = X.T.dot(y) 33 | 34 | if issparse(counts): 35 | counts = array(counts.todense()) 36 | 37 | if smoothing: 38 | counts = add_smoothing(counts) 39 | 40 | return counts / numpy.sum(counts, dtype=numpy.float) 
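# Worked example (illustrative): for the matrices used in the tests,
#   X = [[0, 1], [1, 0], [1, 1]]   (documents x terms)
#   y = [[0, 1], [1, 0], [1, 0]]   (documents x classes)
# X.T.dot(y) gives the term/class co-occurrence counts [[2, 0], [1, 1]], and dividing
# by the grand total (4) yields the joint estimate [[0.5, 0.0], [0.25, 0.25]].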
41 | 42 | 43 | def joint_estimator_full_sparse(X, y, smoothing=False): 44 | _, t = X.shape 45 | _, c = y.shape 46 | 47 | X = X.tolil() 48 | y = y.tolil() 49 | 50 | counts = [zeros((t, c)), zeros((t, c)), zeros((t, c)), zeros((t, c))] 51 | 52 | for t_idx, c_idx in izip(X.rows, y.rows): 53 | t_mask = zeros(t, dtype=numpy.bool) 54 | t_mask[t_idx] = True 55 | c_mask = zeros(c, dtype=numpy.bool) 56 | c_mask[c_idx] = True 57 | 58 | counts[0][t_mask, c_mask] += 1 59 | counts[1][t_mask, ~c_mask] += 1 60 | counts[2][~t_mask, c_mask] += 1 61 | counts[3][~t_mask, ~c_mask] += 1 62 | 63 | if smoothing: 64 | counts = [add_smoothing(m) for m in counts] 65 | 66 | total = numpy.sum([numpy.sum(m) for m in counts], dtype=numpy.float) 67 | 68 | return [m / total for m in counts] 69 | 70 | 71 | def joint_estimator_full(X, y, smoothing=False): 72 | if issparse(X) or issparse(y): 73 | return joint_estimator_full_sparse(X, y, smoothing=smoothing) 74 | 75 | counts = [xx.T.dot(yy) for xx, yy in zip([X, X, 1 - X, 1 - X], [y, 1 - y, y, 1 - y])] 76 | 77 | if smoothing: 78 | counts = [add_smoothing(m) for m in counts] 79 | 80 | total = numpy.sum([numpy.sum(m) for m in counts], dtype=numpy.float) 81 | 82 | return [m / total for m in counts] -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_tagger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | from unittest import TestCase 4 | 5 | from es_text_analytics.tagger import obt_to_universal_tag, parse_hunpos_train_output, NOBTagger 6 | from es_text_analytics.tagger import NNO_TAGGER_DEFAULT_MODEL_FN, NNOTagger, NOB_TAGGER_DEFAULT_MODEL_FN 7 | 8 | HUNPUS_OUTPUT_SAMPLE = """ 9 | reading training corpus 10 | compiling probabilities 11 | constructing suffix guesser 12 | saving the model 13 | Traning corpus: 14 | 614375 tokens 15 | 37620 sentences 16 | 21 different tag 17 | 18 | Guesser trained with 19 | 71042 lowercase 20 | 40359 uppercase tokens 21 | theta = 0.0728040512355 22 | """ 23 | 24 | class TestTaggerHelpers(TestCase): 25 | def test_obt_to_universal_tag(self): 26 | self.assertEqual('VERB', obt_to_universal_tag('skildre', 'verb', 'inf')) 27 | self.assertEqual('PRON', obt_to_universal_tag('det', 'pron', u'pers|3|nøyt|eint')) 28 | self.assertEqual('PUNCT', obt_to_universal_tag(',', '', '_')) 29 | 30 | def test_parse_hunpos_train_output(self): 31 | self.assertEqual({'tokens': 614375, 32 | 'sentences': 37620, 33 | 'tag_card': 21, 34 | 'n_lower': 71042, 35 | 'n_upper': 40359, 36 | 'theta': 0.0728040512355, 37 | 'errors': []}, 38 | parse_hunpos_train_output(HUNPUS_OUTPUT_SAMPLE)) 39 | 40 | 41 | class TestNOBTagger(TestCase): 42 | def test_tag(self): 43 | if os.path.exists(NOB_TAGGER_DEFAULT_MODEL_FN): 44 | tagger = NOBTagger() 45 | self.assertEqual([(u'Dette', 'PRON_PERS'), 46 | (u'er', 'VERB'), 47 | (u'vårt', 'DET'), 48 | (u'hus', 'SUBST'), 49 | (u'.', 'PUNKT')], 50 | tagger.tag(u'Dette er vårt hus.')) 51 | else: 52 | self.skipTest('NOBTagger default model not found in %s' % NOB_TAGGER_DEFAULT_MODEL_FN) 53 | 54 | 55 | class TestNNOTagger(TestCase): 56 | def test_tag(self): 57 | if os.path.exists(NNO_TAGGER_DEFAULT_MODEL_FN): 58 | tagger = NNOTagger() 59 | self.assertEqual([(u'Røyndommen', 'SUBST'), 60 | (u'rammar', 'VERB'), 61 | (u'alle', 'DET'), 62 | (u'.', 'PUNKT')], 63 | tagger.tag(u'Røyndommen rammar alle.')) 64 | else: 65 | self.skipTest('NNOTagger default model not found in %s' % NNO_TAGGER_DEFAULT_MODEL_FN) 66 | 
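For reference, the taggers exercised above are normally combined with the NP extractor (and further on with KERA keyword extraction). A minimal sketch, assuming hunpos and the default Bokmål model have been installed via install_hunpos() and the build-all-models script:

```python
# Illustrative sketch only; the exact tags and chunks depend on the trained model.
from es_text_analytics.tagger import NOBTagger
from es_text_analytics.np_extractor import NONPExtractor

tagger = NOBTagger()
extractor = NONPExtractor(tagger=tagger)

# Noun-phrase chunks; adjacent ADJ + SUBST tokens are merged into one chunk,
# e.g. [u'lunsj', [u'fine', u'været']]
print extractor.extract(u'Vi spiste lunsj ute i det fine været.')
```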
-------------------------------------------------------------------------------- /python-client/bin/index_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import sys 4 | 5 | from elasticsearch.client import Elasticsearch 6 | 7 | from es_text_analytics.data.aviskorpus import AviskorpusDataset 8 | from es_text_analytics.data.ndt_dataset import NDTDataset 9 | from es_text_analytics.data.newsgroups import NewsgroupsDataset 10 | 11 | """ 12 | Script for retrieving and indexing datasets. 13 | 14 | Current datasets supported: 15 | - 20 Newsgroups (newsgroups) 16 | - Norsk Aviskorpus (aviskorpus), sections and sources can be specified with f.ex. -s 1|2-aa|vg|db 17 | - Norwegian Dependency Treebank (ndt), sections and languages can be specified with f.ex -s newspaper|blog-nob 18 | """ 19 | 20 | 21 | def main(): 22 | parser = ArgumentParser() 23 | parser.add_argument('-e', '--elasticsearch-server', default='localhost:9200') 24 | parser.add_argument('-d', '--dataset') 25 | parser.add_argument('-s', '--sections') 26 | opts = parser.parse_args() 27 | 28 | es_hosts = [opts.elasticsearch_server] 29 | dataset_name = opts.dataset 30 | dataset_sections = opts.sections 31 | 32 | es = Elasticsearch(hosts=es_hosts, timeout=120) 33 | 34 | if dataset_name == 'newsgroups': 35 | dataset = NewsgroupsDataset() 36 | elif dataset_name == 'aviskorpus': 37 | sections = None 38 | sources = None 39 | 40 | if dataset_sections: 41 | try: 42 | sections, sources = dataset_sections.split('-') 43 | sections = [int(s) for s in sections.split('|')] 44 | sources = [s for s in sources.split('|')] 45 | except Exception: 46 | logging.error('Malformed section specification "%s" ...' % dataset_sections) 47 | sys.exit(1) 48 | 49 | dataset = AviskorpusDataset(sections=sections, sources=sources) 50 | elif dataset_name == 'ndt': 51 | sections = None 52 | lang = None 53 | 54 | if dataset_sections: 55 | try: 56 | sections, lang = dataset_sections.split('-') 57 | sections = [int(s) for s in sections.split('|')] 58 | lang = [s for s in lang.split('|')] 59 | except Exception: 60 | logging.error('Malformed section specification "%s" ...' % dataset_sections) 61 | sys.exit(1) 62 | 63 | dataset = NDTDataset(lang=lang, sections=sections) 64 | else: 65 | logging.error('Unknown dataset %s ...' % dataset_name) 66 | sys.exit(1) 67 | 68 | dataset.install(es) 69 | 70 | 71 | if __name__ == '__main__': 72 | logging.basicConfig(level=logging.INFO) 73 | 74 | main() 75 | -------------------------------------------------------------------------------- /python-client/bin/build_no_tagger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | from argparse import ArgumentParser 4 | import os 5 | import sys 6 | import datetime 7 | 8 | from es_text_analytics.data.ndt_dataset import NDTDataset 9 | from es_text_analytics.tagger import train_hunpos_model, FEATURES_MAP 10 | 11 | 12 | 13 | 14 | # Trains a Norwegian part-of-speech tagger with the NDT dataset. 15 | # The tagger is trained on the combined Bokmål and Nynorsk material. 16 | 17 | # Arguments: 18 | # -f, --features The normalized feature set, no-feats, simple or universal. See tagger.py for details. 19 | # -m, --model-file Where to save the resulting model. Crearet a default filename in the current directory 20 | # if omitted. 21 | # -d, --dataset-file Where to find the NDT dataset. Uses default location if omitted. 
22 | # -l, --language Which language training set to use: nob (bokmål), nno (nynorsk) or both. 23 | 24 | FIELDS = ['form', 'postag', 'feats'] 25 | 26 | 27 | def main(): 28 | parser = ArgumentParser() 29 | parser.add_argument('-f', '--features') 30 | parser.add_argument('-m', '--model-file') 31 | parser.add_argument('-d', '--dataset-file') 32 | parser.add_argument('-l', '--language', default='nob') 33 | 34 | args = parser.parse_args() 35 | 36 | features = args.features 37 | model_fn = args.model_file 38 | dataset_fn = args.dataset_file 39 | lang = args.language 40 | 41 | if not features in FEATURES_MAP: 42 | logging.error('Unknown feature identifier %s (one of <%s>) ...' 43 | % (features, '|'.join(FEATURES_MAP.keys()))) 44 | sys.exit(1) 45 | 46 | if dataset_fn and not os.path.exists(dataset_fn): 47 | logging.error('Could not find NDT dataset archive %s ...' % dataset_fn) 48 | sys.exit(1) 49 | 50 | if not model_fn: 51 | # noinspection PyUnresolvedReferences 52 | model_fn = 'no-ndt-hunpos-%s-%s' % (features, datetime.now().strftime("%Y-%m-%d-%H-%M")) 53 | 54 | if not lang in ['nob', 'nno', 'both']: 55 | logging.error('Uknown language %s (one of <%s>) ...' % (lang), '|'.join(['nob', 'nno', 'both'])) 56 | sys.exit(1) 57 | 58 | if lang == 'both': 59 | lang = None 60 | 61 | if dataset_fn: 62 | dataset = NDTDataset(dataset_fn=dataset_fn, normalize_func=None, fields=FIELDS, lang=lang) 63 | else: 64 | dataset = NDTDataset(normalize_func=None, fields=FIELDS, lang=lang) 65 | dataset.install() 66 | 67 | pos_norm_func = FEATURES_MAP[features] 68 | seq_gen = ([(form, pos_norm_func(form, pos, feats)) for form, pos, feats in sent] for sent in dataset) 69 | 70 | stats = train_hunpos_model(seq_gen, model_fn) 71 | 72 | # print the stats from the hunpos output 73 | for k, v in stats.items(): 74 | print '%s:\t%s' % (k, v) 75 | 76 | 77 | if __name__ == '__main__': 78 | logging.basicConfig(level=logging.INFO) 79 | 80 | main() 81 | -------------------------------------------------------------------------------- /provision/neo4j-server.properties: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # Neo4j 3 | # 4 | # neo4j-server.properties - runtime operational settings 5 | # 6 | ################################################################ 7 | 8 | #*************************************************************** 9 | # Server configuration 10 | #*************************************************************** 11 | 12 | # location of the database directory 13 | org.neo4j.server.database.location=data/graph.db 14 | 15 | # Low-level graph engine tuning file 16 | org.neo4j.server.db.tuning.properties=conf/neo4j.properties 17 | 18 | # Let the webserver only listen on the specified IP. Default is localhost (only 19 | # accept local connections). Uncomment to allow any connection. Please see the 20 | # security section in the neo4j manual before modifying this. 
21 | org.neo4j.server.webserver.address=0.0.0.0 22 | 23 | # Require (or disable the requirement of) auth to access Neo4j 24 | dbms.security.auth_enabled=true 25 | 26 | # 27 | # HTTP Connector 28 | # 29 | 30 | # http port (for all data, administrative, and UI access) 31 | org.neo4j.server.webserver.port=7474 32 | 33 | # 34 | # HTTPS Connector 35 | # 36 | 37 | # Turn https-support on/off 38 | org.neo4j.server.webserver.https.enabled=true 39 | 40 | # https port (for all data, administrative, and UI access) 41 | org.neo4j.server.webserver.https.port=7473 42 | 43 | # Certificate location (auto generated if the file does not exist) 44 | org.neo4j.server.webserver.https.cert.location=conf/ssl/snakeoil.cert 45 | 46 | # Private key location (auto generated if the file does not exist) 47 | org.neo4j.server.webserver.https.key.location=conf/ssl/snakeoil.key 48 | 49 | # Internally generated keystore (don't try to put your own 50 | # keystore there, it will get deleted when the server starts) 51 | org.neo4j.server.webserver.https.keystore.location=data/keystore 52 | 53 | # Comma separated list of JAX-RS packages containing JAX-RS resources, one 54 | # package name for each mountpoint. The listed package names will be loaded 55 | # under the mountpoints specified. Uncomment this line to mount the 56 | # org.neo4j.examples.server.unmanaged.HelloWorldResource.java from 57 | # neo4j-server-examples under /examples/unmanaged, resulting in a final URL of 58 | # http://localhost:7474/examples/unmanaged/helloworld/{nodeId} 59 | #org.neo4j.server.thirdparty_jaxrs_classes=org.neo4j.examples.server.unmanaged=/examples/unmanaged 60 | 61 | 62 | #***************************************************************** 63 | # HTTP logging configuration 64 | #***************************************************************** 65 | 66 | # HTTP logging is disabled. HTTP logging can be enabled by setting this 67 | # property to 'true'. 68 | org.neo4j.server.http.log.enabled=false 69 | 70 | # Logging policy file that governs how HTTP log output is presented and 71 | # archived. Note: changing the rollover and retention policy is sensible, but 72 | # changing the output format is less so, since it is configured to use the 73 | # ubiquitous common log format 74 | org.neo4j.server.http.log.config=conf/neo4j-http-logging.xml 75 | 76 | 77 | #***************************************************************** 78 | # Administration client configuration 79 | #***************************************************************** 80 | 81 | # location of the servers round-robin database directory. Possible values: 82 | # - absolute path like /var/rrd 83 | # - path relative to the server working directory like data/rrd 84 | # - commented out, will default to the database data directory. 
85 | org.neo4j.server.webadmin.rrdb.location=data/rrd 86 | -------------------------------------------------------------------------------- /python-client/bin/build_LDA_kera_from_wiki.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys 3 | from operator import itemgetter 4 | 5 | sys.path.append('F:\projects\comperio-text-analytics\python-client') 6 | from elasticsearch import Elasticsearch 7 | import pandas as pd 8 | from gensim.models.ldamodel import LdaModel 9 | from gensim.corpora.dictionary import Dictionary 10 | from gensim.models.tfidfmodel import TfidfModel 11 | import logging 12 | from NOB_kera import NOB_kera 13 | 14 | num_words_from_topic = 20 15 | num_results_from_es = 5 16 | modelfile = 'F:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250' 17 | vocabulary = 'F:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab' 18 | 19 | def flatten(x): 20 | if isinstance(x, tuple): 21 | return " ".join([i for i in x]).lower() 22 | else: 23 | return x.lower() 24 | 25 | 26 | def add_keywords(results, kera): 27 | for topicresult in results: 28 | toptitle = '' 29 | kkw = {} 30 | logging.debug(topicresult['topics'][0:300]) 31 | for hits in topicresult['result']['hits']['hits']: 32 | title = hits['_source']['title'] 33 | topbody = hits['_source']['article'] 34 | toptitle += title + ' _ ' 35 | kwlist = kera.extract_keywords(toptitle + topbody) 36 | kw = dict(kwlist) 37 | logging.debug(kw) 38 | logging.debug("t: %s len kw:%d" % (toptitle, len(kw))) 39 | for keyword, keyvalue in kw.iteritems(): 40 | if keyword in kkw: 41 | kkw[keyword] += float(keyvalue) 42 | else: 43 | kkw[keyword] = float(keyvalue) 44 | kkw = sorted(kkw.items(), key=itemgetter(1), reverse=True) 45 | logging.debug(kkw) 46 | logging.debug('kkw %d' % len(kkw)) 47 | topicresult['keywords'] = kkw 48 | topicresult['keyword_string'] = " ".join([flatten(k_kw[0]) for k_kw in kkw]) 49 | topicresult['titles'] = toptitle 50 | return results 51 | 52 | 53 | def get_doc_topics(ldamodel, num_topics, num_words_from_topic, vocab, tfidfmodel): 54 | for num_topic in range(num_topics): 55 | topics = ldamodel.show_topic(num_topic, num_words_from_topic) 56 | # filter out high/low frequent words from the vocabulary 57 | 58 | toks = [topic[1] for topic in topics] 59 | logging.debug(toks) 60 | tfidf = tfidfmodel[vocab.doc2bow(toks)] 61 | # cut off 10 percent from top and bottom 62 | cutoff = int(num_words_from_topic * 0.1) 63 | outdoc = [vocab.get(wd[0]) for wd in sorted(tfidf, key=itemgetter(1), reverse=True)[cutoff:num_words_from_topic-cutoff]] 64 | logging.debug(outdoc) 65 | ss = set(toks) 66 | sb = set(outdoc) 67 | logging.debug(ss.difference(sb)) 68 | yield (' '.join(outdoc), num_topic) 69 | 70 | 71 | def main(): 72 | logformat = '%(asctime)s %(name)-12s: %(message)s' 73 | logging.basicConfig(level=logging.DEBUG, format=logformat) 74 | kera = NOB_kera() 75 | es = Elasticsearch(port=9201) 76 | mod = LdaModel.load(modelfile) 77 | vocab = Dictionary.load(vocabulary) 78 | tfidf = TfidfModel(dictionary=vocab) 79 | results = [] 80 | for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf): 81 | res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es) 82 | results.append({'topics': topics, 'result': res, 'topicid': topicid}) 83 | results = add_keywords(results, kera) 84 | df = pd.DataFrame(results) 85 | 
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
86 | 
87 | 
88 | main()
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Comperio text analytics [![Build Status](https://travis-ci.org/comperiosearch/comperio-text-analytics.svg?branch=master)](https://travis-ci.org/comperiosearch/comperio-text-analytics)
2 | 
3 | Elasticsearch-based text analytics.
4 | 
5 | Implementation of:
6 | 
7 | * Single document significant terms - [Trello board](https://trello.com/c/nrO8QIp9) (private)
8 | * Classification - [Trello board](https://trello.com/c/PU7XqsTi) (private)
9 | * Sentiment analysis - [Trello board](https://trello.com/c/C8H5fBcJ) (private)
10 | 
11 | ## Norwegian linguistics support for text analytics
12 | 
13 | There is currently partial experimental support for some linguistic analysis of Norwegian Bokmål. This
14 | support depends on the following resources:
15 | 
16 | * Norwegian Dependency Treebank (NDT) (freely available, permissive licensing).
17 | * Norsk Ordbank (available on request, GPL or commercial licensing).
18 | 
19 | Norsk Ordbank must be obtained separately and unzipped in the data directory for it to be used automatically
20 | by linguistic processing components.
21 | 
22 | ### Tokenizer
23 | 
24 | Currently a simple application of the UAX29 standard for Unicode tokenization. Will be expanded to handle hyphens
25 | in accordance with Norwegian norms.
26 | 
27 | ```python
28 | tokenizer = NOTokenizer()
29 | tokenizer.tokenize(u'Vi er konsulenter, med fokus på søk!')
30 | 
31 | [u'Vi',
32 | u'er',
33 | u'konsulenter',
34 | u',',
35 | u'med',
36 | u'fokus',
37 | u'på',
38 | u'søk',
39 | u'!']
40 | ```
41 | 
42 | ### Part of speech annotation
43 | 
44 | Adds part of speech descriptions. The default annotation is a very simplified version of the one used by Norsk Ordbank
45 | and NDT.
46 | 
47 | ```python
48 | tagger = NOBTagger()
49 | tagger.tag(u'Vi spiste lunsj ute i det fine været.')
50 | 
51 | [(u'Vi', 'PRON_PERS'),
52 | (u'spiste', 'VERB'),
53 | (u'lunsj', 'SUBST'),
54 | (u'ute', 'PREP'),
55 | (u'i', 'PREP'),
56 | (u'det', 'DET'),
57 | (u'fine', 'ADJ'),
58 | (u'været', 'SUBST'),
59 | (u'.', 'PUNKT')]
60 | ```
61 | 
62 | Evaluation of the tagger precision is forthcoming, but users should expect a reasonable error rate given the
63 | limited training data available.
64 | 
65 | ### Lemmatization
66 | 
67 | Based on Norsk Ordbank. It is possible to pass the part of speech tag in order to disambiguate words which can
68 | have more than one lemma form.
69 | 
70 | ```python
71 | sent = tagger.tag(u'Vi er godt forberedt.')
72 | [(word, lem.lemmatize(word, tag)) for word, tag in sent]
73 | 
74 | [(u'Vi', u'vi'),
75 | (u'er', u'være'),
76 | (u'godt', u'god'),
77 | (u'forberedt', u'forberedt'),
78 | (u'.', u'.')]
79 | ```
80 | 
81 | ### Decompounder
82 | 
83 | Simple heuristics-based decompounder built on the word forms in Norsk Ordbank. This can overgenerate, so it should
84 | primarily be used on well-formed text.
85 | 86 | ```python 87 | dec = NOBDecompounder() 88 | dec.decompound(u'lampekostbatteri'), dec.decompound(u'søkekonsulenter') 89 | 90 | [u'lampe', u'kost', u'batteri'], [u'søke', u'konsulenter'] 91 | 92 | ``` 93 | 94 | ## Vagrant development server 95 | 96 | To set up a Vagrant development server, run: 97 | 98 | ``` 99 | vagrant up 100 | fab vagrant provision_server 101 | ``` 102 | 103 | 104 | ## Installation notes 105 | 106 | To use the tagger, decompounder and lemmatizer, you will need to download Norsk Ordbank. 107 | You can download it by registering at [http://www.edd.uio.no/prosjekt/ordbanken/](http://www.edd.uio.no/prosjekt/ordbanken/) 108 | 109 | You will need to build models for the tagger by running: 110 | 111 | from es_text_analytics.tagger import install_hunpos 112 | install_hunpos() 113 | comperio-text-analytics\python-client\bin\build-all-models.bat 114 | 115 | 116 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/np_extractor.py: -------------------------------------------------------------------------------- 1 | from textblob.base import BaseNPExtractor 2 | 3 | """ 4 | Minimal NP chunker for Norwegian adapted from the TextBlob FastNPExtractor. 5 | 6 | Compatible with the TextBlob API. 7 | """ 8 | 9 | CFG = { 10 | ('SUBST_PROP', 'SUBST_PROP'): 'SUBST_PROP', 11 | ('SUBST', 'SUBST'): 'SUBSTP', 12 | ('SUBSTP', 'SUBST'): 'SUBSTP', 13 | ('ADJ', 'ADJ'): 'ADJ', 14 | ('ADJ', 'SUBST'): 'SUBSTP', 15 | } 16 | 17 | 18 | def force_list(item): 19 | """ 20 | Wrap the passed argument in a list if it is not a list. 21 | 22 | :param item: Anything. 23 | :return: List wrapping any non-list item passed. 24 | """ 25 | if not isinstance(item, list): 26 | return [item] 27 | else: 28 | return item 29 | 30 | 31 | def extract(tagged_tokens, keep_index=False): 32 | """ 33 | Extract NP chunks from a tagged sequence of tokens. 34 | 35 | This method uses a simple CFG over POS tags. 36 | 37 | :param tagged_tokens: A sequence of token/tag pairs from the NNO or NOB tagger. 38 | :type tagged_tokens: list[(str|unicode, str|unicode)] 39 | :param keep_index: Return token index positions for chunks. 40 | :type keep_index: bool 41 | :rtype : list[str|unicode|list[str|unicode]|(str|unicode|list[str|unicode], int)] 42 | :return: A list of NP chunks as strings with the complete phrase. Chunks can be strings for single token chunks, 43 | lists of strings for multiple tokens or a chunk/index tuple if keep_index is set to True. 44 | """ 45 | merge = True 46 | while merge: 47 | merge = False 48 | for x in range(0, len(tagged_tokens) - 1): 49 | t1 = tagged_tokens[x] 50 | t2 = tagged_tokens[x + 1] 51 | key = t1[1], t2[1] 52 | value = CFG.get(key, '') 53 | 54 | if value: 55 | merge = True 56 | tagged_tokens.pop(x) 57 | tagged_tokens.pop(x) 58 | match = force_list(t1[0]) + force_list(t2[0]) 59 | pos = value 60 | # noinspection PyTypeChecker 61 | tagged_tokens.insert(x, (match, pos)) 62 | break 63 | 64 | matches = [] 65 | index = 0 66 | 67 | for t in tagged_tokens: 68 | if t[1] in ['SUBST', 'SUBST_PROP', 'SUBSTP']: 69 | if keep_index: 70 | value = (t[0], index) 71 | else: 72 | value = t[0] 73 | 74 | matches.append(value) 75 | 76 | if isinstance(t[0], list):  # advance the index by the number of tokens in a merged chunk 77 | index += len(t[0]) 78 | else: 79 | index += 1 80 | 81 | return matches 82 | 83 | 84 | class NONPExtractor(BaseNPExtractor): 85 | """ 86 | Simple NP extractor similar to FastNPExtractor in TextBlob.
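    Illustrative usage (a sketch only; it assumes the NOB tagger models have been built as described
    in the readme, and the exact chunks depend on the tagger output):

        from es_text_analytics.tagger import NOBTagger

        extractor = NONPExtractor(tagger=NOBTagger())
        extractor.extract(u'Vi spiste lunsj ute i det fine været.')
        # e.g. [u'lunsj', [u'fine', u'været']]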
87 | """ 88 | def __init__(self, tagger=None, keep_index=False): 89 | """ 90 | :param tagger: If initialized a tagger instance extract arguments will be processed with this tagger. 91 | Otherwise the extract method expects tagged input. 92 | :type tagger: None|textblob.base.BaseTagger 93 | :param keep_index: Return token index positions for chunks. 94 | :type keep_index: bool 95 | """ 96 | self.tagger = tagger 97 | self.keep_index = keep_index 98 | 99 | def extract(self, tokens): 100 | """ 101 | Extract NP chunks from passed tokens. 102 | 103 | :param tokens: Tokens as untagged string or pretagged list of token/tag pairs according to tagger configuration. 104 | :type tokens: str|list[(str|unicode, str|unicode)] 105 | :rtype : list[str|unicode] 106 | :return: 107 | """ 108 | if self.tagger: 109 | tokens = self.tagger.tag(tokens) 110 | 111 | return extract(tokens, keep_index=self.keep_index) 112 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/kera.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from operator import itemgetter 3 | 4 | from nltk import BigramAssocMeasures, BigramCollocationFinder 5 | 6 | 7 | # TODO A wrapper class could encapsulate default configurations. 8 | 9 | 10 | def extract_keywords(string, tokenizer, sent_tokenizer, tagger, extractor, proper_noun_tag='SUBST_PROP'): 11 | """ 12 | Implements KERA keyword extraction algorithm. 13 | 14 | See: https://www.ida.org/~/media/Corporate/Files/Publications/IDA_Documents/ITSD/ida-document-ns-d-4931.pdf 15 | 16 | Basic implementation of the procedure described in the paper. 17 | Probably needs some refinements in order to be more broadly effective. 18 | 19 | :param string: Document to analyze. 20 | :type string: str|unicode 21 | :param tokenizer: Function that returns a token segmentation as an iterable of strings given a string.. 22 | :type tokenizer: (str|unicode) -> list[str|unicode] 23 | :param sent_tokenizer: Function that returns a sentence segmentation as an iterable of strings given a string. 24 | :type sent_tokenizer: (str|unicode) -> list[str|unicode] 25 | :param tagger: TextBlob compatible POS tagger. Must accept untokenized sentences. 26 | :type tagger: textblob.base.BaseTagger 27 | :param extractor: TextBlob compatible noun phrase extractor. Must accept untokenized sentences and use the same 28 | POS tagger which is passed as the tagger parameter. 29 | :type extractor: textblob.base.BaseNPExtractor 30 | :param proper_noun_tag: POS tag indicating proper nouns. 31 | :type proper_noun_tag: str|unicode 32 | :return: List of keyword/score tuples. Keyword may be a string or tuple of strings. 
33 | :rtype : list[(str|unicode|(str|unicode)), float] 34 | """ 35 | # find bigram collocations 36 | bigram_measures = BigramAssocMeasures() 37 | finder = BigramCollocationFinder.from_words(tokenizer(string)) 38 | collocations = finder.score_ngrams(bigram_measures.likelihood_ratio)[0:50] 39 | 40 | # find noun phrases 41 | phrases = [extractor.extract(s) for s in sent_tokenizer(string)] 42 | phrases = [item for sublist in phrases for item in sublist] 43 | 44 | # find proper noun tokens, collect total/frequency for weighting/normalization 45 | sents = [tagger.tag(s) for s in sent_tokenizer(string)] 46 | sents = [item for sublist in sents for item in sublist] 47 | 48 | proper_nouns = [] 49 | 50 | np_doc_len = 0 51 | 52 | for i, (token, tag) in enumerate(sents): 53 | np_doc_len += 1 54 | 55 | if tag == proper_noun_tag: 56 | proper_nouns.append((token, i)) 57 | 58 | # find noun phrase/collocation overlap 59 | phrase_strings = [' '.join(x[0]).lower() for x in phrases if isinstance(x[0], list)] 60 | collocations = [c for c in collocations if ' '.join(c[0]) in phrase_strings] 61 | 62 | ranks = [] 63 | 64 | # calculate combined index score and normalized collocation score for collocations 65 | coll_score_total = sum([x[1] for x in collocations]) 66 | coll_doc_len = len(tokenizer(string)) 67 | 68 | for coll, coll_score in collocations: 69 | idx = phrases[phrase_strings.index(' '.join(coll))][1] 70 | 71 | alpha = coll_score / coll_score_total 72 | beta = 1 - (float(idx) / coll_doc_len) 73 | 74 | score = 2 * alpha * beta / (alpha + beta) 75 | 76 | ranks.append((coll, score)) 77 | 78 | # calculate combined index score and normalized term frequency score for proper nouns 79 | np_strings = [x[0] for x in proper_nouns] 80 | np_counts = Counter(np_strings) 81 | np_total = len(proper_nouns) 82 | 83 | # only use normalize over the same number of proper nouns as collocations in order to keep 84 | # the scores roughly comparable. 85 | # TODO There are rarely more proper names than collocations. Handle this too. 86 | for np, count in sorted(np_counts.items(), key=itemgetter(1), reverse=True)[0:len(collocations)]: 87 | idx = proper_nouns[np_strings.index(np)][1] 88 | 89 | alpha = float(count) / np_total 90 | beta = 1 - (float(idx) / np_doc_len) 91 | 92 | score = 2 * alpha * beta / (alpha + beta) 93 | 94 | ranks.append((np, score)) 95 | 96 | # return list of keywords and scores sorted by score 97 | return sorted(ranks, key=itemgetter(1), reverse=True) 98 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/ndt_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | from operator import itemgetter 4 | import os 5 | from tarfile import TarFile 6 | 7 | from es_text_analytics.data.dataset import Dataset, parse_conll, CONLL_U_FIELDS 8 | 9 | NDT_ARCHIVE_URL='http://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz' 10 | 11 | 12 | def filelist(lang=None, sections=None): 13 | """ 14 | Generate a list of filenames corresponding to languages (Nynorsk and Bokmål) 15 | and source sections in the Treebank. Default is to include all lsnguages 16 | and sections. 17 | 18 | :param lang: 19 | :type lang: str|unicode|None 20 | :param sections: 21 | :type sections: list[str|unicode]|None 22 | :rtype : list[str|unicode] 23 | :return: list of filenames corresponding to the specified Treebank content. 
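    For example, under these naming conventions filelist(lang='nob', sections=['blog']) is expected
    to return ['blog_ndt_1-0_nob.conll'], while the default call returns the two whole-treebank
    files ['ndt_1-0_nob.conll', 'ndt_1-0_nno.conll'] (whether these files are present depends on
    the NDT release).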
24 | """ 25 | files = [] 26 | 27 | if not sections: 28 | sections = ['ndt_1-0'] 29 | else: 30 | sections = ['%s_ndt_1-0' % s for s in sections] 31 | 32 | if not lang: 33 | lang = ['nob', 'nno'] 34 | else: 35 | lang = [lang] 36 | 37 | for s in sections: 38 | for l in lang: 39 | files.append('%s_%s.conll' % (s, l)) 40 | 41 | return files 42 | 43 | 44 | def iterator(dataset_fn, sections=None, lang=None, field_indices=None): 45 | """ 46 | Provides an iterator of CONLL formatted sentences from NDT. 47 | 48 | :param dataset_fn: Path to Newsgroups dataset archive file. 49 | :type dataset_fn: unicode|str 50 | :param sections: 51 | :type sections: list[str|unicode]|None 52 | :param lang: 53 | :type lang: list[str|unicode]|None 54 | :rtype : generator 55 | """ 56 | files = filelist(lang=lang, sections=sections) 57 | 58 | with TarFile.open(dataset_fn, 'r:gz') as f: 59 | for member in f: 60 | if member.isfile() and os.path.basename(member.name) in files: 61 | logging.info('parsing %s ...' % member.name) 62 | m_f = f.extractfile(member) 63 | 64 | for sentence in parse_conll(m_f, field_indices=field_indices): 65 | yield sentence 66 | 67 | m_f.close() 68 | 69 | 70 | def normalize(doc): 71 | """ 72 | Normalize a treebank sentence to a string with the token forms. 73 | 74 | :param doc: Parsed CONLL sentence. 75 | :type doc: list[list] 76 | :rtype : dict[str|unicode, str|unicode] 77 | :return: A document dict with the normalized sentence in the 'content' key. 78 | """ 79 | return {'content': u' '.join(map(itemgetter(1), doc))} 80 | 81 | 82 | class NDTDataset(Dataset): 83 | """ 84 | Class encapsulating the Norwegian Dependency Treebank. Uses the main CONLL data files. 85 | See http://www.nb.no/sprakbanken/show?serial=sbr-10&lang=nb for details. 86 | """ 87 | 88 | 89 | def __init__(self, index='ndt', doc_type='sentence', dataset_path=None, 90 | dataset_fn=None, lang=None, sections=None, fields=None, 91 | normalize_func=normalize): 92 | """ 93 | Default includes all sections, languages and fields. 94 | 95 | :param sections: Sections to include (blog, newspaper, partliament, report). 96 | :type sections: list[str|unicode]|None 97 | :param lang: Languages to include (nno, nob). 98 | :type lang: list[str|unicode]|None 99 | :param fields: Columns to include (index, form, lemma, cpostag, postag, feats, head, deprel, deps, misc). 
100 | :type fields: list[str|unicode]|None 101 | """ 102 | super(NDTDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path, 103 | dataset_fn=dataset_fn, normalize_func=normalize_func) 104 | 105 | self.archive_fn = NDT_ARCHIVE_URL 106 | self.field_indices = None 107 | self.fields = CONLL_U_FIELDS 108 | 109 | if fields: 110 | self.fields = fields 111 | self.field_indices = [CONLL_U_FIELDS.index(f) for f in fields] 112 | 113 | self.sections = sections 114 | self.lang = lang 115 | 116 | def _iterator(self): 117 | return iterator(self.dataset_fn, sections=self.sections, 118 | lang=self.lang, field_indices=self.field_indices) 119 | -------------------------------------------------------------------------------- /python-client/sklext/test/test_term_weight_transformer.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from nose.tools import assert_true 4 | from numpy import array 5 | from numpy.ma.testutils import assert_array_approx_equal 6 | from scipy.sparse import issparse 7 | from scipy.sparse.csgraph._min_spanning_tree import csr_matrix 8 | 9 | from sklext.term_weighting import TermWeightTransformer 10 | 11 | 12 | class TestTermWeightTransformer(TestCase): 13 | def test_mi(self): 14 | X = array([[0, 1], 15 | [1, 0], 16 | [1, 1]]) 17 | y = array([[0, 1], 18 | [1, 0], 19 | [1, 0]]) 20 | 21 | transformer = TermWeightTransformer(method='mi') 22 | transformer.fit(X, y) 23 | 24 | assert_array_approx_equal(transformer._weights, [-0.37489, -0.605939], decimal=3) 25 | assert_array_approx_equal(transformer.transform(X), array([[0., -0.605939], 26 | [-0.37489, 0.], 27 | [-0.37489, -0.605939]]), 28 | decimal=3) 29 | 30 | transformer = TermWeightTransformer(method='mi') 31 | X = csr_matrix(X) 32 | y = csr_matrix(y) 33 | transformer.fit(X, y) 34 | newX = transformer.transform(X) 35 | 36 | assert_array_approx_equal(transformer._weights, [-0.37489, -0.605939], decimal=3) 37 | assert_true(issparse(newX)) 38 | assert_array_approx_equal(newX.todense(), array([[0., -0.605939], 39 | [-0.37489, 0.], 40 | [-0.37489, -0.605939]]), 41 | decimal=3) 42 | 43 | def test_pmi(self): 44 | X = array([[0, 1], 45 | [1, 0], 46 | [1, 1]]) 47 | y = array([[0, 1], 48 | [1, 0], 49 | [1, 0]]) 50 | 51 | transformer = TermWeightTransformer(method='pmi') 52 | transformer.fit(X, y) 53 | 54 | assert_array_approx_equal(transformer._weights, [0.1178, 0.1178], decimal=3) 55 | assert_array_approx_equal(transformer.transform(X), array([[0., 0.1178], 56 | [0.1178, 0.], 57 | [0.1178, 0.1178]]), 58 | decimal=3) 59 | 60 | transformer = TermWeightTransformer(method='pmi') 61 | X = csr_matrix(X) 62 | y = csr_matrix(y) 63 | transformer.fit(X, y) 64 | newX = transformer.transform(X) 65 | 66 | assert_array_approx_equal(transformer._weights, [0.1178, 0.1178], decimal=3) 67 | assert_true(issparse(newX)) 68 | assert_array_approx_equal(newX.todense(), array([[0., 0.1178], 69 | [0.1178, 0.], 70 | [0.1178, 0.1178]]), 71 | decimal=3) 72 | 73 | def test_npmi(self): 74 | X = array([[0, 1], 75 | [1, 0], 76 | [1, 1]]) 77 | y = array([[0, 1], 78 | [1, 0], 79 | [1, 0]]) 80 | 81 | transformer = TermWeightTransformer(method='npmi') 82 | transformer.fit(X, y) 83 | 84 | assert_array_approx_equal(transformer._weights, [0.1699, 0.0850], decimal=3) 85 | assert_array_approx_equal(transformer.transform(X), array([[0., 0.0850], 86 | [0.1700, 0.], 87 | [0.1700, 0.0850]]), 88 | decimal=3) 89 | 90 | transformer = TermWeightTransformer(method='npmi') 91 | X = csr_matrix(X) 92 
| y = csr_matrix(y) 93 | transformer.fit(X, y) 94 | newX = transformer.transform(X) 95 | 96 | assert_array_approx_equal(transformer._weights, [0.1700, 0.0850], decimal=3) 97 | assert_true(issparse(newX)) 98 | assert_array_approx_equal(newX.todense(), array([[0., 0.0850], 99 | [0.1700, 0.], 100 | [0.1700, 0.0850]]), 101 | decimal=3) 102 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/lemmatizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import codecs 3 | import os 4 | 5 | from es_text_analytics.data.dataset import project_path 6 | from es_text_analytics.tagger import FEATURES_MAP 7 | 8 | 9 | # Norwegian lemmatizer based on Norsk Ordbank, http://www.edd.uio.no/prosjekt/ordbanken/data/index.html or 10 | # http://www.nb.no/sprakbanken/show?serial=sbr-5&lang=nb 11 | # 12 | # Norsk Ordbank is not freely available but must be obtained from one of the urls above. 13 | 14 | ORDBANK_BM_DEFAULT_PATH = os.path.join(project_path(), 'data', 'ordbank_bm') 15 | FULLFORM_BM_FN = 'fullform_bm.txt' 16 | 17 | FULLFORM_FIELDS = ['word_id', 'lemma', 'fullform', 'morph_descr', 'paradigm_code', 'paradigm_entry'] 18 | 19 | 20 | def parse_fullform_file(f, feat_norm='simple'): 21 | """ 22 | Parses the fullform data file in Norsk Ordbank and returns dicts indexed on the fullform and lemma respectively. 23 | 24 | All fullforms are lowercased. 25 | Morphological information is normalized to POS tags. 26 | 27 | :param f: file instance for reading the fullform Norsk Ordbank data file. 28 | :param feat_norm: Type of POS tag to normalize morphological information. Must correspond to POS tagger tagset 29 | if doing contextual lemmatization. 30 | :type feat_norm: str|unicode 31 | :rtype : (dict, dict) 32 | :return: The fullform and lemma indexes to the file entries. 33 | """ 34 | fullform_index = {} 35 | lemma_index = {} 36 | 37 | for line in f: 38 | line = line.strip() 39 | # published Ordbank files are latin-1 encoded 40 | line = line.decode('latin1') 41 | 42 | if line == '' or line[0] == '*': 43 | continue 44 | 45 | tokens = line.split('\t') 46 | 47 | entry = dict(zip(FULLFORM_FIELDS, tokens)) 48 | 49 | entry['fullform'] = entry['fullform'].lower() 50 | 51 | entry['word_id'] = int(entry['word_id']) 52 | entry['paradigm_entry'] = int(entry['paradigm_entry']) 53 | 54 | # extract pos and features fro mthe morphological field and normalize pos 55 | morph_parts = entry['morph_descr'].split() 56 | entry['ndt_pos'] = morph_parts[0] 57 | entry['ndt_feats'] = '|'.join(morph_parts[1:]) 58 | entry['pos'] = FEATURES_MAP[feat_norm](entry['fullform'], entry['ndt_pos'], entry['ndt_feats']) 59 | 60 | fullform_index[entry['fullform']] = fullform_index.get(entry['fullform'], []) + [entry] 61 | lemma_index[entry['lemma']] = lemma_index.get(entry['lemma'], []) + [entry] 62 | 63 | return fullform_index, lemma_index 64 | 65 | 66 | class OrdbankLemmatizer(object): 67 | """ 68 | Class implementing a simple lemmatizer for Bokmål based on Norsk Ordbank 69 | 70 | Uses "simple" POS tags for contextual disambiguation by default. 71 | """ 72 | def __init__(self, ordbank_path=None, contextual=False, feat_norm='simple'): 73 | """ 74 | :param ordbank_path: Path to Norsk Ordbank Bokmål datafiles. Uses the default location of absent. 75 | :param feat_norm: POS tag type to use for contextual disambiguation. Only "simple" currently supported. 
76 | :type feat_norm: str|unicode 77 | """ 78 | super(OrdbankLemmatizer, self).__init__() 79 | 80 | if not ordbank_path: 81 | ordbank_path = ORDBANK_BM_DEFAULT_PATH 82 | 83 | with codecs.open(os.path.join(ordbank_path, FULLFORM_BM_FN)) as f: 84 | self.fullform_index, self.lemma_index = parse_fullform_file(f, feat_norm=feat_norm) 85 | 86 | def lemmatize(self, word, pos=None): 87 | """ 88 | Lemmatize the word using the POS tag context if passed. 89 | 90 | :param word: Word to lemmatize. 91 | :type word: str|unicode 92 | :param pos: Optional POS tag for disambiguation. 93 | :type pos: str|unicode 94 | :rtype : str|unicode 95 | :return: Lemma for passed word. 96 | """ 97 | # all matching is done on lowercase 98 | word = word.lower() 99 | 100 | if pos: 101 | # lookup candidates and eliminate those with mismatching POS tag 102 | candidates = [cand for cand in self.fullform_index.get(word, []) if cand['pos'] == pos] 103 | else: 104 | candidates = self.fullform_index.get(word) 105 | 106 | if candidates: 107 | # if there are several candidates we choose the last one 108 | # if the candidates are POS tag disambiguated our experience shows that further disambigous 109 | # entries has the "more reasonable" lemmas listed last 110 | return candidates[-1]['lemma'] 111 | else: 112 | # default strategy for failing matches is to do nothing 113 | return word 114 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 8 | # All Vagrant configuration is done here. The most common configuration 9 | # options are documented and commented below. For a complete reference, 10 | # please see the online documentation at vagrantup.com. 11 | 12 | # Every Vagrant virtual environment requires a box to build off of. 13 | config.vm.box = "ubuntu/trusty64" 14 | 15 | # Disable automatic box update checking. If you disable this, then 16 | # boxes will only be checked for updates when the user runs 17 | # `vagrant box outdated`. This is not recommended. 18 | # config.vm.box_check_update = false 19 | 20 | # Create a forwarded port mapping which allows access to a specific port 21 | # within the machine from a port on the host machine. In the example below, 22 | # accessing "localhost:8080" will access port 80 on the guest machine. 23 | # config.vm.network "forwarded_port", guest: 80, host: 8080 24 | config.vm.network "forwarded_port", guest: 9200, host: 9201 25 | 26 | # Create a private network, which allows host-only access to the machine 27 | # using a specific IP. 28 | # config.vm.network "private_network", ip: "192.168.33.10" 29 | 30 | # Create a public network, which generally matched to bridged network. 31 | # Bridged networks make the machine appear as another physical device on 32 | # your network. 33 | # config.vm.network "public_network" 34 | 35 | # If true, then any SSH connections made will enable agent forwarding. 36 | # Default value: false 37 | # config.ssh.forward_agent = true 38 | 39 | # Share an additional folder to the guest VM. The first argument is 40 | # the path on the host to the actual folder. The second argument is 41 | # the path on the guest to mount the folder. And the optional third 42 | # argument is a set of non-required options. 
43 | # config.vm.synced_folder "../data", "/vagrant_data" 44 | 45 | # Provider-specific configuration so you can fine-tune various 46 | # backing providers for Vagrant. These expose provider-specific options. 47 | # Example for VirtualBox: 48 | # 49 | # config.vm.provider "virtualbox" do |vb| 50 | # # Don't boot with headless mode 51 | # vb.gui = true 52 | # 53 | # # Use VBoxManage to customize the VM. For example to change memory: 54 | # vb.customize ["modifyvm", :id, "--memory", "1024"] 55 | # end 56 | # 57 | # View the documentation for the provider you're using for more 58 | # information on available options. 59 | 60 | # Enable provisioning with CFEngine. CFEngine Community packages are 61 | # automatically installed. For example, configure the host as a 62 | # policy server and optionally a policy file to run: 63 | # 64 | # config.vm.provision "cfengine" do |cf| 65 | # cf.am_policy_hub = true 66 | # # cf.run_file = "motd.cf" 67 | # end 68 | # 69 | # You can also configure and bootstrap a client to an existing 70 | # policy server: 71 | # 72 | # config.vm.provision "cfengine" do |cf| 73 | # cf.policy_server_address = "10.0.2.15" 74 | # end 75 | 76 | # Enable provisioning with Puppet stand alone. Puppet manifests 77 | # are contained in a directory path relative to this Vagrantfile. 78 | # You will need to create the manifests directory and a manifest in 79 | # the file default.pp in the manifests_path directory. 80 | # 81 | # config.vm.provision "puppet" do |puppet| 82 | # puppet.manifests_path = "manifests" 83 | # puppet.manifest_file = "default.pp" 84 | # end 85 | 86 | # Enable provisioning with chef solo, specifying a cookbooks path, roles 87 | # path, and data_bags path (all relative to this Vagrantfile), and adding 88 | # some recipes and/or roles. 89 | # 90 | # config.vm.provision "chef_solo" do |chef| 91 | # chef.cookbooks_path = "../my-recipes/cookbooks" 92 | # chef.roles_path = "../my-recipes/roles" 93 | # chef.data_bags_path = "../my-recipes/data_bags" 94 | # chef.add_recipe "mysql" 95 | # chef.add_role "web" 96 | # 97 | # # You may also specify custom JSON attributes: 98 | # chef.json = { mysql_password: "foo" } 99 | # end 100 | 101 | # Enable provisioning with chef server, specifying the chef server URL, 102 | # and the path to the validation key (relative to this Vagrantfile). 103 | # 104 | # The Opscode Platform uses HTTPS. Substitute your organization for 105 | # ORGNAME in the URL and validation key. 106 | # 107 | # If you have your own Chef Server, use the appropriate URL, which may be 108 | # HTTP instead of HTTPS depending on your configuration. Also change the 109 | # validation key to validation.pem. 110 | # 111 | # config.vm.provision "chef_client" do |chef| 112 | # chef.chef_server_url = "https://api.opscode.com/organizations/ORGNAME" 113 | # chef.validation_key_path = "ORGNAME-validator.pem" 114 | # end 115 | # 116 | # If you're using the Opscode platform, your validator client is 117 | # ORGNAME-validator, replacing ORGNAME with your organization name. 118 | # 119 | # If you have your own Chef Server, the default validation client name is 120 | # chef-validator, unless you changed the configuration. 
121 | # 122 | # chef.validation_client_name = "ORGNAME-validator" 123 | end 124 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/wordnet_centrality.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | import networkx as nx 4 | 5 | """ 6 | Experimental concept mining technique. 7 | 8 | Takes a weighted list of terms and returns the central WordNet Synsets for those terms. 9 | 10 | terms = [(u'gruppe', 1.0308515122783903), (u'skarabider', 1.0283549292633594), (u'utbredelse', 1.0255859517307202), 11 | (u'slekt', 1.02428182204782), (u'h\xf8re', 1.0236714839113259), (u'oldenborre', 1.0212900521382506), 12 | (u'art', 1.0206984849354699), (u'leve', 1.0181363254554074), (u'scarabaeoidea', 1.0178225609839886), 13 | (u'melolonthinae', 1.0137513034441485), (u'stor', 1.010924267580678), 14 | (u'underfamilie', 1.010754657594739), (u'sm\xe5', 1.0095704409677608), (u'underart', 1.0092106422118465), 15 | (u'millimeter', 1.009166357579949), (u'dekkvinge', 1.0077143382226799), (u'afrika', 1.0073806489590316), 16 | (u'pronotum', 1.0065698471665749), (u'gullbasse', 1.0065362930098589), (u'amerika', 1.0062859498858436), 17 | (u'parasittveps', 1.0060005533568113), (u'parasitt', 1.0060), (u'veps', 1.0060), (u'australia', 1.0058669303831191), 18 | (u'finnes', 1.0057317293824628), 19 | (u'gammaridea', 1.0056893049779934), (u'lang', 1.00559787835873), (u'familie', 1.0054340556687946), 20 | (u'parasitoide', 1.0053805526619595), (u'gjerne', 1.00537793821737), (u'taksonomisk', 1.005285633510592), 21 | (u'jorda', 1.005110740380347), (u's\xf8r', 1.0049040009707946), (u'asia', 1.0047862880057254), 22 | (u'panamerikansk', 1.0046940558397519), (u'svart', 1.0046021508509742), (u'inndeling', 1.0045280362592695), 23 | (u'omfatte', 1.0045210803669595), (u'cm', 1.004380770050699), (u'cetoniinae', 1.0042811151167377), 24 | (u'kjent', 1.0042292429979698), (u'praktskarabide', 1.0042044116189903), (u'\xe9n', 1.0041848737627979), 25 | (u'rutelinae', 1.0041419789388988), (u'ganske', 1.0039716523273752), (u'lys', 1.0036022222516647)] 26 | 27 | c = ConceptClassifier() 28 | c.concepts(terms) 29 | 30 | [(u'*ROOT*', 0.6080213703117321), 31 | ... 
32 | (u'social_group', 0.06255333412324249), 33 | (u'collection', 0.06117590141836287), 34 | (u'cognition', 0.050449579778215264), 35 | (u'position', 0.050446927402192525), 36 | (u'h\xf8re', 0.04431870710158872), 37 | (u'kin', 0.02783831781626138), 38 | (u'gruppe', 0.02340480985900115), 39 | (u'content', 0.022421702867730226), 40 | (u'direction', 0.022415540534822163), 41 | (u'genealogy', 0.012455836088714487), 42 | (u'idea', 0.009964750496621828), 43 | (u'compass_point', 0.009950798727198123), 44 | (u'lineage', 0.005723405829441778), 45 | (u'concept', 0.0044278959856953815), 46 | (u'cardinal_compass_point', 0.004396465865842209), 47 | (u'family', 0.002965638904359013), 48 | (u'category', 0.0019660280031428582), 49 | (u'south', 0.0018952926436456173), 50 | (u'slekt', 0.0011273697128777484), 51 | (u'familie', 0.0011066250305903473), 52 | (u'kind', 0.0008694833352302178), 53 | (u's\xf8r', 0.0007068536091788623), 54 | (u'type', 0.0003767580775104511), 55 | (u'art', 0.00014272125955615077)] 56 | 57 | plt.figure(3,figsize=(12,12)) 58 | nx.draw(c.g, with_labels=True, font_size=8) 59 | 60 | """ 61 | 62 | 63 | def _create_subgraph(paths, root): 64 | g = nx.Graph() 65 | 66 | for ss_path in paths: 67 | for ss1, ss2 in zip(ss_path, ss_path[1:]): 68 | ss1_name = ss1[0] 69 | weight = ss1[1] 70 | ss2_name = ss2[0] 71 | 72 | g.add_node(ss1_name) 73 | g.add_node(ss2_name) 74 | g.add_edge(ss1_name, ss2_name, {'w': weight}) 75 | 76 | if ss2_name == root: 77 | break 78 | 79 | return g 80 | 81 | 82 | def _path_root(paths): 83 | path_root = None 84 | 85 | for ss_level in zip(*[reversed(p) for p in paths]): 86 | names = [x[0] for x in ss_level] 87 | 88 | if len(set(names)) == 1: 89 | path_root = names[0] 90 | 91 | return path_root 92 | 93 | 94 | class ConceptFinder(object): 95 | def __init__(self, lang='nob'): 96 | super(ConceptFinder, self).__init__() 97 | 98 | from nltk.corpus import wordnet as wordnet 99 | 100 | self.lang = lang 101 | self.wordnet = wordnet 102 | self.graph = None 103 | 104 | def concepts(self, terms): 105 | paths = self._synset_paths(terms) 106 | root = _path_root(paths).split('.')[0] 107 | self.graph = _create_subgraph(paths, root) 108 | 109 | return sorted(nx.eigenvector_centrality_numpy(self.graph, weight='w').items(), 110 | key=lambda x: x[1], reverse=True) 111 | 112 | def _top_synset(self, term): 113 | ss = self.wordnet.synsets(term) 114 | 115 | if len(ss) >= 1: 116 | return ss[0] 117 | 118 | return None 119 | 120 | def _synset_paths(self, terms): 121 | paths = [] 122 | 123 | for term, score, ss in [(term, score, self.wordnet.synsets(term, lang=self.lang)) for term, score in terms]: 124 | if len(ss) >= 1: 125 | paths.append([(term, score)] + [(x[0].name().split('.')[0], 1.0) 126 | for x in sorted(ss[0]._shortest_hypernym_paths(True).items(), 127 | key=itemgetter(1))]) 128 | 129 | return paths 130 | -------------------------------------------------------------------------------- /fabfile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from fabric.contrib.console import confirm 4 | from fabric.network import disconnect_all 5 | from fabric.operations import sudo, run, put, local 6 | from fabric.state import env 7 | 8 | ANACONDA_MD5 = 'c3100392685b5a62c8509c0588ce9376' 9 | ANACONDA_URL = 'https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh' 10 | ANACONDA_FN = 'Anaconda-2.3.0-Linux-x86_64.sh' 11 | ANACONDA_INSTALL_PATH = '/opt/anaconda' 12 | 13 | NEO4J_URL = 
'http://neo4j.com/artifact.php?name=neo4j-community-2.2.3-unix.tar.gz' 14 | NEO4J_FN = 'neo4j-community-2.2.3-unix.tar.gz' 15 | NEO4J_FOLDER = 'neo4j-community-2.2.3' 16 | 17 | ESLIB_INSTALL_PATH = '/opt/eslib' 18 | 19 | 20 | def provision_server(): 21 | sudo('apt-get update -qq -y > /dev/null') 22 | install_debian_packages(['screen', 'unzip']) 23 | install_anaconda() 24 | install_elasticsearch() 25 | install_neo4j() 26 | install_self() 27 | restart_server() 28 | 29 | 30 | def restart_server(): 31 | sudo('shutdown -r now') 32 | 33 | 34 | def anaconda_downloaded(): 35 | r = run('test -f %s' % ANACONDA_FN, quiet=True) 36 | 37 | if getattr(r, 'return_code') != 0: 38 | return False 39 | 40 | r = run('md5sum %s' % ANACONDA_FN) 41 | md5, _ = getattr(r, 'stdout').split() 42 | 43 | if md5 != ANACONDA_MD5: 44 | if confirm("Anaconda archive corrupt. Delete?"): 45 | run('rm %s' % ANACONDA_FN) 46 | else: 47 | disconnect_all() 48 | sys.exit(1) 49 | 50 | return True 51 | 52 | 53 | def anaconda_installed(): 54 | r = run('test -d %s' % ANACONDA_INSTALL_PATH, quiet=True) 55 | 56 | if getattr(r, 'return_code') != 0: 57 | return False 58 | 59 | return True 60 | 61 | 62 | def install_anaconda(): 63 | if not anaconda_downloaded(): 64 | run('wget --quiet %s' % ANACONDA_URL) 65 | if not anaconda_installed(): 66 | sudo('bash %s -b -p %s' % (ANACONDA_FN, ANACONDA_INSTALL_PATH)) 67 | sudo('echo "export PATH=/opt/anaconda/bin:$PATH" > /etc/profile.d/anaconda.sh') 68 | 69 | 70 | def package_installed(pkg): 71 | r = run("dpkg-query -W -f='${Status}' %s 2>/dev/null | grep -c \"ok installed\"" % pkg, quiet=True) 72 | 73 | if getattr(r, 'return_code') != 0: 74 | return False 75 | 76 | return True 77 | 78 | 79 | def install_java(): 80 | if not package_installed('default-jre'): 81 | sudo('apt-get install -y -qq default-jre') 82 | 83 | 84 | def install_elasticsearch(): 85 | install_java() 86 | 87 | if not package_installed('elasticsearch'): 88 | run('wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -') 89 | run('echo "deb http://packages.elastic.co/elasticsearch/1.5/debian stable main" | sudo tee -a /etc/apt/sources.list') 90 | sudo('apt-get update -qq -y > /dev/null') 91 | sudo('apt-get install -qq -y elasticsearch') 92 | put('provision/elasticsearch.yml', '/etc/elasticsearch/elasticsearch.yml', use_sudo=True) 93 | sudo('/usr/share/elasticsearch/bin/plugin -i elasticsearch/marvel/latest') 94 | sudo('update-rc.d elasticsearch defaults 95 10') 95 | sudo('service elasticsearch start') 96 | 97 | 98 | def install_neo4j(): 99 | if not file_exists(NEO4J_FN): 100 | run('wget -q %s -O %s' % (NEO4J_URL, NEO4J_FN)) 101 | 102 | sudo('adduser --home /home/neo4j --system --shell /bin/bash neo4j') 103 | sudo('(cd /home/neo4j; tar zxf ~/%s)' % NEO4J_FN, user='neo4j') 104 | sudo('yes neo4j|(HEADLESS=true; /home/neo4j/%s/bin/neo4j-installer install)' % NEO4J_FOLDER) 105 | sudo('echo "neo4j soft nofile 40000" > /etc/security/limits.conf') 106 | sudo('echo "neo4j hard nofile 40000" >> /etc/security/limits.conf') 107 | sudo('echo "session required pam_limits.so" > /etc/pam.d/common-session') 108 | sudo('echo "session required pam_limits.so" > /etc/pam.d/common-session-noninteractive') 109 | put('provision/neo4j-server.properties', '.') 110 | sudo('mv neo4j-server.properties /home/neo4j/%s/conf/neo4j-server.properties' % NEO4J_FOLDER) 111 | sudo('chown neo4j /home/neo4j/%s/conf/neo4j-server.properties' % NEO4J_FOLDER) 112 | sudo('chmod 644 /home/neo4j/%s/conf/neo4j-server.properties' % NEO4J_FOLDER) 113 | 
sudo('service neo4j-service start') 114 | 115 | 116 | def install_neo4j_user(): 117 | put('provision/auth', '.') 118 | sudo('mv auth /home/neo4j/neo4j-community-2.2.2/data/dbms/auth') 119 | sudo('chown neo4j /home/neo4j/neo4j-community-2.2.2/data/dbms/auth') 120 | sudo('chmod 600 /home/neo4j/neo4j-community-2.2.2/data/dbms/auth') 121 | sudo('service neo4j-service restart') 122 | 123 | 124 | def file_exists(fn): 125 | r = run('test -f %s' %fn, quiet=True) 126 | 127 | if getattr(r, 'return_code') != 0: 128 | return False 129 | 130 | return True 131 | 132 | 133 | def install_self(): 134 | local('git archive master -o master.zip --format zip --prefix comperio-text-analytics/') 135 | put('master.zip', '.') 136 | sudo('unzip master') 137 | 138 | 139 | def vagrant(): 140 | env.user = 'vagrant' 141 | env.hosts = ['127.0.0.1:2222'] 142 | #env.key_filename = '~/.vagrant.d/insecure_private_key' 143 | env.key_filename = '.vagrant/machines/default/virtualbox/private_key' 144 | env.disable_known_hosts = True 145 | 146 | 147 | def install_debian_packages(packages=None): 148 | if packages and isinstance(packages, basestring): 149 | packages = [p.strip() for p in packages.split(';')] 150 | 151 | if packages: 152 | sudo('apt-get install -qq -y %s' % ' '.join(packages)) 153 | 154 | -------------------------------------------------------------------------------- /python-client/bin/corpus2lemmatizedtext.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import re 4 | import sys 5 | import re 6 | import codecs 7 | from gensim.corpora import Dictionary 8 | from gensim.models.tfidfmodel import TfidfModel 9 | from gensim import corpora 10 | 11 | from es_text_analytics.data.wikipedia import WikipediaDataset 12 | from es_text_analytics.data.elasticsearch_dataset import ElasticsearchDataset 13 | from nltk.corpus import stopwords 14 | 15 | from es_text_analytics.tagger import NOBTagger, install_hunpos 16 | from es_text_analytics.lemmatizer import OrdbankLemmatizer 17 | 18 | def fast_tokenize(str): 19 | return [x.lower() for x in re.findall('[^\W\d_]+', str, re.MULTILINE | re.UNICODE)] 20 | 21 | def normalize_es(doc): 22 | return doc['_source']['article'] 23 | 24 | 25 | def normalize_wiki(doc): 26 | return doc['id'], doc['article.text'] 27 | 28 | 29 | def get_tokenized(page, sw): 30 | return [token for token in fast_tokenize(page) if token not in sw and len(token) > 1] 31 | 32 | 33 | class IterableDataset(object): 34 | def __init__(self, args_dataset, stopwords, nobtag, lemmatizer): 35 | self.dataset = args_dataset 36 | self.tagger = nobtag 37 | self.lem = lemmatizer 38 | self.stopwords = stopwords 39 | 40 | def __len__(self): 41 | return sum(1 for _ in self.dataset) 42 | 43 | def __iter__(self): 44 | for page in self.dataset: 45 | tokens = get_tokenized(page[1], self.stopwords) 46 | sent = self.tagger.tag(tokens, tokenize=False) 47 | yield page[0], " ".join([self.lem.lemmatize(word, tag) for word, tag in sent]).lower() 48 | 49 | 50 | # wikidata download https://dumps.wikimedia.org/nowiki/latest/nowiki-latest-pages-articles.xml.bz2 51 | def main(): 52 | parser = ArgumentParser( 53 | description='wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information') 54 | parser.add_argument('-ds', '--dataset', default='wiki', help='What kind of dataset to use. 
(wiki or es)') 55 | parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it') 56 | parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki') 57 | parser.add_argument('--model-id', default='model', help='Filename for created model.') 58 | parser.add_argument('-q', '--query', default=None, help='Elasticsearch: Query to use to fetch documents') 59 | parser.add_argument('--index', help='Elasticsearch: index to read from.') 60 | parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.') 61 | parser.add_argument('--data-dir', default='.', help='Directory to save the generated models and vocabularies into.') 62 | 63 | opts = parser.parse_args() 64 | 65 | dump_fn = opts.dump_file 66 | limit = int(opts.limit) if opts.limit else None 67 | 68 | data_type = opts.dataset.lower() 69 | if data_type not in ['es', 'wiki']: 70 | logging.error("Invalid dataset type %s" % data_type) 71 | parser.print_usage() 72 | exit(-1) 73 | limit = None 74 | if opts.limit: 75 | limit = int(opts.limit) 76 | if not dump_fn and data_type in ['wiki']: 77 | logging.error('--dump-file required for wiki dataset') 78 | sys.exit(1) 79 | 80 | query = opts.query 81 | index = opts.index 82 | doc_type = opts.doc_type 83 | if data_type == 'es' and index is None: 84 | logging.error( 85 | "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter") 86 | sys.exit(1) 87 | 88 | data_dir = opts.data_dir 89 | model_id = opts.model_id 90 | model_fn = '%s' % (model_id) 91 | if data_dir: 92 | model_fn = '%s%s' % (data_dir, model_fn) 93 | logging.info("Writing models to %s." % model_fn) 94 | 95 | if data_type == 'es': 96 | logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query)) 97 | dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type, query=query, 98 | normalize_func=normalize_es) 99 | else: 100 | logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) 101 | dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki) 102 | sw = set(stopwords.words('norwegian')) 103 | #install_hunpos() 104 | nobtag = NOBTagger() 105 | ord = OrdbankLemmatizer() 106 | 107 | corpus = IterableDataset(dataset, sw, nobtag, ord) 108 | with codecs.open(model_fn, mode='w', encoding='utf-8') as fn: 109 | for document in corpus: 110 | logging.info(document[0]) 111 | fn.write(str(document[0]) + '\t' + document[1] + '\n') 112 | 113 | 114 | 115 | if __name__ == '__main__': 116 | logformat = '%(asctime)s %(name)-12s: %(message)s' 117 | logging.basicConfig(level=logging.INFO, format=logformat, filename='wiki-topicmodel.log' ) 118 | console = logging.StreamHandler() 119 | formatter = logging.Formatter(logformat) 120 | console.setFormatter(formatter) 121 | logging.getLogger('').addHandler(console) 122 | main() 123 | # ########## sample usage 124 | # 125 | #--model-type=lda -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 126 | #--model-type=lda -ds es --n-topics 10 --index wiki --query "{\"query\":{\"match\": {\"_all\":\"kongo\"}}}" 127 | #--model-type=word2vec -ds es --index wiki --w2v_window=7 --w2v_size=75 128 | #--model-type=hdp -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 129 | 130 | -------------------------------------------------------------------------------- 
/python-client/es_text_analytics/data/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from abc import ABCMeta 4 | from abc import abstractmethod 5 | from urlparse import urlparse 6 | 7 | import requests 8 | from elasticsearch.client import IndicesClient 9 | 10 | BULK_REQUEST_SIZE = 100 11 | 12 | CONLL_U_FIELDS = ['index', 'form', 'lemma', 'cpostag', 'postag', 'feats', 13 | 'head', 'deprel', 'deps', 'misc'] 14 | 15 | 16 | def fn_from_url(url): 17 | """ 18 | Extract the final part of an url in order to get the filename of a downloaded url. 19 | 20 | :param url: url string 21 | :type url : str|unicode 22 | :rtype : str|unicode 23 | :return: url filename part 24 | """ 25 | parse = urlparse(url) 26 | 27 | return os.path.basename(parse.path) 28 | 29 | 30 | def download_file(url, dest_path): 31 | """ 32 | Download the file pointed to by the url to the path specified or the defult dataset location. 33 | If the dfile is already present at the path it will not be downloaded and the path to this file 34 | is returned. 35 | 36 | :param url: url string pointing to the file 37 | :type url : str|unicode 38 | :param dest_path: path to location where the file will be stored locally 39 | :type dest_path : str|unicode 40 | :rtype : str|unicode 41 | :return: path to the downloaded dataset 42 | """ 43 | if not os.path.exists(dest_path): 44 | os.makedirs(dest_path) 45 | 46 | fn = fn_from_url(url) 47 | full_fn = os.path.join(dest_path, fn) 48 | 49 | if os.path.exists(full_fn): 50 | logging.info('Dataset archive %s already exists in %s ...' % (fn, dest_path)) 51 | else: 52 | r = requests.get(url, stream=True) 53 | with open(full_fn, 'wb') as f: 54 | for chunk in r.iter_content(chunk_size=1024): 55 | if chunk: # filter out keep-alive new chunks 56 | f.write(chunk) 57 | f.flush() 58 | 59 | return full_fn 60 | 61 | 62 | def project_path(): 63 | """ 64 | Returns the path to the root project directory. 65 | 66 | :rtype : str|unicode 67 | :return: The root project path as a string. 68 | """ 69 | self_path = os.path.dirname(os.path.abspath(__file__)) 70 | 71 | return os.path.abspath(os.path.join(self_path, '..', '..', '..')) 72 | 73 | 74 | def default_dataset_path(): 75 | """ 76 | Returns the data default dataset location in the project directory. 77 | 78 | :rtype : str|unicode 79 | :return: the path to the default dataset location 80 | """ 81 | return os.path.join(project_path(), 'data') 82 | 83 | 84 | def parse_conll(fileobj, field_indices=None): 85 | """ 86 | Parse a CONLL formatted dependency treebank file. Supports the CONLL-U format 87 | with UTF-8 encoding. 88 | 89 | :param fileobj: A file like instance with CONLL formatted text. 90 | :rtype : generator 91 | """ 92 | sentence = [] 93 | 94 | for line in fileobj: 95 | line = line.decode('utf-8') 96 | line = line.strip() 97 | 98 | if line == '': 99 | if sentence: 100 | yield sentence 101 | 102 | sentence = [] 103 | 104 | continue 105 | 106 | row = line.split(u'\t') 107 | row[0] = int(row[0]) 108 | 109 | if field_indices: 110 | row = [row[i] for i in field_indices] 111 | 112 | sentence.append(row) 113 | 114 | if sentence: 115 | yield sentence 116 | 117 | 118 | class Dataset: 119 | """ 120 | Base class for self-installable and self-indexable datasets. 121 | 122 | Contains base methods for downloading the dataset and creating Elasticsearch index based on it. 
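    A rough usage sketch (illustrative only; NDTDataset below is one concrete subclass, and passing
    an Elasticsearch client is optional):

        from elasticsearch import Elasticsearch
        from es_text_analytics.data.ndt_dataset import NDTDataset

        dataset = NDTDataset()
        dataset.install(Elasticsearch())   # download the archive, then rebuild and fill the index
        for doc in dataset:                # or iterate over the normalized documents directly
            print doc['content']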
123 | """ 124 | __metaclass__ = ABCMeta 125 | 126 | def __init__(self, index=None, doc_type=None, dataset_path=None, dataset_fn=None, 127 | normalize_func=None): 128 | """ 129 | Initialize the instance with optional Elasticsearch index information. 130 | 131 | :param index: Elasticsearch index where the dataset will be stored if indexed. 132 | :type index: str|unicode 133 | :param doc_type: 134 | :type doc_type: str|unicode 135 | :param dataset_path: location where dataset wiil be downloaded. If None the default location is used. 136 | :type dataset_path: None|str|unicode 137 | :param dataset_fn: Location of the dataset. If this argument is used the file specified will be used and 138 | the archive will not be downloaded automatically if not present. 139 | :type dataset_fn: None|str|unicode 140 | :param normalize_func: Function to normalize corpus documemt format. Default will create a dict with a field 141 | that contains the full document text. Exact format is corpus dependent. 142 | :type normalize_func: function|None 143 | """ 144 | self.es_index = index 145 | self.es_doc_type = doc_type 146 | self.dataset_fn = dataset_fn 147 | self.archive_fn = None 148 | self.normalize_func = normalize_func 149 | 150 | self.dataset_path = dataset_path 151 | 152 | if not dataset_path: 153 | self.dataset_path = default_dataset_path() 154 | 155 | @abstractmethod 156 | def _iterator(self): 157 | """ 158 | Subclasses should implement this method returning a generator yielding 159 | dicts with the document data. 160 | """ 161 | raise NotImplementedError 162 | 163 | def __iter__(self): 164 | if self.archive_fn: 165 | if not self.dataset_fn: 166 | raise ValueError() 167 | 168 | for doc in self._iterator(): 169 | try: 170 | if self.normalize_func: 171 | doc = self.normalize_func(doc) 172 | except Exception: 173 | logging.error('Unable to normalize doc ...') 174 | 175 | yield doc 176 | 177 | def index(self, es): 178 | """ 179 | Index the dataset in the given index with archive in the dataset location. 180 | 181 | :param es: Elasticsearch client instance 182 | :type es: elasticsearch.client.Elasticsearch 183 | :rtype : elasticsearch.client.Elasticsearch 184 | :return: :raise ValueError: 185 | """ 186 | docs = [] 187 | count = 0 188 | 189 | for doc in self: 190 | if '_id' in doc: 191 | docs += [{'index': dict(_index=self.es_index, _type=self.es_doc_type, _id=doc['_id'])}, doc] 192 | else: 193 | docs += [{'index': {'_index': self.es_index, '_type': self.es_doc_type }}, doc] 194 | count += 1 195 | 196 | if len(docs) % (2 * BULK_REQUEST_SIZE) == 0: 197 | es.bulk(index=self.es_index, doc_type=self.es_doc_type, body=docs) 198 | logging.info('Added %d documents ...' % count) 199 | docs = [] 200 | 201 | if docs: 202 | es.bulk(index=self.es_index, doc_type=self.es_doc_type, body=docs) 203 | logging.info('Added %d documents ...' % count) 204 | 205 | return self 206 | 207 | def delete_index(self, es): 208 | """ 209 | Delete the dataset index. 210 | 211 | :param es: Elasticsearch client instance 212 | :type es: elasticsearch.client.Elasticsearch 213 | :rtype : NewsgroupsDataset 214 | """ 215 | ic = IndicesClient(es) 216 | ic.delete(index=self.es_index, ignore=[400, 404]) 217 | 218 | return self 219 | 220 | def install(self, es=None): 221 | """ 222 | Install and optionally index the dataset. 223 | WARNING: Deletes the index before installing. 224 | 225 | :param es: Pass an Elasticsearch client instance to index the dataset. 
226 | :type es: None|elasticsearch.client.Elasticsearch 227 | :rtype : Dataset 228 | """ 229 | if not self.archive_fn: 230 | logging.info("No installable archive for this dataset ...") 231 | else: 232 | if self.dataset_fn: 233 | logging.warn('Dataset initialized directly or already installed ...') 234 | return self 235 | else: 236 | self.dataset_fn = download_file(self.archive_fn, dest_path=self.dataset_path) 237 | 238 | if es: 239 | logging.info("Creating Elasticsearch index %s ..." % self.index) 240 | self.delete_index(es) 241 | self.index(es) 242 | 243 | return self 244 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/decompounder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from numpy import argmin 3 | 4 | from es_text_analytics.lemmatizer import OrdbankLemmatizer, ORDBANK_BM_DEFAULT_PATH 5 | 6 | 7 | """ 8 | Simple decompounder that matches parts of words to fullform entries in Norsk Ordbank. 9 | 10 | TODO: Better match. Maybe match lemmas with fixed spacing characters. This should yield less overgeneration. 11 | TODO: Only match allowed POS tag sequences in a compound word. For example adjectives must be prepositioned 12 | and so on. 13 | TODO: Annotate compound entries in Norsk Ordbank. This would yield better size disambiguation heuristics and 14 | avoid keeping compound forms that are listed on Norsk Ordbank. 15 | TODO: Return POS tags for internal word components. 16 | """ 17 | 18 | # allowed POS tags that can form compounds 19 | COMPOUND_POS_MAP = { 20 | 'simple': ['SUBST', 'ADJ', 'ADV', 'VERB'] 21 | } 22 | 23 | # The decompounder searches for fullform matches from the beginning of the string creates a tree 24 | # of match combinations for each initial match. This is implemented in decompound() and decompund_inner(). 25 | # 26 | # The resulting trees are flattened to lists of word forms that the compoun can be split into. This is 27 | # implemented in flatten() and flatten_inner(). 28 | # 29 | # Fullform matches can be filtered on length (very short words are probably not "proper" words) and POS tag 30 | # (compounds are for example not productively formed from closed word classes in Norwegian). This is implemented 31 | # in fullform_index_match(). 32 | # 33 | # NOTE: Not optimized. Can probably be made a lot more efficient. 34 | 35 | def fullform_index_match(string, fullform_index, pos_match_field=None, pos_format='simple'): 36 | """ 37 | Partial string matching to fullform index. See main comment. 38 | 39 | :param string: Partial word that is being decompounded. 40 | :type string: str|unicode 41 | :param fullform_index: Fullform index to Norsk Ordbank entries. 42 | :type fullform_index: dict[str|unicode, list[dict]] 43 | :param pos_match_field: Field in fullform index entry to match POS tag to. 44 | :type pos_match_field: None|str|unicode 45 | :param pos_format: POS tag type, must correspond to POS tag field in fullform index. 46 | :type pos_format: str|unicode 47 | :rtype : bool 48 | :return: True if matching entry within constraints is found in index. 49 | """ 50 | if pos_match_field: 51 | return [match for match in fullform_index.get(string, []) 52 | if match[pos_match_field] in COMPOUND_POS_MAP[pos_format]] 53 | else: 54 | return string in fullform_index 55 | 56 | 57 | def decompound_inner(word, fullform_index, start=0, min_match=2, pos_match_field=None, pos_format='simple'): 58 | """ 59 | Decompound tree builder. See main comment. 
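    For example, with a small fullform index containing only "lampe" and "kost", decompounding
    u'lampekost' is expected to yield the single tree [[u'lampe', [u'kost']]]; flatten() later turns
    such trees into plain candidate lists such as [[u'lampe', u'kost']].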
60 | 61 | :param word: Word that is being decompounded. 62 | :type word: str|unicode 63 | :param fullform_index: Fullform index to Norsk Ordbank entries. 64 | :type fullform_index: dict[str|unicode, list[dict]] 65 | :param start: Decompound from this position in the word. 66 | :type start: int|long 67 | :param min_match: Minimum string length to match. 68 | :type min_match: int|long 69 | :param pos_match_field: Field in fullform index entry to match POS tag to. 70 | :type pos_match_field: None|str|unicode 71 | :param pos_format: POS tag type, must correspond to POS tag field in fullform index. 72 | :type pos_format: str|unicode 73 | :rtype : list[str|unicode|list] 74 | :return: List based tree structure of partial matches. 75 | """ 76 | compounds = [] 77 | for i in range(start+1, len(word) + 1): 78 | if fullform_index_match(word[start:i], fullform_index, pos_format=pos_format, pos_match_field=pos_match_field) \ 79 | and i - start > min_match: 80 | # recursively collect sequential matches 81 | compounds.append([word[start:i]] + 82 | decompound_inner(word, fullform_index, start=i, min_match=min_match, 83 | pos_match_field=pos_match_field, pos_format=pos_format)) 84 | 85 | return compounds 86 | 87 | 88 | def flatten_inner(compound_tree): 89 | """ 90 | Flatten single tree structure with fullform mathes for a compound word. See main comment. 91 | 92 | :param compound_tree: List based tree structure of partial matches. 93 | :type compound_tree: list[str|unicode|list] 94 | :rtype : list[list[str|unicode]] 95 | :return: List of partial matches for each branch of the passed tree. 96 | """ 97 | results = [] 98 | 99 | # recursive base case, leaf of tree 100 | if len(compound_tree) == 1: 101 | return [[compound_tree[0]]] 102 | 103 | head = compound_tree[0] 104 | 105 | # recursively traverse each branch 106 | for rest in compound_tree[1:]: 107 | results += [[head] + tail for tail in flatten_inner(rest)] 108 | 109 | return results 110 | 111 | 112 | def flatten(compound_forest): 113 | """ 114 | Flatten a list of compund match trees. See main comment. 115 | 116 | :param compound_forest: List of list based tree structures with partial fullform matches. 117 | :type compound_forest: list[list[str|unicode|list]] 118 | :rtype : list[list[str|unicode]] 119 | :return: List of partial matches for each branch of eaxh tree of the passed list of trees. 120 | """ 121 | result = [] 122 | 123 | for tree in compound_forest: 124 | result += flatten_inner(tree) 125 | 126 | return result 127 | 128 | 129 | def decompound(word, fullform_index, min_match=2, pos_match_field=None, pos_format='simple'): 130 | """ 131 | Main decompounder entry point. See main comment. 132 | 133 | Filters out compound word decompositions that does not exactly match the passed word 134 | 135 | :param word: Word that is being decompounded. 136 | :type word: str|unicode 137 | :param fullform_index: Use this fullform index during decompounding. Must conform to the structure 138 | used by the OrdbankLemmatizer class. 139 | :type fullform_index: dict[str|unicode, list[dict]] 140 | :param min_match: Minimum string length to match. 141 | :type min_match: int|long 142 | :param pos_match_field: Field in fullform index entry to match POS tag to. 143 | :type pos_match_field: None|str|unicode 144 | :param pos_format: POS tag type, must correspond to POS tag field in fullform index. 145 | :type pos_format: str|unicode 146 | :rtype : list[list[str|unicode]] 147 | :return: List of compound word decompositions into substrings. 
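    Illustrative call (assuming the component forms are listed as fullforms in the index; other
    candidate splits may also appear depending on the index contents):

        decompound(u'lampekostbatteri', fullform_index)
        # expected to include [u'lampe', u'kost', u'batteri'] among the returned candidates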
148 | """ 149 | candidates = flatten(decompound_inner(word, fullform_index, min_match=min_match, 150 | pos_format=pos_format, pos_match_field=pos_match_field)) 151 | 152 | return [c for c in candidates if sum([len(p) for p in c]) == len(word)] 153 | 154 | 155 | class NOBDecompounder(object): 156 | """ 157 | Class implementing a simple decompounding strategy for Norwegian Bokmål using the 158 | Norsk Ordbank lexical database. 159 | 160 | The decompounder uses heuristics and word matching to find and disambiguate 161 | decompounding candidates. 162 | """ 163 | def __init__(self, fullform_index=None, min_match=2, pos_format='simple'): 164 | """ 165 | :param fullform_index: Use this fullform index during decompounding. Must conform to the structure 166 | used by the OrdbankLemmatizer class. 167 | :type fullform_index: dict[str|unicode, list[dict]] 168 | :param min_match: Minimum length of subword that will be matched. 169 | :type min_match: int|long 170 | :param pos_format: POS tag type used for disambiguation. Must match fullform index content. 171 | :type pos_format: str|unicode 172 | """ 173 | super(NOBDecompounder, self).__init__() 174 | 175 | self.min_match = min_match 176 | self.pos_format = pos_format 177 | self.fullform_index = fullform_index 178 | 179 | if not self.fullform_index: 180 | self.fullform_index = OrdbankLemmatizer(ORDBANK_BM_DEFAULT_PATH, feat_norm=self.pos_format).fullform_index 181 | 182 | def decompound(self, word): 183 | """ 184 | Decompose the passed compound word if possible. 185 | 186 | :param word: Word to decompound. 187 | :type word: str|unicode 188 | :rtype : None|list[string|unicode] 189 | :return: A list of words that compose the compound word or None if no decomposition is found. 190 | """ 191 | candidates = decompound(word.lower(), self.fullform_index, min_match=self.min_match, 192 | pos_match_field='pos', pos_format=self.pos_format) 193 | 194 | if not candidates: 195 | return None 196 | else: 197 | # if there are several candidates we will pick the one with the simplest decomposition, ie. the 198 | # one with the fewest elements. 
199 | # if there are still several candidates argmin implicitly chooses the first one since this should 200 | # usually have the longest last component with the current matching strategy 201 | return candidates[argmin([len(c) for c in candidates])] 202 | -------------------------------------------------------------------------------- /python-client/bin/build-wiki-topicmodel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import re 4 | import sys 5 | import re 6 | from gensim import corpora 7 | from gensim.corpora import Dictionary 8 | from gensim.models.lsimodel import LsiModel 9 | from gensim.models.ldamodel import LdaModel 10 | from gensim.models.word2vec import Word2Vec 11 | from gensim.models.hdpmodel import HdpModel 12 | from gensim.models.tfidfmodel import TfidfModel 13 | 14 | from es_text_analytics.data.wikipedia import WikipediaDataset 15 | from es_text_analytics.data.elasticsearch_dataset import ElasticsearchDataset 16 | from nltk.corpus import stopwords 17 | 18 | 19 | def fast_tokenize(str): 20 | return [x.lower() for x in re.findall('[^\W\d_]+', str, re.MULTILINE | re.UNICODE)] 21 | 22 | 23 | def normalize_es(doc): 24 | return doc['_source']['article'] 25 | 26 | 27 | def normalize_wiki(doc): 28 | return doc['article.text'] 29 | 30 | 31 | def normalize_file(doc): 32 | return doc.split('\t')[1] 33 | 34 | 35 | def get_tokenized(page, sw): 36 | return [token for token in fast_tokenize(page) if token not in sw and len(token) > 1] 37 | 38 | 39 | 40 | import re 41 | import string 42 | import tarfile 43 | import codecs 44 | from es_text_analytics.data.dataset import Dataset 45 | from elasticsearch.client import Elasticsearch 46 | from elasticsearch.helpers import scan 47 | 48 | """ 49 | Elasticsearch as data source 50 | 51 | """ 52 | 53 | 54 | class FileDataset(Dataset): 55 | """ 56 | Class encapsulating using a text file as datasource. Assumes file contains lines with documents. 57 | The formatting of the lines are up to you, but remember to extract what you need in the normalize_func 58 | """ 59 | 60 | def __init__(self, dump_fn, num_articles=None, normalize_func=None): 61 | super(FileDataset, self).__init__( normalize_func=normalize_func) 62 | self.dataset_fn = dump_fn 63 | 64 | 65 | def _iterator(self): 66 | with codecs.open(self.dataset_fn, 'r', encoding='utf-8') as f: 67 | for line in f: 68 | yield line 69 | 70 | 71 | 72 | 73 | class IterableDataset(object): 74 | def __init__(self, args_dataset, stopwords, vocabulary, doc2bow=True): 75 | self.dataset = args_dataset 76 | self.doc2bow = doc2bow 77 | self.stopwords = stopwords 78 | self.vocabulary = vocabulary 79 | 80 | def __len__(self): 81 | return sum(1 for _ in self.dataset) 82 | 83 | def __iter__(self): 84 | for page in self.dataset: 85 | doc = get_tokenized(page, self.stopwords) 86 | if self.doc2bow: 87 | yield self.vocabulary.doc2bow(doc) 88 | else: 89 | yield doc 90 | 91 | 92 | # wikidata download https://dumps.wikimedia.org/nowiki/latest/nowiki-latest-pages-articles.xml.bz2 93 | def main(): 94 | parser = ArgumentParser( 95 | description='wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information') 96 | parser.add_argument('-ds', '--dataset', default='wiki', help='What kind of dataset to use. 
(wiki,es,file)') 97 | parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it') 98 | parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki') 99 | parser.add_argument('--model-id', default='model', help='Filename for created model.') 100 | parser.add_argument('--model-type', default='lsi', help='Model type (lsi, lda, word2vec, hdp, vocabulary).') 101 | parser.add_argument('--n-topics', default=10, help='Number of topics to model.') 102 | parser.add_argument('--n-passes', default=1, help='Number of passes for LDA model.') 103 | parser.add_argument('--w2v-size', default=100, help='size of Word2Vec context.') 104 | parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.') 105 | parser.add_argument('-q', '--query', default=None, help='Elasticsearch: Query to use to fetch documents') 106 | parser.add_argument('--index', help='Elasticsearch: index to read from.') 107 | parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.') 108 | parser.add_argument('--data-dir', help='Directory to save the generated models and vocabularies into.') 109 | parser.add_argument('--vocab', help='Prebuilt Vocabulary file. Use this to avoid having to generate one.') 110 | 111 | opts = parser.parse_args() 112 | 113 | model_type = opts.model_type.lower() 114 | if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']: 115 | logging.error("Invalid model type %s" % model_type) 116 | parser.print_usage() 117 | exit(-1) 118 | 119 | logging.info("Using model type %s" % model_type) 120 | 121 | dump_fn = opts.dump_file 122 | limit = int(opts.limit) if opts.limit else None 123 | 124 | data_type = opts.dataset.lower() 125 | if data_type not in ['es', 'wiki', 'file']: 126 | logging.error("Invalid dataset type %s" % data_type) 127 | parser.print_usage() 128 | exit(-1) 129 | limit = None 130 | if opts.limit: 131 | limit = int(opts.limit) 132 | if not dump_fn and data_type in ['wiki']: 133 | logging.error('--dump-file required for wiki dataset') 134 | sys.exit(1) 135 | 136 | query = opts.query 137 | index = opts.index 138 | doc_type = opts.doc_type 139 | if data_type == 'es' and index is None: 140 | logging.error( 141 | "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter") 142 | sys.exit(1) 143 | 144 | n_topics = int(opts.n_topics) 145 | n_passes = int(opts.n_passes) 146 | logging.info("Using %d topics." % n_topics) 147 | data_dir = opts.data_dir 148 | model_id = opts.model_id 149 | model_fn = '%s_%s_%d' % (model_id, model_type, n_topics) 150 | if data_dir: 151 | model_fn = '%s/%s' % (data_dir, model_fn) 152 | if model_type == 'word2vec': 153 | w2v_size = int(opts.w2v_size) 154 | w2v_window = int(opts.w2v_window) 155 | model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size) 156 | logging.info("Writing models to %s." 
% model_fn) 157 | 158 | if data_type == 'es': 159 | logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query)) 160 | dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type, query=query, 161 | normalize_func=normalize_es) 162 | elif data_type == 'wiki': 163 | logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) 164 | dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki) 165 | elif data_type == 'file': 166 | logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) 167 | dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file) 168 | vocab_file = opts.vocab 169 | vocab = Dictionary() 170 | sw = set(stopwords.words('norwegian')) 171 | if not vocab_file or model_type == 'vocabulary': 172 | vocab.add_documents([get_tokenized(page, sw) for page in dataset]) 173 | vocab.filter_extremes() 174 | vocab.compactify() 175 | vocab.save(model_fn + '.vocab') 176 | else: 177 | vocab = Dictionary.load(vocab_file) 178 | if model_type == 'vocabulary': 179 | return 180 | tfidf = TfidfModel(dictionary=vocab) 181 | if model_type == 'lsi': 182 | corpus = IterableDataset(dataset, sw, vocab) 183 | model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, 184 | id2word=vocab) 185 | elif model_type == 'lda': 186 | corpus = IterableDataset(dataset, sw, vocab) 187 | model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, 188 | id2word=vocab) 189 | 190 | elif model_type == 'word2vec': 191 | corpus = IterableDataset(dataset, sw, vocab, doc2bow=False) 192 | corpus.dictionary = vocab 193 | model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size) 194 | elif model_type == 'hdp': 195 | corpus = IterableDataset(dataset, sw, vocab) 196 | model = HdpModel(corpus=tfidf[corpus], id2word=vocab) 197 | 198 | logging.info(model) 199 | model.save(model_fn) 200 | 201 | 202 | if __name__ == '__main__': 203 | logformat = '%(asctime)s %(name)-12s: %(message)s' 204 | logging.basicConfig(level=logging.INFO, format=logformat, filename='wiki-topicmodel.log' ) 205 | console = logging.StreamHandler() 206 | formatter = logging.Formatter(logformat) 207 | console.setFormatter(formatter) 208 | logging.getLogger('').addHandler(console) 209 | main() 210 | 211 | # ########## sample usage 212 | # 213 | #--model-type=lda -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 214 | #--model-type=lda -ds es --n-topics 10 --index wiki --query "{\"query\":{\"match\": {\"_all\":\"kongo\"}}}" 215 | #--model-type=word2vec -ds es --index wiki --w2v_window=7 --w2v_size=75 216 | #--model-type=hdp -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 217 | #--model-type=lda -ds file --n-topics 10 -d f:/projects/comperio-text-analytics/models/dump 218 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/tagger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import os 4 | import re 5 | from tarfile import TarFile 6 | from zipfile import ZipFile 7 | import sys 8 | from subprocess import Popen, PIPE 9 | 10 | from nltk.tag.hunpos import HunposTagger 11 | from textblob.base import BaseTagger 12 | 13 | from es_text_analytics.data.dataset import project_path, 
download_file 14 | from es_text_analytics.tokenizer import NOTokenizer 15 | 16 | 17 | 18 | 19 | 20 | 21 | # TextBlob compatible part-of-speech tagger for Norwegian. 22 | 23 | # default HunPos model locations 24 | NOB_TAGGER_DEFAULT_MODEL_FN = os.path.join(project_path(), 'models', 'nob-tagger-default-model') 25 | NNO_TAGGER_DEFAULT_MODEL_FN = os.path.join(project_path(), 'models', 'nno-tagger-default-model') 26 | 27 | HUNPOS_URL_MAP = { 28 | 'linux2': 'https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz', 29 | 'darwin': 'https://hunpos.googlecode.com/files/hunpos-1.0-macosx.tgz', 30 | 'win32': 'https://hunpos.googlecode.com/files/hunpos-1.0-win.zip' 31 | } 32 | 33 | HUNPOS_SUBDIR_MAP = { 34 | 'win32': 'hunpos-1.0-win', 35 | 'darwin': 'hunpos-1.0-macosx', 36 | 'linux2': 'hunpos-1.0-linux' 37 | } 38 | 39 | 40 | def obt_to_universal_tag(form, pos, feats): 41 | """ 42 | Maps OBT POS tags and features as found in the NDT to universal POS tags as described in 43 | http://universaldependencies.github.io/docs/u/pos/index.html 44 | 45 | The mapping is not complete or completely precises because of discrepancies between the OBT/NDT 46 | annotation and this tagset. For example: 47 | 48 | - AUX is not annotated in NDT and would need a wordlist to extract properly. 49 | - NUM is done mostly heuristically since NDT does nt encode numbers. Ideally numbers and similar 50 | entities should be normalized before tagging. 51 | 52 | :param form: NDT word form. 53 | :type form: str|unicode 54 | :param pos: OBT pos tag. 55 | :type pos: str|unicode 56 | :param feats: OBT features encoded as | separated string as in NDT. 57 | :type feats: str|unicode 58 | :rtype : str|unicode 59 | :return: Normalized universal POS tag. 60 | """ 61 | feats = feats.split('|') 62 | 63 | if re.search('\d', form): 64 | return 'NUM' 65 | 66 | if pos == 'adj': 67 | return 'ADJ' 68 | if pos == 'adv': 69 | return 'ADV' 70 | if pos == 'konj': 71 | return 'CONJ' 72 | if pos == 'det' and ('' in feats or '' in feats): 73 | return 'NUM' 74 | if pos == 'det': 75 | return 'DET' 76 | if pos == 'interj': 77 | return 'INTJ' 78 | # we'll include dates among the proper nouns 79 | if pos == 'subst' and ('prop' in feats or '' in feats or 'fork' in feats or '' in feats): 80 | return 'PROPN' 81 | if pos == 'subst' and 'symb' in feats: 82 | return 'SYM' 83 | if pos == 'subst': 84 | return 'NOUN' 85 | if pos == 'pron': 86 | return 'PRON' 87 | if pos in ['clb', '', '', '', '', '']: 88 | return 'PUNCT' 89 | if pos == 'sbu': 90 | return 'SCONJ' 91 | if pos == 'symb': 92 | return 'SYM' 93 | if pos in ['inf-merke', 'verb']: 94 | return 'VERB' 95 | if pos == 'prep': 96 | return 'ADP' 97 | 98 | return 'X' 99 | 100 | 101 | def obt_to_simple(form, pos, feats): 102 | """ 103 | Mapping from OBT to a simple POS tag set including a small set of basic features into the tag. 104 | 105 | - Heuristically extracts number tag. 106 | - Normalizes punctuation to single tag. 107 | - Includes pronoun type. 108 | - Normalizes proper noun tags and features. 109 | - includes passive feature on verbs. 110 | 111 | :param form: NDT word form. 112 | :type form: str|unicode 113 | :param pos: OBT pos tag. 114 | :type pos: str|unicode 115 | :param feats: OBT features encoded as | separated string as in NDT. 116 | :type feats: str|unicode 117 | :rtype : str|unicode 118 | :return: Normalized POS tag. 
119 | """ 120 | feats = feats.split('|') 121 | 122 | if re.search('\d', form): 123 | return 'NUM' 124 | 125 | if pos == 'det' and ('' in feats or '' in feats): 126 | return 'NUM' 127 | 128 | if pos in ['clb', '', '', '', '', '']: 129 | return 'PUNKT' 130 | 131 | if pos == 'pron': 132 | for feat in ['sp', 'pers', 'poss', 'refl']: 133 | if feat in feats: 134 | return ('%s_%s' % (pos, feat)).upper() 135 | 136 | if pos == 'subst': 137 | if 'sym' in feats: 138 | return 'SYMB' 139 | 140 | # include dates 141 | for feat in ['prop', '', 'fork', '']: 142 | if feat in feats: 143 | return 'SUBST_PROP' 144 | 145 | if pos == 'verb' and 'pass' in feats: 146 | return 'VERB_PASS' 147 | 148 | return pos.upper() 149 | 150 | 151 | # maps feature normalization identifiers to the functions 152 | FEATURES_MAP = {'universal': obt_to_universal_tag, 153 | 'simple': obt_to_simple, 154 | # removes all features and includes just the bare POS tag 155 | 'no-feats': lambda form, pos, feats: pos, 156 | # includes all features execpt blank ones 157 | 'all-feats': lambda form, pos, feats: '%s_%s' % (pos, '_'.join([f for f in feats.split('|') if f != '_']))} 158 | 159 | 160 | def install_hunpos(): 161 | """ 162 | Downloads and install system appropriate HunPos binaries in the default location. 163 | 164 | :rtype : None 165 | """ 166 | models_dir = os.path.join(project_path(), 'models') 167 | 168 | hunpos_archive_fn = download_file(HUNPOS_URL_MAP[sys.platform], models_dir) 169 | 170 | if sys.platform == 'win32': 171 | with ZipFile(hunpos_archive_fn) as f: 172 | f.extractall(models_dir) 173 | else: 174 | with TarFile(hunpos_archive_fn) as f: 175 | f.extractall(models_dir) 176 | 177 | os.remove(hunpos_archive_fn) 178 | 179 | 180 | def hunpos_path(): 181 | """ 182 | Returns the system specifiuc default install directory for HunPos binaries. 183 | 184 | :rtype : str|unicode 185 | :return: 186 | """ 187 | return os.path.join(project_path(), 'models', HUNPOS_SUBDIR_MAP[sys.platform]) 188 | 189 | 190 | def hunpos_tag_bin(): 191 | """ 192 | Path to system specific hunpos-tag binary. 193 | 194 | :rtype : str|unicode 195 | :return: 196 | """ 197 | if sys.platform == 'win32': 198 | return os.path.join(hunpos_path(), 'hunpos-tag.exe') 199 | else: 200 | return os.path.join(hunpos_path(), 'hunpos-tag') 201 | 202 | 203 | def hunpos_train_bin(): 204 | """ 205 | Path to system specific hunpos-train binary. 206 | 207 | :rtype : str|unicode 208 | :return: 209 | """ 210 | if sys.platform == 'win32': 211 | return os.path.join(hunpos_path(), 'hunpos-train.exe') 212 | else: 213 | return os.path.join(hunpos_path(), 'hunpos-train') 214 | 215 | 216 | def parse_hunpos_train_output(output): 217 | """ 218 | Parses hunpos-train output and collects the reported statistics. 219 | 220 | Includes: 221 | - error messages (errors) 222 | - # of sentences and # of tokens (sentences, tokens) 223 | - # of uppercase and lowercase tokens (n_upper, n_lower) 224 | - # of different POS tags (tag_card) 225 | 226 | :param output: String with newline separated output from hunpos-train 227 | :rtype : dict 228 | :return: Dict with statistics reported by hunpos-train. 
229 | """ 230 | stats = {'errors': []} 231 | 232 | for line in output.split('\n'): 233 | line = line.strip() 234 | 235 | m = re.match('(\d+) tokens', line) 236 | if m: 237 | stats['tokens'] = int(m.group(1)) 238 | 239 | m = re.match('(\d+) sentences', line) 240 | if m: 241 | stats['sentences'] = int(m.group(1)) 242 | 243 | m = re.match('(\d+) different tag', line) 244 | if m: 245 | stats['tag_card'] = int(m.group(1)) 246 | 247 | m = re.match('(\d+) lowercase', line) 248 | if m: 249 | stats['n_lower'] = int(m.group(1)) 250 | 251 | m = re.match('(\d+) uppercase tokens', line) 252 | if m: 253 | stats['n_upper'] = int(m.group(1)) 254 | 255 | m = re.match('theta = (\d\.\d+)', line) 256 | if m: 257 | stats['theta'] = float(m.group(1)) 258 | 259 | # the error format is not documented so this will suffice for now 260 | m = re.search('error', line, re.IGNORECASE) 261 | if m: 262 | stats['errors'] += line 263 | 264 | return stats 265 | 266 | 267 | def train_hunpos_model(seq, model_fn): 268 | """ 269 | Trains a HunPos POS tagger on the sentences passed as seq using the external hunpos-train binary. 270 | 271 | Models use UTF-8 encoding. 272 | 273 | :param seq: Iterator with sentences. Sentences are iterators with word form/pos tag tuples. 274 | :param model_fn: File where the resulting model will be stored. 275 | :type model_fn: str|unicode 276 | :rtype : dict 277 | :return: Reported statistics printed by hunpos-train 278 | """ 279 | 280 | # We'll be doind it simple here. 281 | # Just write all the data to stdin and catch potential errors on stderr afterwards. 282 | train_proc = Popen([hunpos_train_bin(), model_fn], stdin=PIPE, stderr=PIPE) 283 | 284 | for sent in seq: 285 | for form, tag in sent: 286 | line = b'%s\t%s\n' % (form, tag) 287 | line = line.encode('utf-8') 288 | train_proc.stdin.write(line) 289 | 290 | train_proc.stdin.write('\n') 291 | 292 | train_proc.stdin.close() 293 | 294 | # parse the output 295 | # hunpos-trai reports results and errors on stderr 296 | stats = parse_hunpos_train_output(train_proc.stderr.read()) 297 | 298 | train_proc.wait() 299 | 300 | # check if the stats reports any errors 301 | if len(stats['errors']) != 0: 302 | logging.error('HunPos failed with error messages ...') 303 | 304 | for error in stats['errors']: 305 | logging.error(error) 306 | 307 | return stats 308 | 309 | 310 | def clean_input(string): 311 | return re.sub('\n', ' ', string) 312 | 313 | 314 | class NOBTagger (BaseTagger, object): 315 | """ 316 | TextBlob compatible Norsk Bokmål POS tagger class based on the NLTK HunPos wrapper. 317 | """ 318 | def __init__(self, model_fn=None): 319 | self.tokenizer = NOTokenizer() 320 | self.tagger = HunposTagger(NOB_TAGGER_DEFAULT_MODEL_FN, 321 | hunpos_tag_bin(), encoding='utf-8') 322 | 323 | def tag(self, text, tokenize=True): 324 | 325 | if tokenize: 326 | text = clean_input(text) 327 | text = self.tokenizer.tokenize(text) 328 | 329 | return self.tagger.tag(text) 330 | 331 | 332 | class NNOTagger (BaseTagger, object): 333 | """ 334 | TextBlob compatible Norsk Nynorsk POS tagger class based on the NLTK HunPos wrapper. 
335 | """ 336 | def __init__(self, model_fn=None): 337 | self.tokenizer = NOTokenizer() 338 | self.tagger = HunposTagger(NNO_TAGGER_DEFAULT_MODEL_FN, 339 | hunpos_tag_bin(), encoding='utf-8') 340 | 341 | def tag(self, text, tokenize=True): 342 | text = clean_input(text) 343 | 344 | if tokenize: 345 | text = self.tokenizer.tokenize(text) 346 | 347 | return self.tagger.tag(text) 348 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/term_weight_provider.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | import logging 3 | from math import log 4 | import re 5 | 6 | from elasticsearch.client import IndicesClient 7 | from gensim.corpora import Dictionary 8 | from gensim.models import TfidfModel 9 | 10 | ES_TERM_WEIGHTING_INDEX_DEFAULT_NAME = 'es_term_weighting_index' 11 | 12 | ES_TERMWEIGHTING_INDEX_SETTINGS = {"mappings": { 13 | "term": {"properties": {"form": {"type": "string", "index": "not_analyzed"}, "value": {"type": "float"}}}}} 14 | 15 | 16 | class TermWeightingProvider: 17 | """ 18 | Base class for term weighting providers handling common weighting transforms, basic missing terms policies and 19 | the weight retrieval API. 20 | """ 21 | __metaclass__ = ABCMeta 22 | 23 | def __init__(self, missing='error', inverse=False, sublinear=False): 24 | """ 25 | :param missing: Missing terms policy. 'error' raises KeyError, 'ignore' removes missing terms from the result, 26 | and a number value returns that value for missing terms. 27 | :type missing: str|unicode|int|long|float 28 | :param inverse: Inverse the frequency ratio (for IDF and similar weighttings). 29 | :type inverse:bool 30 | :param sublinear: Log the frequency ratio. Applied after inversion if applicable. 31 | :type sublinear:bool 32 | :raise ValueError: When passed invalid missing argument. 33 | """ 34 | self.inverse = inverse 35 | self.sublinear = sublinear 36 | 37 | self.default_value = None 38 | 39 | if isinstance(missing, (str, unicode)): 40 | self.missing_value_policy = missing 41 | elif isinstance(missing, (int, long, float)): 42 | self.missing_value_policy = 'value' 43 | self.default_value = float(missing) 44 | else: 45 | raise ValueError 46 | 47 | def _handle_missing_term(self, term): 48 | """ 49 | Implements missing terms according to configured policy 50 | 51 | :param term: 52 | :type term:str|unicode 53 | :rtype : None|float 54 | :return: :raise KeyError: 55 | """ 56 | if self.missing_value_policy == 'error': 57 | raise KeyError(term) 58 | elif self.missing_value_policy == 'ignore': 59 | return None 60 | else: 61 | return self.default_value 62 | 63 | def __getitem__(self, terms): 64 | """ 65 | Retrieves the weights for one or more term. 66 | 67 | :param terms: single or list of terms 68 | :type terms: str|unicode|list|tuple 69 | :rtype : (str|unicode, float)|list 70 | :raise NotImplementedError: 71 | """ 72 | # single term retrievals are returned in a special format 73 | single = False 74 | 75 | if isinstance(terms, (str, unicode)): 76 | terms = [terms] 77 | single = True 78 | 79 | # retrieve the term weight map implemented in the superclass 80 | tw = self._weights_for_terms(terms) 81 | 82 | w = [] 83 | 84 | for term in terms: 85 | if term in tw: 86 | w.append((term, tw[term])) 87 | else: 88 | # check missing value policy 89 | val = self._handle_missing_term(term) 90 | 91 | if val: 92 | w.append((term, val)) 93 | 94 | # do transforms 95 | if self.inverse: 96 | w = [(term, 1. 
/ freq) for term, freq in w] 97 | 98 | if self.sublinear: 99 | w = [(term, log(freq)) for term, freq in w] 100 | 101 | # if we're returning a single or null result we unwrap the list 102 | if single and (len(w) == 1): 103 | return w[0] 104 | elif len(w) == 0: 105 | return None 106 | else: 107 | return w 108 | 109 | @abstractmethod 110 | def _weights_for_terms(self, terms): 111 | """ 112 | Implement this method to retrieve the actual weights for the terms in the query. 113 | If a term is missing it should not be included, the base class will handle missing values. 114 | 115 | :param terms: 116 | :type terms:list 117 | :rtype : dict 118 | :raise NotImplementedError: 119 | """ 120 | raise NotImplementedError 121 | 122 | 123 | def weight_map_from_term_counts(term_count_iter, min_count=1): 124 | """ 125 | Create a map of terms and their frequencies from a list of terms and counts. 126 | 127 | :param term_count_iter: An iterator with tuples of terms and counts, ie, (term, count). 128 | :param min_count: Minimum count value that will be added to the weight map: 129 | :type min_count: int|long 130 | :rtype : dict 131 | :return: A dict with the terms as keys and the frequency ratios as values. 132 | """ 133 | weight_map = {} 134 | total = 0 135 | 136 | for term, count in term_count_iter: 137 | total += count 138 | 139 | if count >= min_count: 140 | weight_map[term] = weight_map.get(term, 0) + count 141 | 142 | for term in weight_map.keys(): 143 | w = weight_map[term] / float(total) 144 | 145 | weight_map[term] = w 146 | 147 | return weight_map 148 | 149 | 150 | def term_counts_line_parser(line, delim='\t', term_index=1, count_index=2): 151 | """ 152 | Parses a line from a file with terms and counts as line items. 153 | 154 | The defaults f.ex. parses "34\tba\t45\n" into ('ba', 45) 155 | 156 | :param line: 157 | :type line: unicode|str 158 | :param delim: Character used to split tokens. 159 | :type delim: unicode|str 160 | :param term_index: Token index for the term element. 161 | :type term_index: int|long 162 | :param count_index: Token index for the count element. 163 | :type count_index: int|long 164 | :rtype : (unicode|str, int|long) 165 | :return: Tuple with the term and count from the passed line string. 166 | """ 167 | tokens = line.split(delim) 168 | 169 | return tokens[term_index], int(tokens[count_index]) 170 | 171 | 172 | def term_counts_iter_from_file(f, line_parser=None): 173 | """ 174 | Reads term counts from a file with term/count pairs as line items. 175 | 176 | :param f: A FileIO instance 177 | :type f: FileIO 178 | :param line_parser: Function that parses a line into a term, count tuple. Default parses Gensim Dictionary 179 | text format. 180 | :type line_parser: function 181 | :rtype : generator 182 | """ 183 | if not line_parser: 184 | line_parser = term_counts_line_parser 185 | 186 | for line in f: 187 | yield line_parser(line) 188 | 189 | 190 | class SimpleTermWeightProvider(TermWeightingProvider): 191 | """ 192 | Simple term weight provider for term count ratios supplied by an iterator. Takes options for returning 193 | logged or inverse ratios. 
194 | """ 195 | 196 | def __init__(self, term_count_iter, **kwargs): 197 | super(SimpleTermWeightProvider, self).__init__(**kwargs) 198 | 199 | self.weight_map = weight_map_from_term_counts(term_count_iter) 200 | 201 | def _weights_for_terms(self, terms): 202 | # just return the whole weight dict 203 | return self.weight_map 204 | 205 | 206 | class ESTermAggregationWeightProvider(TermWeightingProvider): 207 | """ 208 | Term weight provider for DF/IDF values based on an Elasticsearch index using the terms aggregator. 209 | 210 | Defaults to logged IDF values. 211 | """ 212 | 213 | def __init__(self, es, index, doc_type, field, **kwargs): 214 | super(ESTermAggregationWeightProvider, self).__init__(**kwargs) 215 | 216 | self.es = es 217 | self.index = index 218 | self.doc_type = doc_type 219 | self.field = field 220 | 221 | def _weights_for_terms(self, terms): 222 | q = {"size": 0, 223 | "aggs": {"df": {"terms": {"field": self.field, "size": len(terms), 224 | "include": '|'.join([re.escape(term) for term in terms])}}}} 225 | 226 | resp = self.es.search(index=self.index, doc_type=self.doc_type, body=q) 227 | 228 | try: 229 | n_doc = resp['hits']['total'] 230 | tf = dict((e['key'], e['doc_count'] / float(n_doc)) for e in resp['aggregations']['df']['buckets']) 231 | except KeyError: 232 | # malformed response 233 | raise RuntimeError 234 | 235 | return dict(tf) 236 | 237 | 238 | class GensimIDFProvider(TermWeightingProvider): 239 | """ 240 | IDF TermWeightingProvider based on a Gensim Dictionary using the Gensim TfIdf model. 241 | """ 242 | def __init__(self, dictionary, **kwargs): 243 | super(GensimIDFProvider, self).__init__(**kwargs) 244 | 245 | if {'missing', 'linear', 'linear'} <= set(kwargs): 246 | logging.warning('<%s> argumemts to GensimIDFProvider can generate incorrect weights and should not be used' 247 | % '|'.join({'missing', 'linear', 'linear'})) 248 | 249 | if isinstance(dictionary, (str, unicode)): 250 | dictionary = Dictionary.load(dictionary) 251 | self.dictionary = dictionary 252 | self.tfidf = TfidfModel(dictionary=dictionary, normalize=False) 253 | 254 | def _weights_for_terms(self, terms): 255 | return {self.dictionary[bow_id]: val for bow_id, val in self.tfidf[self.dictionary.doc2bow(terms)]} 256 | 257 | 258 | class ESTermIndexWeightingProvider(TermWeightingProvider): 259 | """ 260 | Class implementing storage of term weights in an Elasticsearch index. 261 | """ 262 | def __init__(self, es, index=None, initial_weights=None, **kwargs): 263 | """ 264 | :param es: Elasticsearch instance from py-elasticsearch API. 265 | :type es:elasticsearch.Elasticsearch 266 | :param index: Name of the index where term weights are stored. If it doesn't exist it is created. 267 | :type index:str|unicode 268 | :param initial_weights: Iterator with term/weight pairs that will be added to the index during initialization. 269 | """ 270 | super(ESTermIndexWeightingProvider, self).__init__(**kwargs) 271 | 272 | self.es = es 273 | self.index = index 274 | 275 | if not self.index: 276 | self.index = ES_TERM_WEIGHTING_INDEX_DEFAULT_NAME 277 | 278 | ESTermIndexWeightingProvider._create_weight_index(self.es, self.index) 279 | 280 | if initial_weights: 281 | ESTermIndexWeightingProvider._add_terms_iter(self.es, self.index, initial_weights) 282 | 283 | @staticmethod 284 | def _create_weight_index(es, index): 285 | """ 286 | Creates the index with the right mapping if it doesn't exist. 
287 | 
288 |         :param es:
289 |         :type es:elasticsearch.Elasticsearch
290 |         :param index:
291 |         :type index:str|unicode
292 |         """
293 |         ic = IndicesClient(es)
294 | 
295 |         if ic.exists(index):
296 |             logging.info('Index %s already exists ...' % index)
297 |         else:
298 |             ic.create(index=index, body=ES_TERMWEIGHTING_INDEX_SETTINGS)
299 | 
300 |     @staticmethod
301 |     def _add_terms_iter(es, index, iter, bulk_size=1000):
302 |         """
303 |         Adds term documents to the index from the term weight pairs in the iterator.
304 | 
305 |         :param es:
306 |         :type es:elasticsearch.Elasticsearch
307 |         :param index:
308 |         :type index:str|unicode
309 |         :param iter:
310 |         """
311 |         bulk_actions = []
312 |         count = 0
313 | 
314 |         for term, weight in iter:
315 |             count += 1
316 | 
317 |             bulk_actions += [{'index': {'_index': index, '_type': 'term'}},
318 |                              {'form': term, 'value': weight}]
319 | 
320 |             if len(bulk_actions) % (2 * bulk_size) == 0:
321 |                 es.bulk(index=index, doc_type='term', body=bulk_actions)
322 |                 logging.info('Added %d documents ...' % count)
323 |                 bulk_actions = []
324 | 
325 |         if bulk_actions:
326 |             es.bulk(index=index, doc_type='term', body=bulk_actions)
327 |             logging.info('Added %d documents ...' % count)
328 | 
329 |     def _weights_for_terms(self, terms):
330 |         should_clauses = [{'match': {'form': term}} for term in terms]
331 | 
332 |         resp = self.es.search(index=self.index, doc_type='term',
333 |                               body={'query': {
334 |                                   'bool': {
335 |                                       'should': should_clauses
336 |                                   }
337 |                               },
338 |                                     'fields': ['form', 'value']})
339 | 
340 |         return {hit['fields']['form'][0]: float(hit['fields']['value'][0]) for hit in resp['hits']['hits']}
341 | 
--------------------------------------------------------------------------------
/python-client/es_text_analytics/data/aviskorpus.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import os
4 | import re
5 | import unicodedata
6 | from StringIO import StringIO
7 | from gzip import GzipFile
8 | from tarfile import TarFile
9 | from zipfile import ZipFile
10 | 
11 | from bs4 import BeautifulSoup
12 | 
13 | from es_text_analytics.data.dataset import Dataset
14 | 
15 | AVISKORPUS_ARCHIVE_URL='http://www.nb.no/sbfil/tekst/norsk_aviskorpus.zip'
16 | 
17 | # Map of corpus sections, corresponding files inside the main archive and codes for the newspaper in
18 | # each section. See http://www.nb.no/sprakbanken/show?serial=sbr-4&lang=nb for details.
19 | CORPUS_SECTIONS = {
20 |     '1': {'paths': ['1/19981013-20010307.gz',
21 |                     '1/20010308-20030116.gz',
22 |                     '1/20030116-20050403.gz'],
23 |           'name': 'ti-aviser-1998-2005',
24 |           'sources': ['aa', 'ap', 'bt', 'da', 'db',
25 |                       'dn', 'fv', 'nl', 'sa', 'vg']},
26 |     '2': {'paths': ['2/aa.tar.gz',
27 |                     '2/ap.tar.gz',
28 |                     '2/bt.tar.gz',
29 |                     '2/da.tar.gz',
30 |                     '2/db.tar.gz',
31 |                     '2/dn.tar.gz',
32 |                     '2/fv.tar.gz',
33 |                     '2/nl.tar.gz',
34 |                     '2/sa.tar.gz',
35 |                     '2/vg.tar.gz'],
36 |           'name': 'ti-aviser-2005-2011',
37 |           'sources': ['aa', 'ap', 'bt', 'da', 'db',
38 |                       'dn', 'fv', 'nl', 'sa', 'vg']},
39 |     '3': {'paths': ['3/dt.tar.gz',
40 |                     '3/fi.tar.gz',
41 |                     '3/hd.tar.gz',
42 |                     '3/ho.tar.gz',
43 |                     '3/kk.tar.gz',
44 |                     '3/na.tar.gz',
45 |                     '3/sh.tar.gz',
46 |                     '3/so.tar.gz',
47 |                     '3/sp.tar.gz',
48 |                     '3/vb.tar.gz',
49 |                     '3/vt.tar.gz'],
50 |           'name': 'andre-aviser',
51 |           'sources': ['dt', 'fi', 'hd', 'ho', 'kk', 'na',
52 |                       'sh', 'so', 'sp', 'vb', 'vt']}}
53 | 
54 | 
55 | def match_or_none(pattern, string, flags=0):
56 |     """
57 |     Small wrapper for regexes with one match group which may or may not match.
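    Illustrative examples (added; the patterns follow the section 2 header format
    used further below in this module):

        match_or_none('^##A (\d\d)>$', '##A 99>')    # -> '99'
        match_or_none('^##A (\d\d)>$', 'no header')  # -> None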
58 | 59 | Matches on whole string not just the beginning (ie. uses re.search). 60 | 61 | :param pattern: Regular expression with a single match group. 62 | :type pattern: unicode|str 63 | :param string: String to match against. 64 | :type string: unicode|str 65 | :param flags: 66 | :rtype : str|unicode|None 67 | :return: The match group string. 68 | """ 69 | m = re.search(pattern, string, flags=flags) 70 | 71 | if m: 72 | if len(m.groups()) == 1: 73 | return m.group(1) 74 | 75 | return m.groups() 76 | 77 | 78 | def section_1_header_line(line): 79 | """ 80 | Detects and extracts url from section 1 header line. 81 | 82 | :param line: Line from section 1 data file. 83 | :type line: str|unicode 84 | :rtype : None|unicode|str 85 | :return: Return the url in the header or None if the passed string is not a section 1 header line. 86 | """ 87 | m = re.search('', line) 88 | 89 | if m: 90 | return m.group(1) 91 | 92 | 93 | def section_1_parser(fileobj): 94 | """ 95 | Parser for section 1 data files. 96 | Returns a generator with dict instances with the article data. 97 | 98 | :param fileobj: A file like instance with section 1 formatted text. 99 | :rtype : generator 100 | """ 101 | line = fileobj.readline() 102 | tokens = [] 103 | doc = None 104 | 105 | while line: 106 | line = line.decode('latin1') 107 | line = line.strip() 108 | 109 | if line == '': 110 | pass 111 | elif section_1_header_line(line): 112 | # skip empty documents 113 | if tokens: 114 | doc['tokens'] = [unicodedata.normalize('NFC', unicode(token)) for token in tokens] 115 | doc['corpus_section'] = 1 116 | 117 | yield doc 118 | tokens = [] 119 | 120 | url = section_1_header_line(line) 121 | fileobj.readline() 122 | source_code = match_or_none('^$', fileobj.readline().strip()) 123 | year = match_or_none('^$', fileobj.readline().strip()) 124 | pub_year = int(year) if year else None 125 | month = match_or_none('^$', fileobj.readline().strip()) 126 | pub_month = int(month) if month else None 127 | day = match_or_none('^$', fileobj.readline().strip()) 128 | pub_day = int(day) if day else None 129 | 130 | doc = {'url': url, 'source': source_code, 131 | 'pub_year': pub_year, 'pub_month': pub_month, 'pub_day': pub_day} 132 | else: 133 | # article content consists of tokens, one on each line 134 | tokens.append(line) 135 | 136 | line = fileobj.readline() 137 | 138 | # catch the last document 139 | if doc and tokens: 140 | doc['tokens'] = [unicodedata.normalize('NFC', unicode(token)) for token in tokens] 141 | doc['corpus_section'] = 1 142 | 143 | yield doc 144 | 145 | 146 | def section_2_header_line(line): 147 | """ 148 | Detects and extracts url from section 2 header line. 149 | 150 | :param line: Line from section 2 data file. 151 | :type line: str|unicode 152 | :rtype : None|unicode|str 153 | :return: Return the url in the header or None if the passed string is not a section 1 header line. 154 | """ 155 | m = re.search('##U #(http://.*)>', line) 156 | 157 | if m: 158 | return m.group(1) 159 | 160 | 161 | def section_2_parser(fileobj): 162 | """ 163 | Parser for section 2 data files. 164 | Returns a generator with dict instances with the article data. 165 | 166 | :param fileobj: A file like instance with section 2 formatted text. 
167 | :rtype : generator 168 | """ 169 | line = fileobj.readline() 170 | text = '' 171 | doc = None 172 | 173 | while line: 174 | line = line.decode('latin1') 175 | line = line.strip() 176 | 177 | if line == '' or line == '|': 178 | pass 179 | elif section_2_header_line(line): 180 | # skip articles with no content 181 | if text and doc: 182 | # content consists of text lines with header sections delimited by | characters and 183 | # sentences delimited by paragraph characters 184 | text = text.replace(u'¶', u'|') 185 | doc['sentences'] = [unicodedata.normalize('NFC', unicode(sent.strip())) 186 | for sent in text.split(u'|') if sent.strip() != ''] 187 | doc['corpus_section'] = 2 188 | 189 | yield doc 190 | text = '' 191 | 192 | url = section_2_header_line(line) 193 | source_code = match_or_none('^##B (\w\w)>$', fileobj.readline().strip()) 194 | year = match_or_none('^##A (\d\d)>$', fileobj.readline().strip()) 195 | pub_year = int(year) if year else None 196 | month = match_or_none('^##M (\d\d)>$', fileobj.readline().strip()) 197 | pub_month = int(month) if month else None 198 | day = match_or_none('^##D (\d\d)>$', fileobj.readline().strip()) 199 | pub_day = int(day) if day else None 200 | 201 | doc = {'url': url, 'source': source_code, 202 | 'pub_year': pub_year, 'pub_month': pub_month, 'pub_day': pub_day} 203 | else: 204 | text += line 205 | 206 | line = fileobj.readline() 207 | 208 | # yield the last document in the file 209 | if doc and text: 210 | text = text.replace(u'¶', u'|') 211 | doc['sentences'] = [unicodedata.normalize('NFC', unicode(sent.strip())) 212 | for sent in text.split(u'|') if sent.strip() != ''] 213 | doc['corpus_section'] = 2 214 | yield doc 215 | 216 | 217 | def section_3_parser(fileobj): 218 | """ 219 | Parser for section 3 data files. 220 | Returns a dict instance with the article data. 221 | 222 | :param fileobj: A file like instance with section 2 formatted text. 223 | :rtype : dict 224 | """ 225 | xml_doc = BeautifulSoup(fileobj) 226 | 227 | # each article is in a separate XML file. We pick out metadata from attribute tags and div 228 | # tags with the type attribute. The content is in a div tag with the text attribute. 229 | metadata = dict([(a['name'], unicodedata.normalize('NFC', unicode(a['value']).strip())) 230 | for a in xml_doc.find_all('attribute')]) 231 | text = [unicodedata.normalize('NFC', unicode(t.text).strip()) 232 | for t in xml_doc.find('div', type='text').find_all('p')] 233 | 234 | doc = dict([(t.attrs['type'], unicodedata.normalize('NFC', unicode(t.text).strip())) for t in xml_doc.find_all('div') 235 | if 'type' in t.attrs and t.attrs['type'] != 'text']) 236 | doc['text'] = [t for t in text if t != ''] 237 | doc['metadata'] = metadata 238 | doc['corpus_section'] = 3 239 | 240 | return doc 241 | 242 | 243 | def iterator(dataset_fn, sections=None, sources=None): 244 | """ 245 | Generator that yields all the documents in the korpus. 246 | The generator can return only specific newspaper or sections if specified in 247 | the arguments, 248 | 249 | :param dataset_fn: Dataset archive file. None uses default location. 250 | :type dataset_fn: str|unicode|None 251 | :param sections: Sections to include. The default None yields all sections. 252 | :type sections: list[int|long]|None 253 | :param sources: Newspaper sources to include. The default None yields all sources. 
254 | :type sources: list[str|unicode]|None 255 | :rtype : generator 256 | """ 257 | count = 0 258 | 259 | with ZipFile(dataset_fn) as zf: 260 | # corpus content files are compressed and archived in various ways inside the corpus zip archive. 261 | if not sections or 1 in sections: 262 | for fn in CORPUS_SECTIONS['1']['paths']: 263 | logging.info('Reading %s ...' % fn) 264 | 265 | with GzipFile(fileobj=StringIO(zf.read(fn))) as iz: 266 | try: 267 | for doc in section_1_parser(iz): 268 | if sources is None or doc['source'] in sources: 269 | yield doc 270 | count += 1 271 | 272 | if count != 0 and count % 1000 == 0: 273 | logging.info("Read %d files ..." % count) 274 | except Exception: 275 | logging.error("Parse failure while reading %s ..." % fn) 276 | 277 | if not sections or 2 in sections: 278 | for fn in CORPUS_SECTIONS['2']['paths']: 279 | if sources and not os.path.basename(fn).split('.')[0] in sources: 280 | continue 281 | 282 | logging.info('Reading %s ...' % fn) 283 | 284 | # TarFile with compression doesn't work inside a ZipFile 285 | # Uncompress to a StringIO object and hand that to TarFile 286 | with GzipFile(fileobj=StringIO(zf.read(fn))) as iz: 287 | with TarFile(fileobj=iz) as tf: 288 | for member in tf.getmembers(): 289 | 290 | if member.isfile(): 291 | tif = tf.extractfile(member) 292 | 293 | try: 294 | for doc in section_2_parser(tif): 295 | yield doc 296 | count += 1 297 | 298 | if count != 0 and count % 1000 == 0: 299 | logging.info("Read %d files ..." % count) 300 | except Exception: 301 | logging.error("Parse failure while reading %s ..." % fn) 302 | 303 | tif.close() 304 | 305 | if not sections or 3 in sections: 306 | for fn in CORPUS_SECTIONS['3']['paths']: 307 | if sources and not os.path.basename(fn).split('.')[0] in sources: 308 | continue 309 | 310 | logging.info('Reading %s ...' % fn) 311 | 312 | # Same tar.gz inside zip problem as with section 2 313 | with GzipFile(fileobj=StringIO(zf.read(fn))) as iz: 314 | with TarFile(fileobj=iz) as tf: 315 | for member in tf.getmembers(): 316 | if member.isfile(): 317 | tif = tf.extractfile(member) 318 | 319 | if os.path.splitext(member.name)[1] != '.xml': 320 | continue 321 | 322 | try: 323 | doc = section_3_parser(tif) 324 | 325 | yield doc 326 | count += 1 327 | 328 | except Exception: 329 | logging.error("Unable to parse file %s ..." % member.name) 330 | 331 | if count != 0 and count % 1000 == 0: 332 | logging.info("Read %d files ..." % count) 333 | 334 | tif.close() 335 | 336 | 337 | def normalize(doc): 338 | """ 339 | Normalizes content from the Aviskorpus to a single string with the article text in a separate 'text' field. 340 | 341 | :param doc: Parsed document from the corpus. 342 | :type doc: dict 343 | :rtype : dict 344 | :return: The passed document dict with normalized 'text' field added. 345 | :raise ValueError: If the section_id field is malformed 346 | """ 347 | section_id = doc['corpus_section'] 348 | 349 | if section_id == 1: 350 | doc['text'] = ' '.join(doc['tokens']) 351 | elif section_id == 2: 352 | doc['text'] = '\n'.join(doc['sentences']) 353 | elif section_id == 3: 354 | doc['raw'] = doc['text'] 355 | doc['text'] = '\n\n'.join(doc['text']) 356 | 357 | if ('metadata' in doc) and ('source' in doc['metadata']): 358 | doc['source'] = doc['metadata']['source'] 359 | else: 360 | raise ValueError('Unknown section id %d in document ...' 
% section_id) 361 | 362 | return doc 363 | 364 | 365 | class AviskorpusDataset(Dataset): 366 | """ 367 | Class encapsulating Norsk Aviskorpus, a large corpus of Norwegian newspaper articles. 368 | 369 | See http://www.nb.no/sprakbanken/show?serial=sbr-4&lang=nb for details. 370 | """ 371 | def __init__(self, index='aviskorpus', doc_type='article', dataset_path=None, 372 | sections=None, sources=None, dataset_fn=None): 373 | super(AviskorpusDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path, 374 | dataset_fn=dataset_fn) 375 | 376 | self.archive_fn = AVISKORPUS_ARCHIVE_URL 377 | 378 | self.sections = sections 379 | self.sources = sources 380 | self.normalize_func = normalize 381 | 382 | def _iterator(self): 383 | return iterator(self.dataset_fn, sections=self.sections, sources=self.sources) 384 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_term_weight_provider.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | from unittest import TestCase 3 | 4 | from elasticsearch.client import Elasticsearch, IndicesClient 5 | from gensim.corpora.dictionary import Dictionary 6 | 7 | from es_text_analytics.term_weight_provider import SimpleTermWeightProvider, ESTermAggregationWeightProvider, \ 8 | weight_map_from_term_counts, term_counts_line_parser, term_counts_iter_from_file, GensimIDFProvider, \ 9 | ESTermIndexWeightingProvider 10 | from es_text_analytics.test import es_runner 11 | 12 | 13 | class TestTermWeightProviderHelpers(TestCase): 14 | def test_weight_map_from_term_counts(self): 15 | wm = sorted(weight_map_from_term_counts([('foo', 2), ('ba', 1), ('knark', 4), ('knirk', 1)]).items()) 16 | self.assertEqual(wm, [('ba', 0.125), ('foo', 0.25), ('knark', 0.5), ('knirk', 0.125)]) 17 | 18 | wm = sorted(weight_map_from_term_counts([('foo', 2), ('ba', 1), ('knark', 4), ('knirk', 1)], min_count=4).items()) 19 | self.assertEqual(wm, [('knark', .5)]) 20 | 21 | def test_term_counts_line_parser(self): 22 | self.assertEqual(('absolutely', 342), term_counts_line_parser('5949\tabsolutely\t342\n')) 23 | self.assertEqual(('finished', 136), term_counts_line_parser('497\tfinished\t136')) 24 | 25 | def test_term_counts_iter_from_file(self): 26 | f = StringIO('5949\tabsolutely\t342\n497\tfinished\t136') 27 | 28 | self.assertEqual([('absolutely', 342), ('finished', 136)], list(term_counts_iter_from_file(f))) 29 | 30 | 31 | class TestSimpleTermWeightProvider(TestCase): 32 | def test_getitem_single(self): 33 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 34 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 35 | term, w = provider['ba'] 36 | self.assertEqual('ba', term) 37 | self.assertAlmostEqual(.5, w) 38 | term, w = provider['knark'] 39 | self.assertEqual('knark', term) 40 | self.assertAlmostEqual(.25, w) 41 | term, w = provider['knirk'] 42 | self.assertEqual('knirk', term) 43 | self.assertAlmostEqual(.125, w) 44 | term, w = provider['foo'] 45 | self.assertEqual('foo', term) 46 | self.assertAlmostEqual(.125, w) 47 | 48 | def test_inverse(self): 49 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 50 | ('knirk', 1), ('ba', 1), ('knark', 1)], 51 | inverse=True) 52 | term, w = provider['ba'] 53 | self.assertEqual('ba', term) 54 | self.assertAlmostEqual(2., w) 55 | term, w = provider['knark'] 56 | self.assertEqual('knark', term) 57 | self.assertAlmostEqual(4., w) 58 | term, w = 
provider['knirk'] 59 | self.assertEqual('knirk', term) 60 | self.assertAlmostEqual(8., w) 61 | term, w = provider['foo'] 62 | self.assertEqual('foo', term) 63 | self.assertAlmostEqual(8., w) 64 | 65 | def test_sublinear(self): 66 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 67 | ('knirk', 1), ('ba', 1), ('knark', 1)], 68 | sublinear=True) 69 | term, w = provider['ba'] 70 | self.assertEqual('ba', term) 71 | self.assertAlmostEqual(-0.693147, w, places=4) 72 | term, w = provider['knark'] 73 | self.assertEqual('knark', term) 74 | self.assertAlmostEqual(-1.386294, w, places=4) 75 | term, w = provider['knirk'] 76 | self.assertEqual('knirk', term) 77 | self.assertAlmostEqual(-2.079442, w, places=4) 78 | term, w = provider['foo'] 79 | self.assertEqual('foo', term) 80 | self.assertAlmostEqual(-2.079442, w, places=4) 81 | 82 | def test_inverse_sublinear(self): 83 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 84 | ('knirk', 1), ('ba', 1), ('knark', 1)], 85 | sublinear=True, inverse=True) 86 | term, w = provider['ba'] 87 | self.assertEqual('ba', term) 88 | self.assertAlmostEqual(0.693147, w, places=4) 89 | term, w = provider['knark'] 90 | self.assertEqual('knark', term) 91 | self.assertAlmostEqual(1.386294, w, places=4) 92 | term, w = provider['knirk'] 93 | self.assertEqual('knirk', term) 94 | self.assertAlmostEqual(2.079442, w, places=4) 95 | term, w = provider['foo'] 96 | self.assertEqual('foo', term) 97 | self.assertAlmostEqual(2.079442, w, places=4) 98 | 99 | def test_getitem_multiple(self): 100 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 101 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 102 | 103 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 104 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 105 | self.assertAlmostEqual(weights['ba'], .5) 106 | self.assertAlmostEqual(weights['knark'], .25) 107 | self.assertAlmostEqual(weights['knirk'], .125) 108 | self.assertAlmostEqual(weights['foo'], .125) 109 | 110 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 111 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 112 | 113 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 114 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 115 | self.assertAlmostEqual(weights['ba'], .5) 116 | self.assertAlmostEqual(weights['knark'], .25) 117 | self.assertAlmostEqual(weights['knirk'], .125) 118 | self.assertAlmostEqual(weights['foo'], .125) 119 | 120 | def test_getitem_missing(self): 121 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 122 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 123 | 124 | self.assertRaises(KeyError, lambda: provider['notfound']) 125 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 126 | 127 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 128 | ('knirk', 1), ('ba', 1), ('knark', 1)], missing='ignore') 129 | self.assertEqual([('ba', .5)], list(provider['ba', 'notfound'])) 130 | self.assertIsNone(provider['notfound']) 131 | 132 | 133 | class TestESTermAggregationWeightProvider(TestCase): 134 | 135 | def setUp(self): 136 | super(TestESTermAggregationWeightProvider, self).setUp() 137 | 138 | self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port]) 139 | self.ic = IndicesClient(self.es) 140 | self.index = 'es_term_weight_provider_test' 141 | self.doc_type = 'test-doc' 142 | 
self.field = 'text' 143 | 144 | if self.ic.exists(self.index): 145 | self.ic.delete(self.index) 146 | 147 | self.ic.create(self.index) 148 | self.es.create(self.index, self.doc_type, {self.field: 'foo'}) 149 | self.es.create(self.index, self.doc_type, {self.field: 'knark'}) 150 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}) 151 | self.es.create(self.index, self.doc_type, {self.field: 'knirk'}) 152 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}) 153 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}) 154 | self.es.create(self.index, self.doc_type, {self.field: 'knark '}) 155 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}, refresh=True) 156 | 157 | def tearDown(self): 158 | super(TestESTermAggregationWeightProvider, self).tearDown() 159 | 160 | self.ic.delete(self.index) 161 | 162 | 163 | def test_getitem_single(self): 164 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 165 | inverse=False, sublinear=False) 166 | 167 | term, w = provider['ba'] 168 | self.assertEqual('ba', term) 169 | self.assertAlmostEqual(.5, w) 170 | term, w = provider['knark'] 171 | self.assertEqual('knark', term) 172 | self.assertAlmostEqual(.25, w) 173 | term, w = provider['knirk'] 174 | self.assertEqual('knirk', term) 175 | self.assertAlmostEqual(.125, w) 176 | term, w = provider['foo'] 177 | self.assertEqual('foo', term) 178 | self.assertAlmostEqual(.125, w) 179 | 180 | def test_inverse(self): 181 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 182 | inverse=True, sublinear=False) 183 | term, w = provider['ba'] 184 | self.assertEqual('ba', term) 185 | self.assertAlmostEqual(2., w) 186 | term, w = provider['knark'] 187 | self.assertEqual('knark', term) 188 | self.assertAlmostEqual(4., w) 189 | term, w = provider['knirk'] 190 | self.assertEqual('knirk', term) 191 | self.assertAlmostEqual(8., w) 192 | term, w = provider['foo'] 193 | self.assertEqual('foo', term) 194 | self.assertAlmostEqual(8., w) 195 | 196 | def test_sublinear(self): 197 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 198 | inverse=False, sublinear=True) 199 | term, w = provider['ba'] 200 | self.assertEqual('ba', term) 201 | self.assertAlmostEqual(-0.693147, w, places=4) 202 | term, w = provider['knark'] 203 | self.assertEqual('knark', term) 204 | self.assertAlmostEqual(-1.386294, w, places=4) 205 | term, w = provider['knirk'] 206 | self.assertEqual('knirk', term) 207 | self.assertAlmostEqual(-2.079442, w, places=4) 208 | term, w = provider['foo'] 209 | self.assertEqual('foo', term) 210 | self.assertAlmostEqual(-2.079442, w, places=4) 211 | 212 | def test_inverse_sublinear(self): 213 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 214 | inverse=True, sublinear=True) 215 | term, w = provider['ba'] 216 | self.assertEqual('ba', term) 217 | self.assertAlmostEqual(0.693147, w, places=4) 218 | term, w = provider['knark'] 219 | self.assertEqual('knark', term) 220 | self.assertAlmostEqual(1.386294, w, places=4) 221 | term, w = provider['knirk'] 222 | self.assertEqual('knirk', term) 223 | self.assertAlmostEqual(2.079442, w, places=4) 224 | term, w = provider['foo'] 225 | self.assertEqual('foo', term) 226 | self.assertAlmostEqual(2.079442, w, places=4) 227 | 228 | def test_getitem_multiple(self): 229 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 230 | inverse=False, 
sublinear=False) 231 | 232 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 233 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 234 | self.assertAlmostEqual(weights['ba'], .5) 235 | self.assertAlmostEqual(weights['knark'], .25) 236 | self.assertAlmostEqual(weights['knirk'], .125) 237 | self.assertAlmostEqual(weights['foo'], .125) 238 | 239 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 240 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 241 | self.assertAlmostEqual(weights['ba'], .5) 242 | self.assertAlmostEqual(weights['knark'], .25) 243 | self.assertAlmostEqual(weights['knirk'], .125) 244 | self.assertAlmostEqual(weights['foo'], .125) 245 | 246 | def test_getitem_missing(self): 247 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 248 | inverse=False, sublinear=False) 249 | 250 | self.assertRaises(KeyError, lambda: provider['notfound']) 251 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 252 | 253 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 254 | inverse=False, sublinear=False, missing='ignore') 255 | 256 | self.assertIsNone(provider['notfound']) 257 | self.assertEqual([('ba', .5)], list(provider['ba', 'notfound'])) 258 | 259 | class TestGensimIDFProvider(TestCase): 260 | def setUp(self): 261 | super(TestGensimIDFProvider, self).setUp() 262 | 263 | self.dictionary = Dictionary([['foo'], ['knark'], ['ba'], ['knirk'], ['ba'], ['ba'], ['knark'], ['ba']]) 264 | 265 | def test_getitem_single(self): 266 | provider = GensimIDFProvider(self.dictionary) 267 | 268 | term, w = provider['ba'] 269 | self.assertEqual('ba', term) 270 | self.assertAlmostEqual(1, w) 271 | term, w = provider['knark'] 272 | self.assertEqual('knark', term) 273 | self.assertAlmostEqual(2, w) 274 | term, w = provider['knirk'] 275 | self.assertEqual('knirk', term) 276 | self.assertAlmostEqual(3, w) 277 | term, w = provider['foo'] 278 | self.assertEqual('foo', term) 279 | self.assertAlmostEqual(3, w) 280 | 281 | def test_getitem_multiple(self): 282 | provider = GensimIDFProvider(self.dictionary) 283 | 284 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 285 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 286 | self.assertAlmostEqual(weights['ba'], 1) 287 | self.assertAlmostEqual(weights['knark'], 2) 288 | self.assertAlmostEqual(weights['knirk'], 3) 289 | self.assertAlmostEqual(weights['foo'], 3) 290 | 291 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 292 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 293 | self.assertAlmostEqual(weights['ba'], 1) 294 | self.assertAlmostEqual(weights['knark'], 2) 295 | self.assertAlmostEqual(weights['knirk'], 3) 296 | self.assertAlmostEqual(weights['foo'], 3) 297 | 298 | def test_getitem_missing(self): 299 | provider = GensimIDFProvider(self.dictionary) 300 | 301 | self.assertRaises(KeyError, lambda: provider['notfound']) 302 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 303 | 304 | provider = GensimIDFProvider(self.dictionary, missing='ignore') 305 | 306 | self.assertIsNone(provider['notfound']) 307 | self.assertEqual([('ba', 1)], list(provider['ba', 'notfound'])) 308 | 309 | 310 | class TestESTermIndexWeightingProvider(TestCase): 311 | def setUp(self): 312 | super(TestESTermIndexWeightingProvider, self).setUp() 313 | 314 | self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port]) 315 | 
self.index = 'test_es_term_index_weighting_provider' 316 | ESTermIndexWeightingProvider._create_weight_index(self.es, self.index) 317 | 318 | def tearDown(self): 319 | super(TestESTermIndexWeightingProvider, self).tearDown() 320 | 321 | ic = IndicesClient(self.es) 322 | ic.delete(self.index) 323 | 324 | def test_getitem_single(self): 325 | provider = ESTermIndexWeightingProvider(self.es, self.index, 326 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)]) 327 | IndicesClient(self.es).refresh(self.index) 328 | 329 | term, w = provider['ba'] 330 | self.assertEqual('ba', term) 331 | self.assertAlmostEqual(1, w) 332 | term, w = provider['knark'] 333 | self.assertEqual('knark', term) 334 | self.assertAlmostEqual(2, w) 335 | term, w = provider['knirk'] 336 | self.assertEqual('knirk', term) 337 | self.assertAlmostEqual(3, w) 338 | term, w = provider['foo'] 339 | self.assertEqual('foo', term) 340 | self.assertAlmostEqual(3, w) 341 | 342 | def test_getitem_multiple(self): 343 | provider = ESTermIndexWeightingProvider(self.es, self.index, 344 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)]) 345 | IndicesClient(self.es).refresh(self.index) 346 | 347 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 348 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 349 | self.assertAlmostEqual(weights['ba'], 1) 350 | self.assertAlmostEqual(weights['knark'], 2) 351 | self.assertAlmostEqual(weights['knirk'], 3) 352 | self.assertAlmostEqual(weights['foo'], 3) 353 | 354 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 355 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 356 | self.assertAlmostEqual(weights['ba'], 1) 357 | self.assertAlmostEqual(weights['knark'], 2) 358 | self.assertAlmostEqual(weights['knirk'], 3) 359 | self.assertAlmostEqual(weights['foo'], 3) 360 | 361 | def test_getitem_missing(self): 362 | provider = ESTermIndexWeightingProvider(self.es, self.index, 363 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)]) 364 | IndicesClient(self.es).refresh(self.index) 365 | 366 | self.assertRaises(KeyError, lambda: provider['notfound']) 367 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 368 | 369 | provider = ESTermIndexWeightingProvider(self.es, self.index, 370 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)], 371 | missing='ignore') 372 | IndicesClient(self.es).refresh(self.index) 373 | 374 | self.assertIsNone(provider['notfound']) 375 | self.assertEqual([('ba', 1)], list(provider['ba', 'notfound'])) --------------------------------------------------------------------------------
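# Note on the GensimIDFProvider test expectations above (added annotation, not part of
# the original source): gensim's TfidfModel computes IDF as log2(N / df) by default, and
# the test dictionary holds 8 single-token documents, so with df('ba') = 4, df('knark') = 2
# and df('knirk') = df('foo') = 1 the expected weights are log2(8/4) = 1, log2(8/2) = 2
# and log2(8/1) = 3 respectively.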