├── python-client ├── __init__.py ├── sklext │ ├── __init__.py │ ├── test │ │ ├── __init__.py │ │ ├── test_mutual_information.py │ │ ├── test_term_estimators.py │ │ └── test_term_weight_transformer.py │ ├── cond_prob.py │ ├── mutual_information.py │ ├── term_weighting.py │ └── term_estimators.py ├── es_text_analytics │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── test │ │ │ ├── __init__.py │ │ │ ├── test_aviskorpus.py │ │ │ ├── test_newsgroups.py │ │ │ ├── test_ndt_dataset.py │ │ │ └── test_dataset.py │ │ ├── elasticsearch_dataset.py │ │ ├── newsgroups.py │ │ ├── wiki_infobox.py │ │ ├── ndt_dataset.py │ │ ├── dataset.py │ │ └── aviskorpus.py │ ├── test │ │ ├── __init__.py │ │ ├── test_no_tokenizer.py │ │ ├── test_single_doc_sigterms.py │ │ ├── test_np_extractor.py │ │ ├── test_decompounder.py │ │ ├── test_tagger.py │ │ └── test_term_weight_provider.py │ ├── tokenizer.py │ ├── single_doc_sigterms.py │ ├── np_extractor.py │ ├── kera.py │ ├── lemmatizer.py │ ├── wordnet_centrality.py │ ├── decompounder.py │ ├── tagger.py │ └── term_weight_provider.py ├── requirements.txt ├── bin │ ├── run_singledoc_sig_terms.py │ ├── build-all-models.bat │ ├── build-all-models.sh │ ├── build_pyLDAvis.py │ ├── NOB_kera.py │ ├── wordcounts_from_dataset.py │ ├── index_dataset.py │ ├── build_no_tagger.py │ ├── build_LDA_kera_from_wiki.py │ ├── corpus2lemmatizedtext.py │ └── build-wiki-topicmodel.py ├── setup.py └── run_models.sh ├── .env ├── .gitignore ├── provision ├── elasticsearch.yml └── neo4j-server.properties ├── .travis.yml ├── environment.yml ├── spark-jobs └── ng-wc.py ├── readme.md ├── Vagrantfile └── fabfile.py /python-client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | source activate cta-env 2 | -------------------------------------------------------------------------------- /python-client/sklext/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/sklext/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | models/ 3 | *.pyc 4 | .idea/ 5 | !python-client/es_text_analytics/data 6 | 7 | .vagrant/ 8 | notebooks/ 9 | -------------------------------------------------------------------------------- /provision/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | marvel.agent.enabled: false 2 | 
index.number_of_shards: 1 3 | index.number_of_replicas: 0 4 | http.cors.enabled: true 5 | -------------------------------------------------------------------------------- /python-client/requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch 2 | requests 3 | psutil 4 | textblob 5 | nltk 6 | gensim 7 | uniseg 8 | git+git://github.com/comperiosearch/python-elasticsearch-runner -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_aviskorpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/comperiosearch/comperio-text-analytics/master/python-client/es_text_analytics/data/test/test_aviskorpus.py -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | 5 | install: "pip install -r python-client/requirements.txt" 6 | script: nosetests 7 | virtualenv: 8 | system_site_packages: true 9 | before_install: 10 | - sudo apt-get install -qq python-numpy python-scipy -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: cta-env 2 | dependencies: 3 | - python 4 | - pip 5 | - ipython 6 | - ipython-notebook 7 | - matplotlib 8 | - scikit-learn 9 | - requests 10 | - gensim 11 | - nltk 12 | - nose 13 | - pip: 14 | - textblob 15 | - elasticsearch 16 | - psutil 17 | - py4j 18 | -------------------------------------------------------------------------------- /python-client/bin/run_singledoc_sig_terms.py: -------------------------------------------------------------------------------- 1 | __author__ = 'cvig' 2 | from es_text_analytics import single_doc_sigterms 3 | 4 | from elasticsearch import Elasticsearch 5 | es = Elasticsearch() 6 | 7 | sdt = single_doc_sigterms.SingleDocSigTerms(es, 'wiki', 'doc', 'article', None) 8 | print sdt.by_doc_id_idf(178472 , 20) -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/__init__.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_runner.runner import ElasticsearchRunner 2 | 3 | es_runner = ElasticsearchRunner() 4 | 5 | 6 | def setup(): 7 | es_runner.install() 8 | es_runner.run() 9 | es_runner.wait_for_green() 10 | 11 | 12 | def teardown(): 13 | if es_runner and es_runner.is_running(): 14 | es_runner.stop() 15 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_no_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from es_text_analytics.tokenizer import NOTokenizer 5 | 6 | 7 | class TestNOTokenizer(TestCase): 8 | def test_tokenize(self): 9 | tokenizer = NOTokenizer() 10 | self.assertEqual(['Dette', 'er', u'vårt', 'hus', '.'], 11 | tokenizer.tokenize(u'Dette er vårt hus.')) -------------------------------------------------------------------------------- /python-client/bin/build-all-models.bat: -------------------------------------------------------------------------------- 1 | :: Builds all the models for Norwegian NLP functionality and places them in the default locations 2 | 3 | 
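:: Note: %~dp0 expands to the directory of this script (python-client\bin), so the
:: line below puts the python-client package root on PYTHONPATH and the models are
:: written to ..\..\models, i.e. a models directory at the repository root.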
set PYTHONPATH=%PYTHONPATH%;%~dp0\.. 4 | 5 | mkdir %~dp0\..\..\models 6 | 7 | python %~dp0\build_no_tagger.py -m %~dp0\..\..\models\nob-tagger-default-model --features simple --language nob 8 | python %~dp0\build_no_tagger.py -m %~dp0\..\..\models\nno-tagger-default-model --features simple --language nno 9 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/tokenizer.py: -------------------------------------------------------------------------------- 1 | from textblob.base import BaseTokenizer 2 | from uniseg import wordbreak 3 | 4 | # TextBlob compatible tokenizer for Norwegian. 5 | # Simple implementation. Tokenizes according to Unicode Appendix 29 (UAX#29). 6 | 7 | 8 | class NOTokenizer(BaseTokenizer): 9 | def tokenize(self, text): 10 | return list(self.itokenize(text)) 11 | 12 | def itokenize(self, text, *args, **kwargs): 13 | return (token for token in wordbreak.words(text) if token != ' ') 14 | -------------------------------------------------------------------------------- /python-client/bin/build-all-models.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Builds all the models for Norwegian NLP functionality and places them in the default locations 4 | 5 | SELF_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 6 | 7 | mkdir -p ${SELF_DIR}\..\..\models 8 | 9 | python ${SELF_DIR}\build_no_tagger.py -m ${SELF_DIR}\..\..\models\nob-tagger-default-model --features simple --language nob 10 | python ${SELF_DIR}\build_no_tagger.py -m ${SELF_DIR}\..\..\models\nno-tagger-default-model --features simple --language nno -------------------------------------------------------------------------------- /python-client/sklext/cond_prob.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from numpy import array 3 | 4 | from sklext.term_estimators import joint_estimator_point, marginal_estimator 5 | 6 | 7 | def conditional_probabilities(X, y, ratio=False): 8 | p_t_c = joint_estimator_point(X, y, smoothing=True) 9 | p_t = marginal_estimator(X, smoothing=True) 10 | 11 | p_t.shape = 2,1 12 | 13 | m = p_t_c / p_t 14 | 15 | if ratio: 16 | p_c = marginal_estimator(y, smoothing=True) 17 | 18 | m = m / p_c 19 | 20 | return array(numpy.max(m, axis=1)).flatten() -------------------------------------------------------------------------------- /python-client/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='es_text_analytics', 5 | version='0.1', 6 | packages=['es_text_analytics', 'es_text_analytics.test'], 7 | url='https://bitbucket.org/comperio/comperio-text-analytics', 8 | license='For internal use only.', 9 | author='Andre Lynum', 10 | author_email='andre.lynum@comperiosearch.com', 11 | description='es text analytics.', 12 | install_requires=['elasticsearch', 'requests', 'psutil', 'textblob', 'nltk', 'gensim', 'uniseg'], 13 | dependency_links=['git+ssh://git@github.com/comperiosearch/python-elasticsearch-runner'] 14 | 15 | ) 16 | -------------------------------------------------------------------------------- /python-client/run_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wikifile=/data/no/nowiki-latest-pages-articles.xml.bz2 3 | datadir=/data/no/ 4 | vocab=/data/no/nowiki_lsi_10.vocab 5 | for n_topics in 100 200 400 1000 2000; do 6 | python 
bin/build-wiki-topicmodel.py --model-id nowiki --model-type lsi -d $wikifile --data-dir $datadir --vocab $vocab --n-topics $n_topics 7 | done 8 | exit 9 | for n_topics in 50 100 250 500 1000; do 10 | python bin/build-wiki-topicmodel.py --model-id nowiki --model-type lda -d $wikifile --data-dir $datadir --vocab $vocab --n-topics $n_topics 11 | done 12 | 13 | 14 | for window in 50 100 250; do 15 | for size in 500 1000, do 16 | python bin/build-wiki-topicmodel.py --model-id nowiki --model-type word2vec --w2v-window $window --w2v-size $size -d $wikifile --data-dir $datadir --vocab $vocab 17 | done 18 | done 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_newsgroups.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from es_text_analytics.data.newsgroups import parse 4 | 5 | 6 | class TestNewsgroups(TestCase): 7 | def test_parse(self): 8 | self.assertEqual(parse('From: ba\nSubject: foo\nThe\nmessage.\n--\nSig\n'), 9 | {'raw': 'From: ba\nSubject: foo\nThe\nmessage.\n--\nSig\n', 10 | 'msg': 'The\nmessage.', 11 | 'from': 'ba', 12 | 'subject': 'foo', 13 | 'sig': 'Sig\n'}) 14 | self.assertEqual(parse('Subject: foo\nFrom: ba\nThe\nmessage.\n--\nSig\n'), 15 | {'raw': 'Subject: foo\nFrom: ba\nThe\nmessage.\n--\nSig\n', 16 | 'msg': 'The\nmessage.', 17 | 'from': 'ba', 18 | 'subject': 'foo', 19 | 'sig': 'Sig\n'}) 20 | -------------------------------------------------------------------------------- /spark-jobs/ng-wc.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from argparse import ArgumentParser 3 | 4 | from pyspark import SparkContext 5 | 6 | # Basic sample Spark job for testing 7 | # To run pass the 20 Newsgroups JSON formatted corpus file as the -f argument, set 8 | # the SPARK_HOME environment variable and run with spark-submit or pyspark.as 9 | 10 | # To run and edit within PyCharm add SPARK_HOME/python and SPARK_HOME/python/lib/py4j-x.x.x.x-src.zip 11 | # to the interpreter paths in addition to setting SPARK_HOME in the run configuration. 12 | 13 | def main(): 14 | parser = ArgumentParser() 15 | parser.add_argument('-f', '--filename') 16 | opts = parser.parse_args() 17 | 18 | fn = opts.filename 19 | 20 | if not fn: 21 | sys.exit(1) 22 | 23 | sc = SparkContext(appName='ng-wc') 24 | 25 | rdd = sc.textFile(fn) 26 | 27 | n = rdd.count() 28 | 29 | print 'The 20 Newsgroups corpus has %d articles.' 
% n 30 | 31 | 32 | if __name__ == '__main__': 33 | main() -------------------------------------------------------------------------------- /python-client/bin/build_pyLDAvis.py: -------------------------------------------------------------------------------- 1 | from gensim.corpora import Dictionary 2 | from gensim.models.ldamodel import LdaModel 3 | import gensim 4 | from gensim import corpora 5 | import pyLDAvis.gensim 6 | 7 | 8 | def main(): 9 | file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250' 10 | mod = LdaModel.load(file) 11 | dict = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab' 12 | vocab = Dictionary.load(dict) 13 | corpfile = 'f:/projects/comperio-text-analytics/models/topicmodel/mojo_lda_100.corp' 14 | corpus = gensim.corpora.MmCorpus(corpfile) 15 | 16 | print mod.show_topic(0) 17 | print mod.id2word 18 | mod.id2word = vocab 19 | 20 | print mod.show_topic(0) 21 | 22 | pydavis = pyLDAvis.gensim.prepare(mod, corpus, vocab) 23 | pyLDAvis.save_html(pydavis, 'pydavis_250_v2_3passes.html') 24 | pyLDAvis.show(pydavis) 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /python-client/bin/NOB_kera.py: -------------------------------------------------------------------------------- 1 | __author__ = 'cvig' 2 | from es_text_analytics.tagger import NOBTagger, install_hunpos 3 | from es_text_analytics.np_extractor import NONPExtractor 4 | from es_text_analytics.kera import extract_keywords 5 | from nltk.tokenize import sent_tokenize 6 | import re 7 | import unicodedata 8 | 9 | def fast_tokenize(str): 10 | return [token.lower() for token in re.findall('[^\W\d_]+', re.sub(ur'[\00a0\n-]', ' ', str), re.MULTILINE|re.UNICODE)] 11 | 12 | def unicode_tokenize(str): 13 | normalized = unicodedata.normalize('NFKC', str) 14 | return normalized.encode('utf-8').lower().split() 15 | #return [token.lower() for token in re.findall('[^\W\d_]+', re.sub('[\n-]', ' ', normalized), re.MULTILINE|re.UNICODE)] 16 | 17 | class NOB_kera(): 18 | def __init__(self): 19 | self.tagger = NOBTagger() 20 | self.chunker = NONPExtractor(tagger=self.tagger, keep_index=True) 21 | 22 | def extract_keywords(self, from_text): 23 | return extract_keywords(from_text, fast_tokenize, sent_tokenize, self.tagger, self.chunker) 24 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_ndt_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from es_text_analytics.data.ndt_dataset import filelist, normalize 5 | 6 | 7 | class TestNDTDatasetHelpers(TestCase): 8 | def test_filelist(self): 9 | files = filelist() 10 | 11 | self.assertEqual(2, len(files)) 12 | self.assertTrue('ndt_1-0_nno.conll' in files) 13 | self.assertTrue('ndt_1-0_nob.conll' in files) 14 | 15 | files = filelist(lang='nob', sections=['parliament']) 16 | 17 | self.assertEqual(1, len(files)) 18 | self.assertTrue('parliament_ndt_1-0_nob.conll' in files) 19 | 20 | def test_normalize(self): 21 | doc = [[1, 'Eg', 'eg', 'pron'], 22 | [2, 'var', 'vere', 'verb'], 23 | [3, u'på', u'på', 'prep'], 24 | [4, 'bibeltime', 'bibeltime', 'subst'], 25 | [5, '.', '$.', 'clb']] 26 | 27 | result = normalize(doc) 28 | self.assertEqual(1, len(result)) 29 | self.assertTrue('content' in result) 30 | self.assertTrue(u'Eg var på bibeltime .' 
in result.values()) 31 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/elasticsearch_dataset.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import tarfile 4 | 5 | from es_text_analytics.data.dataset import Dataset 6 | from elasticsearch.client import Elasticsearch 7 | from elasticsearch.helpers import scan 8 | 9 | """ 10 | Elasticsearch as data source 11 | 12 | """ 13 | 14 | 15 | class ElasticsearchDataset(Dataset): 16 | """ 17 | Class encapsulating using Elasticsearch as datasource. Uses scan/scroll API via the es-py helpers scan. 18 | """ 19 | 20 | def __init__(self, read_index, read_doc_type, index='new_index', doc_type='doc', query=None, dataset_path=None, normalize_func=None): 21 | super(ElasticsearchDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path, normalize_func=normalize_func) 22 | self.dataset_fn = 'elastics' 23 | self.read_index = read_index 24 | self.read_doc_type = read_doc_type 25 | self.query = query 26 | 27 | def _iterator(self): 28 | es = Elasticsearch(timeout=60) 29 | return scan(es, scroll=u'10m', query=self.query, 30 | index=self.read_index, doc_type=self.read_doc_type) 31 | -------------------------------------------------------------------------------- /python-client/sklext/test/test_mutual_information.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from numpy import array 4 | from numpy.ma.testutils import assert_array_approx_equal 5 | from scipy.sparse.csr import csr_matrix 6 | 7 | from sklext.mutual_information import mutual_information, pointwise_mutual_information 8 | 9 | 10 | class TestMutualInformation(TestCase): 11 | def test_mutual_information(self): 12 | X = array([[0, 1], 13 | [1, 0], 14 | [1, 1]]) 15 | y = array([[0, 1], 16 | [1, 0], 17 | [1, 0]]) 18 | 19 | assert_array_approx_equal(mutual_information(X, y), [-0.37489, -0.605939], decimal=3) 20 | assert_array_approx_equal(mutual_information(csr_matrix(X), csr_matrix(y)), [-0.37489, -0.605939], decimal=3) 21 | 22 | def test_pointwise_mutual_information(self): 23 | X = array([[0, 1], 24 | [1, 0], 25 | [1, 1]]) 26 | y = array([[0, 1], 27 | [1, 0], 28 | [1, 0]]) 29 | 30 | assert_array_approx_equal(pointwise_mutual_information(X, y), [0.1178, 0.1178], decimal=3) 31 | assert_array_approx_equal(pointwise_mutual_information(csr_matrix(X), csr_matrix(y)), 32 | [0.1178, 0.1178], decimal=3) 33 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/test/test_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from StringIO import StringIO 3 | from unittest import TestCase 4 | 5 | from es_text_analytics.data.dataset import fn_from_url, parse_conll 6 | 7 | NDT_CONLL_SAMPLE = """ 8 | 1 Nokre nokon det 9 | 2 refleksjonar refleksjon subst 10 | 3 | $| clb 11 | 12 | 1 Eg eg pron 13 | 2 var vere verb 14 | 3 på på prep 15 | 4 bibeltime bibeltime subst 16 | 5 . $. 
clb 17 | 18 | """ 19 | 20 | 21 | class TestDataset(TestCase): 22 | def test_fn_from_url(self): 23 | self.assertEqual(fn_from_url('http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz'), '20news-18828.tar.gz') 24 | 25 | def test_parse_conll(self): 26 | result = list(parse_conll(StringIO(NDT_CONLL_SAMPLE))) 27 | 28 | self.assertEqual(2, len(result)) 29 | self.assertEqual([[1, 'Nokre', 'nokon', 'det'], 30 | [2, 'refleksjonar', 'refleksjon', 'subst'], 31 | [3, '|', '$|', 'clb']], 32 | result[0]) 33 | self.assertEqual([[1, 'Eg', 'eg', 'pron'], 34 | [2, 'var', 'vere', 'verb'], 35 | [3, u'på', u'på', 'prep'], 36 | [4, 'bibeltime', 'bibeltime', 'subst'], 37 | [5, '.', '$.', 'clb']], 38 | result[1]) 39 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_single_doc_sigterms.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from elasticsearch import Elasticsearch 4 | from elasticsearch.client import IndicesClient 5 | 6 | from es_text_analytics.single_doc_sigterms import SingleDocSigTerms 7 | from es_text_analytics.test import es_runner 8 | 9 | 10 | class TestSingleDocSigTerms(TestCase): 11 | def setUp(self): 12 | super(TestSingleDocSigTerms, self).setUp() 13 | 14 | self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port]) 15 | self.ic = IndicesClient(self.es) 16 | self.index = 'single_doc_sigterms_test' 17 | self.doc_type = 'test-doc' 18 | self.field = 'text' 19 | 20 | if self.ic.exists(self.index): 21 | self.ic.delete(self.index) 22 | 23 | self.ic.create(self.index) 24 | self.es.create(self.index, self.doc_type, {self.field: 'foo ba knark foo knirk knark foo'}, id='doc_1') 25 | 26 | def test_tf_for_doc_id(self): 27 | sigterms = SingleDocSigTerms(self.es, self.index, self.doc_type, self.field, None) 28 | 29 | resp = dict(sigterms.tf_for_doc_id('doc_1')) 30 | self.assertEquals(4, len(resp)) 31 | self.assertEquals(3, resp['foo']) 32 | self.assertEquals(2, resp['knark']) 33 | self.assertEquals(1, resp['ba']) 34 | self.assertEquals(1, resp['knirk']) 35 | -------------------------------------------------------------------------------- /python-client/sklext/mutual_information.py: -------------------------------------------------------------------------------- 1 | from math import log, e 2 | 3 | import numpy 4 | from numpy import array, zeros 5 | 6 | from sklext.term_estimators import marginal_estimator, joint_estimator_point, joint_estimator_full 7 | 8 | 9 | def mutual_information(X, y): 10 | num_terms = X.shape[1] 11 | num_classes = y.shape[1] 12 | 13 | p_c = marginal_estimator(y, smoothing=True) 14 | p_t = marginal_estimator(X, smoothing=True) 15 | 16 | p_t_c = joint_estimator_full(X, y, smoothing=True) 17 | 18 | ig = zeros((num_terms)) 19 | 20 | for i in xrange(num_terms): 21 | for j in xrange(num_classes): 22 | ig[i] += p_t_c[0][i, j] * log(p_t_c[0][i, j] / (p_t[i] * p_c[j])) 23 | ig[i] += p_t_c[1][i, j] * log(p_t_c[1][i, j] / (p_t[i] * (1 - p_c[j]))) 24 | ig[i] += p_t_c[2][i, j] * log(p_t_c[2][i, j] / ((1 - p_t[i]) * p_c[j])) 25 | ig[i] += p_t_c[3][i, j] * log(p_t_c[3][i, j] / ((1 - p_t[i]) * (1 - p_c[j]))) 26 | 27 | return ig 28 | 29 | 30 | def pointwise_mutual_information(X, y, normalize=False, k_weight=None, positive=None): 31 | p_c = marginal_estimator(y, smoothing=True) 32 | p_t = marginal_estimator(X, smoothing=True) 33 | 34 | p_t.shape = 2, 1 35 | p_c.shape = 1, 2 36 | 37 | p_t_c = joint_estimator_point(X, y, smoothing=True) 
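    # p_t_c is the smoothed joint estimate P(t, c). Below it is optionally raised to
    # the power k (the PMI^k variant), divided by the outer product p_t * p_c to give
    # PMI, and then optionally normalized by -log P(t, c) (NPMI) or restricted to
    # positive associations via the 'cutoff' (PPMI) and 'exp' options.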
38 | 39 | if k_weight: 40 | p_t_c = p_t_c**k_weight 41 | 42 | m = numpy.log(array(p_t_c) / (p_t * p_c)) 43 | 44 | if normalize: 45 | m = m / -numpy.log(p_t_c) 46 | 47 | if positive is 'cutoff': 48 | m[m < .0] = .0 49 | 50 | if positive is 'exp': 51 | m = e**m 52 | 53 | return array(numpy.max(m, axis=1)).flatten() 54 | -------------------------------------------------------------------------------- /python-client/sklext/term_weighting.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import spdiags 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | 4 | from sklext.cond_prob import conditional_probabilities 5 | from sklext.mutual_information import mutual_information, pointwise_mutual_information 6 | 7 | 8 | class TermWeightTransformer(BaseEstimator, TransformerMixin): 9 | def __init__(self, method='mi', pmi_k=2): 10 | self.method = method 11 | self.pmi_k = pmi_k 12 | 13 | self._weights = None 14 | 15 | def fit(self, X, y): 16 | if self.method is 'mi': 17 | self._weights = mutual_information(X, y) 18 | elif self.method is 'pmi': 19 | self._weights = pointwise_mutual_information(X, y, normalize=False) 20 | elif self.method is 'npmi': 21 | self._weights = pointwise_mutual_information(X, y, normalize=True) 22 | elif self.method is 'ppmi_exp': 23 | self._weights = pointwise_mutual_information(X, y, normalize=True, positive='exp') 24 | elif self.method is 'pmi_k': 25 | self._weights = pointwise_mutual_information(X, y, normalize=True, k_weight=self.pmi_k) 26 | elif self.method is 'ppmi': 27 | self._weights = pointwise_mutual_information(X, y, normalize=False, positive='cutoff') 28 | elif self.method is 'cp_raw': 29 | self._weights = conditional_probabilities(X, y, ratio=False) 30 | elif self.method is 'cp_ratio': 31 | self._weights = conditional_probabilities(X, y, ratio=True) 32 | else: 33 | raise ValueError 34 | 35 | return self 36 | 37 | def transform(self, X, y=None): 38 | p = len(self._weights) 39 | w_diag = spdiags(self._weights, 0, p, p) 40 | 41 | return X * w_diag 42 | -------------------------------------------------------------------------------- /python-client/sklext/test/test_term_estimators.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from numpy import array 4 | from numpy.ma.testutils import assert_array_approx_equal 5 | from scipy.sparse import csr_matrix 6 | 7 | from sklext.term_estimators import joint_estimator_point, joint_estimator_full 8 | 9 | 10 | class TestTermEstimators(TestCase): 11 | def test_joint_estmator_point(self): 12 | X = array([[0, 1], 13 | [1, 0], 14 | [1, 1]]) 15 | y = array([[0, 1], 16 | [1, 0], 17 | [1, 0]]) 18 | 19 | assert_array_approx_equal(joint_estimator_point(X, y), [[.5, 0], [.25, .25]]) 20 | assert_array_approx_equal(joint_estimator_point(csr_matrix(X), csr_matrix(y)), [[.5, 0], [.25, .25]]) 21 | 22 | def test_joint_estimator_full(self): 23 | X = array([[0, 1], 24 | [1, 0], 25 | [1, 1]]) 26 | y = array([[0, 1], 27 | [1, 0], 28 | [1, 0]]) 29 | 30 | assert_array_approx_equal(joint_estimator_full(X, y), 31 | [[[.1667, .0], [.0833, .0833]], 32 | [[.0 , .1667], [.0833, .0833]], 33 | [[.0 , .0833], [.0833, .0]], 34 | [[.0833, .0], [.0, .0833]]], 35 | decimal=3) 36 | assert_array_approx_equal(joint_estimator_full(csr_matrix(X), csr_matrix(y)), 37 | [[[.1667, .0], [.0833, .0833]], 38 | [[.0 , .1667], [.0833, .0833]], 39 | [[.0 , .0833], [.0833, .0]], 40 | [[.0833, .0], [.0, .0833]]], 41 | decimal=3) 
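The estimators and the TermWeightTransformer above can be combined as in the following minimal sketch (illustrative only, not part of the repository); it reuses the toy document-term and class-indicator matrices from the tests with the tested 'mi' weighting scheme:

```python
from numpy import array

from sklext.term_weighting import TermWeightTransformer

# Binary document-term matrix (3 documents x 2 terms) and class-indicator
# matrix (3 documents x 2 classes), mirroring the test fixtures above.
X = array([[0, 1],
           [1, 0],
           [1, 1]])
y = array([[0, 1],
           [1, 0],
           [1, 0]])

# Fit mutual-information weights on (X, y), then rescale the term columns of X.
transformer = TermWeightTransformer(method='mi')
X_weighted = transformer.fit(X, y).transform(X)
```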
-------------------------------------------------------------------------------- /python-client/es_text_analytics/data/newsgroups.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sklearn import datasets 4 | from sklearn.datasets import twenty_newsgroups 5 | 6 | from es_text_analytics.data.dataset import Dataset 7 | 8 | """ 9 | The 20 Newsgroups dataset is a standardized dataset with Newsgroup messages. 10 | 11 | http://qwone.com/~jason/20Newsgroups/ 12 | """ 13 | 14 | NEWSGROUPS_ARCHIVE_URL = 'http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz' 15 | 16 | 17 | def iterator(dataset_fn): 18 | """ 19 | Provides an iterator of parsed documents from the 20 Newsgroups dataset. 20 | 21 | :param dataset_fn: Path to Newsgroups dataset archive file. 22 | :type dataset_fn: unicode|str 23 | :rtype : generator 24 | """ 25 | ng = datasets.fetch_20newsgroups() 26 | 27 | for article, group, target, filename in zip(ng['data'], [ng['target_names'][x] for x in ng['target']], 28 | ng['target'], ng['filenames']): 29 | article = twenty_newsgroups.strip_newsgroup_header(article) 30 | article = twenty_newsgroups.strip_newsgroup_footer(article) 31 | article = twenty_newsgroups.strip_newsgroup_quoting(article) 32 | doc_id = os.path.basename(filename) 33 | 34 | yield {'doc_id': doc_id, 'article': article, 'group': group, 'target': target, 'filename': filename} 35 | 36 | 37 | class NewsgroupsDataset(Dataset): 38 | """ 39 | Class encapsulating the Newsgroups dataset and the information needed to retrieve and index it. 40 | 41 | Currently only downloads and index the dataset in Elasticsearch. 42 | """ 43 | 44 | def __init__(self, index='newsgroups', doc_type='message', dataset_path=None): 45 | super(NewsgroupsDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path) 46 | 47 | 48 | def _iterator(self): 49 | return iterator(self.dataset_fn) 50 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_np_extractor.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from es_text_analytics.np_extractor import NONPExtractor 5 | 6 | 7 | class TestNONPExtractor(TestCase): 8 | def test_tag(self): 9 | extractor = NONPExtractor() 10 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 11 | (u'er', 'VERB'), 12 | (u'vårt', 'DET'), 13 | (u'hus', 'SUBST'), 14 | (u'.', 'PUNKT')]), 15 | [u'hus']) 16 | 17 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 18 | (u'er', 'VERB'), 19 | (u'vårt', 'DET'), 20 | (u'fine', 'ADJ'), 21 | (u'hus', 'SUBST'), 22 | (u'.', 'PUNKT')]), 23 | [[u'fine', u'hus']]) 24 | extractor = NONPExtractor(keep_index=True) 25 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 26 | (u'er', 'VERB'), 27 | (u'vårt', 'DET'), 28 | (u'hus', 'SUBST'), 29 | (u'.', 'PUNKT')]), 30 | [(u'hus', 3)]) 31 | 32 | self.assertEqual(extractor.extract([(u'Dette', 'PRON_PERS'), 33 | (u'er', 'VERB'), 34 | (u'vårt', 'DET'), 35 | (u'fine', 'ADJ'), 36 | (u'hus', 'SUBST'), 37 | (u'.', 'PUNKT')]), 38 | [([u'fine', u'hus'], 3)]) 39 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/single_doc_sigterms.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | 4 | class SingleDocSigTerms: 5 | def __init__(self, es, index, doc_type, 
field, term_weight_provider): 6 | self.es = es 7 | self.index = index 8 | self.doc_type = doc_type 9 | self.field = field 10 | self.term_weight_provider = term_weight_provider 11 | 12 | def tf_for_doc_id(self, doc_id): 13 | resp = self.es.termvectors(index=self.index, doc_type=self.doc_type, id=doc_id, fields=[self.field]) 14 | 15 | if resp['found']: 16 | return [(term, val['term_freq']) for term, val in resp['term_vectors'][self.field]['terms'].items()] 17 | 18 | def by_doc_id_idf(self, doc_id, n=5): 19 | resp = self.es.termvectors(index=self.index, doc_type=self.doc_type, id=doc_id, fields=[self.field], dfs=True, 20 | term_statistics=True, positions=False, offsets=False) 21 | if resp['found']: 22 | termstats=[] 23 | total_doc_term_frequency = sum([val['term_freq'] for term, val in resp['term_vectors'][self.field]['terms'].items()]) 24 | doc_count = resp['term_vectors'][self.field]['field_statistics']['sum_ttf'] 25 | for term, val in resp['term_vectors'][self.field]['terms'].items(): 26 | doc_freq= val['ttf'] 27 | term_doc_freq= val['term_freq'] 28 | term_total_ratio = doc_freq / float(doc_count) 29 | doc_ration = term_doc_freq / float(total_doc_term_frequency) 30 | termstats.append((term, doc_ration/float(term_total_ratio))) 31 | return sorted(termstats, key=itemgetter(1), reverse=True)[0:n] 32 | 33 | def by_doc_id(self, doc_id, n=5): 34 | term_freqs = self.tf_for_doc_id(doc_id) 35 | 36 | if self.term_weight_provider: 37 | weights = self.term_weight_provider[(term for term, _ in term_freqs)] 38 | 39 | term_freqs = [(term, freq*weights[term]) for term, freq in term_freqs] 40 | 41 | return sorted(term_freqs, key=itemgetter(1), reverse=True)[0:n] 42 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/wiki_infobox.py: -------------------------------------------------------------------------------- 1 | __author__ = 'cvig' 2 | #!/usr/bin/env python 3 | # parts of this borrowed from https://github.com/scraperwiki/wikipedia-infobox-tool/blob/master/get_data.py 4 | 5 | import re 6 | 7 | 8 | def clean_data(data): 9 | # Strip square brackets. 10 | data = re.sub('[\[\]]', '', data) 11 | # Strip all HTML tags. 12 | data = re.sub('<[^<]+?>', ' ', data) 13 | data = re.sub('(?i)\{\{cite .*\}\}', '', data) 14 | data = re.sub(' ', '', data) 15 | return data 16 | 17 | 18 | def parse_tags(data): 19 | data = re.sub('(?i)\{\{url\|([^\n]*)\}\}', '\g<1>', data) 20 | data = re.sub('\[\[(.*)\|.*\]\]', '\g<1>', data) 21 | data = re.sub('(?i)\{\{convert\|(.*?)\|(.*?)((\}\})|(\|.*\}\}))', '\g<1> \g<2>', data) 22 | data = re.sub('(?i)\{\{convinfobox\|(.*?)\|(.*?)((\}\})|(\|.*\}\}))', '\g<1> \g<2>', data) 23 | data = re.sub('(?i)\{\{nowrap\|(.*)\}\}', '\g<1>', data) 24 | return data 25 | 26 | 27 | def scrape_infobox(content): 28 | # Remove HTML comment tags. 
29 | content = re.sub('', ' ', content) 30 | 31 | box_occurences = re.split('{{infoboks[^\n}]*\n', content.lower()) 32 | 33 | if len(box_occurences) < 2: 34 | return None 35 | 36 | data = {} 37 | 38 | for box_occurence in box_occurences[1:]: 39 | 40 | infobox_end = re.search('\n[^\n{]*\}\}[^\n{]*\n', box_occurence) 41 | 42 | if infobox_end is None: 43 | return None 44 | 45 | box_occurence = box_occurence[:infobox_end.start():] 46 | box_occurence = re.split('\n[^|\n]*\|', box_occurence) 47 | 48 | for item in box_occurence: 49 | item = parse_tags(item) 50 | item = clean_data(item) 51 | if '=' in item: 52 | pair = item.split('=', 1) 53 | field = pair[0].strip() 54 | field = re.sub('\W', '_', field) 55 | value = pair[1].strip() 56 | field = field.lower().strip() 57 | if len(field) < 20: 58 | if value != '': 59 | data[field] = value 60 | return data 61 | 62 | return data 63 | 64 | -------------------------------------------------------------------------------- /python-client/bin/wordcounts_from_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import sys 4 | 5 | from gensim.corpora import Dictionary 6 | from textblob import TextBlob 7 | 8 | from es_text_analytics.data import newsgroups 9 | from es_text_analytics.data.dataset import download_file, default_dataset_path 10 | from es_text_analytics.data.ndt_dataset import NDTDataset 11 | from es_text_analytics.tokenizer import NOTokenizer 12 | 13 | """ 14 | Generates wordcounts from a dataset. 15 | 16 | Stores the counts in a Gensim Dictionary text file with id, word and count as tab separated fields. 17 | """ 18 | 19 | 20 | NO_TOKENIZER = NOTokenizer() 21 | 22 | def preprocess_ng(doc): 23 | return [w.lower() for w in TextBlob(doc['msg']).words] 24 | 25 | 26 | def preprocess_ndt(doc): 27 | return [w.lower() for w in TextBlob(doc['content'], tokenizer=NO_TOKENIZER).words] 28 | 29 | 30 | def main(): 31 | parser = ArgumentParser() 32 | parser.add_argument('-d', '--dataset') 33 | parser.add_argument('-p', '--dataset-path', default=default_dataset_path()) 34 | parser.add_argument('-o', '--output') 35 | opts = parser.parse_args() 36 | 37 | dataset_name = opts.dataset 38 | dataset_path = opts.dataset_path 39 | out_fn = opts.output 40 | 41 | if not out_fn: 42 | logging.error('--output argument required ...') 43 | parser.print_usage() 44 | sys.exit(1) 45 | 46 | if not dataset_name: 47 | logging.error('--dataset argument required ...') 48 | parser.print_usage() 49 | sys.exit(1) 50 | 51 | if dataset_name == 'newsgroups': 52 | corpus = (preprocess_ng(doc) for doc 53 | in newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path))) 54 | if dataset_name == 'ndt': 55 | dataset = NDTDataset(dataset_path=dataset_path) 56 | dataset.install() 57 | 58 | corpus = (preprocess_ndt(doc) for doc in dataset) 59 | else: 60 | logging.error('Unknown dataset %s ...' 
% dataset_name) 61 | sys.exit(1) 62 | 63 | d = Dictionary(corpus) 64 | d.save_as_text(out_fn, sort_by_word=False) 65 | 66 | 67 | if __name__ == '__main__': 68 | logging.basicConfig(level=logging.INFO) 69 | main() 70 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_decompounder.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from es_text_analytics.decompounder import NOBDecompounder, decompound_inner, flatten_inner, flatten 4 | 5 | 6 | class TestNOBDecompounder(TestCase): 7 | def setUp(self): 8 | super(TestNOBDecompounder, self).setUp() 9 | 10 | self.fullform_index = {'ba': [{'pos': 'SUBST'}], 'bork': [{'pos': 'SUBST'}], 11 | 'borkbork': [{'pos': 'SUBST'}], 'boing': [{'pos': 'PRON'}]} 12 | 13 | 14 | def test_decompound(self): 15 | decompounder = NOBDecompounder(fullform_index=self.fullform_index, min_match=1) 16 | self.assertEqual(['ba', 'bork'], decompounder.decompound('babork')) 17 | self.assertEqual(['ba', 'borkbork'], decompounder.decompound('baborkbork')) 18 | self.assertEqual(['ba', 'ba'], decompounder.decompound('BaBa')) 19 | self.assertEqual(None, decompounder.decompound('BaBaa')) 20 | 21 | def test_decompound_no_prons(self): 22 | decompounder = NOBDecompounder(fullform_index=self.fullform_index, min_match=1) 23 | 24 | self.assertEqual(None, decompounder.decompound('baboing')) 25 | 26 | 27 | class TestDecompounderHelpers(TestCase): 28 | def setUp(self): 29 | super(TestDecompounderHelpers, self).setUp() 30 | 31 | self.fullform_index = {'ba': [{'pos': 'SUBST'}], 'bork': [{'pos': 'SUBST'}], 'borkbork': [{'pos': 'SUBST'}]} 32 | 33 | def test_decompund_inner(self): 34 | self.assertEqual([['ba', ['ba']]], decompound_inner('baba', self.fullform_index, start=0, min_match=1)) 35 | self.assertEqual([['ba']], decompound_inner('baba', self.fullform_index, start=2, min_match=1)) 36 | self.assertEqual([], decompound_inner('baba', self.fullform_index, start=1, min_match=1)) 37 | 38 | def test_flatten_inner(self): 39 | self.assertEqual([['ba', 'ba']], flatten_inner(['ba', ['ba']])) 40 | self.assertEqual([['ba']], flatten_inner(['ba'])) 41 | self.assertEqual([['ba', 'ba'], ['ba', 'foo']], flatten_inner(['ba', ['ba'], ['foo']])) 42 | 43 | def test_flatten(self): 44 | self.assertEqual([['ba', 'ba'], ['ba'], ['ba', 'ba'], ['ba', 'foo']], 45 | flatten([['ba', ['ba']], ['ba'], ['ba', ['ba'], ['foo']]])) -------------------------------------------------------------------------------- /python-client/sklext/term_estimators.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | 3 | import numpy 4 | from numpy import array, sum, zeros 5 | from scipy.sparse import issparse 6 | 7 | 8 | def add_smoothing(m, amount=10 ** -12): 9 | m = m.astype(numpy.float) 10 | m[m == 0] = amount 11 | 12 | return m 13 | 14 | 15 | def marginal_estimator(X, axis=0, smoothing=False): 16 | N = X.shape[axis] 17 | 18 | if issparse(X): 19 | counts = array((X > 0).sum(axis=axis)) 20 | else: 21 | counts = array(sum(X > 0, axis=axis)) 22 | 23 | if smoothing: 24 | add_smoothing(counts) 25 | 26 | p = counts.flatten() / float(N) 27 | 28 | return p 29 | 30 | 31 | def joint_estimator_point(X, y, smoothing=False): 32 | counts = X.T.dot(y) 33 | 34 | if issparse(counts): 35 | counts = array(counts.todense()) 36 | 37 | if smoothing: 38 | counts = add_smoothing(counts) 39 | 40 | return counts / numpy.sum(counts, dtype=numpy.float) 
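# Worked example (illustrative): for the matrices used in the tests,
#   X = [[0, 1], [1, 0], [1, 1]]   (documents x terms)
#   y = [[0, 1], [1, 0], [1, 0]]   (documents x classes)
# X.T.dot(y) gives the term/class co-occurrence counts [[2, 0], [1, 1]], and dividing
# by the grand total (4) yields the joint estimate [[0.5, 0.0], [0.25, 0.25]].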
41 | 42 | 43 | def joint_estimator_full_sparse(X, y, smoothing=False): 44 | _, t = X.shape 45 | _, c = y.shape 46 | 47 | X = X.tolil() 48 | y = y.tolil() 49 | 50 | counts = [zeros((t, c)), zeros((t, c)), zeros((t, c)), zeros((t, c))] 51 | 52 | for t_idx, c_idx in izip(X.rows, y.rows): 53 | t_mask = zeros(t, dtype=numpy.bool) 54 | t_mask[t_idx] = True 55 | c_mask = zeros(c, dtype=numpy.bool) 56 | c_mask[c_idx] = True 57 | 58 | counts[0][t_mask, c_mask] += 1 59 | counts[1][t_mask, ~c_mask] += 1 60 | counts[2][~t_mask, c_mask] += 1 61 | counts[3][~t_mask, ~c_mask] += 1 62 | 63 | if smoothing: 64 | counts = [add_smoothing(m) for m in counts] 65 | 66 | total = numpy.sum([numpy.sum(m) for m in counts], dtype=numpy.float) 67 | 68 | return [m / total for m in counts] 69 | 70 | 71 | def joint_estimator_full(X, y, smoothing=False): 72 | if issparse(X) or issparse(y): 73 | return joint_estimator_full_sparse(X, y, smoothing=smoothing) 74 | 75 | counts = [xx.T.dot(yy) for xx, yy in zip([X, X, 1 - X, 1 - X], [y, 1 - y, y, 1 - y])] 76 | 77 | if smoothing: 78 | counts = [add_smoothing(m) for m in counts] 79 | 80 | total = numpy.sum([numpy.sum(m) for m in counts], dtype=numpy.float) 81 | 82 | return [m / total for m in counts] -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_tagger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | from unittest import TestCase 4 | 5 | from es_text_analytics.tagger import obt_to_universal_tag, parse_hunpos_train_output, NOBTagger 6 | from es_text_analytics.tagger import NNO_TAGGER_DEFAULT_MODEL_FN, NNOTagger, NOB_TAGGER_DEFAULT_MODEL_FN 7 | 8 | HUNPUS_OUTPUT_SAMPLE = """ 9 | reading training corpus 10 | compiling probabilities 11 | constructing suffix guesser 12 | saving the model 13 | Traning corpus: 14 | 614375 tokens 15 | 37620 sentences 16 | 21 different tag 17 | 18 | Guesser trained with 19 | 71042 lowercase 20 | 40359 uppercase tokens 21 | theta = 0.0728040512355 22 | """ 23 | 24 | class TestTaggerHelpers(TestCase): 25 | def test_obt_to_universal_tag(self): 26 | self.assertEqual('VERB', obt_to_universal_tag('skildre', 'verb', 'inf')) 27 | self.assertEqual('PRON', obt_to_universal_tag('det', 'pron', u'pers|3|nøyt|eint')) 28 | self.assertEqual('PUNCT', obt_to_universal_tag(',', '', '_')) 29 | 30 | def test_parse_hunpos_train_output(self): 31 | self.assertEqual({'tokens': 614375, 32 | 'sentences': 37620, 33 | 'tag_card': 21, 34 | 'n_lower': 71042, 35 | 'n_upper': 40359, 36 | 'theta': 0.0728040512355, 37 | 'errors': []}, 38 | parse_hunpos_train_output(HUNPUS_OUTPUT_SAMPLE)) 39 | 40 | 41 | class TestNOBTagger(TestCase): 42 | def test_tag(self): 43 | if os.path.exists(NOB_TAGGER_DEFAULT_MODEL_FN): 44 | tagger = NOBTagger() 45 | self.assertEqual([(u'Dette', 'PRON_PERS'), 46 | (u'er', 'VERB'), 47 | (u'vårt', 'DET'), 48 | (u'hus', 'SUBST'), 49 | (u'.', 'PUNKT')], 50 | tagger.tag(u'Dette er vårt hus.')) 51 | else: 52 | self.skipTest('NOBTagger default model not found in %s' % NOB_TAGGER_DEFAULT_MODEL_FN) 53 | 54 | 55 | class TestNNOTagger(TestCase): 56 | def test_tag(self): 57 | if os.path.exists(NNO_TAGGER_DEFAULT_MODEL_FN): 58 | tagger = NNOTagger() 59 | self.assertEqual([(u'Røyndommen', 'SUBST'), 60 | (u'rammar', 'VERB'), 61 | (u'alle', 'DET'), 62 | (u'.', 'PUNKT')], 63 | tagger.tag(u'Røyndommen rammar alle.')) 64 | else: 65 | self.skipTest('NNOTagger default model not found in %s' % NNO_TAGGER_DEFAULT_MODEL_FN) 66 | 
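For reference, the taggers exercised above are normally combined with the NP extractor (and further on with KERA keyword extraction). A minimal sketch, assuming hunpos and the default Bokmål model have been installed via install_hunpos() and the build-all-models script:

```python
# Illustrative sketch only; the exact tags and chunks depend on the trained model.
from es_text_analytics.tagger import NOBTagger
from es_text_analytics.np_extractor import NONPExtractor

tagger = NOBTagger()
extractor = NONPExtractor(tagger=tagger)

# Noun-phrase chunks; adjacent ADJ + SUBST tokens are merged into one chunk,
# e.g. [u'lunsj', [u'fine', u'været']]
print extractor.extract(u'Vi spiste lunsj ute i det fine været.')
```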
-------------------------------------------------------------------------------- /python-client/bin/index_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import sys 4 | 5 | from elasticsearch.client import Elasticsearch 6 | 7 | from es_text_analytics.data.aviskorpus import AviskorpusDataset 8 | from es_text_analytics.data.ndt_dataset import NDTDataset 9 | from es_text_analytics.data.newsgroups import NewsgroupsDataset 10 | 11 | """ 12 | Script for retrieving and indexing datasets. 13 | 14 | Current datasets supported: 15 | - 20 Newsgroups (newsgroups) 16 | - Norsk Aviskorpus (aviskorpus), sections and sources can be specified with f.ex. -s 1|2-aa|vg|db 17 | - Norwegian Dependency Treebank (ndt), sections and languages can be specified with f.ex -s newspaper|blog-nob 18 | """ 19 | 20 | 21 | def main(): 22 | parser = ArgumentParser() 23 | parser.add_argument('-e', '--elasticsearch-server', default='localhost:9200') 24 | parser.add_argument('-d', '--dataset') 25 | parser.add_argument('-s', '--sections') 26 | opts = parser.parse_args() 27 | 28 | es_hosts = [opts.elasticsearch_server] 29 | dataset_name = opts.dataset 30 | dataset_sections = opts.sections 31 | 32 | es = Elasticsearch(hosts=es_hosts, timeout=120) 33 | 34 | if dataset_name == 'newsgroups': 35 | dataset = NewsgroupsDataset() 36 | elif dataset_name == 'aviskorpus': 37 | sections = None 38 | sources = None 39 | 40 | if dataset_sections: 41 | try: 42 | sections, sources = dataset_sections.split('-') 43 | sections = [int(s) for s in sections.split('|')] 44 | sources = [s for s in sources.split('|')] 45 | except Exception: 46 | logging.error('Malformed section specification "%s" ...' % dataset_sections) 47 | sys.exit(1) 48 | 49 | dataset = AviskorpusDataset(sections=sections, sources=sources) 50 | elif dataset_name == 'ndt': 51 | sections = None 52 | lang = None 53 | 54 | if dataset_sections: 55 | try: 56 | sections, lang = dataset_sections.split('-') 57 | sections = [int(s) for s in sections.split('|')] 58 | lang = [s for s in lang.split('|')] 59 | except Exception: 60 | logging.error('Malformed section specification "%s" ...' % dataset_sections) 61 | sys.exit(1) 62 | 63 | dataset = NDTDataset(lang=lang, sections=sections) 64 | else: 65 | logging.error('Unknown dataset %s ...' % dataset_name) 66 | sys.exit(1) 67 | 68 | dataset.install(es) 69 | 70 | 71 | if __name__ == '__main__': 72 | logging.basicConfig(level=logging.INFO) 73 | 74 | main() 75 | -------------------------------------------------------------------------------- /python-client/bin/build_no_tagger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | from argparse import ArgumentParser 4 | import os 5 | import sys 6 | import datetime 7 | 8 | from es_text_analytics.data.ndt_dataset import NDTDataset 9 | from es_text_analytics.tagger import train_hunpos_model, FEATURES_MAP 10 | 11 | 12 | 13 | 14 | # Trains a Norwegian part-of-speech tagger with the NDT dataset. 15 | # The tagger is trained on the combined Bokmål and Nynorsk material. 16 | 17 | # Arguments: 18 | # -f, --features The normalized feature set, no-feats, simple or universal. See tagger.py for details. 19 | # -m, --model-file Where to save the resulting model. Crearet a default filename in the current directory 20 | # if omitted. 21 | # -d, --dataset-file Where to find the NDT dataset. Uses default location if omitted. 
22 | # -l, --language Which language training set to use: nob (bokmål), nno (nynorsk) or both. 23 | 24 | FIELDS = ['form', 'postag', 'feats'] 25 | 26 | 27 | def main(): 28 | parser = ArgumentParser() 29 | parser.add_argument('-f', '--features') 30 | parser.add_argument('-m', '--model-file') 31 | parser.add_argument('-d', '--dataset-file') 32 | parser.add_argument('-l', '--language', default='nob') 33 | 34 | args = parser.parse_args() 35 | 36 | features = args.features 37 | model_fn = args.model_file 38 | dataset_fn = args.dataset_file 39 | lang = args.language 40 | 41 | if not features in FEATURES_MAP: 42 | logging.error('Unknown feature identifier %s (one of <%s>) ...' 43 | % (features, '|'.join(FEATURES_MAP.keys()))) 44 | sys.exit(1) 45 | 46 | if dataset_fn and not os.path.exists(dataset_fn): 47 | logging.error('Could not find NDT dataset archive %s ...' % dataset_fn) 48 | sys.exit(1) 49 | 50 | if not model_fn: 51 | # noinspection PyUnresolvedReferences 52 | model_fn = 'no-ndt-hunpos-%s-%s' % (features, datetime.now().strftime("%Y-%m-%d-%H-%M")) 53 | 54 | if not lang in ['nob', 'nno', 'both']: 55 | logging.error('Uknown language %s (one of <%s>) ...' % (lang), '|'.join(['nob', 'nno', 'both'])) 56 | sys.exit(1) 57 | 58 | if lang == 'both': 59 | lang = None 60 | 61 | if dataset_fn: 62 | dataset = NDTDataset(dataset_fn=dataset_fn, normalize_func=None, fields=FIELDS, lang=lang) 63 | else: 64 | dataset = NDTDataset(normalize_func=None, fields=FIELDS, lang=lang) 65 | dataset.install() 66 | 67 | pos_norm_func = FEATURES_MAP[features] 68 | seq_gen = ([(form, pos_norm_func(form, pos, feats)) for form, pos, feats in sent] for sent in dataset) 69 | 70 | stats = train_hunpos_model(seq_gen, model_fn) 71 | 72 | # print the stats from the hunpos output 73 | for k, v in stats.items(): 74 | print '%s:\t%s' % (k, v) 75 | 76 | 77 | if __name__ == '__main__': 78 | logging.basicConfig(level=logging.INFO) 79 | 80 | main() 81 | -------------------------------------------------------------------------------- /provision/neo4j-server.properties: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # Neo4j 3 | # 4 | # neo4j-server.properties - runtime operational settings 5 | # 6 | ################################################################ 7 | 8 | #*************************************************************** 9 | # Server configuration 10 | #*************************************************************** 11 | 12 | # location of the database directory 13 | org.neo4j.server.database.location=data/graph.db 14 | 15 | # Low-level graph engine tuning file 16 | org.neo4j.server.db.tuning.properties=conf/neo4j.properties 17 | 18 | # Let the webserver only listen on the specified IP. Default is localhost (only 19 | # accept local connections). Uncomment to allow any connection. Please see the 20 | # security section in the neo4j manual before modifying this. 
21 | org.neo4j.server.webserver.address=0.0.0.0 22 | 23 | # Require (or disable the requirement of) auth to access Neo4j 24 | dbms.security.auth_enabled=true 25 | 26 | # 27 | # HTTP Connector 28 | # 29 | 30 | # http port (for all data, administrative, and UI access) 31 | org.neo4j.server.webserver.port=7474 32 | 33 | # 34 | # HTTPS Connector 35 | # 36 | 37 | # Turn https-support on/off 38 | org.neo4j.server.webserver.https.enabled=true 39 | 40 | # https port (for all data, administrative, and UI access) 41 | org.neo4j.server.webserver.https.port=7473 42 | 43 | # Certificate location (auto generated if the file does not exist) 44 | org.neo4j.server.webserver.https.cert.location=conf/ssl/snakeoil.cert 45 | 46 | # Private key location (auto generated if the file does not exist) 47 | org.neo4j.server.webserver.https.key.location=conf/ssl/snakeoil.key 48 | 49 | # Internally generated keystore (don't try to put your own 50 | # keystore there, it will get deleted when the server starts) 51 | org.neo4j.server.webserver.https.keystore.location=data/keystore 52 | 53 | # Comma separated list of JAX-RS packages containing JAX-RS resources, one 54 | # package name for each mountpoint. The listed package names will be loaded 55 | # under the mountpoints specified. Uncomment this line to mount the 56 | # org.neo4j.examples.server.unmanaged.HelloWorldResource.java from 57 | # neo4j-server-examples under /examples/unmanaged, resulting in a final URL of 58 | # http://localhost:7474/examples/unmanaged/helloworld/{nodeId} 59 | #org.neo4j.server.thirdparty_jaxrs_classes=org.neo4j.examples.server.unmanaged=/examples/unmanaged 60 | 61 | 62 | #***************************************************************** 63 | # HTTP logging configuration 64 | #***************************************************************** 65 | 66 | # HTTP logging is disabled. HTTP logging can be enabled by setting this 67 | # property to 'true'. 68 | org.neo4j.server.http.log.enabled=false 69 | 70 | # Logging policy file that governs how HTTP log output is presented and 71 | # archived. Note: changing the rollover and retention policy is sensible, but 72 | # changing the output format is less so, since it is configured to use the 73 | # ubiquitous common log format 74 | org.neo4j.server.http.log.config=conf/neo4j-http-logging.xml 75 | 76 | 77 | #***************************************************************** 78 | # Administration client configuration 79 | #***************************************************************** 80 | 81 | # location of the servers round-robin database directory. Possible values: 82 | # - absolute path like /var/rrd 83 | # - path relative to the server working directory like data/rrd 84 | # - commented out, will default to the database data directory. 
85 | org.neo4j.server.webadmin.rrdb.location=data/rrd 86 | -------------------------------------------------------------------------------- /python-client/bin/build_LDA_kera_from_wiki.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys 3 | from operator import itemgetter 4 | 5 | sys.path.append('F:\projects\comperio-text-analytics\python-client') 6 | from elasticsearch import Elasticsearch 7 | import pandas as pd 8 | from gensim.models.ldamodel import LdaModel 9 | from gensim.corpora.dictionary import Dictionary 10 | from gensim.models.tfidfmodel import TfidfModel 11 | import logging 12 | from NOB_kera import NOB_kera 13 | 14 | num_words_from_topic = 20 15 | num_results_from_es = 5 16 | modelfile = 'F:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250' 17 | vocabulary = 'F:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab' 18 | 19 | def flatten(x): 20 | if isinstance(x, tuple): 21 | return " ".join([i for i in x]).lower() 22 | else: 23 | return x.lower() 24 | 25 | 26 | def add_keywords(results, kera): 27 | for topicresult in results: 28 | toptitle = '' 29 | kkw = {} 30 | logging.debug(topicresult['topics'][0:300]) 31 | for hits in topicresult['result']['hits']['hits']: 32 | title = hits['_source']['title'] 33 | topbody = hits['_source']['article'] 34 | toptitle += title + ' _ ' 35 | kwlist = kera.extract_keywords(toptitle + topbody) 36 | kw = dict(kwlist) 37 | logging.debug(kw) 38 | logging.debug("t: %s len kw:%d" % (toptitle, len(kw))) 39 | for keyword, keyvalue in kw.iteritems(): 40 | if keyword in kkw: 41 | kkw[keyword] += float(keyvalue) 42 | else: 43 | kkw[keyword] = float(keyvalue) 44 | kkw = sorted(kkw.items(), key=itemgetter(1), reverse=True) 45 | logging.debug(kkw) 46 | logging.debug('kkw %d' % len(kkw)) 47 | topicresult['keywords'] = kkw 48 | topicresult['keyword_string'] = " ".join([flatten(k_kw[0]) for k_kw in kkw]) 49 | topicresult['titles'] = toptitle 50 | return results 51 | 52 | 53 | def get_doc_topics(ldamodel, num_topics, num_words_from_topic, vocab, tfidfmodel): 54 | for num_topic in range(num_topics): 55 | topics = ldamodel.show_topic(num_topic, num_words_from_topic) 56 | # filter out high/low frequent words from the vocabulary 57 | 58 | toks = [topic[1] for topic in topics] 59 | logging.debug(toks) 60 | tfidf = tfidfmodel[vocab.doc2bow(toks)] 61 | # cut off 10 percent from top and bottom 62 | cutoff = int(num_words_from_topic * 0.1) 63 | outdoc = [vocab.get(wd[0]) for wd in sorted(tfidf, key=itemgetter(1), reverse=True)[cutoff:num_words_from_topic-cutoff]] 64 | logging.debug(outdoc) 65 | ss = set(toks) 66 | sb = set(outdoc) 67 | logging.debug(ss.difference(sb)) 68 | yield (' '.join(outdoc), num_topic) 69 | 70 | 71 | def main(): 72 | logformat = '%(asctime)s %(name)-12s: %(message)s' 73 | logging.basicConfig(level=logging.DEBUG, format=logformat) 74 | kera = NOB_kera() 75 | es = Elasticsearch(port=9201) 76 | mod = LdaModel.load(modelfile) 77 | vocab = Dictionary.load(vocabulary) 78 | tfidf = TfidfModel(dictionary=vocab) 79 | results = [] 80 | for (topics, topicid) in get_doc_topics(mod, mod.num_topics, num_words_from_topic, vocab, tfidf): 81 | res = es.search(index='wiki4', body={"query": {"match": {"_all": topics}}}, size=num_results_from_es) 82 | results.append({'topics': topics, 'result': res, 'topicid': topicid}) 83 | results = add_keywords(results, kera) 84 | df = pd.DataFrame(results) 85 | 
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
86 | 
87 | 
88 | main()
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Comperio text analytics [![Build Status](https://travis-ci.org/comperiosearch/comperio-text-analytics.svg?branch=master)](https://travis-ci.org/comperiosearch/comperio-text-analytics)
2 | 
3 | Elasticsearch-based text analytics.
4 | 
5 | Implementation of:
6 | 
7 | * Single document significant terms - [Trello board](https://trello.com/c/nrO8QIp9) (private)
8 | * Classification - [Trello board](https://trello.com/c/PU7XqsTi) (private)
9 | * Sentiment analysis - [Trello board](https://trello.com/c/C8H5fBcJ) (private)
10 | 
11 | ## Norwegian linguistics support for text analytics
12 | 
13 | There is currently partial experimental support for some linguistic analysis of Norwegian Bokmål. This
14 | support depends on the following resources:
15 | 
16 | * Norwegian Dependency Treebank (NDT) (freely available, permissive licensing).
17 | * Norsk Ordbank (available on request, GPL or commercial licensing).
18 | 
19 | Norsk Ordbank must be obtained separately and unzipped in the data directory for it to be used automatically
20 | by linguistic processing components.
21 | 
22 | ### Tokenizer
23 | 
24 | Currently a simple application of the UAX29 standard for Unicode tokenization. Will be expanded to handle hyphens
25 | in accordance with Norwegian norms.
26 | 
27 | ```python
28 | tokenizer = NOTokenizer()
29 | tokenizer.tokenize(u'Vi er konsulenter, med fokus på søk!')
30 | 
31 | [u'Vi',
32 | u'er',
33 | u'konsulenter',
34 | u',',
35 | u'med',
36 | u'fokus',
37 | u'på',
38 | u'søk',
39 | u'!']
40 | ```
41 | 
42 | ### Part of speech annotation
43 | 
44 | Adds part of speech descriptions. The default annotation is a very simplified version of the one used by Norsk Ordbank
45 | and NDT.
46 | 
47 | ```python
48 | tagger = NOBTagger()
49 | tagger.tag(u'Vi spiste lunsj ute i det fine været.')
50 | 
51 | [(u'Vi', 'PRON_PERS'),
52 | (u'spiste', 'VERB'),
53 | (u'lunsj', 'SUBST'),
54 | (u'ute', 'PREP'),
55 | (u'i', 'PREP'),
56 | (u'det', 'DET'),
57 | (u'fine', 'ADJ'),
58 | (u'været', 'SUBST'),
59 | (u'.', 'PUNKT')]
60 | ```
61 | 
62 | Evaluation of the tagger precision is forthcoming, but users should expect a reasonable error rate given the
63 | limited training data available.
64 | 
65 | ### Lemmatization
66 | 
67 | Based on Norsk Ordbank. It is possible to pass the part of speech tag in order to disambiguate words which can
68 | have more than one lemma form.
69 | 
70 | ```python
71 | sent = tagger.tag(u'Vi er godt forberedt.')
72 | [(word, lem.lemmatize(word, tag)) for word, tag in sent]
73 | 
74 | [(u'Vi', u'vi'),
75 | (u'er', u'være'),
76 | (u'godt', u'god'),
77 | (u'forberedt', u'forberedt'),
78 | (u'.', u'.')]
79 | ```
80 | 
81 | ### Decompounder
82 | 
83 | Simple heuristics-based decompounder built on the word forms in Norsk Ordbank. This can overgenerate, so it should
84 | primarily be used on well-formed text.
85 | 86 | ```python 87 | dec = NOBDecompounder() 88 | dec.decompound(u'lampekostbatteri'), dec.decompound(u'søkekonsulenter') 89 | 90 | [u'lampe', u'kost', u'batteri'], [u'søke', u'konsulenter'] 91 | 92 | ``` 93 | 94 | ## Vagrant development server 95 | 96 | To set up a Vagrant development server, run: 97 | 98 | ``` 99 | vagrant up 100 | fab vagrant provision_server 101 | ``` 102 | 103 | 104 | ## Installation notes 105 | 106 | To use the tagger, decompounder and lemmatizer, you will need to download Norsk Ordbank. 107 | You can download it by registering at [http://www.edd.uio.no/prosjekt/ordbanken/](http://www.edd.uio.no/prosjekt/ordbanken/) 108 | 109 | You will need to build models for the tagger by running: 110 | 111 | from es_text_analytics.tagger import install_hunpos 112 | install_hunpos() 113 | comperio-text-analytics\python-client\bin\build-all-models.bat 114 | 115 | 116 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/np_extractor.py: -------------------------------------------------------------------------------- 1 | from textblob.base import BaseNPExtractor 2 | 3 | """ 4 | Minimal NP chunker for Norwegian adapted from the TextBlob FastNPExtractor. 5 | 6 | Compatible with the TextBlob API. 7 | """ 8 | 9 | CFG = { 10 | ('SUBST_PROP', 'SUBST_PROP'): 'SUBST_PROP', 11 | ('SUBST', 'SUBST'): 'SUBSTP', 12 | ('SUBSTP', 'SUBST'): 'SUBSTP', 13 | ('ADJ', 'ADJ'): 'ADJ', 14 | ('ADJ', 'SUBST'): 'SUBSTP', 15 | } 16 | 17 | 18 | def force_list(item): 19 | """ 20 | Wrap the passed argument in a list if it is not a list. 21 | 22 | :param item: Anything. 23 | :return: List wrapping any non-list item passed. 24 | """ 25 | if not isinstance(item, list): 26 | return [item] 27 | else: 28 | return item 29 | 30 | 31 | def extract(tagged_tokens, keep_index=False): 32 | """ 33 | Extract NP chunks from a tagged sequence of tokens. 34 | 35 | This method uses a simple CFG over POS tags. 36 | 37 | :param tagged_tokens: A sequence of token/tag pairs from the NNO or NOB tagger. 38 | :type tagged_tokens: list[(str|unicode, str|unicode)] 39 | :param keep_index: Return token index positions for chunks. 40 | :type keep_index: bool 41 | :rtype : list[str|unicode|list[str|unicode]|(str|unicode|list[str|unicode], int)] 42 | :return: A list of NP chunks as strings with the complete phrase. Chunks can be strings for single token chunks, 43 | lists of strings for multiple tokens or a chunk/index tuple if keep_index is set to True. 44 | """ 45 | merge = True 46 | while merge: 47 | merge = False 48 | for x in range(0, len(tagged_tokens) - 1): 49 | t1 = tagged_tokens[x] 50 | t2 = tagged_tokens[x + 1] 51 | key = t1[1], t2[1] 52 | value = CFG.get(key, '') 53 | 54 | if value: 55 | merge = True 56 | tagged_tokens.pop(x) 57 | tagged_tokens.pop(x) 58 | match = force_list(t1[0]) + force_list(t2[0]) 59 | pos = value 60 | # noinspection PyTypeChecker 61 | tagged_tokens.insert(x, (match, pos)) 62 | break 63 | 64 | matches = [] 65 | index = 0 66 | 67 | for t in tagged_tokens: 68 | if t[1] in ['SUBST', 'SUBST_PROP', 'SUBSTP']: 69 | if keep_index: 70 | value = (t[0], index) 71 | else: 72 | value = t[0] 73 | 74 | matches.append(value) 75 | 76 | if isinstance(t[0], list):  # advance the index by the number of tokens in a merged chunk 77 | index += len(t[0]) 78 | else: 79 | index += 1 80 | 81 | return matches 82 | 83 | 84 | class NONPExtractor(BaseNPExtractor): 85 | """ 86 | Simple NP extractor similar to FastNPExtractor in TextBlob.
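    Illustrative usage (a sketch only; it assumes the NOB tagger models have been built as described
    in the readme, and the exact chunks depend on the tagger output):

        from es_text_analytics.tagger import NOBTagger

        extractor = NONPExtractor(tagger=NOBTagger())
        extractor.extract(u'Vi spiste lunsj ute i det fine været.')
        # e.g. [u'lunsj', [u'fine', u'været']]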
87 | """ 88 | def __init__(self, tagger=None, keep_index=False): 89 | """ 90 | :param tagger: If initialized a tagger instance extract arguments will be processed with this tagger. 91 | Otherwise the extract method expects tagged input. 92 | :type tagger: None|textblob.base.BaseTagger 93 | :param keep_index: Return token index positions for chunks. 94 | :type keep_index: bool 95 | """ 96 | self.tagger = tagger 97 | self.keep_index = keep_index 98 | 99 | def extract(self, tokens): 100 | """ 101 | Extract NP chunks from passed tokens. 102 | 103 | :param tokens: Tokens as untagged string or pretagged list of token/tag pairs according to tagger configuration. 104 | :type tokens: str|list[(str|unicode, str|unicode)] 105 | :rtype : list[str|unicode] 106 | :return: 107 | """ 108 | if self.tagger: 109 | tokens = self.tagger.tag(tokens) 110 | 111 | return extract(tokens, keep_index=self.keep_index) 112 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/kera.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from operator import itemgetter 3 | 4 | from nltk import BigramAssocMeasures, BigramCollocationFinder 5 | 6 | 7 | # TODO A wrapper class could encapsulate default configurations. 8 | 9 | 10 | def extract_keywords(string, tokenizer, sent_tokenizer, tagger, extractor, proper_noun_tag='SUBST_PROP'): 11 | """ 12 | Implements KERA keyword extraction algorithm. 13 | 14 | See: https://www.ida.org/~/media/Corporate/Files/Publications/IDA_Documents/ITSD/ida-document-ns-d-4931.pdf 15 | 16 | Basic implementation of the procedure described in the paper. 17 | Probably needs some refinements in order to be more broadly effective. 18 | 19 | :param string: Document to analyze. 20 | :type string: str|unicode 21 | :param tokenizer: Function that returns a token segmentation as an iterable of strings given a string.. 22 | :type tokenizer: (str|unicode) -> list[str|unicode] 23 | :param sent_tokenizer: Function that returns a sentence segmentation as an iterable of strings given a string. 24 | :type sent_tokenizer: (str|unicode) -> list[str|unicode] 25 | :param tagger: TextBlob compatible POS tagger. Must accept untokenized sentences. 26 | :type tagger: textblob.base.BaseTagger 27 | :param extractor: TextBlob compatible noun phrase extractor. Must accept untokenized sentences and use the same 28 | POS tagger which is passed as the tagger parameter. 29 | :type extractor: textblob.base.BaseNPExtractor 30 | :param proper_noun_tag: POS tag indicating proper nouns. 31 | :type proper_noun_tag: str|unicode 32 | :return: List of keyword/score tuples. Keyword may be a string or tuple of strings. 
33 | :rtype : list[(str|unicode|(str|unicode)), float] 34 | """ 35 | # find bigram collocations 36 | bigram_measures = BigramAssocMeasures() 37 | finder = BigramCollocationFinder.from_words(tokenizer(string)) 38 | collocations = finder.score_ngrams(bigram_measures.likelihood_ratio)[0:50] 39 | 40 | # find noun phrases 41 | phrases = [extractor.extract(s) for s in sent_tokenizer(string)] 42 | phrases = [item for sublist in phrases for item in sublist] 43 | 44 | # find proper noun tokens, collect total/frequency for weighting/normalization 45 | sents = [tagger.tag(s) for s in sent_tokenizer(string)] 46 | sents = [item for sublist in sents for item in sublist] 47 | 48 | proper_nouns = [] 49 | 50 | np_doc_len = 0 51 | 52 | for i, (token, tag) in enumerate(sents): 53 | np_doc_len += 1 54 | 55 | if tag == proper_noun_tag: 56 | proper_nouns.append((token, i)) 57 | 58 | # find noun phrase/collocation overlap 59 | phrase_strings = [' '.join(x[0]).lower() for x in phrases if isinstance(x[0], list)] 60 | collocations = [c for c in collocations if ' '.join(c[0]) in phrase_strings] 61 | 62 | ranks = [] 63 | 64 | # calculate combined index score and normalized collocation score for collocations 65 | coll_score_total = sum([x[1] for x in collocations]) 66 | coll_doc_len = len(tokenizer(string)) 67 | 68 | for coll, coll_score in collocations: 69 | idx = phrases[phrase_strings.index(' '.join(coll))][1] 70 | 71 | alpha = coll_score / coll_score_total 72 | beta = 1 - (float(idx) / coll_doc_len) 73 | 74 | score = 2 * alpha * beta / (alpha + beta) 75 | 76 | ranks.append((coll, score)) 77 | 78 | # calculate combined index score and normalized term frequency score for proper nouns 79 | np_strings = [x[0] for x in proper_nouns] 80 | np_counts = Counter(np_strings) 81 | np_total = len(proper_nouns) 82 | 83 | # only use normalize over the same number of proper nouns as collocations in order to keep 84 | # the scores roughly comparable. 85 | # TODO There are rarely more proper names than collocations. Handle this too. 86 | for np, count in sorted(np_counts.items(), key=itemgetter(1), reverse=True)[0:len(collocations)]: 87 | idx = proper_nouns[np_strings.index(np)][1] 88 | 89 | alpha = float(count) / np_total 90 | beta = 1 - (float(idx) / np_doc_len) 91 | 92 | score = 2 * alpha * beta / (alpha + beta) 93 | 94 | ranks.append((np, score)) 95 | 96 | # return list of keywords and scores sorted by score 97 | return sorted(ranks, key=itemgetter(1), reverse=True) 98 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/data/ndt_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | from operator import itemgetter 4 | import os 5 | from tarfile import TarFile 6 | 7 | from es_text_analytics.data.dataset import Dataset, parse_conll, CONLL_U_FIELDS 8 | 9 | NDT_ARCHIVE_URL='http://www.nb.no/sbfil/tekst/20140328_NDT_1-01.tar.gz' 10 | 11 | 12 | def filelist(lang=None, sections=None): 13 | """ 14 | Generate a list of filenames corresponding to languages (Nynorsk and Bokmål) 15 | and source sections in the Treebank. Default is to include all lsnguages 16 | and sections. 17 | 18 | :param lang: 19 | :type lang: str|unicode|None 20 | :param sections: 21 | :type sections: list[str|unicode]|None 22 | :rtype : list[str|unicode] 23 | :return: list of filenames corresponding to the specified Treebank content. 
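    For example, under these naming conventions filelist(lang='nob', sections=['blog']) is expected
    to return ['blog_ndt_1-0_nob.conll'], while the default call returns the two whole-treebank
    files ['ndt_1-0_nob.conll', 'ndt_1-0_nno.conll'] (whether these files are present depends on
    the NDT release).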
24 | """ 25 | files = [] 26 | 27 | if not sections: 28 | sections = ['ndt_1-0'] 29 | else: 30 | sections = ['%s_ndt_1-0' % s for s in sections] 31 | 32 | if not lang: 33 | lang = ['nob', 'nno'] 34 | else: 35 | lang = [lang] 36 | 37 | for s in sections: 38 | for l in lang: 39 | files.append('%s_%s.conll' % (s, l)) 40 | 41 | return files 42 | 43 | 44 | def iterator(dataset_fn, sections=None, lang=None, field_indices=None): 45 | """ 46 | Provides an iterator of CONLL formatted sentences from NDT. 47 | 48 | :param dataset_fn: Path to Newsgroups dataset archive file. 49 | :type dataset_fn: unicode|str 50 | :param sections: 51 | :type sections: list[str|unicode]|None 52 | :param lang: 53 | :type lang: list[str|unicode]|None 54 | :rtype : generator 55 | """ 56 | files = filelist(lang=lang, sections=sections) 57 | 58 | with TarFile.open(dataset_fn, 'r:gz') as f: 59 | for member in f: 60 | if member.isfile() and os.path.basename(member.name) in files: 61 | logging.info('parsing %s ...' % member.name) 62 | m_f = f.extractfile(member) 63 | 64 | for sentence in parse_conll(m_f, field_indices=field_indices): 65 | yield sentence 66 | 67 | m_f.close() 68 | 69 | 70 | def normalize(doc): 71 | """ 72 | Normalize a treebank sentence to a string with the token forms. 73 | 74 | :param doc: Parsed CONLL sentence. 75 | :type doc: list[list] 76 | :rtype : dict[str|unicode, str|unicode] 77 | :return: A document dict with the normalized sentence in the 'content' key. 78 | """ 79 | return {'content': u' '.join(map(itemgetter(1), doc))} 80 | 81 | 82 | class NDTDataset(Dataset): 83 | """ 84 | Class encapsulating the Norwegian Dependency Treebank. Uses the main CONLL data files. 85 | See http://www.nb.no/sprakbanken/show?serial=sbr-10&lang=nb for details. 86 | """ 87 | 88 | 89 | def __init__(self, index='ndt', doc_type='sentence', dataset_path=None, 90 | dataset_fn=None, lang=None, sections=None, fields=None, 91 | normalize_func=normalize): 92 | """ 93 | Default includes all sections, languages and fields. 94 | 95 | :param sections: Sections to include (blog, newspaper, partliament, report). 96 | :type sections: list[str|unicode]|None 97 | :param lang: Languages to include (nno, nob). 98 | :type lang: list[str|unicode]|None 99 | :param fields: Columns to include (index, form, lemma, cpostag, postag, feats, head, deprel, deps, misc). 
100 | :type fields: list[str|unicode]|None 101 | """ 102 | super(NDTDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path, 103 | dataset_fn=dataset_fn, normalize_func=normalize_func) 104 | 105 | self.archive_fn = NDT_ARCHIVE_URL 106 | self.field_indices = None 107 | self.fields = CONLL_U_FIELDS 108 | 109 | if fields: 110 | self.fields = fields 111 | self.field_indices = [CONLL_U_FIELDS.index(f) for f in fields] 112 | 113 | self.sections = sections 114 | self.lang = lang 115 | 116 | def _iterator(self): 117 | return iterator(self.dataset_fn, sections=self.sections, 118 | lang=self.lang, field_indices=self.field_indices) 119 | -------------------------------------------------------------------------------- /python-client/sklext/test/test_term_weight_transformer.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from nose.tools import assert_true 4 | from numpy import array 5 | from numpy.ma.testutils import assert_array_approx_equal 6 | from scipy.sparse import issparse 7 | from scipy.sparse.csgraph._min_spanning_tree import csr_matrix 8 | 9 | from sklext.term_weighting import TermWeightTransformer 10 | 11 | 12 | class TestTermWeightTransformer(TestCase): 13 | def test_mi(self): 14 | X = array([[0, 1], 15 | [1, 0], 16 | [1, 1]]) 17 | y = array([[0, 1], 18 | [1, 0], 19 | [1, 0]]) 20 | 21 | transformer = TermWeightTransformer(method='mi') 22 | transformer.fit(X, y) 23 | 24 | assert_array_approx_equal(transformer._weights, [-0.37489, -0.605939], decimal=3) 25 | assert_array_approx_equal(transformer.transform(X), array([[0., -0.605939], 26 | [-0.37489, 0.], 27 | [-0.37489, -0.605939]]), 28 | decimal=3) 29 | 30 | transformer = TermWeightTransformer(method='mi') 31 | X = csr_matrix(X) 32 | y = csr_matrix(y) 33 | transformer.fit(X, y) 34 | newX = transformer.transform(X) 35 | 36 | assert_array_approx_equal(transformer._weights, [-0.37489, -0.605939], decimal=3) 37 | assert_true(issparse(newX)) 38 | assert_array_approx_equal(newX.todense(), array([[0., -0.605939], 39 | [-0.37489, 0.], 40 | [-0.37489, -0.605939]]), 41 | decimal=3) 42 | 43 | def test_pmi(self): 44 | X = array([[0, 1], 45 | [1, 0], 46 | [1, 1]]) 47 | y = array([[0, 1], 48 | [1, 0], 49 | [1, 0]]) 50 | 51 | transformer = TermWeightTransformer(method='pmi') 52 | transformer.fit(X, y) 53 | 54 | assert_array_approx_equal(transformer._weights, [0.1178, 0.1178], decimal=3) 55 | assert_array_approx_equal(transformer.transform(X), array([[0., 0.1178], 56 | [0.1178, 0.], 57 | [0.1178, 0.1178]]), 58 | decimal=3) 59 | 60 | transformer = TermWeightTransformer(method='pmi') 61 | X = csr_matrix(X) 62 | y = csr_matrix(y) 63 | transformer.fit(X, y) 64 | newX = transformer.transform(X) 65 | 66 | assert_array_approx_equal(transformer._weights, [0.1178, 0.1178], decimal=3) 67 | assert_true(issparse(newX)) 68 | assert_array_approx_equal(newX.todense(), array([[0., 0.1178], 69 | [0.1178, 0.], 70 | [0.1178, 0.1178]]), 71 | decimal=3) 72 | 73 | def test_npmi(self): 74 | X = array([[0, 1], 75 | [1, 0], 76 | [1, 1]]) 77 | y = array([[0, 1], 78 | [1, 0], 79 | [1, 0]]) 80 | 81 | transformer = TermWeightTransformer(method='npmi') 82 | transformer.fit(X, y) 83 | 84 | assert_array_approx_equal(transformer._weights, [0.1699, 0.0850], decimal=3) 85 | assert_array_approx_equal(transformer.transform(X), array([[0., 0.0850], 86 | [0.1700, 0.], 87 | [0.1700, 0.0850]]), 88 | decimal=3) 89 | 90 | transformer = TermWeightTransformer(method='npmi') 91 | X = csr_matrix(X) 92 
| y = csr_matrix(y) 93 | transformer.fit(X, y) 94 | newX = transformer.transform(X) 95 | 96 | assert_array_approx_equal(transformer._weights, [0.1700, 0.0850], decimal=3) 97 | assert_true(issparse(newX)) 98 | assert_array_approx_equal(newX.todense(), array([[0., 0.0850], 99 | [0.1700, 0.], 100 | [0.1700, 0.0850]]), 101 | decimal=3) 102 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/lemmatizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import codecs 3 | import os 4 | 5 | from es_text_analytics.data.dataset import project_path 6 | from es_text_analytics.tagger import FEATURES_MAP 7 | 8 | 9 | # Norwegian lemmatizer based on Norsk Ordbank, http://www.edd.uio.no/prosjekt/ordbanken/data/index.html or 10 | # http://www.nb.no/sprakbanken/show?serial=sbr-5&lang=nb 11 | # 12 | # Norsk Ordbank is not freely available but must be obtained from one of the urls above. 13 | 14 | ORDBANK_BM_DEFAULT_PATH = os.path.join(project_path(), 'data', 'ordbank_bm') 15 | FULLFORM_BM_FN = 'fullform_bm.txt' 16 | 17 | FULLFORM_FIELDS = ['word_id', 'lemma', 'fullform', 'morph_descr', 'paradigm_code', 'paradigm_entry'] 18 | 19 | 20 | def parse_fullform_file(f, feat_norm='simple'): 21 | """ 22 | Parses the fullform data file in Norsk Ordbank and returns dicts indexed on the fullform and lemma respectively. 23 | 24 | All fullforms are lowercased. 25 | Morphological information is normalized to POS tags. 26 | 27 | :param f: file instance for reading the fullform Norsk Ordbank data file. 28 | :param feat_norm: Type of POS tag to normalize morphological information. Must correspond to POS tagger tagset 29 | if doing contextual lemmatization. 30 | :type feat_norm: str|unicode 31 | :rtype : (dict, dict) 32 | :return: The fullform and lemma indexes to the file entries. 33 | """ 34 | fullform_index = {} 35 | lemma_index = {} 36 | 37 | for line in f: 38 | line = line.strip() 39 | # published Ordbank files are latin-1 encoded 40 | line = line.decode('latin1') 41 | 42 | if line == '' or line[0] == '*': 43 | continue 44 | 45 | tokens = line.split('\t') 46 | 47 | entry = dict(zip(FULLFORM_FIELDS, tokens)) 48 | 49 | entry['fullform'] = entry['fullform'].lower() 50 | 51 | entry['word_id'] = int(entry['word_id']) 52 | entry['paradigm_entry'] = int(entry['paradigm_entry']) 53 | 54 | # extract pos and features fro mthe morphological field and normalize pos 55 | morph_parts = entry['morph_descr'].split() 56 | entry['ndt_pos'] = morph_parts[0] 57 | entry['ndt_feats'] = '|'.join(morph_parts[1:]) 58 | entry['pos'] = FEATURES_MAP[feat_norm](entry['fullform'], entry['ndt_pos'], entry['ndt_feats']) 59 | 60 | fullform_index[entry['fullform']] = fullform_index.get(entry['fullform'], []) + [entry] 61 | lemma_index[entry['lemma']] = lemma_index.get(entry['lemma'], []) + [entry] 62 | 63 | return fullform_index, lemma_index 64 | 65 | 66 | class OrdbankLemmatizer(object): 67 | """ 68 | Class implementing a simple lemmatizer for Bokmål based on Norsk Ordbank 69 | 70 | Uses "simple" POS tags for contextual disambiguation by default. 71 | """ 72 | def __init__(self, ordbank_path=None, contextual=False, feat_norm='simple'): 73 | """ 74 | :param ordbank_path: Path to Norsk Ordbank Bokmål datafiles. Uses the default location of absent. 75 | :param feat_norm: POS tag type to use for contextual disambiguation. Only "simple" currently supported. 
76 | :type feat_norm: str|unicode 77 | """ 78 | super(OrdbankLemmatizer, self).__init__() 79 | 80 | if not ordbank_path: 81 | ordbank_path = ORDBANK_BM_DEFAULT_PATH 82 | 83 | with codecs.open(os.path.join(ordbank_path, FULLFORM_BM_FN)) as f: 84 | self.fullform_index, self.lemma_index = parse_fullform_file(f, feat_norm=feat_norm) 85 | 86 | def lemmatize(self, word, pos=None): 87 | """ 88 | Lemmatize the word using the POS tag context if passed. 89 | 90 | :param word: Word to lemmatize. 91 | :type word: str|unicode 92 | :param pos: Optional POS tag for disambiguation. 93 | :type pos: str|unicode 94 | :rtype : str|unicode 95 | :return: Lemma for passed word. 96 | """ 97 | # all matching is done on lowercase 98 | word = word.lower() 99 | 100 | if pos: 101 | # lookup candidates and eliminate those with mismatching POS tag 102 | candidates = [cand for cand in self.fullform_index.get(word, []) if cand['pos'] == pos] 103 | else: 104 | candidates = self.fullform_index.get(word) 105 | 106 | if candidates: 107 | # if there are several candidates we choose the last one 108 | # if the candidates are POS tag disambiguated our experience shows that further disambigous 109 | # entries has the "more reasonable" lemmas listed last 110 | return candidates[-1]['lemma'] 111 | else: 112 | # default strategy for failing matches is to do nothing 113 | return word 114 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 5 | VAGRANTFILE_API_VERSION = "2" 6 | 7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| 8 | # All Vagrant configuration is done here. The most common configuration 9 | # options are documented and commented below. For a complete reference, 10 | # please see the online documentation at vagrantup.com. 11 | 12 | # Every Vagrant virtual environment requires a box to build off of. 13 | config.vm.box = "ubuntu/trusty64" 14 | 15 | # Disable automatic box update checking. If you disable this, then 16 | # boxes will only be checked for updates when the user runs 17 | # `vagrant box outdated`. This is not recommended. 18 | # config.vm.box_check_update = false 19 | 20 | # Create a forwarded port mapping which allows access to a specific port 21 | # within the machine from a port on the host machine. In the example below, 22 | # accessing "localhost:8080" will access port 80 on the guest machine. 23 | # config.vm.network "forwarded_port", guest: 80, host: 8080 24 | config.vm.network "forwarded_port", guest: 9200, host: 9201 25 | 26 | # Create a private network, which allows host-only access to the machine 27 | # using a specific IP. 28 | # config.vm.network "private_network", ip: "192.168.33.10" 29 | 30 | # Create a public network, which generally matched to bridged network. 31 | # Bridged networks make the machine appear as another physical device on 32 | # your network. 33 | # config.vm.network "public_network" 34 | 35 | # If true, then any SSH connections made will enable agent forwarding. 36 | # Default value: false 37 | # config.ssh.forward_agent = true 38 | 39 | # Share an additional folder to the guest VM. The first argument is 40 | # the path on the host to the actual folder. The second argument is 41 | # the path on the guest to mount the folder. And the optional third 42 | # argument is a set of non-required options. 
43 | # config.vm.synced_folder "../data", "/vagrant_data" 44 | 45 | # Provider-specific configuration so you can fine-tune various 46 | # backing providers for Vagrant. These expose provider-specific options. 47 | # Example for VirtualBox: 48 | # 49 | # config.vm.provider "virtualbox" do |vb| 50 | # # Don't boot with headless mode 51 | # vb.gui = true 52 | # 53 | # # Use VBoxManage to customize the VM. For example to change memory: 54 | # vb.customize ["modifyvm", :id, "--memory", "1024"] 55 | # end 56 | # 57 | # View the documentation for the provider you're using for more 58 | # information on available options. 59 | 60 | # Enable provisioning with CFEngine. CFEngine Community packages are 61 | # automatically installed. For example, configure the host as a 62 | # policy server and optionally a policy file to run: 63 | # 64 | # config.vm.provision "cfengine" do |cf| 65 | # cf.am_policy_hub = true 66 | # # cf.run_file = "motd.cf" 67 | # end 68 | # 69 | # You can also configure and bootstrap a client to an existing 70 | # policy server: 71 | # 72 | # config.vm.provision "cfengine" do |cf| 73 | # cf.policy_server_address = "10.0.2.15" 74 | # end 75 | 76 | # Enable provisioning with Puppet stand alone. Puppet manifests 77 | # are contained in a directory path relative to this Vagrantfile. 78 | # You will need to create the manifests directory and a manifest in 79 | # the file default.pp in the manifests_path directory. 80 | # 81 | # config.vm.provision "puppet" do |puppet| 82 | # puppet.manifests_path = "manifests" 83 | # puppet.manifest_file = "default.pp" 84 | # end 85 | 86 | # Enable provisioning with chef solo, specifying a cookbooks path, roles 87 | # path, and data_bags path (all relative to this Vagrantfile), and adding 88 | # some recipes and/or roles. 89 | # 90 | # config.vm.provision "chef_solo" do |chef| 91 | # chef.cookbooks_path = "../my-recipes/cookbooks" 92 | # chef.roles_path = "../my-recipes/roles" 93 | # chef.data_bags_path = "../my-recipes/data_bags" 94 | # chef.add_recipe "mysql" 95 | # chef.add_role "web" 96 | # 97 | # # You may also specify custom JSON attributes: 98 | # chef.json = { mysql_password: "foo" } 99 | # end 100 | 101 | # Enable provisioning with chef server, specifying the chef server URL, 102 | # and the path to the validation key (relative to this Vagrantfile). 103 | # 104 | # The Opscode Platform uses HTTPS. Substitute your organization for 105 | # ORGNAME in the URL and validation key. 106 | # 107 | # If you have your own Chef Server, use the appropriate URL, which may be 108 | # HTTP instead of HTTPS depending on your configuration. Also change the 109 | # validation key to validation.pem. 110 | # 111 | # config.vm.provision "chef_client" do |chef| 112 | # chef.chef_server_url = "https://api.opscode.com/organizations/ORGNAME" 113 | # chef.validation_key_path = "ORGNAME-validator.pem" 114 | # end 115 | # 116 | # If you're using the Opscode platform, your validator client is 117 | # ORGNAME-validator, replacing ORGNAME with your organization name. 118 | # 119 | # If you have your own Chef Server, the default validation client name is 120 | # chef-validator, unless you changed the configuration. 
121 | # 122 | # chef.validation_client_name = "ORGNAME-validator" 123 | end 124 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/wordnet_centrality.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | 3 | import networkx as nx 4 | 5 | """ 6 | Experimental concept mining technique. 7 | 8 | Takes a weighted list of terms and returns the central WordNet Synsets for those terms. 9 | 10 | terms = [(u'gruppe', 1.0308515122783903), (u'skarabider', 1.0283549292633594), (u'utbredelse', 1.0255859517307202), 11 | (u'slekt', 1.02428182204782), (u'h\xf8re', 1.0236714839113259), (u'oldenborre', 1.0212900521382506), 12 | (u'art', 1.0206984849354699), (u'leve', 1.0181363254554074), (u'scarabaeoidea', 1.0178225609839886), 13 | (u'melolonthinae', 1.0137513034441485), (u'stor', 1.010924267580678), 14 | (u'underfamilie', 1.010754657594739), (u'sm\xe5', 1.0095704409677608), (u'underart', 1.0092106422118465), 15 | (u'millimeter', 1.009166357579949), (u'dekkvinge', 1.0077143382226799), (u'afrika', 1.0073806489590316), 16 | (u'pronotum', 1.0065698471665749), (u'gullbasse', 1.0065362930098589), (u'amerika', 1.0062859498858436), 17 | (u'parasittveps', 1.0060005533568113), (u'parasitt', 1.0060), (u'veps', 1.0060), (u'australia', 1.0058669303831191), 18 | (u'finnes', 1.0057317293824628), 19 | (u'gammaridea', 1.0056893049779934), (u'lang', 1.00559787835873), (u'familie', 1.0054340556687946), 20 | (u'parasitoide', 1.0053805526619595), (u'gjerne', 1.00537793821737), (u'taksonomisk', 1.005285633510592), 21 | (u'jorda', 1.005110740380347), (u's\xf8r', 1.0049040009707946), (u'asia', 1.0047862880057254), 22 | (u'panamerikansk', 1.0046940558397519), (u'svart', 1.0046021508509742), (u'inndeling', 1.0045280362592695), 23 | (u'omfatte', 1.0045210803669595), (u'cm', 1.004380770050699), (u'cetoniinae', 1.0042811151167377), 24 | (u'kjent', 1.0042292429979698), (u'praktskarabide', 1.0042044116189903), (u'\xe9n', 1.0041848737627979), 25 | (u'rutelinae', 1.0041419789388988), (u'ganske', 1.0039716523273752), (u'lys', 1.0036022222516647)] 26 | 27 | c = ConceptClassifier() 28 | c.concepts(terms) 29 | 30 | [(u'*ROOT*', 0.6080213703117321), 31 | ... 
32 | (u'social_group', 0.06255333412324249), 33 | (u'collection', 0.06117590141836287), 34 | (u'cognition', 0.050449579778215264), 35 | (u'position', 0.050446927402192525), 36 | (u'h\xf8re', 0.04431870710158872), 37 | (u'kin', 0.02783831781626138), 38 | (u'gruppe', 0.02340480985900115), 39 | (u'content', 0.022421702867730226), 40 | (u'direction', 0.022415540534822163), 41 | (u'genealogy', 0.012455836088714487), 42 | (u'idea', 0.009964750496621828), 43 | (u'compass_point', 0.009950798727198123), 44 | (u'lineage', 0.005723405829441778), 45 | (u'concept', 0.0044278959856953815), 46 | (u'cardinal_compass_point', 0.004396465865842209), 47 | (u'family', 0.002965638904359013), 48 | (u'category', 0.0019660280031428582), 49 | (u'south', 0.0018952926436456173), 50 | (u'slekt', 0.0011273697128777484), 51 | (u'familie', 0.0011066250305903473), 52 | (u'kind', 0.0008694833352302178), 53 | (u's\xf8r', 0.0007068536091788623), 54 | (u'type', 0.0003767580775104511), 55 | (u'art', 0.00014272125955615077)] 56 | 57 | plt.figure(3,figsize=(12,12)) 58 | nx.draw(c.g, with_labels=True, font_size=8) 59 | 60 | """ 61 | 62 | 63 | def _create_subgraph(paths, root): 64 | g = nx.Graph() 65 | 66 | for ss_path in paths: 67 | for ss1, ss2 in zip(ss_path, ss_path[1:]): 68 | ss1_name = ss1[0] 69 | weight = ss1[1] 70 | ss2_name = ss2[0] 71 | 72 | g.add_node(ss1_name) 73 | g.add_node(ss2_name) 74 | g.add_edge(ss1_name, ss2_name, {'w': weight}) 75 | 76 | if ss2_name == root: 77 | break 78 | 79 | return g 80 | 81 | 82 | def _path_root(paths): 83 | path_root = None 84 | 85 | for ss_level in zip(*[reversed(p) for p in paths]): 86 | names = [x[0] for x in ss_level] 87 | 88 | if len(set(names)) == 1: 89 | path_root = names[0] 90 | 91 | return path_root 92 | 93 | 94 | class ConceptFinder(object): 95 | def __init__(self, lang='nob'): 96 | super(ConceptFinder, self).__init__() 97 | 98 | from nltk.corpus import wordnet as wordnet 99 | 100 | self.lang = lang 101 | self.wordnet = wordnet 102 | self.graph = None 103 | 104 | def concepts(self, terms): 105 | paths = self._synset_paths(terms) 106 | root = _path_root(paths).split('.')[0] 107 | self.graph = _create_subgraph(paths, root) 108 | 109 | return sorted(nx.eigenvector_centrality_numpy(self.graph, weight='w').items(), 110 | key=lambda x: x[1], reverse=True) 111 | 112 | def _top_synset(self, term): 113 | ss = self.wordnet.synsets(term) 114 | 115 | if len(ss) >= 1: 116 | return ss[0] 117 | 118 | return None 119 | 120 | def _synset_paths(self, terms): 121 | paths = [] 122 | 123 | for term, score, ss in [(term, score, self.wordnet.synsets(term, lang=self.lang)) for term, score in terms]: 124 | if len(ss) >= 1: 125 | paths.append([(term, score)] + [(x[0].name().split('.')[0], 1.0) 126 | for x in sorted(ss[0]._shortest_hypernym_paths(True).items(), 127 | key=itemgetter(1))]) 128 | 129 | return paths 130 | -------------------------------------------------------------------------------- /fabfile.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from fabric.contrib.console import confirm 4 | from fabric.network import disconnect_all 5 | from fabric.operations import sudo, run, put, local 6 | from fabric.state import env 7 | 8 | ANACONDA_MD5 = 'c3100392685b5a62c8509c0588ce9376' 9 | ANACONDA_URL = 'https://3230d63b5fc54e62148e-c95ac804525aac4b6dba79b00b39d1d3.ssl.cf1.rackcdn.com/Anaconda-2.3.0-Linux-x86_64.sh' 10 | ANACONDA_FN = 'Anaconda-2.3.0-Linux-x86_64.sh' 11 | ANACONDA_INSTALL_PATH = '/opt/anaconda' 12 | 13 | NEO4J_URL = 
'http://neo4j.com/artifact.php?name=neo4j-community-2.2.3-unix.tar.gz' 14 | NEO4J_FN = 'neo4j-community-2.2.3-unix.tar.gz' 15 | NEO4J_FOLDER = 'neo4j-community-2.2.3' 16 | 17 | ESLIB_INSTALL_PATH = '/opt/eslib' 18 | 19 | 20 | def provision_server(): 21 | sudo('apt-get update -qq -y > /dev/null') 22 | install_debian_packages(['screen', 'unzip']) 23 | install_anaconda() 24 | install_elasticsearch() 25 | install_neo4j() 26 | install_self() 27 | restart_server() 28 | 29 | 30 | def restart_server(): 31 | sudo('shutdown -r now') 32 | 33 | 34 | def anaconda_downloaded(): 35 | r = run('test -f %s' % ANACONDA_FN, quiet=True) 36 | 37 | if getattr(r, 'return_code') != 0: 38 | return False 39 | 40 | r = run('md5sum %s' % ANACONDA_FN) 41 | md5, _ = getattr(r, 'stdout').split() 42 | 43 | if md5 != ANACONDA_MD5: 44 | if confirm("Anaconda archive corrupt. Delete?"): 45 | run('rm %s' % ANACONDA_FN) 46 | else: 47 | disconnect_all() 48 | sys.exit(1) 49 | 50 | return True 51 | 52 | 53 | def anaconda_installed(): 54 | r = run('test -d %s' % ANACONDA_INSTALL_PATH, quiet=True) 55 | 56 | if getattr(r, 'return_code') != 0: 57 | return False 58 | 59 | return True 60 | 61 | 62 | def install_anaconda(): 63 | if not anaconda_downloaded(): 64 | run('wget --quiet %s' % ANACONDA_URL) 65 | if not anaconda_installed(): 66 | sudo('bash %s -b -p %s' % (ANACONDA_FN, ANACONDA_INSTALL_PATH)) 67 | sudo('echo "export PATH=/opt/anaconda/bin:$PATH" > /etc/profile.d/anaconda.sh') 68 | 69 | 70 | def package_installed(pkg): 71 | r = run("dpkg-query -W -f='${Status}' %s 2>/dev/null | grep -c \"ok installed\"" % pkg, quiet=True) 72 | 73 | if getattr(r, 'return_code') != 0: 74 | return False 75 | 76 | return True 77 | 78 | 79 | def install_java(): 80 | if not package_installed('default-jre'): 81 | sudo('apt-get install -y -qq default-jre') 82 | 83 | 84 | def install_elasticsearch(): 85 | install_java() 86 | 87 | if not package_installed('elasticsearch'): 88 | run('wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -') 89 | run('echo "deb http://packages.elastic.co/elasticsearch/1.5/debian stable main" | sudo tee -a /etc/apt/sources.list') 90 | sudo('apt-get update -qq -y > /dev/null') 91 | sudo('apt-get install -qq -y elasticsearch') 92 | put('provision/elasticsearch.yml', '/etc/elasticsearch/elasticsearch.yml', use_sudo=True) 93 | sudo('/usr/share/elasticsearch/bin/plugin -i elasticsearch/marvel/latest') 94 | sudo('update-rc.d elasticsearch defaults 95 10') 95 | sudo('service elasticsearch start') 96 | 97 | 98 | def install_neo4j(): 99 | if not file_exists(NEO4J_FN): 100 | run('wget -q %s -O %s' % (NEO4J_URL, NEO4J_FN)) 101 | 102 | sudo('adduser --home /home/neo4j --system --shell /bin/bash neo4j') 103 | sudo('(cd /home/neo4j; tar zxf ~/%s)' % NEO4J_FN, user='neo4j') 104 | sudo('yes neo4j|(HEADLESS=true; /home/neo4j/%s/bin/neo4j-installer install)' % NEO4J_FOLDER) 105 | sudo('echo "neo4j soft nofile 40000" > /etc/security/limits.conf') 106 | sudo('echo "neo4j hard nofile 40000" >> /etc/security/limits.conf') 107 | sudo('echo "session required pam_limits.so" > /etc/pam.d/common-session') 108 | sudo('echo "session required pam_limits.so" > /etc/pam.d/common-session-noninteractive') 109 | put('provision/neo4j-server.properties', '.') 110 | sudo('mv neo4j-server.properties /home/neo4j/%s/conf/neo4j-server.properties' % NEO4J_FOLDER) 111 | sudo('chown neo4j /home/neo4j/%s/conf/neo4j-server.properties' % NEO4J_FOLDER) 112 | sudo('chmod 644 /home/neo4j/%s/conf/neo4j-server.properties' % NEO4J_FOLDER) 113 | 
sudo('service neo4j-service start') 114 | 115 | 116 | def install_neo4j_user(): 117 | put('provision/auth', '.') 118 | sudo('mv auth /home/neo4j/neo4j-community-2.2.2/data/dbms/auth') 119 | sudo('chown neo4j /home/neo4j/neo4j-community-2.2.2/data/dbms/auth') 120 | sudo('chmod 600 /home/neo4j/neo4j-community-2.2.2/data/dbms/auth') 121 | sudo('service neo4j-service restart') 122 | 123 | 124 | def file_exists(fn): 125 | r = run('test -f %s' %fn, quiet=True) 126 | 127 | if getattr(r, 'return_code') != 0: 128 | return False 129 | 130 | return True 131 | 132 | 133 | def install_self(): 134 | local('git archive master -o master.zip --format zip --prefix comperio-text-analytics/') 135 | put('master.zip', '.') 136 | sudo('unzip master') 137 | 138 | 139 | def vagrant(): 140 | env.user = 'vagrant' 141 | env.hosts = ['127.0.0.1:2222'] 142 | #env.key_filename = '~/.vagrant.d/insecure_private_key' 143 | env.key_filename = '.vagrant/machines/default/virtualbox/private_key' 144 | env.disable_known_hosts = True 145 | 146 | 147 | def install_debian_packages(packages=None): 148 | if packages and isinstance(packages, basestring): 149 | packages = [p.strip() for p in packages.split(';')] 150 | 151 | if packages: 152 | sudo('apt-get install -qq -y %s' % ' '.join(packages)) 153 | 154 | -------------------------------------------------------------------------------- /python-client/bin/corpus2lemmatizedtext.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import re 4 | import sys 5 | import re 6 | import codecs 7 | from gensim.corpora import Dictionary 8 | from gensim.models.tfidfmodel import TfidfModel 9 | from gensim import corpora 10 | 11 | from es_text_analytics.data.wikipedia import WikipediaDataset 12 | from es_text_analytics.data.elasticsearch_dataset import ElasticsearchDataset 13 | from nltk.corpus import stopwords 14 | 15 | from es_text_analytics.tagger import NOBTagger, install_hunpos 16 | from es_text_analytics.lemmatizer import OrdbankLemmatizer 17 | 18 | def fast_tokenize(str): 19 | return [x.lower() for x in re.findall('[^\W\d_]+', str, re.MULTILINE | re.UNICODE)] 20 | 21 | def normalize_es(doc): 22 | return doc['_source']['article'] 23 | 24 | 25 | def normalize_wiki(doc): 26 | return doc['id'], doc['article.text'] 27 | 28 | 29 | def get_tokenized(page, sw): 30 | return [token for token in fast_tokenize(page) if token not in sw and len(token) > 1] 31 | 32 | 33 | class IterableDataset(object): 34 | def __init__(self, args_dataset, stopwords, nobtag, lemmatizer): 35 | self.dataset = args_dataset 36 | self.tagger = nobtag 37 | self.lem = lemmatizer 38 | self.stopwords = stopwords 39 | 40 | def __len__(self): 41 | return sum(1 for _ in self.dataset) 42 | 43 | def __iter__(self): 44 | for page in self.dataset: 45 | tokens = get_tokenized(page[1], self.stopwords) 46 | sent = self.tagger.tag(tokens, tokenize=False) 47 | yield page[0], " ".join([self.lem.lemmatize(word, tag) for word, tag in sent]).lower() 48 | 49 | 50 | # wikidata download https://dumps.wikimedia.org/nowiki/latest/nowiki-latest-pages-articles.xml.bz2 51 | def main(): 52 | parser = ArgumentParser( 53 | description='wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information') 54 | parser.add_argument('-ds', '--dataset', default='wiki', help='What kind of dataset to use. 
(wiki or es)') 55 | parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it') 56 | parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki') 57 | parser.add_argument('--model-id', default='model', help='Filename for created model.') 58 | parser.add_argument('-q', '--query', default=None, help='Elasticsearch: Query to use to fetch documents') 59 | parser.add_argument('--index', help='Elasticsearch: index to read from.') 60 | parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.') 61 | parser.add_argument('--data-dir', default='.', help='Directory to save the generated models and vocabularies into.') 62 | 63 | opts = parser.parse_args() 64 | 65 | dump_fn = opts.dump_file 66 | limit = int(opts.limit) if opts.limit else None 67 | 68 | data_type = opts.dataset.lower() 69 | if data_type not in ['es', 'wiki']: 70 | logging.error("Invalid dataset type %s" % data_type) 71 | parser.print_usage() 72 | exit(-1) 73 | limit = None 74 | if opts.limit: 75 | limit = int(opts.limit) 76 | if not dump_fn and data_type in ['wiki']: 77 | logging.error('--dump-file required for wiki dataset') 78 | sys.exit(1) 79 | 80 | query = opts.query 81 | index = opts.index 82 | doc_type = opts.doc_type 83 | if data_type == 'es' and index is None: 84 | logging.error( 85 | "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter") 86 | sys.exit(1) 87 | 88 | data_dir = opts.data_dir 89 | model_id = opts.model_id 90 | model_fn = '%s' % (model_id) 91 | if data_dir: 92 | model_fn = '%s%s' % (data_dir, model_fn) 93 | logging.info("Writing models to %s." % model_fn) 94 | 95 | if data_type == 'es': 96 | logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query)) 97 | dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type, query=query, 98 | normalize_func=normalize_es) 99 | else: 100 | logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) 101 | dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki) 102 | sw = set(stopwords.words('norwegian')) 103 | #install_hunpos() 104 | nobtag = NOBTagger() 105 | ord = OrdbankLemmatizer() 106 | 107 | corpus = IterableDataset(dataset, sw, nobtag, ord) 108 | with codecs.open(model_fn, mode='w', encoding='utf-8') as fn: 109 | for document in corpus: 110 | logging.info(document[0]) 111 | fn.write(str(document[0]) + '\t' + document[1] + '\n') 112 | 113 | 114 | 115 | if __name__ == '__main__': 116 | logformat = '%(asctime)s %(name)-12s: %(message)s' 117 | logging.basicConfig(level=logging.INFO, format=logformat, filename='wiki-topicmodel.log' ) 118 | console = logging.StreamHandler() 119 | formatter = logging.Formatter(logformat) 120 | console.setFormatter(formatter) 121 | logging.getLogger('').addHandler(console) 122 | main() 123 | # ########## sample usage 124 | # 125 | #--model-type=lda -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 126 | #--model-type=lda -ds es --n-topics 10 --index wiki --query "{\"query\":{\"match\": {\"_all\":\"kongo\"}}}" 127 | #--model-type=word2vec -ds es --index wiki --w2v_window=7 --w2v_size=75 128 | #--model-type=hdp -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 129 | 130 | -------------------------------------------------------------------------------- 
/python-client/es_text_analytics/data/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from abc import ABCMeta 4 | from abc import abstractmethod 5 | from urlparse import urlparse 6 | 7 | import requests 8 | from elasticsearch.client import IndicesClient 9 | 10 | BULK_REQUEST_SIZE = 100 11 | 12 | CONLL_U_FIELDS = ['index', 'form', 'lemma', 'cpostag', 'postag', 'feats', 13 | 'head', 'deprel', 'deps', 'misc'] 14 | 15 | 16 | def fn_from_url(url): 17 | """ 18 | Extract the final part of an url in order to get the filename of a downloaded url. 19 | 20 | :param url: url string 21 | :type url : str|unicode 22 | :rtype : str|unicode 23 | :return: url filename part 24 | """ 25 | parse = urlparse(url) 26 | 27 | return os.path.basename(parse.path) 28 | 29 | 30 | def download_file(url, dest_path): 31 | """ 32 | Download the file pointed to by the url to the path specified or the defult dataset location. 33 | If the dfile is already present at the path it will not be downloaded and the path to this file 34 | is returned. 35 | 36 | :param url: url string pointing to the file 37 | :type url : str|unicode 38 | :param dest_path: path to location where the file will be stored locally 39 | :type dest_path : str|unicode 40 | :rtype : str|unicode 41 | :return: path to the downloaded dataset 42 | """ 43 | if not os.path.exists(dest_path): 44 | os.makedirs(dest_path) 45 | 46 | fn = fn_from_url(url) 47 | full_fn = os.path.join(dest_path, fn) 48 | 49 | if os.path.exists(full_fn): 50 | logging.info('Dataset archive %s already exists in %s ...' % (fn, dest_path)) 51 | else: 52 | r = requests.get(url, stream=True) 53 | with open(full_fn, 'wb') as f: 54 | for chunk in r.iter_content(chunk_size=1024): 55 | if chunk: # filter out keep-alive new chunks 56 | f.write(chunk) 57 | f.flush() 58 | 59 | return full_fn 60 | 61 | 62 | def project_path(): 63 | """ 64 | Returns the path to the root project directory. 65 | 66 | :rtype : str|unicode 67 | :return: The root project path as a string. 68 | """ 69 | self_path = os.path.dirname(os.path.abspath(__file__)) 70 | 71 | return os.path.abspath(os.path.join(self_path, '..', '..', '..')) 72 | 73 | 74 | def default_dataset_path(): 75 | """ 76 | Returns the data default dataset location in the project directory. 77 | 78 | :rtype : str|unicode 79 | :return: the path to the default dataset location 80 | """ 81 | return os.path.join(project_path(), 'data') 82 | 83 | 84 | def parse_conll(fileobj, field_indices=None): 85 | """ 86 | Parse a CONLL formatted dependency treebank file. Supports the CONLL-U format 87 | with UTF-8 encoding. 88 | 89 | :param fileobj: A file like instance with CONLL formatted text. 90 | :rtype : generator 91 | """ 92 | sentence = [] 93 | 94 | for line in fileobj: 95 | line = line.decode('utf-8') 96 | line = line.strip() 97 | 98 | if line == '': 99 | if sentence: 100 | yield sentence 101 | 102 | sentence = [] 103 | 104 | continue 105 | 106 | row = line.split(u'\t') 107 | row[0] = int(row[0]) 108 | 109 | if field_indices: 110 | row = [row[i] for i in field_indices] 111 | 112 | sentence.append(row) 113 | 114 | if sentence: 115 | yield sentence 116 | 117 | 118 | class Dataset: 119 | """ 120 | Base class for self-installable and self-indexable datasets. 121 | 122 | Contains base methods for downloading the dataset and creating Elasticsearch index based on it. 
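    A rough usage sketch (illustrative only; NDTDataset below is one concrete subclass, and passing
    an Elasticsearch client is optional):

        from elasticsearch import Elasticsearch
        from es_text_analytics.data.ndt_dataset import NDTDataset

        dataset = NDTDataset()
        dataset.install(Elasticsearch())   # download the archive, then rebuild and fill the index
        for doc in dataset:                # or iterate over the normalized documents directly
            print doc['content']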
123 | """ 124 | __metaclass__ = ABCMeta 125 | 126 | def __init__(self, index=None, doc_type=None, dataset_path=None, dataset_fn=None, 127 | normalize_func=None): 128 | """ 129 | Initialize the instance with optional Elasticsearch index information. 130 | 131 | :param index: Elasticsearch index where the dataset will be stored if indexed. 132 | :type index: str|unicode 133 | :param doc_type: 134 | :type doc_type: str|unicode 135 | :param dataset_path: location where dataset wiil be downloaded. If None the default location is used. 136 | :type dataset_path: None|str|unicode 137 | :param dataset_fn: Location of the dataset. If this argument is used the file specified will be used and 138 | the archive will not be downloaded automatically if not present. 139 | :type dataset_fn: None|str|unicode 140 | :param normalize_func: Function to normalize corpus documemt format. Default will create a dict with a field 141 | that contains the full document text. Exact format is corpus dependent. 142 | :type normalize_func: function|None 143 | """ 144 | self.es_index = index 145 | self.es_doc_type = doc_type 146 | self.dataset_fn = dataset_fn 147 | self.archive_fn = None 148 | self.normalize_func = normalize_func 149 | 150 | self.dataset_path = dataset_path 151 | 152 | if not dataset_path: 153 | self.dataset_path = default_dataset_path() 154 | 155 | @abstractmethod 156 | def _iterator(self): 157 | """ 158 | Subclasses should implement this method returning a generator yielding 159 | dicts with the document data. 160 | """ 161 | raise NotImplementedError 162 | 163 | def __iter__(self): 164 | if self.archive_fn: 165 | if not self.dataset_fn: 166 | raise ValueError() 167 | 168 | for doc in self._iterator(): 169 | try: 170 | if self.normalize_func: 171 | doc = self.normalize_func(doc) 172 | except Exception: 173 | logging.error('Unable to normalize doc ...') 174 | 175 | yield doc 176 | 177 | def index(self, es): 178 | """ 179 | Index the dataset in the given index with archive in the dataset location. 180 | 181 | :param es: Elasticsearch client instance 182 | :type es: elasticsearch.client.Elasticsearch 183 | :rtype : elasticsearch.client.Elasticsearch 184 | :return: :raise ValueError: 185 | """ 186 | docs = [] 187 | count = 0 188 | 189 | for doc in self: 190 | if '_id' in doc: 191 | docs += [{'index': dict(_index=self.es_index, _type=self.es_doc_type, _id=doc['_id'])}, doc] 192 | else: 193 | docs += [{'index': {'_index': self.es_index, '_type': self.es_doc_type }}, doc] 194 | count += 1 195 | 196 | if len(docs) % (2 * BULK_REQUEST_SIZE) == 0: 197 | es.bulk(index=self.es_index, doc_type=self.es_doc_type, body=docs) 198 | logging.info('Added %d documents ...' % count) 199 | docs = [] 200 | 201 | if docs: 202 | es.bulk(index=self.es_index, doc_type=self.es_doc_type, body=docs) 203 | logging.info('Added %d documents ...' % count) 204 | 205 | return self 206 | 207 | def delete_index(self, es): 208 | """ 209 | Delete the dataset index. 210 | 211 | :param es: Elasticsearch client instance 212 | :type es: elasticsearch.client.Elasticsearch 213 | :rtype : NewsgroupsDataset 214 | """ 215 | ic = IndicesClient(es) 216 | ic.delete(index=self.es_index, ignore=[400, 404]) 217 | 218 | return self 219 | 220 | def install(self, es=None): 221 | """ 222 | Install and optionally index the dataset. 223 | WARNING: Deletes the index before installing. 224 | 225 | :param es: Pass an Elasticsearch client instance to index the dataset. 
226 | :type es: None|elasticsearch.client.Elasticsearch 227 | :rtype : Dataset 228 | """ 229 | if not self.archive_fn: 230 | logging.info("No installable archive for this dataset ...") 231 | else: 232 | if self.dataset_fn: 233 | logging.warn('Dataset initialized directly or already installed ...') 234 | return self 235 | else: 236 | self.dataset_fn = download_file(self.archive_fn, dest_path=self.dataset_path) 237 | 238 | if es: 239 | logging.info("Creating Elasticsearch index %s ..." % self.index) 240 | self.delete_index(es) 241 | self.index(es) 242 | 243 | return self 244 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/decompounder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from numpy import argmin 3 | 4 | from es_text_analytics.lemmatizer import OrdbankLemmatizer, ORDBANK_BM_DEFAULT_PATH 5 | 6 | 7 | """ 8 | Simple decompounder that matches parts of words to fullform entries in Norsk Ordbank. 9 | 10 | TODO: Better match. Maybe match lemmas with fixed spacing characters. This should yield less overgeneration. 11 | TODO: Only match allowed POS tag sequences in a compound word. For example adjectives must be prepositioned 12 | and so on. 13 | TODO: Annotate compound entries in Norsk Ordbank. This would yield better size disambiguation heuristics and 14 | avoid keeping compound forms that are listed on Norsk Ordbank. 15 | TODO: Return POS tags for internal word components. 16 | """ 17 | 18 | # allowed POS tags that can form compounds 19 | COMPOUND_POS_MAP = { 20 | 'simple': ['SUBST', 'ADJ', 'ADV', 'VERB'] 21 | } 22 | 23 | # The decompounder searches for fullform matches from the beginning of the string creates a tree 24 | # of match combinations for each initial match. This is implemented in decompound() and decompund_inner(). 25 | # 26 | # The resulting trees are flattened to lists of word forms that the compoun can be split into. This is 27 | # implemented in flatten() and flatten_inner(). 28 | # 29 | # Fullform matches can be filtered on length (very short words are probably not "proper" words) and POS tag 30 | # (compounds are for example not productively formed from closed word classes in Norwegian). This is implemented 31 | # in fullform_index_match(). 32 | # 33 | # NOTE: Not optimized. Can probably be made a lot more efficient. 34 | 35 | def fullform_index_match(string, fullform_index, pos_match_field=None, pos_format='simple'): 36 | """ 37 | Partial string matching to fullform index. See main comment. 38 | 39 | :param string: Partial word that is being decompounded. 40 | :type string: str|unicode 41 | :param fullform_index: Fullform index to Norsk Ordbank entries. 42 | :type fullform_index: dict[str|unicode, list[dict]] 43 | :param pos_match_field: Field in fullform index entry to match POS tag to. 44 | :type pos_match_field: None|str|unicode 45 | :param pos_format: POS tag type, must correspond to POS tag field in fullform index. 46 | :type pos_format: str|unicode 47 | :rtype : bool 48 | :return: True if matching entry within constraints is found in index. 49 | """ 50 | if pos_match_field: 51 | return [match for match in fullform_index.get(string, []) 52 | if match[pos_match_field] in COMPOUND_POS_MAP[pos_format]] 53 | else: 54 | return string in fullform_index 55 | 56 | 57 | def decompound_inner(word, fullform_index, start=0, min_match=2, pos_match_field=None, pos_format='simple'): 58 | """ 59 | Decompound tree builder. See main comment. 
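    For example, with a small fullform index containing only "lampe" and "kost", decompounding
    u'lampekost' is expected to yield the single tree [[u'lampe', [u'kost']]]; flatten() later turns
    such trees into plain candidate lists such as [[u'lampe', u'kost']].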
60 | 61 | :param word: Word that is being decompounded. 62 | :type word: str|unicode 63 | :param fullform_index: Fullform index to Norsk Ordbank entries. 64 | :type fullform_index: dict[str|unicode, list[dict]] 65 | :param start: Decompound from this position in the word. 66 | :type start: int|long 67 | :param min_match: Minimum string length to match. 68 | :type min_match: int|long 69 | :param pos_match_field: Field in fullform index entry to match POS tag to. 70 | :type pos_match_field: None|str|unicode 71 | :param pos_format: POS tag type, must correspond to POS tag field in fullform index. 72 | :type pos_format: str|unicode 73 | :rtype : list[str|unicode|list] 74 | :return: List based tree structure of partial matches. 75 | """ 76 | compounds = [] 77 | for i in range(start+1, len(word) + 1): 78 | if fullform_index_match(word[start:i], fullform_index, pos_format=pos_format, pos_match_field=pos_match_field) \ 79 | and i - start > min_match: 80 | # recursively collect sequential matches 81 | compounds.append([word[start:i]] + 82 | decompound_inner(word, fullform_index, start=i, min_match=min_match, 83 | pos_match_field=pos_match_field, pos_format=pos_format)) 84 | 85 | return compounds 86 | 87 | 88 | def flatten_inner(compound_tree): 89 | """ 90 | Flatten single tree structure with fullform mathes for a compound word. See main comment. 91 | 92 | :param compound_tree: List based tree structure of partial matches. 93 | :type compound_tree: list[str|unicode|list] 94 | :rtype : list[list[str|unicode]] 95 | :return: List of partial matches for each branch of the passed tree. 96 | """ 97 | results = [] 98 | 99 | # recursive base case, leaf of tree 100 | if len(compound_tree) == 1: 101 | return [[compound_tree[0]]] 102 | 103 | head = compound_tree[0] 104 | 105 | # recursively traverse each branch 106 | for rest in compound_tree[1:]: 107 | results += [[head] + tail for tail in flatten_inner(rest)] 108 | 109 | return results 110 | 111 | 112 | def flatten(compound_forest): 113 | """ 114 | Flatten a list of compund match trees. See main comment. 115 | 116 | :param compound_forest: List of list based tree structures with partial fullform matches. 117 | :type compound_forest: list[list[str|unicode|list]] 118 | :rtype : list[list[str|unicode]] 119 | :return: List of partial matches for each branch of eaxh tree of the passed list of trees. 120 | """ 121 | result = [] 122 | 123 | for tree in compound_forest: 124 | result += flatten_inner(tree) 125 | 126 | return result 127 | 128 | 129 | def decompound(word, fullform_index, min_match=2, pos_match_field=None, pos_format='simple'): 130 | """ 131 | Main decompounder entry point. See main comment. 132 | 133 | Filters out compound word decompositions that does not exactly match the passed word 134 | 135 | :param word: Word that is being decompounded. 136 | :type word: str|unicode 137 | :param fullform_index: Use this fullform index during decompounding. Must conform to the structure 138 | used by the OrdbankLemmatizer class. 139 | :type fullform_index: dict[str|unicode, list[dict]] 140 | :param min_match: Minimum string length to match. 141 | :type min_match: int|long 142 | :param pos_match_field: Field in fullform index entry to match POS tag to. 143 | :type pos_match_field: None|str|unicode 144 | :param pos_format: POS tag type, must correspond to POS tag field in fullform index. 145 | :type pos_format: str|unicode 146 | :rtype : list[list[str|unicode]] 147 | :return: List of compound word decompositions into substrings. 
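    Illustrative call (assuming the component forms are listed as fullforms in the index; other
    candidate splits may also appear depending on the index contents):

        decompound(u'lampekostbatteri', fullform_index)
        # expected to include [u'lampe', u'kost', u'batteri'] among the returned candidates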
148 | """ 149 | candidates = flatten(decompound_inner(word, fullform_index, min_match=min_match, 150 | pos_format=pos_format, pos_match_field=pos_match_field)) 151 | 152 | return [c for c in candidates if sum([len(p) for p in c]) == len(word)] 153 | 154 | 155 | class NOBDecompounder(object): 156 | """ 157 | Class implementing a simple decompounding strategy for Norwegian Bokmål using the 158 | Norsk Ordbank lexical database. 159 | 160 | The decompounder uses heuristics and word matching to find and disambiguate 161 | decompounding candidates. 162 | """ 163 | def __init__(self, fullform_index=None, min_match=2, pos_format='simple'): 164 | """ 165 | :param fullform_index: Use this fullform index during decompounding. Must conform to the structure 166 | used by the OrdbankLemmatizer class. 167 | :type fullform_index: dict[str|unicode, list[dict]] 168 | :param min_match: Minimum length of subword that will be matched. 169 | :type min_match: int|long 170 | :param pos_format: POS tag type used for disambiguation. Must match fullform index content. 171 | :type pos_format: str|unicode 172 | """ 173 | super(NOBDecompounder, self).__init__() 174 | 175 | self.min_match = min_match 176 | self.pos_format = pos_format 177 | self.fullform_index = fullform_index 178 | 179 | if not self.fullform_index: 180 | self.fullform_index = OrdbankLemmatizer(ORDBANK_BM_DEFAULT_PATH, feat_norm=self.pos_format).fullform_index 181 | 182 | def decompound(self, word): 183 | """ 184 | Decompose the passed compound word if possible. 185 | 186 | :param word: Word to decompound. 187 | :type word: str|unicode 188 | :rtype : None|list[string|unicode] 189 | :return: A list of words that compose the compound word or None if no decomposition is found. 190 | """ 191 | candidates = decompound(word.lower(), self.fullform_index, min_match=self.min_match, 192 | pos_match_field='pos', pos_format=self.pos_format) 193 | 194 | if not candidates: 195 | return None 196 | else: 197 | # if there are several candidates we will pick the one with the simplest decomposition, ie. the 198 | # one with the fewest elements. 
199 | # if there are still several candidates argmin implicitly chooses the first one since this should 200 | # usually have the longest last component with the current matching strategy 201 | return candidates[argmin([len(c) for c in candidates])] 202 | -------------------------------------------------------------------------------- /python-client/bin/build-wiki-topicmodel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from argparse import ArgumentParser 3 | import re 4 | import sys 5 | import re 6 | from gensim import corpora 7 | from gensim.corpora import Dictionary 8 | from gensim.models.lsimodel import LsiModel 9 | from gensim.models.ldamodel import LdaModel 10 | from gensim.models.word2vec import Word2Vec 11 | from gensim.models.hdpmodel import HdpModel 12 | from gensim.models.tfidfmodel import TfidfModel 13 | 14 | from es_text_analytics.data.wikipedia import WikipediaDataset 15 | from es_text_analytics.data.elasticsearch_dataset import ElasticsearchDataset 16 | from nltk.corpus import stopwords 17 | 18 | 19 | def fast_tokenize(str): 20 | return [x.lower() for x in re.findall('[^\W\d_]+', str, re.MULTILINE | re.UNICODE)] 21 | 22 | 23 | def normalize_es(doc): 24 | return doc['_source']['article'] 25 | 26 | 27 | def normalize_wiki(doc): 28 | return doc['article.text'] 29 | 30 | 31 | def normalize_file(doc): 32 | return doc.split('\t')[1] 33 | 34 | 35 | def get_tokenized(page, sw): 36 | return [token for token in fast_tokenize(page) if token not in sw and len(token) > 1] 37 | 38 | 39 | 40 | import re 41 | import string 42 | import tarfile 43 | import codecs 44 | from es_text_analytics.data.dataset import Dataset 45 | from elasticsearch.client import Elasticsearch 46 | from elasticsearch.helpers import scan 47 | 48 | """ 49 | Elasticsearch as data source 50 | 51 | """ 52 | 53 | 54 | class FileDataset(Dataset): 55 | """ 56 | Class encapsulating using a text file as datasource. Assumes file contains lines with documents. 57 | The formatting of the lines are up to you, but remember to extract what you need in the normalize_func 58 | """ 59 | 60 | def __init__(self, dump_fn, num_articles=None, normalize_func=None): 61 | super(FileDataset, self).__init__( normalize_func=normalize_func) 62 | self.dataset_fn = dump_fn 63 | 64 | 65 | def _iterator(self): 66 | with codecs.open(self.dataset_fn, 'r', encoding='utf-8') as f: 67 | for line in f: 68 | yield line 69 | 70 | 71 | 72 | 73 | class IterableDataset(object): 74 | def __init__(self, args_dataset, stopwords, vocabulary, doc2bow=True): 75 | self.dataset = args_dataset 76 | self.doc2bow = doc2bow 77 | self.stopwords = stopwords 78 | self.vocabulary = vocabulary 79 | 80 | def __len__(self): 81 | return sum(1 for _ in self.dataset) 82 | 83 | def __iter__(self): 84 | for page in self.dataset: 85 | doc = get_tokenized(page, self.stopwords) 86 | if self.doc2bow: 87 | yield self.vocabulary.doc2bow(doc) 88 | else: 89 | yield doc 90 | 91 | 92 | # wikidata download https://dumps.wikimedia.org/nowiki/latest/nowiki-latest-pages-articles.xml.bz2 93 | def main(): 94 | parser = ArgumentParser( 95 | description='wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information') 96 | parser.add_argument('-ds', '--dataset', default='wiki', help='What kind of dataset to use. 
(wiki,es,file)') 97 | parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it') 98 | parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki') 99 | parser.add_argument('--model-id', default='model', help='Filename for created model.') 100 | parser.add_argument('--model-type', default='lsi', help='Model type (lsi, lda, word2vec, hdp, vocabulary).') 101 | parser.add_argument('--n-topics', default=10, help='Number of topics to model.') 102 | parser.add_argument('--n-passes', default=1, help='Number of passes for LDA model.') 103 | parser.add_argument('--w2v-size', default=100, help='size of Word2Vec context.') 104 | parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.') 105 | parser.add_argument('-q', '--query', default=None, help='Elasticsearch: Query to use to fetch documents') 106 | parser.add_argument('--index', help='Elasticsearch: index to read from.') 107 | parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.') 108 | parser.add_argument('--data-dir', help='Directory to save the generated models and vocabularies into.') 109 | parser.add_argument('--vocab', help='Prebuilt Vocabulary file. Use this to avoid having to generate one.') 110 | 111 | opts = parser.parse_args() 112 | 113 | model_type = opts.model_type.lower() 114 | if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']: 115 | logging.error("Invalid model type %s" % model_type) 116 | parser.print_usage() 117 | exit(-1) 118 | 119 | logging.info("Using model type %s" % model_type) 120 | 121 | dump_fn = opts.dump_file 122 | limit = int(opts.limit) if opts.limit else None 123 | 124 | data_type = opts.dataset.lower() 125 | if data_type not in ['es', 'wiki', 'file']: 126 | logging.error("Invalid dataset type %s" % data_type) 127 | parser.print_usage() 128 | exit(-1) 129 | limit = None 130 | if opts.limit: 131 | limit = int(opts.limit) 132 | if not dump_fn and data_type in ['wiki']: 133 | logging.error('--dump-file required for wiki dataset') 134 | sys.exit(1) 135 | 136 | query = opts.query 137 | index = opts.index 138 | doc_type = opts.doc_type 139 | if data_type == 'es' and index is None: 140 | logging.error( 141 | "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter") 142 | sys.exit(1) 143 | 144 | n_topics = int(opts.n_topics) 145 | n_passes = int(opts.n_passes) 146 | logging.info("Using %d topics." % n_topics) 147 | data_dir = opts.data_dir 148 | model_id = opts.model_id 149 | model_fn = '%s_%s_%d' % (model_id, model_type, n_topics) 150 | if data_dir: 151 | model_fn = '%s/%s' % (data_dir, model_fn) 152 | if model_type == 'word2vec': 153 | w2v_size = int(opts.w2v_size) 154 | w2v_window = int(opts.w2v_window) 155 | model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size) 156 | logging.info("Writing models to %s." 
% model_fn) 157 | 158 | if data_type == 'es': 159 | logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query)) 160 | dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type, query=query, 161 | normalize_func=normalize_es) 162 | elif data_type == 'wiki': 163 | logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) 164 | dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki) 165 | elif data_type == 'file': 166 | logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) 167 | dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file) 168 | vocab_file = opts.vocab 169 | vocab = Dictionary() 170 | sw = set(stopwords.words('norwegian')) 171 | if not vocab_file or model_type == 'vocabulary': 172 | vocab.add_documents([get_tokenized(page, sw) for page in dataset]) 173 | vocab.filter_extremes() 174 | vocab.compactify() 175 | vocab.save(model_fn + '.vocab') 176 | else: 177 | vocab = Dictionary.load(vocab_file) 178 | if model_type == 'vocabulary': 179 | return 180 | tfidf = TfidfModel(dictionary=vocab) 181 | if model_type == 'lsi': 182 | corpus = IterableDataset(dataset, sw, vocab) 183 | model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, 184 | id2word=vocab) 185 | elif model_type == 'lda': 186 | corpus = IterableDataset(dataset, sw, vocab) 187 | model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, 188 | id2word=vocab) 189 | 190 | elif model_type == 'word2vec': 191 | corpus = IterableDataset(dataset, sw, vocab, doc2bow=False) 192 | corpus.dictionary = vocab 193 | model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size) 194 | elif model_type == 'hdp': 195 | corpus = IterableDataset(dataset, sw, vocab) 196 | model = HdpModel(corpus=tfidf[corpus], id2word=vocab) 197 | 198 | logging.info(model) 199 | model.save(model_fn) 200 | 201 | 202 | if __name__ == '__main__': 203 | logformat = '%(asctime)s %(name)-12s: %(message)s' 204 | logging.basicConfig(level=logging.INFO, format=logformat, filename='wiki-topicmodel.log' ) 205 | console = logging.StreamHandler() 206 | formatter = logging.Formatter(logformat) 207 | console.setFormatter(formatter) 208 | logging.getLogger('').addHandler(console) 209 | main() 210 | 211 | # ########## sample usage 212 | # 213 | #--model-type=lda -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 214 | #--model-type=lda -ds es --n-topics 10 --index wiki --query "{\"query\":{\"match\": {\"_all\":\"kongo\"}}}" 215 | #--model-type=word2vec -ds es --index wiki --w2v_window=7 --w2v_size=75 216 | #--model-type=hdp -d F:/projects/elasticsearch-enterprise-system/data/nowiki-20150901-pages-articles.xml.bz2 -l 100 --n-topics 10 217 | #--model-type=lda -ds file --n-topics 10 -d f:/projects/comperio-text-analytics/models/dump 218 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/tagger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import os 4 | import re 5 | from tarfile import TarFile 6 | from zipfile import ZipFile 7 | import sys 8 | from subprocess import Popen, PIPE 9 | 10 | from nltk.tag.hunpos import HunposTagger 11 | from textblob.base import BaseTagger 12 | 13 | from es_text_analytics.data.dataset import project_path, 
download_file 14 | from es_text_analytics.tokenizer import NOTokenizer 15 | 16 | 17 | 18 | 19 | 20 | 21 | # TextBlob compatible part-of-speech tagger for Norwegian. 22 | 23 | # default HunPos model locations 24 | NOB_TAGGER_DEFAULT_MODEL_FN = os.path.join(project_path(), 'models', 'nob-tagger-default-model') 25 | NNO_TAGGER_DEFAULT_MODEL_FN = os.path.join(project_path(), 'models', 'nno-tagger-default-model') 26 | 27 | HUNPOS_URL_MAP = { 28 | 'linux2': 'https://hunpos.googlecode.com/files/hunpos-1.0-linux.tgz', 29 | 'darwin': 'https://hunpos.googlecode.com/files/hunpos-1.0-macosx.tgz', 30 | 'win32': 'https://hunpos.googlecode.com/files/hunpos-1.0-win.zip' 31 | } 32 | 33 | HUNPOS_SUBDIR_MAP = { 34 | 'win32': 'hunpos-1.0-win', 35 | 'darwin': 'hunpos-1.0-macosx', 36 | 'linux2': 'hunpos-1.0-linux' 37 | } 38 | 39 | 40 | def obt_to_universal_tag(form, pos, feats): 41 | """ 42 | Maps OBT POS tags and features as found in the NDT to universal POS tags as described in 43 | http://universaldependencies.github.io/docs/u/pos/index.html 44 | 45 | The mapping is not complete or completely precises because of discrepancies between the OBT/NDT 46 | annotation and this tagset. For example: 47 | 48 | - AUX is not annotated in NDT and would need a wordlist to extract properly. 49 | - NUM is done mostly heuristically since NDT does nt encode numbers. Ideally numbers and similar 50 | entities should be normalized before tagging. 51 | 52 | :param form: NDT word form. 53 | :type form: str|unicode 54 | :param pos: OBT pos tag. 55 | :type pos: str|unicode 56 | :param feats: OBT features encoded as | separated string as in NDT. 57 | :type feats: str|unicode 58 | :rtype : str|unicode 59 | :return: Normalized universal POS tag. 60 | """ 61 | feats = feats.split('|') 62 | 63 | if re.search('\d', form): 64 | return 'NUM' 65 | 66 | if pos == 'adj': 67 | return 'ADJ' 68 | if pos == 'adv': 69 | return 'ADV' 70 | if pos == 'konj': 71 | return 'CONJ' 72 | if pos == 'det' and ('' in feats or '' in feats): 73 | return 'NUM' 74 | if pos == 'det': 75 | return 'DET' 76 | if pos == 'interj': 77 | return 'INTJ' 78 | # we'll include dates among the proper nouns 79 | if pos == 'subst' and ('prop' in feats or '' in feats or 'fork' in feats or '' in feats): 80 | return 'PROPN' 81 | if pos == 'subst' and 'symb' in feats: 82 | return 'SYM' 83 | if pos == 'subst': 84 | return 'NOUN' 85 | if pos == 'pron': 86 | return 'PRON' 87 | if pos in ['clb', '', '', '', '', '']: 88 | return 'PUNCT' 89 | if pos == 'sbu': 90 | return 'SCONJ' 91 | if pos == 'symb': 92 | return 'SYM' 93 | if pos in ['inf-merke', 'verb']: 94 | return 'VERB' 95 | if pos == 'prep': 96 | return 'ADP' 97 | 98 | return 'X' 99 | 100 | 101 | def obt_to_simple(form, pos, feats): 102 | """ 103 | Mapping from OBT to a simple POS tag set including a small set of basic features into the tag. 104 | 105 | - Heuristically extracts number tag. 106 | - Normalizes punctuation to single tag. 107 | - Includes pronoun type. 108 | - Normalizes proper noun tags and features. 109 | - includes passive feature on verbs. 110 | 111 | :param form: NDT word form. 112 | :type form: str|unicode 113 | :param pos: OBT pos tag. 114 | :type pos: str|unicode 115 | :param feats: OBT features encoded as | separated string as in NDT. 116 | :type feats: str|unicode 117 | :rtype : str|unicode 118 | :return: Normalized POS tag. 
119 | """ 120 | feats = feats.split('|') 121 | 122 | if re.search('\d', form): 123 | return 'NUM' 124 | 125 | if pos == 'det' and ('' in feats or '' in feats): 126 | return 'NUM' 127 | 128 | if pos in ['clb', '', '', '', '', '']: 129 | return 'PUNKT' 130 | 131 | if pos == 'pron': 132 | for feat in ['sp', 'pers', 'poss', 'refl']: 133 | if feat in feats: 134 | return ('%s_%s' % (pos, feat)).upper() 135 | 136 | if pos == 'subst': 137 | if 'sym' in feats: 138 | return 'SYMB' 139 | 140 | # include dates 141 | for feat in ['prop', '', 'fork', '']: 142 | if feat in feats: 143 | return 'SUBST_PROP' 144 | 145 | if pos == 'verb' and 'pass' in feats: 146 | return 'VERB_PASS' 147 | 148 | return pos.upper() 149 | 150 | 151 | # maps feature normalization identifiers to the functions 152 | FEATURES_MAP = {'universal': obt_to_universal_tag, 153 | 'simple': obt_to_simple, 154 | # removes all features and includes just the bare POS tag 155 | 'no-feats': lambda form, pos, feats: pos, 156 | # includes all features execpt blank ones 157 | 'all-feats': lambda form, pos, feats: '%s_%s' % (pos, '_'.join([f for f in feats.split('|') if f != '_']))} 158 | 159 | 160 | def install_hunpos(): 161 | """ 162 | Downloads and install system appropriate HunPos binaries in the default location. 163 | 164 | :rtype : None 165 | """ 166 | models_dir = os.path.join(project_path(), 'models') 167 | 168 | hunpos_archive_fn = download_file(HUNPOS_URL_MAP[sys.platform], models_dir) 169 | 170 | if sys.platform == 'win32': 171 | with ZipFile(hunpos_archive_fn) as f: 172 | f.extractall(models_dir) 173 | else: 174 | with TarFile(hunpos_archive_fn) as f: 175 | f.extractall(models_dir) 176 | 177 | os.remove(hunpos_archive_fn) 178 | 179 | 180 | def hunpos_path(): 181 | """ 182 | Returns the system specifiuc default install directory for HunPos binaries. 183 | 184 | :rtype : str|unicode 185 | :return: 186 | """ 187 | return os.path.join(project_path(), 'models', HUNPOS_SUBDIR_MAP[sys.platform]) 188 | 189 | 190 | def hunpos_tag_bin(): 191 | """ 192 | Path to system specific hunpos-tag binary. 193 | 194 | :rtype : str|unicode 195 | :return: 196 | """ 197 | if sys.platform == 'win32': 198 | return os.path.join(hunpos_path(), 'hunpos-tag.exe') 199 | else: 200 | return os.path.join(hunpos_path(), 'hunpos-tag') 201 | 202 | 203 | def hunpos_train_bin(): 204 | """ 205 | Path to system specific hunpos-train binary. 206 | 207 | :rtype : str|unicode 208 | :return: 209 | """ 210 | if sys.platform == 'win32': 211 | return os.path.join(hunpos_path(), 'hunpos-train.exe') 212 | else: 213 | return os.path.join(hunpos_path(), 'hunpos-train') 214 | 215 | 216 | def parse_hunpos_train_output(output): 217 | """ 218 | Parses hunpos-train output and collects the reported statistics. 219 | 220 | Includes: 221 | - error messages (errors) 222 | - # of sentences and # of tokens (sentences, tokens) 223 | - # of uppercase and lowercase tokens (n_upper, n_lower) 224 | - # of different POS tags (tag_card) 225 | 226 | :param output: String with newline separated output from hunpos-train 227 | :rtype : dict 228 | :return: Dict with statistics reported by hunpos-train. 
229 | """ 230 | stats = {'errors': []} 231 | 232 | for line in output.split('\n'): 233 | line = line.strip() 234 | 235 | m = re.match('(\d+) tokens', line) 236 | if m: 237 | stats['tokens'] = int(m.group(1)) 238 | 239 | m = re.match('(\d+) sentences', line) 240 | if m: 241 | stats['sentences'] = int(m.group(1)) 242 | 243 | m = re.match('(\d+) different tag', line) 244 | if m: 245 | stats['tag_card'] = int(m.group(1)) 246 | 247 | m = re.match('(\d+) lowercase', line) 248 | if m: 249 | stats['n_lower'] = int(m.group(1)) 250 | 251 | m = re.match('(\d+) uppercase tokens', line) 252 | if m: 253 | stats['n_upper'] = int(m.group(1)) 254 | 255 | m = re.match('theta = (\d\.\d+)', line) 256 | if m: 257 | stats['theta'] = float(m.group(1)) 258 | 259 | # the error format is not documented so this will suffice for now 260 | m = re.search('error', line, re.IGNORECASE) 261 | if m: 262 | stats['errors'] += line 263 | 264 | return stats 265 | 266 | 267 | def train_hunpos_model(seq, model_fn): 268 | """ 269 | Trains a HunPos POS tagger on the sentences passed as seq using the external hunpos-train binary. 270 | 271 | Models use UTF-8 encoding. 272 | 273 | :param seq: Iterator with sentences. Sentences are iterators with word form/pos tag tuples. 274 | :param model_fn: File where the resulting model will be stored. 275 | :type model_fn: str|unicode 276 | :rtype : dict 277 | :return: Reported statistics printed by hunpos-train 278 | """ 279 | 280 | # We'll be doind it simple here. 281 | # Just write all the data to stdin and catch potential errors on stderr afterwards. 282 | train_proc = Popen([hunpos_train_bin(), model_fn], stdin=PIPE, stderr=PIPE) 283 | 284 | for sent in seq: 285 | for form, tag in sent: 286 | line = b'%s\t%s\n' % (form, tag) 287 | line = line.encode('utf-8') 288 | train_proc.stdin.write(line) 289 | 290 | train_proc.stdin.write('\n') 291 | 292 | train_proc.stdin.close() 293 | 294 | # parse the output 295 | # hunpos-trai reports results and errors on stderr 296 | stats = parse_hunpos_train_output(train_proc.stderr.read()) 297 | 298 | train_proc.wait() 299 | 300 | # check if the stats reports any errors 301 | if len(stats['errors']) != 0: 302 | logging.error('HunPos failed with error messages ...') 303 | 304 | for error in stats['errors']: 305 | logging.error(error) 306 | 307 | return stats 308 | 309 | 310 | def clean_input(string): 311 | return re.sub('\n', ' ', string) 312 | 313 | 314 | class NOBTagger (BaseTagger, object): 315 | """ 316 | TextBlob compatible Norsk Bokmål POS tagger class based on the NLTK HunPos wrapper. 317 | """ 318 | def __init__(self, model_fn=None): 319 | self.tokenizer = NOTokenizer() 320 | self.tagger = HunposTagger(NOB_TAGGER_DEFAULT_MODEL_FN, 321 | hunpos_tag_bin(), encoding='utf-8') 322 | 323 | def tag(self, text, tokenize=True): 324 | 325 | if tokenize: 326 | text = clean_input(text) 327 | text = self.tokenizer.tokenize(text) 328 | 329 | return self.tagger.tag(text) 330 | 331 | 332 | class NNOTagger (BaseTagger, object): 333 | """ 334 | TextBlob compatible Norsk Nynorsk POS tagger class based on the NLTK HunPos wrapper. 
335 | """ 336 | def __init__(self, model_fn=None): 337 | self.tokenizer = NOTokenizer() 338 | self.tagger = HunposTagger(NNO_TAGGER_DEFAULT_MODEL_FN, 339 | hunpos_tag_bin(), encoding='utf-8') 340 | 341 | def tag(self, text, tokenize=True): 342 | text = clean_input(text) 343 | 344 | if tokenize: 345 | text = self.tokenizer.tokenize(text) 346 | 347 | return self.tagger.tag(text) 348 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/term_weight_provider.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | import logging 3 | from math import log 4 | import re 5 | 6 | from elasticsearch.client import IndicesClient 7 | from gensim.corpora import Dictionary 8 | from gensim.models import TfidfModel 9 | 10 | ES_TERM_WEIGHTING_INDEX_DEFAULT_NAME = 'es_term_weighting_index' 11 | 12 | ES_TERMWEIGHTING_INDEX_SETTINGS = {"mappings": { 13 | "term": {"properties": {"form": {"type": "string", "index": "not_analyzed"}, "value": {"type": "float"}}}}} 14 | 15 | 16 | class TermWeightingProvider: 17 | """ 18 | Base class for term weighting providers handling common weighting transforms, basic missing terms policies and 19 | the weight retrieval API. 20 | """ 21 | __metaclass__ = ABCMeta 22 | 23 | def __init__(self, missing='error', inverse=False, sublinear=False): 24 | """ 25 | :param missing: Missing terms policy. 'error' raises KeyError, 'ignore' removes missing terms from the result, 26 | and a number value returns that value for missing terms. 27 | :type missing: str|unicode|int|long|float 28 | :param inverse: Inverse the frequency ratio (for IDF and similar weighttings). 29 | :type inverse:bool 30 | :param sublinear: Log the frequency ratio. Applied after inversion if applicable. 31 | :type sublinear:bool 32 | :raise ValueError: When passed invalid missing argument. 33 | """ 34 | self.inverse = inverse 35 | self.sublinear = sublinear 36 | 37 | self.default_value = None 38 | 39 | if isinstance(missing, (str, unicode)): 40 | self.missing_value_policy = missing 41 | elif isinstance(missing, (int, long, float)): 42 | self.missing_value_policy = 'value' 43 | self.default_value = float(missing) 44 | else: 45 | raise ValueError 46 | 47 | def _handle_missing_term(self, term): 48 | """ 49 | Implements missing terms according to configured policy 50 | 51 | :param term: 52 | :type term:str|unicode 53 | :rtype : None|float 54 | :return: :raise KeyError: 55 | """ 56 | if self.missing_value_policy == 'error': 57 | raise KeyError(term) 58 | elif self.missing_value_policy == 'ignore': 59 | return None 60 | else: 61 | return self.default_value 62 | 63 | def __getitem__(self, terms): 64 | """ 65 | Retrieves the weights for one or more term. 66 | 67 | :param terms: single or list of terms 68 | :type terms: str|unicode|list|tuple 69 | :rtype : (str|unicode, float)|list 70 | :raise NotImplementedError: 71 | """ 72 | # single term retrievals are returned in a special format 73 | single = False 74 | 75 | if isinstance(terms, (str, unicode)): 76 | terms = [terms] 77 | single = True 78 | 79 | # retrieve the term weight map implemented in the superclass 80 | tw = self._weights_for_terms(terms) 81 | 82 | w = [] 83 | 84 | for term in terms: 85 | if term in tw: 86 | w.append((term, tw[term])) 87 | else: 88 | # check missing value policy 89 | val = self._handle_missing_term(term) 90 | 91 | if val: 92 | w.append((term, val)) 93 | 94 | # do transforms 95 | if self.inverse: 96 | w = [(term, 1. 
/ freq) for term, freq in w] 97 | 98 | if self.sublinear: 99 | w = [(term, log(freq)) for term, freq in w] 100 | 101 | # if we're returning a single or null result we unwrap the list 102 | if single and (len(w) == 1): 103 | return w[0] 104 | elif len(w) == 0: 105 | return None 106 | else: 107 | return w 108 | 109 | @abstractmethod 110 | def _weights_for_terms(self, terms): 111 | """ 112 | Implement this method to retrieve the actual weights for the terms in the query. 113 | If a term is missing it should not be included, the base class will handle missing values. 114 | 115 | :param terms: 116 | :type terms:list 117 | :rtype : dict 118 | :raise NotImplementedError: 119 | """ 120 | raise NotImplementedError 121 | 122 | 123 | def weight_map_from_term_counts(term_count_iter, min_count=1): 124 | """ 125 | Create a map of terms and their frequencies from a list of terms and counts. 126 | 127 | :param term_count_iter: An iterator with tuples of terms and counts, ie, (term, count). 128 | :param min_count: Minimum count value that will be added to the weight map: 129 | :type min_count: int|long 130 | :rtype : dict 131 | :return: A dict with the terms as keys and the frequency ratios as values. 132 | """ 133 | weight_map = {} 134 | total = 0 135 | 136 | for term, count in term_count_iter: 137 | total += count 138 | 139 | if count >= min_count: 140 | weight_map[term] = weight_map.get(term, 0) + count 141 | 142 | for term in weight_map.keys(): 143 | w = weight_map[term] / float(total) 144 | 145 | weight_map[term] = w 146 | 147 | return weight_map 148 | 149 | 150 | def term_counts_line_parser(line, delim='\t', term_index=1, count_index=2): 151 | """ 152 | Parses a line from a file with terms and counts as line items. 153 | 154 | The defaults f.ex. parses "34\tba\t45\n" into ('ba', 45) 155 | 156 | :param line: 157 | :type line: unicode|str 158 | :param delim: Character used to split tokens. 159 | :type delim: unicode|str 160 | :param term_index: Token index for the term element. 161 | :type term_index: int|long 162 | :param count_index: Token index for the count element. 163 | :type count_index: int|long 164 | :rtype : (unicode|str, int|long) 165 | :return: Tuple with the term and count from the passed line string. 166 | """ 167 | tokens = line.split(delim) 168 | 169 | return tokens[term_index], int(tokens[count_index]) 170 | 171 | 172 | def term_counts_iter_from_file(f, line_parser=None): 173 | """ 174 | Reads term counts from a file with term/count pairs as line items. 175 | 176 | :param f: A FileIO instance 177 | :type f: FileIO 178 | :param line_parser: Function that parses a line into a term, count tuple. Default parses Gensim Dictionary 179 | text format. 180 | :type line_parser: function 181 | :rtype : generator 182 | """ 183 | if not line_parser: 184 | line_parser = term_counts_line_parser 185 | 186 | for line in f: 187 | yield line_parser(line) 188 | 189 | 190 | class SimpleTermWeightProvider(TermWeightingProvider): 191 | """ 192 | Simple term weight provider for term count ratios supplied by an iterator. Takes options for returning 193 | logged or inverse ratios. 
194 | """ 195 | 196 | def __init__(self, term_count_iter, **kwargs): 197 | super(SimpleTermWeightProvider, self).__init__(**kwargs) 198 | 199 | self.weight_map = weight_map_from_term_counts(term_count_iter) 200 | 201 | def _weights_for_terms(self, terms): 202 | # just return the whole weight dict 203 | return self.weight_map 204 | 205 | 206 | class ESTermAggregationWeightProvider(TermWeightingProvider): 207 | """ 208 | Term weight provider for DF/IDF values based on an Elasticsearch index using the terms aggregator. 209 | 210 | Defaults to logged IDF values. 211 | """ 212 | 213 | def __init__(self, es, index, doc_type, field, **kwargs): 214 | super(ESTermAggregationWeightProvider, self).__init__(**kwargs) 215 | 216 | self.es = es 217 | self.index = index 218 | self.doc_type = doc_type 219 | self.field = field 220 | 221 | def _weights_for_terms(self, terms): 222 | q = {"size": 0, 223 | "aggs": {"df": {"terms": {"field": self.field, "size": len(terms), 224 | "include": '|'.join([re.escape(term) for term in terms])}}}} 225 | 226 | resp = self.es.search(index=self.index, doc_type=self.doc_type, body=q) 227 | 228 | try: 229 | n_doc = resp['hits']['total'] 230 | tf = dict((e['key'], e['doc_count'] / float(n_doc)) for e in resp['aggregations']['df']['buckets']) 231 | except KeyError: 232 | # malformed response 233 | raise RuntimeError 234 | 235 | return dict(tf) 236 | 237 | 238 | class GensimIDFProvider(TermWeightingProvider): 239 | """ 240 | IDF TermWeightingProvider based on a Gensim Dictionary using the Gensim TfIdf model. 241 | """ 242 | def __init__(self, dictionary, **kwargs): 243 | super(GensimIDFProvider, self).__init__(**kwargs) 244 | 245 | if {'missing', 'linear', 'linear'} <= set(kwargs): 246 | logging.warning('<%s> argumemts to GensimIDFProvider can generate incorrect weights and should not be used' 247 | % '|'.join({'missing', 'linear', 'linear'})) 248 | 249 | if isinstance(dictionary, (str, unicode)): 250 | dictionary = Dictionary.load(dictionary) 251 | self.dictionary = dictionary 252 | self.tfidf = TfidfModel(dictionary=dictionary, normalize=False) 253 | 254 | def _weights_for_terms(self, terms): 255 | return {self.dictionary[bow_id]: val for bow_id, val in self.tfidf[self.dictionary.doc2bow(terms)]} 256 | 257 | 258 | class ESTermIndexWeightingProvider(TermWeightingProvider): 259 | """ 260 | Class implementing storage of term weights in an Elasticsearch index. 261 | """ 262 | def __init__(self, es, index=None, initial_weights=None, **kwargs): 263 | """ 264 | :param es: Elasticsearch instance from py-elasticsearch API. 265 | :type es:elasticsearch.Elasticsearch 266 | :param index: Name of the index where term weights are stored. If it doesn't exist it is created. 267 | :type index:str|unicode 268 | :param initial_weights: Iterator with term/weight pairs that will be added to the index during initialization. 269 | """ 270 | super(ESTermIndexWeightingProvider, self).__init__(**kwargs) 271 | 272 | self.es = es 273 | self.index = index 274 | 275 | if not self.index: 276 | self.index = ES_TERM_WEIGHTING_INDEX_DEFAULT_NAME 277 | 278 | ESTermIndexWeightingProvider._create_weight_index(self.es, self.index) 279 | 280 | if initial_weights: 281 | ESTermIndexWeightingProvider._add_terms_iter(self.es, self.index, initial_weights) 282 | 283 | @staticmethod 284 | def _create_weight_index(es, index): 285 | """ 286 | Creates the index with the right mapping if it doesn't exist. 
287 | 
288 |         :param es:
289 |         :type es:elasticsearch.Elasticsearch
290 |         :param index:
291 |         :type index:str|unicode
292 |         """
293 |         ic = IndicesClient(es)
294 | 
295 |         if ic.exists(index):
296 |             logging.info('Index %s already exists ...' % index)
297 |         else:
298 |             ic.create(index=index, body=ES_TERMWEIGHTING_INDEX_SETTINGS)
299 | 
300 |     @staticmethod
301 |     def _add_terms_iter(es, index, iter, bulk_size=1000):
302 |         """
303 |         Adds term documents to the index from the term weight pairs in the iterator.
304 | 
305 |         :param es:
306 |         :type es:elasticsearch.Elasticsearch
307 |         :param index:
308 |         :type index:str|unicode
309 |         :param iter:
310 |         """
311 |         bulk_actions = []
312 |         count = 0
313 | 
314 |         for term, weight in iter:
315 |             count += 1
316 | 
317 |             bulk_actions += [{'index': {'_index': index, '_type': 'term'}},
318 |                              {'form': term, 'value': weight}]
319 | 
320 |             if len(bulk_actions) % (2 * bulk_size) == 0:
321 |                 es.bulk(index=index, doc_type='term', body=bulk_actions)
322 |                 logging.info('Added %d documents ...' % count)
323 |                 bulk_actions = []
324 | 
325 |         if bulk_actions:
326 |             es.bulk(index=index, doc_type='term', body=bulk_actions)
327 |             logging.info('Added %d documents ...' % count)
328 | 
329 |     def _weights_for_terms(self, terms):
330 |         should_clauses = [{'match': {'form': term}} for term in terms]
331 | 
332 |         resp = self.es.search(index=self.index, doc_type='term',
333 |                               body={'query': {
334 |                                   'bool': {
335 |                                       'should': should_clauses
336 |                                   }
337 |                               },
338 |                                     'fields': ['form', 'value']})
339 | 
340 |         return {hit['fields']['form'][0]: float(hit['fields']['value'][0]) for hit in resp['hits']['hits']}
341 | 
--------------------------------------------------------------------------------
/python-client/es_text_analytics/data/aviskorpus.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import os
4 | import re
5 | import unicodedata
6 | from StringIO import StringIO
7 | from gzip import GzipFile
8 | from tarfile import TarFile
9 | from zipfile import ZipFile
10 | 
11 | from bs4 import BeautifulSoup
12 | 
13 | from es_text_analytics.data.dataset import Dataset
14 | 
15 | AVISKORPUS_ARCHIVE_URL='http://www.nb.no/sbfil/tekst/norsk_aviskorpus.zip'
16 | 
17 | # Map of corpus sections, corresponding files inside the main archive and codes for the newspaper in
18 | # each section. See http://www.nb.no/sprakbanken/show?serial=sbr-4&lang=nb for details.
19 | CORPUS_SECTIONS = {
20 |     '1': {'paths': ['1/19981013-20010307.gz',
21 |                     '1/20010308-20030116.gz',
22 |                     '1/20030116-20050403.gz'],
23 |           'name': 'ti-aviser-1998-2005',
24 |           'sources': ['aa', 'ap', 'bt', 'da', 'db',
25 |                       'dn', 'fv', 'nl', 'sa', 'vg']},
26 |     '2': {'paths': ['2/aa.tar.gz',
27 |                     '2/ap.tar.gz',
28 |                     '2/bt.tar.gz',
29 |                     '2/da.tar.gz',
30 |                     '2/db.tar.gz',
31 |                     '2/dn.tar.gz',
32 |                     '2/fv.tar.gz',
33 |                     '2/nl.tar.gz',
34 |                     '2/sa.tar.gz',
35 |                     '2/vg.tar.gz'],
36 |           'name': 'ti-aviser-2005-2011',
37 |           'sources': ['aa', 'ap', 'bt', 'da', 'db',
38 |                       'dn', 'fv', 'nl', 'sa', 'vg']},
39 |     '3': {'paths': ['3/dt.tar.gz',
40 |                     '3/fi.tar.gz',
41 |                     '3/hd.tar.gz',
42 |                     '3/ho.tar.gz',
43 |                     '3/kk.tar.gz',
44 |                     '3/na.tar.gz',
45 |                     '3/sh.tar.gz',
46 |                     '3/so.tar.gz',
47 |                     '3/sp.tar.gz',
48 |                     '3/vb.tar.gz',
49 |                     '3/vt.tar.gz'],
50 |           'name': 'andre-aviser',
51 |           'sources': ['dt', 'fi', 'hd', 'ho', 'kk', 'na',
52 |                       'sh', 'so', 'sp', 'vb', 'vt']}}
53 | 
54 | 
55 | def match_or_none(pattern, string, flags=0):
56 |     """
57 |     Small wrapper for regexes with one match group which may or may not match.
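    Illustrative examples (added; the patterns follow the section 2 header format
    used further below in this module):

        match_or_none('^##A (\d\d)>$', '##A 99>')    # -> '99'
        match_or_none('^##A (\d\d)>$', 'no header')  # -> None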
58 | 59 | Matches on whole string not just the beginning (ie. uses re.search). 60 | 61 | :param pattern: Regular expression with a single match group. 62 | :type pattern: unicode|str 63 | :param string: String to match against. 64 | :type string: unicode|str 65 | :param flags: 66 | :rtype : str|unicode|None 67 | :return: The match group string. 68 | """ 69 | m = re.search(pattern, string, flags=flags) 70 | 71 | if m: 72 | if len(m.groups()) == 1: 73 | return m.group(1) 74 | 75 | return m.groups() 76 | 77 | 78 | def section_1_header_line(line): 79 | """ 80 | Detects and extracts url from section 1 header line. 81 | 82 | :param line: Line from section 1 data file. 83 | :type line: str|unicode 84 | :rtype : None|unicode|str 85 | :return: Return the url in the header or None if the passed string is not a section 1 header line. 86 | """ 87 | m = re.search('', line) 88 | 89 | if m: 90 | return m.group(1) 91 | 92 | 93 | def section_1_parser(fileobj): 94 | """ 95 | Parser for section 1 data files. 96 | Returns a generator with dict instances with the article data. 97 | 98 | :param fileobj: A file like instance with section 1 formatted text. 99 | :rtype : generator 100 | """ 101 | line = fileobj.readline() 102 | tokens = [] 103 | doc = None 104 | 105 | while line: 106 | line = line.decode('latin1') 107 | line = line.strip() 108 | 109 | if line == '': 110 | pass 111 | elif section_1_header_line(line): 112 | # skip empty documents 113 | if tokens: 114 | doc['tokens'] = [unicodedata.normalize('NFC', unicode(token)) for token in tokens] 115 | doc['corpus_section'] = 1 116 | 117 | yield doc 118 | tokens = [] 119 | 120 | url = section_1_header_line(line) 121 | fileobj.readline() 122 | source_code = match_or_none('^$', fileobj.readline().strip()) 123 | year = match_or_none('^$', fileobj.readline().strip()) 124 | pub_year = int(year) if year else None 125 | month = match_or_none('^$', fileobj.readline().strip()) 126 | pub_month = int(month) if month else None 127 | day = match_or_none('^$', fileobj.readline().strip()) 128 | pub_day = int(day) if day else None 129 | 130 | doc = {'url': url, 'source': source_code, 131 | 'pub_year': pub_year, 'pub_month': pub_month, 'pub_day': pub_day} 132 | else: 133 | # article content consists of tokens, one on each line 134 | tokens.append(line) 135 | 136 | line = fileobj.readline() 137 | 138 | # catch the last document 139 | if doc and tokens: 140 | doc['tokens'] = [unicodedata.normalize('NFC', unicode(token)) for token in tokens] 141 | doc['corpus_section'] = 1 142 | 143 | yield doc 144 | 145 | 146 | def section_2_header_line(line): 147 | """ 148 | Detects and extracts url from section 2 header line. 149 | 150 | :param line: Line from section 2 data file. 151 | :type line: str|unicode 152 | :rtype : None|unicode|str 153 | :return: Return the url in the header or None if the passed string is not a section 1 header line. 154 | """ 155 | m = re.search('##U #(http://.*)>', line) 156 | 157 | if m: 158 | return m.group(1) 159 | 160 | 161 | def section_2_parser(fileobj): 162 | """ 163 | Parser for section 2 data files. 164 | Returns a generator with dict instances with the article data. 165 | 166 | :param fileobj: A file like instance with section 2 formatted text. 
167 | :rtype : generator 168 | """ 169 | line = fileobj.readline() 170 | text = '' 171 | doc = None 172 | 173 | while line: 174 | line = line.decode('latin1') 175 | line = line.strip() 176 | 177 | if line == '' or line == '|': 178 | pass 179 | elif section_2_header_line(line): 180 | # skip articles with no content 181 | if text and doc: 182 | # content consists of text lines with header sections delimited by | characters and 183 | # sentences delimited by paragraph characters 184 | text = text.replace(u'¶', u'|') 185 | doc['sentences'] = [unicodedata.normalize('NFC', unicode(sent.strip())) 186 | for sent in text.split(u'|') if sent.strip() != ''] 187 | doc['corpus_section'] = 2 188 | 189 | yield doc 190 | text = '' 191 | 192 | url = section_2_header_line(line) 193 | source_code = match_or_none('^##B (\w\w)>$', fileobj.readline().strip()) 194 | year = match_or_none('^##A (\d\d)>$', fileobj.readline().strip()) 195 | pub_year = int(year) if year else None 196 | month = match_or_none('^##M (\d\d)>$', fileobj.readline().strip()) 197 | pub_month = int(month) if month else None 198 | day = match_or_none('^##D (\d\d)>$', fileobj.readline().strip()) 199 | pub_day = int(day) if day else None 200 | 201 | doc = {'url': url, 'source': source_code, 202 | 'pub_year': pub_year, 'pub_month': pub_month, 'pub_day': pub_day} 203 | else: 204 | text += line 205 | 206 | line = fileobj.readline() 207 | 208 | # yield the last document in the file 209 | if doc and text: 210 | text = text.replace(u'¶', u'|') 211 | doc['sentences'] = [unicodedata.normalize('NFC', unicode(sent.strip())) 212 | for sent in text.split(u'|') if sent.strip() != ''] 213 | doc['corpus_section'] = 2 214 | yield doc 215 | 216 | 217 | def section_3_parser(fileobj): 218 | """ 219 | Parser for section 3 data files. 220 | Returns a dict instance with the article data. 221 | 222 | :param fileobj: A file like instance with section 2 formatted text. 223 | :rtype : dict 224 | """ 225 | xml_doc = BeautifulSoup(fileobj) 226 | 227 | # each article is in a separate XML file. We pick out metadata from attribute tags and div 228 | # tags with the type attribute. The content is in a div tag with the text attribute. 229 | metadata = dict([(a['name'], unicodedata.normalize('NFC', unicode(a['value']).strip())) 230 | for a in xml_doc.find_all('attribute')]) 231 | text = [unicodedata.normalize('NFC', unicode(t.text).strip()) 232 | for t in xml_doc.find('div', type='text').find_all('p')] 233 | 234 | doc = dict([(t.attrs['type'], unicodedata.normalize('NFC', unicode(t.text).strip())) for t in xml_doc.find_all('div') 235 | if 'type' in t.attrs and t.attrs['type'] != 'text']) 236 | doc['text'] = [t for t in text if t != ''] 237 | doc['metadata'] = metadata 238 | doc['corpus_section'] = 3 239 | 240 | return doc 241 | 242 | 243 | def iterator(dataset_fn, sections=None, sources=None): 244 | """ 245 | Generator that yields all the documents in the korpus. 246 | The generator can return only specific newspaper or sections if specified in 247 | the arguments, 248 | 249 | :param dataset_fn: Dataset archive file. None uses default location. 250 | :type dataset_fn: str|unicode|None 251 | :param sections: Sections to include. The default None yields all sections. 252 | :type sections: list[int|long]|None 253 | :param sources: Newspaper sources to include. The default None yields all sources. 
254 | :type sources: list[str|unicode]|None 255 | :rtype : generator 256 | """ 257 | count = 0 258 | 259 | with ZipFile(dataset_fn) as zf: 260 | # corpus content files are compressed and archived in various ways inside the corpus zip archive. 261 | if not sections or 1 in sections: 262 | for fn in CORPUS_SECTIONS['1']['paths']: 263 | logging.info('Reading %s ...' % fn) 264 | 265 | with GzipFile(fileobj=StringIO(zf.read(fn))) as iz: 266 | try: 267 | for doc in section_1_parser(iz): 268 | if sources is None or doc['source'] in sources: 269 | yield doc 270 | count += 1 271 | 272 | if count != 0 and count % 1000 == 0: 273 | logging.info("Read %d files ..." % count) 274 | except Exception: 275 | logging.error("Parse failure while reading %s ..." % fn) 276 | 277 | if not sections or 2 in sections: 278 | for fn in CORPUS_SECTIONS['2']['paths']: 279 | if sources and not os.path.basename(fn).split('.')[0] in sources: 280 | continue 281 | 282 | logging.info('Reading %s ...' % fn) 283 | 284 | # TarFile with compression doesn't work inside a ZipFile 285 | # Uncompress to a StringIO object and hand that to TarFile 286 | with GzipFile(fileobj=StringIO(zf.read(fn))) as iz: 287 | with TarFile(fileobj=iz) as tf: 288 | for member in tf.getmembers(): 289 | 290 | if member.isfile(): 291 | tif = tf.extractfile(member) 292 | 293 | try: 294 | for doc in section_2_parser(tif): 295 | yield doc 296 | count += 1 297 | 298 | if count != 0 and count % 1000 == 0: 299 | logging.info("Read %d files ..." % count) 300 | except Exception: 301 | logging.error("Parse failure while reading %s ..." % fn) 302 | 303 | tif.close() 304 | 305 | if not sections or 3 in sections: 306 | for fn in CORPUS_SECTIONS['3']['paths']: 307 | if sources and not os.path.basename(fn).split('.')[0] in sources: 308 | continue 309 | 310 | logging.info('Reading %s ...' % fn) 311 | 312 | # Same tar.gz inside zip problem as with section 2 313 | with GzipFile(fileobj=StringIO(zf.read(fn))) as iz: 314 | with TarFile(fileobj=iz) as tf: 315 | for member in tf.getmembers(): 316 | if member.isfile(): 317 | tif = tf.extractfile(member) 318 | 319 | if os.path.splitext(member.name)[1] != '.xml': 320 | continue 321 | 322 | try: 323 | doc = section_3_parser(tif) 324 | 325 | yield doc 326 | count += 1 327 | 328 | except Exception: 329 | logging.error("Unable to parse file %s ..." % member.name) 330 | 331 | if count != 0 and count % 1000 == 0: 332 | logging.info("Read %d files ..." % count) 333 | 334 | tif.close() 335 | 336 | 337 | def normalize(doc): 338 | """ 339 | Normalizes content from the Aviskorpus to a single string with the article text in a separate 'text' field. 340 | 341 | :param doc: Parsed document from the corpus. 342 | :type doc: dict 343 | :rtype : dict 344 | :return: The passed document dict with normalized 'text' field added. 345 | :raise ValueError: If the section_id field is malformed 346 | """ 347 | section_id = doc['corpus_section'] 348 | 349 | if section_id == 1: 350 | doc['text'] = ' '.join(doc['tokens']) 351 | elif section_id == 2: 352 | doc['text'] = '\n'.join(doc['sentences']) 353 | elif section_id == 3: 354 | doc['raw'] = doc['text'] 355 | doc['text'] = '\n\n'.join(doc['text']) 356 | 357 | if ('metadata' in doc) and ('source' in doc['metadata']): 358 | doc['source'] = doc['metadata']['source'] 359 | else: 360 | raise ValueError('Unknown section id %d in document ...' 
% section_id) 361 | 362 | return doc 363 | 364 | 365 | class AviskorpusDataset(Dataset): 366 | """ 367 | Class encapsulating Norsk Aviskorpus, a large corpus of Norwegian newspaper articles. 368 | 369 | See http://www.nb.no/sprakbanken/show?serial=sbr-4&lang=nb for details. 370 | """ 371 | def __init__(self, index='aviskorpus', doc_type='article', dataset_path=None, 372 | sections=None, sources=None, dataset_fn=None): 373 | super(AviskorpusDataset, self).__init__(index=index, doc_type=doc_type, dataset_path=dataset_path, 374 | dataset_fn=dataset_fn) 375 | 376 | self.archive_fn = AVISKORPUS_ARCHIVE_URL 377 | 378 | self.sections = sections 379 | self.sources = sources 380 | self.normalize_func = normalize 381 | 382 | def _iterator(self): 383 | return iterator(self.dataset_fn, sections=self.sections, sources=self.sources) 384 | -------------------------------------------------------------------------------- /python-client/es_text_analytics/test/test_term_weight_provider.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | from unittest import TestCase 3 | 4 | from elasticsearch.client import Elasticsearch, IndicesClient 5 | from gensim.corpora.dictionary import Dictionary 6 | 7 | from es_text_analytics.term_weight_provider import SimpleTermWeightProvider, ESTermAggregationWeightProvider, \ 8 | weight_map_from_term_counts, term_counts_line_parser, term_counts_iter_from_file, GensimIDFProvider, \ 9 | ESTermIndexWeightingProvider 10 | from es_text_analytics.test import es_runner 11 | 12 | 13 | class TestTermWeightProviderHelpers(TestCase): 14 | def test_weight_map_from_term_counts(self): 15 | wm = sorted(weight_map_from_term_counts([('foo', 2), ('ba', 1), ('knark', 4), ('knirk', 1)]).items()) 16 | self.assertEqual(wm, [('ba', 0.125), ('foo', 0.25), ('knark', 0.5), ('knirk', 0.125)]) 17 | 18 | wm = sorted(weight_map_from_term_counts([('foo', 2), ('ba', 1), ('knark', 4), ('knirk', 1)], min_count=4).items()) 19 | self.assertEqual(wm, [('knark', .5)]) 20 | 21 | def test_term_counts_line_parser(self): 22 | self.assertEqual(('absolutely', 342), term_counts_line_parser('5949\tabsolutely\t342\n')) 23 | self.assertEqual(('finished', 136), term_counts_line_parser('497\tfinished\t136')) 24 | 25 | def test_term_counts_iter_from_file(self): 26 | f = StringIO('5949\tabsolutely\t342\n497\tfinished\t136') 27 | 28 | self.assertEqual([('absolutely', 342), ('finished', 136)], list(term_counts_iter_from_file(f))) 29 | 30 | 31 | class TestSimpleTermWeightProvider(TestCase): 32 | def test_getitem_single(self): 33 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 34 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 35 | term, w = provider['ba'] 36 | self.assertEqual('ba', term) 37 | self.assertAlmostEqual(.5, w) 38 | term, w = provider['knark'] 39 | self.assertEqual('knark', term) 40 | self.assertAlmostEqual(.25, w) 41 | term, w = provider['knirk'] 42 | self.assertEqual('knirk', term) 43 | self.assertAlmostEqual(.125, w) 44 | term, w = provider['foo'] 45 | self.assertEqual('foo', term) 46 | self.assertAlmostEqual(.125, w) 47 | 48 | def test_inverse(self): 49 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 50 | ('knirk', 1), ('ba', 1), ('knark', 1)], 51 | inverse=True) 52 | term, w = provider['ba'] 53 | self.assertEqual('ba', term) 54 | self.assertAlmostEqual(2., w) 55 | term, w = provider['knark'] 56 | self.assertEqual('knark', term) 57 | self.assertAlmostEqual(4., w) 58 | term, w = 
provider['knirk'] 59 | self.assertEqual('knirk', term) 60 | self.assertAlmostEqual(8., w) 61 | term, w = provider['foo'] 62 | self.assertEqual('foo', term) 63 | self.assertAlmostEqual(8., w) 64 | 65 | def test_sublinear(self): 66 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 67 | ('knirk', 1), ('ba', 1), ('knark', 1)], 68 | sublinear=True) 69 | term, w = provider['ba'] 70 | self.assertEqual('ba', term) 71 | self.assertAlmostEqual(-0.693147, w, places=4) 72 | term, w = provider['knark'] 73 | self.assertEqual('knark', term) 74 | self.assertAlmostEqual(-1.386294, w, places=4) 75 | term, w = provider['knirk'] 76 | self.assertEqual('knirk', term) 77 | self.assertAlmostEqual(-2.079442, w, places=4) 78 | term, w = provider['foo'] 79 | self.assertEqual('foo', term) 80 | self.assertAlmostEqual(-2.079442, w, places=4) 81 | 82 | def test_inverse_sublinear(self): 83 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 84 | ('knirk', 1), ('ba', 1), ('knark', 1)], 85 | sublinear=True, inverse=True) 86 | term, w = provider['ba'] 87 | self.assertEqual('ba', term) 88 | self.assertAlmostEqual(0.693147, w, places=4) 89 | term, w = provider['knark'] 90 | self.assertEqual('knark', term) 91 | self.assertAlmostEqual(1.386294, w, places=4) 92 | term, w = provider['knirk'] 93 | self.assertEqual('knirk', term) 94 | self.assertAlmostEqual(2.079442, w, places=4) 95 | term, w = provider['foo'] 96 | self.assertEqual('foo', term) 97 | self.assertAlmostEqual(2.079442, w, places=4) 98 | 99 | def test_getitem_multiple(self): 100 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 101 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 102 | 103 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 104 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 105 | self.assertAlmostEqual(weights['ba'], .5) 106 | self.assertAlmostEqual(weights['knark'], .25) 107 | self.assertAlmostEqual(weights['knirk'], .125) 108 | self.assertAlmostEqual(weights['foo'], .125) 109 | 110 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 111 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 112 | 113 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 114 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 115 | self.assertAlmostEqual(weights['ba'], .5) 116 | self.assertAlmostEqual(weights['knark'], .25) 117 | self.assertAlmostEqual(weights['knirk'], .125) 118 | self.assertAlmostEqual(weights['foo'], .125) 119 | 120 | def test_getitem_missing(self): 121 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 122 | ('knirk', 1), ('ba', 1), ('knark', 1)]) 123 | 124 | self.assertRaises(KeyError, lambda: provider['notfound']) 125 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 126 | 127 | provider = SimpleTermWeightProvider([('ba', 2), ('foo', 1), ('ba', 1), ('knark', 1), 128 | ('knirk', 1), ('ba', 1), ('knark', 1)], missing='ignore') 129 | self.assertEqual([('ba', .5)], list(provider['ba', 'notfound'])) 130 | self.assertIsNone(provider['notfound']) 131 | 132 | 133 | class TestESTermAggregationWeightProvider(TestCase): 134 | 135 | def setUp(self): 136 | super(TestESTermAggregationWeightProvider, self).setUp() 137 | 138 | self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port]) 139 | self.ic = IndicesClient(self.es) 140 | self.index = 'es_term_weight_provider_test' 141 | self.doc_type = 'test-doc' 142 | 
self.field = 'text' 143 | 144 | if self.ic.exists(self.index): 145 | self.ic.delete(self.index) 146 | 147 | self.ic.create(self.index) 148 | self.es.create(self.index, self.doc_type, {self.field: 'foo'}) 149 | self.es.create(self.index, self.doc_type, {self.field: 'knark'}) 150 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}) 151 | self.es.create(self.index, self.doc_type, {self.field: 'knirk'}) 152 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}) 153 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}) 154 | self.es.create(self.index, self.doc_type, {self.field: 'knark '}) 155 | self.es.create(self.index, self.doc_type, {self.field: 'ba'}, refresh=True) 156 | 157 | def tearDown(self): 158 | super(TestESTermAggregationWeightProvider, self).tearDown() 159 | 160 | self.ic.delete(self.index) 161 | 162 | 163 | def test_getitem_single(self): 164 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 165 | inverse=False, sublinear=False) 166 | 167 | term, w = provider['ba'] 168 | self.assertEqual('ba', term) 169 | self.assertAlmostEqual(.5, w) 170 | term, w = provider['knark'] 171 | self.assertEqual('knark', term) 172 | self.assertAlmostEqual(.25, w) 173 | term, w = provider['knirk'] 174 | self.assertEqual('knirk', term) 175 | self.assertAlmostEqual(.125, w) 176 | term, w = provider['foo'] 177 | self.assertEqual('foo', term) 178 | self.assertAlmostEqual(.125, w) 179 | 180 | def test_inverse(self): 181 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 182 | inverse=True, sublinear=False) 183 | term, w = provider['ba'] 184 | self.assertEqual('ba', term) 185 | self.assertAlmostEqual(2., w) 186 | term, w = provider['knark'] 187 | self.assertEqual('knark', term) 188 | self.assertAlmostEqual(4., w) 189 | term, w = provider['knirk'] 190 | self.assertEqual('knirk', term) 191 | self.assertAlmostEqual(8., w) 192 | term, w = provider['foo'] 193 | self.assertEqual('foo', term) 194 | self.assertAlmostEqual(8., w) 195 | 196 | def test_sublinear(self): 197 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 198 | inverse=False, sublinear=True) 199 | term, w = provider['ba'] 200 | self.assertEqual('ba', term) 201 | self.assertAlmostEqual(-0.693147, w, places=4) 202 | term, w = provider['knark'] 203 | self.assertEqual('knark', term) 204 | self.assertAlmostEqual(-1.386294, w, places=4) 205 | term, w = provider['knirk'] 206 | self.assertEqual('knirk', term) 207 | self.assertAlmostEqual(-2.079442, w, places=4) 208 | term, w = provider['foo'] 209 | self.assertEqual('foo', term) 210 | self.assertAlmostEqual(-2.079442, w, places=4) 211 | 212 | def test_inverse_sublinear(self): 213 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 214 | inverse=True, sublinear=True) 215 | term, w = provider['ba'] 216 | self.assertEqual('ba', term) 217 | self.assertAlmostEqual(0.693147, w, places=4) 218 | term, w = provider['knark'] 219 | self.assertEqual('knark', term) 220 | self.assertAlmostEqual(1.386294, w, places=4) 221 | term, w = provider['knirk'] 222 | self.assertEqual('knirk', term) 223 | self.assertAlmostEqual(2.079442, w, places=4) 224 | term, w = provider['foo'] 225 | self.assertEqual('foo', term) 226 | self.assertAlmostEqual(2.079442, w, places=4) 227 | 228 | def test_getitem_multiple(self): 229 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 230 | inverse=False, 
sublinear=False) 231 | 232 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 233 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 234 | self.assertAlmostEqual(weights['ba'], .5) 235 | self.assertAlmostEqual(weights['knark'], .25) 236 | self.assertAlmostEqual(weights['knirk'], .125) 237 | self.assertAlmostEqual(weights['foo'], .125) 238 | 239 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 240 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 241 | self.assertAlmostEqual(weights['ba'], .5) 242 | self.assertAlmostEqual(weights['knark'], .25) 243 | self.assertAlmostEqual(weights['knirk'], .125) 244 | self.assertAlmostEqual(weights['foo'], .125) 245 | 246 | def test_getitem_missing(self): 247 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 248 | inverse=False, sublinear=False) 249 | 250 | self.assertRaises(KeyError, lambda: provider['notfound']) 251 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 252 | 253 | provider = ESTermAggregationWeightProvider(self.es, self.index, self.doc_type, self.field, 254 | inverse=False, sublinear=False, missing='ignore') 255 | 256 | self.assertIsNone(provider['notfound']) 257 | self.assertEqual([('ba', .5)], list(provider['ba', 'notfound'])) 258 | 259 | class TestGensimIDFProvider(TestCase): 260 | def setUp(self): 261 | super(TestGensimIDFProvider, self).setUp() 262 | 263 | self.dictionary = Dictionary([['foo'], ['knark'], ['ba'], ['knirk'], ['ba'], ['ba'], ['knark'], ['ba']]) 264 | 265 | def test_getitem_single(self): 266 | provider = GensimIDFProvider(self.dictionary) 267 | 268 | term, w = provider['ba'] 269 | self.assertEqual('ba', term) 270 | self.assertAlmostEqual(1, w) 271 | term, w = provider['knark'] 272 | self.assertEqual('knark', term) 273 | self.assertAlmostEqual(2, w) 274 | term, w = provider['knirk'] 275 | self.assertEqual('knirk', term) 276 | self.assertAlmostEqual(3, w) 277 | term, w = provider['foo'] 278 | self.assertEqual('foo', term) 279 | self.assertAlmostEqual(3, w) 280 | 281 | def test_getitem_multiple(self): 282 | provider = GensimIDFProvider(self.dictionary) 283 | 284 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 285 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 286 | self.assertAlmostEqual(weights['ba'], 1) 287 | self.assertAlmostEqual(weights['knark'], 2) 288 | self.assertAlmostEqual(weights['knirk'], 3) 289 | self.assertAlmostEqual(weights['foo'], 3) 290 | 291 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 292 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 293 | self.assertAlmostEqual(weights['ba'], 1) 294 | self.assertAlmostEqual(weights['knark'], 2) 295 | self.assertAlmostEqual(weights['knirk'], 3) 296 | self.assertAlmostEqual(weights['foo'], 3) 297 | 298 | def test_getitem_missing(self): 299 | provider = GensimIDFProvider(self.dictionary) 300 | 301 | self.assertRaises(KeyError, lambda: provider['notfound']) 302 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 303 | 304 | provider = GensimIDFProvider(self.dictionary, missing='ignore') 305 | 306 | self.assertIsNone(provider['notfound']) 307 | self.assertEqual([('ba', 1)], list(provider['ba', 'notfound'])) 308 | 309 | 310 | class TestESTermIndexWeightingProvider(TestCase): 311 | def setUp(self): 312 | super(TestESTermIndexWeightingProvider, self).setUp() 313 | 314 | self.es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port]) 315 | 
self.index = 'test_es_term_index_weighting_provider' 316 | ESTermIndexWeightingProvider._create_weight_index(self.es, self.index) 317 | 318 | def tearDown(self): 319 | super(TestESTermIndexWeightingProvider, self).tearDown() 320 | 321 | ic = IndicesClient(self.es) 322 | ic.delete(self.index) 323 | 324 | def test_getitem_single(self): 325 | provider = ESTermIndexWeightingProvider(self.es, self.index, 326 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)]) 327 | IndicesClient(self.es).refresh(self.index) 328 | 329 | term, w = provider['ba'] 330 | self.assertEqual('ba', term) 331 | self.assertAlmostEqual(1, w) 332 | term, w = provider['knark'] 333 | self.assertEqual('knark', term) 334 | self.assertAlmostEqual(2, w) 335 | term, w = provider['knirk'] 336 | self.assertEqual('knirk', term) 337 | self.assertAlmostEqual(3, w) 338 | term, w = provider['foo'] 339 | self.assertEqual('foo', term) 340 | self.assertAlmostEqual(3, w) 341 | 342 | def test_getitem_multiple(self): 343 | provider = ESTermIndexWeightingProvider(self.es, self.index, 344 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)]) 345 | IndicesClient(self.es).refresh(self.index) 346 | 347 | weights = dict(provider[['ba', 'foo', 'knark', 'knirk']]) 348 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 349 | self.assertAlmostEqual(weights['ba'], 1) 350 | self.assertAlmostEqual(weights['knark'], 2) 351 | self.assertAlmostEqual(weights['knirk'], 3) 352 | self.assertAlmostEqual(weights['foo'], 3) 353 | 354 | weights = dict(provider['ba', 'foo', 'knark', 'knirk']) 355 | self.assertEqual(['ba', 'foo', 'knark', 'knirk'], sorted(weights.keys())) 356 | self.assertAlmostEqual(weights['ba'], 1) 357 | self.assertAlmostEqual(weights['knark'], 2) 358 | self.assertAlmostEqual(weights['knirk'], 3) 359 | self.assertAlmostEqual(weights['foo'], 3) 360 | 361 | def test_getitem_missing(self): 362 | provider = ESTermIndexWeightingProvider(self.es, self.index, 363 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)]) 364 | IndicesClient(self.es).refresh(self.index) 365 | 366 | self.assertRaises(KeyError, lambda: provider['notfound']) 367 | self.assertRaises(KeyError, lambda: provider['ba', 'notfound']) 368 | 369 | provider = ESTermIndexWeightingProvider(self.es, self.index, 370 | initial_weights=[('ba', 1), ('knark', 2), ('knirk', 3), ('foo', 3)], 371 | missing='ignore') 372 | IndicesClient(self.es).refresh(self.index) 373 | 374 | self.assertIsNone(provider['notfound']) 375 | self.assertEqual([('ba', 1)], list(provider['ba', 'notfound'])) --------------------------------------------------------------------------------
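# Note on the GensimIDFProvider test expectations above (added annotation, not part of
# the original source): gensim's TfidfModel computes IDF as log2(N / df) by default, and
# the test dictionary holds 8 single-token documents, so with df('ba') = 4, df('knark') = 2
# and df('knirk') = df('foo') = 1 the expected weights are log2(8/4) = 1, log2(8/2) = 2
# and log2(8/1) = 3 respectively.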