├── tests ├── __init__.py ├── test_utils.py ├── test_pair_counts.py ├── test_dataset.py └── test_bunch.py ├── hyperhyper ├── evaluation_datasets │ ├── __init__.py │ ├── de │ │ ├── __init__.py │ │ ├── ws │ │ │ ├── __init__.py │ │ │ ├── gur65.txt │ │ │ ├── ws353sim.txt │ │ │ ├── schm280.txt │ │ │ ├── ws353rel.txt │ │ │ ├── zg222.txt │ │ │ └── gur350.txt │ │ ├── analogy │ │ │ ├── __init__.py │ │ │ ├── opposite.txt │ │ │ └── open.txt │ │ └── README.txt │ └── en │ │ ├── __init__.py │ │ ├── ws │ │ ├── __init__.py │ │ ├── ws353_similarity.txt │ │ ├── ws353_relatedness.txt │ │ ├── radinsky_mturk.txt │ │ └── ws353.txt │ │ └── analogy │ │ └── __init__.py ├── __init__.py ├── preprocessing.py ├── svd.py ├── utils.py ├── pmi.py ├── evaluation.py ├── experiment.py ├── bunch.py ├── pair_counts.py └── corpus.py ├── .editorconfig ├── .travis.yml ├── pyproject.toml ├── LICENSE ├── .gitignore ├── README.md └── examples ├── 02_wikipedia.ipynb └── 01_news.ipynb /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/analogy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/analogy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | end_of_line = lf 11 | charset = utf-8 12 | 13 | [*.py] 14 | max_line_length = 79 15 | 16 | [*.md] 17 | insert_final_newline = false 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | language: python 3 | python: 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | 8 | install: 9 | - pip install poetry 10 | - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then rm poetry.lock; fi 11 | - poetry install 12 | - poetry run python -m spacy download en_core_web_sm 13 | 14 | script: poetry 
run pytest 15 | 16 | notifications: 17 | email: false 18 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import hyperhyper 4 | 5 | 6 | def foo(li): 7 | return [pow(x, 10) for x in li] 8 | 9 | 10 | def test_map_chunks(): 11 | some_list = list(range(100)) 12 | results = hyperhyper.utils.map_pool_chunks( 13 | some_list, foo, chunk_size=10, combine=True 14 | ) 15 | assert len(results) == 100 16 | assert results[50] == pow(50, 10) 17 | print(results) 18 | -------------------------------------------------------------------------------- /hyperhyper/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging import NullHandler 3 | 4 | from . import evaluation, utils 5 | from .bunch import Bunch 6 | from .corpus import Corpus, Vocab 7 | from .pair_counts import count_pairs 8 | from .preprocessing import (texts_to_sents, tokenize_texts, 9 | tokenize_texts_parallel) 10 | 11 | logging.getLogger(__name__).addHandler(NullHandler()) 12 | 13 | __version__ = "0.1.1" 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["Johannes Filter "] 3 | classifiers = [ 4 | "Programming Language :: Python :: 3.6", 5 | "Programming Language :: Python :: 3.7", 6 | "Programming Language :: Python :: 3.8", 7 | "License :: OSI Approved :: BSD License", 8 | "Topic :: Scientific/Engineering :: Information Analysis", 9 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 10 | ] 11 | description = "Python package to construct word embeddings for small data" 12 | keywords = ["natural-language-processing", "word-embeddings", "ppmi", "pmi", "pmi-svd"] 13 | license = "BSD-2-Clause" 14 | name = "hyperhyper" 15 | readme = "README.md" 16 | repository = "https://github.com/jfilter/hyperhyper" 17 | version = "0.1.1" 18 | 19 | [tool.poetry.dependencies] 20 | dataset = "1.*" 21 | gensim = "3.*" 22 | importlib_resources = {version = "*", python = "<= 3.6"} 23 | python = ">= 3.6" 24 | tqdm = "*" 25 | 26 | scikit-learn = {version = "*", optional = true} 27 | spacy = {version = "2.*", optional = true} 28 | 29 | [tool.poetry.dev-dependencies] 30 | black = "*" 31 | pylint = "*" 32 | pytest = "^5.2" 33 | scikit-learn = "*" 34 | spacy = "*" 35 | 36 | [tool.poetry.extras] 37 | full = ["scikit-learn", "spacy"] 38 | 39 | [build-system] 40 | build-backend = "poetry.core.masonry.api" 41 | requires = ["poetry-core>=1.0.0"] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2014-2015, Omer Levy, Yoav Goldberg, and Ido Dagan 4 | 5 | Copyright (c) 2018-2019, Johannes Hellrich 6 | 7 | Copyright (c) 2019, Johannes Filter 8 | 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions are met: 13 | 14 | * Redistributions of source code must retain the above copyright notice, this 15 | list of conditions and the following disclaimer. 
16 | 17 | * Redistributions in binary form must reproduce the above copyright notice, 18 | this list of conditions and the following disclaimer in the documentation 19 | and/or other materials provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/gur65.txt: -------------------------------------------------------------------------------- 1 | Auto Fahrt 2.75 2 | Auto Polster 1.25 3 | Autogramm Küste 0.04 4 | Autogramm Unterschrift 3.54 5 | Backofen Herd 3.42 6 | Backofen Werkzeug 1.04 7 | Berg Herd 0 8 | Berg Küste 1.71 9 | Berg Wald 1.75 10 | Bruder Bursche 1.58 11 | Bruder Mönch 3.04 12 | Bursche Magier 0.58 13 | Edelstein Juwel 3.83 14 | Fabel Magier 1.54 15 | Fahrt Reise 3.25 16 | Forst Kirchhof 0.46 17 | Forst Wald 3.75 18 | Friedhof Hügel 0.92 19 | Friedhof Kirchhof 3 20 | Friedhof Psychiatrie 0.38 21 | Friedhof Wald 0.96 22 | Gerät Werkzeug 3 23 | Glas Becher 3.25 24 | Glas Juwel 1.08 25 | Glas Zauberer 0.58 26 | Gockel Hahn 4 27 | Grinsen Bursche 0.58 28 | Grinsen Lächeln 3.38 29 | Grinsen Werkzeug 0 30 | Hahn Reise 0 31 | Hügel Berg 3.46 32 | Irrenhaus Friedhof 0.33 33 | Irrenhaus Mönch 0.25 34 | Irrenhaus Obst 0.04 35 | Irrenhaus Psychiatrie 3.67 36 | Junge Bursche 3.79 37 | Junge Fabel 0.38 38 | Junge Hahn 0.29 39 | Kraftfahrzeug Auto 3.79 40 | Kraftfahrzeug Magier 0.04 41 | Kran Werkzeug 1.96 42 | Kranich Hahn 2.21 43 | Küste Forst 1.08 44 | Küste Reise 1.46 45 | Küste Ufer 3.67 46 | Leibeigener Sklave 3.83 47 | Mittag Mittagsstunde 3.54 48 | Mittag Schnur 0.04 49 | Mönch Orakel 0.54 50 | Mönch Sklave 0.58 51 | Nahrung Hahn 1.88 52 | Nahrung Obst 3.29 53 | Obst Backofen 0.92 54 | Orakel Fabel 1.25 55 | Polster Juwel 0.29 56 | Polster Kissen 3.13 57 | Schnur Seil 3.38 58 | Seil Lächeln 0 59 | Ufer Hügel 1.25 60 | Ufer Wald 1.29 61 | Vogel Hahn 3.17 62 | Vogel Kranich 3.54 63 | Vogel Wald 1.63 64 | Zauberer Magier 3.96 65 | Zauberer Orakel 1.71 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .vscode 107 | test_bunch 108 | dev 109 | -------------------------------------------------------------------------------- /hyperhyper/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | simple text preprocessing such as cleaning and tokenization 3 | """ 4 | 5 | import os 6 | import re 7 | 8 | from gensim.parsing.preprocessing import (preprocess_string, 9 | strip_non_alphanum, strip_tags) 10 | from tqdm import tqdm 11 | 12 | from .utils import map_pool 13 | 14 | try: 15 | import spacy 16 | except: 17 | spacy = None 18 | 19 | 20 | def simple_preproc(text): 21 | """ 22 | replace digits with 0 and lowercase text 23 | """ 24 | return re.sub(r"\d", "0", text.lower()) 25 | 26 | 27 | def tokenize_string(text): 28 | """ 29 | tokenize based on whitespaces 30 | """ 31 | CUSTOM_FILTERS = [simple_preproc, strip_tags, strip_non_alphanum] 32 | return preprocess_string(text, CUSTOM_FILTERS) 33 | 34 | 35 | def tokenize_texts(texts): 36 | """ 37 | tokenize multiple texts (list of texts) based on whitespaces 38 | """ 39 | return [tokenize_string(t) for t in texts] 40 | 41 | 42 | def tokenize_texts_parallel(texts): 43 | """ 44 | tokenize multiple texts based on whitespaces in parrallel 45 | """ 46 | return map_pool(texts, tokenize_string) 47 | 48 | 49 | def texts_to_sents(texts, model="en_core_web_sm", remove_stop=True, lemmatize=True): 50 | """ 51 | transform list of texts to list of sents (list of tokens) and apply 52 | simple text preprocessing 53 | """ 54 | texts = [strip_tags(t) for t in texts] 55 | results = [] 56 | 57 | assert spacy is not None, 'please install spacy, i.e., "pip install spacy"' 58 | 59 | try: 60 | nlp = spacy.load(model, disable=["ner"]) 61 | except Exception as e: 62 | print(e, "\ntrying to download model...") 63 | os.system("python -m spacy download " + model) 64 | nlp = spacy.load(model, disable=["ner"]) 65 | 66 | for doc in tqdm(nlp.pipe(texts), total=len(texts), desc="texts to sents"): 67 | for s in doc.sents: 68 | results.append( 69 | [ 70 | simple_preproc( 71 | strip_non_alphanum(t.lemma_ if lemmatize else t.text) 72 | ) 73 | for t in s 74 | if not any((t.is_punct, t.is_space, remove_stop and t.is_stop)) 75 | ] 76 | ) 77 | return results 78 | -------------------------------------------------------------------------------- /tests/test_pair_counts.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import hyperhyper 4 | 5 | 6 | @pytest.fixture() 7 | def corpus(): 8 | some_text1 = """ 9 | The English Wikipedia is the English-language edition of the free online encyclopedia Wikipedia. Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 10 | 11 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 12 | """ 13 | 14 | some_text2 = """ 15 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 16 | 17 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 18 | 19 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 20 | 21 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 
22 | """ 23 | 24 | texts = [some_text1, some_text2] 25 | c = hyperhyper.Corpus.from_texts(texts) 26 | c.texts_to_file("test_bunch/bla", 5) 27 | return c 28 | 29 | 30 | def test_count(corpus): 31 | pair_c = hyperhyper.count_pairs(corpus) 32 | print(pair_c) 33 | 34 | 35 | def test_count_subs(corpus): 36 | pair_c = hyperhyper.count_pairs(corpus, subsample="prob") 37 | pair_c = hyperhyper.count_pairs(corpus, subsample="deter") 38 | pair_c = hyperhyper.count_pairs(corpus, subsample="deter", low_memory=True) 39 | pair_c = hyperhyper.count_pairs(corpus, dynamic_window="decay") 40 | -------------------------------------------------------------------------------- /hyperhyper/svd.py: -------------------------------------------------------------------------------- 1 | """ 2 | apply SVD on a PPMI matrix to get low-dimensional word embeddings 3 | """ 4 | 5 | import heapq 6 | import logging 7 | 8 | import numpy as np 9 | from gensim.models.lsimodel import stochastic_svd 10 | from scipy.sparse import linalg 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | try: 16 | from sparsesvd import sparsesvd 17 | except ImportError: 18 | logger.info("no sparsesvd") 19 | 20 | try: 21 | from sklearn.utils.extmath import randomized_svd 22 | except ImportError: 23 | logger.info("no sklearn") 24 | 25 | 26 | def calc_svd(matrix, dim, impl, impl_args): 27 | """ 28 | apply truncated SVD with several implementations 29 | 30 | truncated SVD: 31 | sparsesvd: https://pypi.org/project/sparsesvd/ 32 | scipy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html 33 | 34 | randomized truncated SVD: 35 | gensim: https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/lsimodel.py 36 | scikit: https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html 37 | 38 | Check out the comparison: https://github.com/jfilter/sparse-svd-benchmark 39 | """ 40 | if impl == "sparsesvd": 41 | # originally used SVD implementation 42 | ut, s, _ = sparsesvd(matrix.m.tocsc(), dim) 43 | # returns in a different format 44 | ut = ut.T 45 | if impl == "scipy": 46 | ut, s, _ = linalg.svds(matrix.m, dim) 47 | # randomized (but fast) truncated SVD 48 | if impl == "gensim": 49 | # better default arguments 50 | args = {"power_iters": 5, "extra_dims": 10, **impl_args} 51 | ut, s = stochastic_svd(matrix.m, dim, matrix.m.shape[0], **args) 52 | if impl == "scikit": 53 | ut, s, _ = randomized_svd(matrix.m, dim, **impl_args) 54 | 55 | return ut, s 56 | 57 | 58 | class SVDEmbedding: 59 | """ 60 | SVD embeddings. 61 | Enables controlling the weighted exponent of the eigenvalue matrix (eig). 62 | """ 63 | 64 | def __init__(self, ut, s, normalize=True, eig=0.0): 65 | if eig == 0.0: 66 | self.m = ut 67 | elif eig == 1.0: 68 | self.m = s * ut 69 | else: 70 | self.m = np.power(s, eig) * ut 71 | 72 | # not used? 73 | # self.dim = self.m.shape[1] 74 | 75 | if normalize: 76 | self.normalize() 77 | 78 | def normalize(self): 79 | norm = np.sqrt(np.sum(self.m * self.m, axis=1)) 80 | self.m = self.m / norm[:, np.newaxis] 81 | 82 | def represent(self, w_idx): 83 | return self.m[w_idx, :] 84 | 85 | def similarity(self, w_idx_1, w_idx_2): 86 | """ 87 | Assumes the vectors have been normalized. 88 | """ 89 | return self.represent(w_idx_1).dot(self.represent(w_idx_2)) 90 | 91 | def most_similar(self, w_idx, n=10): 92 | """ 93 | Assumes the vectors have been normalized. 
94 | """ 95 | scores = self.m.dot(self.represent(w_idx)) 96 | return heapq.nlargest(n, zip(scores, list(range(len(scores))))) 97 | -------------------------------------------------------------------------------- /hyperhyper/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | utility functions for i/o and other general funtionality 3 | """ 4 | 5 | import logging 6 | import math 7 | import os 8 | import pickle 9 | from collections import defaultdict 10 | from concurrent import futures 11 | 12 | import numpy as np 13 | from scipy.sparse import csr_matrix 14 | from tqdm import tqdm 15 | 16 | num_cpu = os.cpu_count() 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def save_arrays(f, a1, a2): 22 | if type(f) != str: 23 | f = str(f) 24 | np.savez_compressed(f, a1=a1, a2=a2) 25 | 26 | 27 | def load_arrays(f): 28 | if type(f) != str: 29 | f = str(f) 30 | if not f.endswith(".npz"): 31 | f += ".npz" 32 | loader = np.load(f) 33 | return loader["a1"], loader["a2"] 34 | 35 | 36 | def save_matrix(f, m): 37 | if type(f) != str: 38 | f = str(f) 39 | np.savez_compressed( 40 | f, data=m.data, indices=m.indices, indptr=m.indptr, shape=m.shape 41 | ) 42 | 43 | 44 | def load_matrix(f): 45 | if type(f) != str: 46 | f = str(f) 47 | if not f.endswith(".npz"): 48 | f += ".npz" 49 | loader = np.load(f) 50 | return csr_matrix( 51 | (loader["data"], loader["indices"], loader["indptr"]), shape=loader["shape"] 52 | ) 53 | 54 | 55 | def chunks(l, n): 56 | """ 57 | Yield successive n-sized chunks from l. 58 | """ 59 | for i in range(0, len(l), n): 60 | yield l[i : i + n] 61 | 62 | 63 | # TODO: more perfz 64 | def combine_chunks(chunks): 65 | for c in chunks: 66 | for x in c: 67 | yield x 68 | 69 | 70 | def map_pool_chunks( 71 | array, fun, num_chunks=100, chunk_size=None, combine=True, **kwargs 72 | ): 73 | if chunk_size is None: 74 | chunk_size = math.ceil(len(array) / num_chunks) 75 | results = map_pool(chunks(array, chunk_size), fun, total=len(array), **kwargs) 76 | if combine: 77 | results = list(combine_chunks(results)) 78 | return results 79 | 80 | 81 | def map_pool(array, fun, total=None, desc=None, process_chunksize=100): 82 | with futures.ProcessPoolExecutor(num_cpu) as executor: 83 | if desc is None: 84 | return list(executor.map(fun, array, chunksize=process_chunksize)) 85 | return list( 86 | tqdm( 87 | executor.map(fun, array, chunksize=process_chunksize), 88 | total=len(array) if total is None else total, 89 | desc=desc, 90 | ) 91 | ) 92 | 93 | 94 | def delete_folder(pth): 95 | for sub in pth.iterdir(): 96 | if sub.is_dir(): 97 | delete_folder(sub) 98 | else: 99 | sub.unlink() 100 | pth.rmdir() 101 | 102 | 103 | def to_pickle(ob, fn): 104 | fn.parent.mkdir(parents=True, exist_ok=True) 105 | with open(fn, "wb") as outfile: 106 | pickle.dump(ob, outfile) 107 | 108 | 109 | def read_pickle(fn): 110 | with open(fn, "rb") as infile: 111 | return pickle.load(infile) 112 | 113 | 114 | def dsum(*dicts): 115 | """ 116 | sum up numerical values in multiple dictionaries 117 | """ 118 | ret = defaultdict(int) 119 | for d in dicts: 120 | for k, v in d.items(): 121 | ret[k] += v 122 | return dict(ret) 123 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | import hyperhyper 7 | 8 | some_text1 = """ 9 | The English Wikipedia is the English-language 
edition of the free online encyclopedia Wikipedia. Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 10 | 11 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 12 | """ 13 | 14 | some_text2 = """ 15 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 16 | 17 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 18 | 19 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 20 | 21 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 22 | """ 23 | 24 | texts = [some_text1, some_text2] * 10 25 | 26 | 27 | def test_corpus(): 28 | sents = [] 29 | for t in texts: 30 | sents += t.split("\n\n") 31 | corpus = hyperhyper.Corpus.from_sents(sents) 32 | assert corpus.size == len(sents) 33 | assert corpus.counts[corpus.vocab.token2id["wikipedia"]] > 0 34 | assert corpus.vocab.token2id["wikipedia"] == corpus.vocab.tokens.index("wikipedia") 35 | 36 | keys = corpus.vocab.token2id.keys() 37 | print(len(keys)) 38 | 39 | for k in keys: 40 | i = corpus.vocab.token2id[k] 41 | assert i < len(keys) 42 | 43 | 44 | def test_sent_split(): 45 | corpus = hyperhyper.Corpus.from_texts(texts) 46 | print(corpus.texts) 47 | assert corpus.size > 2 48 | 49 | 50 | def test_text_files(): 51 | # setup 52 | test_dir = tempfile.mkdtemp() 53 | for i, t in enumerate(texts): 54 | Path(test_dir + f"/{i}.txt").write_text(t) 55 | # test 56 | corpus = hyperhyper.Corpus.from_text_files(test_dir) 57 | corpus = hyperhyper.Corpus.from_text_files(test_dir, view_fraction=0.2) 58 | print(corpus) 59 | assert corpus.size > 2 60 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/README.txt: -------------------------------------------------------------------------------- 1 | # Sources 2 | 3 | The evaluation datasets for German come from two sources: 4 | 5 | 1. 
Deep Semantic Analogies Dataset 6 | 2. Bachelors' Thesis by Andreas Müller: https://devmount.github.io/GermanWordEmbeddings/ 7 | 3. Word Similarity: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_relatedness/german_relatedness_datasets/index.en.jsp 8 | 4. Multilingual SimLex999 and WordSim353: http://leviants.com/ira.leviant/MultilingualVSMdata.html 9 | 10 | 11 | Deep Semantic Analogies Dataset 12 | -------------------------------------------- 13 | 14 | Paper: https://www.aclweb.org/anthology/W15-0105 15 | 16 | This collection contains six newly created semantic datasets. 17 | 18 | It contains 5 files: 19 | * de_re-rated_Schm280.txt 20 | * de_sem-para_SemRel.txt 21 | * en_sem-para_BLESS.txt 22 | * en_sem-para_SemRel.txt 23 | * de_toefl_subset.txt 24 | * de_trans_Google_analogies.txt 25 | 26 | For a detailed description of the data, please refer to the paper (see reference below). 27 | For questions, please contact 28 | Maximilian Koeper (koepermn@ims.uni-stuttgart.de), 29 | Christian Scheible (scheibcn@ims.uni-stuttgart.de), or 30 | Sabine Schulte im Walde (schulte@ims.uni-stuttgart.de) 31 | 32 | File descriptions: 33 | ------------------ 34 | * de_re-rated_Schm280.txt contains the re-rated version of the Schm280 set (Schmidt et al. 2001). Schm280 consists of 280 translated word pairs from WordSim350. We re-rated these pairs, asking 10 Judges under the same conditions as in WordSim353. We call the resulting dataset WordSim280. Each line contains a word pair and the mean similarity score in [0,10] 35 | 36 | * en_sem-para_SemRel.txt and de_sem-para_SemRel.txt contain analogy questions based on the word pairs from (Scheible and Schulte im Walde, 2014). Each question is of the form A:B::C:D. The questions cover the relations adj-antonym, noun-hyperonym, noun-synonym, noun-antonym, and verb-antonym. For more details, please refer to the paper. This file consists of several sections (delimited by header lines), each for a different relation. Within a section, each line lists the four related words A, B, C, and D of an analogy "A is to B as C is to D". 37 | 38 | * en_sem-para_BLESS.txt was constructed the same way as the SemRel datasets, but based on hyperonymy and meronymy relations from the BLESS dataset (Baroni & Lenci. 2011). The format is the same as for the SemRel files. 39 | 40 | * de_toefl_subset.txt is a subset of the German word choice questions from the University of Darmstadt (Mohammad et al., 2007). We removed all questions that contain phrases in order to obtain a challenge of a difficulty comparable to the English TOEFL data. Each line contains a question of the form "stem correct_answer distractor1 distractor2 distractor3". 41 | 42 | * de_trans_Google_analogies.txt is the German translation of the Google (Mikolov et al., 2013a) analogy set. We omit the adjective-adverb relation as this distinction does not exist in German. The format is again the same as for the SemRel files. 
43 | 44 | Reference: 45 | ---------- 46 | 47 | @inproceedings{KoeperScheibleSchulte2015IWCS, 48 | title = {Multilingual Reliability and ``Semantic'' Structure of Continuous Word Spaces}, 49 | author = {Maximilian K\"oper, Christian Scheible, Sabine {Schulte im Walde}}, 50 | booktitle = {Proceedings of the 11th International Conference on Computational Semantics (IWCS 2015) -- Short Papers}, 51 | address = {London, UK}, 52 | year = {2015} 53 | } 54 | -------------------------------------------------------------------------------- /hyperhyper/pmi.py: -------------------------------------------------------------------------------- 1 | """ 2 | implements PMI matrix (Pointwise mutual information) 3 | See: https://en.wikipedia.org/wiki/Pointwise_mutual_information 4 | """ 5 | 6 | import heapq 7 | 8 | import numpy as np 9 | from gensim import matutils 10 | from scipy.sparse import csr_matrix, dok_matrix 11 | 12 | 13 | def calc_pmi(counts, cds): 14 | """ 15 | Calculates e^PMI; PMI without the log(). 16 | """ 17 | 18 | sum_w = np.array(counts.sum(axis=1))[:, 0] 19 | sum_c = np.array(counts.sum(axis=0))[0, :] 20 | if cds != 1: 21 | sum_c = sum_c ** cds 22 | sum_total = sum_c.sum() 23 | sum_w = np.reciprocal(sum_w) 24 | sum_c = np.reciprocal(sum_c) 25 | 26 | pmi = csr_matrix(counts) 27 | pmi = multiply_by_rows(pmi, sum_w) 28 | pmi = multiply_by_columns(pmi, sum_c) 29 | pmi = pmi * sum_total 30 | return pmi 31 | 32 | 33 | def multiply_by_rows(matrix, row_coefs): 34 | normalizer = dok_matrix((len(row_coefs), len(row_coefs))) 35 | normalizer.setdiag(row_coefs) 36 | return normalizer.tocsr().dot(matrix) 37 | 38 | 39 | def multiply_by_columns(matrix, col_coefs): 40 | normalizer = dok_matrix((len(col_coefs), len(col_coefs))) 41 | normalizer.setdiag(col_coefs) 42 | return matrix.dot(normalizer.tocsr()) 43 | 44 | 45 | class PPMIEmbedding: 46 | """ 47 | Base class for explicit representations. Assumes that the serialized input is e^PMI. 48 | 49 | Positive PMI (PPMI) with negative sampling (neg). 50 | Negative samples shift the PMI matrix before truncation. 51 | """ 52 | 53 | def __init__(self, matrix, normalize=True, neg=1): 54 | self.m = matrix 55 | self.m.data = np.log(self.m.data) 56 | 57 | # not needed? 58 | # # self.normal = normalize 59 | 60 | if neg is not None: 61 | self.m.data -= np.log(neg) 62 | self.m.data[self.m.data < 0] = 0 63 | self.m.eliminate_zeros() 64 | 65 | if normalize: 66 | self.normalize() 67 | 68 | def normalize(self): 69 | m2 = self.m.copy() 70 | m2.data **= 2 71 | norm = np.reciprocal(np.sqrt(np.array(m2.sum(axis=1))[:, 0])) 72 | normalizer = dok_matrix((len(norm), len(norm))) 73 | normalizer.setdiag(norm) 74 | self.m = normalizer.tocsr().dot(self.m) 75 | 76 | def represent(self, w_idx): 77 | return self.m[w_idx, :] 78 | 79 | def similarity(self, w1, w2): 80 | """ 81 | Assumes the vectors have been normalized. 82 | """ 83 | return self.represent(w1).dot(self.represent(w2).T)[0, 0] 84 | 85 | def most_similar(self, w, n=10): 86 | """ 87 | Assumes the vectors have been normalized. 88 | """ 89 | scores = self.m.dot(self.represent(w).T).T.tocsr() 90 | return heapq.nlargest(n, zip(scores.data, scores.indices)) 91 | 92 | 93 | # TODO: working? 94 | def most_similar_vectors(self, positives, negatives, topn=10): 95 | """ 96 | Some parts taken from gensim. 
97 | https://github.com/RaRe-Technologies/gensim/blob/ea87470e4c065676d3d33df15b8db4192b30ebc1/gensim/models/keyedvectors.py#L690 98 | """ 99 | mean = [np.squeeze(self.represent(x).toarray()) for x in positives] + [-1 * np.squeeze(self.represent(x).toarray()) for x in negatives] 100 | mean = matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32) 101 | 102 | dists = self.m.dot(mean) 103 | 104 | best = matutils.argsort(dists, topn=topn, reverse=True) 105 | return [(best_idx, float(dists[best_idx])) for best_idx in best] 106 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/ws353_similarity.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | 
start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | -------------------------------------------------------------------------------- /hyperhyper/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate the performance of embeddings with word simularities and word analogies. 3 | 4 | Can't use the evaluation methods in gensim because the keyed vector structure does not work for PPMI. 5 | So we have to caculate the metrics ourselves. 6 | """ 7 | 8 | from pathlib import Path 9 | 10 | import numpy as np 11 | from scipy.stats.stats import spearmanr 12 | 13 | from . 
import evaluation_datasets 14 | 15 | try: 16 | from importlib.resources import path 17 | except ImportError: 18 | # backport for Python <3.7 19 | from importlib_resources import path 20 | 21 | 22 | def read_test_data(lang, type): 23 | """ 24 | read test data that is stored within the module 25 | """ 26 | with path(evaluation_datasets, lang) as eval_dir: 27 | for file in eval_dir.glob(f"{type}/*.txt"): 28 | yield file 29 | 30 | 31 | def to_item(li): 32 | """ 33 | squeeze 34 | """ 35 | if isinstance(li, list): 36 | if len(li) == 0: 37 | return None 38 | if len(li) == 1: 39 | return li[0] 40 | return to_item(li[0]) 41 | return li 42 | 43 | 44 | def setup_test_tokens(p, keep_len): 45 | """ 46 | Read in training data from files and discard comments (etc.) 47 | """ 48 | lines = Path(p).read_text().split("\n") 49 | lines = [l.split() for l in lines] 50 | lines = [l for l in lines if len(l) == keep_len] 51 | return zip(*lines) 52 | 53 | 54 | def eval_similarity(vectors, token2id, preproc_fun, lang="en"): 55 | """ 56 | evaluate word similarity on several test datasets 57 | """ 58 | line_counts, spear_results, full_results = [], [], [] 59 | 60 | for data in read_test_data(lang, "ws"): 61 | results = [] 62 | 63 | token1, token2, sims = setup_test_tokens(data, 3) 64 | # preprocess tokens 'in batch' 65 | token1, token2 = preproc_fun(token1), preproc_fun(token2) 66 | lines = list(zip(token1, token2, sims)) 67 | for x, y, sim in lines: 68 | x, y = to_item(x), to_item(y) 69 | 70 | # not sure if the lines below are needed 71 | # if x is None or y is None: 72 | # continue 73 | 74 | # skip over OOV 75 | if x in token2id and y in token2id: 76 | results.append((vectors.similarity(token2id[x], token2id[y]), sim)) 77 | 78 | if len(results) == 0: 79 | print("not enough results for this dataset: ", data.name) 80 | continue 81 | 82 | actual, expected = zip(*results) 83 | spear_res = spearmanr(actual, expected)[0] 84 | spear_results.append(spear_res) 85 | line_counts.append(len(results)) 86 | oov = (len(lines) - len(results)) / len(lines) 87 | 88 | full_results.append( 89 | { 90 | "name": f"{lang}_{data.stem}", 91 | "score": spear_res, 92 | "oov": oov, 93 | "fullscore": spear_res * (1 - oov), # consider the portion of OOV 94 | } 95 | ) 96 | 97 | micro_avg = sum([x * y for x, y in zip(line_counts, spear_results)]) / sum( 98 | line_counts 99 | ) 100 | macro_avg = sum(spear_results) / len(spear_results) 101 | return {"micro": micro_avg, "macro": macro_avg, "results": full_results} 102 | 103 | 104 | # TODO: 105 | 106 | # analogies 107 | def eval_analogies(vectors, token2id, preproc_fun, lang="en"): 108 | line_counts, full_results = [], [] 109 | 110 | for data in read_test_data(lang, "analogy"): 111 | results = [] 112 | 113 | line_tokens = setup_test_tokens(data, 4) 114 | line_tokens = [preproc_fun(t) for t in line_tokens] 115 | lines = list(zip(*line_tokens)) 116 | for tokens in lines: 117 | tokens = [to_item(x) for x in tokens] 118 | # skip over OOV 119 | if not all([x in token2id for x in tokens]): 120 | continue 121 | 122 | tokens = [token2id[x] for x in tokens] 123 | a, a_, b, b_ = tokens 124 | guesses = vectors.most_similar_vectors([a, b], [a_]) 125 | result = 1 if b_ in guesses else 0 126 | results.append(result) 127 | 128 | if len(results) == 0: 129 | print("not enough results for this dataset: ", data.name) 130 | continue 131 | 132 | sum_results = sum(results) 133 | line_counts.append(len(results)) 134 | oov = (len(lines) - len(results)) / len(lines) 135 | 136 | full_results.append( 137 | { 138 | "name": 
f"{lang}_{data.stem}", 139 | "score": sum_results, 140 | "oov": oov, 141 | "fullscore": sum_results * (1 - oov), # consider the portion of OOV 142 | } 143 | ) 144 | 145 | scores = [x['score'] for x in full_results] 146 | micro_avg = sum([x * y for x, y in zip(line_counts, scores)]) / sum( 147 | line_counts 148 | ) 149 | macro_avg = sum(scores) / len(scores) 150 | return {"micro": micro_avg, "macro": macro_avg, "results": full_results} 151 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/ws353sim.txt: -------------------------------------------------------------------------------- 1 | Tiger Katze 7.92 2 | Tiger Tiger 10 3 | Flugzeug Auto 4.92 4 | Zug Auto 5.54 5 | Fernseher Radio 5.77 6 | Medien Radio 8.15 7 | Brot Butter 5.62 8 | Gurke Kartoffel 4.92 9 | Arzt Krankenschwester 6.69 10 | Professor Doktor 6.77 11 | Student Professor 5.69 12 | klug Student 4.85 13 | klug dumm 5 14 | Vorrat Telefon 0.31 15 | Vorrat CD 0.54 16 | Vorrat Jaguar 0.23 17 | Vorrat Ei 2.15 18 | Aktie Live 1.08 19 | Aktie Leben 0.62 20 | Holz Wald 8.54 21 | Geld Bargeld 9.69 22 | Professor Gurke 0.15 23 | König Kohl 0.23 24 | König Königin 10 25 | König Turm 5.15 26 | Bischoff Rabbi 7 27 | Heilig Sex 0.69 28 | Ficken Sex 9.15 29 | Fußball Basketball 5.38 30 | Fußball Tennis 4.69 31 | Arafat Jackson 0.69 32 | Physik Chemie 7.54 33 | Weltall Chemie 3.31 34 | Wodka Gin 7.92 35 | Wodka Brandy 8.22 36 | Drink Auto 1.85 37 | Trinken Ohren 0.62 38 | Trinken Essen 7.23 39 | Säugen Mutter 7.69 40 | Auto Fahrzeug 9.19 41 | Edelstein Juwel 9.27 42 | Ausflug Reise 8.23 43 | Junge Bursche 9.27 44 | Irrenanstalt Tollhaus 8.23 45 | Magier Zauberer 9.65 46 | Ofen Herd 8.81 47 | Essen Frucht 6.77 48 | Vogel Hahn 6.08 49 | Vogel Kranich 7.46 50 | Kran Arbeitsgerät 6.08 51 | Bursche Bruder 4.23 52 | Mönch Orakel 1.38 53 | Friedhof Waldgebiet 1.92 54 | Essen Hahn 3.46 55 | Küste Hügel 2.69 56 | Wald Friedhof 3.31 57 | Ufer Waldgebiet 2.31 58 | Mönch Sklave 1.08 59 | Küste Wald 1.77 60 | Bursche Zauberer 0.77 61 | Akkord Lächeln 0.31 62 | Glas Magier 1.69 63 | Mittag Faden 0.15 64 | Hahn Reise 0.31 65 | Geld Dollar 7.92 66 | Geld Bargeld 9.19 67 | Geld Währung 8.12 68 | Geld Wäsche 3.62 69 | Tiger Jaguar 6 70 | Tiger Katze 6.92 71 | Tiger Raubtier 8 72 | Tiger Säugetier 6.58 73 | Tiger Tier 7.85 74 | Tiger Organismus 3.59 75 | Tiger Fauna 3.69 76 | Psychologie Psychiatrie 6.85 77 | Psychologie Wissenschaft 5.96 78 | Psychologie Disziplin 4.77 79 | Planet Stern 7.23 80 | Planet Mond 7.08 81 | Planet Sonne 7.08 82 | Präzedenz Beispiel 6.83 83 | Basis Information 3.32 84 | Voraussetzung Erkenntnis 3.15 85 | Beispielhaft Sammlung 1.92 86 | Vorbildlich Gruppe 2.19 87 | Vorangehend Vorausgehend 8.04 88 | Tasse Geschirr 8 89 | Tasse Gegenstand 6.08 90 | Tasse Artefakt 2 91 | Tasse Objekt 5.69 92 | Tasse Ding 5.08 93 | Tasse Essen 1.77 94 | Tasse Substanz 1.69 95 | Jaguar Katze 6.66 96 | Jaguar Auto 7.55 97 | Energie Minister 4.15 98 | Untersuchung Aufwand 4.15 99 | Mars Wasser 2.77 100 | Bild Oberfläche 3.38 101 | Zeichen Kerbe 3.92 102 | Mittwoch Nachrichten 1.38 103 | Meile Kilometer 7.62 104 | Computer Nachrichten 4.23 105 | Atmosphäre Landschaft 2.5 106 | Präsident Orden 2.77 107 | Haut Augen 4.12 108 | Japaner Amerikaner 5.73 109 | Theater Geschichte 3.85 110 | Freiwilliger Motto 0.77 111 | Vorurteil Anerkennung 3.46 112 | Jahrhundert Jahr 6.85 113 | Jahrhundert Nation 1.54 114 | Verzögerung Rassismus 1.31 115 | Verzögerung Nachrichten 1.08 116 | Frieden Plan 3.85 117 | 
Minderheit Frieden 2.23 118 | Versuch Frieden 3.46 119 | Aufmarsch Abzug 6.27 120 | Ankündigung Nachrichten 6.31 121 | Ankündigung Aufwand 1.08 122 | Zeitschrift Verein 1.77 123 | Arzt Personal 4.46 124 | Schule Zentrum 4 125 | Ursache Bluthochdruck 3.65 126 | Harvard Yale 7.85 127 | Krankenhaus Infrastruktur 5 128 | Leben Tod 8.69 129 | Leben Dauer 6.42 130 | Wort Ähnlichkeit 2.46 131 | Gremium Empfehlung 3.65 132 | Direktor Interview 2.46 133 | Frieden Stimmung 4.46 134 | Frieden Versicherung 2.15 135 | Reise Aktivität 6.31 136 | Konsument Vertrauen 3.85 137 | Konsument Energie 3.69 138 | Problem Flughafen 2.23 139 | Auto Flug 3.77 140 | Monat Hotel 0.69 141 | Art Sorte 8.81 142 | Lage Schlussfolgerung 3.69 143 | Situation Isolation 2.15 144 | Richtung Verbindung 3.65 145 | Straße Platz 5.35 146 | Straße Allee 7.73 147 | Straße Häuserblock 5.5 148 | Straße Kinder 3.73 149 | Aufzählung Nähe 0.92 150 | Zelle Telefon 6.38 151 | Herstellung Wanderung 0.69 152 | Richtwert Kennziffer 4.92 153 | Medien Handel 2.87 154 | Medien Vorteil 1.81 155 | Gewinnanteil Auszahlung 7.32 156 | Kalkulation Berechnung 9.54 157 | Ankündigung Produktion 1.65 158 | Gewinn Warnung 2.85 159 | Gewinn Verlust 7.85 160 | Dollar Yen 7.31 161 | Dollar Kohle 5.58 162 | Telefon Zubehör 4.42 163 | Fünf Monat 1.33 164 | Bericht Zuwachs 2.02 165 | Flüssigkeit Wasser 8.62 166 | Marathon Sprint 7.67 167 | Sieben Reihe 2.41 168 | Meeresfrüchte Essen 7.23 169 | Meeresfrüchte Hummer 6.81 170 | Hummer Essen 6.78 171 | Hummer Wein 4.15 172 | Beginn Jahr 4.7 173 | Beginn Partie 4.22 174 | Meisterschaft Turnier 8.05 175 | Grundsatz Versicherung 2.51 176 | Mann Frau 8.69 177 | Mann Präsident 5.46 178 | Mord Totschlag 8.62 179 | Oper Aufführung 6.38 180 | Fokus Leben 2.62 181 | Zuschauer Serie 5.38 182 | Möglichkeit Mädchen 1.65 183 | Bevölkerung Entwicklung 4.27 184 | Moral Wichtigkeit 4.65 185 | Moral Heirat 2.81 186 | Mexiko Brasil 5.23 187 | Oper Industrie 0.85 188 | Zucker Annäherung 1.46 189 | Praxis Institution 4.27 190 | Ministerium Kultur 4.38 191 | Entwicklung Ausgabe 3.15 192 | Erfahrung Musik 1.08 193 | Musik Projekt 4.04 194 | Glas Metal 3.92 195 | Aluminium Metal 7.81 196 | Möglichkeit Glaubwürdigkeit 2.69 197 | Rock Jazz 6.19 198 | Museum Theater 5.42 199 | Betrachtung Architektur 4.38 200 | Regen Gewitter 7.85 201 | Architektur Jahrhundert 3.08 202 | -------------------------------------------------------------------------------- /hyperhyper/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | store and retrieve experiment results in a database 3 | """ 4 | 5 | import time 6 | 7 | import sqlalchemy 8 | 9 | from .pair_counts import default_pair_args 10 | 11 | 12 | def flatten_dict(prefix, dict): 13 | """ 14 | flatten a dict Django-style 15 | """ 16 | for k, v in dict.items(): 17 | yield {f"{prefix}__{k}": v} 18 | 19 | 20 | def record(func): 21 | """ 22 | record the evaluation of an embedding in a database 23 | """ 24 | 25 | def wrapper(*args, **kwargs): 26 | results = func(*args, **kwargs) 27 | 28 | if not "pair_args" in kwargs: 29 | kwargs["pair_args"] = default_pair_args 30 | 31 | if "evaluate" in kwargs and not kwargs["evaluate"]: 32 | return results 33 | 34 | if len(results) > 1: 35 | db_dic = {} 36 | # params to dict 37 | db_dic.update({"method": func.__name__}) 38 | for k, v in kwargs.items(): 39 | if type(v) is dict: 40 | if k == "pair_args": 41 | # merge with default arguments of pair counts 42 | v = {**default_pair_args, **v} 43 | for x in flatten_dict(k, v): 
44 | db_dic.update(x) 45 | else: 46 | db_dic.update({k: v}) 47 | # results to dicts 48 | db_dic.update({"micro_results": results[1]["micro"]}) 49 | db_dic.update({"macro_results": results[1]["macro"]}) 50 | for r in results[1]["results"]: 51 | db_dic.update({f"{r['name']}_score": r["score"]}) 52 | db_dic.update({f"{r['name']}_oov": r["oov"]}) 53 | db_dic.update({f"{r['name']}_fullscore": r["fullscore"]}) 54 | 55 | # Couldn't figure out the timeout param for datasets so keep retrying. 56 | while True: 57 | try: 58 | # args[0] is self 59 | db = args[0].get_db() 60 | table = db["experiments"] 61 | # specify type because dataset guesses them sometimes wrongly 62 | # ensure that rows are not duplicated. This may happen, if the same function is called multiple times. 63 | table.insert_ignore( 64 | db_dic, 65 | db_dic.keys(), 66 | types={ 67 | k: sqlalchemy.types.String 68 | if type(v) is str 69 | else sqlalchemy.types.Float 70 | for k, v in db_dic.items() 71 | }, 72 | ) 73 | break 74 | except Exception as e: 75 | print(e) 76 | time.sleep(10) 77 | return results 78 | 79 | return wrapper 80 | 81 | 82 | def results_from_db(db, query={}, order="micro_results desc", limit=100): 83 | """ 84 | retrieve (the best) results from a database 85 | """ 86 | where = [] 87 | for k, v in query.items(): 88 | if type(v) is dict: 89 | for fkfv in flatten_dict(k, v): 90 | # ugly 91 | for fk, fv in fkfv.items(): 92 | where.append(f"{fk}={fv}") 93 | else: 94 | where.append(f"{k}={v}") 95 | if len(where) > 0: 96 | where = "where " + " and ".join(where) 97 | else: 98 | where = "" 99 | 100 | if order is None: 101 | order = "" 102 | if len(order) > 0: 103 | order = f"order by {order}" 104 | 105 | if limit is None: 106 | limit = "" 107 | else: 108 | limit = f"limit {limit}" 109 | 110 | query_string = f"select distinct * from experiments {where} {order} {limit}" 111 | return list(db.query(query_string)) 112 | 113 | 114 | # TODO 115 | # def get_embedding_from_params(row): 116 | # pair_args = {} 117 | # args = {} 118 | # for k, v in row.items(): 119 | # k_parts = k.split("__") 120 | # if len(k_parts) > 1: 121 | # pair_args[k_parts[1]] = v 122 | # else: 123 | # arg[k] = v 124 | 125 | # for best in list(db.query(statement)): 126 | # oov = True if best["pair_args__delete_oov"] == 1 else False 127 | # window = int(best["pair_args__window"]) 128 | # if not isinstance(window, int): 129 | # window = int.from_bytes(window, "little") 130 | # neg = float(best["neg"]) 131 | # if neg.is_integer(): 132 | # neg = int(neg) 133 | # dim = int(best["dim"]) 134 | 135 | # print(oov, best) 136 | # try: 137 | # print(best["neg"]) 138 | # kv, res = b.svd( 139 | # impl="scipy", 140 | # evaluate=True, 141 | # pair_args={ 142 | # "subsample": "deter", 143 | # "subsample_factor": best["pair_args__subsample_factor"], 144 | # "delete_oov": True, 145 | # "decay_rate": best["pair_args__decay_rate"], 146 | # "window": window, 147 | # "dynamic_window": "decay", 148 | # }, 149 | # neg=neg, 150 | # eig=best["eig"], 151 | # dim=dim, 152 | # keyed_vector=True, 153 | # ) 154 | # print(res) 155 | # print(best) 156 | # except Exception as e: 157 | # print(e) 158 | # return kv 159 | 160 | 161 | # def get_best(db, query): 162 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/ws353_relatedness.txt: -------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 
5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 
5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/schm280.txt: -------------------------------------------------------------------------------- 1 | psychologie geist 7.2 2 | fünf monat 3.2 3 | planet galaxie 8.3 4 | wodka gin 7.8 5 | tiger katze 8.2 6 | auto flug 6.1 7 | ankunft hotel 6.2 8 | könig königin 9.4 9 | seife oper 5.1 10 | brühe ei 2.9 11 | essen vorbereitung 4.9 12 | trinken essen 8.1 13 | ankündigung aufwand 1.5 14 | sommer natur 5.4 15 | film star 6.9 16 | psychologie klinik 7.1 17 | spiel sieg 6.5 18 | situation isolation 1.8 19 | psychologie depression 7.6 20 | gouverneur interview 3.6 21 | 
bischof rabbi 6.6 22 | kredit information 2.7 23 | frieden versicherung 1.9 24 | fruchtbarkeit ei 6.5 25 | dollar verlust 5.5 26 | schlaganfall krankenhaus 7.2 27 | vorhergend information 2.4 28 | gewinn verlust 8.2 29 | opec land 4.4 30 | pokal artefakt 4.4 31 | kalkulation berechnung 8.3 32 | psychologie wissenschaft 7.5 33 | heilig sex 1.4 34 | vogel kranich 8.3 35 | psychologie gesundheit 6.4 36 | meile kilometer 8.9 37 | bank geld 7.9 38 | wodka brandy 7.7 39 | boxen runde 6.2 40 | video archiv 4.8 41 | edelstein juwel 8.1 42 | bericht gewinn 2.4 43 | geld abhebung 5.8 44 | oper aufführung 7.4 45 | aktie markt 6.4 46 | straße kinder 3.4 47 | leben tod 8.4 48 | luxus auto 5.2 49 | gurke kartoffel 6 50 | medien radio 7.1 51 | meeresfrüchte meer 7.3 52 | planet sonne 8.3 53 | fußball basketball 7.1 54 | magier zauberer 9.3 55 | fbi fingerabdruck 6 56 | mittag mittagsstunde 7.1 57 | netzwerk hardware 6.3 58 | bild oberfläche 3.5 59 | professor gurke 0.8 60 | mittwoch nachrichten 1.3 61 | öl aktie 4.3 62 | zucker ansatz 0.5 63 | tiger tierwelt 6.5 64 | leben dauer 4.7 65 | tiger zoo 7.2 66 | atmosphäre landschaft 3.3 67 | verbraucher energie 6.3 68 | energie labor 4.3 69 | ficken sex 9.1 70 | nachrichten bericht 7.8 71 | präzedenzfall beispiel 5.7 72 | computer tastatur 7.1 73 | problem flughafen 2.4 74 | buch bücherei 8.1 75 | krieg truppen 7.6 76 | spiel serie 2.8 77 | tasse nahrung 3.9 78 | psychologie angst 7.4 79 | psychologie freud 8.2 80 | spiel mannschaft 7.4 81 | beobachtung architektur 2 82 | arafat frieden 5.4 83 | produktion erhöhung 4.2 84 | schlucht landschaft 5.8 85 | straße häuserblock 5.7 86 | fußball tennis 7.3 87 | computer programm 7.8 88 | computer nachrichten 3.1 89 | haut auge 5.1 90 | waffen geheimnis 3.1 91 | zeichen pause 1.9 92 | krankenhaus infrastruktur 4 93 | tasse kaffee 7.5 94 | liebe sex 7.7 95 | währung markt 5.7 96 | küste ufer 8.1 97 | umwelt ökologie 8.2 98 | tag dämmerung 7.3 99 | kredit karte 6.1 100 | verzögerung rassismus 1.3 101 | tasse flüssig 6.1 102 | problem herausforderung 7.1 103 | geld bargeld 8.9 104 | ministerium kultur 6 105 | wort ähnlichkeit 1.7 106 | geld bank 7.9 107 | oper industrie 1.8 108 | tiger katze 8 109 | regierung krise 6.3 110 | glas zauberer 1.4 111 | trinken auto 1 112 | reise auto 6.4 113 | mittelpunkt leben 3.6 114 | flugzeug auto 7.4 115 | wald friedhof 3.2 116 | psychologie arzt 6 117 | tasse gegenstand 7.1 118 | minderheit frieden 3.4 119 | natur umwelt 8.1 120 | planet stern 8.5 121 | zuschauer serie 6.3 122 | akkord lächeln 0.6 123 | präzedenzfall gruppe 1 124 | fernsehen radio 7.8 125 | museum theater 7.3 126 | opfer notfall 6.3 127 | lebensmittelgeschäft geld 6.2 128 | straße allee 8.3 129 | tasse einheit 2.1 130 | küste wald 4.2 131 | musik projekt 3.2 132 | geld eigentum 5.8 133 | medien handel 1.9 134 | gouverneur amt 5.8 135 | ausrüstung hersteller 4.5 136 | dividend berechnung 6 137 | tiger tiger 10 138 | minister partei 7.5 139 | anfang jahr 5.2 140 | tag sommer 4.2 141 | dollar dollar 9.9 142 | entdeckung weltraum 6.2 143 | film popcorn 7 144 | getränk ohr 0.6 145 | holz wald 7.8 146 | professor doktor 7.5 147 | firma aktie 6.1 148 | geld besitz 6.8 149 | spiel runde 6.5 150 | mars wasser 4.1 151 | freiwilliger motto 0.7 152 | anwalt beweis 6 153 | hahn reise 0.2 154 | linie versicherung 0.8 155 | konzert virtuose 5.8 156 | spiel niederlage 6.8 157 | frieden atmosphäre 3.8 158 | zelle telefon 6.6 159 | geld währung 8.6 160 | film kino 8.4 161 | zug auto 7.6 162 | aluminium metall 8.4 163 | computer internet 
8.3 164 | geld bargeld 9 165 | ausschuss empfehlung 4.3 166 | küste hügel 5.3 167 | fernsehen film 7.8 168 | tasse substanz 3.2 169 | rock jazz 7 170 | arafat jackson 2.3 171 | meisterschaft turnier 8.5 172 | mönch sklave 1.4 173 | fbi ermittlung 6.8 174 | weltraum chemie 4 175 | familie planung 5.5 176 | jahrhundert nation 3.2 177 | mars wissenschaftler 5.3 178 | psychologie wahrnehmung 7 179 | jaguar auto 7.5 180 | film kritiker 6.3 181 | maradona fußball 7.7 182 | richtung kombination 2.4 183 | tasse getränk 7.1 184 | planet konstellation 5.8 185 | profit warnung 2.1 186 | jerusalem israel 7.7 187 | hummer wein 3.6 188 | schrank kleidung 7.6 189 | arafat terror 6.3 190 | sommer dürre 6.3 191 | dollar yen 8.5 192 | planet astronom 7.6 193 | medien gewinn 2.5 194 | buch papier 6.6 195 | tod insasse 3 196 | tod reihe 1.7 197 | auszeichnung tapferkeit 5.7 198 | physik chemie 8.2 199 | hotel reservierung 6.9 200 | meeresfrüchte essen 7.1 201 | typ art 7.9 202 | frieden plan 4.2 203 | bevölkerung entwicklung 5.1 204 | straße ort 6 205 | tiger tier 7.8 206 | flüssigkeit wasser 8.4 207 | haftung versicherung 7 208 | küste wald 5.4 209 | mann gouverneur 6.1 210 | gebiet kilometer 4.2 211 | marathon spring 3.9 212 | glas metall 6.1 213 | energie minister 4.2 214 | mexiko brasilien 7 215 | baseball saison 5.5 216 | geld reichtum 7.8 217 | meeresfrüchte hummer 7.6 218 | arzt krankenschwester 8.2 219 | baby mutter 8 220 | japanisch amerikanisch 7.6 221 | psychologie psychiatrie 8 222 | behandlung genesung 7.9 223 | energie krise 4.8 224 | präzedenzfall recht 4 225 | monat hotel 1.3 226 | schlau dumm 8.4 227 | telefon kommunikation 7.9 228 | jerusalem palästinenser 7.5 229 | tiger jaguar 8.7 230 | jahrhundert jahr 7.9 231 | theater geschichte 3.1 232 | wetter vorhersage 6.7 233 | moral wichtigkeit 3.7 234 | reisen aktivität 5.3 235 | geld dollar 8 236 | geschlecht gleichheit 4.1 237 | ausstellung erinnerungsstück 3.3 238 | planet mond 7.9 239 | mönch orakel 4.7 240 | behinderung tod 3.5 241 | chance glaubwürdigkeit 1.4 242 | bruder mönch 5 243 | grund kriterium 6.1 244 | alkohol chemie 7.1 245 | wasser leck 5.3 246 | mord totschlag 7.9 247 | planet weltraum 8.1 248 | könig turm 5.3 249 | präsident medaille 2.9 250 | psychologie angst 7.3 251 | jaguar katze 8.1 252 | opec öl 7.3 253 | student professor 7.1 254 | tasse geschirr 7.6 255 | ankündigung neuigkeit 8.1 256 | getränk mutter 0.8 257 | mann frau 8.6 258 | psychologie disziplin 5.6 259 | hummer essen 6.5 260 | vogel hahn 8 261 | telefon ausrüstung 3.7 262 | leben lektion 3.4 263 | tiger säugetier 8.1 264 | erfahrung musik 2.7 265 | droge missbrauch 6.3 266 | möglichkeit mädchen 1.2 267 | dollar gewinn 5.8 268 | gesetz rechtsanwalt 6.9 269 | schule zentrum 2.6 270 | sekretär senat 3.4 271 | planet menschen 5.2 272 | tiger organismus 5.2 273 | physik proton 7.9 274 | harvard yale 7.7 275 | tennis schläger 7 276 | tiger fleischfresser 7.1 277 | fußball fußball 9.9 278 | architektur jahrhundert 4 279 | hundert prozent 6.4 280 | brot butter 6.7 281 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/ws353rel.txt: -------------------------------------------------------------------------------- 1 | Liebe Sex 8.46 2 | Buch Papier 7.08 3 | Computer Tastatur 8 4 | Computer Internet 8.08 5 | Telefon Kommunikation 8.38 6 | Drogen Mißbrauch 6.46 7 | klug Student 4.85 8 | Unternehmen Aktie 6.54 9 | Aktie Börse 8.85 10 | Vorrat Telefon 0.31 11 | Vorrat CD 0.54 12 | Vorrat Jaguar 0.23 13 | 
Vorrat Ei 2.15 14 | Fruchtbarkeit Ei 7.92 15 | Aktie Live 1.08 16 | Aktie Leben 0.62 17 | Buch Bibliothek 8.31 18 | Bank Geld 8.15 19 | Professor Gurke 0.15 20 | König Kohl 0.23 21 | Jerusalem Israel 8.85 22 | Jerusalem Palestinensisch 6.85 23 | Heilig Sex 0.69 24 | Maradona Fußball 8 25 | Tennis Schläger 7.08 26 | Arafat Frieden 2.46 27 | Arafat Terror 5.23 28 | Gesetz Anwalt 8.38 29 | Film Star 7.62 30 | Film Popcorn 6.08 31 | Film Kritik 5.85 32 | Kino Theater 6.85 33 | Physik Proton 7 34 | Weltall Chemie 3.31 35 | Alkohol Chemie 5.08 36 | Drink Auto 1.85 37 | Trinken Ohren 0.62 38 | Trinken Mund 6.46 39 | Baby Mutter 7.85 40 | Säugen Mutter 7.69 41 | Werkzeug Arbeitsgerät 8.38 42 | Bruder Mönch 5.92 43 | Kran Arbeitsgerät 6.08 44 | Bursche Bruder 4.23 45 | Fahrt Auto 6.62 46 | Mönch Orakel 1.38 47 | Friedhof Waldgebiet 1.92 48 | Küste Hügel 2.69 49 | Wald Friedhof 3.31 50 | Ufer Waldgebiet 2.31 51 | Mönch Sklave 1.08 52 | Küste Wald 1.77 53 | Bursche Zauberer 0.77 54 | Akkord Lächeln 0.31 55 | Glas Magier 1.69 56 | Mittag Faden 0.15 57 | Hahn Reise 0.31 58 | Geld Reichtum 8.19 59 | Geld Eigentum 6.62 60 | Geld Besitz 6.92 61 | Geld Bank 8.31 62 | Geld Pfand 5.13 63 | Geld Einzahlung 6.23 64 | Geld Abheben 6.54 65 | Geld Wäsche 3.62 66 | Tiger Zoo 5.91 67 | Psychologie Beklemmung 4.35 68 | Psychologie Angst 4.92 69 | Psychologie Depression 6.77 70 | Psychologie Klinik 6.17 71 | Psychologie Arzt 5.85 72 | Psychologie Freud 7 73 | Psychologie Seele 5.88 74 | Psychologie Gesundheit 5.11 75 | Psychologie Erkenntnis 4.92 76 | Planet Konstellation 6.23 77 | Planet Galaxie 7.08 78 | Planet Weltraum 7.08 79 | Planet Astronom 6.38 80 | Basis Information 3.32 81 | Voraussetzung Erkenntnis 3.15 82 | Präzedensfall Gesetz 5.62 83 | Beispielhaft Sammlung 1.92 84 | Vorbildlich Gruppe 2.19 85 | Tasse Kaffee 7.21 86 | Tasse Gegenstand 6.08 87 | Tasse Trinken 7.62 88 | Tasse Essen 1.77 89 | Tasse Substanz 1.69 90 | Tasse Flüssigkeit 5.47 91 | Energie Minister 4.15 92 | Minister Senat 5.96 93 | Energie Labor 3.23 94 | Computer Labor 4.31 95 | Waffe Geheimnis 1.85 96 | Polizei Fingerabdruck 6.23 97 | Polizei Ermittlung 7.27 98 | Untersuchung Aufwand 4.15 99 | Mars Wasser 2.77 100 | Mars Wissenschaftler 5.54 101 | Nachrichten Bericht 7.85 102 | Schlucht Landschaft 6.54 103 | Bild Oberfläche 3.38 104 | Entdeckung Weltall 4.77 105 | Wasser Leck 5.81 106 | Zeichen Kerbe 3.92 107 | Mittwoch Nachrichten 1.38 108 | Computer Nachrichten 4.23 109 | Gebiet Oberfläche 3.77 110 | Atmosphäre Landschaft 2.5 111 | Präsident Orden 2.77 112 | Krieg Truppen 6.81 113 | Rekord Nummer 2.77 114 | Theater Geschichte 3.85 115 | Freiwilliger Motto 0.77 116 | Vorurteil Anerkennung 3.46 117 | Auszeichnung Tapferkeit 5.92 118 | Jahrhundert Nation 1.54 119 | Verzögerung Rassismus 1.31 120 | Verzögerung Nachrichten 1.08 121 | Minister Partei 7.38 122 | Frieden Plan 3.85 123 | Minderheit Frieden 2.23 124 | Versuch Frieden 3.46 125 | Regierung Krise 5.65 126 | Aufmarsch Abzug 6.27 127 | Aufmarsch Rückzug 6.81 128 | Energie Krise 4.77 129 | Ankündigung Aufwand 1.08 130 | Schlaganfall Krankenhaus 6.88 131 | Behinderung Tod 2.42 132 | Opfer Notfall 6.69 133 | Behandlung Erholung 5.46 134 | Zeitschrift Verein 1.77 135 | Arzt Verantwortung 6.65 136 | Haftung Versicherung 7.62 137 | Schule Zentrum 4 138 | Ursache Bluthochdruck 3.65 139 | Ursache Kriterium 3.69 140 | Hundert Prozent 6.92 141 | Tod Trakt 2.46 142 | Tod Insasse 2.38 143 | Rechtsanwalt Beweis 6.04 144 | Leben Dauer 6.42 145 | Wort Ähnlichkeit 2.46 146 | Gremium Empfehlung 3.65 147 
| Direktor Interview 2.46 148 | OPEC Staat 3.92 149 | Frieden Stimmung 4.46 150 | Frieden Versicherung 2.15 151 | Gelände Kilometer 3.46 152 | Wettbewerb Preis 7.73 153 | Konsument Vertrauen 3.85 154 | Konsument Energie 3.69 155 | Problem Flughafen 2.23 156 | Auto Flug 3.77 157 | Kredit Karte 6.31 158 | Vertrauen Information 4.77 159 | Hotel Reservierung 6.81 160 | Lebensmittel Geld 4.88 161 | Registrierung Abmachung 2.69 162 | Vereinbarung Unterkunft 2 163 | Monat Hotel 0.69 164 | Ankunft Hotel 4.54 165 | Bett Schrank 6.15 166 | Schrank Kleider 7.5 167 | Lage Schlussfolgerung 3.69 168 | Situation Isolation 2.15 169 | Unparteilichkeit Interesse 2.5 170 | Richtung Verbindung 3.65 171 | Straße Kinder 3.73 172 | Aufzählung Nähe 0.92 173 | Liste Kategorie 5.85 174 | Herstellung Wanderung 0.69 175 | Richtwert Kennziffer 4.92 176 | Medien Handel 2.87 177 | Medien Vorteil 1.81 178 | Gewinnanteil Kalkulation 6.15 179 | Währung Markt 6.03 180 | OPEC Öl 7.65 181 | Öl Aktie 5.14 182 | Ankündigung Produktion 1.65 183 | Ankündigung Warnung 6.06 184 | Gewinn Warnung 2.85 185 | Dollar Gewinn 5.25 186 | Dollar Verlust 5.27 187 | Computer Software 8.35 188 | Netzwerk Hardware 6.88 189 | Zubehör Hersteller 4.43 190 | Luxus Auto 5.14 191 | Fünf Monat 1.33 192 | Bericht Zuwachs 2.02 193 | Investor Einkommen 4.05 194 | Baseball Saison 4.98 195 | Spiel Sieg 7.08 196 | Spiel Mannschaft 6.88 197 | Spiel Serie 4.85 198 | Spiel Niederlage 6.77 199 | Sieben Reihe 2.41 200 | Meeresfrüchte Meer 7.32 201 | Essen Vorbereitung 5.58 202 | Video Archiv 5.06 203 | Beginn Jahr 4.7 204 | Beginn Partie 4.22 205 | Spiel Runde 6.33 206 | Boxen Runde 7.35 207 | Kämpfen Besiegen 7.41 208 | Grundsatz Versicherung 2.51 209 | Tag Sommer 3.33 210 | Sommer Dürre 6.04 211 | Sommer Natur 5.37 212 | Tag Dämmerung 5.97 213 | Natur Umwelt 8 214 | Umwelt Nachhaltigkeit 6.54 215 | Natur Mensch 6.31 216 | Seife Oper 2.85 217 | Leben Lektion 4.54 218 | Fokus Leben 2.62 219 | Herstellung Belegschaft 3.23 220 | Fernsehen Film 7.31 221 | Liebhaber Streit 4.08 222 | Zuschauer Serie 5.38 223 | Möglichkeit Mädchen 1.65 224 | Bevölkerung Entwicklung 4.27 225 | Moral Wichtigkeit 4.65 226 | Moral Heirat 2.81 227 | Geschlecht Gleichheit 3.81 228 | Änderung Einstellung 4.15 229 | Familie Planung 5.62 230 | Oper Industrie 0.85 231 | Zucker Annäherung 1.46 232 | Praxis Institution 4.27 233 | Ministerium Kultur 4.38 234 | Problem Herausforderung 6.08 235 | Größe Prominenz 5.46 236 | Staat Bürger 6.77 237 | Planet Menschen 5.62 238 | Entwicklung Ausgabe 3.15 239 | Erfahrung Musik 1.08 240 | Musik Projekt 4.04 241 | Möglichkeit Glaubwürdigkeit 2.69 242 | Ausstellungsstück Erinnerungsstück 4.38 243 | Konzert virtuos 4.46 244 | Betrachtung Architektur 4.38 245 | Weltraum Erde 6.46 246 | Erhaltung Welt 4.31 247 | Einlass Eintritt 8.08 248 | Regen Flut 7.27 249 | Wetter Vorhersage 6.46 250 | Katastrophe Gebiet 4.27 251 | Präsident Büro 3.42 252 | Architektur Jahrhundert 3.08 253 | -------------------------------------------------------------------------------- /tests/test_bunch.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | import hyperhyper 7 | 8 | 9 | @pytest.fixture() 10 | def corpus(): 11 | some_text1 = """ 12 | The English Wikipedia is the English-language edition of the free online encyclopedia Wikipedia. 
Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 13 | 14 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 15 | """ 16 | 17 | some_text2 = """ 18 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 19 | 20 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 21 | 22 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 23 | 24 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 
25 | """ 26 | 27 | texts = [some_text1, some_text2] 28 | c = hyperhyper.Corpus.from_texts(texts) 29 | return c 30 | 31 | 32 | def test_bunch(corpus): 33 | bunch = hyperhyper.Bunch("test_bunch", corpus, force_overwrite=True) 34 | pmi_matrix, _ = bunch.pmi() 35 | bunch.eval_sim(pmi_matrix) 36 | 37 | bunch.eval_analogy(pmi_matrix) 38 | 39 | # testing the evaluation of pmi 40 | english_idx = corpus.vocab.token2id["english"] 41 | wikipedia_idx = corpus.vocab.token2id["wikipedia"] 42 | for sim, token_idx in pmi_matrix.most_similar(english_idx): 43 | assert pmi_matrix.similarity(english_idx, token_idx) == pmi_matrix.similarity(token_idx, english_idx) 44 | assert pmi_matrix.similarity(english_idx, token_idx) == sim 45 | 46 | pmi_matrix.most_similar_vectors([english_idx], [wikipedia_idx]) 47 | 48 | svd_matrix, _ = bunch.svd(dim=2) 49 | 50 | # testing the evaluation of svd 51 | english_idx = corpus.vocab.token2id["english"] 52 | for sim, token_idx in svd_matrix.most_similar(english_idx): 53 | assert svd_matrix.similarity(english_idx, token_idx) == svd_matrix.similarity(token_idx, english_idx) 54 | assert svd_matrix.similarity(english_idx, token_idx) == sim 55 | 56 | svd_matrix, _ = bunch.svd(dim=2, keyed_vectors=True) 57 | svd_matrix = bunch.svd(dim=3, keyed_vectors=True, evaluate=False) 58 | 59 | # `most_similar` comes from gensim's keyedvectors 60 | svd_matrix.most_similar("english") 61 | 62 | assert pmi_matrix.m.count_nonzero() > 0 63 | 64 | 65 | def test_db_query(corpus): 66 | bunch = hyperhyper.Bunch("test_bunch", corpus, force_overwrite=True) 67 | bunch.svd(dim=2) 68 | res = bunch.results(query={"dim": 2, "pair_args": {"window": 2}}) 69 | print(res) 70 | 71 | 72 | def test_bunch_text_files(): 73 | some_text1 = """ 74 | The English Wikipedia is the English-language edition of the free online encyclopedia Wikipedia. Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 75 | 76 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 77 | """ 78 | 79 | some_text2 = """ 80 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 81 | 82 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 
83 | 84 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 85 | 86 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 87 | """ 88 | 89 | texts = [some_text1, some_text2] 90 | # setup 91 | test_dir = tempfile.mkdtemp() 92 | for i, t in enumerate(texts): 93 | Path(test_dir + f"/{i}.txt").write_text(t) 94 | # test 95 | corpus = hyperhyper.Corpus.from_text_files(test_dir) 96 | bunch = hyperhyper.Bunch("test_bunch", corpus, force_overwrite=True) 97 | 98 | pmi_matrix, _ = bunch.pmi() 99 | bunch.eval_sim(pmi_matrix) 100 | svd_matrix, _ = bunch.svd(dim=2) 101 | svd_matrix, _ = bunch.svd(dim=2, keyed_vectors=True) 102 | svd_matrix = bunch.svd(dim=2, keyed_vectors=True, evaluate=False) 103 | 104 | print(svd_matrix.most_similar("english")) 105 | 106 | assert pmi_matrix.m.count_nonzero() > 0 107 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/radinsky_mturk.txt: -------------------------------------------------------------------------------- 1 | episcopal russia 2.75 2 | water shortage 2.714285714 3 | horse wedding 2.266666667 4 | plays losses 3.2 5 | classics advertiser 2.25 6 | latin credit 2.0625 7 | ship ballots 2.3125 8 | mistake error 4.352941176 9 | disease plague 4.117647059 10 | sake shade 2.529411765 11 | saints observatory 1.9375 12 | treaty wheat 1.8125 13 | texas death 1.533333333 14 | republicans challenge 2.3125 15 | body peaceful 2.058823529 16 | admiralty intensity 2.647058824 17 | body improving 2.117647059 18 | heroin marijuana 3.375 19 | scottish commuters 2.6875 20 | apollo myth 2.6 21 | film cautious 2.125 22 | exhibition art 4.117647059 23 | chocolate candy 3.764705882 24 | republic candidate 2.8125 25 | gospel church 4.0625 26 | momentum desirable 2.4 27 | singapore sanctions 2.117647059 28 | english french 3.823529412 29 | exile church 2.941176471 30 | navy coordinator 2.235294118 31 | adventure flood 2.4375 32 | radar plane 3.235294118 33 | pacific ocean 4.266666667 34 | scotch liquor 4.571428571 35 | kennedy gun 3 36 | garfield cat 2.866666667 37 | scale budget 3.5 38 | rhythm blues 3.071428571 39 | rich privileges 3.2 40 | navy withdrawn 1.571428571 41 | marble marching 2.615384615 42 | polo charged 2.125 43 | mark missing 2.333333333 44 | battleship army 4.235294118 45 | medium organization 2.5625 46 | pennsylvania writer 1.466666667 47 | hamlet poet 3.882352941 48 | battle prisoners 3.705882353 49 | guild smith 2.75 50 | mud soil 4.235294118 51 | crime assaulted 3.941176471 52 | mussolini stability 2.133333333 53 | lincoln division 2.4375 54 | slaves insured 2.2 55 | summer winter 4.375 56 | integration dignity 3.058823529 57 | money quota 2.5 58 | honolulu vacation 3.6875 59 | libya forged 2.461538462 60 | cheers musician 2.823529412 61 | session surprises 1.8125 62 | billion campaigning 2.571428571 63 | perjury soybean 2.0625 64 | forswearing perjury 3.3125 65 | costume halloween 3.4375 66 | bulgarian nurses 1.941176471 67 | costume ultimate 2.5 68 | faith judging 2.235294118 69 | france bridges 2.235294118 70 | citizenship casey 2.2 71 | recreation dish 1.4 72 | intelligence troubles 1.625 73 | germany worst 
1.4375 74 | chaos death 2.75 75 | sydney hancock 2.857142857 76 | sabbath stevenson 2.214285714 77 | espionage passport 2.3125 78 | political today 1.6875 79 | pipe convertible 2 80 | scouting demonstrate 2.5625 81 | salute patterns 2.235294118 82 | reichstag germany 2.285714286 83 | radiation costumes 1.5625 84 | horace grief 1.764705882 85 | sale rental 3.470588235 86 | open close 4.058823529 87 | photography proving 2.375 88 | propaganda germany 1.705882353 89 | assassination forbes 2.071428571 90 | mirror duel 1.928571429 91 | probability hanging 2.058823529 92 | africa theater 1.5 93 | hell heaven 4.117647059 94 | mussolini italy 3 95 | composer beethoven 3.647058824 96 | minister forthcoming 1.764705882 97 | brussels sweden 3.176470588 98 | neutral parish 1.6 99 | emotion taxation 1.733333333 100 | louisiana simple 2 101 | quarantine disease 3 102 | cannon imprisoned 2.625 103 | bronze suspicion 2 104 | pearl interim 2.352941176 105 | artist paint 4.117647059 106 | relay family 2.0625 107 | art mortality 2.294117647 108 | food investment 2.25 109 | alt tenor 2.692307692 110 | catholics protestant 3.5625 111 | militia landlord 3.0625 112 | battle warships 4.176470588 113 | alcohol fleeing 2.5625 114 | coil ashes 3.117647059 115 | poland russia 4 116 | explosive builders 2.4375 117 | aeronautics plane 4.277777778 118 | charge sentence 3.133333333 119 | pet retiring 2 120 | drink alcohol 4.352941176 121 | stability species 2.375 122 | colonies depression 2 123 | easter preference 2.0625 124 | genius intellect 4.090909091 125 | diamond killed 1.555555556 126 | slavery african 2.8 127 | jurisdiction law 4.454545455 128 | saints repeal 1.555555556 129 | conspiracy campaign 2.166666667 130 | operator extracts 2.214285714 131 | physician action 2.153846154 132 | electronics guess 1.916666667 133 | slavery diamond 2.285714286 134 | quarterback sport 3.142857143 135 | assassination killed 4.285714286 136 | slavery klan 2.230769231 137 | heroin shoot 2.692307692 138 | birds disturbances 1.692307692 139 | palestinians turks 2.5 140 | citizenship court 2.5 141 | immunity violation 2.076923077 142 | alternative contend 2.461538462 143 | chile plates 2.692307692 144 | abraham stranger 1.846153846 145 | kansas city 3.769230769 146 | month year 3.857142857 147 | month day 3.857142857 148 | amateur actor 2.333333333 149 | afghanistan war 3.384615385 150 | transmission maxwell 2.25 151 | manchester ambitious 1.923076923 152 | program battered 1.928571429 153 | drawing music 2.583333333 154 | exile pledges 2.307692308 155 | adventure sixteen 1.538461538 156 | exile threats 2.166666667 157 | concrete wings 1.428571429 158 | seizure bishops 2 159 | submarine sea 3.857142857 160 | villa mayor 2.25 161 | trade farley 2.375 162 | nature forest 3.636363636 163 | chronicle young 1.9 164 | radical bishops 1.818181818 165 | pakistan radical 2.875 166 | fire water 4.266666667 167 | gossip nuisance 3.0625 168 | con examiner 2.266666667 169 | satellite space 3.75 170 | essay boston 2 171 | miniature statue 3.6 172 | spill pollution 3.5 173 | minister council 3.5625 174 | landscape mountain 3.5625 175 | religion remedy 2.5625 176 | ship storm 3.5 177 | college scientist 2.8125 178 | crystal oldest 2.5625 179 | afghanistan wise 2.066666667 180 | trinity religion 3.133333333 181 | homer odyssey 2.857142857 182 | parish clue 2.4375 183 | actress actor 4.0625 184 | patent professionals 2.375 185 | chaos horrible 3.066666667 186 | acre earthquake 2.125 187 | goverment immunity 2 188 | football justice 1.8 189 | gambling 
money 3.75 190 | corruption nervous 1.875 191 | cardinals villages 2.375 192 | life death 4.103448276 193 | artillery sanctions 2.428571429 194 | jerusalem murdered 2.357142857 195 | cell brick 3.285714286 196 | knowledge promoter 2.642857143 197 | adventure rails 2.571428571 198 | houston crash 2.357142857 199 | oxford subcommittee 2.642857143 200 | militia weapon 3.785714286 201 | manufacturer meat 1.857142857 202 | damages reaction 3.071428571 203 | sea fishing 4.357142857 204 | atomic clash 2.785714286 205 | broadcasting athletics 3 206 | mystery expedition 2.538461538 207 | kremlin soviets 3.166666667 208 | pig blaze 1.75 209 | riverside vietnamese 2.25 210 | bitter protective 1.923076923 211 | disaster announced 2.384615385 212 | pork blaze 2.230769231 213 | feet international 1.916666667 214 | radical uniform 2.5 215 | gossip condemned 2.692307692 216 | mozart wagner 3.166666667 217 | soccer boxing 3.4 218 | radical roles 2.75 219 | rescued slaying 3 220 | researchers tested 3.538461538 221 | sales season 2.307692308 222 | homeless refugees 3.615384615 223 | pakistan repair 1.75 224 | athens painting 2.294117647 225 | tiger woods 3.375 226 | aircraft plane 4.473684211 227 | solar carbon 2.842105263 228 | enterprise bankruptcy 2.5 229 | homer springfield 2.833333333 230 | coin awards 2.166666667 231 | rhodes native 2.25 232 | soccer curator 2.125 233 | gasoline stock 2.888888889 234 | guilt extended 2.105263158 235 | rapid singapore 1.764705882 236 | coin banker 3.631578947 237 | london correspondence 1.944444444 238 | pop sex 2.6 239 | medicine bread 2.176470588 240 | asia animal 1.555555556 241 | pop clubhouse 3.210526316 242 | nazi defensive 2.055555556 243 | earth poles 3.421052632 244 | thailand crowded 2.166666667 245 | day independence 3.473684211 246 | controversy pitch 2.375 247 | stock gasoline 3.166666667 248 | composers mozart 3.833333333 249 | tone piano 3.722222222 250 | paris chef 2.111111111 251 | profession responsible 2.722222222 252 | bankruptcy chronicle 2 253 | lebanon war 2.722222222 254 | israel terror 3.055555556 255 | angola military 2.941176471 256 | chemistry patients 2.357142857 257 | munich constitution 3.071428571 258 | piano theater 3.266666667 259 | poetry artist 3.8 260 | acre burned 1.769230769 261 | religion abortion 2.076923077 262 | jazz music 4.533333333 263 | government transportation 3 264 | color wine 2.533333333 265 | jackson quota 1.692307692 266 | shariff deputy 3.642857143 267 | boat negroes 2 268 | shooting sentenced 2.933333333 269 | republicans friedman 2.416666667 270 | politics brokerage 2.5 271 | russian stalin 3.357142857 272 | love philip 2.5 273 | nuclear plant 3.733333333 274 | jamaica queens 3.076923077 275 | dollar asylum 1.846153846 276 | bridge rowing 2.785714286 277 | berlin germany 4 278 | funeral death 4.714285714 279 | albert einstein 4.266666667 280 | gulf shore 3.857142857 281 | ecuador argentina 3.266666667 282 | britain france 3.714285714 283 | sports score 3.866666667 284 | socialism capitalism 3.785714286 285 | treaty peace 4.166666667 286 | exchange market 4.266666667 287 | marriage anniversary 4.333333333 288 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/ws353.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10.00 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 
7.50 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7.00 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | stock market 8.08 22 | stock phone 1.62 23 | stock CD 1.31 24 | stock jaguar 0.92 25 | stock egg 1.81 26 | fertility egg 6.69 27 | stock live 3.73 28 | stock life 0.92 29 | book library 7.46 30 | bank money 8.12 31 | wood forest 7.73 32 | money cash 9.15 33 | professor cucumber 0.31 34 | king cabbage 0.23 35 | king queen 8.58 36 | king rook 5.92 37 | bishop rabbi 6.69 38 | Jerusalem Israel 8.46 39 | Jerusalem Palestinian 7.65 40 | holy sex 1.62 41 | fuck sex 9.44 42 | Maradona football 8.62 43 | football soccer 9.03 44 | football basketball 6.81 45 | football tennis 6.63 46 | tennis racket 7.56 47 | Arafat peace 6.73 48 | Arafat terror 7.65 49 | Arafat Jackson 2.50 50 | law lawyer 8.38 51 | movie star 7.38 52 | movie popcorn 6.19 53 | movie critic 6.73 54 | movie theater 7.92 55 | physics proton 8.12 56 | physics chemistry 7.35 57 | space chemistry 4.88 58 | alcohol chemistry 5.54 59 | vodka gin 8.46 60 | vodka brandy 8.13 61 | drink car 3.04 62 | drink ear 1.31 63 | drink mouth 5.96 64 | drink eat 6.87 65 | baby mother 7.85 66 | drink mother 2.65 67 | car automobile 8.94 68 | gem jewel 8.96 69 | journey voyage 9.29 70 | boy lad 8.83 71 | coast shore 9.10 72 | asylum madhouse 8.87 73 | magician wizard 9.02 74 | midday noon 9.29 75 | furnace stove 8.79 76 | food fruit 7.52 77 | bird cock 7.10 78 | bird crane 7.38 79 | tool implement 6.46 80 | brother monk 6.27 81 | crane implement 2.69 82 | lad brother 4.46 83 | journey car 5.85 84 | monk oracle 5.00 85 | cemetery woodland 2.08 86 | food rooster 4.42 87 | coast hill 4.38 88 | forest graveyard 1.85 89 | shore woodland 3.08 90 | monk slave 0.92 91 | coast forest 3.15 92 | lad wizard 0.92 93 | chord smile 0.54 94 | glass magician 2.08 95 | noon string 0.54 96 | rooster voyage 0.62 97 | money dollar 8.42 98 | money cash 9.08 99 | money currency 9.04 100 | money wealth 8.27 101 | money property 7.57 102 | money possession 7.29 103 | money bank 8.50 104 | money deposit 7.73 105 | money withdrawal 6.88 106 | money laundering 5.65 107 | money operation 3.31 108 | tiger jaguar 8.00 109 | tiger feline 8.00 110 | tiger carnivore 7.08 111 | tiger mammal 6.85 112 | tiger animal 7.00 113 | tiger organism 4.77 114 | tiger fauna 5.62 115 | tiger zoo 5.87 116 | psychology psychiatry 8.08 117 | psychology anxiety 7.00 118 | psychology fear 6.85 119 | psychology depression 7.42 120 | psychology clinic 6.58 121 | psychology doctor 6.42 122 | psychology Freud 8.21 123 | psychology mind 7.69 124 | psychology health 7.23 125 | psychology science 6.71 126 | psychology discipline 5.58 127 | psychology cognition 7.48 128 | planet star 8.45 129 | planet constellation 8.06 130 | planet moon 8.08 131 | planet sun 8.02 132 | planet galaxy 8.11 133 | planet space 7.92 134 | planet astronomer 7.94 135 | precedent example 5.85 136 | precedent information 3.85 137 | precedent cognition 2.81 138 | precedent law 6.65 139 | precedent collection 2.50 140 | precedent group 1.77 141 | precedent antecedent 6.04 142 | cup coffee 6.58 143 | cup tableware 6.85 144 | cup article 2.40 145 | cup artifact 2.92 146 | cup object 3.69 147 | cup entity 2.15 148 | cup drink 7.25 149 | cup food 5.00 150 | cup substance 1.92 151 | cup liquid 5.90 152 | jaguar cat 7.42 153 | jaguar car 7.27 154 | energy secretary 1.81 155 
| secretary senate 5.06 156 | energy laboratory 5.09 157 | computer laboratory 6.78 158 | weapon secret 6.06 159 | FBI fingerprint 6.94 160 | FBI investigation 8.31 161 | investigation effort 4.59 162 | Mars water 2.94 163 | Mars scientist 5.63 164 | news report 8.16 165 | canyon landscape 7.53 166 | image surface 4.56 167 | discovery space 6.34 168 | water seepage 6.56 169 | sign recess 2.38 170 | Wednesday news 2.22 171 | mile kilometer 8.66 172 | computer news 4.47 173 | territory surface 5.34 174 | atmosphere landscape 3.69 175 | president medal 3.00 176 | war troops 8.13 177 | record number 6.31 178 | skin eye 6.22 179 | Japanese American 6.50 180 | theater history 3.91 181 | volunteer motto 2.56 182 | prejudice recognition 3.00 183 | decoration valor 5.63 184 | century year 7.59 185 | century nation 3.16 186 | delay racism 1.19 187 | delay news 3.31 188 | minister party 6.63 189 | peace plan 4.75 190 | minority peace 3.69 191 | attempt peace 4.25 192 | government crisis 6.56 193 | deployment departure 4.25 194 | deployment withdrawal 5.88 195 | energy crisis 5.94 196 | announcement news 7.56 197 | announcement effort 2.75 198 | stroke hospital 7.03 199 | disability death 5.47 200 | victim emergency 6.47 201 | treatment recovery 7.91 202 | journal association 4.97 203 | doctor personnel 5.00 204 | doctor liability 5.19 205 | liability insurance 7.03 206 | school center 3.44 207 | reason hypertension 2.31 208 | reason criterion 5.91 209 | hundred percent 7.38 210 | Harvard Yale 8.13 211 | hospital infrastructure 4.63 212 | death row 5.25 213 | death inmate 5.03 214 | lawyer evidence 6.69 215 | life death 7.88 216 | life term 4.50 217 | word similarity 4.75 218 | board recommendation 4.47 219 | governor interview 3.25 220 | OPEC country 5.63 221 | peace atmosphere 3.69 222 | peace insurance 2.94 223 | territory kilometer 5.28 224 | travel activity 5.00 225 | competition price 6.44 226 | consumer confidence 4.13 227 | consumer energy 4.75 228 | problem airport 2.38 229 | car flight 4.94 230 | credit card 8.06 231 | credit information 5.31 232 | hotel reservation 8.03 233 | grocery money 5.94 234 | registration arrangement 6.00 235 | arrangement accommodation 5.41 236 | month hotel 1.81 237 | type kind 8.97 238 | arrival hotel 6.00 239 | bed closet 6.72 240 | closet clothes 8.00 241 | situation conclusion 4.81 242 | situation isolation 3.88 243 | impartiality interest 5.16 244 | direction combination 2.25 245 | street place 6.44 246 | street avenue 8.88 247 | street block 6.88 248 | street children 4.94 249 | listing proximity 2.56 250 | listing category 6.38 251 | cell phone 7.81 252 | production hike 1.75 253 | benchmark index 4.25 254 | media trading 3.88 255 | media gain 2.88 256 | dividend payment 7.63 257 | dividend calculation 6.48 258 | calculation computation 8.44 259 | currency market 7.50 260 | OPEC oil 8.59 261 | oil stock 6.34 262 | announcement production 3.38 263 | announcement warning 6.00 264 | profit warning 3.88 265 | profit loss 7.63 266 | dollar yen 7.78 267 | dollar buck 9.22 268 | dollar profit 7.38 269 | dollar loss 6.09 270 | computer software 8.50 271 | network hardware 8.31 272 | phone equipment 7.13 273 | equipment maker 5.91 274 | luxury car 6.47 275 | five month 3.38 276 | report gain 3.63 277 | investor earning 7.13 278 | liquid water 7.89 279 | baseball season 5.97 280 | game victory 7.03 281 | game team 7.69 282 | marathon sprint 7.47 283 | game series 6.19 284 | game defeat 6.97 285 | seven series 3.56 286 | seafood sea 7.47 287 | seafood food 8.34 288 | 
seafood lobster 8.70 289 | lobster food 7.81 290 | lobster wine 5.70 291 | food preparation 6.22 292 | video archive 6.34 293 | start year 4.06 294 | start match 4.47 295 | game round 5.97 296 | boxing round 7.61 297 | championship tournament 8.36 298 | fighting defeating 7.41 299 | line insurance 2.69 300 | day summer 3.94 301 | summer drought 7.16 302 | summer nature 5.63 303 | day dawn 7.53 304 | nature environment 8.31 305 | environment ecology 8.81 306 | nature man 6.25 307 | man woman 8.30 308 | man governor 5.25 309 | murder manslaughter 8.53 310 | soap opera 7.94 311 | opera performance 6.88 312 | life lesson 5.94 313 | focus life 4.06 314 | production crew 6.25 315 | television film 7.72 316 | lover quarrel 6.19 317 | viewer serial 2.97 318 | possibility girl 1.94 319 | population development 3.75 320 | morality importance 3.31 321 | morality marriage 3.69 322 | Mexico Brazil 7.44 323 | gender equality 6.41 324 | change attitude 5.44 325 | family planning 6.25 326 | opera industry 2.63 327 | sugar approach 0.88 328 | practice institution 3.19 329 | ministry culture 4.69 330 | problem challenge 6.75 331 | size prominence 5.31 332 | country citizen 7.31 333 | planet people 5.75 334 | development issue 3.97 335 | experience music 3.47 336 | music project 3.63 337 | glass metal 5.56 338 | aluminum metal 7.83 339 | chance credibility 3.88 340 | exhibit memorabilia 5.31 341 | concert virtuoso 6.81 342 | rock jazz 7.59 343 | museum theater 7.19 344 | observation architecture 4.38 345 | space world 6.53 346 | preservation world 6.19 347 | admission ticket 7.69 348 | shower thunderstorm 6.31 349 | shower flood 6.03 350 | weather forecast 8.34 351 | disaster area 6.25 352 | governor office 6.34 353 | architecture century 3.78 354 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/zg222.txt: -------------------------------------------------------------------------------- 1 | Abweichung sortieren 2 2 | agieren mobil 2 3 | aktuell Portfolioanalyse 1.523809524 4 | Altersstufe Mut 1.238095238 5 | Anbieter Bedarf 3 6 | Angebotsseite Bestandsaufnahme 1.714285714 7 | angehend Verfahrenstechnik 0.619047619 8 | anleiten rekonstruieren 1.333333333 9 | Ansatz europäisch 0.952380952 10 | anschließend Blutspendeaktion 0.476190476 11 | anschließend Maschinenfunktion 0.571428571 12 | Antrittsvorlesung Justus 0.476190476 13 | Approach Implementation 1.761904762 14 | Arbeitstitel Abkehr 0.380952381 15 | Assistentin Überblick 1.666666667 16 | Aufgabe Vertriebstechniker 2.095238095 17 | aufsuchen adäquat 0.571428571 18 | Ausbildung Beispiel 1.952380952 19 | Ausbildung Gesundheitswesen 1.761904762 20 | Ausbildung nah 1.047619048 21 | Ausgangsmaterial Probenehmer 1.428571429 22 | Ausland Suche 1.428571429 23 | Autor Identität 2.238095238 24 | Baumaschinenmeister beruflich 2.904761905 25 | Bayern Unterrichtsmittel 0.857142857 26 | Beamter Mitarbeit 1.428571429 27 | bearbeiten frühere 0.476190476 28 | Berechnung Firma 2.047619048 29 | Berlin demographisch 1.619047619 30 | Berlin Recht 1.380952381 31 | berücksichtigen kooperieren 1.904761905 32 | Berücksichtigung Branche 0.952380952 33 | beständig Managerinnen 1.095238095 34 | beurteilen denkbar 1.19047619 35 | Beurteilung Verhaltensmuster 2.380952381 36 | bezüglich Durchführung 0.857142857 37 | Bildung Ziel 2.523809524 38 | Bildungsabschluss Verhaltensmuster 1.238095238 39 | Bildungsträger Berufspraxis 1.952380952 40 | bleiben Gewalt 0.619047619 41 | Büroequipment Institut 1.285714286 42 | 
Computer Plattform 3.238095238 43 | Core Metadatenwerkzeug 1.380952381 44 | Datenbank Bachelorstudiengang 1.19047619 45 | Datum erheben 1.380952381 46 | Detailkonstrukteurinnen Tätigkeit 2.19047619 47 | Dienst Ingenieurbüro 2.142857143 48 | Dozent Kamera 0.571428571 49 | drängeln Bereich 0.952380952 50 | Dreieck Stuttgart 0.952380952 51 | Druckplatte Tätigkeit 1.571428571 52 | Durchführung Zusammenarbeit 2.142857143 53 | Eigenaktivität Durchführung 1.857142857 54 | eignen auswirken 1.285714286 55 | eindimensional Backware 0.19047619 56 | einschließlich häufig 0.571428571 57 | einzig Nachbar 0.619047619 58 | elektronisch neu 1.476190476 59 | Entwicklung Struktur 1.857142857 60 | erfolgreich Universität 2.571428571 61 | Erkenntnisinteresse gleichzeitig 0.523809524 62 | Erklärung Aktensperrfrist 1.19047619 63 | europäisch Intervention 2.142857143 64 | Evaluation funktional 1.333333333 65 | Fach Institut 3.047619048 66 | Fachbereichsvertreter individuell 0.80952381 67 | fallen überwachen 0.428571429 68 | Familie Beitrag 1.333333333 69 | Familie Modellvorhaben 0.571428571 70 | Feinwerktechnik Hochschule 1.952380952 71 | Flächeneinsparung Landwirtschaft 2.333333333 72 | Forschungsverbund Langzeittherapieprogramm 1.619047619 73 | Forschungszentrum Euro 1.285714286 74 | Fortschritt Asien 2.333333333 75 | Frage Universität 2.333333333 76 | Führungsmittel Gastlandkontakte 0.380952381 77 | Garnerzeugung Vliesstofferzeugung 3.238095238 78 | Gegenstand Filmherstellung 1.047619048 79 | Gegenwart beinhalten 0.666666667 80 | gelangen Sujet 0.428571429 81 | genius stilistisch 1.142857143 82 | Gentest Risiko 2.571428571 83 | Georg August 1.238095238 84 | gerade gängig 1.095238095 85 | Gespannfahren Altersgruppe 0.380952381 86 | Gesundheitsbegriff Entwicklungsgeschichte 0.952380952 87 | Gleitkomma deaktivieren 0.666666667 88 | groß Arbeitszeitregelung 0.238095238 89 | groß methodisch 0.523809524 90 | großflächig vorzeitig 0.285714286 91 | gründen Forschung 2.285714286 92 | Gymnasium Ober 1.380952381 93 | Handarbeit Flöte 1.666666667 94 | Handlungsanleitung Kooperationspartner 1.142857143 95 | Handout üben 1.428571429 96 | Helfer Problem 2.952380952 97 | Hubschraubertyp einschließlich 0.19047619 98 | Identifizierung praxisbezogen 0.761904762 99 | Institut Einführung 1.380952381 100 | Instrumentarium anwendbar 2.380952381 101 | Interaktion Auswirkung 2.619047619 102 | Internet außerdem 0.285714286 103 | Interpretation politisch 1.714285714 104 | Kenntnis Ingenieur 2.80952381 105 | Kenntnis speziell 2.80952381 106 | Kolloquium Wissen 3 107 | Kompetenz Arzt 3 108 | Konflikt deutsch 1.476190476 109 | konkret Handlungsempfehlung 2.047619048 110 | Konstruktionsbüro elektro 1.285714286 111 | Körpernorm Lebenszusammenhang 0.904761905 112 | Korrektur nutzen 1.047619048 113 | Kostüm gekonnt 1.095238095 114 | Kraft Geselle 0.952380952 115 | Krankenhausmanagement Betriebswirt 2.476190476 116 | künstlich Samenzahnrad 0.761904762 117 | lassen Elisabeth 0.142857143 118 | lehren Verkehrswirtschaft 1.571428571 119 | Lehrerausbildung Medium 1.619047619 120 | Lehrerrolle Hilfe 2.666666667 121 | Leopold Institut 0.80952381 122 | Literaturwissenschaft allgemein 0.952380952 123 | logisch Juni 0.142857143 124 | Lust Uni 1.714285714 125 | Management international 2.333333333 126 | Marketing Firma 2.904761905 127 | Maschinenbau Beschreibung 1.19047619 128 | Metall Berufsbezeichnung 1.285714286 129 | Microsoft Industries 2.523809524 130 | Migrantinnen Handelsstruktur 0.761904762 131 | MIPS Core 2.095238095 132 | mobil beschränkt 
1.666666667 133 | Motivation geplant 1.333333333 134 | müssen Übergeordneter 1.285714286 135 | Mut lassen 0.80952381 136 | Neoautoritarismus Chance 0.428571429 137 | Objekt wechselseitig 1.142857143 138 | Outfit Strom 0.238095238 139 | Personaldisposition überwachen 1.238095238 140 | Pflanzenzüchtung models 0.428571429 141 | Pharmakotherapie Evidence 0.523809524 142 | Polarisierung Beurteilung 1.619047619 143 | Politikbereich altersbezogen 1.238095238 144 | Porter wechselseitig 0.428571429 145 | postmaterialistisch diesbezüglich 0.285714286 146 | praxisbezogen ausbilden 2.761904762 147 | Pressebüro Nanopartikel 0.285714286 148 | Privatkunde Bereich 0.952380952 149 | Problem Grenze 1.857142857 150 | Prof. Ludwig 0.904761905 151 | Programmiersystem Warte 0.619047619 152 | Quelle Text 3.238095238 153 | Rahmenbedingung Hochtechnologie 1.761904762 154 | Rainer Folie 0.142857143 155 | redaktionell stützen 1 156 | Reflexivität kollektiv 0.80952381 157 | Reformmöglichkeit Bildungspolitik 2.714285714 158 | Regisseur gestalterisch 2.428571429 159 | Rekonstruktion vornehmlich 0.571428571 160 | religiös Sahara 0.666666667 161 | Restaurierungsmethode Bildungsträger 0.619047619 162 | Risikokind Start 0.80952381 163 | Schiff Segelflugzeug 2.19047619 164 | schließen Reiseantrag 0.619047619 165 | Schritt Wohnung 0.619047619 166 | selbstständig individuell 3 167 | Sicherheit Frontenbildung 1.095238095 168 | sicherheitspolitisch vereinigt 1.476190476 169 | sozial insistieren 0.80952381 170 | Soziales sozial 3.761904762 171 | Spanish Latein 3.333333333 172 | Spielidee Computergraphik 2.80952381 173 | Sport bargeldlos 0.428571429 174 | starten Endlast 0.904761905 175 | stehen Finger 0.476190476 176 | stehen politisch 1.142857143 177 | strafen Paragraph 3.047619048 178 | Studie Anpassung 1.428571429 179 | Stufe beurteilen 1.952380952 180 | Stuttgart Ausbildung 0.80952381 181 | Suche Entnahme 1.142857143 182 | Tätigkeit ausführen 3.666666667 183 | Tätigkeit Maschine 2.619047619 184 | Tätigkeitsbezeichnung Personal 2.666666667 185 | Tiefbaubauingenieur heranführen 0.619047619 186 | Tomcat zentral 0.666666667 187 | Trage Berührung 0.761904762 188 | Turnier Sport 3.619047619 189 | Übersicht Kursstätte 0.904761905 190 | überzeugen Kommunikation 2.80952381 191 | üblich Sport 0.714285714 192 | Umwelt Organisationskompetenz 1.333333333 193 | Umweltschutz Gesundheitsschutz 2.857142857 194 | Uni Titel 3.095238095 195 | Universität Anforderung 2.952380952 196 | Universität Bildungseinrichtung 3.904761905 197 | Universität Euro 1.238095238 198 | Universitätsklinik Universität 3.523809524 199 | unterrichten soft 0.714285714 200 | Unterrichtsmittel Versuchsfläche 1.619047619 201 | unterschiedlich fallen 0.571428571 202 | unterzeichnen gewährleisten 1.80952381 203 | verantwortlich Firma 2.333333333 204 | verarbeiten dichten 1.380952381 205 | verfügen Kommunikation 1.095238095 206 | Verkäuferinnen Gesteck 1.19047619 207 | Verwaltung Betriebswirt 2.857142857 208 | Vortrag technisch 1.80952381 209 | Wahlfächer Gymnasium 3.238095238 210 | wahrnehmen Grundsatzfrage 0.857142857 211 | wahrnehmen selbstständig 1.238095238 212 | wahrnehmen Trägereinrichtung 0.380952381 213 | wahrnehmen zusammenarbeiten 0.761904762 214 | Wartung Einhaltung 2 215 | Weiterbildung Arbeitsbereich 2.571428571 216 | Welthungerhilfe Form 0.714285714 217 | Widerstand diagnostisch 0.571428571 218 | Wirtschaftsminister handeln 2.380952381 219 | Wörterbuch Bewertung 0.857142857 220 | Zusammenarbeit Objekt 1.095238095 221 | Zusammenarbeit Wiki 2.761904762 222 | 
zusammenstellen zwei 1.952380952 223 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `hyperhyper` [![Build Status](https://travis-ci.com/jfilter/hyperhyper.svg?branch=master)](https://travis-ci.com/jfilter/hyperhyper) [![PyPI](https://img.shields.io/pypi/v/hyperhyper.svg)](https://pypi.org/project/hyperhyper/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/hyperhyper.svg)](https://pypi.org/project/hyperhyper/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/hyperhyper)](https://pypistats.org/packages/hyperhyper) 2 | 3 | `hyperhyper` is a Python package to construct word embeddings for small data. 4 | 5 | ## Why? 6 | 7 | Nowadays, [word embeddings](https://en.wikipedia.org/wiki/Word_embedding) are mostly associated with [Word2vec](https://en.wikipedia.org/wiki/Word2vec) or [fastText](https://en.wikipedia.org/wiki/FastText). 8 | These approaches focus on scenarios where an abundance of data is available. 9 | And big players such as Facebook provide ready-to-use [pre-trained word embeddings](https://fasttext.cc/docs/en/crawl-vectors.html). 10 | So often you don't have to train new word embeddings from scratch. 11 | But sometimes you do. 12 | 13 | Word2vec or fastText require a lot of data – but texts, especially domain-specific texts, may be scarce. 14 | There are alternative methods based on counting co-occurrences (word pairs) that require less data to work. 15 | This package implements these approaches (somewhat) efficiently. 16 | 17 | ## Installation 18 | 19 | ```bash 20 | pip install hyperhyper 21 | ``` 22 | 23 | To enable all features (such as pre-processing with spaCy): 24 | 25 | ```bash 26 | pip install hyperhyper[full] 27 | ``` 28 | 29 | ## Usage 30 | 31 | ```python 32 | import hyperhyper as hy 33 | 34 | # download and uncompress the data 35 | # wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.en.shuffled.gz && gzip -d news.2010.en.shuffled.gz 36 | corpus = hy.Corpus.from_file("news.2010.en.shuffled") 37 | bunch = hy.Bunch("news_bunch", corpus) 38 | 39 | # `hyperhyper` is built upon `gensim`, so you can get word embeddings in a keyed vectors format. 40 | # https://radimrehurek.com/gensim/models/keyedvectors.html 41 | vectors, results = bunch.svd(keyed_vectors=True) 42 | 43 | results["results"][1] 44 | >>> {"name": "en_ws353", 45 | "score": 0.6510955349164682, 46 | "oov": 0.014164305949008499, 47 | "fullscore": 0.641873218557878} 48 | 49 | vectors.most_similar("berlin") 50 | >>> [("vienna", 0.6323208808898926), 51 | ("frankfurt", 0.5965485572814941), 52 | ("munich", 0.5737138986587524), 53 | ("amsterdam", 0.5511572360992432), 54 | ("stockholm", 0.5423270463943481)] 55 | ``` 56 | 57 | Check out the [examples](./examples). 58 | 59 | The general concepts: 60 | 61 | - preprocess data once and save them in a `bunch` 62 | - cache all results and also record their performance on test data 63 | - make it easy to fine-tune parameters for your data 64 | 65 | More documentation may be forthcoming. Until then you have to read the [source code](./hyperhyper). 66 | 67 | ## Performance Optimization 68 | 69 | ### Install MKL 70 | 71 | If you have an Intel CPU, it's recommended to use [MKL](https://en.wikipedia.org/wiki/Math_Kernel_Library) to speed up numerical computations. 72 | Otherwise, the default [OpenBLAS](https://en.wikipedia.org/wiki/OpenBLAS) will get installed when initially installing `hyperhyper`.
73 | 74 | It can be challenging to correctly set up MKL. 75 | A conda package by Intel may help you. 76 | 77 | ```bash 78 | conda install -c intel intelpython3_core 79 | pip install hyperhyper 80 | ``` 81 | 82 | Verify whether `mkl_info` is present in the numpy config: 83 | 84 | ```python 85 | >>> import numpy 86 | >>> numpy.__config__.show() 87 | ``` 88 | 89 | ### Disable Numerical Multithreading 90 | 91 | Further, disable the internal multithreading ability of MKL or OpenBLAS (numerical libraries). 92 | This speeds up computation because you should do multiprocessing on an outer loop anyhow. 93 | But you can also leave the default to take advantage of all cores for your numerical computations. 94 | [Some Tweets on why multithreading with OpenBLAS can cause problems.](https://twitter.com/honnibal/status/1067920534585917440) 95 | 96 | ```bash 97 | export OPENBLAS_NUM_THREADS=1 98 | export MKL_NUM_THREADS=1 99 | ``` 100 | 101 | ## Background 102 | 103 | `hyperhyper` is based on research by Omer Levy et al. from 2015 ([the paper](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/)). 104 | The authors published the code they used in their experiments as [Hyperwords](https://bitbucket.org/omerlevy/hyperwords). 105 | Initially, I [tried](https://github.com/jfilter/hyperwords) to port their original software to Python 3 but I ended up re-writing large parts of it. 106 | So this package was born. 107 | 108 | 109 | ![How pairs are counted](./docs/imgs/window.svg) 110 | 111 | The basic idea: Construct pairs of words that appear together in sentences (within a given window size). 112 | Then do some math magic around matrix operations (PPMI, SVD) to get low-dimensional embeddings. A minimal sketch of this pipeline is included in the appendix at the end of this README. 113 | 114 | The count-based word embeddings produced by `hyperhyper` are deterministic. 115 | So multiple runs of experiments with identical parameters will yield the same results. 116 | Word2vec and others are unstable. 117 | Due to randomness, their results will vary. 118 | 119 | `hyperhyper` is built upon the seminal Python NLP package [gensim](https://radimrehurek.com/gensim/). 120 | 121 | Limitations: With `hyperhyper` you will run into (memory) problems if you need large vocabularies (sets of possible words). 122 | It's fine if you have a vocabulary of up to ~50k words. 123 | Word2vec and fastText, in particular, solve this [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality). 124 | If you're interested in details, you should read the aforementioned excellent [paper by Omer Levy et al.](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/). 125 | 126 | ### Scientific Literature 127 | 128 | This software is based on ideas stemming from the following papers: 129 | 130 | - Improving Distributional Similarity with Lessons Learned from Word Embeddings, Omer Levy, Yoav Goldberg, Ido Dagan, TACL 2015. [Paper](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/) [Code](https://bitbucket.org/omerlevy/hyperwords) 131 | > Recent trends suggest that neural-network-inspired word embedding models outperform traditional count-based distributional models on word similarity and analogy detection tasks. We reveal that much of the performance gains of word embeddings are due to certain system design choices and hyperparameter optimizations, rather than the embedding algorithms themselves. Furthermore, we show that these modifications can be transferred to traditional distributional models, yielding similar gains.
In contrast to prior reports, we observe mostly local or insignificant performance differences between the methods, with no global advantage to any single approach over the others. 132 | - The Influence of Down-Sampling Strategies on SVD Word Embedding Stability, Johannes Hellrich, Bernd Kampe, Udo Hahn, NAACL 2019. [Paper](https://aclweb.org/anthology/papers/W/W19/W19-2003/) [Code](https://github.com/hellrich/hyperwords) [Code](https://github.com/hellrich/embedding_downsampling_comparison) 133 | > The stability of word embedding algorithms, i.e., the consistency of the word representations they reveal when trained repeatedly on the same data set, has recently raised concerns. We here compare word embedding algorithms on three corpora of different sizes, and evaluate both their stability and accuracy. We find strong evidence that down-sampling strategies (used as part of their training procedures) are particularly influential for the stability of SVD-PPMI-type embeddings. This finding seems to explain diverging reports on their stability and lead us to a simple modification which provides superior stability as well as accuracy on par with skip-gram embedding 134 | 135 | ## Development 136 | 137 | Install and use [poetry](https://python-poetry.org/). 138 | 139 | ## Contributing 140 | 141 | If you have a **question**, found a **bug** or want to propose a new **feature**, have a look at the [issues page](https://github.com/jfilter/hyperhyper/issues). 142 | 143 | **Pull requests** are especially welcomed when they fix bugs or improve the code quality. 144 | 145 | ## Future Work / TODO 146 | 147 | - evaluation for analogies 148 | - implement counting in a more efficient programming language, e.g. Cython. 149 | 150 | ## `hyperhyper`? 151 | 152 | [![Scooter – Hyper Hyper (Song)](https://img.youtube.com/vi/7Twnmhe948A/0.jpg)](https://www.youtube.com/watch?v=7Twnmhe948A "Scooter – Hyper Hyper") 153 | 154 | ## Acknowledgments 155 | 156 | Building upon the work by Omer Levy et al. for [Hyperwords](https://bitbucket.org/omerlevy/hyperwords). 157 | 158 | ## License 159 | 160 | BSD-2-Clause 161 | 162 | ## Sponsoring 163 | 164 | This work was created as part of a [project](https://github.com/jfilter/ptf) that was funded by the German [Federal Ministry of Education and Research](https://www.bmbf.de/en/index.html). 
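## Appendix: PPMI + SVD in a Nutshell

The following toy example illustrates the pipeline described in the Background section (pair counting, PPMI, truncated SVD). It is a simplified sketch with a made-up three-sentence corpus, not the `hyperhyper` implementation: the package works on sparse matrices and adds refinements such as subsampling of frequent words and context-distribution smoothing (`cds`).

```python
import numpy as np

sentences = [
    "the cat sat on the mat".split(),
    "the dog sat on the rug".split(),
    "a cat and a dog played".split(),
]

# 1. build a vocabulary
vocab = sorted({w for s in sentences for w in s})
index = {w: i for i, w in enumerate(vocab)}

# 2. count co-occurring pairs within a symmetric window
window = 2
counts = np.zeros((len(vocab), len(vocab)))
for sent in sentences:
    for i, word in enumerate(sent):
        lo, hi = max(0, i - window), min(len(sent), i + window + 1)
        for j in range(lo, hi):
            if j != i:
                counts[index[word], index[sent[j]]] += 1

# 3. positive pointwise mutual information (PPMI)
total = counts.sum()
p_word = counts.sum(axis=1) / total  # marginal probability of the word
p_ctx = counts.sum(axis=0) / total   # marginal probability of the context
with np.errstate(divide="ignore", invalid="ignore"):
    pmi = np.log((counts / total) / np.outer(p_word, p_ctx))
ppmi = np.where(np.isfinite(pmi) & (pmi > 0), pmi, 0.0)

# 4. truncated SVD yields dense, low-dimensional word vectors
dim = 2
u, s, _ = np.linalg.svd(ppmi)
vectors = u[:, :dim] * s[:dim]

for word, vec in zip(vocab, np.round(vectors, 2)):
    print(word, vec)
```

`hyperhyper` does essentially this at scale and additionally evaluates the resulting vectors on the bundled word-similarity datasets.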
165 | 166 | 167 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/analogy/opposite.txt: -------------------------------------------------------------------------------- 1 | Frage Antwort stark schwach 2 | Frage Antwort viel wenig 3 | Frage Antwort positiv negativ 4 | Frage Antwort rechts links 5 | Frage Antwort nah fern 6 | Frage Antwort männlich weiblich 7 | Frage Antwort warm kalt 8 | Frage Antwort rechts links 9 | Frage Antwort schnell langsam 10 | Frage Antwort Junge Mädchen 11 | Junge Mädchen Sommer Winter 12 | Junge Mädchen viel wenig 13 | Junge Mädchen Frage Antwort 14 | Junge Mädchen Mann Frau 15 | Junge Mädchen männlich weiblich 16 | Junge Mädchen männlich weiblich 17 | Junge Mädchen alt jung 18 | Junge Mädchen hell dunkel 19 | Junge Mädchen oben unten 20 | Junge Mädchen voll leer 21 | Leben Tod hoch tief 22 | Leben Tod davor danach 23 | Leben Tod Norden Süden 24 | Leben Tod bekannt unbekannt 25 | Leben Tod rechts links 26 | Leben Tod groß klein 27 | Leben Tod warm kalt 28 | Leben Tod männlich weiblich 29 | Leben Tod Osten Westen 30 | Leben Tod Sommer Winter 31 | Mann Frau groß klein 32 | Mann Frau schnell langsam 33 | Mann Frau oben unten 34 | Mann Frau lang kurz 35 | Mann Frau oben unten 36 | Mann Frau männlich weiblich 37 | Mann Frau warm kalt 38 | Mann Frau Osten Westen 39 | Mann Frau bekannt unbekannt 40 | Mann Frau hell dunkel 41 | Norden Süden hell dunkel 42 | Norden Süden gewinnen verlieren 43 | Norden Süden groß klein 44 | Norden Süden oben unten 45 | Norden Süden Osten Westen 46 | Norden Süden lang kurz 47 | Norden Süden viel wenig 48 | Norden Süden positiv negativ 49 | Norden Süden Mann Frau 50 | Norden Süden stark schwach 51 | Osten Westen leicht schwer 52 | Osten Westen hoch tief 53 | Osten Westen positiv negativ 54 | Osten Westen lachen weinen 55 | Osten Westen lachen weinen 56 | Osten Westen hoch tief 57 | Osten Westen nah fern 58 | Osten Westen bekannt unbekannt 59 | Osten Westen hell dunkel 60 | Osten Westen gewinnen verlieren 61 | Sommer Winter Junge Mädchen 62 | Sommer Winter warm kalt 63 | Sommer Winter positiv negativ 64 | Sommer Winter leicht schwer 65 | Sommer Winter davor danach 66 | Sommer Winter bekannt unbekannt 67 | Sommer Winter Mann Frau 68 | Sommer Winter warm kalt 69 | Sommer Winter gewinnen verlieren 70 | Sommer Winter lachen weinen 71 | Start Ziel Frage Antwort 72 | Start Ziel viel wenig 73 | Start Ziel Frage Antwort 74 | Start Ziel groß klein 75 | Start Ziel Junge Mädchen 76 | Start Ziel stark schwach 77 | Start Ziel lachen weinen 78 | Start Ziel oben unten 79 | Start Ziel oben unten 80 | Start Ziel viel wenig 81 | Tag Nacht Junge Mädchen 82 | Tag Nacht hell dunkel 83 | Tag Nacht positiv negativ 84 | Tag Nacht Start Ziel 85 | Tag Nacht Start Ziel 86 | Tag Nacht oben unten 87 | Tag Nacht männlich weiblich 88 | Tag Nacht Leben Tod 89 | Tag Nacht männlich weiblich 90 | Tag Nacht Osten Westen 91 | alt jung leicht schwer 92 | alt jung früh spät 93 | alt jung bekannt unbekannt 94 | alt jung rechts links 95 | alt jung Osten Westen 96 | alt jung nah fern 97 | alt jung Norden Süden 98 | alt jung Tag Nacht 99 | alt jung Junge Mädchen 100 | alt jung bekannt unbekannt 101 | bekannt unbekannt leicht schwer 102 | bekannt unbekannt rechts links 103 | bekannt unbekannt Osten Westen 104 | bekannt unbekannt alt jung 105 | bekannt unbekannt schnell langsam 106 | bekannt unbekannt Leben Tod 107 | bekannt unbekannt viel wenig 108 | bekannt unbekannt Mann Frau 109 | bekannt unbekannt lachen weinen 110 
| bekannt unbekannt früh spät 111 | davor danach Start Ziel 112 | davor danach positiv negativ 113 | davor danach Osten Westen 114 | davor danach Norden Süden 115 | davor danach groß klein 116 | davor danach groß klein 117 | davor danach positiv negativ 118 | davor danach voll leer 119 | davor danach groß klein 120 | davor danach warm kalt 121 | früh spät stark schwach 122 | früh spät stark schwach 123 | früh spät positiv negativ 124 | früh spät schnell langsam 125 | früh spät Osten Westen 126 | früh spät Start Ziel 127 | früh spät Frage Antwort 128 | früh spät bekannt unbekannt 129 | früh spät Junge Mädchen 130 | früh spät Norden Süden 131 | groß klein Mann Frau 132 | groß klein männlich weiblich 133 | groß klein stark schwach 134 | groß klein Sommer Winter 135 | groß klein nah fern 136 | groß klein lang kurz 137 | groß klein viel wenig 138 | groß klein oben unten 139 | groß klein Leben Tod 140 | groß klein männlich weiblich 141 | hell dunkel Mann Frau 142 | hell dunkel Osten Westen 143 | hell dunkel früh spät 144 | hell dunkel alt jung 145 | hell dunkel früh spät 146 | hell dunkel männlich weiblich 147 | hell dunkel früh spät 148 | hell dunkel lachen weinen 149 | hell dunkel voll leer 150 | hell dunkel hoch tief 151 | hoch tief groß klein 152 | hoch tief warm kalt 153 | hoch tief lachen weinen 154 | hoch tief alt jung 155 | hoch tief warm kalt 156 | hoch tief bekannt unbekannt 157 | hoch tief Leben Tod 158 | hoch tief schnell langsam 159 | hoch tief rechts links 160 | hoch tief leicht schwer 161 | lang kurz gewinnen verlieren 162 | lang kurz warm kalt 163 | lang kurz Sommer Winter 164 | lang kurz Norden Süden 165 | lang kurz Junge Mädchen 166 | lang kurz Tag Nacht 167 | lang kurz bekannt unbekannt 168 | lang kurz voll leer 169 | lang kurz gewinnen verlieren 170 | lang kurz hell dunkel 171 | leicht schwer Sommer Winter 172 | leicht schwer warm kalt 173 | leicht schwer hell dunkel 174 | leicht schwer Junge Mädchen 175 | leicht schwer Mann Frau 176 | leicht schwer Leben Tod 177 | leicht schwer früh spät 178 | leicht schwer oben unten 179 | leicht schwer lachen weinen 180 | leicht schwer Start Ziel 181 | männlich weiblich stark schwach 182 | männlich weiblich Tag Nacht 183 | männlich weiblich bekannt unbekannt 184 | männlich weiblich lang kurz 185 | männlich weiblich hoch tief 186 | männlich weiblich nah fern 187 | männlich weiblich rechts links 188 | männlich weiblich Mann Frau 189 | männlich weiblich Start Ziel 190 | männlich weiblich schnell langsam 191 | nah fern hoch tief 192 | nah fern Frage Antwort 193 | nah fern bekannt unbekannt 194 | nah fern leicht schwer 195 | nah fern hoch tief 196 | nah fern hoch tief 197 | nah fern bekannt unbekannt 198 | nah fern Junge Mädchen 199 | nah fern bekannt unbekannt 200 | nah fern leicht schwer 201 | oben unten Sommer Winter 202 | oben unten voll leer 203 | oben unten davor danach 204 | oben unten lang kurz 205 | oben unten gewinnen verlieren 206 | oben unten nah fern 207 | oben unten lachen weinen 208 | oben unten Start Ziel 209 | oben unten hoch tief 210 | oben unten nah fern 211 | positiv negativ Junge Mädchen 212 | positiv negativ Leben Tod 213 | positiv negativ Junge Mädchen 214 | positiv negativ warm kalt 215 | positiv negativ leicht schwer 216 | positiv negativ hoch tief 217 | positiv negativ früh spät 218 | positiv negativ männlich weiblich 219 | positiv negativ viel wenig 220 | positiv negativ Leben Tod 221 | rechts links Frage Antwort 222 | rechts links Mann Frau 223 | rechts links hoch tief 224 | rechts links alt jung 225 | rechts links 
positiv negativ 226 | rechts links früh spät 227 | rechts links Start Ziel 228 | rechts links oben unten 229 | rechts links Junge Mädchen 230 | rechts links lachen weinen 231 | schnell langsam voll leer 232 | schnell langsam hoch tief 233 | schnell langsam gewinnen verlieren 234 | schnell langsam hoch tief 235 | schnell langsam davor danach 236 | schnell langsam hell dunkel 237 | schnell langsam davor danach 238 | schnell langsam männlich weiblich 239 | schnell langsam stark schwach 240 | schnell langsam viel wenig 241 | stark schwach rechts links 242 | stark schwach viel wenig 243 | stark schwach Norden Süden 244 | stark schwach Sommer Winter 245 | stark schwach hoch tief 246 | stark schwach voll leer 247 | stark schwach Sommer Winter 248 | stark schwach voll leer 249 | stark schwach nah fern 250 | stark schwach hell dunkel 251 | viel wenig hoch tief 252 | viel wenig Norden Süden 253 | viel wenig Norden Süden 254 | viel wenig schnell langsam 255 | viel wenig hell dunkel 256 | viel wenig bekannt unbekannt 257 | viel wenig früh spät 258 | viel wenig Osten Westen 259 | viel wenig hell dunkel 260 | viel wenig Tag Nacht 261 | voll leer positiv negativ 262 | voll leer nah fern 263 | voll leer rechts links 264 | voll leer groß klein 265 | voll leer Norden Süden 266 | voll leer männlich weiblich 267 | voll leer hoch tief 268 | voll leer nah fern 269 | voll leer männlich weiblich 270 | voll leer gewinnen verlieren 271 | warm kalt hoch tief 272 | warm kalt Junge Mädchen 273 | warm kalt lachen weinen 274 | warm kalt viel wenig 275 | warm kalt rechts links 276 | warm kalt hoch tief 277 | warm kalt Frage Antwort 278 | warm kalt davor danach 279 | warm kalt davor danach 280 | warm kalt positiv negativ 281 | gewinnen verlieren hoch tief 282 | gewinnen verlieren lang kurz 283 | gewinnen verlieren hoch tief 284 | gewinnen verlieren hell dunkel 285 | gewinnen verlieren Tag Nacht 286 | gewinnen verlieren schnell langsam 287 | gewinnen verlieren voll leer 288 | gewinnen verlieren lang kurz 289 | gewinnen verlieren alt jung 290 | gewinnen verlieren alt jung 291 | lachen weinen viel wenig 292 | lachen weinen oben unten 293 | lachen weinen alt jung 294 | lachen weinen stark schwach 295 | lachen weinen oben unten 296 | lachen weinen davor danach 297 | lachen weinen Sommer Winter 298 | lachen weinen alt jung 299 | lachen weinen männlich weiblich 300 | lachen weinen männlich weiblich 301 | -------------------------------------------------------------------------------- /hyperhyper/bunch.py: -------------------------------------------------------------------------------- 1 | """ 2 | The heart of the package. This combines all the function and also exposes 3 | the funtionality to the user. The `bunch` is the location where all the 4 | resulting files are stored. 5 | """ 6 | 7 | import logging 8 | from pathlib import Path 9 | from timeit import default_timer as timer 10 | 11 | import dataset 12 | import numpy as np 13 | from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors 14 | 15 | from . 
import evaluation, pair_counts, pmi, svd 16 | from .corpus import Corpus 17 | from .experiment import record, results_from_db 18 | from .utils import (delete_folder, load_arrays, load_matrix, save_arrays, 19 | save_matrix) 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class Bunch: 25 | def __init__( 26 | self, path, corpus=None, force_overwrite=False, text_chunk_size=100000 27 | ): 28 | self.db = None 29 | self.path = Path(path) 30 | 31 | if force_overwrite and self.path.exists(): 32 | delete_folder(self.path) 33 | 34 | if not corpus is None and not force_overwrite: 35 | if Path(self.path / "corpus.pkl").is_file(): 36 | raise ValueError( 37 | "There is already another corpus file saved. Set `force_overwrite` to True if you want to override it." 38 | ) 39 | 40 | if corpus is None: 41 | self.corpus = Corpus.load(str(self.path / "corpus.pkl")) 42 | else: 43 | self.path.mkdir(parents=True, exist_ok=True) 44 | self.corpus = corpus 45 | self.corpus.texts_to_file(self.path / "texts", text_chunk_size) 46 | self.corpus.save(str(self.path / "corpus.pkl")) 47 | 48 | def get_db(self): 49 | """ 50 | Connecting to a SQLite database. 51 | """ 52 | if self.db is None: 53 | self.db = dataset.connect(f"sqlite:///{self.path}/results.db") 54 | return self.db 55 | 56 | def dict_to_path(self, folder, dict): 57 | """ 58 | Return a file path for an embedding based on parameters. 59 | """ 60 | 61 | # cast integer floats to ints 62 | for k, v in dict.items(): 63 | if type(v) is float: 64 | if v.is_integer(): 65 | dict[k] = int(v) 66 | 67 | filenames = [f"{k}_{v}".lower() for k, v in dict.items()] 68 | filename = "_".join(sorted(filenames)) 69 | if len(filename) == 0: 70 | filename = "default" 71 | 72 | filename += ".npz" 73 | full_path = self.path / folder / filename 74 | return full_path 75 | 76 | def pair_counts(self, **kwargs): 77 | """ 78 | Count pairs. 79 | """ 80 | pair_path = self.dict_to_path("pair_counts", kwargs) 81 | if pair_path.is_file(): 82 | try: 83 | logger.info("retrieved already saved pair count") 84 | return load_matrix(pair_path) 85 | except Exception as e: 86 | logger.info(f"creating pair counts, error while loading files: {e}") 87 | 88 | print("create new pair counts") 89 | pair_path.parent.mkdir(parents=True, exist_ok=True) 90 | count_matrix = pair_counts.count_pairs(self.corpus, **kwargs) 91 | save_matrix(pair_path, count_matrix) 92 | return count_matrix 93 | 94 | def pmi_matrix(self, cds=0.75, pair_args={}, **kwargs): 95 | """ 96 | Create a PMI matrix. 97 | """ 98 | pmi_path = self.dict_to_path("pmi", {"cds": cds, **pair_args}) 99 | if pmi_path.is_file(): 100 | try: 101 | logger.info("retrieved already saved pmi") 102 | return load_matrix(pmi_path) 103 | except Exception as e: 104 | logger.info(f"creating new pmi, error while loading files: {e}") 105 | 106 | print("create new pmi") 107 | counts = self.pair_counts(**pair_args, **kwargs) 108 | 109 | start = timer() 110 | pmi_matrix = pmi.calc_pmi(counts, cds) 111 | 112 | end = timer() 113 | logger.info("pmi took " + str(round(end - start, 2)) + " seconds") 114 | 115 | pmi_path.parent.mkdir(parents=True, exist_ok=True) 116 | save_matrix(pmi_path, pmi_matrix) 117 | logger.info("matrix saved") 118 | 119 | return pmi_matrix 120 | 121 | @record 122 | def pmi( 123 | self, 124 | neg=1, 125 | cds=0.75, 126 | pair_args={}, 127 | keyed_vectors=False, 128 | evaluate=True, 129 | **kwargs, 130 | ): 131 | """ 132 | Gets the PMI matrix. 
133 | """ 134 | m = self.pmi_matrix(cds, pair_args, **kwargs) 135 | embd = pmi.PPMIEmbedding(m, neg=neg) 136 | if evaluate: 137 | eval_results = self.eval_sim(embd) 138 | if keyed_vectors: 139 | # because of the large dimensions, the matrix will get huge! 140 | return self.to_keyed_vectors(embd.m.todense(), m.shape[0]) 141 | if evaluate: 142 | return embd, eval_results 143 | return embd 144 | 145 | def svd_matrix( 146 | self, impl, impl_args={}, dim=500, neg=1, cds=0.75, pair_args={}, **kwargs 147 | ): 148 | """ 149 | Do the actual SVD computation. 150 | """ 151 | assert impl in ["scipy", "gensim", "scikit", "sparsesvd"] 152 | 153 | svd_path = self.dict_to_path( 154 | "svd", 155 | { 156 | "impl": impl, 157 | **impl_args, 158 | "neg": neg, 159 | "cds": cds, 160 | "dim": dim, 161 | **pair_args, 162 | }, 163 | ) 164 | logger.debug(f"looking up the file: {svd_path}") 165 | if svd_path.is_file(): 166 | try: 167 | logger.info("retrieved already saved svd") 168 | return load_arrays(svd_path) 169 | except Exception as e: 170 | logger.info(f"creating new svd, error while loading files: {e}") 171 | 172 | print("creating new svd") 173 | m = self.pmi_matrix(cds, pair_args, **kwargs) 174 | m = pmi.PPMIEmbedding(m, neg=neg, normalize=False) 175 | 176 | start = timer() 177 | ut, s = svd.calc_svd(m, dim, impl, impl_args) 178 | end = timer() 179 | logger.info("svd took " + str(round((end - start) / 60, 2)) + " minutes") 180 | 181 | svd_path.parent.mkdir(parents=True, exist_ok=True) 182 | save_arrays(svd_path, ut, s) 183 | logger.info("svd arrays saved") 184 | 185 | return ut, s 186 | 187 | @record 188 | def svd( 189 | self, 190 | dim=500, 191 | eig=0, 192 | neg=1, 193 | cds=0.75, 194 | impl="scipy", 195 | impl_args={}, 196 | pair_args={}, 197 | keyed_vectors=False, 198 | evaluate=True, 199 | **kwargs, 200 | ): 201 | """ 202 | Gets and SVD embedding. 203 | """ 204 | ut, s = self.svd_matrix( 205 | impl=impl, 206 | impl_args=impl_args, 207 | dim=dim, 208 | neg=neg, 209 | cds=cds, 210 | pair_args=pair_args, 211 | **kwargs, 212 | ) 213 | embedding = svd.SVDEmbedding(ut, s, eig=eig) 214 | 215 | if evaluate: 216 | eval_results = self.eval_sim(embedding) 217 | if keyed_vectors: 218 | embedding = self.to_keyed_vectors(embedding.m, dim) 219 | if evaluate: 220 | return embedding, eval_results 221 | return embedding 222 | 223 | def to_keyed_vectors(self, embd_matrix, dim, delete_unknown=True): 224 | """ 225 | Transform to gensim's keyed vectors structure for further usage. 226 | https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/keyedvectors.py 227 | """ 228 | vectors = WordEmbeddingsKeyedVectors(vector_size=dim) 229 | tokens = self.corpus.vocab.tokens 230 | if delete_unknown: 231 | # delete last row (for token) 232 | embd_matrix = np.delete(embd_matrix, (-1), axis=0) 233 | else: 234 | # the last token is the UNK token so append it 235 | tokens.append("") 236 | 237 | vectors.add(tokens, embd_matrix) 238 | return vectors 239 | 240 | def eval_sim(self, embd, **kwargs): 241 | """ 242 | Evaluate the performance on word similarity datasets. 243 | NB: The corpus has to be initialized with the correct language. 244 | """ 245 | return evaluation.eval_similarity( 246 | embd, 247 | self.corpus.vocab.token2id, 248 | self.corpus.preproc_fun, 249 | lang=self.corpus.lang, 250 | **kwargs, 251 | ) 252 | 253 | def eval_analogy(self, embd, **kwargs): 254 | """ 255 | Evaluate the performance on word analogies datasets. 256 | NB: The corpus has to be initialized with the correct language. 
257 | """ 258 | return evaluation.eval_analogies( 259 | embd, 260 | self.corpus.vocab.token2id, 261 | self.corpus.preproc_fun, 262 | lang=self.corpus.lang, 263 | **kwargs, 264 | ) 265 | 266 | def results(self, **kwargs): 267 | """ 268 | Retrieve evaluation results from the database. 269 | """ 270 | return results_from_db(self.get_db(), **kwargs) 271 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/gur350.txt: -------------------------------------------------------------------------------- 1 | Absage ablehnen 3.5 2 | Absage Stellenanzeige 1.88 3 | Affe Gepäckkontrolle 0.13 4 | Affe Makake 4 5 | Afrika historisch 1 6 | Agentur Irrtum 0 7 | Airbag Kopfairbag 3.88 8 | analysieren Analyse 3.88 9 | Ansehen Schaden 0.88 10 | Arbeitssuchender Bewerbung 2.75 11 | aufklären erklären 2.5 12 | Aufpreis Grundpreis 3.13 13 | Aufstieg Erfolg 3.25 14 | aufzeichnen schreiben 2.75 15 | Aussage Auftritt 1.38 16 | Aussage Rede 2.38 17 | Aussage sagen 3.38 18 | Aussterben bedrohen 2.13 19 | Auto fahren 3.5 20 | Bayern Bayerisch 4 21 | Bayern Deutschland 3.5 22 | Bayern weißblau 2.75 23 | Beamte Amt 3.63 24 | beginnen dauern 2.38 25 | begründen ausgehen 0.88 26 | Behörde Vorschrift 2.75 27 | beinhalten umfassen 3.25 28 | Benedetto Benedikt 3.63 29 | Benziner Dieselversion 3 30 | Berlin Berlin-Kreuzberg 3.38 31 | Berufstätigkeit Erfolg 2.13 32 | beschleunigen übertreiben 1.13 33 | beschuldigen Mitschuld 2.5 34 | Besucher bekommen 1.38 35 | Bewerbung Job 2.38 36 | Bild ähneln 1.38 37 | Bild Grafik 3.13 38 | Bild Röntgenaufnahme 3 39 | Bild Symbol 2.13 40 | Bild visuell 3 41 | Böse Gott 2 42 | Botschaft sichtbar 0.25 43 | Büro Schreibtisch 3 44 | Demut demütig 4 45 | demütig selbstbewusst 1.88 46 | Design Optik 2.63 47 | Designer Eleganz 2.63 48 | deutsch Deutscher 3.88 49 | Deutscher Bundesbürger 3.5 50 | Deutschland Europa 3.25 51 | Ding Gegenstand 4 52 | Doktorandin Abteilung 1.88 53 | Doktorandin Dissertationsthema 2.63 54 | Drehmoment drehfreudig 1.75 55 | dringend rasch 2.38 56 | Durchsicht sehen 2.75 57 | einfach komplex 2.75 58 | Einkommen Gehaltsunterschied 2 59 | Einrichtung Interior 3.5 60 | Einsamkeit allein 3.5 61 | einsteigen aussteigen 2.75 62 | Eleganz klobig 1.38 63 | Eltern Vater 3.5 64 | entgehen bewundern 0.13 65 | entwickeln Entwicklungschef 2.63 66 | Erfolg erfolgreich 4 67 | Erfolg Maßstab 1.25 68 | erforschen herausfinden 3.13 69 | Erhalt bedroht 1 70 | erkennen sehen 3 71 | erklären begründen 2.5 72 | erklären machen 0.5 73 | ernst ironisch 2 74 | erst Ursprugsort 1.38 75 | Erwachsener Geist 0 76 | Erwachsener Kinder 2.63 77 | erwarten klären 0 78 | fahren Automobil 3 79 | filtern herausfiltern 3.63 80 | filtern selektieren 3.38 81 | finden herausfinden 3 82 | Fisch schwimmen 3.38 83 | Flaschenöffner Küchenwerkzeug 3.63 84 | fokussieren Aufmerksamkeit 2.63 85 | folgen sortieren 0.25 86 | Form Farbe 2.13 87 | formulieren Formulierung 3.88 88 | Formulierung Stiftung 0.13 89 | Forscher Wissenschaftler 3.88 90 | Frage Antwort 3.25 91 | Franzose Deutscher 2.38 92 | Frau Familie 2.75 93 | Frau Mann 3.25 94 | Frühlingssonne kitzeln 1.25 95 | Frust frustrieren 3.88 96 | Frust Leidensgenosse 1.88 97 | Frust Rache 1.88 98 | geben nehmen 3.25 99 | Gefühl Frau 1.75 100 | Gegenwind kritisieren 0.5 101 | Gehege Zoo 2.63 102 | Gehirn Kortex 3.25 103 | Gehirn verstehen 2.13 104 | gemeinsam leben 1 105 | Generation Jugendlicher 2.5 106 | geografisch praktisch 0.13 107 | Gepäckkontrolle Flughafen 3.13 108 | Gepäcknetz Staumöglichkeit 
2.25 109 | Geschirrdurcheinander Menschenleben 0.5 110 | Geschlecht Mann 3 111 | Gewalt Frieden 2.63 112 | Gewalt Kämpfer 2.63 113 | Gewicht Karriere 0.38 114 | Glaube natürlich 0.5 115 | Glück glücklich 3.88 116 | Gorilla Schlange 1.25 117 | großzügig schrumpfen 0.5 118 | gründen Arbeitsgruppe 0.75 119 | Grundlagenforschung verstehen 1.63 120 | Hand Erwachsener 1.38 121 | Hand Mensch 2.75 122 | heimisch Urwaldhaus 1 123 | helfen unterstützen 3.38 124 | herausstreichen öffentlich 0.5 125 | Herkunft Geschlecht 1.38 126 | Hintergrund Fassade 2 127 | Hirn Gehirn 3.88 128 | Hirnsignal Neuronenaktivität 3.5 129 | Hoffnung Resignation 2.75 130 | Honorarbasis bezahlen 3 131 | Hunderttausend Menge 3 132 | Hunger Armut 2.88 133 | Inaugurationsmesse Premiere 2.13 134 | informieren erfahren 2.63 135 | Innenspiegel Auto 3.13 136 | Internetseite herunterladen 3.25 137 | italienisch vergehen 0 138 | Jäger Wald 2.75 139 | Kaffeetasse parallel 0 140 | Kaffeetasse Tasse 3.75 141 | Kamera TV-Kamera 3.75 142 | kämpfen idyllisch 0.13 143 | kämpfen Veterinär 0.38 144 | Karriere hinaufklettern 2 145 | Karriere Risiko 1 146 | Kind Familie 3.38 147 | Kompaktvan Modell 2.5 148 | Kopfairbag Seitenairbag 3.25 149 | Krankheit reißen 0.25 150 | Krebserkennung Röntgenaufnahme 2 151 | kühl hübsch 0.38 152 | Kulturwissenschaft Grafiker 0.63 153 | lachen leben 1.63 154 | lassen prägen 0.25 155 | laufen bleiben 1.25 156 | leben hellen 0.13 157 | leben Tod 3.25 158 | Lebensbedürfnis ansiedeln 0.38 159 | legen Tisch 1.13 160 | lernen gleichzeitig 0 161 | Lied singen 3.38 162 | Linguistik Wissenschaft 3.5 163 | Luft Leben 2.75 164 | Lupe suchen 2 165 | lustig Witz 3.25 166 | machen anfertigen 3.63 167 | machen ausüben 2.5 168 | Macht Reich 2.5 169 | Mai Januar 2.88 170 | Mann Geschäftspartner 1.5 171 | männlich Weiblich 3.13 172 | Marktl Bayern 2.25 173 | Mehrarbeit Workaholic 2 174 | Meinung Überzeugung 3.13 175 | Mercedes Premium-Hersteller 2.63 176 | Minister Außenminister 3.38 177 | Minister Ministerpräsident 3.38 178 | Minister Politiker 3.25 179 | mitteilen Nachricht 3 180 | moderat extra 1.25 181 | modern sportlich 1.25 182 | momentan kommend 1.38 183 | Monate alt 2.25 184 | Montag November 2.38 185 | Motor Hubraum 2.75 186 | nachgehen untersuchen 2.75 187 | Natur künstlich 2.63 188 | Niedersachsen Landesverband 1.63 189 | niederschmetternd positiv 1.63 190 | Objekt Gegenstand 3.88 191 | objektiv subjektiv 3.13 192 | pädagogisch weitläufig 0.5 193 | Papst Kirche 3.38 194 | parallel linear 1.75 195 | Pass Reiseschutzpass 2.75 196 | Petersdom Inaugurationsmesse 2.63 197 | Pinguin baden 1.5 198 | plätschern Wasser 2.88 199 | Platz aufgebläht 0.13 200 | Platz Petersplatz 3.13 201 | Pontifikat Papst 3.38 202 | Post Portokosten 3 203 | Premium-Hersteller Opel 1.63 204 | Premium-Hersteller VW 2 205 | Problem Schwierigkeit 3.25 206 | Projekt Aktion 2 207 | Prozentzeichen Symbol 3.38 208 | Prüfung Zeugnis 2.5 209 | Punktverlust Platz 1.13 210 | Ratzinger Papst 3.38 211 | Relevanz relevant 3.88 212 | riesig üppig 2.63 213 | rot-weiß weißblau 2.75 214 | sachlich Seriosität 2.13 215 | sagen erklären 2.13 216 | sagen mitteilen 3.13 217 | Sandwich-Konzept Sicherheit 0.5 218 | schauen sehen 3.75 219 | Schleusung Betrugshandlung 2.13 220 | schließen Überlegung 0.88 221 | Schrank Küchenschrank 3.38 222 | Schwabe sparen 2.75 223 | Schwabe Stuttgarter 3.38 224 | Seitenansicht A-Säule 0.88 225 | Selbstinszenierung Beziehungsarbeit 0.5 226 | serienmäßig extra 2.13 227 | Sicherheit Frontalkollision 1.63 228 | Sicherheit klobig 0.25 229 | 
Sohn aussteigen 0 230 | Sohn Vater 3.38 231 | Spitze allein 1.13 232 | Spitze hoch 2.25 233 | sportlich Interior 0 234 | sportlich teuer 0.38 235 | stark Gehaltsunterschied 0.13 236 | stark Kämpfer 1.88 237 | Steckdose komplex 0.13 238 | Steckdose Stern 0.13 239 | Stellenangebot sehen 0.38 240 | Stellenangebot Wochenzeitung 2.25 241 | Stellenanzeige Bewerbungsgespräch 2.25 242 | Stellenanzeige rasch 0.5 243 | Stoiber drehfreudig 0.25 244 | Stoiber Ministerpräsident 3.13 245 | Studie Dissertationsthema 1.88 246 | Studie Ergebnis 2.75 247 | Studierende Abteilung 1.63 248 | Studierende Note 2.38 249 | Studium arbeiten 2.63 250 | Studium Beruf 3 251 | Studium Deutscher 0.25 252 | Studium Europa 0.5 253 | Studium Gegenstand 0.88 254 | Studium studieren 4 255 | suchen Bundesbürger 0 256 | suchen finden 3 257 | Suchmaschinenbetreiber Eleganz 0.25 258 | Suchmaschinenbetreiber Linkstatistik 1.75 259 | Suchstrategie Optik 0.25 260 | Suchstrategie suchen 3.5 261 | summieren selbstbewusst 0.13 262 | summieren teuer 0.88 263 | Tag demütig 0.25 264 | Tag Donnerstag 3.38 265 | Tag Leben 1.5 266 | Tag Schreibtisch 0 267 | Tag sichtbar 0.63 268 | Tag Stunde 2.75 269 | Tastatur Gott 0 270 | Tastatur Suche 0.63 271 | Tätigkeit Arbeit 3 272 | Tätigkeit visuell 0.13 273 | teuer kostspielig 3.88 274 | teuer Symbol 0.25 275 | Tier Natur 2.63 276 | Tier Röntgenaufnahme 0.25 277 | Tierpark Giraffe 3 278 | Tierpark Grafik 0.5 279 | Tod ähneln 0 280 | Tod Beerdigung 3.25 281 | Topmanagement Job 2.5 282 | Topmanagement Unternehmen 2.75 283 | Traurigkeit bekommen 0.13 284 | Traurigkeit Heimgang 1.13 285 | überzeugen Mitschuld 0.5 286 | überzeugen zeigen 1.5 287 | Überzeugung übertreiben 0.63 288 | Überzeugung Zweifel 2.63 289 | Umfrage Erfolg 0.13 290 | Umfrage Quartalsumfrage 2.88 291 | umklappen Berlin-Kreuzberg 0 292 | umklappen flachlegen 1.63 293 | Unternehmen Dieselversion 0 294 | Unternehmen Firma 3.63 295 | untersuchen Benedikt 0 296 | untersuchen suchen 2.5 297 | Untersuchungsausschuss aussagen 1.88 298 | Untersuchungsausschuss umfassen 0.38 299 | Van Sports-Tourer 2.38 300 | Van Vorschrift 0.25 301 | Vatikan ausgehen 0.13 302 | Vatikan Katholik 3.25 303 | veranstalten betreuen 1.38 304 | veranstalten dauern 0.75 305 | verantwortlich Amt 2.25 306 | verantwortlich zuständig 3.63 307 | vergangen damalig 3.25 308 | vergangen weißblau 0 309 | Vergangenheit alte 2 310 | Vergangenheit Deutschland 1 311 | verhindert Bayerisch 0 312 | verhindert Beihilfe 0.75 313 | verkaufen bezahlen 2.5 314 | verkaufen fahren 0.13 315 | Vernehmung bedrohen 0.75 316 | Vernehmung vernommen 3.63 317 | versäumen sagen 0.13 318 | versäumen überprüfen 0.13 319 | verschicken Post 3 320 | verschicken Rede 0.25 321 | versichern Auftritt 0.13 322 | versichern bedauern 0.5 323 | viel groß 2 324 | viel schreiben 0.38 325 | Volierenzelt Erfolg 0 326 | Volierenzelt Käfig 2.38 327 | vorankommen Entwicklung 2.5 328 | vorankommen Grundpreis 0.25 329 | weit Bewerbung 0 330 | weit erklären 0.25 331 | weit nahe 3.13 332 | weit wegrennen 1.5 333 | Welle Schaden 1 334 | Welle Surfer 3.13 335 | Widerspruch Analyse 1.13 336 | Widerspruch Gebiet 0 337 | Wien deutschsprachig 3 338 | Wien Kopfairbag 0 339 | Wirtschaftsprofessor Irrtum 0.38 340 | Wirtschaftsprofessor Professor 3.63 341 | Wirtschaftsuniversität Abteilung 1.75 342 | Wirtschaftsuniversität historisch 0.63 343 | Witz Gepäckkontrolle 0.25 344 | Witz Joke 4 345 | Witz Kopf 1.13 346 | Witz Makake 0.13 347 | Zebra Stellenanzeige 0 348 | Zebra Tier 3.25 349 | Zielstrebigkeit ablehnen 0.25 350 | 
Zielstrebigkeit Erfolg 2.63 351 | -------------------------------------------------------------------------------- /hyperhyper/pair_counts.py: -------------------------------------------------------------------------------- 1 | """ 2 | construct a co-occurrence matrix by counting word pairs (co-locations of words) 3 | """ 4 | 5 | import logging 6 | import os 7 | import random 8 | from collections import defaultdict 9 | from concurrent import futures 10 | from math import ceil, e, fabs, sqrt 11 | 12 | import numpy as np 13 | from scipy.sparse import coo_matrix, csr_matrix, lil_matrix 14 | from tqdm import tqdm 15 | 16 | from .utils import read_pickle 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def decay(distance, rate): 22 | """ 23 | simple exponential decay 24 | """ 25 | distance -= 1 # the returned value is 1 when the distance is 1 26 | return e ** -(rate * distance) 27 | 28 | 29 | def to_count_matrix(pair_counts, vocab_size): 30 | """ 31 | transforms the counts into a sparse matrix 32 | """ 33 | cols = [] 34 | rows = [] 35 | data = [] 36 | for k, v in pair_counts.items(): 37 | rows.append(k[0]) 38 | cols.append(k[1]) 39 | data.append(v) 40 | # setting to float is important, +1 for UNK 41 | # COO matrix is the fastest for constructing the matrix since we have all 42 | # the data already 43 | count_matrix = coo_matrix( 44 | (data, (rows, cols)), shape=(vocab_size + 1, vocab_size + 1), dtype=np.float32 45 | ) 46 | # CSR matrices support more arithmetic operations and are more efficient 47 | return count_matrix.tocsr() 48 | 49 | 50 | def count_pairs_parallel(texts_paths, count_pairs_closure, low_memory): 51 | """ 52 | count pairs in parallel by loading and processing files to keep memory 53 | consumption low 54 | """ 55 | # Ensure that memory is freed when a job completes. 
56 | res = None 57 | with futures.ProcessPoolExecutor() as executor: 58 | # A dictionary which will contain a list the future info in the key, and the filename in the value 59 | jobs = {} 60 | files_left = len(texts_paths) 61 | files_iter = iter(texts_paths) 62 | 63 | if low_memory: 64 | MAX_JOBS_IN_QUEUE = os.cpu_count() 65 | else: 66 | MAX_JOBS_IN_QUEUE = os.cpu_count() * 2 # heuristic ;) 67 | 68 | with tqdm(total=len(texts_paths), desc="generating pairs") as pbar: 69 | while files_left: 70 | for this_file in files_iter: 71 | job = executor.submit(count_pairs_closure, this_file) 72 | jobs[job] = this_file 73 | if len(jobs) > MAX_JOBS_IN_QUEUE: 74 | break # limit the job submission for now job 75 | 76 | # Get the completed jobs whenever they are done 77 | for job in futures.as_completed(jobs): 78 | files_left -= 1 79 | pbar.update(1) 80 | m = job.result() 81 | if res is None: 82 | res = m 83 | else: 84 | res += m 85 | 86 | del jobs[job] 87 | return res 88 | 89 | 90 | class CountPairsClosure(object): 91 | """ 92 | creating a closure, has to be an object to be pickle-able when doing 93 | multiprocessing 94 | """ 95 | 96 | def __init__(self, **kwargs): 97 | self.__dict__.update(kwargs) 98 | 99 | def __call__(self, text_path): 100 | texts = read_pickle(text_path) 101 | counter = defaultdict(int) 102 | for t in texts: 103 | for pair in iterate_tokens( 104 | t, 105 | self.window, 106 | self.dynamic_window_prob, 107 | self.dynamic_window_deter, 108 | self.dynamic_window_decay, 109 | self.delete_oov, 110 | self.subsampler_prob, 111 | self.vocab_size, # id 112 | ): 113 | counter[pair[0], pair[1]] += pair[2] 114 | return to_count_matrix(counter, self.vocab_size) 115 | 116 | 117 | def iterate_tokens( 118 | tokens, 119 | window, 120 | dynamic_window_prob, 121 | dynamic_window_deter, 122 | dynamic_window_decay, 123 | delete_oov, 124 | subsampler_prob, 125 | unkown_id, 126 | ): 127 | """ 128 | iterate over tokens in a sentence and counting pairs 129 | """ 130 | if delete_oov: 131 | tokens = [t for t in tokens if t != unkown_id] 132 | 133 | if not subsampler_prob is None: 134 | tokens = [ 135 | t 136 | if t not in subsampler_prob or random.random() <= subsampler_prob[t] 137 | else None 138 | for t in tokens 139 | ] 140 | 141 | len_tokens = len(tokens) 142 | res = [] 143 | for i, tok in enumerate(tokens): 144 | if tok is not None: 145 | if dynamic_window_prob: 146 | offset = random.randint(1, window) 147 | else: 148 | offset = window 149 | start = i - offset 150 | if start < 0: 151 | start = 0 152 | end = i + offset + 1 153 | if end > len_tokens: 154 | end = len_tokens 155 | for j in range(start, end): 156 | if j != i and tokens[j] is not None: 157 | count = 1 158 | # the variations are exclusive 159 | if dynamic_window_deter: 160 | distance = fabs(i - j) 161 | count = (window + 1 - distance) / window 162 | if not dynamic_window_decay is None: 163 | distance = fabs(i - j) 164 | count = decay(distance, dynamic_window_decay) 165 | res.append((tok, tokens[j], count)) 166 | return res 167 | 168 | 169 | # storing the default values here again to re-use them when writing to the db 170 | # TODO: implement in a more elegant way 171 | default_pair_args = { 172 | "window": 2, 173 | "dynamic_window": "deter", 174 | "decay_rate": 0.25, 175 | "delete_oov": True, 176 | "subsample": "deter", 177 | "subsample_factor": 1e-5, 178 | } 179 | 180 | 181 | def count_pairs( 182 | corpus, 183 | window=2, 184 | dynamic_window="deter", 185 | decay_rate=0.25, 186 | delete_oov=True, 187 | subsample="deter", 188 | 
subsample_factor=1e-5, 189 | seed=1312, 190 | low_memory=False, 191 | low_memory_chunk=100, 192 | min_count=0, 193 | ): 194 | """ 195 | counting pairs in a corpus 196 | 197 | TODO: instead of giving a subsample_factor, give a portion of tokens to apply subsample 198 | """ 199 | for x in [dynamic_window, subsample]: 200 | if not x is None and not x == False: 201 | assert x in ("deter", "prob", "off", "decay") 202 | 203 | random.seed(seed) 204 | 205 | subsampler_prob = None 206 | if subsample == "prob": 207 | subsampler_prob = subsample_factor * corpus.size 208 | subsampler_prob = { 209 | word: 1 - sqrt(subsampler_prob / count) 210 | for word, count in corpus.counts.items() 211 | if count > subsampler_prob 212 | } 213 | 214 | count_matrix = count_pairs_parallel( 215 | corpus.texts, 216 | CountPairsClosure( 217 | window=window, 218 | dynamic_window_prob=dynamic_window == "prob", 219 | dynamic_window_deter=dynamic_window == "deter", 220 | dynamic_window_decay=decay_rate if dynamic_window == "decay" else None, 221 | delete_oov=delete_oov, 222 | subsampler_prob=subsampler_prob, 223 | vocab_size=corpus.vocab.size, 224 | ), 225 | low_memory=low_memory, 226 | ) 227 | 228 | # already prunning with a `min_count` of 1 can greatly reduces memory usage 229 | logger.info(f"Sparseness rate: {count_matrix.nnz / (corpus.vocab.size ** 2)}") 230 | if not min_count is None and min_count > 0: 231 | count_matrix.data *= count_matrix.data >= min_count 232 | count_matrix.eliminate_zeros() 233 | logger.info( 234 | f"Sparseness rate after pruning: {count_matrix.nnz / (corpus.vocab.size ** 2)}" 235 | ) 236 | 237 | # down sample in a deterministic way 238 | if subsample == "deter": 239 | # construct array with appropriate factor 240 | logger.info("creating array for the subsampling") 241 | subsample_value = subsample_factor * corpus.size 242 | subsampler = np.ones(corpus.vocab.size + 1, dtype=np.float32) 243 | num_sub = 0 244 | for word, count in corpus.counts.items(): 245 | if count > subsample_value: 246 | subsampler[word] = sqrt(subsample_value / count) 247 | num_sub += 1 248 | print(f"subsampling applied to {num_sub / corpus.vocab.size} of the tokens") 249 | 250 | if low_memory: 251 | # iterate over all rows in blocks 252 | count_matrix = lil_matrix(count_matrix) 253 | for i in tqdm(range(ceil((corpus.vocab.size + 1) / low_memory_chunk))): 254 | count_matrix[ 255 | i * low_memory_chunk : (i + 1) * low_memory_chunk, 256 | ] = count_matrix[ 257 | i * low_memory_chunk : (i + 1) * low_memory_chunk, 258 | ].multiply( 259 | subsampler[i * low_memory_chunk : (i + 1) * low_memory_chunk] 260 | .reshape((-1, 1)) 261 | .dot(subsampler.reshape(1, -1)) 262 | ) 263 | else: 264 | logger.info("creating subsampler matrix") 265 | # to 2d matrix 266 | subsampler = subsampler.reshape((-1, 1)).dot(subsampler.reshape(1, -1)) 267 | logger.info("multiply elementwise: start") 268 | # elementwise muplication of 2 matrices 269 | count_matrix = count_matrix.multiply(subsampler) 270 | logger.info("multiply elementwise: done") 271 | # in both cases: transform to csr matrix 272 | count_matrix = csr_matrix(count_matrix) 273 | return count_matrix 274 | -------------------------------------------------------------------------------- /hyperhyper/corpus.py: -------------------------------------------------------------------------------- 1 | """ 2 | represent a collection of texts 3 | """ 4 | 5 | import logging 6 | import os 7 | import random 8 | from array import array 9 | from collections import defaultdict 10 | from concurrent import futures 11 | 
from pathlib import Path 12 | 13 | from gensim.corpora import Dictionary 14 | from gensim.utils import SaveLoad 15 | from tqdm import tqdm 16 | 17 | from .preprocessing import (texts_to_sents, tokenize_texts, 18 | tokenize_texts_parallel) 19 | from .utils import chunks, dsum, read_pickle, to_pickle 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class Vocab(Dictionary): 25 | """ 26 | Holds mapping for the integer ids to the tokens (words). 27 | """ 28 | 29 | def __init__(self, texts=None, **kwargs): 30 | super().__init__(texts) 31 | if not texts is None: 32 | self.filter(**kwargs) 33 | 34 | def filter(self, no_below=0, no_above=1, keep_n=50000, keep_tokens=None): 35 | """ 36 | Filter extremes with sane defaults. 37 | """ 38 | self.filter_extremes( 39 | no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=keep_tokens 40 | ) 41 | 42 | @property 43 | def size(self): 44 | return len(self.token2id) 45 | 46 | @property 47 | def tokens(self): 48 | """ 49 | Return tokens as array (in order of id). 50 | """ 51 | return [tup[0] for tup in sorted(self.token2id.items(), key=lambda x: x[1])] 52 | 53 | 54 | class TransformToIndicesClosure(object): 55 | """ 56 | A closure that is pickable, usefull for multiprocessing. 57 | is the last ID (thus vocab_size) 58 | for sizes: https://docs.python.org/3/library/array.html 59 | """ 60 | 61 | def __init__(self, c): 62 | self.vocab_size = c.vocab.size 63 | self.d = c.vocab.doc2idx 64 | if self.vocab_size <= 65535: 65 | self.size = "H" 66 | else: 67 | self.size = "L" 68 | 69 | def __call__(self, texts): 70 | return array(self.size, self.d(texts, self.vocab_size)) 71 | 72 | 73 | def count_tokens(texts): 74 | """ 75 | Count token frequencies since gensim's dictionary only provides document frequencies. 
76 | """ 77 | counts = defaultdict(int) 78 | for text in texts: 79 | for token in text: 80 | counts[token] += 1 81 | return counts 82 | 83 | 84 | def _texts_to_ids(args): 85 | f, to_indices = args[0], args[1] 86 | texts = read_pickle(f) 87 | transformed = [to_indices(t) for t in texts] 88 | to_pickle(transformed, f) 89 | counts = count_tokens(transformed) 90 | return len(transformed), counts 91 | 92 | 93 | def texts_to_ids(input_text_fns, to_indices): 94 | """ 95 | transform the raw texts to integer ids 96 | """ 97 | total_len = 0 98 | all_counts = [] 99 | with futures.ProcessPoolExecutor() as executor: 100 | # A dictionary which will contain a list the future info in the key, and the filename in the value 101 | jobs = {} 102 | files_left = len(input_text_fns) 103 | files_iter = iter(input_text_fns) 104 | MAX_JOBS_IN_QUEUE = os.cpu_count() * 2 105 | 106 | with tqdm(total=len(input_text_fns), desc="texts to ids") as pbar: 107 | while files_left: 108 | for this_file in files_iter: 109 | job = executor.submit(_texts_to_ids, [this_file, to_indices]) 110 | jobs[job] = this_file 111 | if len(jobs) > MAX_JOBS_IN_QUEUE: 112 | break # limit the job submission for now job 113 | 114 | # Get the completed jobs whenever they are done 115 | for job in futures.as_completed(jobs): 116 | files_left -= 1 117 | pbar.update(1) 118 | num_sents, counts = job.result() 119 | all_counts.append(counts) 120 | total_len += num_sents 121 | del jobs[job] 122 | 123 | return total_len, dsum(*all_counts) 124 | 125 | 126 | def _build_vocab_from_file(args): 127 | f, preproc_func, view_fraction = args[0], args[1], args[2] 128 | 129 | texts = f.read_text().split("\n") 130 | texts = preproc_func(texts) 131 | 132 | # temporary save processed files to continue working later 133 | to_pickle(texts, f.with_suffix(".pkl")) 134 | 135 | # skip at random 136 | if 0.999 > view_fraction < random.random(): 137 | return Vocab() 138 | return Vocab(texts) 139 | 140 | 141 | class Corpus(SaveLoad): 142 | """ 143 | An object to hold text. 144 | """ 145 | 146 | def __init__(self, vocab, preproc_fun, texts=None, input_text_fns=None, lang="en"): 147 | self.vocab = vocab 148 | self.vocab_size = vocab.size 149 | self.lang = lang 150 | self.preproc_fun = preproc_fun 151 | 152 | if texts is None: 153 | to_indices = TransformToIndicesClosure(self) 154 | self.size, self.counts = texts_to_ids(input_text_fns, to_indices) 155 | self.input_text_fns = input_text_fns 156 | self.texts = None 157 | else: 158 | to_indices = TransformToIndicesClosure(self) 159 | transformed = [ 160 | to_indices(t) for t in tqdm(texts, desc="transform to indices") 161 | ] 162 | self.texts = transformed 163 | self.counts = count_tokens(transformed) 164 | self.size = len(transformed) 165 | 166 | def texts_to_file(self, dir, text_chunk_size): 167 | """ 168 | If we haven't created the temporay text files yet, do it here. 169 | We could't do it earlier since we only have location on the filesystem 170 | through the `bunch`. 171 | """ 172 | if self.texts is None: 173 | # re-use the texts that were created for initialization of the corpus 174 | # TODO: make use of chunk size? 
175 | self.texts = self.input_text_fns 176 | fns = [] 177 | Path(dir).mkdir(parents=True, exist_ok=True) 178 | for i, f in enumerate(self.input_text_fns): 179 | new_path = Path(f"{dir}/texts_{i}.pkl").resolve() 180 | # only works if data and bunch are on same file system 181 | f.rename(new_path) 182 | fns.append(new_path) 183 | self.texts = fns 184 | else: 185 | fns = [] 186 | for i, c in enumerate(chunks(self.texts, text_chunk_size)): 187 | fn = Path(f"{dir}/texts_{i}.pkl").resolve() 188 | to_pickle(c, fn) 189 | fns.append(fn) 190 | self.texts = fns 191 | 192 | @staticmethod 193 | def from_file(input_path, limit=None, **kwargs): 194 | """ 195 | Construct a Corpus from a text file with newline-delimited sentences. 196 | """ 197 | logger.info("reading file") 198 | text = Path(input_path).read_text() 199 | lines = text.splitlines() 200 | if limit is not None: 201 | lines = lines[:limit] 202 | logger.info("done reading file") 203 | return Corpus.from_sents(lines, **kwargs) 204 | 205 | @staticmethod 206 | def from_sents( 207 | texts, vocab=None, preproc_func=tokenize_texts_parallel, lang="en", **kwargs 208 | ): 209 | """ 210 | Construct corpus from lists of sentences. 211 | """ 212 | texts = preproc_func(texts) 213 | if vocab is None: 214 | vocab = Vocab(texts, **kwargs) 215 | corpus = Corpus(vocab, preproc_func, texts=texts, lang=lang) 216 | return corpus 217 | 218 | @staticmethod 219 | def from_texts(texts, preproc_func=texts_to_sents, **kwargs): 220 | """ 221 | Construct corpus from list of texts. 222 | """ 223 | return Corpus.from_sents(texts, preproc_func=preproc_func, **kwargs) 224 | 225 | @staticmethod 226 | def from_text_files( 227 | base_dir, preproc_func=texts_to_sents, view_fraction=1, lang="en", **kwargs 228 | ): 229 | """ 230 | Construct a corpus from a folder of text files. 231 | The size of the text files determine the working memory size later on. 232 | This is usefull for larger amount of text. 233 | 234 | Args: 235 | base_dir (str): The directory with the text files. 236 | preproc_func (fun): The funcation to preprocess texts into sentences. 237 | view_fraction (float): Option to only look at portions of the text to determine the most frequent words. 238 | lang (str): The language of the texts, defaults to "en". 239 | 240 | Returns: 241 | Corpus 242 | """ 243 | voc = Vocab() 244 | input_text_fns = list(Path(base_dir).glob("*.txt")) 245 | proc_fns = [f.with_suffix(".pkl") for f in input_text_fns] 246 | 247 | with futures.ProcessPoolExecutor() as executor: 248 | jobs = {} 249 | files_left = len(input_text_fns) 250 | files_iter = iter(input_text_fns) 251 | MAX_JOBS_IN_QUEUE = os.cpu_count() * 2 252 | 253 | with tqdm(total=len(input_text_fns), desc="build up vocab") as pbar: 254 | while files_left: 255 | for this_file in files_iter: 256 | job = executor.submit( 257 | _build_vocab_from_file, 258 | [this_file, preproc_func, view_fraction], 259 | ) 260 | jobs[job] = this_file 261 | if len(jobs) > MAX_JOBS_IN_QUEUE: 262 | break 263 | 264 | for job in futures.as_completed(jobs): 265 | files_left -= 1 266 | pbar.update(1) 267 | # merge into one vocab 268 | voc.merge_with(job.result()) 269 | del jobs[job] 270 | 271 | # only consider most frequent terms etc. 
272 | voc.filter(**kwargs) 273 | 274 | return Corpus(voc, preproc_func, input_text_fns=proc_fns, lang=lang) 275 | 276 | -------------------------------------------------------------------------------- /examples/02_wikipedia.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# hyerhyper is mainly for constructing word embeddings if you you don't have enough data. For large corpora, use word2vec or fastText. However, it still (somewhat) works for Wikipedia.\n", 10 | "\n", 11 | "# A dump of the English wikipedia is quite large. So preprocessing may take a while (hours or days).\n", 12 | "# 1. download wikipedia dump (https://dumps.wikimedia.org/enwiki/), wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2\n", 13 | "# 2. `python WikiExtractor.py enwiki-latest-pages-articles.xml -o en-wiki` WikiExtraktor (https://github.com/attardi/wikiextractor)\n", 14 | "# 3. i=0; for f in en-wiki/*/*; do cp $f en-wiki-flat/$i.txt && ((i++)) && echo $i; done" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import hyperhyper as hy" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# We change the default preprocessing to speed things up. An sentence in this case means a whole article. \n", 33 | "# This is quite dirty, but proper preprocessing (splitting into sentences, removing stop words) takes more time." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stderr", 43 | "output_type": "stream", 44 | "text": [ 45 | "build up vocab: 100%|██████████| 13039/13039 [28:24<00:00, 7.65it/s]\n", 46 | "texts to ids: 100%|██████████| 13039/13039 [13:45<00:00, 15.80it/s]\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "corpus = hy.Corpus.from_text_files('/mnt/data/datasets/wiki/en-wiki-flat', preproc_func=hy.tokenize_texts) " 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stderr", 61 | "output_type": "stream", 62 | "text": [ 63 | "/home/filter/anaconda3/envs/hyperhyper/lib/python3.6/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", 64 | " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "bunch = hy.Bunch(\"/mnt/data/datasets/wiki/wikibunch\", corpus) # saves data (corpus) to disk" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "bunch = hy.Bunch(\"/mnt/data/datasets/wiki/wikibunch\") # load already saved bunch" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stderr", 88 | "output_type": "stream", 89 | "text": [ 90 | "generating pairs: 0%| | 0/13039 [00:00] 344,11M 439KB/s in 16m 33s \n", 20 | "\n", 21 | "2020-02-27 21:00:06 (355 KB/s) - ‘news.2010.en.shuffled.gz’ saved [360828816/360828816]\n", 22 | "\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "# get data, a text file with one sentence per line\n", 28 | "! wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.en.shuffled.gz && gzip -d news.2010.en.shuffled.gz" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import hyperhyper as hy" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "transform to indices: 100%|██████████| 6797225/6797225 [01:05<00:00, 103953.79it/s]\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "corpus = hy.Corpus.from_file('news.2010.en.shuffled') # this may take a while" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "/home/filter/anaconda3/envs/hyperhyper/lib/python3.6/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", 67 | " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "bunch = hy.Bunch(\"news_bunch\", corpus) # saves data (corpus) to disk" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "bunch = hy.Bunch(\"news_bunch\") # load already saved bunch" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "generating pairs: 0%| | 0/68 [00:00