├── tests ├── __init__.py ├── test_utils.py ├── test_pair_counts.py ├── test_dataset.py └── test_bunch.py ├── hyperhyper ├── evaluation_datasets │ ├── __init__.py │ ├── de │ │ ├── __init__.py │ │ ├── ws │ │ │ ├── __init__.py │ │ │ ├── gur65.txt │ │ │ ├── ws353sim.txt │ │ │ ├── schm280.txt │ │ │ ├── ws353rel.txt │ │ │ ├── zg222.txt │ │ │ └── gur350.txt │ │ ├── analogy │ │ │ ├── __init__.py │ │ │ ├── opposite.txt │ │ │ └── open.txt │ │ └── README.txt │ └── en │ │ ├── __init__.py │ │ ├── ws │ │ ├── __init__.py │ │ ├── ws353_similarity.txt │ │ ├── ws353_relatedness.txt │ │ ├── radinsky_mturk.txt │ │ └── ws353.txt │ │ └── analogy │ │ └── __init__.py ├── __init__.py ├── preprocessing.py ├── svd.py ├── utils.py ├── pmi.py ├── evaluation.py ├── experiment.py ├── bunch.py ├── pair_counts.py └── corpus.py ├── .editorconfig ├── .travis.yml ├── pyproject.toml ├── LICENSE ├── .gitignore ├── README.md └── examples ├── 02_wikipedia.ipynb └── 01_news.ipynb /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/analogy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/analogy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | end_of_line = lf 11 | charset = utf-8 12 | 13 | [*.py] 14 | max_line_length = 79 15 | 16 | [*.md] 17 | insert_final_newline = false 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | language: python 3 | python: 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | 8 | install: 9 | - pip install poetry 10 | - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then rm poetry.lock; fi 11 | - poetry install 12 | - poetry run python -m spacy download en_core_web_sm 13 | 14 | script: poetry 
run pytest 15 | 16 | notifications: 17 | email: false 18 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import hyperhyper 4 | 5 | 6 | def foo(li): 7 | return [pow(x, 10) for x in li] 8 | 9 | 10 | def test_map_chunks(): 11 | some_list = list(range(100)) 12 | results = hyperhyper.utils.map_pool_chunks( 13 | some_list, foo, chunk_size=10, combine=True 14 | ) 15 | assert len(results) == 100 16 | assert results[50] == pow(50, 10) 17 | print(results) 18 | -------------------------------------------------------------------------------- /hyperhyper/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging import NullHandler 3 | 4 | from . import evaluation, utils 5 | from .bunch import Bunch 6 | from .corpus import Corpus, Vocab 7 | from .pair_counts import count_pairs 8 | from .preprocessing import (texts_to_sents, tokenize_texts, 9 | tokenize_texts_parallel) 10 | 11 | logging.getLogger(__name__).addHandler(NullHandler()) 12 | 13 | __version__ = "0.1.1" 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["Johannes Filter "] 3 | classifiers = [ 4 | "Programming Language :: Python :: 3.6", 5 | "Programming Language :: Python :: 3.7", 6 | "Programming Language :: Python :: 3.8", 7 | "License :: OSI Approved :: BSD License", 8 | "Topic :: Scientific/Engineering :: Information Analysis", 9 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 10 | ] 11 | description = "Python package to construct word embeddings for small data" 12 | keywords = ["natural-language-processing", "word-embeddings", "ppmi", "pmi", "pmi-svd"] 13 | license = "BSD-2-Clause" 14 | name = "hyperhyper" 15 | readme = "README.md" 16 | repository = "https://github.com/jfilter/hyperhyper" 17 | version = "0.1.1" 18 | 19 | [tool.poetry.dependencies] 20 | dataset = "1.*" 21 | gensim = "3.*" 22 | importlib_resources = {version = "*", python = "<= 3.6"} 23 | python = ">= 3.6" 24 | tqdm = "*" 25 | 26 | scikit-learn = {version = "*", optional = true} 27 | spacy = {version = "2.*", optional = true} 28 | 29 | [tool.poetry.dev-dependencies] 30 | black = "*" 31 | pylint = "*" 32 | pytest = "^5.2" 33 | scikit-learn = "*" 34 | spacy = "*" 35 | 36 | [tool.poetry.extras] 37 | full = ["scikit-learn", "spacy"] 38 | 39 | [build-system] 40 | build-backend = "poetry.core.masonry.api" 41 | requires = ["poetry-core>=1.0.0"] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2014-2015, Omer Levy, Yoav Goldberg, and Ido Dagan 4 | 5 | Copyright (c) 2018-2019, Johannes Hellrich 6 | 7 | Copyright (c) 2019, Johannes Filter 8 | 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions are met: 13 | 14 | * Redistributions of source code must retain the above copyright notice, this 15 | list of conditions and the following disclaimer. 
16 | 17 | * Redistributions in binary form must reproduce the above copyright notice, 18 | this list of conditions and the following disclaimer in the documentation 19 | and/or other materials provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/gur65.txt: -------------------------------------------------------------------------------- 1 | Auto Fahrt 2.75 2 | Auto Polster 1.25 3 | Autogramm Küste 0.04 4 | Autogramm Unterschrift 3.54 5 | Backofen Herd 3.42 6 | Backofen Werkzeug 1.04 7 | Berg Herd 0 8 | Berg Küste 1.71 9 | Berg Wald 1.75 10 | Bruder Bursche 1.58 11 | Bruder Mönch 3.04 12 | Bursche Magier 0.58 13 | Edelstein Juwel 3.83 14 | Fabel Magier 1.54 15 | Fahrt Reise 3.25 16 | Forst Kirchhof 0.46 17 | Forst Wald 3.75 18 | Friedhof Hügel 0.92 19 | Friedhof Kirchhof 3 20 | Friedhof Psychiatrie 0.38 21 | Friedhof Wald 0.96 22 | Gerät Werkzeug 3 23 | Glas Becher 3.25 24 | Glas Juwel 1.08 25 | Glas Zauberer 0.58 26 | Gockel Hahn 4 27 | Grinsen Bursche 0.58 28 | Grinsen Lächeln 3.38 29 | Grinsen Werkzeug 0 30 | Hahn Reise 0 31 | Hügel Berg 3.46 32 | Irrenhaus Friedhof 0.33 33 | Irrenhaus Mönch 0.25 34 | Irrenhaus Obst 0.04 35 | Irrenhaus Psychiatrie 3.67 36 | Junge Bursche 3.79 37 | Junge Fabel 0.38 38 | Junge Hahn 0.29 39 | Kraftfahrzeug Auto 3.79 40 | Kraftfahrzeug Magier 0.04 41 | Kran Werkzeug 1.96 42 | Kranich Hahn 2.21 43 | Küste Forst 1.08 44 | Küste Reise 1.46 45 | Küste Ufer 3.67 46 | Leibeigener Sklave 3.83 47 | Mittag Mittagsstunde 3.54 48 | Mittag Schnur 0.04 49 | Mönch Orakel 0.54 50 | Mönch Sklave 0.58 51 | Nahrung Hahn 1.88 52 | Nahrung Obst 3.29 53 | Obst Backofen 0.92 54 | Orakel Fabel 1.25 55 | Polster Juwel 0.29 56 | Polster Kissen 3.13 57 | Schnur Seil 3.38 58 | Seil Lächeln 0 59 | Ufer Hügel 1.25 60 | Ufer Wald 1.29 61 | Vogel Hahn 3.17 62 | Vogel Kranich 3.54 63 | Vogel Wald 1.63 64 | Zauberer Magier 3.96 65 | Zauberer Orakel 1.71 66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .vscode 107 | test_bunch 108 | dev 109 | -------------------------------------------------------------------------------- /hyperhyper/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | simple text preprocessing such as cleaning and tokenization 3 | """ 4 | 5 | import os 6 | import re 7 | 8 | from gensim.parsing.preprocessing import (preprocess_string, 9 | strip_non_alphanum, strip_tags) 10 | from tqdm import tqdm 11 | 12 | from .utils import map_pool 13 | 14 | try: 15 | import spacy 16 | except: 17 | spacy = None 18 | 19 | 20 | def simple_preproc(text): 21 | """ 22 | replace digits with 0 and lowercase text 23 | """ 24 | return re.sub(r"\d", "0", text.lower()) 25 | 26 | 27 | def tokenize_string(text): 28 | """ 29 | tokenize based on whitespaces 30 | """ 31 | CUSTOM_FILTERS = [simple_preproc, strip_tags, strip_non_alphanum] 32 | return preprocess_string(text, CUSTOM_FILTERS) 33 | 34 | 35 | def tokenize_texts(texts): 36 | """ 37 | tokenize multiple texts (list of texts) based on whitespaces 38 | """ 39 | return [tokenize_string(t) for t in texts] 40 | 41 | 42 | def tokenize_texts_parallel(texts): 43 | """ 44 | tokenize multiple texts based on whitespaces in parrallel 45 | """ 46 | return map_pool(texts, tokenize_string) 47 | 48 | 49 | def texts_to_sents(texts, model="en_core_web_sm", remove_stop=True, lemmatize=True): 50 | """ 51 | transform list of texts to list of sents (list of tokens) and apply 52 | simple text preprocessing 53 | """ 54 | texts = [strip_tags(t) for t in texts] 55 | results = [] 56 | 57 | assert spacy is not None, 'please install spacy, i.e., "pip install spacy"' 58 | 59 | try: 60 | nlp = spacy.load(model, disable=["ner"]) 61 | except Exception as e: 62 | print(e, "\ntrying to download model...") 63 | os.system("python -m spacy download " + model) 64 | nlp = spacy.load(model, disable=["ner"]) 65 | 66 | for doc in tqdm(nlp.pipe(texts), total=len(texts), desc="texts to sents"): 67 | for s in doc.sents: 68 | results.append( 69 | [ 70 | simple_preproc( 71 | strip_non_alphanum(t.lemma_ if lemmatize else t.text) 72 | ) 73 | for t in s 74 | if not any((t.is_punct, t.is_space, remove_stop and t.is_stop)) 75 | ] 76 | ) 77 | return results 78 | -------------------------------------------------------------------------------- /tests/test_pair_counts.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import hyperhyper 4 | 5 | 6 | @pytest.fixture() 7 | def corpus(): 8 | some_text1 = """ 9 | The English Wikipedia is the English-language edition of the free online encyclopedia Wikipedia. Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 10 | 11 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 12 | """ 13 | 14 | some_text2 = """ 15 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 16 | 17 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 18 | 19 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 20 | 21 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 
22 | """ 23 | 24 | texts = [some_text1, some_text2] 25 | c = hyperhyper.Corpus.from_texts(texts) 26 | c.texts_to_file("test_bunch/bla", 5) 27 | return c 28 | 29 | 30 | def test_count(corpus): 31 | pair_c = hyperhyper.count_pairs(corpus) 32 | print(pair_c) 33 | 34 | 35 | def test_count_subs(corpus): 36 | pair_c = hyperhyper.count_pairs(corpus, subsample="prob") 37 | pair_c = hyperhyper.count_pairs(corpus, subsample="deter") 38 | pair_c = hyperhyper.count_pairs(corpus, subsample="deter", low_memory=True) 39 | pair_c = hyperhyper.count_pairs(corpus, dynamic_window="decay") 40 | -------------------------------------------------------------------------------- /hyperhyper/svd.py: -------------------------------------------------------------------------------- 1 | """ 2 | apply SVD on a PPMI matrix to get low-dimensional word embeddings 3 | """ 4 | 5 | import heapq 6 | import logging 7 | 8 | import numpy as np 9 | from gensim.models.lsimodel import stochastic_svd 10 | from scipy.sparse import linalg 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | try: 16 | from sparsesvd import sparsesvd 17 | except ImportError: 18 | logger.info("no sparsesvd") 19 | 20 | try: 21 | from sklearn.utils.extmath import randomized_svd 22 | except ImportError: 23 | logger.info("no sklearn") 24 | 25 | 26 | def calc_svd(matrix, dim, impl, impl_args): 27 | """ 28 | apply truncated SVD with several implementations 29 | 30 | truncated SVD: 31 | sparsesvd: https://pypi.org/project/sparsesvd/ 32 | scipy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html 33 | 34 | randomized truncated SVD: 35 | gensim: https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/lsimodel.py 36 | scikit: https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html 37 | 38 | Check out the comparison: https://github.com/jfilter/sparse-svd-benchmark 39 | """ 40 | if impl == "sparsesvd": 41 | # originally used SVD implementation 42 | ut, s, _ = sparsesvd(matrix.m.tocsc(), dim) 43 | # returns in a different format 44 | ut = ut.T 45 | if impl == "scipy": 46 | ut, s, _ = linalg.svds(matrix.m, dim) 47 | # randomized (but fast) truncated SVD 48 | if impl == "gensim": 49 | # better default arguments 50 | args = {"power_iters": 5, "extra_dims": 10, **impl_args} 51 | ut, s = stochastic_svd(matrix.m, dim, matrix.m.shape[0], **args) 52 | if impl == "scikit": 53 | ut, s, _ = randomized_svd(matrix.m, dim, **impl_args) 54 | 55 | return ut, s 56 | 57 | 58 | class SVDEmbedding: 59 | """ 60 | SVD embeddings. 61 | Enables controlling the weighted exponent of the eigenvalue matrix (eig). 62 | """ 63 | 64 | def __init__(self, ut, s, normalize=True, eig=0.0): 65 | if eig == 0.0: 66 | self.m = ut 67 | elif eig == 1.0: 68 | self.m = s * ut 69 | else: 70 | self.m = np.power(s, eig) * ut 71 | 72 | # not used? 73 | # self.dim = self.m.shape[1] 74 | 75 | if normalize: 76 | self.normalize() 77 | 78 | def normalize(self): 79 | norm = np.sqrt(np.sum(self.m * self.m, axis=1)) 80 | self.m = self.m / norm[:, np.newaxis] 81 | 82 | def represent(self, w_idx): 83 | return self.m[w_idx, :] 84 | 85 | def similarity(self, w_idx_1, w_idx_2): 86 | """ 87 | Assumes the vectors have been normalized. 88 | """ 89 | return self.represent(w_idx_1).dot(self.represent(w_idx_2)) 90 | 91 | def most_similar(self, w_idx, n=10): 92 | """ 93 | Assumes the vectors have been normalized. 
94 | """ 95 | scores = self.m.dot(self.represent(w_idx)) 96 | return heapq.nlargest(n, zip(scores, list(range(len(scores))))) 97 | -------------------------------------------------------------------------------- /hyperhyper/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | utility functions for i/o and other general funtionality 3 | """ 4 | 5 | import logging 6 | import math 7 | import os 8 | import pickle 9 | from collections import defaultdict 10 | from concurrent import futures 11 | 12 | import numpy as np 13 | from scipy.sparse import csr_matrix 14 | from tqdm import tqdm 15 | 16 | num_cpu = os.cpu_count() 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def save_arrays(f, a1, a2): 22 | if type(f) != str: 23 | f = str(f) 24 | np.savez_compressed(f, a1=a1, a2=a2) 25 | 26 | 27 | def load_arrays(f): 28 | if type(f) != str: 29 | f = str(f) 30 | if not f.endswith(".npz"): 31 | f += ".npz" 32 | loader = np.load(f) 33 | return loader["a1"], loader["a2"] 34 | 35 | 36 | def save_matrix(f, m): 37 | if type(f) != str: 38 | f = str(f) 39 | np.savez_compressed( 40 | f, data=m.data, indices=m.indices, indptr=m.indptr, shape=m.shape 41 | ) 42 | 43 | 44 | def load_matrix(f): 45 | if type(f) != str: 46 | f = str(f) 47 | if not f.endswith(".npz"): 48 | f += ".npz" 49 | loader = np.load(f) 50 | return csr_matrix( 51 | (loader["data"], loader["indices"], loader["indptr"]), shape=loader["shape"] 52 | ) 53 | 54 | 55 | def chunks(l, n): 56 | """ 57 | Yield successive n-sized chunks from l. 58 | """ 59 | for i in range(0, len(l), n): 60 | yield l[i : i + n] 61 | 62 | 63 | # TODO: more perfz 64 | def combine_chunks(chunks): 65 | for c in chunks: 66 | for x in c: 67 | yield x 68 | 69 | 70 | def map_pool_chunks( 71 | array, fun, num_chunks=100, chunk_size=None, combine=True, **kwargs 72 | ): 73 | if chunk_size is None: 74 | chunk_size = math.ceil(len(array) / num_chunks) 75 | results = map_pool(chunks(array, chunk_size), fun, total=len(array), **kwargs) 76 | if combine: 77 | results = list(combine_chunks(results)) 78 | return results 79 | 80 | 81 | def map_pool(array, fun, total=None, desc=None, process_chunksize=100): 82 | with futures.ProcessPoolExecutor(num_cpu) as executor: 83 | if desc is None: 84 | return list(executor.map(fun, array, chunksize=process_chunksize)) 85 | return list( 86 | tqdm( 87 | executor.map(fun, array, chunksize=process_chunksize), 88 | total=len(array) if total is None else total, 89 | desc=desc, 90 | ) 91 | ) 92 | 93 | 94 | def delete_folder(pth): 95 | for sub in pth.iterdir(): 96 | if sub.is_dir(): 97 | delete_folder(sub) 98 | else: 99 | sub.unlink() 100 | pth.rmdir() 101 | 102 | 103 | def to_pickle(ob, fn): 104 | fn.parent.mkdir(parents=True, exist_ok=True) 105 | with open(fn, "wb") as outfile: 106 | pickle.dump(ob, outfile) 107 | 108 | 109 | def read_pickle(fn): 110 | with open(fn, "rb") as infile: 111 | return pickle.load(infile) 112 | 113 | 114 | def dsum(*dicts): 115 | """ 116 | sum up numerical values in multiple dictionaries 117 | """ 118 | ret = defaultdict(int) 119 | for d in dicts: 120 | for k, v in d.items(): 121 | ret[k] += v 122 | return dict(ret) 123 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | import hyperhyper 7 | 8 | some_text1 = """ 9 | The English Wikipedia is the English-language 
edition of the free online encyclopedia Wikipedia. Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 10 | 11 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 12 | """ 13 | 14 | some_text2 = """ 15 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 16 | 17 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 18 | 19 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 20 | 21 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 22 | """ 23 | 24 | texts = [some_text1, some_text2] * 10 25 | 26 | 27 | def test_corpus(): 28 | sents = [] 29 | for t in texts: 30 | sents += t.split("\n\n") 31 | corpus = hyperhyper.Corpus.from_sents(sents) 32 | assert corpus.size == len(sents) 33 | assert corpus.counts[corpus.vocab.token2id["wikipedia"]] > 0 34 | assert corpus.vocab.token2id["wikipedia"] == corpus.vocab.tokens.index("wikipedia") 35 | 36 | keys = corpus.vocab.token2id.keys() 37 | print(len(keys)) 38 | 39 | for k in keys: 40 | i = corpus.vocab.token2id[k] 41 | assert i < len(keys) 42 | 43 | 44 | def test_sent_split(): 45 | corpus = hyperhyper.Corpus.from_texts(texts) 46 | print(corpus.texts) 47 | assert corpus.size > 2 48 | 49 | 50 | def test_text_files(): 51 | # setup 52 | test_dir = tempfile.mkdtemp() 53 | for i, t in enumerate(texts): 54 | Path(test_dir + f"/{i}.txt").write_text(t) 55 | # test 56 | corpus = hyperhyper.Corpus.from_text_files(test_dir) 57 | corpus = hyperhyper.Corpus.from_text_files(test_dir, view_fraction=0.2) 58 | print(corpus) 59 | assert corpus.size > 2 60 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/README.txt: -------------------------------------------------------------------------------- 1 | # Sources 2 | 3 | The evaluation datasets for German come from two sources: 4 | 5 | 1. 
Deep Semantic Analogies Dataset 6 | 2. Bachelors' Thesis by Andreas Müller: https://devmount.github.io/GermanWordEmbeddings/ 7 | 3. Word Similarity: https://www.informatik.tu-darmstadt.de/ukp/research_6/data/semantic_relatedness/german_relatedness_datasets/index.en.jsp 8 | 4. Multilingual SimLex999 and WordSim353: http://leviants.com/ira.leviant/MultilingualVSMdata.html 9 | 10 | 11 | Deep Semantic Analogies Dataset 12 | -------------------------------------------- 13 | 14 | Paper: https://www.aclweb.org/anthology/W15-0105 15 | 16 | This collection contains six newly created semantic datasets. 17 | 18 | It contains 5 files: 19 | * de_re-rated_Schm280.txt 20 | * de_sem-para_SemRel.txt 21 | * en_sem-para_BLESS.txt 22 | * en_sem-para_SemRel.txt 23 | * de_toefl_subset.txt 24 | * de_trans_Google_analogies.txt 25 | 26 | For a detailed description of the data, please refer to the paper (see reference below). 27 | For questions, please contact 28 | Maximilian Koeper (koepermn@ims.uni-stuttgart.de), 29 | Christian Scheible (scheibcn@ims.uni-stuttgart.de), or 30 | Sabine Schulte im Walde (schulte@ims.uni-stuttgart.de) 31 | 32 | File descriptions: 33 | ------------------ 34 | * de_re-rated_Schm280.txt contains the re-rated version of the Schm280 set (Schmidt et al. 2001). Schm280 consists of 280 translated word pairs from WordSim350. We re-rated these pairs, asking 10 Judges under the same conditions as in WordSim353. We call the resulting dataset WordSim280. Each line contains a word pair and the mean similarity score in [0,10] 35 | 36 | * en_sem-para_SemRel.txt and de_sem-para_SemRel.txt contain analogy questions based on the word pairs from (Scheible and Schulte im Walde, 2014). Each question is of the form A:B::C:D. The questions cover the relations adj-antonym, noun-hyperonym, noun-synonym, noun-antonym, and verb-antonym. For more details, please refer to the paper. This file consists of several sections (delimited by header lines), each for a different relation. Within a section, each line lists the four related words A, B, C, and D of an analogy "A is to B as C is to D". 37 | 38 | * en_sem-para_BLESS.txt was constructed the same way as the SemRel datasets, but based on hyperonymy and meronymy relations from the BLESS dataset (Baroni & Lenci. 2011). The format is the same as for the SemRel files. 39 | 40 | * de_toefl_subset.txt is a subset of the German word choice questions from the University of Darmstadt (Mohammad et al., 2007). We removed all questions that contain phrases in order to obtain a challenge of a difficulty comparable to the English TOEFL data. Each line contains a question of the form "stem correct_answer distractor1 distractor2 distractor3". 41 | 42 | * de_trans_Google_analogies.txt is the German translation of the Google (Mikolov et al., 2013a) analogy set. We omit the adjective-adverb relation as this distinction does not exist in German. The format is again the same as for the SemRel files. 
43 | 44 | Reference: 45 | ---------- 46 | 47 | @inproceedings{KoeperScheibleSchulte2015IWCS, 48 | title = {Multilingual Reliability and ``Semantic'' Structure of Continuous Word Spaces}, 49 | author = {Maximilian K\"oper, Christian Scheible, Sabine {Schulte im Walde}}, 50 | booktitle = {Proceedings of the 11th International Conference on Computational Semantics (IWCS 2015) -- Short Papers}, 51 | address = {London, UK}, 52 | year = {2015} 53 | } 54 | -------------------------------------------------------------------------------- /hyperhyper/pmi.py: -------------------------------------------------------------------------------- 1 | """ 2 | implements PMI matrix (Pointwise mutual information) 3 | See: https://en.wikipedia.org/wiki/Pointwise_mutual_information 4 | """ 5 | 6 | import heapq 7 | 8 | import numpy as np 9 | from gensim import matutils 10 | from scipy.sparse import csr_matrix, dok_matrix 11 | 12 | 13 | def calc_pmi(counts, cds): 14 | """ 15 | Calculates e^PMI; PMI without the log(). 16 | """ 17 | 18 | sum_w = np.array(counts.sum(axis=1))[:, 0] 19 | sum_c = np.array(counts.sum(axis=0))[0, :] 20 | if cds != 1: 21 | sum_c = sum_c ** cds 22 | sum_total = sum_c.sum() 23 | sum_w = np.reciprocal(sum_w) 24 | sum_c = np.reciprocal(sum_c) 25 | 26 | pmi = csr_matrix(counts) 27 | pmi = multiply_by_rows(pmi, sum_w) 28 | pmi = multiply_by_columns(pmi, sum_c) 29 | pmi = pmi * sum_total 30 | return pmi 31 | 32 | 33 | def multiply_by_rows(matrix, row_coefs): 34 | normalizer = dok_matrix((len(row_coefs), len(row_coefs))) 35 | normalizer.setdiag(row_coefs) 36 | return normalizer.tocsr().dot(matrix) 37 | 38 | 39 | def multiply_by_columns(matrix, col_coefs): 40 | normalizer = dok_matrix((len(col_coefs), len(col_coefs))) 41 | normalizer.setdiag(col_coefs) 42 | return matrix.dot(normalizer.tocsr()) 43 | 44 | 45 | class PPMIEmbedding: 46 | """ 47 | Base class for explicit representations. Assumes that the serialized input is e^PMI. 48 | 49 | Positive PMI (PPMI) with negative sampling (neg). 50 | Negative samples shift the PMI matrix before truncation. 51 | """ 52 | 53 | def __init__(self, matrix, normalize=True, neg=1): 54 | self.m = matrix 55 | self.m.data = np.log(self.m.data) 56 | 57 | # not needed? 58 | # # self.normal = normalize 59 | 60 | if neg is not None: 61 | self.m.data -= np.log(neg) 62 | self.m.data[self.m.data < 0] = 0 63 | self.m.eliminate_zeros() 64 | 65 | if normalize: 66 | self.normalize() 67 | 68 | def normalize(self): 69 | m2 = self.m.copy() 70 | m2.data **= 2 71 | norm = np.reciprocal(np.sqrt(np.array(m2.sum(axis=1))[:, 0])) 72 | normalizer = dok_matrix((len(norm), len(norm))) 73 | normalizer.setdiag(norm) 74 | self.m = normalizer.tocsr().dot(self.m) 75 | 76 | def represent(self, w_idx): 77 | return self.m[w_idx, :] 78 | 79 | def similarity(self, w1, w2): 80 | """ 81 | Assumes the vectors have been normalized. 82 | """ 83 | return self.represent(w1).dot(self.represent(w2).T)[0, 0] 84 | 85 | def most_similar(self, w, n=10): 86 | """ 87 | Assumes the vectors have been normalized. 88 | """ 89 | scores = self.m.dot(self.represent(w).T).T.tocsr() 90 | return heapq.nlargest(n, zip(scores.data, scores.indices)) 91 | 92 | 93 | # TODO: working? 94 | def most_similar_vectors(self, positives, negatives, topn=10): 95 | """ 96 | Some parts taken from gensim. 
97 | https://github.com/RaRe-Technologies/gensim/blob/ea87470e4c065676d3d33df15b8db4192b30ebc1/gensim/models/keyedvectors.py#L690 98 | """ 99 | mean = [np.squeeze(self.represent(x).toarray()) for x in positives] + [-1 * np.squeeze(self.represent(x).toarray()) for x in negatives] 100 | mean = matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32) 101 | 102 | dists = self.m.dot(mean) 103 | 104 | best = matutils.argsort(dists, topn=topn, reverse=True) 105 | return [(best_idx, float(dists[best_idx])) for best_idx in best] 106 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/ws353_similarity.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | 
start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | -------------------------------------------------------------------------------- /hyperhyper/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluate the performance of embeddings with word simularities and word analogies. 3 | 4 | Can't use the evaluation methods in gensim because the keyed vector structure does not work for PPMI. 5 | So we have to caculate the metrics ourselves. 6 | """ 7 | 8 | from pathlib import Path 9 | 10 | import numpy as np 11 | from scipy.stats.stats import spearmanr 12 | 13 | from . 
import evaluation_datasets 14 | 15 | try: 16 | from importlib.resources import path 17 | except ImportError: 18 | # backport for Python <3.7 19 | from importlib_resources import path 20 | 21 | 22 | def read_test_data(lang, type): 23 | """ 24 | read test data that is stored within the module 25 | """ 26 | with path(evaluation_datasets, lang) as eval_dir: 27 | for file in eval_dir.glob(f"{type}/*.txt"): 28 | yield file 29 | 30 | 31 | def to_item(li): 32 | """ 33 | squeeze 34 | """ 35 | if isinstance(li, list): 36 | if len(li) == 0: 37 | return None 38 | if len(li) == 1: 39 | return li[0] 40 | return to_item(li[0]) 41 | return li 42 | 43 | 44 | def setup_test_tokens(p, keep_len): 45 | """ 46 | Read in training data from files and discard comments (etc.) 47 | """ 48 | lines = Path(p).read_text().split("\n") 49 | lines = [l.split() for l in lines] 50 | lines = [l for l in lines if len(l) == keep_len] 51 | return zip(*lines) 52 | 53 | 54 | def eval_similarity(vectors, token2id, preproc_fun, lang="en"): 55 | """ 56 | evaluate word similarity on several test datasets 57 | """ 58 | line_counts, spear_results, full_results = [], [], [] 59 | 60 | for data in read_test_data(lang, "ws"): 61 | results = [] 62 | 63 | token1, token2, sims = setup_test_tokens(data, 3) 64 | # preprocess tokens 'in batch' 65 | token1, token2 = preproc_fun(token1), preproc_fun(token2) 66 | lines = list(zip(token1, token2, sims)) 67 | for x, y, sim in lines: 68 | x, y = to_item(x), to_item(y) 69 | 70 | # not sure if the lines below are needed 71 | # if x is None or y is None: 72 | # continue 73 | 74 | # skip over OOV 75 | if x in token2id and y in token2id: 76 | results.append((vectors.similarity(token2id[x], token2id[y]), sim)) 77 | 78 | if len(results) == 0: 79 | print("not enough results for this dataset: ", data.name) 80 | continue 81 | 82 | actual, expected = zip(*results) 83 | spear_res = spearmanr(actual, expected)[0] 84 | spear_results.append(spear_res) 85 | line_counts.append(len(results)) 86 | oov = (len(lines) - len(results)) / len(lines) 87 | 88 | full_results.append( 89 | { 90 | "name": f"{lang}_{data.stem}", 91 | "score": spear_res, 92 | "oov": oov, 93 | "fullscore": spear_res * (1 - oov), # consider the portion of OOV 94 | } 95 | ) 96 | 97 | micro_avg = sum([x * y for x, y in zip(line_counts, spear_results)]) / sum( 98 | line_counts 99 | ) 100 | macro_avg = sum(spear_results) / len(spear_results) 101 | return {"micro": micro_avg, "macro": macro_avg, "results": full_results} 102 | 103 | 104 | # TODO: 105 | 106 | # analogies 107 | def eval_analogies(vectors, token2id, preproc_fun, lang="en"): 108 | line_counts, full_results = [], [] 109 | 110 | for data in read_test_data(lang, "analogy"): 111 | results = [] 112 | 113 | line_tokens = setup_test_tokens(data, 4) 114 | line_tokens = [preproc_fun(t) for t in line_tokens] 115 | lines = list(zip(*line_tokens)) 116 | for tokens in lines: 117 | tokens = [to_item(x) for x in tokens] 118 | # skip over OOV 119 | if not all([x in token2id for x in tokens]): 120 | continue 121 | 122 | tokens = [token2id[x] for x in tokens] 123 | a, a_, b, b_ = tokens 124 | guesses = vectors.most_similar_vectors([a, b], [a_]) 125 | result = 1 if b_ in guesses else 0 126 | results.append(result) 127 | 128 | if len(results) == 0: 129 | print("not enough results for this dataset: ", data.name) 130 | continue 131 | 132 | sum_results = sum(results) 133 | line_counts.append(len(results)) 134 | oov = (len(lines) - len(results)) / len(lines) 135 | 136 | full_results.append( 137 | { 138 | "name": 
f"{lang}_{data.stem}", 139 | "score": sum_results, 140 | "oov": oov, 141 | "fullscore": sum_results * (1 - oov), # consider the portion of OOV 142 | } 143 | ) 144 | 145 | scores = [x['score'] for x in full_results] 146 | micro_avg = sum([x * y for x, y in zip(line_counts, scores)]) / sum( 147 | line_counts 148 | ) 149 | macro_avg = sum(scores) / len(scores) 150 | return {"micro": micro_avg, "macro": macro_avg, "results": full_results} 151 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/ws353sim.txt: -------------------------------------------------------------------------------- 1 | Tiger Katze 7.92 2 | Tiger Tiger 10 3 | Flugzeug Auto 4.92 4 | Zug Auto 5.54 5 | Fernseher Radio 5.77 6 | Medien Radio 8.15 7 | Brot Butter 5.62 8 | Gurke Kartoffel 4.92 9 | Arzt Krankenschwester 6.69 10 | Professor Doktor 6.77 11 | Student Professor 5.69 12 | klug Student 4.85 13 | klug dumm 5 14 | Vorrat Telefon 0.31 15 | Vorrat CD 0.54 16 | Vorrat Jaguar 0.23 17 | Vorrat Ei 2.15 18 | Aktie Live 1.08 19 | Aktie Leben 0.62 20 | Holz Wald 8.54 21 | Geld Bargeld 9.69 22 | Professor Gurke 0.15 23 | König Kohl 0.23 24 | König Königin 10 25 | König Turm 5.15 26 | Bischoff Rabbi 7 27 | Heilig Sex 0.69 28 | Ficken Sex 9.15 29 | Fußball Basketball 5.38 30 | Fußball Tennis 4.69 31 | Arafat Jackson 0.69 32 | Physik Chemie 7.54 33 | Weltall Chemie 3.31 34 | Wodka Gin 7.92 35 | Wodka Brandy 8.22 36 | Drink Auto 1.85 37 | Trinken Ohren 0.62 38 | Trinken Essen 7.23 39 | Säugen Mutter 7.69 40 | Auto Fahrzeug 9.19 41 | Edelstein Juwel 9.27 42 | Ausflug Reise 8.23 43 | Junge Bursche 9.27 44 | Irrenanstalt Tollhaus 8.23 45 | Magier Zauberer 9.65 46 | Ofen Herd 8.81 47 | Essen Frucht 6.77 48 | Vogel Hahn 6.08 49 | Vogel Kranich 7.46 50 | Kran Arbeitsgerät 6.08 51 | Bursche Bruder 4.23 52 | Mönch Orakel 1.38 53 | Friedhof Waldgebiet 1.92 54 | Essen Hahn 3.46 55 | Küste Hügel 2.69 56 | Wald Friedhof 3.31 57 | Ufer Waldgebiet 2.31 58 | Mönch Sklave 1.08 59 | Küste Wald 1.77 60 | Bursche Zauberer 0.77 61 | Akkord Lächeln 0.31 62 | Glas Magier 1.69 63 | Mittag Faden 0.15 64 | Hahn Reise 0.31 65 | Geld Dollar 7.92 66 | Geld Bargeld 9.19 67 | Geld Währung 8.12 68 | Geld Wäsche 3.62 69 | Tiger Jaguar 6 70 | Tiger Katze 6.92 71 | Tiger Raubtier 8 72 | Tiger Säugetier 6.58 73 | Tiger Tier 7.85 74 | Tiger Organismus 3.59 75 | Tiger Fauna 3.69 76 | Psychologie Psychiatrie 6.85 77 | Psychologie Wissenschaft 5.96 78 | Psychologie Disziplin 4.77 79 | Planet Stern 7.23 80 | Planet Mond 7.08 81 | Planet Sonne 7.08 82 | Präzedenz Beispiel 6.83 83 | Basis Information 3.32 84 | Voraussetzung Erkenntnis 3.15 85 | Beispielhaft Sammlung 1.92 86 | Vorbildlich Gruppe 2.19 87 | Vorangehend Vorausgehend 8.04 88 | Tasse Geschirr 8 89 | Tasse Gegenstand 6.08 90 | Tasse Artefakt 2 91 | Tasse Objekt 5.69 92 | Tasse Ding 5.08 93 | Tasse Essen 1.77 94 | Tasse Substanz 1.69 95 | Jaguar Katze 6.66 96 | Jaguar Auto 7.55 97 | Energie Minister 4.15 98 | Untersuchung Aufwand 4.15 99 | Mars Wasser 2.77 100 | Bild Oberfläche 3.38 101 | Zeichen Kerbe 3.92 102 | Mittwoch Nachrichten 1.38 103 | Meile Kilometer 7.62 104 | Computer Nachrichten 4.23 105 | Atmosphäre Landschaft 2.5 106 | Präsident Orden 2.77 107 | Haut Augen 4.12 108 | Japaner Amerikaner 5.73 109 | Theater Geschichte 3.85 110 | Freiwilliger Motto 0.77 111 | Vorurteil Anerkennung 3.46 112 | Jahrhundert Jahr 6.85 113 | Jahrhundert Nation 1.54 114 | Verzögerung Rassismus 1.31 115 | Verzögerung Nachrichten 1.08 116 | Frieden Plan 3.85 117 | 
Minderheit Frieden 2.23 118 | Versuch Frieden 3.46 119 | Aufmarsch Abzug 6.27 120 | Ankündigung Nachrichten 6.31 121 | Ankündigung Aufwand 1.08 122 | Zeitschrift Verein 1.77 123 | Arzt Personal 4.46 124 | Schule Zentrum 4 125 | Ursache Bluthochdruck 3.65 126 | Harvard Yale 7.85 127 | Krankenhaus Infrastruktur 5 128 | Leben Tod 8.69 129 | Leben Dauer 6.42 130 | Wort Ähnlichkeit 2.46 131 | Gremium Empfehlung 3.65 132 | Direktor Interview 2.46 133 | Frieden Stimmung 4.46 134 | Frieden Versicherung 2.15 135 | Reise Aktivität 6.31 136 | Konsument Vertrauen 3.85 137 | Konsument Energie 3.69 138 | Problem Flughafen 2.23 139 | Auto Flug 3.77 140 | Monat Hotel 0.69 141 | Art Sorte 8.81 142 | Lage Schlussfolgerung 3.69 143 | Situation Isolation 2.15 144 | Richtung Verbindung 3.65 145 | Straße Platz 5.35 146 | Straße Allee 7.73 147 | Straße Häuserblock 5.5 148 | Straße Kinder 3.73 149 | Aufzählung Nähe 0.92 150 | Zelle Telefon 6.38 151 | Herstellung Wanderung 0.69 152 | Richtwert Kennziffer 4.92 153 | Medien Handel 2.87 154 | Medien Vorteil 1.81 155 | Gewinnanteil Auszahlung 7.32 156 | Kalkulation Berechnung 9.54 157 | Ankündigung Produktion 1.65 158 | Gewinn Warnung 2.85 159 | Gewinn Verlust 7.85 160 | Dollar Yen 7.31 161 | Dollar Kohle 5.58 162 | Telefon Zubehör 4.42 163 | Fünf Monat 1.33 164 | Bericht Zuwachs 2.02 165 | Flüssigkeit Wasser 8.62 166 | Marathon Sprint 7.67 167 | Sieben Reihe 2.41 168 | Meeresfrüchte Essen 7.23 169 | Meeresfrüchte Hummer 6.81 170 | Hummer Essen 6.78 171 | Hummer Wein 4.15 172 | Beginn Jahr 4.7 173 | Beginn Partie 4.22 174 | Meisterschaft Turnier 8.05 175 | Grundsatz Versicherung 2.51 176 | Mann Frau 8.69 177 | Mann Präsident 5.46 178 | Mord Totschlag 8.62 179 | Oper Aufführung 6.38 180 | Fokus Leben 2.62 181 | Zuschauer Serie 5.38 182 | Möglichkeit Mädchen 1.65 183 | Bevölkerung Entwicklung 4.27 184 | Moral Wichtigkeit 4.65 185 | Moral Heirat 2.81 186 | Mexiko Brasil 5.23 187 | Oper Industrie 0.85 188 | Zucker Annäherung 1.46 189 | Praxis Institution 4.27 190 | Ministerium Kultur 4.38 191 | Entwicklung Ausgabe 3.15 192 | Erfahrung Musik 1.08 193 | Musik Projekt 4.04 194 | Glas Metal 3.92 195 | Aluminium Metal 7.81 196 | Möglichkeit Glaubwürdigkeit 2.69 197 | Rock Jazz 6.19 198 | Museum Theater 5.42 199 | Betrachtung Architektur 4.38 200 | Regen Gewitter 7.85 201 | Architektur Jahrhundert 3.08 202 | -------------------------------------------------------------------------------- /hyperhyper/experiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | store and retrieve experiment results in a database 3 | """ 4 | 5 | import time 6 | 7 | import sqlalchemy 8 | 9 | from .pair_counts import default_pair_args 10 | 11 | 12 | def flatten_dict(prefix, dict): 13 | """ 14 | flatten a dict Django-style 15 | """ 16 | for k, v in dict.items(): 17 | yield {f"{prefix}__{k}": v} 18 | 19 | 20 | def record(func): 21 | """ 22 | record the evaluation of an embedding in a database 23 | """ 24 | 25 | def wrapper(*args, **kwargs): 26 | results = func(*args, **kwargs) 27 | 28 | if not "pair_args" in kwargs: 29 | kwargs["pair_args"] = default_pair_args 30 | 31 | if "evaluate" in kwargs and not kwargs["evaluate"]: 32 | return results 33 | 34 | if len(results) > 1: 35 | db_dic = {} 36 | # params to dict 37 | db_dic.update({"method": func.__name__}) 38 | for k, v in kwargs.items(): 39 | if type(v) is dict: 40 | if k == "pair_args": 41 | # merge with default arguments of pair counts 42 | v = {**default_pair_args, **v} 43 | for x in flatten_dict(k, v): 
44 | db_dic.update(x) 45 | else: 46 | db_dic.update({k: v}) 47 | # results to dicts 48 | db_dic.update({"micro_results": results[1]["micro"]}) 49 | db_dic.update({"macro_results": results[1]["macro"]}) 50 | for r in results[1]["results"]: 51 | db_dic.update({f"{r['name']}_score": r["score"]}) 52 | db_dic.update({f"{r['name']}_oov": r["oov"]}) 53 | db_dic.update({f"{r['name']}_fullscore": r["fullscore"]}) 54 | 55 | # Couldn't figure out the timeout param for datasets so keep retrying. 56 | while True: 57 | try: 58 | # args[0] is self 59 | db = args[0].get_db() 60 | table = db["experiments"] 61 | # specify type because dataset guesses them sometimes wrongly 62 | # ensure that rows are not duplicated. This may happen, if the same function is called multiple times. 63 | table.insert_ignore( 64 | db_dic, 65 | db_dic.keys(), 66 | types={ 67 | k: sqlalchemy.types.String 68 | if type(v) is str 69 | else sqlalchemy.types.Float 70 | for k, v in db_dic.items() 71 | }, 72 | ) 73 | break 74 | except Exception as e: 75 | print(e) 76 | time.sleep(10) 77 | return results 78 | 79 | return wrapper 80 | 81 | 82 | def results_from_db(db, query={}, order="micro_results desc", limit=100): 83 | """ 84 | retrieve (the best) results from a database 85 | """ 86 | where = [] 87 | for k, v in query.items(): 88 | if type(v) is dict: 89 | for fkfv in flatten_dict(k, v): 90 | # ugly 91 | for fk, fv in fkfv.items(): 92 | where.append(f"{fk}={fv}") 93 | else: 94 | where.append(f"{k}={v}") 95 | if len(where) > 0: 96 | where = "where " + " and ".join(where) 97 | else: 98 | where = "" 99 | 100 | if order is None: 101 | order = "" 102 | if len(order) > 0: 103 | order = f"order by {order}" 104 | 105 | if limit is None: 106 | limit = "" 107 | else: 108 | limit = f"limit {limit}" 109 | 110 | query_string = f"select distinct * from experiments {where} {order} {limit}" 111 | return list(db.query(query_string)) 112 | 113 | 114 | # TODO 115 | # def get_embedding_from_params(row): 116 | # pair_args = {} 117 | # args = {} 118 | # for k, v in row.items(): 119 | # k_parts = k.split("__") 120 | # if len(k_parts) > 1: 121 | # pair_args[k_parts[1]] = v 122 | # else: 123 | # arg[k] = v 124 | 125 | # for best in list(db.query(statement)): 126 | # oov = True if best["pair_args__delete_oov"] == 1 else False 127 | # window = int(best["pair_args__window"]) 128 | # if not isinstance(window, int): 129 | # window = int.from_bytes(window, "little") 130 | # neg = float(best["neg"]) 131 | # if neg.is_integer(): 132 | # neg = int(neg) 133 | # dim = int(best["dim"]) 134 | 135 | # print(oov, best) 136 | # try: 137 | # print(best["neg"]) 138 | # kv, res = b.svd( 139 | # impl="scipy", 140 | # evaluate=True, 141 | # pair_args={ 142 | # "subsample": "deter", 143 | # "subsample_factor": best["pair_args__subsample_factor"], 144 | # "delete_oov": True, 145 | # "decay_rate": best["pair_args__decay_rate"], 146 | # "window": window, 147 | # "dynamic_window": "decay", 148 | # }, 149 | # neg=neg, 150 | # eig=best["eig"], 151 | # dim=dim, 152 | # keyed_vector=True, 153 | # ) 154 | # print(res) 155 | # print(best) 156 | # except Exception as e: 157 | # print(e) 158 | # return kv 159 | 160 | 161 | # def get_best(db, query): 162 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/ws353_relatedness.txt: -------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 
5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 
5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/schm280.txt: -------------------------------------------------------------------------------- 1 | psychologie geist 7.2 2 | fünf monat 3.2 3 | planet galaxie 8.3 4 | wodka gin 7.8 5 | tiger katze 8.2 6 | auto flug 6.1 7 | ankunft hotel 6.2 8 | könig königin 9.4 9 | seife oper 5.1 10 | brühe ei 2.9 11 | essen vorbereitung 4.9 12 | trinken essen 8.1 13 | ankündigung aufwand 1.5 14 | sommer natur 5.4 15 | film star 6.9 16 | psychologie klinik 7.1 17 | spiel sieg 6.5 18 | situation isolation 1.8 19 | psychologie depression 7.6 20 | gouverneur interview 3.6 21 | 
bischof rabbi 6.6 22 | kredit information 2.7 23 | frieden versicherung 1.9 24 | fruchtbarkeit ei 6.5 25 | dollar verlust 5.5 26 | schlaganfall krankenhaus 7.2 27 | vorhergend information 2.4 28 | gewinn verlust 8.2 29 | opec land 4.4 30 | pokal artefakt 4.4 31 | kalkulation berechnung 8.3 32 | psychologie wissenschaft 7.5 33 | heilig sex 1.4 34 | vogel kranich 8.3 35 | psychologie gesundheit 6.4 36 | meile kilometer 8.9 37 | bank geld 7.9 38 | wodka brandy 7.7 39 | boxen runde 6.2 40 | video archiv 4.8 41 | edelstein juwel 8.1 42 | bericht gewinn 2.4 43 | geld abhebung 5.8 44 | oper aufführung 7.4 45 | aktie markt 6.4 46 | straße kinder 3.4 47 | leben tod 8.4 48 | luxus auto 5.2 49 | gurke kartoffel 6 50 | medien radio 7.1 51 | meeresfrüchte meer 7.3 52 | planet sonne 8.3 53 | fußball basketball 7.1 54 | magier zauberer 9.3 55 | fbi fingerabdruck 6 56 | mittag mittagsstunde 7.1 57 | netzwerk hardware 6.3 58 | bild oberfläche 3.5 59 | professor gurke 0.8 60 | mittwoch nachrichten 1.3 61 | öl aktie 4.3 62 | zucker ansatz 0.5 63 | tiger tierwelt 6.5 64 | leben dauer 4.7 65 | tiger zoo 7.2 66 | atmosphäre landschaft 3.3 67 | verbraucher energie 6.3 68 | energie labor 4.3 69 | ficken sex 9.1 70 | nachrichten bericht 7.8 71 | präzedenzfall beispiel 5.7 72 | computer tastatur 7.1 73 | problem flughafen 2.4 74 | buch bücherei 8.1 75 | krieg truppen 7.6 76 | spiel serie 2.8 77 | tasse nahrung 3.9 78 | psychologie angst 7.4 79 | psychologie freud 8.2 80 | spiel mannschaft 7.4 81 | beobachtung architektur 2 82 | arafat frieden 5.4 83 | produktion erhöhung 4.2 84 | schlucht landschaft 5.8 85 | straße häuserblock 5.7 86 | fußball tennis 7.3 87 | computer programm 7.8 88 | computer nachrichten 3.1 89 | haut auge 5.1 90 | waffen geheimnis 3.1 91 | zeichen pause 1.9 92 | krankenhaus infrastruktur 4 93 | tasse kaffee 7.5 94 | liebe sex 7.7 95 | währung markt 5.7 96 | küste ufer 8.1 97 | umwelt ökologie 8.2 98 | tag dämmerung 7.3 99 | kredit karte 6.1 100 | verzögerung rassismus 1.3 101 | tasse flüssig 6.1 102 | problem herausforderung 7.1 103 | geld bargeld 8.9 104 | ministerium kultur 6 105 | wort ähnlichkeit 1.7 106 | geld bank 7.9 107 | oper industrie 1.8 108 | tiger katze 8 109 | regierung krise 6.3 110 | glas zauberer 1.4 111 | trinken auto 1 112 | reise auto 6.4 113 | mittelpunkt leben 3.6 114 | flugzeug auto 7.4 115 | wald friedhof 3.2 116 | psychologie arzt 6 117 | tasse gegenstand 7.1 118 | minderheit frieden 3.4 119 | natur umwelt 8.1 120 | planet stern 8.5 121 | zuschauer serie 6.3 122 | akkord lächeln 0.6 123 | präzedenzfall gruppe 1 124 | fernsehen radio 7.8 125 | museum theater 7.3 126 | opfer notfall 6.3 127 | lebensmittelgeschäft geld 6.2 128 | straße allee 8.3 129 | tasse einheit 2.1 130 | küste wald 4.2 131 | musik projekt 3.2 132 | geld eigentum 5.8 133 | medien handel 1.9 134 | gouverneur amt 5.8 135 | ausrüstung hersteller 4.5 136 | dividend berechnung 6 137 | tiger tiger 10 138 | minister partei 7.5 139 | anfang jahr 5.2 140 | tag sommer 4.2 141 | dollar dollar 9.9 142 | entdeckung weltraum 6.2 143 | film popcorn 7 144 | getränk ohr 0.6 145 | holz wald 7.8 146 | professor doktor 7.5 147 | firma aktie 6.1 148 | geld besitz 6.8 149 | spiel runde 6.5 150 | mars wasser 4.1 151 | freiwilliger motto 0.7 152 | anwalt beweis 6 153 | hahn reise 0.2 154 | linie versicherung 0.8 155 | konzert virtuose 5.8 156 | spiel niederlage 6.8 157 | frieden atmosphäre 3.8 158 | zelle telefon 6.6 159 | geld währung 8.6 160 | film kino 8.4 161 | zug auto 7.6 162 | aluminium metall 8.4 163 | computer internet 
8.3 164 | geld bargeld 9 165 | ausschuss empfehlung 4.3 166 | küste hügel 5.3 167 | fernsehen film 7.8 168 | tasse substanz 3.2 169 | rock jazz 7 170 | arafat jackson 2.3 171 | meisterschaft turnier 8.5 172 | mönch sklave 1.4 173 | fbi ermittlung 6.8 174 | weltraum chemie 4 175 | familie planung 5.5 176 | jahrhundert nation 3.2 177 | mars wissenschaftler 5.3 178 | psychologie wahrnehmung 7 179 | jaguar auto 7.5 180 | film kritiker 6.3 181 | maradona fußball 7.7 182 | richtung kombination 2.4 183 | tasse getränk 7.1 184 | planet konstellation 5.8 185 | profit warnung 2.1 186 | jerusalem israel 7.7 187 | hummer wein 3.6 188 | schrank kleidung 7.6 189 | arafat terror 6.3 190 | sommer dürre 6.3 191 | dollar yen 8.5 192 | planet astronom 7.6 193 | medien gewinn 2.5 194 | buch papier 6.6 195 | tod insasse 3 196 | tod reihe 1.7 197 | auszeichnung tapferkeit 5.7 198 | physik chemie 8.2 199 | hotel reservierung 6.9 200 | meeresfrüchte essen 7.1 201 | typ art 7.9 202 | frieden plan 4.2 203 | bevölkerung entwicklung 5.1 204 | straße ort 6 205 | tiger tier 7.8 206 | flüssigkeit wasser 8.4 207 | haftung versicherung 7 208 | küste wald 5.4 209 | mann gouverneur 6.1 210 | gebiet kilometer 4.2 211 | marathon spring 3.9 212 | glas metall 6.1 213 | energie minister 4.2 214 | mexiko brasilien 7 215 | baseball saison 5.5 216 | geld reichtum 7.8 217 | meeresfrüchte hummer 7.6 218 | arzt krankenschwester 8.2 219 | baby mutter 8 220 | japanisch amerikanisch 7.6 221 | psychologie psychiatrie 8 222 | behandlung genesung 7.9 223 | energie krise 4.8 224 | präzedenzfall recht 4 225 | monat hotel 1.3 226 | schlau dumm 8.4 227 | telefon kommunikation 7.9 228 | jerusalem palästinenser 7.5 229 | tiger jaguar 8.7 230 | jahrhundert jahr 7.9 231 | theater geschichte 3.1 232 | wetter vorhersage 6.7 233 | moral wichtigkeit 3.7 234 | reisen aktivität 5.3 235 | geld dollar 8 236 | geschlecht gleichheit 4.1 237 | ausstellung erinnerungsstück 3.3 238 | planet mond 7.9 239 | mönch orakel 4.7 240 | behinderung tod 3.5 241 | chance glaubwürdigkeit 1.4 242 | bruder mönch 5 243 | grund kriterium 6.1 244 | alkohol chemie 7.1 245 | wasser leck 5.3 246 | mord totschlag 7.9 247 | planet weltraum 8.1 248 | könig turm 5.3 249 | präsident medaille 2.9 250 | psychologie angst 7.3 251 | jaguar katze 8.1 252 | opec öl 7.3 253 | student professor 7.1 254 | tasse geschirr 7.6 255 | ankündigung neuigkeit 8.1 256 | getränk mutter 0.8 257 | mann frau 8.6 258 | psychologie disziplin 5.6 259 | hummer essen 6.5 260 | vogel hahn 8 261 | telefon ausrüstung 3.7 262 | leben lektion 3.4 263 | tiger säugetier 8.1 264 | erfahrung musik 2.7 265 | droge missbrauch 6.3 266 | möglichkeit mädchen 1.2 267 | dollar gewinn 5.8 268 | gesetz rechtsanwalt 6.9 269 | schule zentrum 2.6 270 | sekretär senat 3.4 271 | planet menschen 5.2 272 | tiger organismus 5.2 273 | physik proton 7.9 274 | harvard yale 7.7 275 | tennis schläger 7 276 | tiger fleischfresser 7.1 277 | fußball fußball 9.9 278 | architektur jahrhundert 4 279 | hundert prozent 6.4 280 | brot butter 6.7 281 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/ws353rel.txt: -------------------------------------------------------------------------------- 1 | Liebe Sex 8.46 2 | Buch Papier 7.08 3 | Computer Tastatur 8 4 | Computer Internet 8.08 5 | Telefon Kommunikation 8.38 6 | Drogen Mißbrauch 6.46 7 | klug Student 4.85 8 | Unternehmen Aktie 6.54 9 | Aktie Börse 8.85 10 | Vorrat Telefon 0.31 11 | Vorrat CD 0.54 12 | Vorrat Jaguar 0.23 13 | 
Vorrat Ei 2.15 14 | Fruchtbarkeit Ei 7.92 15 | Aktie Live 1.08 16 | Aktie Leben 0.62 17 | Buch Bibliothek 8.31 18 | Bank Geld 8.15 19 | Professor Gurke 0.15 20 | König Kohl 0.23 21 | Jerusalem Israel 8.85 22 | Jerusalem Palestinensisch 6.85 23 | Heilig Sex 0.69 24 | Maradona Fußball 8 25 | Tennis Schläger 7.08 26 | Arafat Frieden 2.46 27 | Arafat Terror 5.23 28 | Gesetz Anwalt 8.38 29 | Film Star 7.62 30 | Film Popcorn 6.08 31 | Film Kritik 5.85 32 | Kino Theater 6.85 33 | Physik Proton 7 34 | Weltall Chemie 3.31 35 | Alkohol Chemie 5.08 36 | Drink Auto 1.85 37 | Trinken Ohren 0.62 38 | Trinken Mund 6.46 39 | Baby Mutter 7.85 40 | Säugen Mutter 7.69 41 | Werkzeug Arbeitsgerät 8.38 42 | Bruder Mönch 5.92 43 | Kran Arbeitsgerät 6.08 44 | Bursche Bruder 4.23 45 | Fahrt Auto 6.62 46 | Mönch Orakel 1.38 47 | Friedhof Waldgebiet 1.92 48 | Küste Hügel 2.69 49 | Wald Friedhof 3.31 50 | Ufer Waldgebiet 2.31 51 | Mönch Sklave 1.08 52 | Küste Wald 1.77 53 | Bursche Zauberer 0.77 54 | Akkord Lächeln 0.31 55 | Glas Magier 1.69 56 | Mittag Faden 0.15 57 | Hahn Reise 0.31 58 | Geld Reichtum 8.19 59 | Geld Eigentum 6.62 60 | Geld Besitz 6.92 61 | Geld Bank 8.31 62 | Geld Pfand 5.13 63 | Geld Einzahlung 6.23 64 | Geld Abheben 6.54 65 | Geld Wäsche 3.62 66 | Tiger Zoo 5.91 67 | Psychologie Beklemmung 4.35 68 | Psychologie Angst 4.92 69 | Psychologie Depression 6.77 70 | Psychologie Klinik 6.17 71 | Psychologie Arzt 5.85 72 | Psychologie Freud 7 73 | Psychologie Seele 5.88 74 | Psychologie Gesundheit 5.11 75 | Psychologie Erkenntnis 4.92 76 | Planet Konstellation 6.23 77 | Planet Galaxie 7.08 78 | Planet Weltraum 7.08 79 | Planet Astronom 6.38 80 | Basis Information 3.32 81 | Voraussetzung Erkenntnis 3.15 82 | Präzedensfall Gesetz 5.62 83 | Beispielhaft Sammlung 1.92 84 | Vorbildlich Gruppe 2.19 85 | Tasse Kaffee 7.21 86 | Tasse Gegenstand 6.08 87 | Tasse Trinken 7.62 88 | Tasse Essen 1.77 89 | Tasse Substanz 1.69 90 | Tasse Flüssigkeit 5.47 91 | Energie Minister 4.15 92 | Minister Senat 5.96 93 | Energie Labor 3.23 94 | Computer Labor 4.31 95 | Waffe Geheimnis 1.85 96 | Polizei Fingerabdruck 6.23 97 | Polizei Ermittlung 7.27 98 | Untersuchung Aufwand 4.15 99 | Mars Wasser 2.77 100 | Mars Wissenschaftler 5.54 101 | Nachrichten Bericht 7.85 102 | Schlucht Landschaft 6.54 103 | Bild Oberfläche 3.38 104 | Entdeckung Weltall 4.77 105 | Wasser Leck 5.81 106 | Zeichen Kerbe 3.92 107 | Mittwoch Nachrichten 1.38 108 | Computer Nachrichten 4.23 109 | Gebiet Oberfläche 3.77 110 | Atmosphäre Landschaft 2.5 111 | Präsident Orden 2.77 112 | Krieg Truppen 6.81 113 | Rekord Nummer 2.77 114 | Theater Geschichte 3.85 115 | Freiwilliger Motto 0.77 116 | Vorurteil Anerkennung 3.46 117 | Auszeichnung Tapferkeit 5.92 118 | Jahrhundert Nation 1.54 119 | Verzögerung Rassismus 1.31 120 | Verzögerung Nachrichten 1.08 121 | Minister Partei 7.38 122 | Frieden Plan 3.85 123 | Minderheit Frieden 2.23 124 | Versuch Frieden 3.46 125 | Regierung Krise 5.65 126 | Aufmarsch Abzug 6.27 127 | Aufmarsch Rückzug 6.81 128 | Energie Krise 4.77 129 | Ankündigung Aufwand 1.08 130 | Schlaganfall Krankenhaus 6.88 131 | Behinderung Tod 2.42 132 | Opfer Notfall 6.69 133 | Behandlung Erholung 5.46 134 | Zeitschrift Verein 1.77 135 | Arzt Verantwortung 6.65 136 | Haftung Versicherung 7.62 137 | Schule Zentrum 4 138 | Ursache Bluthochdruck 3.65 139 | Ursache Kriterium 3.69 140 | Hundert Prozent 6.92 141 | Tod Trakt 2.46 142 | Tod Insasse 2.38 143 | Rechtsanwalt Beweis 6.04 144 | Leben Dauer 6.42 145 | Wort Ähnlichkeit 2.46 146 | Gremium Empfehlung 3.65 147 
| Direktor Interview 2.46 148 | OPEC Staat 3.92 149 | Frieden Stimmung 4.46 150 | Frieden Versicherung 2.15 151 | Gelände Kilometer 3.46 152 | Wettbewerb Preis 7.73 153 | Konsument Vertrauen 3.85 154 | Konsument Energie 3.69 155 | Problem Flughafen 2.23 156 | Auto Flug 3.77 157 | Kredit Karte 6.31 158 | Vertrauen Information 4.77 159 | Hotel Reservierung 6.81 160 | Lebensmittel Geld 4.88 161 | Registrierung Abmachung 2.69 162 | Vereinbarung Unterkunft 2 163 | Monat Hotel 0.69 164 | Ankunft Hotel 4.54 165 | Bett Schrank 6.15 166 | Schrank Kleider 7.5 167 | Lage Schlussfolgerung 3.69 168 | Situation Isolation 2.15 169 | Unparteilichkeit Interesse 2.5 170 | Richtung Verbindung 3.65 171 | Straße Kinder 3.73 172 | Aufzählung Nähe 0.92 173 | Liste Kategorie 5.85 174 | Herstellung Wanderung 0.69 175 | Richtwert Kennziffer 4.92 176 | Medien Handel 2.87 177 | Medien Vorteil 1.81 178 | Gewinnanteil Kalkulation 6.15 179 | Währung Markt 6.03 180 | OPEC Öl 7.65 181 | Öl Aktie 5.14 182 | Ankündigung Produktion 1.65 183 | Ankündigung Warnung 6.06 184 | Gewinn Warnung 2.85 185 | Dollar Gewinn 5.25 186 | Dollar Verlust 5.27 187 | Computer Software 8.35 188 | Netzwerk Hardware 6.88 189 | Zubehör Hersteller 4.43 190 | Luxus Auto 5.14 191 | Fünf Monat 1.33 192 | Bericht Zuwachs 2.02 193 | Investor Einkommen 4.05 194 | Baseball Saison 4.98 195 | Spiel Sieg 7.08 196 | Spiel Mannschaft 6.88 197 | Spiel Serie 4.85 198 | Spiel Niederlage 6.77 199 | Sieben Reihe 2.41 200 | Meeresfrüchte Meer 7.32 201 | Essen Vorbereitung 5.58 202 | Video Archiv 5.06 203 | Beginn Jahr 4.7 204 | Beginn Partie 4.22 205 | Spiel Runde 6.33 206 | Boxen Runde 7.35 207 | Kämpfen Besiegen 7.41 208 | Grundsatz Versicherung 2.51 209 | Tag Sommer 3.33 210 | Sommer Dürre 6.04 211 | Sommer Natur 5.37 212 | Tag Dämmerung 5.97 213 | Natur Umwelt 8 214 | Umwelt Nachhaltigkeit 6.54 215 | Natur Mensch 6.31 216 | Seife Oper 2.85 217 | Leben Lektion 4.54 218 | Fokus Leben 2.62 219 | Herstellung Belegschaft 3.23 220 | Fernsehen Film 7.31 221 | Liebhaber Streit 4.08 222 | Zuschauer Serie 5.38 223 | Möglichkeit Mädchen 1.65 224 | Bevölkerung Entwicklung 4.27 225 | Moral Wichtigkeit 4.65 226 | Moral Heirat 2.81 227 | Geschlecht Gleichheit 3.81 228 | Änderung Einstellung 4.15 229 | Familie Planung 5.62 230 | Oper Industrie 0.85 231 | Zucker Annäherung 1.46 232 | Praxis Institution 4.27 233 | Ministerium Kultur 4.38 234 | Problem Herausforderung 6.08 235 | Größe Prominenz 5.46 236 | Staat Bürger 6.77 237 | Planet Menschen 5.62 238 | Entwicklung Ausgabe 3.15 239 | Erfahrung Musik 1.08 240 | Musik Projekt 4.04 241 | Möglichkeit Glaubwürdigkeit 2.69 242 | Ausstellungsstück Erinnerungsstück 4.38 243 | Konzert virtuos 4.46 244 | Betrachtung Architektur 4.38 245 | Weltraum Erde 6.46 246 | Erhaltung Welt 4.31 247 | Einlass Eintritt 8.08 248 | Regen Flut 7.27 249 | Wetter Vorhersage 6.46 250 | Katastrophe Gebiet 4.27 251 | Präsident Büro 3.42 252 | Architektur Jahrhundert 3.08 253 | -------------------------------------------------------------------------------- /tests/test_bunch.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | import hyperhyper 7 | 8 | 9 | @pytest.fixture() 10 | def corpus(): 11 | some_text1 = """ 12 | The English Wikipedia is the English-language edition of the free online encyclopedia Wikipedia. 
Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 13 | 14 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 15 | """ 16 | 17 | some_text2 = """ 18 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 19 | 20 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 21 | 22 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 23 | 24 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 
25 | """ 26 | 27 | texts = [some_text1, some_text2] 28 | c = hyperhyper.Corpus.from_texts(texts) 29 | return c 30 | 31 | 32 | def test_bunch(corpus): 33 | bunch = hyperhyper.Bunch("test_bunch", corpus, force_overwrite=True) 34 | pmi_matrix, _ = bunch.pmi() 35 | bunch.eval_sim(pmi_matrix) 36 | 37 | bunch.eval_analogy(pmi_matrix) 38 | 39 | # testing the evaluation of pmi 40 | english_idx = corpus.vocab.token2id["english"] 41 | wikipedia_idx = corpus.vocab.token2id["wikipedia"] 42 | for sim, token_idx in pmi_matrix.most_similar(english_idx): 43 | assert pmi_matrix.similarity(english_idx, token_idx) == pmi_matrix.similarity(token_idx, english_idx) 44 | assert pmi_matrix.similarity(english_idx, token_idx) == sim 45 | 46 | pmi_matrix.most_similar_vectors([english_idx], [wikipedia_idx]) 47 | 48 | svd_matrix, _ = bunch.svd(dim=2) 49 | 50 | # testing the evaluation of svd 51 | english_idx = corpus.vocab.token2id["english"] 52 | for sim, token_idx in svd_matrix.most_similar(english_idx): 53 | assert svd_matrix.similarity(english_idx, token_idx) == svd_matrix.similarity(token_idx, english_idx) 54 | assert svd_matrix.similarity(english_idx, token_idx) == sim 55 | 56 | svd_matrix, _ = bunch.svd(dim=2, keyed_vectors=True) 57 | svd_matrix = bunch.svd(dim=3, keyed_vectors=True, evaluate=False) 58 | 59 | # `most_similar` comes from gensim's keyedvectors 60 | svd_matrix.most_similar("english") 61 | 62 | assert pmi_matrix.m.count_nonzero() > 0 63 | 64 | 65 | def test_db_query(corpus): 66 | bunch = hyperhyper.Bunch("test_bunch", corpus, force_overwrite=True) 67 | bunch.svd(dim=2) 68 | res = bunch.results(query={"dim": 2, "pair_args": {"window": 2}}) 69 | print(res) 70 | 71 | 72 | def test_bunch_text_files(): 73 | some_text1 = """ 74 | The English Wikipedia is the English-language edition of the free online encyclopedia Wikipedia. Founded on 15 January 2001, it is the first edition of Wikipedia and, as of April 2019, has the most articles of any of the editions.[2] As of June 2019, 12% of articles in all Wikipedias belong to the English-language edition. This share has gradually declined from more than 50 percent in 2003, due to the growth of Wikipedias in other languages.[3] As of 1 June 2019, there are 5,870,200 articles on the site,[4] having surpassed the 5 million mark on 1 November 2015.[5] In October 2015, the combined text of the English Wikipedia's articles totalled 11.5 gigabytes when compressed.[6] 75 | 76 | The Simple English Wikipedia is a variation in which most of the articles use only basic English vocabulary. There is also the Old English (Ænglisc/Anglo-Saxon) Wikipedia (angwiki). Community-produced news publications include The Signpost.[7] 77 | """ 78 | 79 | some_text2 = """ 80 | The English Wikipedia was the first Wikipedia edition and has remained the largest. It has pioneered many ideas as conventions, policies or features which were later adopted by Wikipedia editions in some of the other languages. These ideas include "featured articles",[8] the neutral-point-of-view policy,[9] navigation templates,[10] the sorting of short "stub" articles into sub-categories,[11] dispute resolution mechanisms such as mediation and arbitration,[12] and weekly collaborations.[13] 81 | 82 | The English Wikipedia has adopted features from Wikipedias in other languages. These features include verified revisions from the German Wikipedia (dewiki) and town population-lookup templates from the Dutch Wikipedia (nlwiki). 
83 | 84 | Although the English Wikipedia stores images and audio files, as well as text files, many of the images have been moved to Wikimedia Commons with the same name, as passed-through files. However, the English Wikipedia also has fair-use images and audio/video files (with copyright restrictions), most of which are not allowed on Commons. 85 | 86 | Many of the most active participants in the Wikimedia Foundation, and the developers of the MediaWiki software that powers Wikipedia, are English users. 87 | """ 88 | 89 | texts = [some_text1, some_text2] 90 | # setup 91 | test_dir = tempfile.mkdtemp() 92 | for i, t in enumerate(texts): 93 | Path(test_dir + f"/{i}.txt").write_text(t) 94 | # test 95 | corpus = hyperhyper.Corpus.from_text_files(test_dir) 96 | bunch = hyperhyper.Bunch("test_bunch", corpus, force_overwrite=True) 97 | 98 | pmi_matrix, _ = bunch.pmi() 99 | bunch.eval_sim(pmi_matrix) 100 | svd_matrix, _ = bunch.svd(dim=2) 101 | svd_matrix, _ = bunch.svd(dim=2, keyed_vectors=True) 102 | svd_matrix = bunch.svd(dim=2, keyed_vectors=True, evaluate=False) 103 | 104 | print(svd_matrix.most_similar("english")) 105 | 106 | assert pmi_matrix.m.count_nonzero() > 0 107 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/radinsky_mturk.txt: -------------------------------------------------------------------------------- 1 | episcopal russia 2.75 2 | water shortage 2.714285714 3 | horse wedding 2.266666667 4 | plays losses 3.2 5 | classics advertiser 2.25 6 | latin credit 2.0625 7 | ship ballots 2.3125 8 | mistake error 4.352941176 9 | disease plague 4.117647059 10 | sake shade 2.529411765 11 | saints observatory 1.9375 12 | treaty wheat 1.8125 13 | texas death 1.533333333 14 | republicans challenge 2.3125 15 | body peaceful 2.058823529 16 | admiralty intensity 2.647058824 17 | body improving 2.117647059 18 | heroin marijuana 3.375 19 | scottish commuters 2.6875 20 | apollo myth 2.6 21 | film cautious 2.125 22 | exhibition art 4.117647059 23 | chocolate candy 3.764705882 24 | republic candidate 2.8125 25 | gospel church 4.0625 26 | momentum desirable 2.4 27 | singapore sanctions 2.117647059 28 | english french 3.823529412 29 | exile church 2.941176471 30 | navy coordinator 2.235294118 31 | adventure flood 2.4375 32 | radar plane 3.235294118 33 | pacific ocean 4.266666667 34 | scotch liquor 4.571428571 35 | kennedy gun 3 36 | garfield cat 2.866666667 37 | scale budget 3.5 38 | rhythm blues 3.071428571 39 | rich privileges 3.2 40 | navy withdrawn 1.571428571 41 | marble marching 2.615384615 42 | polo charged 2.125 43 | mark missing 2.333333333 44 | battleship army 4.235294118 45 | medium organization 2.5625 46 | pennsylvania writer 1.466666667 47 | hamlet poet 3.882352941 48 | battle prisoners 3.705882353 49 | guild smith 2.75 50 | mud soil 4.235294118 51 | crime assaulted 3.941176471 52 | mussolini stability 2.133333333 53 | lincoln division 2.4375 54 | slaves insured 2.2 55 | summer winter 4.375 56 | integration dignity 3.058823529 57 | money quota 2.5 58 | honolulu vacation 3.6875 59 | libya forged 2.461538462 60 | cheers musician 2.823529412 61 | session surprises 1.8125 62 | billion campaigning 2.571428571 63 | perjury soybean 2.0625 64 | forswearing perjury 3.3125 65 | costume halloween 3.4375 66 | bulgarian nurses 1.941176471 67 | costume ultimate 2.5 68 | faith judging 2.235294118 69 | france bridges 2.235294118 70 | citizenship casey 2.2 71 | recreation dish 1.4 72 | intelligence troubles 1.625 73 | germany worst 
1.4375 74 | chaos death 2.75 75 | sydney hancock 2.857142857 76 | sabbath stevenson 2.214285714 77 | espionage passport 2.3125 78 | political today 1.6875 79 | pipe convertible 2 80 | scouting demonstrate 2.5625 81 | salute patterns 2.235294118 82 | reichstag germany 2.285714286 83 | radiation costumes 1.5625 84 | horace grief 1.764705882 85 | sale rental 3.470588235 86 | open close 4.058823529 87 | photography proving 2.375 88 | propaganda germany 1.705882353 89 | assassination forbes 2.071428571 90 | mirror duel 1.928571429 91 | probability hanging 2.058823529 92 | africa theater 1.5 93 | hell heaven 4.117647059 94 | mussolini italy 3 95 | composer beethoven 3.647058824 96 | minister forthcoming 1.764705882 97 | brussels sweden 3.176470588 98 | neutral parish 1.6 99 | emotion taxation 1.733333333 100 | louisiana simple 2 101 | quarantine disease 3 102 | cannon imprisoned 2.625 103 | bronze suspicion 2 104 | pearl interim 2.352941176 105 | artist paint 4.117647059 106 | relay family 2.0625 107 | art mortality 2.294117647 108 | food investment 2.25 109 | alt tenor 2.692307692 110 | catholics protestant 3.5625 111 | militia landlord 3.0625 112 | battle warships 4.176470588 113 | alcohol fleeing 2.5625 114 | coil ashes 3.117647059 115 | poland russia 4 116 | explosive builders 2.4375 117 | aeronautics plane 4.277777778 118 | charge sentence 3.133333333 119 | pet retiring 2 120 | drink alcohol 4.352941176 121 | stability species 2.375 122 | colonies depression 2 123 | easter preference 2.0625 124 | genius intellect 4.090909091 125 | diamond killed 1.555555556 126 | slavery african 2.8 127 | jurisdiction law 4.454545455 128 | saints repeal 1.555555556 129 | conspiracy campaign 2.166666667 130 | operator extracts 2.214285714 131 | physician action 2.153846154 132 | electronics guess 1.916666667 133 | slavery diamond 2.285714286 134 | quarterback sport 3.142857143 135 | assassination killed 4.285714286 136 | slavery klan 2.230769231 137 | heroin shoot 2.692307692 138 | birds disturbances 1.692307692 139 | palestinians turks 2.5 140 | citizenship court 2.5 141 | immunity violation 2.076923077 142 | alternative contend 2.461538462 143 | chile plates 2.692307692 144 | abraham stranger 1.846153846 145 | kansas city 3.769230769 146 | month year 3.857142857 147 | month day 3.857142857 148 | amateur actor 2.333333333 149 | afghanistan war 3.384615385 150 | transmission maxwell 2.25 151 | manchester ambitious 1.923076923 152 | program battered 1.928571429 153 | drawing music 2.583333333 154 | exile pledges 2.307692308 155 | adventure sixteen 1.538461538 156 | exile threats 2.166666667 157 | concrete wings 1.428571429 158 | seizure bishops 2 159 | submarine sea 3.857142857 160 | villa mayor 2.25 161 | trade farley 2.375 162 | nature forest 3.636363636 163 | chronicle young 1.9 164 | radical bishops 1.818181818 165 | pakistan radical 2.875 166 | fire water 4.266666667 167 | gossip nuisance 3.0625 168 | con examiner 2.266666667 169 | satellite space 3.75 170 | essay boston 2 171 | miniature statue 3.6 172 | spill pollution 3.5 173 | minister council 3.5625 174 | landscape mountain 3.5625 175 | religion remedy 2.5625 176 | ship storm 3.5 177 | college scientist 2.8125 178 | crystal oldest 2.5625 179 | afghanistan wise 2.066666667 180 | trinity religion 3.133333333 181 | homer odyssey 2.857142857 182 | parish clue 2.4375 183 | actress actor 4.0625 184 | patent professionals 2.375 185 | chaos horrible 3.066666667 186 | acre earthquake 2.125 187 | goverment immunity 2 188 | football justice 1.8 189 | gambling 
money 3.75 190 | corruption nervous 1.875 191 | cardinals villages 2.375 192 | life death 4.103448276 193 | artillery sanctions 2.428571429 194 | jerusalem murdered 2.357142857 195 | cell brick 3.285714286 196 | knowledge promoter 2.642857143 197 | adventure rails 2.571428571 198 | houston crash 2.357142857 199 | oxford subcommittee 2.642857143 200 | militia weapon 3.785714286 201 | manufacturer meat 1.857142857 202 | damages reaction 3.071428571 203 | sea fishing 4.357142857 204 | atomic clash 2.785714286 205 | broadcasting athletics 3 206 | mystery expedition 2.538461538 207 | kremlin soviets 3.166666667 208 | pig blaze 1.75 209 | riverside vietnamese 2.25 210 | bitter protective 1.923076923 211 | disaster announced 2.384615385 212 | pork blaze 2.230769231 213 | feet international 1.916666667 214 | radical uniform 2.5 215 | gossip condemned 2.692307692 216 | mozart wagner 3.166666667 217 | soccer boxing 3.4 218 | radical roles 2.75 219 | rescued slaying 3 220 | researchers tested 3.538461538 221 | sales season 2.307692308 222 | homeless refugees 3.615384615 223 | pakistan repair 1.75 224 | athens painting 2.294117647 225 | tiger woods 3.375 226 | aircraft plane 4.473684211 227 | solar carbon 2.842105263 228 | enterprise bankruptcy 2.5 229 | homer springfield 2.833333333 230 | coin awards 2.166666667 231 | rhodes native 2.25 232 | soccer curator 2.125 233 | gasoline stock 2.888888889 234 | guilt extended 2.105263158 235 | rapid singapore 1.764705882 236 | coin banker 3.631578947 237 | london correspondence 1.944444444 238 | pop sex 2.6 239 | medicine bread 2.176470588 240 | asia animal 1.555555556 241 | pop clubhouse 3.210526316 242 | nazi defensive 2.055555556 243 | earth poles 3.421052632 244 | thailand crowded 2.166666667 245 | day independence 3.473684211 246 | controversy pitch 2.375 247 | stock gasoline 3.166666667 248 | composers mozart 3.833333333 249 | tone piano 3.722222222 250 | paris chef 2.111111111 251 | profession responsible 2.722222222 252 | bankruptcy chronicle 2 253 | lebanon war 2.722222222 254 | israel terror 3.055555556 255 | angola military 2.941176471 256 | chemistry patients 2.357142857 257 | munich constitution 3.071428571 258 | piano theater 3.266666667 259 | poetry artist 3.8 260 | acre burned 1.769230769 261 | religion abortion 2.076923077 262 | jazz music 4.533333333 263 | government transportation 3 264 | color wine 2.533333333 265 | jackson quota 1.692307692 266 | shariff deputy 3.642857143 267 | boat negroes 2 268 | shooting sentenced 2.933333333 269 | republicans friedman 2.416666667 270 | politics brokerage 2.5 271 | russian stalin 3.357142857 272 | love philip 2.5 273 | nuclear plant 3.733333333 274 | jamaica queens 3.076923077 275 | dollar asylum 1.846153846 276 | bridge rowing 2.785714286 277 | berlin germany 4 278 | funeral death 4.714285714 279 | albert einstein 4.266666667 280 | gulf shore 3.857142857 281 | ecuador argentina 3.266666667 282 | britain france 3.714285714 283 | sports score 3.866666667 284 | socialism capitalism 3.785714286 285 | treaty peace 4.166666667 286 | exchange market 4.266666667 287 | marriage anniversary 4.333333333 288 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/en/ws/ws353.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10.00 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 
7.50 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7.00 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | stock market 8.08 22 | stock phone 1.62 23 | stock CD 1.31 24 | stock jaguar 0.92 25 | stock egg 1.81 26 | fertility egg 6.69 27 | stock live 3.73 28 | stock life 0.92 29 | book library 7.46 30 | bank money 8.12 31 | wood forest 7.73 32 | money cash 9.15 33 | professor cucumber 0.31 34 | king cabbage 0.23 35 | king queen 8.58 36 | king rook 5.92 37 | bishop rabbi 6.69 38 | Jerusalem Israel 8.46 39 | Jerusalem Palestinian 7.65 40 | holy sex 1.62 41 | fuck sex 9.44 42 | Maradona football 8.62 43 | football soccer 9.03 44 | football basketball 6.81 45 | football tennis 6.63 46 | tennis racket 7.56 47 | Arafat peace 6.73 48 | Arafat terror 7.65 49 | Arafat Jackson 2.50 50 | law lawyer 8.38 51 | movie star 7.38 52 | movie popcorn 6.19 53 | movie critic 6.73 54 | movie theater 7.92 55 | physics proton 8.12 56 | physics chemistry 7.35 57 | space chemistry 4.88 58 | alcohol chemistry 5.54 59 | vodka gin 8.46 60 | vodka brandy 8.13 61 | drink car 3.04 62 | drink ear 1.31 63 | drink mouth 5.96 64 | drink eat 6.87 65 | baby mother 7.85 66 | drink mother 2.65 67 | car automobile 8.94 68 | gem jewel 8.96 69 | journey voyage 9.29 70 | boy lad 8.83 71 | coast shore 9.10 72 | asylum madhouse 8.87 73 | magician wizard 9.02 74 | midday noon 9.29 75 | furnace stove 8.79 76 | food fruit 7.52 77 | bird cock 7.10 78 | bird crane 7.38 79 | tool implement 6.46 80 | brother monk 6.27 81 | crane implement 2.69 82 | lad brother 4.46 83 | journey car 5.85 84 | monk oracle 5.00 85 | cemetery woodland 2.08 86 | food rooster 4.42 87 | coast hill 4.38 88 | forest graveyard 1.85 89 | shore woodland 3.08 90 | monk slave 0.92 91 | coast forest 3.15 92 | lad wizard 0.92 93 | chord smile 0.54 94 | glass magician 2.08 95 | noon string 0.54 96 | rooster voyage 0.62 97 | money dollar 8.42 98 | money cash 9.08 99 | money currency 9.04 100 | money wealth 8.27 101 | money property 7.57 102 | money possession 7.29 103 | money bank 8.50 104 | money deposit 7.73 105 | money withdrawal 6.88 106 | money laundering 5.65 107 | money operation 3.31 108 | tiger jaguar 8.00 109 | tiger feline 8.00 110 | tiger carnivore 7.08 111 | tiger mammal 6.85 112 | tiger animal 7.00 113 | tiger organism 4.77 114 | tiger fauna 5.62 115 | tiger zoo 5.87 116 | psychology psychiatry 8.08 117 | psychology anxiety 7.00 118 | psychology fear 6.85 119 | psychology depression 7.42 120 | psychology clinic 6.58 121 | psychology doctor 6.42 122 | psychology Freud 8.21 123 | psychology mind 7.69 124 | psychology health 7.23 125 | psychology science 6.71 126 | psychology discipline 5.58 127 | psychology cognition 7.48 128 | planet star 8.45 129 | planet constellation 8.06 130 | planet moon 8.08 131 | planet sun 8.02 132 | planet galaxy 8.11 133 | planet space 7.92 134 | planet astronomer 7.94 135 | precedent example 5.85 136 | precedent information 3.85 137 | precedent cognition 2.81 138 | precedent law 6.65 139 | precedent collection 2.50 140 | precedent group 1.77 141 | precedent antecedent 6.04 142 | cup coffee 6.58 143 | cup tableware 6.85 144 | cup article 2.40 145 | cup artifact 2.92 146 | cup object 3.69 147 | cup entity 2.15 148 | cup drink 7.25 149 | cup food 5.00 150 | cup substance 1.92 151 | cup liquid 5.90 152 | jaguar cat 7.42 153 | jaguar car 7.27 154 | energy secretary 1.81 155 
| secretary senate 5.06 156 | energy laboratory 5.09 157 | computer laboratory 6.78 158 | weapon secret 6.06 159 | FBI fingerprint 6.94 160 | FBI investigation 8.31 161 | investigation effort 4.59 162 | Mars water 2.94 163 | Mars scientist 5.63 164 | news report 8.16 165 | canyon landscape 7.53 166 | image surface 4.56 167 | discovery space 6.34 168 | water seepage 6.56 169 | sign recess 2.38 170 | Wednesday news 2.22 171 | mile kilometer 8.66 172 | computer news 4.47 173 | territory surface 5.34 174 | atmosphere landscape 3.69 175 | president medal 3.00 176 | war troops 8.13 177 | record number 6.31 178 | skin eye 6.22 179 | Japanese American 6.50 180 | theater history 3.91 181 | volunteer motto 2.56 182 | prejudice recognition 3.00 183 | decoration valor 5.63 184 | century year 7.59 185 | century nation 3.16 186 | delay racism 1.19 187 | delay news 3.31 188 | minister party 6.63 189 | peace plan 4.75 190 | minority peace 3.69 191 | attempt peace 4.25 192 | government crisis 6.56 193 | deployment departure 4.25 194 | deployment withdrawal 5.88 195 | energy crisis 5.94 196 | announcement news 7.56 197 | announcement effort 2.75 198 | stroke hospital 7.03 199 | disability death 5.47 200 | victim emergency 6.47 201 | treatment recovery 7.91 202 | journal association 4.97 203 | doctor personnel 5.00 204 | doctor liability 5.19 205 | liability insurance 7.03 206 | school center 3.44 207 | reason hypertension 2.31 208 | reason criterion 5.91 209 | hundred percent 7.38 210 | Harvard Yale 8.13 211 | hospital infrastructure 4.63 212 | death row 5.25 213 | death inmate 5.03 214 | lawyer evidence 6.69 215 | life death 7.88 216 | life term 4.50 217 | word similarity 4.75 218 | board recommendation 4.47 219 | governor interview 3.25 220 | OPEC country 5.63 221 | peace atmosphere 3.69 222 | peace insurance 2.94 223 | territory kilometer 5.28 224 | travel activity 5.00 225 | competition price 6.44 226 | consumer confidence 4.13 227 | consumer energy 4.75 228 | problem airport 2.38 229 | car flight 4.94 230 | credit card 8.06 231 | credit information 5.31 232 | hotel reservation 8.03 233 | grocery money 5.94 234 | registration arrangement 6.00 235 | arrangement accommodation 5.41 236 | month hotel 1.81 237 | type kind 8.97 238 | arrival hotel 6.00 239 | bed closet 6.72 240 | closet clothes 8.00 241 | situation conclusion 4.81 242 | situation isolation 3.88 243 | impartiality interest 5.16 244 | direction combination 2.25 245 | street place 6.44 246 | street avenue 8.88 247 | street block 6.88 248 | street children 4.94 249 | listing proximity 2.56 250 | listing category 6.38 251 | cell phone 7.81 252 | production hike 1.75 253 | benchmark index 4.25 254 | media trading 3.88 255 | media gain 2.88 256 | dividend payment 7.63 257 | dividend calculation 6.48 258 | calculation computation 8.44 259 | currency market 7.50 260 | OPEC oil 8.59 261 | oil stock 6.34 262 | announcement production 3.38 263 | announcement warning 6.00 264 | profit warning 3.88 265 | profit loss 7.63 266 | dollar yen 7.78 267 | dollar buck 9.22 268 | dollar profit 7.38 269 | dollar loss 6.09 270 | computer software 8.50 271 | network hardware 8.31 272 | phone equipment 7.13 273 | equipment maker 5.91 274 | luxury car 6.47 275 | five month 3.38 276 | report gain 3.63 277 | investor earning 7.13 278 | liquid water 7.89 279 | baseball season 5.97 280 | game victory 7.03 281 | game team 7.69 282 | marathon sprint 7.47 283 | game series 6.19 284 | game defeat 6.97 285 | seven series 3.56 286 | seafood sea 7.47 287 | seafood food 8.34 288 | 
seafood lobster 8.70 289 | lobster food 7.81 290 | lobster wine 5.70 291 | food preparation 6.22 292 | video archive 6.34 293 | start year 4.06 294 | start match 4.47 295 | game round 5.97 296 | boxing round 7.61 297 | championship tournament 8.36 298 | fighting defeating 7.41 299 | line insurance 2.69 300 | day summer 3.94 301 | summer drought 7.16 302 | summer nature 5.63 303 | day dawn 7.53 304 | nature environment 8.31 305 | environment ecology 8.81 306 | nature man 6.25 307 | man woman 8.30 308 | man governor 5.25 309 | murder manslaughter 8.53 310 | soap opera 7.94 311 | opera performance 6.88 312 | life lesson 5.94 313 | focus life 4.06 314 | production crew 6.25 315 | television film 7.72 316 | lover quarrel 6.19 317 | viewer serial 2.97 318 | possibility girl 1.94 319 | population development 3.75 320 | morality importance 3.31 321 | morality marriage 3.69 322 | Mexico Brazil 7.44 323 | gender equality 6.41 324 | change attitude 5.44 325 | family planning 6.25 326 | opera industry 2.63 327 | sugar approach 0.88 328 | practice institution 3.19 329 | ministry culture 4.69 330 | problem challenge 6.75 331 | size prominence 5.31 332 | country citizen 7.31 333 | planet people 5.75 334 | development issue 3.97 335 | experience music 3.47 336 | music project 3.63 337 | glass metal 5.56 338 | aluminum metal 7.83 339 | chance credibility 3.88 340 | exhibit memorabilia 5.31 341 | concert virtuoso 6.81 342 | rock jazz 7.59 343 | museum theater 7.19 344 | observation architecture 4.38 345 | space world 6.53 346 | preservation world 6.19 347 | admission ticket 7.69 348 | shower thunderstorm 6.31 349 | shower flood 6.03 350 | weather forecast 8.34 351 | disaster area 6.25 352 | governor office 6.34 353 | architecture century 3.78 354 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/zg222.txt: -------------------------------------------------------------------------------- 1 | Abweichung sortieren 2 2 | agieren mobil 2 3 | aktuell Portfolioanalyse 1.523809524 4 | Altersstufe Mut 1.238095238 5 | Anbieter Bedarf 3 6 | Angebotsseite Bestandsaufnahme 1.714285714 7 | angehend Verfahrenstechnik 0.619047619 8 | anleiten rekonstruieren 1.333333333 9 | Ansatz europäisch 0.952380952 10 | anschließend Blutspendeaktion 0.476190476 11 | anschließend Maschinenfunktion 0.571428571 12 | Antrittsvorlesung Justus 0.476190476 13 | Approach Implementation 1.761904762 14 | Arbeitstitel Abkehr 0.380952381 15 | Assistentin Überblick 1.666666667 16 | Aufgabe Vertriebstechniker 2.095238095 17 | aufsuchen adäquat 0.571428571 18 | Ausbildung Beispiel 1.952380952 19 | Ausbildung Gesundheitswesen 1.761904762 20 | Ausbildung nah 1.047619048 21 | Ausgangsmaterial Probenehmer 1.428571429 22 | Ausland Suche 1.428571429 23 | Autor Identität 2.238095238 24 | Baumaschinenmeister beruflich 2.904761905 25 | Bayern Unterrichtsmittel 0.857142857 26 | Beamter Mitarbeit 1.428571429 27 | bearbeiten frühere 0.476190476 28 | Berechnung Firma 2.047619048 29 | Berlin demographisch 1.619047619 30 | Berlin Recht 1.380952381 31 | berücksichtigen kooperieren 1.904761905 32 | Berücksichtigung Branche 0.952380952 33 | beständig Managerinnen 1.095238095 34 | beurteilen denkbar 1.19047619 35 | Beurteilung Verhaltensmuster 2.380952381 36 | bezüglich Durchführung 0.857142857 37 | Bildung Ziel 2.523809524 38 | Bildungsabschluss Verhaltensmuster 1.238095238 39 | Bildungsträger Berufspraxis 1.952380952 40 | bleiben Gewalt 0.619047619 41 | Büroequipment Institut 1.285714286 42 | 
Computer Plattform 3.238095238 43 | Core Metadatenwerkzeug 1.380952381 44 | Datenbank Bachelorstudiengang 1.19047619 45 | Datum erheben 1.380952381 46 | Detailkonstrukteurinnen Tätigkeit 2.19047619 47 | Dienst Ingenieurbüro 2.142857143 48 | Dozent Kamera 0.571428571 49 | drängeln Bereich 0.952380952 50 | Dreieck Stuttgart 0.952380952 51 | Druckplatte Tätigkeit 1.571428571 52 | Durchführung Zusammenarbeit 2.142857143 53 | Eigenaktivität Durchführung 1.857142857 54 | eignen auswirken 1.285714286 55 | eindimensional Backware 0.19047619 56 | einschließlich häufig 0.571428571 57 | einzig Nachbar 0.619047619 58 | elektronisch neu 1.476190476 59 | Entwicklung Struktur 1.857142857 60 | erfolgreich Universität 2.571428571 61 | Erkenntnisinteresse gleichzeitig 0.523809524 62 | Erklärung Aktensperrfrist 1.19047619 63 | europäisch Intervention 2.142857143 64 | Evaluation funktional 1.333333333 65 | Fach Institut 3.047619048 66 | Fachbereichsvertreter individuell 0.80952381 67 | fallen überwachen 0.428571429 68 | Familie Beitrag 1.333333333 69 | Familie Modellvorhaben 0.571428571 70 | Feinwerktechnik Hochschule 1.952380952 71 | Flächeneinsparung Landwirtschaft 2.333333333 72 | Forschungsverbund Langzeittherapieprogramm 1.619047619 73 | Forschungszentrum Euro 1.285714286 74 | Fortschritt Asien 2.333333333 75 | Frage Universität 2.333333333 76 | Führungsmittel Gastlandkontakte 0.380952381 77 | Garnerzeugung Vliesstofferzeugung 3.238095238 78 | Gegenstand Filmherstellung 1.047619048 79 | Gegenwart beinhalten 0.666666667 80 | gelangen Sujet 0.428571429 81 | genius stilistisch 1.142857143 82 | Gentest Risiko 2.571428571 83 | Georg August 1.238095238 84 | gerade gängig 1.095238095 85 | Gespannfahren Altersgruppe 0.380952381 86 | Gesundheitsbegriff Entwicklungsgeschichte 0.952380952 87 | Gleitkomma deaktivieren 0.666666667 88 | groß Arbeitszeitregelung 0.238095238 89 | groß methodisch 0.523809524 90 | großflächig vorzeitig 0.285714286 91 | gründen Forschung 2.285714286 92 | Gymnasium Ober 1.380952381 93 | Handarbeit Flöte 1.666666667 94 | Handlungsanleitung Kooperationspartner 1.142857143 95 | Handout üben 1.428571429 96 | Helfer Problem 2.952380952 97 | Hubschraubertyp einschließlich 0.19047619 98 | Identifizierung praxisbezogen 0.761904762 99 | Institut Einführung 1.380952381 100 | Instrumentarium anwendbar 2.380952381 101 | Interaktion Auswirkung 2.619047619 102 | Internet außerdem 0.285714286 103 | Interpretation politisch 1.714285714 104 | Kenntnis Ingenieur 2.80952381 105 | Kenntnis speziell 2.80952381 106 | Kolloquium Wissen 3 107 | Kompetenz Arzt 3 108 | Konflikt deutsch 1.476190476 109 | konkret Handlungsempfehlung 2.047619048 110 | Konstruktionsbüro elektro 1.285714286 111 | Körpernorm Lebenszusammenhang 0.904761905 112 | Korrektur nutzen 1.047619048 113 | Kostüm gekonnt 1.095238095 114 | Kraft Geselle 0.952380952 115 | Krankenhausmanagement Betriebswirt 2.476190476 116 | künstlich Samenzahnrad 0.761904762 117 | lassen Elisabeth 0.142857143 118 | lehren Verkehrswirtschaft 1.571428571 119 | Lehrerausbildung Medium 1.619047619 120 | Lehrerrolle Hilfe 2.666666667 121 | Leopold Institut 0.80952381 122 | Literaturwissenschaft allgemein 0.952380952 123 | logisch Juni 0.142857143 124 | Lust Uni 1.714285714 125 | Management international 2.333333333 126 | Marketing Firma 2.904761905 127 | Maschinenbau Beschreibung 1.19047619 128 | Metall Berufsbezeichnung 1.285714286 129 | Microsoft Industries 2.523809524 130 | Migrantinnen Handelsstruktur 0.761904762 131 | MIPS Core 2.095238095 132 | mobil beschränkt 
1.666666667 133 | Motivation geplant 1.333333333 134 | müssen Übergeordneter 1.285714286 135 | Mut lassen 0.80952381 136 | Neoautoritarismus Chance 0.428571429 137 | Objekt wechselseitig 1.142857143 138 | Outfit Strom 0.238095238 139 | Personaldisposition überwachen 1.238095238 140 | Pflanzenzüchtung models 0.428571429 141 | Pharmakotherapie Evidence 0.523809524 142 | Polarisierung Beurteilung 1.619047619 143 | Politikbereich altersbezogen 1.238095238 144 | Porter wechselseitig 0.428571429 145 | postmaterialistisch diesbezüglich 0.285714286 146 | praxisbezogen ausbilden 2.761904762 147 | Pressebüro Nanopartikel 0.285714286 148 | Privatkunde Bereich 0.952380952 149 | Problem Grenze 1.857142857 150 | Prof. Ludwig 0.904761905 151 | Programmiersystem Warte 0.619047619 152 | Quelle Text 3.238095238 153 | Rahmenbedingung Hochtechnologie 1.761904762 154 | Rainer Folie 0.142857143 155 | redaktionell stützen 1 156 | Reflexivität kollektiv 0.80952381 157 | Reformmöglichkeit Bildungspolitik 2.714285714 158 | Regisseur gestalterisch 2.428571429 159 | Rekonstruktion vornehmlich 0.571428571 160 | religiös Sahara 0.666666667 161 | Restaurierungsmethode Bildungsträger 0.619047619 162 | Risikokind Start 0.80952381 163 | Schiff Segelflugzeug 2.19047619 164 | schließen Reiseantrag 0.619047619 165 | Schritt Wohnung 0.619047619 166 | selbstständig individuell 3 167 | Sicherheit Frontenbildung 1.095238095 168 | sicherheitspolitisch vereinigt 1.476190476 169 | sozial insistieren 0.80952381 170 | Soziales sozial 3.761904762 171 | Spanish Latein 3.333333333 172 | Spielidee Computergraphik 2.80952381 173 | Sport bargeldlos 0.428571429 174 | starten Endlast 0.904761905 175 | stehen Finger 0.476190476 176 | stehen politisch 1.142857143 177 | strafen Paragraph 3.047619048 178 | Studie Anpassung 1.428571429 179 | Stufe beurteilen 1.952380952 180 | Stuttgart Ausbildung 0.80952381 181 | Suche Entnahme 1.142857143 182 | Tätigkeit ausführen 3.666666667 183 | Tätigkeit Maschine 2.619047619 184 | Tätigkeitsbezeichnung Personal 2.666666667 185 | Tiefbaubauingenieur heranführen 0.619047619 186 | Tomcat zentral 0.666666667 187 | Trage Berührung 0.761904762 188 | Turnier Sport 3.619047619 189 | Übersicht Kursstätte 0.904761905 190 | überzeugen Kommunikation 2.80952381 191 | üblich Sport 0.714285714 192 | Umwelt Organisationskompetenz 1.333333333 193 | Umweltschutz Gesundheitsschutz 2.857142857 194 | Uni Titel 3.095238095 195 | Universität Anforderung 2.952380952 196 | Universität Bildungseinrichtung 3.904761905 197 | Universität Euro 1.238095238 198 | Universitätsklinik Universität 3.523809524 199 | unterrichten soft 0.714285714 200 | Unterrichtsmittel Versuchsfläche 1.619047619 201 | unterschiedlich fallen 0.571428571 202 | unterzeichnen gewährleisten 1.80952381 203 | verantwortlich Firma 2.333333333 204 | verarbeiten dichten 1.380952381 205 | verfügen Kommunikation 1.095238095 206 | Verkäuferinnen Gesteck 1.19047619 207 | Verwaltung Betriebswirt 2.857142857 208 | Vortrag technisch 1.80952381 209 | Wahlfächer Gymnasium 3.238095238 210 | wahrnehmen Grundsatzfrage 0.857142857 211 | wahrnehmen selbstständig 1.238095238 212 | wahrnehmen Trägereinrichtung 0.380952381 213 | wahrnehmen zusammenarbeiten 0.761904762 214 | Wartung Einhaltung 2 215 | Weiterbildung Arbeitsbereich 2.571428571 216 | Welthungerhilfe Form 0.714285714 217 | Widerstand diagnostisch 0.571428571 218 | Wirtschaftsminister handeln 2.380952381 219 | Wörterbuch Bewertung 0.857142857 220 | Zusammenarbeit Objekt 1.095238095 221 | Zusammenarbeit Wiki 2.761904762 222 | 
zusammenstellen zwei 1.952380952 223 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `hyperhyper` [![Build Status](https://travis-ci.com/jfilter/hyperhyper.svg?branch=master)](https://travis-ci.com/jfilter/hyperhyper) [![PyPI](https://img.shields.io/pypi/v/hyperhyper.svg)](https://pypi.org/project/hyperhyper/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/hyperhyper.svg)](https://pypi.org/project/hyperhyper/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/hyperhyper)](https://pypistats.org/packages/hyperhyper) 2 | 3 | `hyperhyper` is a Python package to construct word embeddings for small data. 4 | 5 | ## Why? 6 | 7 | Nowadays, [word embeddings](https://en.wikipedia.org/wiki/Word_embedding) are mostly associated with [Word2vec](https://en.wikipedia.org/wiki/Word2vec) or [fastText](https://en.wikipedia.org/wiki/FastText). 8 | These approaches focus on scenarios where an abundance of data is available. 9 | And big players such as Facebook provide ready-to-use [pre-trained word embeddings](https://fasttext.cc/docs/en/crawl-vectors.html). 10 | So often you don't have to train new word embeddings from scratch. 11 | But sometimes you do. 12 | 13 | Word2vec or fastText require a lot of data – but texts, especially domain-specific texts, may be scarce. 14 | There are alternative methods based on counting co-occurrences (word pairs) that require less data to work. 15 | This package implements these approaches (somewhat) efficiently. 16 | 17 | ## Installation 18 | 19 | ```bash 20 | pip install hyperhyper 21 | ``` 22 | 23 | To enable all features (such as pre-processing with spaCy): 24 | 25 | ```bash 26 | pip install hyperhyper[full] 27 | ``` 28 | 29 | ## Usage 30 | 31 | ```python 32 | import hyperhyper as hy 33 | 34 | # download and uncompress the data 35 | # wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.en.shuffled.gz && gzip -d news.2010.en.shuffled.gz 36 | corpus = hy.Corpus.from_file("news.2010.en.shuffled") 37 | bunch = hy.Bunch("news_bunch", corpus) 38 | 39 | # `hyperhyper` is built upon `gensim`, so you can get word embeddings in a keyed vectors format. 40 | # https://radimrehurek.com/gensim/models/keyedvectors.html 41 | vectors, results = bunch.svd(keyed_vectors=True) 42 | 43 | results["results"][1] 44 | >>> {"name": "en_ws353", 45 | "score": 0.6510955349164682, 46 | "oov": 0.014164305949008499, 47 | "fullscore": 0.641873218557878} 48 | 49 | vectors.most_similar("berlin") 50 | >>> [("vienna", 0.6323208808898926), 51 | ("frankfurt", 0.5965485572814941), 52 | ("munich", 0.5737138986587524), 53 | ("amsterdam", 0.5511572360992432), 54 | ("stockholm", 0.5423270463943481)] 55 | ``` 56 | 57 | Check out the [examples](./examples). 58 | 59 | The general concepts: 60 | 61 | - preprocess data once and save them in a `bunch` 62 | - cache all results and also record their performance on test data 63 | - make it easy to fine-tune parameters for your data 64 | 65 | More documentation may be forthcoming. Until then you have to read the [source code](./hyperhyper). 66 | 67 | ## Performance Optimization 68 | 69 | ### Install MKL 70 | 71 | If you have an Intel CPU, it's recommended to use [MKL](https://en.wikipedia.org/wiki/Math_Kernel_Library) to speed up numerical computations. 72 | Otherwise, the default [OpenBLAS](https://en.wikipedia.org/wiki/OpenBLAS) will get installed when initially installing `hyperhyper`.
73 | 74 | It can be challenging to correctly set up MKL. 75 | A conda package by Intel may help you. 76 | 77 | ```bash 78 | conda install -c intel intelpython3_core 79 | pip install hyperhyper 80 | ``` 81 | 82 | Verify whether `mkl_info` is present in the numpy config: 83 | 84 | ```python 85 | >>> import numpy 86 | >>> numpy.__config__.show() 87 | ``` 88 | 89 | ### Disable Numerical Multithreading 90 | 91 | Further, disable the internal multithreading ability of MKL or OpenBLAS (numerical libraries). 92 | This speeds up computation because you should do multiprocessing on an outer loop anyhow. 93 | But you can also leave the default to take advantage of all cores for your numerical computations. 94 | [Some Tweets on why multithreading with OpenBLAS can cause problems.](https://twitter.com/honnibal/status/1067920534585917440) 95 | 96 | ```bash 97 | export OPENBLAS_NUM_THREADS=1 98 | export MKL_NUM_THREADS=1 99 | ``` 100 | 101 | ## Background 102 | 103 | `hyperhyper` is based on research by Omer Levy et al. from 2015 ([the paper](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/)). 104 | The authors published the code they used in their experiments as [Hyperwords](https://bitbucket.org/omerlevy/hyperwords). 105 | Initially, I [tried](https://github.com/jfilter/hyperwords) to port their original software to Python 3 but I ended up re-writing large parts of it. 106 | So this package was born. 107 | 108 | 109 | ![How pairs are counted](./docs/imgs/window.svg) 110 | 111 | The basic idea: Construct pairs of words that appear together in sentences (within a given window size). 112 | Then do some math magic around matrix operations (PPMI, SVD) to get low-dimensional embeddings. A minimal sketch of this pipeline is included in the appendix at the end of this README. 113 | 114 | The count-based word embeddings produced by `hyperhyper` are deterministic. 115 | So multiple runs of experiments with identical parameters will yield the same results. 116 | Word2vec and others are unstable. 117 | Due to randomness, their results will vary. 118 | 119 | `hyperhyper` is built upon the seminal Python NLP package [gensim](https://radimrehurek.com/gensim/). 120 | 121 | Limitations: With `hyperhyper` you will run into (memory) problems if you need large vocabularies (sets of possible words). 122 | It's fine if you have a vocabulary of up to ~50k words. 123 | Word2vec and fastText, in particular, solve this [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality). 124 | If you're interested in details, you should read the aforementioned excellent [paper by Omer Levy et al.](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/). 125 | 126 | ### Scientific Literature 127 | 128 | This software is based on ideas stemming from the following papers: 129 | 130 | - Improving Distributional Similarity with Lessons Learned from Word Embeddings, Omer Levy, Yoav Goldberg, Ido Dagan, TACL 2015. [Paper](https://aclweb.org/anthology/papers/Q/Q15/Q15-1016/) [Code](https://bitbucket.org/omerlevy/hyperwords) 131 | > Recent trends suggest that neural-network-inspired word embedding models outperform traditional count-based distributional models on word similarity and analogy detection tasks. We reveal that much of the performance gains of word embeddings are due to certain system design choices and hyperparameter optimizations, rather than the embedding algorithms themselves. Furthermore, we show that these modifications can be transferred to traditional distributional models, yielding similar gains.
In contrast to prior reports, we observe mostly local or insignificant performance differences between the methods, with no global advantage to any single approach over the others. 132 | - The Influence of Down-Sampling Strategies on SVD Word Embedding Stability, Johannes Hellrich, Bernd Kampe, Udo Hahn, NAACL 2019. [Paper](https://aclweb.org/anthology/papers/W/W19/W19-2003/) [Code](https://github.com/hellrich/hyperwords) [Code](https://github.com/hellrich/embedding_downsampling_comparison) 133 | > The stability of word embedding algorithms, i.e., the consistency of the word representations they reveal when trained repeatedly on the same data set, has recently raised concerns. We here compare word embedding algorithms on three corpora of different sizes, and evaluate both their stability and accuracy. We find strong evidence that down-sampling strategies (used as part of their training procedures) are particularly influential for the stability of SVD-PPMI-type embeddings. This finding seems to explain diverging reports on their stability and lead us to a simple modification which provides superior stability as well as accuracy on par with skip-gram embedding 134 | 135 | ## Development 136 | 137 | Install and use [poetry](https://python-poetry.org/). 138 | 139 | ## Contributing 140 | 141 | If you have a **question**, found a **bug** or want to propose a new **feature**, have a look at the [issues page](https://github.com/jfilter/hyperhyper/issues). 142 | 143 | **Pull requests** are especially welcomed when they fix bugs or improve the code quality. 144 | 145 | ## Future Work / TODO 146 | 147 | - evaluation for analogies 148 | - implement counting in a more efficient programming language, e.g. Cython. 149 | 150 | ## `hyperhyper`? 151 | 152 | [![Scooter – Hyper Hyper (Song)](https://img.youtube.com/vi/7Twnmhe948A/0.jpg)](https://www.youtube.com/watch?v=7Twnmhe948A "Scooter – Hyper Hyper") 153 | 154 | ## Acknowledgments 155 | 156 | Building upon the work by Omer Levy et al. for [Hyperwords](https://bitbucket.org/omerlevy/hyperwords). 157 | 158 | ## License 159 | 160 | BSD-2-Clause 161 | 162 | ## Sponsoring 163 | 164 | This work was created as part of a [project](https://github.com/jfilter/ptf) that was funded by the German [Federal Ministry of Education and Research](https://www.bmbf.de/en/index.html). 
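## Appendix: PPMI + SVD in a Nutshell

The following toy example illustrates the pipeline described in the Background section (pair counting, PPMI, truncated SVD). It is a simplified sketch with a made-up three-sentence corpus, not the `hyperhyper` implementation: the package works on sparse matrices and adds refinements such as subsampling of frequent words and context-distribution smoothing (`cds`).

```python
import numpy as np

sentences = [
    "the cat sat on the mat".split(),
    "the dog sat on the rug".split(),
    "a cat and a dog played".split(),
]

# 1. build a vocabulary
vocab = sorted({w for s in sentences for w in s})
index = {w: i for i, w in enumerate(vocab)}

# 2. count co-occurring pairs within a symmetric window
window = 2
counts = np.zeros((len(vocab), len(vocab)))
for sent in sentences:
    for i, word in enumerate(sent):
        lo, hi = max(0, i - window), min(len(sent), i + window + 1)
        for j in range(lo, hi):
            if j != i:
                counts[index[word], index[sent[j]]] += 1

# 3. positive pointwise mutual information (PPMI)
total = counts.sum()
p_word = counts.sum(axis=1) / total  # marginal probability of the word
p_ctx = counts.sum(axis=0) / total   # marginal probability of the context
with np.errstate(divide="ignore", invalid="ignore"):
    pmi = np.log((counts / total) / np.outer(p_word, p_ctx))
ppmi = np.where(np.isfinite(pmi) & (pmi > 0), pmi, 0.0)

# 4. truncated SVD yields dense, low-dimensional word vectors
dim = 2
u, s, _ = np.linalg.svd(ppmi)
vectors = u[:, :dim] * s[:dim]

for word, vec in zip(vocab, np.round(vectors, 2)):
    print(word, vec)
```

`hyperhyper` does essentially this at scale and additionally evaluates the resulting vectors on the bundled word-similarity datasets.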
165 | 166 | 167 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/analogy/opposite.txt: -------------------------------------------------------------------------------- 1 | Frage Antwort stark schwach 2 | Frage Antwort viel wenig 3 | Frage Antwort positiv negativ 4 | Frage Antwort rechts links 5 | Frage Antwort nah fern 6 | Frage Antwort männlich weiblich 7 | Frage Antwort warm kalt 8 | Frage Antwort rechts links 9 | Frage Antwort schnell langsam 10 | Frage Antwort Junge Mädchen 11 | Junge Mädchen Sommer Winter 12 | Junge Mädchen viel wenig 13 | Junge Mädchen Frage Antwort 14 | Junge Mädchen Mann Frau 15 | Junge Mädchen männlich weiblich 16 | Junge Mädchen männlich weiblich 17 | Junge Mädchen alt jung 18 | Junge Mädchen hell dunkel 19 | Junge Mädchen oben unten 20 | Junge Mädchen voll leer 21 | Leben Tod hoch tief 22 | Leben Tod davor danach 23 | Leben Tod Norden Süden 24 | Leben Tod bekannt unbekannt 25 | Leben Tod rechts links 26 | Leben Tod groß klein 27 | Leben Tod warm kalt 28 | Leben Tod männlich weiblich 29 | Leben Tod Osten Westen 30 | Leben Tod Sommer Winter 31 | Mann Frau groß klein 32 | Mann Frau schnell langsam 33 | Mann Frau oben unten 34 | Mann Frau lang kurz 35 | Mann Frau oben unten 36 | Mann Frau männlich weiblich 37 | Mann Frau warm kalt 38 | Mann Frau Osten Westen 39 | Mann Frau bekannt unbekannt 40 | Mann Frau hell dunkel 41 | Norden Süden hell dunkel 42 | Norden Süden gewinnen verlieren 43 | Norden Süden groß klein 44 | Norden Süden oben unten 45 | Norden Süden Osten Westen 46 | Norden Süden lang kurz 47 | Norden Süden viel wenig 48 | Norden Süden positiv negativ 49 | Norden Süden Mann Frau 50 | Norden Süden stark schwach 51 | Osten Westen leicht schwer 52 | Osten Westen hoch tief 53 | Osten Westen positiv negativ 54 | Osten Westen lachen weinen 55 | Osten Westen lachen weinen 56 | Osten Westen hoch tief 57 | Osten Westen nah fern 58 | Osten Westen bekannt unbekannt 59 | Osten Westen hell dunkel 60 | Osten Westen gewinnen verlieren 61 | Sommer Winter Junge Mädchen 62 | Sommer Winter warm kalt 63 | Sommer Winter positiv negativ 64 | Sommer Winter leicht schwer 65 | Sommer Winter davor danach 66 | Sommer Winter bekannt unbekannt 67 | Sommer Winter Mann Frau 68 | Sommer Winter warm kalt 69 | Sommer Winter gewinnen verlieren 70 | Sommer Winter lachen weinen 71 | Start Ziel Frage Antwort 72 | Start Ziel viel wenig 73 | Start Ziel Frage Antwort 74 | Start Ziel groß klein 75 | Start Ziel Junge Mädchen 76 | Start Ziel stark schwach 77 | Start Ziel lachen weinen 78 | Start Ziel oben unten 79 | Start Ziel oben unten 80 | Start Ziel viel wenig 81 | Tag Nacht Junge Mädchen 82 | Tag Nacht hell dunkel 83 | Tag Nacht positiv negativ 84 | Tag Nacht Start Ziel 85 | Tag Nacht Start Ziel 86 | Tag Nacht oben unten 87 | Tag Nacht männlich weiblich 88 | Tag Nacht Leben Tod 89 | Tag Nacht männlich weiblich 90 | Tag Nacht Osten Westen 91 | alt jung leicht schwer 92 | alt jung früh spät 93 | alt jung bekannt unbekannt 94 | alt jung rechts links 95 | alt jung Osten Westen 96 | alt jung nah fern 97 | alt jung Norden Süden 98 | alt jung Tag Nacht 99 | alt jung Junge Mädchen 100 | alt jung bekannt unbekannt 101 | bekannt unbekannt leicht schwer 102 | bekannt unbekannt rechts links 103 | bekannt unbekannt Osten Westen 104 | bekannt unbekannt alt jung 105 | bekannt unbekannt schnell langsam 106 | bekannt unbekannt Leben Tod 107 | bekannt unbekannt viel wenig 108 | bekannt unbekannt Mann Frau 109 | bekannt unbekannt lachen weinen 110 
| bekannt unbekannt früh spät 111 | davor danach Start Ziel 112 | davor danach positiv negativ 113 | davor danach Osten Westen 114 | davor danach Norden Süden 115 | davor danach groß klein 116 | davor danach groß klein 117 | davor danach positiv negativ 118 | davor danach voll leer 119 | davor danach groß klein 120 | davor danach warm kalt 121 | früh spät stark schwach 122 | früh spät stark schwach 123 | früh spät positiv negativ 124 | früh spät schnell langsam 125 | früh spät Osten Westen 126 | früh spät Start Ziel 127 | früh spät Frage Antwort 128 | früh spät bekannt unbekannt 129 | früh spät Junge Mädchen 130 | früh spät Norden Süden 131 | groß klein Mann Frau 132 | groß klein männlich weiblich 133 | groß klein stark schwach 134 | groß klein Sommer Winter 135 | groß klein nah fern 136 | groß klein lang kurz 137 | groß klein viel wenig 138 | groß klein oben unten 139 | groß klein Leben Tod 140 | groß klein männlich weiblich 141 | hell dunkel Mann Frau 142 | hell dunkel Osten Westen 143 | hell dunkel früh spät 144 | hell dunkel alt jung 145 | hell dunkel früh spät 146 | hell dunkel männlich weiblich 147 | hell dunkel früh spät 148 | hell dunkel lachen weinen 149 | hell dunkel voll leer 150 | hell dunkel hoch tief 151 | hoch tief groß klein 152 | hoch tief warm kalt 153 | hoch tief lachen weinen 154 | hoch tief alt jung 155 | hoch tief warm kalt 156 | hoch tief bekannt unbekannt 157 | hoch tief Leben Tod 158 | hoch tief schnell langsam 159 | hoch tief rechts links 160 | hoch tief leicht schwer 161 | lang kurz gewinnen verlieren 162 | lang kurz warm kalt 163 | lang kurz Sommer Winter 164 | lang kurz Norden Süden 165 | lang kurz Junge Mädchen 166 | lang kurz Tag Nacht 167 | lang kurz bekannt unbekannt 168 | lang kurz voll leer 169 | lang kurz gewinnen verlieren 170 | lang kurz hell dunkel 171 | leicht schwer Sommer Winter 172 | leicht schwer warm kalt 173 | leicht schwer hell dunkel 174 | leicht schwer Junge Mädchen 175 | leicht schwer Mann Frau 176 | leicht schwer Leben Tod 177 | leicht schwer früh spät 178 | leicht schwer oben unten 179 | leicht schwer lachen weinen 180 | leicht schwer Start Ziel 181 | männlich weiblich stark schwach 182 | männlich weiblich Tag Nacht 183 | männlich weiblich bekannt unbekannt 184 | männlich weiblich lang kurz 185 | männlich weiblich hoch tief 186 | männlich weiblich nah fern 187 | männlich weiblich rechts links 188 | männlich weiblich Mann Frau 189 | männlich weiblich Start Ziel 190 | männlich weiblich schnell langsam 191 | nah fern hoch tief 192 | nah fern Frage Antwort 193 | nah fern bekannt unbekannt 194 | nah fern leicht schwer 195 | nah fern hoch tief 196 | nah fern hoch tief 197 | nah fern bekannt unbekannt 198 | nah fern Junge Mädchen 199 | nah fern bekannt unbekannt 200 | nah fern leicht schwer 201 | oben unten Sommer Winter 202 | oben unten voll leer 203 | oben unten davor danach 204 | oben unten lang kurz 205 | oben unten gewinnen verlieren 206 | oben unten nah fern 207 | oben unten lachen weinen 208 | oben unten Start Ziel 209 | oben unten hoch tief 210 | oben unten nah fern 211 | positiv negativ Junge Mädchen 212 | positiv negativ Leben Tod 213 | positiv negativ Junge Mädchen 214 | positiv negativ warm kalt 215 | positiv negativ leicht schwer 216 | positiv negativ hoch tief 217 | positiv negativ früh spät 218 | positiv negativ männlich weiblich 219 | positiv negativ viel wenig 220 | positiv negativ Leben Tod 221 | rechts links Frage Antwort 222 | rechts links Mann Frau 223 | rechts links hoch tief 224 | rechts links alt jung 225 | rechts links 
positiv negativ 226 | rechts links früh spät 227 | rechts links Start Ziel 228 | rechts links oben unten 229 | rechts links Junge Mädchen 230 | rechts links lachen weinen 231 | schnell langsam voll leer 232 | schnell langsam hoch tief 233 | schnell langsam gewinnen verlieren 234 | schnell langsam hoch tief 235 | schnell langsam davor danach 236 | schnell langsam hell dunkel 237 | schnell langsam davor danach 238 | schnell langsam männlich weiblich 239 | schnell langsam stark schwach 240 | schnell langsam viel wenig 241 | stark schwach rechts links 242 | stark schwach viel wenig 243 | stark schwach Norden Süden 244 | stark schwach Sommer Winter 245 | stark schwach hoch tief 246 | stark schwach voll leer 247 | stark schwach Sommer Winter 248 | stark schwach voll leer 249 | stark schwach nah fern 250 | stark schwach hell dunkel 251 | viel wenig hoch tief 252 | viel wenig Norden Süden 253 | viel wenig Norden Süden 254 | viel wenig schnell langsam 255 | viel wenig hell dunkel 256 | viel wenig bekannt unbekannt 257 | viel wenig früh spät 258 | viel wenig Osten Westen 259 | viel wenig hell dunkel 260 | viel wenig Tag Nacht 261 | voll leer positiv negativ 262 | voll leer nah fern 263 | voll leer rechts links 264 | voll leer groß klein 265 | voll leer Norden Süden 266 | voll leer männlich weiblich 267 | voll leer hoch tief 268 | voll leer nah fern 269 | voll leer männlich weiblich 270 | voll leer gewinnen verlieren 271 | warm kalt hoch tief 272 | warm kalt Junge Mädchen 273 | warm kalt lachen weinen 274 | warm kalt viel wenig 275 | warm kalt rechts links 276 | warm kalt hoch tief 277 | warm kalt Frage Antwort 278 | warm kalt davor danach 279 | warm kalt davor danach 280 | warm kalt positiv negativ 281 | gewinnen verlieren hoch tief 282 | gewinnen verlieren lang kurz 283 | gewinnen verlieren hoch tief 284 | gewinnen verlieren hell dunkel 285 | gewinnen verlieren Tag Nacht 286 | gewinnen verlieren schnell langsam 287 | gewinnen verlieren voll leer 288 | gewinnen verlieren lang kurz 289 | gewinnen verlieren alt jung 290 | gewinnen verlieren alt jung 291 | lachen weinen viel wenig 292 | lachen weinen oben unten 293 | lachen weinen alt jung 294 | lachen weinen stark schwach 295 | lachen weinen oben unten 296 | lachen weinen davor danach 297 | lachen weinen Sommer Winter 298 | lachen weinen alt jung 299 | lachen weinen männlich weiblich 300 | lachen weinen männlich weiblich 301 | -------------------------------------------------------------------------------- /hyperhyper/bunch.py: -------------------------------------------------------------------------------- 1 | """ 2 | The heart of the package. This combines all the function and also exposes 3 | the funtionality to the user. The `bunch` is the location where all the 4 | resulting files are stored. 5 | """ 6 | 7 | import logging 8 | from pathlib import Path 9 | from timeit import default_timer as timer 10 | 11 | import dataset 12 | import numpy as np 13 | from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors 14 | 15 | from . 
import evaluation, pair_counts, pmi, svd 16 | from .corpus import Corpus 17 | from .experiment import record, results_from_db 18 | from .utils import (delete_folder, load_arrays, load_matrix, save_arrays, 19 | save_matrix) 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class Bunch: 25 | def __init__( 26 | self, path, corpus=None, force_overwrite=False, text_chunk_size=100000 27 | ): 28 | self.db = None 29 | self.path = Path(path) 30 | 31 | if force_overwrite and self.path.exists(): 32 | delete_folder(self.path) 33 | 34 | if not corpus is None and not force_overwrite: 35 | if Path(self.path / "corpus.pkl").is_file(): 36 | raise ValueError( 37 | "There is already another corpus file saved. Set `force_overwrite` to True if you want to override it." 38 | ) 39 | 40 | if corpus is None: 41 | self.corpus = Corpus.load(str(self.path / "corpus.pkl")) 42 | else: 43 | self.path.mkdir(parents=True, exist_ok=True) 44 | self.corpus = corpus 45 | self.corpus.texts_to_file(self.path / "texts", text_chunk_size) 46 | self.corpus.save(str(self.path / "corpus.pkl")) 47 | 48 | def get_db(self): 49 | """ 50 | Connecting to a SQLite database. 51 | """ 52 | if self.db is None: 53 | self.db = dataset.connect(f"sqlite:///{self.path}/results.db") 54 | return self.db 55 | 56 | def dict_to_path(self, folder, dict): 57 | """ 58 | Return a file path for an embedding based on parameters. 59 | """ 60 | 61 | # cast integer floats to ints 62 | for k, v in dict.items(): 63 | if type(v) is float: 64 | if v.is_integer(): 65 | dict[k] = int(v) 66 | 67 | filenames = [f"{k}_{v}".lower() for k, v in dict.items()] 68 | filename = "_".join(sorted(filenames)) 69 | if len(filename) == 0: 70 | filename = "default" 71 | 72 | filename += ".npz" 73 | full_path = self.path / folder / filename 74 | return full_path 75 | 76 | def pair_counts(self, **kwargs): 77 | """ 78 | Count pairs. 79 | """ 80 | pair_path = self.dict_to_path("pair_counts", kwargs) 81 | if pair_path.is_file(): 82 | try: 83 | logger.info("retrieved already saved pair count") 84 | return load_matrix(pair_path) 85 | except Exception as e: 86 | logger.info(f"creating pair counts, error while loading files: {e}") 87 | 88 | print("create new pair counts") 89 | pair_path.parent.mkdir(parents=True, exist_ok=True) 90 | count_matrix = pair_counts.count_pairs(self.corpus, **kwargs) 91 | save_matrix(pair_path, count_matrix) 92 | return count_matrix 93 | 94 | def pmi_matrix(self, cds=0.75, pair_args={}, **kwargs): 95 | """ 96 | Create a PMI matrix. 97 | """ 98 | pmi_path = self.dict_to_path("pmi", {"cds": cds, **pair_args}) 99 | if pmi_path.is_file(): 100 | try: 101 | logger.info("retrieved already saved pmi") 102 | return load_matrix(pmi_path) 103 | except Exception as e: 104 | logger.info(f"creating new pmi, error while loading files: {e}") 105 | 106 | print("create new pmi") 107 | counts = self.pair_counts(**pair_args, **kwargs) 108 | 109 | start = timer() 110 | pmi_matrix = pmi.calc_pmi(counts, cds) 111 | 112 | end = timer() 113 | logger.info("pmi took " + str(round(end - start, 2)) + " seconds") 114 | 115 | pmi_path.parent.mkdir(parents=True, exist_ok=True) 116 | save_matrix(pmi_path, pmi_matrix) 117 | logger.info("matrix saved") 118 | 119 | return pmi_matrix 120 | 121 | @record 122 | def pmi( 123 | self, 124 | neg=1, 125 | cds=0.75, 126 | pair_args={}, 127 | keyed_vectors=False, 128 | evaluate=True, 129 | **kwargs, 130 | ): 131 | """ 132 | Gets the PMI matrix. 
133 | """ 134 | m = self.pmi_matrix(cds, pair_args, **kwargs) 135 | embd = pmi.PPMIEmbedding(m, neg=neg) 136 | if evaluate: 137 | eval_results = self.eval_sim(embd) 138 | if keyed_vectors: 139 | # because of the large dimensions, the matrix will get huge! 140 | return self.to_keyed_vectors(embd.m.todense(), m.shape[0]) 141 | if evaluate: 142 | return embd, eval_results 143 | return embd 144 | 145 | def svd_matrix( 146 | self, impl, impl_args={}, dim=500, neg=1, cds=0.75, pair_args={}, **kwargs 147 | ): 148 | """ 149 | Do the actual SVD computation. 150 | """ 151 | assert impl in ["scipy", "gensim", "scikit", "sparsesvd"] 152 | 153 | svd_path = self.dict_to_path( 154 | "svd", 155 | { 156 | "impl": impl, 157 | **impl_args, 158 | "neg": neg, 159 | "cds": cds, 160 | "dim": dim, 161 | **pair_args, 162 | }, 163 | ) 164 | logger.debug(f"looking up the file: {svd_path}") 165 | if svd_path.is_file(): 166 | try: 167 | logger.info("retrieved already saved svd") 168 | return load_arrays(svd_path) 169 | except Exception as e: 170 | logger.info(f"creating new svd, error while loading files: {e}") 171 | 172 | print("creating new svd") 173 | m = self.pmi_matrix(cds, pair_args, **kwargs) 174 | m = pmi.PPMIEmbedding(m, neg=neg, normalize=False) 175 | 176 | start = timer() 177 | ut, s = svd.calc_svd(m, dim, impl, impl_args) 178 | end = timer() 179 | logger.info("svd took " + str(round((end - start) / 60, 2)) + " minutes") 180 | 181 | svd_path.parent.mkdir(parents=True, exist_ok=True) 182 | save_arrays(svd_path, ut, s) 183 | logger.info("svd arrays saved") 184 | 185 | return ut, s 186 | 187 | @record 188 | def svd( 189 | self, 190 | dim=500, 191 | eig=0, 192 | neg=1, 193 | cds=0.75, 194 | impl="scipy", 195 | impl_args={}, 196 | pair_args={}, 197 | keyed_vectors=False, 198 | evaluate=True, 199 | **kwargs, 200 | ): 201 | """ 202 | Gets and SVD embedding. 203 | """ 204 | ut, s = self.svd_matrix( 205 | impl=impl, 206 | impl_args=impl_args, 207 | dim=dim, 208 | neg=neg, 209 | cds=cds, 210 | pair_args=pair_args, 211 | **kwargs, 212 | ) 213 | embedding = svd.SVDEmbedding(ut, s, eig=eig) 214 | 215 | if evaluate: 216 | eval_results = self.eval_sim(embedding) 217 | if keyed_vectors: 218 | embedding = self.to_keyed_vectors(embedding.m, dim) 219 | if evaluate: 220 | return embedding, eval_results 221 | return embedding 222 | 223 | def to_keyed_vectors(self, embd_matrix, dim, delete_unknown=True): 224 | """ 225 | Transform to gensim's keyed vectors structure for further usage. 226 | https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/keyedvectors.py 227 | """ 228 | vectors = WordEmbeddingsKeyedVectors(vector_size=dim) 229 | tokens = self.corpus.vocab.tokens 230 | if delete_unknown: 231 | # delete last row (for token) 232 | embd_matrix = np.delete(embd_matrix, (-1), axis=0) 233 | else: 234 | # the last token is the UNK token so append it 235 | tokens.append("") 236 | 237 | vectors.add(tokens, embd_matrix) 238 | return vectors 239 | 240 | def eval_sim(self, embd, **kwargs): 241 | """ 242 | Evaluate the performance on word similarity datasets. 243 | NB: The corpus has to be initialized with the correct language. 244 | """ 245 | return evaluation.eval_similarity( 246 | embd, 247 | self.corpus.vocab.token2id, 248 | self.corpus.preproc_fun, 249 | lang=self.corpus.lang, 250 | **kwargs, 251 | ) 252 | 253 | def eval_analogy(self, embd, **kwargs): 254 | """ 255 | Evaluate the performance on word analogies datasets. 256 | NB: The corpus has to be initialized with the correct language. 
257 | """ 258 | return evaluation.eval_analogies( 259 | embd, 260 | self.corpus.vocab.token2id, 261 | self.corpus.preproc_fun, 262 | lang=self.corpus.lang, 263 | **kwargs, 264 | ) 265 | 266 | def results(self, **kwargs): 267 | """ 268 | Retrieve evaluation results from the database. 269 | """ 270 | return results_from_db(self.get_db(), **kwargs) 271 | -------------------------------------------------------------------------------- /hyperhyper/evaluation_datasets/de/ws/gur350.txt: -------------------------------------------------------------------------------- 1 | Absage ablehnen 3.5 2 | Absage Stellenanzeige 1.88 3 | Affe Gepäckkontrolle 0.13 4 | Affe Makake 4 5 | Afrika historisch 1 6 | Agentur Irrtum 0 7 | Airbag Kopfairbag 3.88 8 | analysieren Analyse 3.88 9 | Ansehen Schaden 0.88 10 | Arbeitssuchender Bewerbung 2.75 11 | aufklären erklären 2.5 12 | Aufpreis Grundpreis 3.13 13 | Aufstieg Erfolg 3.25 14 | aufzeichnen schreiben 2.75 15 | Aussage Auftritt 1.38 16 | Aussage Rede 2.38 17 | Aussage sagen 3.38 18 | Aussterben bedrohen 2.13 19 | Auto fahren 3.5 20 | Bayern Bayerisch 4 21 | Bayern Deutschland 3.5 22 | Bayern weißblau 2.75 23 | Beamte Amt 3.63 24 | beginnen dauern 2.38 25 | begründen ausgehen 0.88 26 | Behörde Vorschrift 2.75 27 | beinhalten umfassen 3.25 28 | Benedetto Benedikt 3.63 29 | Benziner Dieselversion 3 30 | Berlin Berlin-Kreuzberg 3.38 31 | Berufstätigkeit Erfolg 2.13 32 | beschleunigen übertreiben 1.13 33 | beschuldigen Mitschuld 2.5 34 | Besucher bekommen 1.38 35 | Bewerbung Job 2.38 36 | Bild ähneln 1.38 37 | Bild Grafik 3.13 38 | Bild Röntgenaufnahme 3 39 | Bild Symbol 2.13 40 | Bild visuell 3 41 | Böse Gott 2 42 | Botschaft sichtbar 0.25 43 | Büro Schreibtisch 3 44 | Demut demütig 4 45 | demütig selbstbewusst 1.88 46 | Design Optik 2.63 47 | Designer Eleganz 2.63 48 | deutsch Deutscher 3.88 49 | Deutscher Bundesbürger 3.5 50 | Deutschland Europa 3.25 51 | Ding Gegenstand 4 52 | Doktorandin Abteilung 1.88 53 | Doktorandin Dissertationsthema 2.63 54 | Drehmoment drehfreudig 1.75 55 | dringend rasch 2.38 56 | Durchsicht sehen 2.75 57 | einfach komplex 2.75 58 | Einkommen Gehaltsunterschied 2 59 | Einrichtung Interior 3.5 60 | Einsamkeit allein 3.5 61 | einsteigen aussteigen 2.75 62 | Eleganz klobig 1.38 63 | Eltern Vater 3.5 64 | entgehen bewundern 0.13 65 | entwickeln Entwicklungschef 2.63 66 | Erfolg erfolgreich 4 67 | Erfolg Maßstab 1.25 68 | erforschen herausfinden 3.13 69 | Erhalt bedroht 1 70 | erkennen sehen 3 71 | erklären begründen 2.5 72 | erklären machen 0.5 73 | ernst ironisch 2 74 | erst Ursprugsort 1.38 75 | Erwachsener Geist 0 76 | Erwachsener Kinder 2.63 77 | erwarten klären 0 78 | fahren Automobil 3 79 | filtern herausfiltern 3.63 80 | filtern selektieren 3.38 81 | finden herausfinden 3 82 | Fisch schwimmen 3.38 83 | Flaschenöffner Küchenwerkzeug 3.63 84 | fokussieren Aufmerksamkeit 2.63 85 | folgen sortieren 0.25 86 | Form Farbe 2.13 87 | formulieren Formulierung 3.88 88 | Formulierung Stiftung 0.13 89 | Forscher Wissenschaftler 3.88 90 | Frage Antwort 3.25 91 | Franzose Deutscher 2.38 92 | Frau Familie 2.75 93 | Frau Mann 3.25 94 | Frühlingssonne kitzeln 1.25 95 | Frust frustrieren 3.88 96 | Frust Leidensgenosse 1.88 97 | Frust Rache 1.88 98 | geben nehmen 3.25 99 | Gefühl Frau 1.75 100 | Gegenwind kritisieren 0.5 101 | Gehege Zoo 2.63 102 | Gehirn Kortex 3.25 103 | Gehirn verstehen 2.13 104 | gemeinsam leben 1 105 | Generation Jugendlicher 2.5 106 | geografisch praktisch 0.13 107 | Gepäckkontrolle Flughafen 3.13 108 | Gepäcknetz Staumöglichkeit 
2.25 109 | Geschirrdurcheinander Menschenleben 0.5 110 | Geschlecht Mann 3 111 | Gewalt Frieden 2.63 112 | Gewalt Kämpfer 2.63 113 | Gewicht Karriere 0.38 114 | Glaube natürlich 0.5 115 | Glück glücklich 3.88 116 | Gorilla Schlange 1.25 117 | großzügig schrumpfen 0.5 118 | gründen Arbeitsgruppe 0.75 119 | Grundlagenforschung verstehen 1.63 120 | Hand Erwachsener 1.38 121 | Hand Mensch 2.75 122 | heimisch Urwaldhaus 1 123 | helfen unterstützen 3.38 124 | herausstreichen öffentlich 0.5 125 | Herkunft Geschlecht 1.38 126 | Hintergrund Fassade 2 127 | Hirn Gehirn 3.88 128 | Hirnsignal Neuronenaktivität 3.5 129 | Hoffnung Resignation 2.75 130 | Honorarbasis bezahlen 3 131 | Hunderttausend Menge 3 132 | Hunger Armut 2.88 133 | Inaugurationsmesse Premiere 2.13 134 | informieren erfahren 2.63 135 | Innenspiegel Auto 3.13 136 | Internetseite herunterladen 3.25 137 | italienisch vergehen 0 138 | Jäger Wald 2.75 139 | Kaffeetasse parallel 0 140 | Kaffeetasse Tasse 3.75 141 | Kamera TV-Kamera 3.75 142 | kämpfen idyllisch 0.13 143 | kämpfen Veterinär 0.38 144 | Karriere hinaufklettern 2 145 | Karriere Risiko 1 146 | Kind Familie 3.38 147 | Kompaktvan Modell 2.5 148 | Kopfairbag Seitenairbag 3.25 149 | Krankheit reißen 0.25 150 | Krebserkennung Röntgenaufnahme 2 151 | kühl hübsch 0.38 152 | Kulturwissenschaft Grafiker 0.63 153 | lachen leben 1.63 154 | lassen prägen 0.25 155 | laufen bleiben 1.25 156 | leben hellen 0.13 157 | leben Tod 3.25 158 | Lebensbedürfnis ansiedeln 0.38 159 | legen Tisch 1.13 160 | lernen gleichzeitig 0 161 | Lied singen 3.38 162 | Linguistik Wissenschaft 3.5 163 | Luft Leben 2.75 164 | Lupe suchen 2 165 | lustig Witz 3.25 166 | machen anfertigen 3.63 167 | machen ausüben 2.5 168 | Macht Reich 2.5 169 | Mai Januar 2.88 170 | Mann Geschäftspartner 1.5 171 | männlich Weiblich 3.13 172 | Marktl Bayern 2.25 173 | Mehrarbeit Workaholic 2 174 | Meinung Überzeugung 3.13 175 | Mercedes Premium-Hersteller 2.63 176 | Minister Außenminister 3.38 177 | Minister Ministerpräsident 3.38 178 | Minister Politiker 3.25 179 | mitteilen Nachricht 3 180 | moderat extra 1.25 181 | modern sportlich 1.25 182 | momentan kommend 1.38 183 | Monate alt 2.25 184 | Montag November 2.38 185 | Motor Hubraum 2.75 186 | nachgehen untersuchen 2.75 187 | Natur künstlich 2.63 188 | Niedersachsen Landesverband 1.63 189 | niederschmetternd positiv 1.63 190 | Objekt Gegenstand 3.88 191 | objektiv subjektiv 3.13 192 | pädagogisch weitläufig 0.5 193 | Papst Kirche 3.38 194 | parallel linear 1.75 195 | Pass Reiseschutzpass 2.75 196 | Petersdom Inaugurationsmesse 2.63 197 | Pinguin baden 1.5 198 | plätschern Wasser 2.88 199 | Platz aufgebläht 0.13 200 | Platz Petersplatz 3.13 201 | Pontifikat Papst 3.38 202 | Post Portokosten 3 203 | Premium-Hersteller Opel 1.63 204 | Premium-Hersteller VW 2 205 | Problem Schwierigkeit 3.25 206 | Projekt Aktion 2 207 | Prozentzeichen Symbol 3.38 208 | Prüfung Zeugnis 2.5 209 | Punktverlust Platz 1.13 210 | Ratzinger Papst 3.38 211 | Relevanz relevant 3.88 212 | riesig üppig 2.63 213 | rot-weiß weißblau 2.75 214 | sachlich Seriosität 2.13 215 | sagen erklären 2.13 216 | sagen mitteilen 3.13 217 | Sandwich-Konzept Sicherheit 0.5 218 | schauen sehen 3.75 219 | Schleusung Betrugshandlung 2.13 220 | schließen Überlegung 0.88 221 | Schrank Küchenschrank 3.38 222 | Schwabe sparen 2.75 223 | Schwabe Stuttgarter 3.38 224 | Seitenansicht A-Säule 0.88 225 | Selbstinszenierung Beziehungsarbeit 0.5 226 | serienmäßig extra 2.13 227 | Sicherheit Frontalkollision 1.63 228 | Sicherheit klobig 0.25 229 | 
Sohn aussteigen 0 230 | Sohn Vater 3.38 231 | Spitze allein 1.13 232 | Spitze hoch 2.25 233 | sportlich Interior 0 234 | sportlich teuer 0.38 235 | stark Gehaltsunterschied 0.13 236 | stark Kämpfer 1.88 237 | Steckdose komplex 0.13 238 | Steckdose Stern 0.13 239 | Stellenangebot sehen 0.38 240 | Stellenangebot Wochenzeitung 2.25 241 | Stellenanzeige Bewerbungsgespräch 2.25 242 | Stellenanzeige rasch 0.5 243 | Stoiber drehfreudig 0.25 244 | Stoiber Ministerpräsident 3.13 245 | Studie Dissertationsthema 1.88 246 | Studie Ergebnis 2.75 247 | Studierende Abteilung 1.63 248 | Studierende Note 2.38 249 | Studium arbeiten 2.63 250 | Studium Beruf 3 251 | Studium Deutscher 0.25 252 | Studium Europa 0.5 253 | Studium Gegenstand 0.88 254 | Studium studieren 4 255 | suchen Bundesbürger 0 256 | suchen finden 3 257 | Suchmaschinenbetreiber Eleganz 0.25 258 | Suchmaschinenbetreiber Linkstatistik 1.75 259 | Suchstrategie Optik 0.25 260 | Suchstrategie suchen 3.5 261 | summieren selbstbewusst 0.13 262 | summieren teuer 0.88 263 | Tag demütig 0.25 264 | Tag Donnerstag 3.38 265 | Tag Leben 1.5 266 | Tag Schreibtisch 0 267 | Tag sichtbar 0.63 268 | Tag Stunde 2.75 269 | Tastatur Gott 0 270 | Tastatur Suche 0.63 271 | Tätigkeit Arbeit 3 272 | Tätigkeit visuell 0.13 273 | teuer kostspielig 3.88 274 | teuer Symbol 0.25 275 | Tier Natur 2.63 276 | Tier Röntgenaufnahme 0.25 277 | Tierpark Giraffe 3 278 | Tierpark Grafik 0.5 279 | Tod ähneln 0 280 | Tod Beerdigung 3.25 281 | Topmanagement Job 2.5 282 | Topmanagement Unternehmen 2.75 283 | Traurigkeit bekommen 0.13 284 | Traurigkeit Heimgang 1.13 285 | überzeugen Mitschuld 0.5 286 | überzeugen zeigen 1.5 287 | Überzeugung übertreiben 0.63 288 | Überzeugung Zweifel 2.63 289 | Umfrage Erfolg 0.13 290 | Umfrage Quartalsumfrage 2.88 291 | umklappen Berlin-Kreuzberg 0 292 | umklappen flachlegen 1.63 293 | Unternehmen Dieselversion 0 294 | Unternehmen Firma 3.63 295 | untersuchen Benedikt 0 296 | untersuchen suchen 2.5 297 | Untersuchungsausschuss aussagen 1.88 298 | Untersuchungsausschuss umfassen 0.38 299 | Van Sports-Tourer 2.38 300 | Van Vorschrift 0.25 301 | Vatikan ausgehen 0.13 302 | Vatikan Katholik 3.25 303 | veranstalten betreuen 1.38 304 | veranstalten dauern 0.75 305 | verantwortlich Amt 2.25 306 | verantwortlich zuständig 3.63 307 | vergangen damalig 3.25 308 | vergangen weißblau 0 309 | Vergangenheit alte 2 310 | Vergangenheit Deutschland 1 311 | verhindert Bayerisch 0 312 | verhindert Beihilfe 0.75 313 | verkaufen bezahlen 2.5 314 | verkaufen fahren 0.13 315 | Vernehmung bedrohen 0.75 316 | Vernehmung vernommen 3.63 317 | versäumen sagen 0.13 318 | versäumen überprüfen 0.13 319 | verschicken Post 3 320 | verschicken Rede 0.25 321 | versichern Auftritt 0.13 322 | versichern bedauern 0.5 323 | viel groß 2 324 | viel schreiben 0.38 325 | Volierenzelt Erfolg 0 326 | Volierenzelt Käfig 2.38 327 | vorankommen Entwicklung 2.5 328 | vorankommen Grundpreis 0.25 329 | weit Bewerbung 0 330 | weit erklären 0.25 331 | weit nahe 3.13 332 | weit wegrennen 1.5 333 | Welle Schaden 1 334 | Welle Surfer 3.13 335 | Widerspruch Analyse 1.13 336 | Widerspruch Gebiet 0 337 | Wien deutschsprachig 3 338 | Wien Kopfairbag 0 339 | Wirtschaftsprofessor Irrtum 0.38 340 | Wirtschaftsprofessor Professor 3.63 341 | Wirtschaftsuniversität Abteilung 1.75 342 | Wirtschaftsuniversität historisch 0.63 343 | Witz Gepäckkontrolle 0.25 344 | Witz Joke 4 345 | Witz Kopf 1.13 346 | Witz Makake 0.13 347 | Zebra Stellenanzeige 0 348 | Zebra Tier 3.25 349 | Zielstrebigkeit ablehnen 0.25 350 | 
Zielstrebigkeit Erfolg 2.63 351 | -------------------------------------------------------------------------------- /hyperhyper/pair_counts.py: -------------------------------------------------------------------------------- 1 | """ 2 | construct a co-occurrence matrix by counting word pairs (co-locations of words) 3 | """ 4 | 5 | import logging 6 | import os 7 | import random 8 | from collections import defaultdict 9 | from concurrent import futures 10 | from math import ceil, e, fabs, sqrt 11 | 12 | import numpy as np 13 | from scipy.sparse import coo_matrix, csr_matrix, lil_matrix 14 | from tqdm import tqdm 15 | 16 | from .utils import read_pickle 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def decay(distance, rate): 22 | """ 23 | simple exponential decay 24 | """ 25 | distance -= 1 # the returned value is 1 when the distance is 1 26 | return e ** -(rate * distance) 27 | 28 | 29 | def to_count_matrix(pair_counts, vocab_size): 30 | """ 31 | transforms the counts into a sparse matrix 32 | """ 33 | cols = [] 34 | rows = [] 35 | data = [] 36 | for k, v in pair_counts.items(): 37 | rows.append(k[0]) 38 | cols.append(k[1]) 39 | data.append(v) 40 | # setting to float is important, +1 for UNK 41 | # COO matrix is the fastest for constructing the matrix since we have all 42 | # the data already 43 | count_matrix = coo_matrix( 44 | (data, (rows, cols)), shape=(vocab_size + 1, vocab_size + 1), dtype=np.float32 45 | ) 46 | # CSR matrices support more arithmetic operations and are more efficient 47 | return count_matrix.tocsr() 48 | 49 | 50 | def count_pairs_parallel(texts_paths, count_pairs_closure, low_memory): 51 | """ 52 | count pairs in parallel by loading and processing files to keep memory 53 | consumption low 54 | """ 55 | # Ensure that memory is freed when a job completes. 
56 | res = None 57 | with futures.ProcessPoolExecutor() as executor: 58 | # A dictionary which will contain a list the future info in the key, and the filename in the value 59 | jobs = {} 60 | files_left = len(texts_paths) 61 | files_iter = iter(texts_paths) 62 | 63 | if low_memory: 64 | MAX_JOBS_IN_QUEUE = os.cpu_count() 65 | else: 66 | MAX_JOBS_IN_QUEUE = os.cpu_count() * 2 # heuristic ;) 67 | 68 | with tqdm(total=len(texts_paths), desc="generating pairs") as pbar: 69 | while files_left: 70 | for this_file in files_iter: 71 | job = executor.submit(count_pairs_closure, this_file) 72 | jobs[job] = this_file 73 | if len(jobs) > MAX_JOBS_IN_QUEUE: 74 | break # limit the job submission for now job 75 | 76 | # Get the completed jobs whenever they are done 77 | for job in futures.as_completed(jobs): 78 | files_left -= 1 79 | pbar.update(1) 80 | m = job.result() 81 | if res is None: 82 | res = m 83 | else: 84 | res += m 85 | 86 | del jobs[job] 87 | return res 88 | 89 | 90 | class CountPairsClosure(object): 91 | """ 92 | creating a closure, has to be an object to be pickle-able when doing 93 | multiprocessing 94 | """ 95 | 96 | def __init__(self, **kwargs): 97 | self.__dict__.update(kwargs) 98 | 99 | def __call__(self, text_path): 100 | texts = read_pickle(text_path) 101 | counter = defaultdict(int) 102 | for t in texts: 103 | for pair in iterate_tokens( 104 | t, 105 | self.window, 106 | self.dynamic_window_prob, 107 | self.dynamic_window_deter, 108 | self.dynamic_window_decay, 109 | self.delete_oov, 110 | self.subsampler_prob, 111 | self.vocab_size, # id 112 | ): 113 | counter[pair[0], pair[1]] += pair[2] 114 | return to_count_matrix(counter, self.vocab_size) 115 | 116 | 117 | def iterate_tokens( 118 | tokens, 119 | window, 120 | dynamic_window_prob, 121 | dynamic_window_deter, 122 | dynamic_window_decay, 123 | delete_oov, 124 | subsampler_prob, 125 | unkown_id, 126 | ): 127 | """ 128 | iterate over tokens in a sentence and counting pairs 129 | """ 130 | if delete_oov: 131 | tokens = [t for t in tokens if t != unkown_id] 132 | 133 | if not subsampler_prob is None: 134 | tokens = [ 135 | t 136 | if t not in subsampler_prob or random.random() <= subsampler_prob[t] 137 | else None 138 | for t in tokens 139 | ] 140 | 141 | len_tokens = len(tokens) 142 | res = [] 143 | for i, tok in enumerate(tokens): 144 | if tok is not None: 145 | if dynamic_window_prob: 146 | offset = random.randint(1, window) 147 | else: 148 | offset = window 149 | start = i - offset 150 | if start < 0: 151 | start = 0 152 | end = i + offset + 1 153 | if end > len_tokens: 154 | end = len_tokens 155 | for j in range(start, end): 156 | if j != i and tokens[j] is not None: 157 | count = 1 158 | # the variations are exclusive 159 | if dynamic_window_deter: 160 | distance = fabs(i - j) 161 | count = (window + 1 - distance) / window 162 | if not dynamic_window_decay is None: 163 | distance = fabs(i - j) 164 | count = decay(distance, dynamic_window_decay) 165 | res.append((tok, tokens[j], count)) 166 | return res 167 | 168 | 169 | # storing the default values here again to re-use them when writing to the db 170 | # TODO: implement in a more elegant way 171 | default_pair_args = { 172 | "window": 2, 173 | "dynamic_window": "deter", 174 | "decay_rate": 0.25, 175 | "delete_oov": True, 176 | "subsample": "deter", 177 | "subsample_factor": 1e-5, 178 | } 179 | 180 | 181 | def count_pairs( 182 | corpus, 183 | window=2, 184 | dynamic_window="deter", 185 | decay_rate=0.25, 186 | delete_oov=True, 187 | subsample="deter", 188 | 
subsample_factor=1e-5, 189 | seed=1312, 190 | low_memory=False, 191 | low_memory_chunk=100, 192 | min_count=0, 193 | ): 194 | """ 195 | counting pairs in a corpus 196 | 197 | TODO: instead of giving a subsample_factor, give a portion of tokens to apply subsample 198 | """ 199 | for x in [dynamic_window, subsample]: 200 | if not x is None and not x == False: 201 | assert x in ("deter", "prob", "off", "decay") 202 | 203 | random.seed(seed) 204 | 205 | subsampler_prob = None 206 | if subsample == "prob": 207 | subsampler_prob = subsample_factor * corpus.size 208 | subsampler_prob = { 209 | word: 1 - sqrt(subsampler_prob / count) 210 | for word, count in corpus.counts.items() 211 | if count > subsampler_prob 212 | } 213 | 214 | count_matrix = count_pairs_parallel( 215 | corpus.texts, 216 | CountPairsClosure( 217 | window=window, 218 | dynamic_window_prob=dynamic_window == "prob", 219 | dynamic_window_deter=dynamic_window == "deter", 220 | dynamic_window_decay=decay_rate if dynamic_window == "decay" else None, 221 | delete_oov=delete_oov, 222 | subsampler_prob=subsampler_prob, 223 | vocab_size=corpus.vocab.size, 224 | ), 225 | low_memory=low_memory, 226 | ) 227 | 228 | # already prunning with a `min_count` of 1 can greatly reduces memory usage 229 | logger.info(f"Sparseness rate: {count_matrix.nnz / (corpus.vocab.size ** 2)}") 230 | if not min_count is None and min_count > 0: 231 | count_matrix.data *= count_matrix.data >= min_count 232 | count_matrix.eliminate_zeros() 233 | logger.info( 234 | f"Sparseness rate after pruning: {count_matrix.nnz / (corpus.vocab.size ** 2)}" 235 | ) 236 | 237 | # down sample in a deterministic way 238 | if subsample == "deter": 239 | # construct array with appropriate factor 240 | logger.info("creating array for the subsampling") 241 | subsample_value = subsample_factor * corpus.size 242 | subsampler = np.ones(corpus.vocab.size + 1, dtype=np.float32) 243 | num_sub = 0 244 | for word, count in corpus.counts.items(): 245 | if count > subsample_value: 246 | subsampler[word] = sqrt(subsample_value / count) 247 | num_sub += 1 248 | print(f"subsampling applied to {num_sub / corpus.vocab.size} of the tokens") 249 | 250 | if low_memory: 251 | # iterate over all rows in blocks 252 | count_matrix = lil_matrix(count_matrix) 253 | for i in tqdm(range(ceil((corpus.vocab.size + 1) / low_memory_chunk))): 254 | count_matrix[ 255 | i * low_memory_chunk : (i + 1) * low_memory_chunk, 256 | ] = count_matrix[ 257 | i * low_memory_chunk : (i + 1) * low_memory_chunk, 258 | ].multiply( 259 | subsampler[i * low_memory_chunk : (i + 1) * low_memory_chunk] 260 | .reshape((-1, 1)) 261 | .dot(subsampler.reshape(1, -1)) 262 | ) 263 | else: 264 | logger.info("creating subsampler matrix") 265 | # to 2d matrix 266 | subsampler = subsampler.reshape((-1, 1)).dot(subsampler.reshape(1, -1)) 267 | logger.info("multiply elementwise: start") 268 | # elementwise muplication of 2 matrices 269 | count_matrix = count_matrix.multiply(subsampler) 270 | logger.info("multiply elementwise: done") 271 | # in both cases: transform to csr matrix 272 | count_matrix = csr_matrix(count_matrix) 273 | return count_matrix 274 | -------------------------------------------------------------------------------- /hyperhyper/corpus.py: -------------------------------------------------------------------------------- 1 | """ 2 | represent a collection of texts 3 | """ 4 | 5 | import logging 6 | import os 7 | import random 8 | from array import array 9 | from collections import defaultdict 10 | from concurrent import futures 11 | 
from pathlib import Path 12 | 13 | from gensim.corpora import Dictionary 14 | from gensim.utils import SaveLoad 15 | from tqdm import tqdm 16 | 17 | from .preprocessing import (texts_to_sents, tokenize_texts, 18 | tokenize_texts_parallel) 19 | from .utils import chunks, dsum, read_pickle, to_pickle 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class Vocab(Dictionary): 25 | """ 26 | Holds mapping for the integer ids to the tokens (words). 27 | """ 28 | 29 | def __init__(self, texts=None, **kwargs): 30 | super().__init__(texts) 31 | if not texts is None: 32 | self.filter(**kwargs) 33 | 34 | def filter(self, no_below=0, no_above=1, keep_n=50000, keep_tokens=None): 35 | """ 36 | Filter extremes with sane defaults. 37 | """ 38 | self.filter_extremes( 39 | no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=keep_tokens 40 | ) 41 | 42 | @property 43 | def size(self): 44 | return len(self.token2id) 45 | 46 | @property 47 | def tokens(self): 48 | """ 49 | Return tokens as array (in order of id). 50 | """ 51 | return [tup[0] for tup in sorted(self.token2id.items(), key=lambda x: x[1])] 52 | 53 | 54 | class TransformToIndicesClosure(object): 55 | """ 56 | A closure that is pickable, usefull for multiprocessing. 57 | is the last ID (thus vocab_size) 58 | for sizes: https://docs.python.org/3/library/array.html 59 | """ 60 | 61 | def __init__(self, c): 62 | self.vocab_size = c.vocab.size 63 | self.d = c.vocab.doc2idx 64 | if self.vocab_size <= 65535: 65 | self.size = "H" 66 | else: 67 | self.size = "L" 68 | 69 | def __call__(self, texts): 70 | return array(self.size, self.d(texts, self.vocab_size)) 71 | 72 | 73 | def count_tokens(texts): 74 | """ 75 | Count token frequencies since gensim's dictionary only provides document frequencies. 
76 | """ 77 | counts = defaultdict(int) 78 | for text in texts: 79 | for token in text: 80 | counts[token] += 1 81 | return counts 82 | 83 | 84 | def _texts_to_ids(args): 85 | f, to_indices = args[0], args[1] 86 | texts = read_pickle(f) 87 | transformed = [to_indices(t) for t in texts] 88 | to_pickle(transformed, f) 89 | counts = count_tokens(transformed) 90 | return len(transformed), counts 91 | 92 | 93 | def texts_to_ids(input_text_fns, to_indices): 94 | """ 95 | transform the raw texts to integer ids 96 | """ 97 | total_len = 0 98 | all_counts = [] 99 | with futures.ProcessPoolExecutor() as executor: 100 | # A dictionary which will contain a list the future info in the key, and the filename in the value 101 | jobs = {} 102 | files_left = len(input_text_fns) 103 | files_iter = iter(input_text_fns) 104 | MAX_JOBS_IN_QUEUE = os.cpu_count() * 2 105 | 106 | with tqdm(total=len(input_text_fns), desc="texts to ids") as pbar: 107 | while files_left: 108 | for this_file in files_iter: 109 | job = executor.submit(_texts_to_ids, [this_file, to_indices]) 110 | jobs[job] = this_file 111 | if len(jobs) > MAX_JOBS_IN_QUEUE: 112 | break # limit the job submission for now job 113 | 114 | # Get the completed jobs whenever they are done 115 | for job in futures.as_completed(jobs): 116 | files_left -= 1 117 | pbar.update(1) 118 | num_sents, counts = job.result() 119 | all_counts.append(counts) 120 | total_len += num_sents 121 | del jobs[job] 122 | 123 | return total_len, dsum(*all_counts) 124 | 125 | 126 | def _build_vocab_from_file(args): 127 | f, preproc_func, view_fraction = args[0], args[1], args[2] 128 | 129 | texts = f.read_text().split("\n") 130 | texts = preproc_func(texts) 131 | 132 | # temporary save processed files to continue working later 133 | to_pickle(texts, f.with_suffix(".pkl")) 134 | 135 | # skip at random 136 | if 0.999 > view_fraction < random.random(): 137 | return Vocab() 138 | return Vocab(texts) 139 | 140 | 141 | class Corpus(SaveLoad): 142 | """ 143 | An object to hold text. 144 | """ 145 | 146 | def __init__(self, vocab, preproc_fun, texts=None, input_text_fns=None, lang="en"): 147 | self.vocab = vocab 148 | self.vocab_size = vocab.size 149 | self.lang = lang 150 | self.preproc_fun = preproc_fun 151 | 152 | if texts is None: 153 | to_indices = TransformToIndicesClosure(self) 154 | self.size, self.counts = texts_to_ids(input_text_fns, to_indices) 155 | self.input_text_fns = input_text_fns 156 | self.texts = None 157 | else: 158 | to_indices = TransformToIndicesClosure(self) 159 | transformed = [ 160 | to_indices(t) for t in tqdm(texts, desc="transform to indices") 161 | ] 162 | self.texts = transformed 163 | self.counts = count_tokens(transformed) 164 | self.size = len(transformed) 165 | 166 | def texts_to_file(self, dir, text_chunk_size): 167 | """ 168 | If we haven't created the temporay text files yet, do it here. 169 | We could't do it earlier since we only have location on the filesystem 170 | through the `bunch`. 171 | """ 172 | if self.texts is None: 173 | # re-use the texts that were created for initialization of the corpus 174 | # TODO: make use of chunk size? 
175 | self.texts = self.input_text_fns 176 | fns = [] 177 | Path(dir).mkdir(parents=True, exist_ok=True) 178 | for i, f in enumerate(self.input_text_fns): 179 | new_path = Path(f"{dir}/texts_{i}.pkl").resolve() 180 | # only works if data and bunch are on same file system 181 | f.rename(new_path) 182 | fns.append(new_path) 183 | self.texts = fns 184 | else: 185 | fns = [] 186 | for i, c in enumerate(chunks(self.texts, text_chunk_size)): 187 | fn = Path(f"{dir}/texts_{i}.pkl").resolve() 188 | to_pickle(c, fn) 189 | fns.append(fn) 190 | self.texts = fns 191 | 192 | @staticmethod 193 | def from_file(input_path, limit=None, **kwargs): 194 | """ 195 | Construct a Corpus from a text file with newline-delimited sentences. 196 | """ 197 | logger.info("reading file") 198 | text = Path(input_path).read_text() 199 | lines = text.splitlines() 200 | if limit is not None: 201 | lines = lines[:limit] 202 | logger.info("done reading file") 203 | return Corpus.from_sents(lines, **kwargs) 204 | 205 | @staticmethod 206 | def from_sents( 207 | texts, vocab=None, preproc_func=tokenize_texts_parallel, lang="en", **kwargs 208 | ): 209 | """ 210 | Construct corpus from lists of sentences. 211 | """ 212 | texts = preproc_func(texts) 213 | if vocab is None: 214 | vocab = Vocab(texts, **kwargs) 215 | corpus = Corpus(vocab, preproc_func, texts=texts, lang=lang) 216 | return corpus 217 | 218 | @staticmethod 219 | def from_texts(texts, preproc_func=texts_to_sents, **kwargs): 220 | """ 221 | Construct corpus from list of texts. 222 | """ 223 | return Corpus.from_sents(texts, preproc_func=preproc_func, **kwargs) 224 | 225 | @staticmethod 226 | def from_text_files( 227 | base_dir, preproc_func=texts_to_sents, view_fraction=1, lang="en", **kwargs 228 | ): 229 | """ 230 | Construct a corpus from a folder of text files. 231 | The size of the text files determine the working memory size later on. 232 | This is usefull for larger amount of text. 233 | 234 | Args: 235 | base_dir (str): The directory with the text files. 236 | preproc_func (fun): The funcation to preprocess texts into sentences. 237 | view_fraction (float): Option to only look at portions of the text to determine the most frequent words. 238 | lang (str): The language of the texts, defaults to "en". 239 | 240 | Returns: 241 | Corpus 242 | """ 243 | voc = Vocab() 244 | input_text_fns = list(Path(base_dir).glob("*.txt")) 245 | proc_fns = [f.with_suffix(".pkl") for f in input_text_fns] 246 | 247 | with futures.ProcessPoolExecutor() as executor: 248 | jobs = {} 249 | files_left = len(input_text_fns) 250 | files_iter = iter(input_text_fns) 251 | MAX_JOBS_IN_QUEUE = os.cpu_count() * 2 252 | 253 | with tqdm(total=len(input_text_fns), desc="build up vocab") as pbar: 254 | while files_left: 255 | for this_file in files_iter: 256 | job = executor.submit( 257 | _build_vocab_from_file, 258 | [this_file, preproc_func, view_fraction], 259 | ) 260 | jobs[job] = this_file 261 | if len(jobs) > MAX_JOBS_IN_QUEUE: 262 | break 263 | 264 | for job in futures.as_completed(jobs): 265 | files_left -= 1 266 | pbar.update(1) 267 | # merge into one vocab 268 | voc.merge_with(job.result()) 269 | del jobs[job] 270 | 271 | # only consider most frequent terms etc. 
272 | voc.filter(**kwargs) 273 | 274 | return Corpus(voc, preproc_func, input_text_fns=proc_fns, lang=lang) 275 | 276 | -------------------------------------------------------------------------------- /examples/02_wikipedia.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# hyerhyper is mainly for constructing word embeddings if you you don't have enough data. For large corpora, use word2vec or fastText. However, it still (somewhat) works for Wikipedia.\n", 10 | "\n", 11 | "# A dump of the English wikipedia is quite large. So preprocessing may take a while (hours or days).\n", 12 | "# 1. download wikipedia dump (https://dumps.wikimedia.org/enwiki/), wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2\n", 13 | "# 2. `python WikiExtractor.py enwiki-latest-pages-articles.xml -o en-wiki` WikiExtraktor (https://github.com/attardi/wikiextractor)\n", 14 | "# 3. i=0; for f in en-wiki/*/*; do cp $f en-wiki-flat/$i.txt && ((i++)) && echo $i; done" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import hyperhyper as hy" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# We change the default preprocessing to speed things up. An sentence in this case means a whole article. \n", 33 | "# This is quite dirty, but proper preprocessing (splitting into sentences, removing stop words) takes more time." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stderr", 43 | "output_type": "stream", 44 | "text": [ 45 | "build up vocab: 100%|██████████| 13039/13039 [28:24<00:00, 7.65it/s]\n", 46 | "texts to ids: 100%|██████████| 13039/13039 [13:45<00:00, 15.80it/s]\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "corpus = hy.Corpus.from_text_files('/mnt/data/datasets/wiki/en-wiki-flat', preproc_func=hy.tokenize_texts) " 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stderr", 61 | "output_type": "stream", 62 | "text": [ 63 | "/home/filter/anaconda3/envs/hyperhyper/lib/python3.6/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", 64 | " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "bunch = hy.Bunch(\"/mnt/data/datasets/wiki/wikibunch\", corpus) # saves data (corpus) to disk" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "bunch = hy.Bunch(\"/mnt/data/datasets/wiki/wikibunch\") # load already saved bunch" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stderr", 88 | "output_type": "stream", 89 | "text": [ 90 | "generating pairs: 0%| | 0/13039 [00:00] 344,11M 439KB/s in 16m 33s \n", 20 | "\n", 21 | "2020-02-27 21:00:06 (355 KB/s) - ‘news.2010.en.shuffled.gz’ saved [360828816/360828816]\n", 22 | "\n" 23 | ] 24 | } 25 | ], 26 | "source": [ 27 | "# get data, a text file with one sentence per line\n", 28 | "! wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.en.shuffled.gz && gzip -d news.2010.en.shuffled.gz" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import hyperhyper as hy" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "transform to indices: 100%|██████████| 6797225/6797225 [01:05<00:00, 103953.79it/s]\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "corpus = hy.Corpus.from_file('news.2010.en.shuffled') # this may take a while" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | "/home/filter/anaconda3/envs/hyperhyper/lib/python3.6/site-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", 67 | " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "bunch = hy.Bunch(\"news_bunch\", corpus) # saves data (corpus) to disk" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "bunch = hy.Bunch(\"news_bunch\") # load already saved bunch" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stderr", 91 | "output_type": "stream", 92 | "text": [ 93 | "generating pairs: 0%| | 0/68 [00:00