├── tests ├── __init__.py ├── benchmarks │ ├── __init__.py │ ├── test_outliers.py │ ├── test_misc.py │ ├── test_relation_extraction.py │ ├── test_synonymy_detection.py │ ├── test_similarity.py │ ├── test_language_modeling.py │ ├── test_sequence_labeling.py │ ├── test_text_classification.py │ ├── test_analogy.py │ └── test_categorization.py ├── data │ ├── embeddings │ │ ├── npy │ │ │ ├── metadata.json │ │ │ ├── test.vocab │ │ │ └── test.npy │ │ └── text │ │ │ ├── corrupted │ │ │ ├── metadata.json │ │ │ └── emb.txt │ │ │ ├── plain_no_file_header │ │ │ ├── metadata.json │ │ │ └── emb.txt │ │ │ └── plain_with_file_header │ │ │ ├── metadata.json │ │ │ └── emb.txt │ ├── corpora │ │ ├── jap │ │ │ ├── char2radical │ │ │ │ └── char2radical.txt │ │ │ └── tokenized │ │ │ │ └── corpus.txt │ │ ├── bzipped │ │ │ └── sense_small.txt.bz2 │ │ ├── gzipped │ │ │ └── sense_small.txt.gz │ │ ├── xzipped │ │ │ └── sense_small.txt.xz │ │ └── multiple_small │ │ │ ├── one.txt │ │ │ └── two.txt │ ├── benchmarks │ │ ├── analogy │ │ │ ├── category1 │ │ │ │ └── subcategory_a.txt │ │ │ ├── category2 │ │ │ │ └── subcategory_b.txt │ │ │ └── metadata.json │ │ ├── similarity │ │ │ ├── ws.csv │ │ │ ├── ws.json │ │ │ ├── ws │ │ │ └── ws.txt │ │ ├── categorization │ │ │ └── essli-2008-lite.csv │ │ ├── synonymy_detection │ │ │ └── test.csv │ │ ├── outliers │ │ │ └── test.csv │ │ ├── sequence_labeling │ │ │ ├── chunk │ │ │ │ ├── valid.txt │ │ │ │ ├── test.txt │ │ │ │ └── train.txt │ │ │ ├── ner │ │ │ │ ├── valid.txt │ │ │ │ ├── test.txt │ │ │ │ └── train.txt │ │ │ └── pos │ │ │ │ ├── valid.txt │ │ │ │ ├── test.txt │ │ │ │ └── train.txt │ │ ├── text_classification │ │ │ ├── test │ │ │ └── train │ │ └── relation_extraction │ │ │ ├── test.txt │ │ │ └── train.txt │ ├── vocabs │ │ ├── one_column │ │ │ └── something.vocab │ │ ├── numbers │ │ │ ├── metadata.json │ │ │ └── vocab.tsv │ │ └── plain │ │ │ ├── metadata.json │ │ │ └── vocab.tsv │ └── benchmarks_results │ │ ├── text_classification │ │ ├── vocab.json │ │ ├── best_model.npz │ │ ├── args.json │ │ └── log │ │ ├── similarity │ │ ├── 1 │ │ └── 2 │ │ └── analogy │ │ ├── 1 │ │ ├── 2 │ │ ├── 3 │ │ └── 4 ├── test_datasets.py ├── test_cli_misc.py ├── test_misc.py ├── test_format.py ├── test_config.py ├── test_embeddings.py └── test_training.py ├── docs ├── requirements.txt ├── source │ ├── reference │ │ └── index.rst │ ├── tutorial │ │ ├── images │ │ │ ├── cat.png │ │ │ ├── pear.png │ │ │ ├── contexts.png │ │ │ ├── img_tips.png │ │ │ ├── std_to_img.png │ │ │ ├── draw_features.png │ │ │ └── draw_similarity.png │ │ ├── index.rst │ │ ├── installing.rst │ │ ├── visualization.rst │ │ ├── basic.rst │ │ ├── training_vectors.rst │ │ ├── roadmap.rst │ │ └── working_with_vectors.rst │ ├── index.rst │ └── contribution.rst ├── Makefile └── make.bat ├── .readthedocs.yml ├── vecto ├── benchmarks │ ├── outliers │ │ ├── __init__.py │ │ └── __main__.py │ ├── synonymy_detection │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── synonymy_detection.py │ ├── base.py │ ├── similarity │ │ └── __init__.py │ ├── sequence_labeling │ │ └── __init__.py │ ├── categorization │ │ ├── metrics.py │ │ └── __init__.py │ ├── relation_extraction │ │ ├── __init__.py │ │ ├── preprocess.py │ │ └── relation_extraction.py │ ├── analogy │ │ ├── io.py │ │ └── __init__.py │ ├── language_modeling │ │ └── __init__.py │ ├── text_classification │ │ ├── __init__.py │ │ └── nlp_utils.py │ ├── __init__.py │ └── visualize.py ├── data │ ├── __init__.py │ ├── io.py │ └── base.py ├── embeddings │ ├── utils │ │ └── __init__.py │ ├── base.py │ 
├── legacy_w2v.py │ └── __init__.py ├── _version.py ├── __main__.py ├── corpus │ ├── __init__.py │ ├── base.py │ └── tokenization.py ├── utils │ ├── blas.py │ ├── __init__.py │ ├── tqdm_utils.py │ ├── convert.py │ ├── fetch_benchmarks.py │ ├── formathelper.py │ ├── data.py │ └── metadata.py ├── __init__.py ├── vocabulary │ ├── __init__.py │ └── __main__.py ├── config.py └── cli.py ├── MANIFEST.in ├── requirements.txt ├── test_requirements.txt ├── check_ds.py ├── setup.py ├── .appveyor.yml ├── .travis.yml ├── corpus_test.py ├── examples ├── analogy.ipynb └── most_similar.ipynb ├── .gitignore └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | requirements_file: docs/requirements.txt 2 | -------------------------------------------------------------------------------- /tests/data/embeddings/npy/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test"} 2 | -------------------------------------------------------------------------------- /vecto/benchmarks/outliers/__init__.py: -------------------------------------------------------------------------------- 1 | from .outliers import * -------------------------------------------------------------------------------- /tests/data/corpora/jap/char2radical/char2radical.txt: -------------------------------------------------------------------------------- 1 | 仲 亻中 2 | 間 門日 -------------------------------------------------------------------------------- /vecto/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Dataset, get_dataset_by_name 2 | -------------------------------------------------------------------------------- /vecto/embeddings/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import word 2 | from . 
import subword -------------------------------------------------------------------------------- /tests/data/embeddings/npy/test.vocab: -------------------------------------------------------------------------------- 1 | the 2 | apple 3 | banana 4 | fast 5 | quick -------------------------------------------------------------------------------- /vecto/_version.py: -------------------------------------------------------------------------------- 1 | """Version of vecto package.""" 2 | 3 | VERSION = "0.2.16" 4 | -------------------------------------------------------------------------------- /vecto/benchmarks/synonymy_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .synonymy_detection import * -------------------------------------------------------------------------------- /tests/data/benchmarks/analogy/category1/subcategory_a.txt: -------------------------------------------------------------------------------- 1 | apple banana 2 | fast quick -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws.csv: -------------------------------------------------------------------------------- 1 | love,sex,6.77 2 | tiger,cat,7.35 3 | tiger,tiger,10 -------------------------------------------------------------------------------- /tests/data/corpora/jap/tokenized/corpus.txt: -------------------------------------------------------------------------------- 1 | いつも 忙しい 仲間 と やっと 会え た 2 | いつも 忙しい 仲間 と やっと 会え た -------------------------------------------------------------------------------- /tests/data/vocabs/one_column/something.vocab: -------------------------------------------------------------------------------- 1 | apple 2 | banana 3 | mango 4 | potato 5 | the -------------------------------------------------------------------------------- /vecto/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import CLI 2 | 3 | if __name__ == "__main__": 4 | CLI() 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include setup_boilerplate.py 2 | include requirements.txt 3 | include test_requirements.txt 4 | -------------------------------------------------------------------------------- /docs/source/reference/index.rst: -------------------------------------------------------------------------------- 1 | API reference 2 | ============= 3 | 4 | .. 
automodule:: vecto 5 | 6 | -------------------------------------------------------------------------------- /tests/data/embeddings/text/corrupted/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test","foldername":"plain_with_file_header"} 2 | -------------------------------------------------------------------------------- /tests/data/embeddings/npy/test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/embeddings/npy/test.npy -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_no_file_header/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test","foldername":"plain_no_file_header"} 2 | -------------------------------------------------------------------------------- /docs/source/tutorial/images/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/cat.png -------------------------------------------------------------------------------- /docs/source/tutorial/images/pear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/pear.png -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_with_file_header/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test","foldername":"plain_with_file_header"} 2 | -------------------------------------------------------------------------------- /docs/source/tutorial/images/contexts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/contexts.png -------------------------------------------------------------------------------- /docs/source/tutorial/images/img_tips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/img_tips.png -------------------------------------------------------------------------------- /docs/source/tutorial/images/std_to_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/std_to_img.png -------------------------------------------------------------------------------- /tests/data/benchmarks/analogy/category2/subcategory_b.txt: -------------------------------------------------------------------------------- 1 | apple banana 2 | fast quick 3 | apple banana_missing 4 | apple_missing banana -------------------------------------------------------------------------------- /tests/data/vocabs/numbers/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "path_source": "./test/data/corpora/numbers", 3 | "vsmlib_version": "0.1.6" 4 | } -------------------------------------------------------------------------------- /docs/source/tutorial/images/draw_features.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/draw_features.png -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/vocab.json: -------------------------------------------------------------------------------- 1 | {"the": 0, "apple": 1, "banana": 2, "fast": 3, "quick": 4, "tiger": 5, "cat": 6} -------------------------------------------------------------------------------- /tests/data/corpora/bzipped/sense_small.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/corpora/bzipped/sense_small.txt.bz2 -------------------------------------------------------------------------------- /tests/data/corpora/gzipped/sense_small.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/corpora/gzipped/sense_small.txt.gz -------------------------------------------------------------------------------- /tests/data/corpora/xzipped/sense_small.txt.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/corpora/xzipped/sense_small.txt.xz -------------------------------------------------------------------------------- /docs/source/tutorial/images/draw_similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/draw_similarity.png -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "en", 3 | "task": "similarity", 4 | "description": "TEST FILE", 5 | "version": "-", 6 | "cite": "-" 7 | } 8 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/best_model.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/benchmarks_results/text_classification/best_model.npz -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_no_file_header/emb.txt: -------------------------------------------------------------------------------- 1 | the 0.4 0.3 0.2 0.1 2 | apple 0.1 0.2 0.3 0.4 3 | banana 0.1 0.2 0.3 0.41 4 | fast 0.1 0.1 0.1 0.1 5 | quick 0.1 0.1 0.1 0.2 -------------------------------------------------------------------------------- /tests/data/vocabs/plain/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "cnt_words": 142, 3 | "min_frequency": 10, 4 | "path_source": "./test/data/corpora/plain", 5 | "vsmlib_version": "0.1.6" 6 | } -------------------------------------------------------------------------------- /tests/data/benchmarks/categorization/essli-2008-lite.csv: -------------------------------------------------------------------------------- 1 | ,category,word 2 | 0,cats,the 3 | 1,cats,tiger 4 | 2,cats,cat 5 | 3,food,apple 6 | 4,food,banana 7 | 5,motion,walk 8 | 6,motion,fly 9 | -------------------------------------------------------------------------------- /tests/data/benchmarks/synonymy_detection/test.csv: 
-------------------------------------------------------------------------------- 1 | ,target,word,is_solution 2 | 0,tiger,cat,yes 3 | 1,tiger,run,no 4 | 2,tiger,banana,no 5 | 3,run,walk,yes 6 | 4,run,cat,no 7 | 5,run,the,no -------------------------------------------------------------------------------- /tests/data/embeddings/text/corrupted/emb.txt: -------------------------------------------------------------------------------- 1 | 7 4 2 | the 0.4 0.3 0.2 3 | apple 0.1 0.2 0.3 0.4 4 | banana 0.1 0.2 0.3 0.41 5 | fast 0.1 0.1 0.1 0.1 6 | quick 0.1 0.1 0.1 0.2 7 | tiger 0.1 0.1 0.1 0.2 8 | cat 0.1 0.1 0.1 0.2 -------------------------------------------------------------------------------- /tests/data/vocabs/numbers/vocab.tsv: -------------------------------------------------------------------------------- 1 | #word frequency 2 | one 1 3 | two 406 4 | three 345 5 | four 330 6 | five 324 7 | six 271 8 | seven 184 9 | eight 177 10 | nine 176 11 | ten 10 12 | eleven 146 13 | twelve 170 14 | -------------------------------------------------------------------------------- /tests/data/benchmarks/outliers/test.csv: -------------------------------------------------------------------------------- 1 | ,category,word,is_outlier 2 | 0,cats,cat,false 3 | 1,cats,tiger,false 4 | 2,cats,run,true 5 | 3,cats,walk,true 6 | 4,fruits,apple,false 7 | 5,fruits,banana,false 8 | 6,fruits,the,true 9 | 7,fruits,fly,true -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # requrements for vecto 2 | brewer2mpl 3 | # gitpython 4 | numpy 5 | nltk 6 | pandas 7 | progressbar2 8 | matplotlib 9 | scipy 10 | scikit-learn 11 | system-query 12 | tables 13 | traitlets 14 | tqdm 15 | requests 16 | docutils 17 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | -rrequirements.txt 2 | pip >= 21.0 3 | setuptools >= 40.0 4 | docutils 5 | pygments 6 | wheel 7 | fabric3 8 | progressbar2 9 | sklearn 10 | pyyaml 11 | coveralls 12 | coverage 13 | pylint 14 | chainer 15 | keras 16 | tensorflow -------------------------------------------------------------------------------- /tests/data/corpora/multiple_small/one.txt: -------------------------------------------------------------------------------- 1 | line0 token token 2 | line1 token token 3 | line2 token token 4 | line3 token token 5 | line4 token token 6 | line5 token token 7 | line6 token token 8 | line7 token token 9 | line8 token token 10 | line9 token token 11 | -------------------------------------------------------------------------------- /docs/source/tutorial/index.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | basic 8 | installing 9 | metadata 10 | getting_vectors 11 | training_vectors 12 | working_with_vectors 13 | visualization 14 | evaluating 15 | roadmap -------------------------------------------------------------------------------- /vecto/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | from .corpus import FileCorpus, DirCorpus, CorpusView, Corpus 2 | from .corpus import corpus_chain, load_path_as_ids, DirSlidingWindowCorpus 3 | from .tokenization import DEFAULT_JAP_TOKENIZER, DEFAULT_TOKENIZER, DEFAULT_SENT_TOKENIZER, ANNOTATED_TEXT_TOKENIZER 4 | -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_with_file_header/emb.txt: -------------------------------------------------------------------------------- 1 | 9 4 2 | the 0.4 0.3 0.2 0.1 3 | apple 0.1 0.2 0.3 0.4 4 | banana 0.1 0.2 0.3 0.41 5 | fast 0.1 0.1 0.1 0.1 6 | quick 0.1 0.1 0.1 0.2 7 | tiger 0.1 0.1 0.1 0.2 8 | cat 0.1 0.1 0.1 0.2 9 | walk 0.9 0.5 0.6 0.3 10 | fly 0.7 0.1 0.6 0.2 -------------------------------------------------------------------------------- /vecto/embeddings/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from vecto.utils.metadata import WithMetaData 3 | 4 | 5 | class WordEmbeddings(WithMetaData, metaclass=abc.ABCMeta): 6 | # TODO: define proper interface 7 | 8 | @abc.abstractmethod 9 | def get_vector(self, w): 10 | pass 11 | -------------------------------------------------------------------------------- /check_ds.py: -------------------------------------------------------------------------------- 1 | from vecto.data import Dataset 2 | path = "/mnt/storage/data/NLP/datasets/text_classification/SST-2" 3 | #path = "/home/blackbird/Projects/NLP/datasets/STSA/binary" 4 | 5 | ds = Dataset(path) 6 | 7 | print(ds) 8 | print(ds.metadata) 9 | train = ds.get_train() 10 | print(train) 11 | -------------------------------------------------------------------------------- /vecto/utils/blas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # import scipy.sparse.linalg 3 | 4 | 5 | def normed(v): 6 | return v / np.linalg.norm(v) 7 | 8 | 9 | # def normalize_sparse(m): 10 | # norm = scipy.sparse.linalg.norm(m, axis=1)[:, None] 11 | # m.data /= norm.repeat(np.diff(m.indptr)) 12 | -------------------------------------------------------------------------------- /vecto/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Helpers for various things 2 | """ 3 | 4 | import datetime 5 | 6 | 7 | def get_time_str(): 8 | """ 9 | returs current time formatted nicely 10 | """ 11 | time_now = datetime.datetime.now() 12 | str_time = time_now.strftime("%y.%m.%d_%H.%M.%S") 13 | return str_time 14 | -------------------------------------------------------------------------------- /tests/data/corpora/multiple_small/two.txt: -------------------------------------------------------------------------------- 1 | file2-0 lalala la la la 2 | file2-1 lalala la la la 3 | file2-2 lalala la la la 4 | file2-3 lalala la la la 5 | file2-4 lalala la la la 6 | file2-5 lalala la la la 7 | file2-6 lalala la la la 8 | file2-7 lalala la la la 9 | file2-8 lalala la la la 10 | file2-9 lalala la la la 11 | -------------------------------------------------------------------------------- /vecto/benchmarks/base.py: 
-------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class Benchmark(): 5 | # TODO: define proper interface 6 | 7 | @abc.abstractmethod 8 | def __init__(self): 9 | raise NotImplementedError 10 | 11 | # @abc.abstractmethod 12 | # def get_result(self, embeddings, path_dataset): 13 | # raise NotImplementedError 14 | -------------------------------------------------------------------------------- /vecto/__init__.py: -------------------------------------------------------------------------------- 1 | """vecto is a library for all things related to vector space models in NLP 2 | 3 | Submodules 4 | ========== 5 | 6 | .. autosummary:: 7 | :toctree: _autosummary 8 | 9 | embeddings 10 | corpus 11 | vocabulary 12 | benchmarks 13 | """ 14 | 15 | from ._version import VERSION 16 | 17 | 18 | __version__ = VERSION 19 | -------------------------------------------------------------------------------- /vecto/utils/tqdm_utils.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | 3 | 4 | def is_in_jupyter(): 5 | try: 6 | get_ipython 7 | return True 8 | except: 9 | return False 10 | 11 | 12 | def get_tqdm(*args, **kwargs): 13 | if is_in_jupyter(): 14 | return tqdm.tqdm_notebook(*args, **kwargs) 15 | return tqdm.tqdm(*args, **kwargs) 16 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | """Tests for datasets""" 2 | import unittest 3 | from vecto.data import Dataset 4 | 5 | 6 | class Tests(unittest.TestCase): 7 | 8 | def test_datasets(self): 9 | Dataset("./") 10 | 11 | def test_dataset(self): 12 | with self.assertRaises(FileNotFoundError): 13 | Dataset("./path/does/not/exist/") 14 | -------------------------------------------------------------------------------- /vecto/benchmarks/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | from .similarity import Similarity as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | parser.add_argument("dataset") 7 | parser.add_argument('--normalize', dest='normalize', action='store_true') 8 | parser.add_argument('--ignore_oov', dest='ignore_oov', action='store_true') 9 | -------------------------------------------------------------------------------- /vecto/vocabulary/__init__.py: -------------------------------------------------------------------------------- 1 | """The model module that implements vocabulary. 2 | 3 | .. 
autosummary:: 4 | :toctree: _autosummary 5 | 6 | """ 7 | 8 | from .vocabulary import Vocabulary 9 | from .vocabulary import create_from_path, create_ngram_tokens_from_dir, create_from_annotated_dir 10 | 11 | 12 | def load(path): 13 | v = Vocabulary() 14 | v.load(path) 15 | return v 16 | -------------------------------------------------------------------------------- /vecto/utils/convert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def main(): 3 | path = sys.argv[1] 4 | print(path) 5 | with open(path, encoding='utf-8', errors='ignore') as f_in: 6 | with open(path + ".out", "w", encoding='utf-8') as f_out: 7 | for l in f_in: 8 | label, text = l.rstrip().split(None, 1) 9 | f_out.write(f"{label}\t{text}\n") 10 | 11 | 12 | if __name__ == '__main__': 13 | main() -------------------------------------------------------------------------------- /tests/test_cli_misc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import StringIO 3 | from contextlib import redirect_stdout 4 | from .test_setup import run_module 5 | 6 | 7 | class Tests(unittest.TestCase): 8 | 9 | def test_cli(self): 10 | with self.assertRaises(SystemExit): 11 | sio = StringIO() 12 | with redirect_stdout(sio): 13 | run_module('vecto', 14 | 'WRONG_COMMAND') 15 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | """Tests for misc""" 2 | import unittest 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class Tests(unittest.TestCase): 8 | 9 | def test_import(self): 10 | logger.info("testing deprecated") 11 | import vecto 12 | 13 | def test_utils(self): 14 | from vecto.utils.data import jsonify 15 | data = {"test": 1, "dict": {"i": 2}} 16 | res = jsonify(data) 17 | self.assertIsInstance(res, dict) 18 | -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.5 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.5 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | -------------------------------------------------------------------------------- /vecto/benchmarks/sequence_labeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_labeling 
import Sequence_labeling as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | parser.add_argument("dataset") 7 | parser.add_argument("--window_size", default=5, type=int) 8 | parser.add_argument("--method", default='lr', choices=['lr', '2FFNN'], 9 | help='name of method') 10 | parser.add_argument('--normalize', dest='normalize', action='store_true') 11 | -------------------------------------------------------------------------------- /vecto/utils/fetch_benchmarks.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | from git.exc import GitCommandError 3 | from os import path 4 | 5 | path_to_repo = 'https://github.com/vecto-ai/benchmarks.git' 6 | 7 | 8 | def fetch_benchmarks(path_to_local_dir=path.join('data', 'benchmarks')): 9 | try: 10 | Repo.clone_from('https://github.com/vecto-ai/benchmarks.git', path_to_local_dir) 11 | except GitCommandError: 12 | raise ValueError('Directory exists') 13 | 14 | if __name__ == "__main__": 15 | fetch_benchmarks() 16 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/args.json: -------------------------------------------------------------------------------- 1 | {"current_datetime": "2018-05-04 11:39:50.824318", "batchsize": 64, "epoch": 5, "gpu": -1, "layer": 1, "dropout": 0, "model": "bow", "char_based": false, "out": "./tests/data/benchmarks_results/text_classification/", "unit": 4, "dataset": "./tests/data/benchmarks/text_classification/", "vocab_path": "./tests/data/benchmarks_results/text_classification/vocab.json", "model_path": "./tests/data/benchmarks_results/text_classification/best_model.npz", "n_class": 2, "datetime": "2018-05-04 11:39:50.824318"} -------------------------------------------------------------------------------- /tests/test_format.py: -------------------------------------------------------------------------------- 1 | """Tests for format module.""" 2 | 3 | import unittest 4 | from vecto.utils.formathelper import sizeof_fmt, countof_fmt 5 | 6 | 7 | class Tests(unittest.TestCase): 8 | 9 | def test_sizeof(self): 10 | val = 12345667 11 | print("sizeof:", sizeof_fmt(val)) 12 | val = 10.0 ** 32 13 | print("sizeof:", sizeof_fmt(val)) 14 | 15 | def test_countof(self): 16 | val = 12345667 17 | print("countof:", countof_fmt(val)) 18 | val = 10.0 ** 32 19 | print("countof:", countof_fmt(val)) 20 | -------------------------------------------------------------------------------- /vecto/config.py: -------------------------------------------------------------------------------- 1 | """Configuration support for vecto 2 | 3 | Config files are expected to be found in the .vecto folder in user's home. 
4 | The format is the same as jupyter notebooks 5 | """ 6 | 7 | from traitlets.config.loader import load_pyconfig_files 8 | import os.path 9 | 10 | 11 | def load_config(): 12 | default_dir = os.path.expanduser("~/.vecto/") 13 | if os.path.isfile(os.path.join(default_dir, 'config.py')): 14 | c = load_pyconfig_files(['config.py'], default_dir) 15 | return c 16 | else: 17 | # TODO: create default config 18 | raise RuntimeError('configuration file not found, please create one in ~/.vecto/config.py') 19 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from vecto.config import load_config 4 | 5 | 6 | class Tests(unittest.TestCase): 7 | 8 | @unittest.skipUnless(os.environ.get('CI'), 'skipping as local config likely exists') 9 | def test_file_corpus(self): 10 | default_dir = os.path.expanduser("~/.vecto/") 11 | os.makedirs(default_dir, exist_ok=True) 12 | path_config = os.path.join(default_dir, 'config.py') 13 | with self.assertRaises(RuntimeError): 14 | load_config() 15 | if not os.path.isfile(path_config): 16 | with open(path_config, "w") as f: 17 | f.write("test=1") 18 | load_config() 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = vecto 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /tests/data/benchmarks_results/similarity/2: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "default", 6 | "cnt_found_pairs_total": 0, 7 | "cnt_pairs_total": 20, 8 | "dataset": "ws", 9 | "embeddings": { 10 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 11 | "foldername": "plain_no_file_header", 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "spearman", 16 | "method": "cosine_distance", 17 | "task": "word_similarity", 18 | "timestamp": "2018-05-03T00:16:05.143078" 19 | }, 20 | "result": -1 21 | } 22 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/similarity/1: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "default", 6 | "cnt_found_pairs_total": 2, 7 | "cnt_pairs_total": 20, 8 | "dataset": "ws", 9 | "embeddings": { 10 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 11 | "foldername": "plain_with_file_header", 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "spearman", 16 | "method": "cosine_distance", 17 | "task": "word_similarity", 18 | "timestamp": "2018-05-03T00:16:05.141905" 19 | }, 20 | "result": -1 21 | } 22 | ] -------------------------------------------------------------------------------- /docs/source/tutorial/installing.rst: -------------------------------------------------------------------------------- 1 | Installing Vecto 2 | ================= 3 | 4 | .. currentmodule:: vecto 5 | 6 | 7 | System requirements 8 | ------------------- 9 | 10 | - Python 3.5 or later 11 | 12 | Method 1: Pip-install 13 | --------------------- 14 | 15 | The latest stable version: 16 | 17 | 18 | >>> pip3 install vecto 19 | 20 | The latest development version: 21 | 22 | >>> pip3 install git+https://github.com/vecto-ai/vecto.git 23 | 24 | 25 | Method 2: Clone or download the github repo 26 | ------------------------------------------- 27 | 28 | You can avoid installing vecto system-wide. Simply download and unpack the github repo into your project's working directory. 29 | 30 | Either way, you can access vecto's modules by issuing 31 | 32 | >>> import vecto 33 | 34 | at the beginning of your code.
35 | -------------------------------------------------------------------------------- /vecto/utils/formathelper.py: -------------------------------------------------------------------------------- 1 | class bcolors: 2 | HEADER = '\033[95m' 3 | OKBLUE = '\033[94m' 4 | OKGREEN = '\033[92m' 5 | WARNING = '\033[93m' 6 | FAIL = '\033[91m' 7 | ENDC = '\033[0m' 8 | BOLD = '\033[1m' 9 | UNDERLINE = '\033[4m' 10 | 11 | 12 | def sizeof_fmt(num, suffix='B'): 13 | for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: 14 | if abs(num) < 1024.0: 15 | return "%3.1f%s%s" % (num, unit, suffix) 16 | num /= 1024.0 17 | return "%.1f%s%s" % (num, 'Yi', suffix) 18 | 19 | 20 | def countof_fmt(num, suffix=''): 21 | for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: 22 | if abs(num) < 1000.0: 23 | return "%3.1f%s%s" % (num, unit, suffix) 24 | num /= 1000.0 25 | return "%.1f%s%s" % (num, 'Y', suffix) 26 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | vecto - Python library for vector space models 3 | ============================================== 4 | 5 | Vecto is an open-source Python library for working with vector space models (VSMs), 6 | including various word embeddings such as word2vec. Vecto can load various popular 7 | formats of VSMs and perform a set of basic operations like dimensionality reduction, search for nearest neighbors etc. It includes a growing 8 | list of benchmarks with which VSMs are evaluated in most current research, and a few visualization tools. 9 | It also includes a growing list of modules for creating VSMs, both explicit and based on neural networks. 10 | 11 | ..
toctree:: 12 | :maxdepth: 1 13 | 14 | tutorial/index 15 | reference/index 16 | contribution 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | -------------------------------------------------------------------------------- /vecto/benchmarks/categorization/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from sklearn.metrics import adjusted_rand_score, v_measure_score, homogeneity_score, completeness_score, \ 3 | mutual_info_score, fowlkes_mallows_score, silhouette_score, calinski_harabasz_score 4 | import numpy as np 5 | 6 | 7 | def purity_score(y_true, y_pred): 8 | y_voted_labels = np.zeros(y_true.shape) 9 | labels = np.unique(y_true) 10 | ordered_labels = np.arange(labels.shape[0]) 11 | for k in range(labels.shape[0]): 12 | y_true[y_true == labels[k]] = ordered_labels[k] 13 | labels = np.unique(y_true) 14 | bins = np.concatenate((labels, [np.max(labels) + 1]), axis=0) 15 | for cluster in np.unique(y_pred): 16 | hist, _ = np.histogram(y_true[y_pred == cluster], bins=bins) 17 | winner = np.argmax(hist) 18 | y_voted_labels[y_pred == cluster] = winner 19 | return accuracy_score(y_true, y_voted_labels) 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=vecto 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup script for vecto package.""" 2 | 3 | import setup_boilerplate 4 | 5 | 6 | class Package(setup_boilerplate.Package): 7 | 8 | """Package metadata.""" 9 | 10 | name = 'vecto' 11 | description = 'toolbox for various tasks in the area of vector space models of computational linguistics' 12 | url = "http://vecto.space" 13 | classifiers = [ 14 | 'Development Status :: 3 - Alpha', 15 | 'Environment :: Console', 16 | 'Intended Audience :: Science/Research', 17 | 'License :: OSI Approved :: Apache Software License', 18 | 'Natural Language :: English', 19 | 'Operating System :: POSIX', 20 | 'Programming Language :: Python :: 3.5', 21 | 'Programming Language :: Python :: 3.6', 22 | 'Programming Language :: Python :: 3 :: Only', 23 | 'Topic :: Text Processing :: Linguistic'] 24 | keywords = ['NLP', 'linguistics', 'language'] 25 | 26 | 27 | if __name__ == '__main__': 28 | Package.setup() 29 | -------------------------------------------------------------------------------- /vecto/benchmarks/relation_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from .relation_extraction import Relation_extraction as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | parser.add_argument("dataset") 7 | 8 | parser.add_argument('--batchsize', '-b', type=int, default=64, 9 | help='Number of images in each mini-batch') 10 | parser.add_argument('--epoch', '-e', type=int, default=1, 11 | help='Number of sweeps over the dataset to train') 12 | parser.add_argument('--nb_filter', '-nf', type=int, default=100, 13 | help='filter number') 14 | parser.add_argument('--filter_length', '-fl', type=int, default=3, 15 | help='filter length') 16 | parser.add_argument('--hidden_dims', '-hd', type=int, default=100, 17 | help='D') 18 | parser.add_argument('--position_dims', '-pd', type=int, default=100, 19 | help='D') 20 | -------------------------------------------------------------------------------- /docs/source/contribution.rst: -------------------------------------------------------------------------------- 1 | .. _contrib: 2 | 3 | Contribution Guide 4 | ================== 5 | 6 | This is a guide for all contributions to vecto. 7 | The development of vecto is happening on `the official repository at GitHub `_. 8 | 9 | Some quick notes: 10 | ----------------- 11 | 12 | Please send pull requests to the ``dev`` branch. 13 | 14 | Pull requests must not lower test coverage score. 15 | 16 | If you send a pull request, please make sure your code is pep8-compliant. 17 | 18 | If you want to raise an issue, please first do a quick search to see if it has already been reported. If so, it's often better to just leave a comment on an existing issue, rather than creating a new one. 19 | 20 | Issues are for bug reports, feature requests etc. For usage-related questions please consult the tutorial; if something is not covered, raise an issue, and we will update the tutorial. 21 | 22 | If there's an issue you would like to fix - this is very welcome, please get in touch.
23 | -------------------------------------------------------------------------------- /vecto/benchmarks/analogy/io.py: -------------------------------------------------------------------------------- 1 | def get_pairs(fname): 2 | pairs = [] 3 | with open(fname) as file_in: 4 | id_line = 0 5 | for line in file_in: 6 | if line.strip() == '': 7 | continue 8 | try: 9 | id_line += 1 10 | if "\t" in line: 11 | parts = line.lower().split("\t") 12 | else: 13 | parts = line.lower().split() 14 | left = parts[0] 15 | right = parts[1] 16 | right = right.strip() 17 | if "/" in right: 18 | right = [i.strip() for i in right.split("/")] 19 | else: 20 | right = [i.strip() for i in right.split(",")] 21 | pairs.append([left, right]) 22 | except: 23 | print("error reading pairs") 24 | print("in file", fname) 25 | print("in line", id_line, line) 26 | exit(-1) 27 | return pairs 28 | -------------------------------------------------------------------------------- /tests/data/benchmarks/analogy/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "class": "dataset", 3 | "task": "analogy", 4 | "language": ["english"], 5 | "name": "dummy_analogy", 6 | "description": "Test Analogy Set", 7 | "domain": "general", 8 | "date": "2016", 9 | "source": "original", 10 | "project_page": "http://vecto.space/", 11 | "version": "3.0", 12 | "size": "small", 13 | "cite": 14 | { 15 | "title": "Analogy-based detection of morphological and semantic relations with word embeddings: what works and what doesn't", 16 | "author": "Gladkova, Anna and Drozd, Aleksandr and Matsuoka, Satoshi", 17 | "doi": "10.18653/v1/N16-2002", 18 | "url": "https://www.aclweb.org/anthology/N/N16/N16-2002.pdf", 19 | "booktitle": "Proceedings of the NAACL-HLT SRW", 20 | "publisher": "ACL", 21 | "year": 2016, 22 | "pages": "47-54", 23 | "type": "inproceedings", 24 | "id":"GladkovaDrozdEtAl_2016" 25 | } 26 | } -------------------------------------------------------------------------------- /vecto/benchmarks/analogy/__init__.py: -------------------------------------------------------------------------------- 1 | """Benchmark on word analogy 2 | 3 | .. autosummary:: 4 | :toctree: _autosummary 5 | 6 | analogy 7 | """ 8 | 9 | # import logging 10 | from .analogy import Analogy as Benchmark 11 | import numpy as np 12 | 13 | # logging.basicConfig(level=logging.DEBUG) 14 | 15 | 16 | def add_extra_args(parser): 17 | parser.add_argument("embeddings") 18 | parser.add_argument("dataset") 19 | parser.add_argument("--method", 20 | help="analogy solving method", 21 | default="LRCos") 22 | 23 | 24 | # TODO: move this to proper location, reuse between senchmarks 25 | def get_mean_reciprocal_rank(results): 26 | mean_reciprocal_rank=np.mean([(lambda r : 0 if r<=0 else 1/r) (experiment["rank"]) for category in results for experiment in category["details"] ]) 27 | return mean_reciprocal_rank 28 | 29 | 30 | def get_mean_accuracy(results): 31 | mean_accuracy=np.mean([experiment["rank"]==0 for category in results for experiment in category["details"] ]) 32 | return mean_accuracy 33 | -------------------------------------------------------------------------------- /vecto/corpus/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from vecto.utils.metadata import WithMetaData 4 | from vecto.utils.tqdm_utils import get_tqdm 5 | 6 | 7 | class BaseIterator(WithMetaData): 8 | """ 9 | Base class for all corpora and iterators. 
10 | Responsible for base logic like metadata collection, __len__, 11 | iteration, tqdm progressbar etc. 12 | """ 13 | 14 | def __init__(self, verbose=False, **metadata_kwargs): 15 | super(BaseIterator, self).__init__(**metadata_kwargs) 16 | self._verbose = verbose 17 | 18 | def __iter__(self): 19 | for elem in self._generate_samples_outer(): 20 | yield elem 21 | 22 | def __len__(self): 23 | return self.metadata.get('samples_count', 0) 24 | 25 | def _generate_samples_outer(self): 26 | gen = self._generate_samples() 27 | if self._verbose > 0: 28 | cur_len = len(self) 29 | if cur_len is None: 30 | return get_tqdm(gen) 31 | else: 32 | return get_tqdm(gen, total=cur_len) 33 | return gen 34 | -------------------------------------------------------------------------------- /vecto/benchmarks/language_modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_modeling import Language_modeling as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | # parser.add_argument("dataset", default="ptb") 7 | parser.add_argument("--window_size", default=5, type=int) 8 | parser.add_argument("--test", default=True, 9 | help='use small test dataset') 10 | parser.add_argument("--method", 11 | default='lstm', 12 | choices=['lr', '2FFNN', 'lstm'], 13 | help='name of method') 14 | parser.add_argument('--normalize', dest='normalize', action='store_true') 15 | 16 | # args = parser.parse_args(extra_args) 17 | # TODO: add warning that other datasets not supported 18 | #args.dataset = "ptb" 19 | #language_modeling = Language_modeling(normalize=args.normalize, 20 | # window_size=args.window_size, 21 | # method=args.method, 22 | # test=args.test) 23 | #language_modeling.run_with_args(args) 24 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | global: 3 | CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd" 4 | 5 | matrix: 6 | - ARCHITECTURE: "x64" 7 | PYTHON_VERSION: "3.7" 8 | PYTHON: "C:\\Python37-x64" 9 | 10 | init: 11 | - set PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH% 12 | 13 | install: 14 | - "python --version" 15 | - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" 16 | 17 | - "python -m pip install --upgrade pip" 18 | # - "python -m pip install --upgrade setuptools" 19 | # - "python -m pip install --upgrade cython" 20 | - "python -m pip install -r test_requirements.txt" 21 | 22 | build_script: 23 | - "python setup.py build" 24 | 25 | test_script: 26 | # - "%CMD_IN_ENV% python -m unittest discover" 27 | - "python -m coverage run --branch --source . -m unittest discover --verbose" 28 | # - "%CMD_IN_ENV% python -m coverage run --source . 
setup.py test" 29 | 30 | after_test: 31 | # - "python setup.py bdist_wheel" 32 | # - "%CMD_IN_ENV% python setup.py bdist_wininst" 33 | # - "%CMD_IN_ENV% python setup.py bdist_msi" 34 | - ps: "ls" 35 | 36 | notifications: 37 | - provider: Webhook 38 | url: https://webhooks.gitter.im/e/25b43ed5bc5e1d3a0772 39 | on_build_success: true 40 | on_build_failure: true 41 | on_build_status_changed: true 42 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: focal 2 | 3 | language: python 4 | 5 | sudo: false 6 | 7 | python: 8 | - '3.8' 9 | 10 | os: 11 | - linux 12 | 13 | install: 14 | - pip install -U coveralls 15 | - pip install -U coverage 16 | - pip install -U pylint 17 | - pip install -U -r test_requirements.txt 18 | 19 | # before_script: # configure a headless display to test plot generation 20 | # - "export DISPLAY=:99.0" 21 | # - "sh -e /etc/init.d/xvfb start" 22 | # - sleep 1 # give xvfb some time to start 23 | 24 | script: 25 | - python -m coverage run --source . -m unittest discover --verbose 26 | # - python -m coverage run --source . setup.py test 27 | 28 | after_success: 29 | # - python -m pylint --load-plugins=pylint.extensions.mccabe --docstring-min-length 5 --no-docstring-rgx "^(test)?_|.*Tests$" --unsafe-load-any-extension y --output-format colorized --reports y $(find . -name "*.py") 30 | - python -m coverage report --show-missing 31 | - coveralls 32 | 33 | notifications: 34 | webhooks: 35 | urls: 36 | - https://webhooks.gitter.im/e/a75d423f7dff38862a1a 37 | on_success: always # options: [always|never|change] default: always 38 | on_failure: always # options: [always|never|change] default: always 39 | on_start: never # options: [always|never|change] default: always 40 | 41 | email: false 42 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/log: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "main/loss": 0.7636308670043945, 4 | "main/accuracy": 0.46875, 5 | "validation/main/loss": 0.7174736261367798, 6 | "validation/main/accuracy": 0.5333333611488342, 7 | "epoch": 1, 8 | "iteration": 1, 9 | "elapsed_time": 0.00541385097312741 10 | }, 11 | { 12 | "main/loss": 0.7468587160110474, 13 | "main/accuracy": 0.484375, 14 | "validation/main/loss": 0.716312050819397, 15 | "validation/main/accuracy": 0.5333333611488342, 16 | "epoch": 2, 17 | "iteration": 2, 18 | "elapsed_time": 0.012854741973569617 19 | }, 20 | { 21 | "main/loss": 0.7709426283836365, 22 | "main/accuracy": 0.4545454680919647, 23 | "validation/main/loss": 0.7152009010314941, 24 | "validation/main/accuracy": 0.5333333611488342, 25 | "epoch": 3, 26 | "iteration": 3, 27 | "elapsed_time": 0.020706564973806962 28 | }, 29 | { 30 | "main/loss": 0.7399059534072876, 31 | "main/accuracy": 0.5348837375640869, 32 | "validation/main/loss": 0.7141532897949219, 33 | "validation/main/accuracy": 0.5333333611488342, 34 | "epoch": 4, 35 | "iteration": 4, 36 | "elapsed_time": 0.026553130999673158 37 | } 38 | ] -------------------------------------------------------------------------------- /vecto/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import vecto 3 | 4 | 5 | class CLI(object): 6 | 7 | def __init__(self): 8 | parser = argparse.ArgumentParser( 9 | prog="vecto", 10 | description='vecto commad line interface', 11 | 
add_help=True, 12 | epilog="\n", 13 | usage='''vecto [], 14 | 15 | The most commonly used vecto commands are: 16 | benchmark Run benchmarks 17 | create_vocab Create vocabulary from a folder 18 | ''') 19 | 20 | parser.add_argument('--version', action='version', 21 | version=f'Vecto version {vecto.__version__}') 22 | parser.add_argument('command', help='Subcommand to run') 23 | args, self.unknownargs = parser.parse_known_args() 24 | if not hasattr(self, args.command): 25 | print('Unrecognized command') 26 | parser.print_help() 27 | exit(1) 28 | # use dispatch pattern to invoke method with same name 29 | getattr(self, args.command)() 30 | 31 | def benchmark(self): 32 | from vecto.benchmarks import run_benchmarks_cli 33 | run_benchmarks_cli(self.unknownargs) 34 | 35 | def create_vocab(self): 36 | print("CLI for vocabulary routines not implemented yet") 37 | 38 | 39 | def main(): 40 | CLI() 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/chunk/valid.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/ner/valid.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . 
O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/pos/valid.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . 
O O 69 | -------------------------------------------------------------------------------- /corpus_test.py: -------------------------------------------------------------------------------- 1 | from vecto.corpus import ViewCorpus 2 | 3 | path = "./tests/data/corpora/multiple_small" 4 | corpus = ViewCorpus(path) 5 | corpus.load_dir_strucute() 6 | print("three is ", corpus.tree) 7 | 8 | # TODO: move all this to unittests 9 | print("testing get offset") 10 | for q in [9, 11]: 11 | for start in [True, False]: 12 | print("search ", q, " with start=", start) 13 | pos, offset = corpus.get_file_and_offset(q, start_of_range=start, epsilon=2) 14 | print("pos", pos, ", offset", offset, "\n") 15 | pos, offset = corpus.get_file_and_offset(15, start_of_range=False, epsilon=2) 16 | print("pos", pos, ", offset", offset, "\n") 17 | 18 | 19 | print("testing get get_line_iterator") 20 | cnt_workers = 2 21 | for i in range(cnt_workers): 22 | iterator = corpus.get_line_iterator(i, cnt_workers) 23 | print("worker", i, iterator) 24 | for line in iterator: 25 | print("line", line) 26 | 27 | # rank 0 creates corpus from dir 28 | # corpus has inside all file list and sizes 29 | # use manually splits sends metadata of corpus : tree of dirs and files with uncompressed sizes to all workers 30 | # otehr workers create corpora from that metadata using special service method like __from_metadata 31 | # to avoid exessive file IO 32 | 33 | # for time being - everybody just reads from FS 34 | 35 | # # view = corpus.view(start_percent, end_pecent) 36 | # print(corpus) 37 | # iter_token = corpus.get_line_iterator() 38 | # for s in iter_token: 39 | # print(s) 40 | 41 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/1: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_no_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LRCos", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.608651" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_no_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LRCos", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.616280" 41 | }, 42 | "result": -1 43 | } 44 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/4: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_with_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": 
"0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LRCos", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.590643" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_with_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LRCos", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.598640" 41 | }, 42 | "result": -1 43 | } 44 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/3: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_no_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LinearOffset", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.620866" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_no_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LinearOffset", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.621908" 41 | }, 42 | "result": -1 43 | } 44 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/2: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_with_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LinearOffset", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.618278" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_with_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LinearOffset", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.619183" 41 | }, 42 | "result": -1 43 | } 44 | ] 
-------------------------------------------------------------------------------- /vecto/benchmarks/outliers/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | 5 | from vecto.utils.data import save_json 6 | from vecto.benchmarks.outliers import * 7 | from vecto.embeddings import load_from_dir 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | 11 | 12 | def print_json(data): 13 | print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) 14 | 15 | 16 | def select_method(key): 17 | options = {} 18 | if key == 'AveragePairwiseCosine': 19 | method = AveragePairwiseCosine(options) 20 | else: 21 | raise RuntimeError('The method name was not recognized.') 22 | return method 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('embeddings') 28 | parser.add_argument('dataset') 29 | parser.add_argument('--method', help='Outlier detection method', default='AveragePairwiseCosine') 30 | parser.add_argument('--path_out', help='Destination folder to save the results') 31 | args = parser.parse_args() 32 | embeddings = load_from_dir(args.embeddings) 33 | benchmark = select_method(args.method) 34 | results = benchmark.get_result(embeddings, args.dataset) 35 | if args.path_out: 36 | if path.isdir(args.path_out) or args.path_out.endswith('/'): 37 | dataset = path.basename(path.normpath(args.dataset)) 38 | name_file_out = path.join(args.path_out, dataset, args.method, 'results.json') 39 | save_json(results, name_file_out) 40 | else: 41 | save_json(results, args.path_out) 42 | else: 43 | print_json(results) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /vecto/benchmarks/synonymy_detection/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | 5 | from vecto.utils.data import save_json 6 | from vecto.benchmarks.synonymy_detection import * 7 | from vecto.embeddings import load_from_dir 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | 11 | 12 | def print_json(data): 13 | print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) 14 | 15 | 16 | def select_method(key): 17 | options = {} 18 | if key == 'CosineDistance': 19 | method = CosineDistance(options) 20 | else: 21 | raise RuntimeError('The method name was not recognized.') 22 | return method 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('embeddings') 28 | parser.add_argument('dataset') 29 | parser.add_argument('--method', help='Synonymy detection method', default='CosineDistance') 30 | parser.add_argument('--path_out', help='Destination folder to save the results') 31 | args = parser.parse_args() 32 | embeddings = load_from_dir(args.embeddings) 33 | benchmark = select_method(args.method) 34 | results = benchmark.get_result(embeddings, args.dataset) 35 | if args.path_out: 36 | if path.isdir(args.path_out) or args.path_out.endswith('/'): 37 | dataset = path.basename(path.normpath(args.dataset)) 38 | name_file_out = path.join(args.path_out, dataset, args.method, 'results.json') 39 | save_json(results, name_file_out) 40 | else: 41 | save_json(results, args.path_out) 42 | else: 43 | print_json(results) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /vecto/utils/data.py: 
-------------------------------------------------------------------------------- 1 | import bz2 2 | import gzip 3 | import json 4 | import lzma 5 | import os 6 | 7 | 8 | def detect_archive_format_and_open(path): 9 | if path.endswith(".xz"): 10 | return lzma.open(path, mode="rt", encoding="utf-8", errors="replace") 11 | if path.endswith(".bz2"): 12 | return bz2.open(path, mode="rt", encoding="utf-8", errors="replace") 13 | if path.endswith(".gz"): 14 | return gzip.open(path, mode="rt", encoding="utf-8", errors="replace") 15 | return open(path, encoding="utf8", errors="replace") 16 | 17 | 18 | def get_uncompressed_size(path): 19 | with detect_archive_format_and_open(path) as f: 20 | size = f.seek(0, 2) 21 | return size 22 | 23 | 24 | def print_json(data): 25 | print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) 26 | 27 | 28 | def save_json(data, path): 29 | basedir = os.path.dirname(path) 30 | os.makedirs(basedir, exist_ok=True) 31 | str_data = json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False) 32 | file_out = open(path, "w") 33 | file_out.write(str_data) 34 | file_out.close() 35 | 36 | 37 | def load_json(path): 38 | f = open(path) 39 | s_data = f.read() 40 | data = json.loads(s_data) 41 | f.close() 42 | return data 43 | 44 | 45 | def jsonify(data): 46 | if isinstance(data, list): 47 | return [jsonify(item) for item in data] 48 | if isinstance(data, dict): 49 | return {jsonify(key): jsonify(value) for key, value in data.items()} 50 | if isinstance(data, int): 51 | return str(data) 52 | if type(data).__module__ == "numpy": 53 | return data.tolist() 54 | return str(data) 55 | -------------------------------------------------------------------------------- /tests/benchmarks/test_outliers.py: -------------------------------------------------------------------------------- 1 | """Tests for outliers benchmark.""" 2 | 3 | import unittest 4 | from io import StringIO 5 | from contextlib import redirect_stdout 6 | from vecto.benchmarks.outliers import * 7 | from vecto.embeddings import load_from_dir 8 | from ..test_setup import run_module 9 | 10 | path_outliers_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'outliers') 11 | 12 | 13 | class Tests(unittest.TestCase): 14 | @classmethod 15 | def test_outliers(self): 16 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 17 | outliers = AveragePairwiseCosine() 18 | outliers.get_result(embs, path_outliers_dataset) 19 | 20 | @classmethod 21 | def test_cli(self): 22 | sio = StringIO() 23 | with redirect_stdout(sio): 24 | run_module('vecto.benchmarks.outliers', 25 | './tests/data/embeddings/text/plain_with_file_header/', 26 | './tests/data/benchmarks/outliers/', 27 | '--path_out', '/tmp/vecto/benchmarks', '--method', 'AveragePairwiseCosine') 28 | 29 | def test_outliers_results(self): 30 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 31 | outliers = AveragePairwiseCosine() 32 | result = outliers.get_result(embs, path_outliers_dataset)['test'] 33 | amount_of_categories = 2 34 | # TODO: refactor to be understandable, check if ok after covab to UNK 35 | amount_of_word_in_cats = 4 36 | 37 | self.assertEqual(len(result.keys()), amount_of_categories) 38 | self.assertEqual(len(result['cats']), amount_of_word_in_cats) -------------------------------------------------------------------------------- /examples/analogy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from vecto.benchmarks.analogy import Analogy\n", 10 | "import vecto.embeddings" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "embeddings = vecto.embeddings.load_from_dir(\"/storage/data/NLP/embeddings/6b.wiki_giga\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "analogy = Analogy()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "analogy.get_result()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.6.3" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /vecto/embeddings/legacy_w2v.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from vecto.vocabulary import Vocabulary 4 | from .dense import WordEmbeddingsDense 5 | 6 | 7 | class ModelW2V(WordEmbeddingsDense): 8 | """extends dense embeddings to support loading 9 | of original binary format from Mikolov's w2v""" 10 | 11 | @staticmethod 12 | def _load_word(file): 13 | result = b'' 14 | w = b'' 15 | while w != b' ': 16 | w = file.read(1) 17 | result = result + w 18 | return result[:-1] 19 | 20 | def load_from_file(self, filename): 21 | self.vocabulary = Vocabulary() 22 | f = open(filename, "rb") 23 | header = f.readline().split() 24 | cnt_rows = int(header[0]) 25 | size_row = int(header[1]) 26 | # self.name += "_{}".format(size_row) 27 | self.matrix = np.zeros((cnt_rows, size_row), dtype=np.float32) 28 | # logger.debug("cnt rows = {}, size row = {}".format(cnt_rows, size_row)) 29 | for i in range(cnt_rows): 30 | word = ModelW2V._load_word(f).decode( 31 | 'UTF-8', errors="ignore").strip() 32 | self.vocabulary.dic_words_ids[word] = i 33 | self.vocabulary.lst_words.append(word) 34 | s_row = f.read(size_row * 4) 35 | row = np.fromstring(s_row, dtype=np.float32) 36 | # row = row / np.linalg.norm(row) 37 | self.matrix[i] = row 38 | f.close() 39 | 40 | def load_from_dir(self, path): 41 | # self.name += "w2v_" + os.path.basename(os.path.normpath(path)) 42 | filename = [file for file in os.listdir(path) if file.endswith("bin")][0] 43 | self.load_from_file(os.path.join(path, filename)) 44 | # self.load_from_file(os.path.join(path, "vectors.bin")) 45 | # self.load_provenance(path) 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project-specific 2 | 3 | _autosummary 4 | 5 | 6 | # Byte-compiled / optimized / DLL files 7 | 
__pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # idea 112 | .idea/ 113 | -------------------------------------------------------------------------------- /tests/benchmarks/test_misc.py: -------------------------------------------------------------------------------- 1 | """Tests for embeddings module.""" 2 | 3 | import unittest 4 | import io 5 | import contextlib 6 | from tests.test_setup import run_module 7 | import vecto 8 | import vecto.benchmarks 9 | import vecto.benchmarks.base 10 | from os import path 11 | 12 | # from shutil import rmtree 13 | 14 | path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity') 15 | path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling') 16 | path_language_modeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'language_modeling') 17 | 18 | 19 | class Tests(unittest.TestCase): 20 | 21 | # def test_fetcher(self): 22 | # if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')): 23 | # return 24 | # fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test')) 25 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 26 | # similarity = Similarity() 27 | # path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en') 28 | # similarity.get_result(embs, path_similarity_dataset) 29 | 30 | def test_abc(self): 31 | with self.assertRaises(NotImplementedError): 32 | vecto.benchmarks.base.Benchmark() 33 | # base.get_result(1, 2) 34 | 35 | def test_cli(self): 36 | with self.assertRaises(SystemExit): 37 | sio = io.StringIO() 38 | with contextlib.redirect_stdout(sio): 39 | run_module("vecto", 40 | "benchmark", 41 | "WRONG_NAME", 42 | "path_embs") 43 | 44 | sio = io.StringIO() 45 | with contextlib.redirect_stdout(sio): 46 | run_module("vecto", 47 | "benchmark", 48 | "help") 49 | 
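The test modules in this listing import a ``run_module`` helper from ``tests/test_setup.py``, which is not included in this dump. As a hedged sketch (an assumption about its behaviour, not the project's actual implementation), an equivalent helper only needs to emulate ``python -m <module> <args...>`` in-process so that ``redirect_stdout`` can capture the output:

import runpy
import sys

def run_module(name, *args):
    # Run `name` as if invoked with `python -m`, passing `args` via argv.
    argv_backup = sys.argv
    sys.argv = [name] + list(args)
    try:
        runpy.run_module(name, run_name="__main__")
    finally:
        sys.argv = argv_backup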
-------------------------------------------------------------------------------- /vecto/benchmarks/text_classification/__init__.py: -------------------------------------------------------------------------------- 1 | """Text classification benchmark. 2 | 3 | One of the pre-defined models is trained to convergence 4 | to predict labels for text fragments in a provided dataset. 5 | Sentiment analysis is an example of text classification task. 6 | 7 | .. autosummary:: 8 | :toctree: _autosummary 9 | 10 | text_classification 11 | """ 12 | 13 | import argparse 14 | from .text_classification import Text_classification as Benchmark 15 | # TODO: figure out where to put it better 16 | from .text_classification import load_model, predict, get_vectors 17 | 18 | 19 | def add_extra_args(parser): 20 | parser.add_argument("embeddings") 21 | parser.add_argument("dataset") 22 | parser.add_argument('--batchsize', '-b', type=int, default=64, 23 | help='Number of images in each mini-batch') 24 | parser.add_argument('--epoch', '-e', type=int, default=30, 25 | help='Number of sweeps over the dataset to train') 26 | parser.add_argument('--gpu', '-g', type=int, default=-1, 27 | help='GPU ID (negative value indicates CPU)') 28 | parser.add_argument('--layer', '-l', type=int, default=1, 29 | help='Number of layers of RNN or MLP following CNN') 30 | parser.add_argument('--dropout', '-d', type=float, default=0.4, 31 | help='Dropout rate') 32 | parser.add_argument('--model', '-model', default='cnn', 33 | choices=['cnn', 'rnn', 'bow'], 34 | help='Name of encoder model type') 35 | # args = parser.parse_args(extra_args) 36 | # embeddings = load_from_dir(args.embeddings) 37 | # text_classification = Text_classification(batchsize=args.batchsize, epoch=args.epoch, gpu=args.gpu, 38 | # layer=args.layer, dropout=args.dropout, model=args.model) 39 | # text_classification.run_with_args(args) 40 | -------------------------------------------------------------------------------- /vecto/benchmarks/categorization/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from .categorization import KMeansCategorization as Benchmark 3 | from .categorization import purity_score 4 | from vecto.embeddings import load_from_dir 5 | from vecto.utils.data import save_json, print_json 6 | from vecto.utils import get_time_str 7 | 8 | 9 | def select_method(key): 10 | options = {} 11 | # if key == 'SpectralCategorization': 12 | # method = SpectralCategorization(options) 13 | if key == 'KMeansCategorization': 14 | method = KMeansCategorization(options) 15 | else: 16 | raise RuntimeError('The method name was not recognized.') 17 | return method 18 | 19 | 20 | def add_extra_args(parser): 21 | parser.add_argument('embeddings') 22 | parser.add_argument('dataset') 23 | # TODO: move method selection to benchmark class 24 | # parser.add_argument('--method', help='Categorization method', default='KMeansCategorization') 25 | # args = parser.parse_args(extra_args) 26 | # embeddings = load_from_dir(args.embeddings) 27 | # benchmark = select_method(args.method) 28 | # results = benchmark.get_result(embeddings, args.dataset) 29 | # if args.path_out: 30 | # # TODO: this does not seem to work if the dir does not exist 31 | # # let us always assume dir, clean this up later if no better idea 32 | # # if path.isdir(args.path_out) or args.path_out.endswith('/'): 33 | # dataset = path.basename(path.normpath(args.dataset)) 34 | # timestamp = get_time_str() 35 | # name_file_out = path.join(args.path_out, 36 | 
# dataset, 37 | # args.method, 38 | # timestamp, 39 | # 'results.json') 40 | # save_json(results, name_file_out) 41 | # # else: 42 | # # save_json(results, args.path_out) 43 | # else: 44 | # print_json(results) 45 | -------------------------------------------------------------------------------- /tests/data/vocabs/plain/vocab.tsv: -------------------------------------------------------------------------------- 1 | #word frequency 2 | , 671 3 | . 406 4 | of 345 5 | the 330 6 | to 324 7 | and 271 8 | her 184 9 | was 177 10 | his 176 11 | a 170 12 | it 146 13 | in 145 14 | for 119 15 | be 118 16 | she 107 17 | he 101 18 | as 97 19 | i 95 20 | that 92 21 | not 82 22 | their 70 23 | him 69 24 | by 65 25 | had 63 26 | which 63 27 | but 62 28 | at 60 29 | them 60 30 | no 60 31 | have 58 32 | with 57 33 | so 56 34 | on 54 35 | you 54 36 | is 50 37 | from 47 38 | would 47 39 | they 45 40 | could 45 41 | will 44 42 | dashwood 42 43 | ! 42 44 | my 39 45 | were 38 46 | more 38 47 | than 37 48 | very 36 49 | mrs 35 50 | all 34 51 | any 34 52 | mother 33 53 | house 32 54 | such 31 55 | every 29 56 | elinor 27 57 | this 26 58 | do 26 59 | norland 25 60 | own 25 61 | what 25 62 | if 25 63 | who 24 64 | an 24 65 | been 23 66 | one 23 67 | much 23 68 | or 23 69 | john 21 70 | your 21 71 | might 20 72 | pounds 19 73 | when 19 74 | think 19 75 | said 19 76 | himself 18 77 | too 18 78 | should 18 79 | great 17 80 | only 17 81 | how 17 82 | must 17 83 | may 17 84 | are 16 85 | there 16 86 | can 16 87 | far 15 88 | make 15 89 | though 15 90 | marianne 15 91 | soon 14 92 | father 14 93 | thousand 14 94 | well 14 95 | did 14 96 | some 14 97 | we 14 98 | man 13 99 | sister 13 100 | mr 13 101 | present 13 102 | first 13 103 | other 13 104 | time 13 105 | give 13 106 | now 13 107 | herself 13 108 | sure 13 109 | shall 13 110 | edward 13 111 | many 12 112 | opinion 12 113 | into 12 114 | fortune 12 115 | half 12 116 | really 12 117 | sisters 12 118 | thing 12 119 | enough 12 120 | day 12 121 | me 12 122 | say 12 123 | taste 12 124 | good 11 125 | years 11 126 | three 11 127 | comfortable 11 128 | handsome 11 129 | little 11 130 | love 11 131 | ? 11 132 | am 11 133 | barton 11 134 | before 10 135 | heart 10 136 | gave 10 137 | child 10 138 | most 10 139 | then 10 140 | feel 10 141 | ever 10 142 | beyond 10 143 | see 10 144 | -------------------------------------------------------------------------------- /docs/source/tutorial/visualization.rst: -------------------------------------------------------------------------------- 1 | Visualization 2 | ============= 3 | 4 | .. currentmodule:: vecto 5 | 6 | When you have the numerical vectors for the units you are interested in, you can use all the goodies of matplotlib to create any kind of visualizaion you like. The visualize module of Vecto provides a few simple examples to get you started and/or quickly explore your model as you go. 7 | 8 | The `visualize` module of vecto comes with several functions to quickly explore the representations. 9 | 10 | Drawing features 11 | ---------------- 12 | 13 | >>> from vecto import visualize as vz 14 | >>> vs.draw_features(vsm, ["apple", "pear", "cat", "dog"], num_features=20) 15 | 16 | .. image:: images/draw_features.png 17 | 18 | TODO: how to interpret this. 19 | 20 | Visualizing similarity between certain words. 21 | --------------------------------------------- 22 | 23 | >>> vs.draw_features_and_similarity(vsm, ["apple", "pear", "cat", "dog"]) 24 | 25 | .. 
image:: images/draw_similarity.png 26 | 27 | The color intensity indicates the degree of similarity. We can see that apple is more similar to pear than to cat or dog, and the other way round. 28 | 29 | Visualizing dimensions 30 | ---------------------- 31 | 32 | In a dense VSM, each dimension on its own is `not likely to be an interpretable semantic feature on its own `_. Still, it is the overall pattern of the dimensions that encodes the meaning of any given language unit, and so it may be useful to visually inspect them. 33 | 34 | >>> vs.std_to_img(vsm.get_row("apple")) 35 | 36 | .. image:: images/std_to_img.png 37 | 38 | >>> vs.std_to_img(vsm.get_row("cat")) 39 | 40 | .. image:: images/cat.png 41 | 42 | 43 | The `rows_to_img_tips` function displays only the end points of all dimensions in a given collection of vectors. 44 | 45 | >>> vectors = vs.wordlist_to_rows(vsm, ["apple", "pear", "cat", "dog"]) 46 | >>> vs.rows_to_img_tips(vectors, max_y=0.8) 47 | 48 | .. image:: images/img_tips.png 49 | -------------------------------------------------------------------------------- /vecto/data/io.py: -------------------------------------------------------------------------------- 1 | from requests import get 2 | from vecto.corpus.tokenization import word_tokenize_txt 3 | 4 | 5 | # TODO: move this to corpus module 6 | def normalize_text(text): 7 | return text.strip().lower() 8 | 9 | 10 | def read_first_col_is_label_format(path, char_based=False): 11 | dataset = [] 12 | with open(path, encoding='utf-8', errors='ignore') as f: 13 | for i, l in enumerate(f): 14 | if i == 0: 15 | continue 16 | if len(l.strip()) < 3: 17 | continue 18 | label, text = l.strip().split("\t", 1) 19 | # TODO: make lower-casing optional 20 | text = normalize_text(text) 21 | label = int(label) 22 | # if char_based: 23 | # tokens = list(text) 24 | # else: 25 | # tokens = word_tokenize_txt(text) 26 | dataset.append((text, label)) 27 | return dataset 28 | 29 | # TODO: detect where the label is or specify format 30 | def read_tsv_label_last(path): 31 | dataset = [] 32 | with open(path, encoding='utf-8', errors='ignore') as f: 33 | for i, l in enumerate(f): 34 | if len(l.strip()) < 3: 35 | continue 36 | text, label = l.strip().split("\t", 1) 37 | # print(label) 38 | if label == "label": 39 | continue 40 | # TODO: make lower-casing optional 41 | text = normalize_text(text) 42 | label = int(label) 43 | # TODO: move tokenization to another layer 44 | # if char_based: 45 | # tokens = list(text) 46 | # else: 47 | # tokens = word_tokenize_txt(text) 48 | dataset.append((text, label)) 49 | return dataset 50 | 51 | 52 | def fetch_file(url, path, chunk_size=512): 53 | myfile = get(url, allow_redirects=True) 54 | open(path, 'wb').write(myfile.content) 55 | #response = get(url, stream=True) 56 | #handle = open(path, 'wb') 57 | #for chunk in response.iter_content(chunk_size=chunk_size): 58 | # if chunk: 59 | # handle.write(chunk) 60 | #handle.close() -------------------------------------------------------------------------------- /examples/most_similar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import vecto\n", 10 | "import vecto.embeddings" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 8, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "embeddings = vecto.embeddings.load_from_dir(\"/storage/data/NLP/embeddings/6b.wiki_giga\")" 20 | ]
21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 10, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "embeddings.cache_normalized_copy()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 11, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "[['apple', 1.0],\n", 40 | " ['iphone', 0.79935205],\n", 41 | " ['macintosh', 0.79181653],\n", 42 | " ['ipod', 0.78805625],\n", 43 | " ['microsoft', 0.7831917],\n", 44 | " ['ipad', 0.781405],\n", 45 | " ['intel', 0.77287817],\n", 46 | " ['ibm', 0.7643097],\n", 47 | " ['google', 0.7641237],\n", 48 | " ['imac', 0.753626]]" 49 | ] 50 | }, 51 | "execution_count": 11, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "embeddings.get_most_similar_words(\"apple\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.6.3" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /tests/benchmarks/test_relation_extraction.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks import visualize 8 | from vecto.embeddings import load_from_dir 9 | from vecto.data import Dataset 10 | from tests.test_setup import run_module 11 | 12 | 13 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 14 | path_dataset = path.join('tests', 'data', 'benchmarks', 'relation_extraction') 15 | 16 | 17 | class Tests(unittest.TestCase): 18 | # def test_api(self): 19 | # embs = load_from_dir(path_emb) 20 | 21 | # for method in ['lr', '2FFNN']: 22 | # sequence_labeling = Sequence_labeling(method=method) 23 | # for subtask in ['chunk', 'pos', 'ner']: # , 'chunk', 'pos', 'ner' 24 | # result = sequence_labeling.get_result(embs, path.join(path_sequence_labeling_dataset, subtask)) 25 | # self.assertIsInstance(result[0], dict) 26 | # print(result) 27 | 28 | def test_cli(self): 29 | sio = io.StringIO() 30 | with contextlib.redirect_stdout(sio): 31 | run_module("vecto", 32 | "benchmark", 33 | "relation_extraction", 34 | path_emb, 35 | path_dataset, 36 | "--path_out", "/tmp/vecto/benchmarks/") 37 | 38 | with self.assertRaises(FileNotFoundError): 39 | sio = io.StringIO() 40 | with contextlib.redirect_stdout(sio): 41 | run_module("vecto", 42 | "benchmark", 43 | "relation_extraction", 44 | path_emb + "NONEXISTING", 45 | path_dataset, 46 | "--path_out", 47 | "/tmp/vecto/benchmarks/") 48 | 49 | from matplotlib import pyplot as plt 50 | visualize.plot_accuracy("/tmp/vecto/benchmarks/relation_extraction", key_secondary="experiment_setup.dataset") 51 | plt.savefig("/tmp/vecto/benchmarks/relation_extraction.pdf", bbox_inches="tight") 52 | -------------------------------------------------------------------------------- /vecto/vocabulary/__main__.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from .vocabulary import create_ngram_tokens_from_dir, create_from_annotated_dir 4 | from .vocabulary import create_from_path 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--type', '-t', choices=['normal', 'annotated', 'ngram_tokens'], 10 | default='normal', 11 | help='vocab type') 12 | parser.add_argument('--min_ngram', '-minn', default=2, type=int, 13 | help='minimal number of ngrams') 14 | parser.add_argument('--max_ngram', '-maxn', default=3, type=int, 15 | help='maximal number of ngrams') 16 | parser.add_argument('--min_frequency', '-minf', default=100, type=int, 17 | help='minimal token frequency') 18 | parser.add_argument('--context_representation', '-cp', choices=['word', 'deps', 'ne', ], 19 | default='word', 20 | help='context representation; ' 21 | 'the annotated corpus is required') 22 | parser.add_argument('--path_corpus', help='path to the corpus', required=True) 23 | parser.add_argument('--path_out', help='path to save vocab', required=True) 24 | 25 | args = parser.parse_args() 26 | return args 27 | 28 | 29 | def run(args): 30 | print(args.type) 31 | if args.type == "normal": 32 | v = create_from_path(args.path_corpus, args.min_frequency) 33 | v.save_to_dir(os.path.join(args.path_out, args.type)) 34 | if args.type == "annotated": 35 | v = create_from_annotated_dir(args.path_corpus, args.min_frequency, args.context_representation) 36 | v.save_to_dir(os.path.join(args.path_out, args.type, args.context_representation)) 37 | if args.type == "ngram_tokens": 38 | v = create_ngram_tokens_from_dir(args.path_corpus, args.min_ngram, args.max_ngram, args.min_frequency) 39 | v.save_to_dir(os.path.join(args.path_out, args.type, str(args.min_ngram), str(args.max_ngram))) 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | # print(args) 45 | run(args) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /tests/benchmarks/test_synonymy_detection.py: -------------------------------------------------------------------------------- 1 | """Tests for synonymy detection benchmark.""" 2 | 3 | import unittest 4 | from io import StringIO 5 | from contextlib import redirect_stdout 6 | from vecto.benchmarks.synonymy_detection import * 7 | from vecto.embeddings import load_from_dir 8 | from ..test_setup import run_module 9 | 10 | path_synonymy_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'synonymy_detection') 11 | 12 | 13 | class Tests(unittest.TestCase): 14 | @classmethod 15 | def test_synonymy(self): 16 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 17 | synonymy = CosineDistance() 18 | synonymy.get_result(embs, path_synonymy_dataset) 19 | 20 | @classmethod 21 | def test_cli(self): 22 | sio = StringIO() 23 | with redirect_stdout(sio): 24 | run_module('vecto.benchmarks.synonymy_detection', 25 | './tests/data/embeddings/text/plain_with_file_header/', 26 | './tests/data/benchmarks/synonymy_detection', 27 | '--path_out', '/tmp/vecto/benchmarks', '--method', 'CosineDistance') 28 | 29 | def test_synonymy_results(self): 30 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 31 | synonymy = CosineDistance() 32 | result = synonymy.get_result(embs, path_synonymy_dataset)['test'] 33 | cat_is_synonym = 'yes' 34 | cat_is_hit = False 35 | distance_to_cat
= 1.0 36 | 37 | self.assertEqual(result['tiger'][0]['is_synonym'], cat_is_synonym) 38 | self.assertEqual(result['tiger'][0]['hit'], cat_is_hit) 39 | self.assertEqual(result['tiger'][0]['distance'], distance_to_cat) 40 | 41 | def test_synonymy_reader(self): 42 | synonymy = CosineDistance() 43 | test_set = synonymy.read_test_set(path.join(path_synonymy_dataset, 'test.csv')) 44 | expected_amount_of_keys = 2 45 | expected_amount_of_tiger_suspicious = 3 46 | cat_is_synonym_with_tiger = 'yes' 47 | 48 | self.assertEqual(len(test_set.keys()), expected_amount_of_keys) 49 | self.assertEqual(len(test_set['tiger']), expected_amount_of_tiger_suspicious) 50 | self.assertEqual(test_set['tiger'][0][1], cat_is_synonym_with_tiger) 51 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/chunk/test.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | 70 | Spanish NNP I-NP I-MISC 71 | Farm NNP I-NP O 72 | Minister NNP I-NP O 73 | Loyola NNP I-NP I-PER 74 | de NNP I-NP I-PER 75 | Palacio NNP I-NP I-PER 76 | had VBD I-VP O 77 | earlier RBR I-VP O 78 | accused VBN I-VP O 79 | Fischler NNP I-NP I-PER 80 | at IN I-PP O 81 | an DT I-NP O 82 | EU JJ I-NP I-ORG 83 | farm NN I-NP O 84 | ministers NNS I-NP O 85 | ' POS B-NP O 86 | meeting NN I-NP O 87 | of IN I-PP O 88 | causing VBG I-VP O 89 | unjustified JJ I-ADJP O 90 | alarm NN I-NP O 91 | through IN I-PP O 92 | " " O O 93 | dangerous JJ I-NP O 94 | generalisation NN I-NP O 95 | . . O O 96 | " " O O 97 | 98 | . . O O 99 | 100 | Only RB I-NP O 101 | France NNP I-NP I-LOC 102 | and CC I-NP O 103 | Britain NNP I-NP I-LOC 104 | backed VBD I-VP O 105 | Fischler NNP I-NP I-PER 106 | 's POS B-NP O 107 | proposal NN I-NP O 108 | . . 
O O 109 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/ner/test.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | 70 | Spanish NNP I-NP I-MISC 71 | Farm NNP I-NP O 72 | Minister NNP I-NP O 73 | Loyola NNP I-NP I-PER 74 | de NNP I-NP I-PER 75 | Palacio NNP I-NP I-PER 76 | had VBD I-VP O 77 | earlier RBR I-VP O 78 | accused VBN I-VP O 79 | Fischler NNP I-NP I-PER 80 | at IN I-PP O 81 | an DT I-NP O 82 | EU JJ I-NP I-ORG 83 | farm NN I-NP O 84 | ministers NNS I-NP O 85 | ' POS B-NP O 86 | meeting NN I-NP O 87 | of IN I-PP O 88 | causing VBG I-VP O 89 | unjustified JJ I-ADJP O 90 | alarm NN I-NP O 91 | through IN I-PP O 92 | " " O O 93 | dangerous JJ I-NP O 94 | generalisation NN I-NP O 95 | . . O O 96 | " " O O 97 | 98 | . . O O 99 | 100 | Only RB I-NP O 101 | France NNP I-NP I-LOC 102 | and CC I-NP O 103 | Britain NNP I-NP I-LOC 104 | backed VBD I-VP O 105 | Fischler NNP I-NP I-PER 106 | 's POS B-NP O 107 | proposal NN I-NP O 108 | . . O O 109 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/pos/test.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . 
O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | 70 | Spanish NNP I-NP I-MISC 71 | Farm NNP I-NP O 72 | Minister NNP I-NP O 73 | Loyola NNP I-NP I-PER 74 | de NNP I-NP I-PER 75 | Palacio NNP I-NP I-PER 76 | had VBD I-VP O 77 | earlier RBR I-VP O 78 | accused VBN I-VP O 79 | Fischler NNP I-NP I-PER 80 | at IN I-PP O 81 | an DT I-NP O 82 | EU JJ I-NP I-ORG 83 | farm NN I-NP O 84 | ministers NNS I-NP O 85 | ' POS B-NP O 86 | meeting NN I-NP O 87 | of IN I-PP O 88 | causing VBG I-VP O 89 | unjustified JJ I-ADJP O 90 | alarm NN I-NP O 91 | through IN I-PP O 92 | " " O O 93 | dangerous JJ I-NP O 94 | generalisation NN I-NP O 95 | . . O O 96 | " " O O 97 | 98 | . . O O 99 | 100 | Only RB I-NP O 101 | France NNP I-NP I-LOC 102 | and CC I-NP O 103 | Britain NNP I-NP I-LOC 104 | backed VBD I-VP O 105 | Fischler NNP I-NP I-PER 106 | 's POS B-NP O 107 | proposal NN I-NP O 108 | . . O O 109 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | vecto 2 | ***** 3 | 4 | .. image:: https://api.travis-ci.com/vecto-ai/vecto.svg?branch=master 5 | :target: https://travis-ci.com/vecto-ai/vecto 6 | :alt: build status from Travis CI 7 | 8 | .. image:: https://ci.appveyor.com/api/projects/status/github/vecto-ai/vecto?branch=master&svg=true 9 | :target: https://ci.appveyor.com/project/undertherain/vecto 10 | :alt: build status from AppVeyor 11 | 12 | .. image:: https://coveralls.io/repos/github/vecto-ai/vecto/badge.svg?branch=master 13 | :target: https://coveralls.io/github/vecto-ai/vecto?branch=master 14 | :alt: coveralls badge 15 | 16 | .. image:: https://api.codacy.com/project/badge/Grade/65aabe10113d45819091d005414462ca 17 | :target: https://www.codacy.com/app/undertherain/vecto 18 | :alt: grade from Codacy 19 | 20 | .. image:: https://badge.fury.io/py/vecto.svg 21 | :target: https://badge.fury.io/py/vecto 22 | :alt: pypi version 23 | 24 | .. image:: https://badges.gitter.im/badge.svg 25 | :alt: Join the chat at https://gitter.im/vecto-ai/Lobby 26 | :target: https://gitter.im/vecto-ai/Lobby 27 | 28 | Vecto helps to perform a range of tasks within the framework of vector space models of computational linguistics. 29 | 30 | What functionality is included 31 | ============================== 32 | 33 | * creating word embeddings by counting and neural-based methods, including sub-word-level models; 34 | * importing and exporting from a number of popular formats of word embeddings and providing unified access to word vectors; 35 | * perfroming a range of downstream tasks / benchmarks; 36 | * visualising embeddings. 37 | 38 | How do I get set up? 
39 | ==================== 40 | 41 | * ``pip3 install vecto`` for stable version 42 | * ``pip3 install git+https://github.com/vecto-ai/vecto.git`` for latest dev version 43 | * Python 3.6 or later is required 44 | 45 | 📖 Documentation 46 | ================ 47 | 48 | =================== === 49 | `Tutorial`_ vecto overview and end-to-end examples. 50 | `API Reference`_ The detailed reference for vecto API. 51 | `Contribute`_ How to contribute to the vecto project and code base. 52 | =================== === 53 | 54 | .. _Tutorial: http://vecto.readthedocs.io/en/docs/tutorial/index.html 55 | .. _API Reference: http://vecto.readthedocs.io/en/docs/reference/index.html 56 | .. _Contribute: http://vecto.readthedocs.io/en/docs/contribution.html 57 | -------------------------------------------------------------------------------- /vecto/utils/metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .data import load_json, save_json 3 | from vecto._version import VERSION 4 | 5 | METADATA_SUFFIX = 'metadata.json' 6 | 7 | 8 | def make_metadata_path(fname): 9 | if os.path.isdir(fname): 10 | return os.path.join(fname, METADATA_SUFFIX) 11 | return '{}.{}'.format(fname, METADATA_SUFFIX) 12 | 13 | 14 | def save_metadata(data, base_path): 15 | save_json(data, make_metadata_path(base_path)) 16 | 17 | 18 | def try_load_metadata(base_path): 19 | try: 20 | return load_json(make_metadata_path(base_path)) 21 | except IOError: 22 | return {} 23 | 24 | 25 | def get_full_typename(obj): 26 | # cls = type(obj) 27 | if obj.__class__.__name__ == 'function': 28 | clsname = obj.__name__ 29 | else: 30 | clsname = obj.__class__.__name__ 31 | return '{}.{}'.format(obj.__module__, clsname) 32 | 33 | 34 | class WithMetaData(object): 35 | """ 36 | Base object for all objects with metadata. Contains utilities for metadata loading from files, storing to files, 37 | collecting/merging etc. 38 | 39 | User of this class is responsible for calling __init__ or init_metadata and save_metadata 40 | in proper places of inheritor. 
41 | """ 42 | 43 | def __init__(self, base_path=None, **other_metadata): 44 | """ 45 | see init_metadata 46 | """ 47 | self.metadata = {} 48 | self.init_metadata(base_path=base_path, **other_metadata) 49 | 50 | def init_metadata(self, base_path=None, **other_metadata): 51 | """ 52 | :param base_path: path from which metadata.json path will be constructed 53 | :param other_metadata: anything json serializable 54 | """ 55 | # self._metadata = {"vecto_version": VERSION} 56 | if base_path is not None: 57 | self.metadata['_base_path'] = base_path 58 | self.load_metadata(base_path) 59 | self.metadata.update(other_metadata) 60 | self.metadata['_class'] = get_full_typename(self) 61 | 62 | def save_metadata(self, base_path): 63 | """ 64 | :param base_path: path from which metadata.json path will be constructed 65 | """ 66 | save_metadata(self.metadata, base_path) 67 | 68 | def load_metadata(self, base_path): 69 | """ 70 | :param base_path: path from which metadata.json path will be constructed 71 | """ 72 | self.metadata.update(try_load_metadata(base_path)) 73 | -------------------------------------------------------------------------------- /vecto/benchmarks/text_classification/nlp_utils.py: -------------------------------------------------------------------------------- 1 | # import collections 2 | # import io 3 | 4 | import numpy 5 | 6 | import chainer 7 | from chainer.backends import cuda 8 | 9 | 10 | def normalize_text(text): 11 | return text.strip().lower() 12 | 13 | 14 | # def make_vocab(dataset, max_vocab_size=20000, min_freq=2): 15 | # counts = collections.defaultdict(int) 16 | # for tokens, _ in dataset: 17 | # for token in tokens: 18 | # counts[token] += 1 19 | # 20 | # vocab = {'': 0, '': 1} 21 | # for w, c in sorted(counts.items(), key=lambda x: (-x[1], x[0])): 22 | # if len(vocab) >= max_vocab_size or c < min_freq: 23 | # break 24 | # vocab[w] = len(vocab) 25 | # return vocab 26 | 27 | 28 | # def read_vocab_list(path, max_vocab_size=20000): 29 | # vocab = {'': 0, '': 1} 30 | # with io.open(path, encoding='utf-8', errors='ignore') as f: 31 | # for l in f: 32 | # w = l.strip() 33 | # if w not in vocab and w: 34 | # vocab[w] = len(vocab) 35 | # if len(vocab) >= max_vocab_size: 36 | # break 37 | # return vocab 38 | 39 | 40 | def make_array(tokens, vocab, add_eos=True): 41 | ids = [vocab[token] for token in tokens if token in vocab ] 42 | ids.append(0) 43 | return numpy.array(ids, numpy.int32) 44 | 45 | 46 | def transform_to_array(dataset, vocab, with_label=True): 47 | if with_label: 48 | return [(make_array(tokens, vocab), numpy.array([cls], numpy.int32)) 49 | for tokens, cls in dataset] 50 | else: 51 | return [make_array(tokens, vocab) 52 | for tokens in dataset] 53 | 54 | 55 | def convert_seq(batch, device=None, with_label=True): 56 | def to_device_batch(batch): 57 | if device is None: 58 | return batch 59 | elif device < 0: 60 | return [chainer.dataset.to_device(device, x) for x in batch] 61 | else: 62 | xp = cuda.cupy.get_array_module(*batch) 63 | concat = xp.concatenate(batch, axis=0) 64 | sections = numpy.cumsum([len(x) 65 | for x in batch[:-1]], dtype=numpy.int32) 66 | concat_dev = chainer.dataset.to_device(device, concat) 67 | batch_dev = cuda.cupy.split(concat_dev, sections) 68 | return batch_dev 69 | 70 | if with_label: 71 | return {'xs': to_device_batch([x for x, _ in batch]), 72 | 'ys': to_device_batch([y for _, y in batch])} 73 | else: 74 | return to_device_batch([x for x in batch]) 75 | -------------------------------------------------------------------------------- 
/tests/benchmarks/test_similarity.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.similarity import Benchmark as Similarity 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from vecto.data import Dataset 11 | from tests.test_setup import run_module 12 | 13 | 14 | path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity') 15 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 16 | 17 | 18 | class Tests(unittest.TestCase): 19 | 20 | def test_api(self): 21 | embs = load_from_dir(path_emb) 22 | dataset = Dataset(path_similarity_dataset) 23 | similarity = Similarity() 24 | result = similarity.run(embs, dataset) 25 | self.assertIsInstance(result[0], dict) 26 | print(result) 27 | 28 | similarity = Similarity(ignore_oov=False) 29 | result = similarity.run(embs, dataset) 30 | self.assertIsInstance(result[0], dict) 31 | print(result) 32 | 33 | similarity = Similarity(normalize=False) 34 | result = similarity.run(embs, dataset) 35 | self.assertIsInstance(result[0], dict) 36 | print(result) 37 | 38 | def test_cli(self): 39 | sio = io.StringIO() 40 | with contextlib.redirect_stdout(sio): 41 | run_module("vecto", 42 | "benchmark", 43 | "similarity", 44 | path_emb, 45 | path_similarity_dataset, 46 | "--path_out", "/tmp/vecto/benchmarks/") 47 | 48 | sio = io.StringIO() 49 | with contextlib.redirect_stdout(sio): 50 | run_module("vecto", 51 | "benchmark", 52 | "similarity", 53 | path_emb, 54 | path_similarity_dataset, 55 | "--path_out", "/tmp/vecto/benchmarks/tmp") 56 | 57 | with self.assertRaises(FileNotFoundError): 58 | sio = io.StringIO() 59 | with contextlib.redirect_stdout(sio): 60 | run_module("vecto", 61 | "benchmark", 62 | "similarity", 63 | path_emb + "NONEXISTING", 64 | path_similarity_dataset, 65 | "--path_out", "/tmp/vecto/benchmarks/") 66 | 67 | from matplotlib import pyplot as plt 68 | visualize.plot_accuracy("/tmp/vecto/benchmarks/word_similarity", key_secondary="experiment_setup.dataset") 69 | plt.savefig("/tmp/vecto/benchmarks/similarity.pdf", bbox_inches="tight") 70 | -------------------------------------------------------------------------------- /docs/source/tutorial/basic.rst: -------------------------------------------------------------------------------- 1 | Introduction to Vecto 2 | ====================== 3 | 4 | .. currentmodule:: vecto 5 | 6 | This is the tutorial for Vecto. It describes: 7 | 8 | * What it is, and why we are developing it. 9 | * what you can do with Vecto. 10 | * the roadmap of the project. 11 | 12 | Both the library and the documentation are actively developed, check back for more! If you have questions, or would like to contribute, feel free to get in touch on `github `_. 13 | 14 | What is Vecto? 15 | ------------------- 16 | 17 | Vecto is an open-source Python library for working with vector space models (VSMs), including various word embeddings such as word2vec. Vecto can load various popular formats of VSMs and retrieve nearest neighbors of a given vector. It includes a growing list of benchmarks with which VSMs are evaluated in most current research, and a few visualization tools. It also includes a growing list of modules for creating VSMs, both explicit and based on neural networks. 18 | 19 | Why do you bother? 
20 | -------------------- 21 | 22 | There are a few other libraries for working with VSMs, including gensim and spacy. Vecto differs from them in that its primary goal is to facilitate principled, systematic research in providing **a framework for reproducible experiments** on VSMs. 23 | 24 | From the academic perspective, this matters because this is the only way to understand more about what VSMs are and what kind of meaning representation they offer. 25 | 26 | From the practical perspective, this matters because otherwise we can not tell which VSM would be the best to use for what task. Existing extrinsic evaluations of VSMs such as popular word similarity, relatedness, analogy and intrusion tasks have methodological problems and do not correlate well with performance on all extrinsic tasks. Therefore basically to pick the best representation for a task you have to try different kinds of VSMs until you find the best-performing one. 27 | 28 | Furthermore, there is the important and unpleasant part of parameter tuning and optimizing for a particular task. `Levy et al. (2015) `_ showed that the choice of hyperparameters may make more of a difference than the choice of model itself. Even more frustratingly, when you have a relatively comprehensive task covering a wide range of linguistic relations, you may find that the parameters beneficial to a part of the task are detrimental for another part `(Gladkova et al. 2016) `_. 29 | 30 | The neural parts of Vecto is implemented in `Chainer `_, a new deep learning framework that is friendly to high-performance multi-GPU environments. This should make Vecto useful in both academic and industrial settings. 31 | -------------------------------------------------------------------------------- /tests/benchmarks/test_language_modeling.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.language_modeling import Benchmark as Language_modeling 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from tests.test_setup import run_module 11 | 12 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 13 | 14 | 15 | class Tests(unittest.TestCase): 16 | 17 | def test_api(self): 18 | embs = load_from_dir(path_emb) 19 | language_modeling = Language_modeling(method='lstm') 20 | result = language_modeling.run(embs) 21 | self.assertIsInstance(result[0], dict) 22 | print(result) 23 | 24 | language_modeling = Language_modeling(method='lr') 25 | result = language_modeling.run(embs) 26 | self.assertIsInstance(result[0], dict) 27 | print(result) 28 | 29 | language_modeling = Language_modeling(method='2FFNN') 30 | result = language_modeling.run(embs) 31 | self.assertIsInstance(result[0], dict) 32 | print(result) 33 | 34 | language_modeling = Language_modeling(method='rnn') 35 | result = language_modeling.run(embs) 36 | self.assertIsInstance(result[0], dict) 37 | print(result) 38 | 39 | def test_cli(self): 40 | sio = io.StringIO() 41 | with contextlib.redirect_stdout(sio): 42 | run_module("vecto", 43 | "benchmark", 44 | "language_modeling", 45 | path_emb, 46 | "--window_size", "5", 47 | "--path_out", "/tmp/vecto/benchmarks/") 48 | 49 | sio = io.StringIO() 50 | with contextlib.redirect_stdout(sio): 51 | run_module("vecto", 52 | "benchmark", 53 | "language_modeling", 54 | path_emb, 55 | "--method", "lr", 56 | "--path_out", 
"/tmp/vecto/benchmarks/tmp") 57 | 58 | with self.assertRaises(FileNotFoundError): 59 | sio = io.StringIO() 60 | with contextlib.redirect_stdout(sio): 61 | run_module("vecto", 62 | "benchmark", 63 | "language_modeling", 64 | path_emb + "NONEXISTING", 65 | "--path_out", "/tmp/vecto/benchmarks/") 66 | 67 | from matplotlib import pyplot as plt 68 | visualize.plot_accuracy("/tmp/vecto/benchmarks/language_modeling", 69 | key_secondary="experiment_setup.dataset") 70 | plt.savefig("/tmp/vecto/benchmarks/language_modeling.pdf", 71 | bbox_inches="tight") 72 | 73 | 74 | # Tests().test_cli() 75 | -------------------------------------------------------------------------------- /tests/benchmarks/test_sequence_labeling.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.sequence_labeling import Benchmark as Sequence_labeling 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from vecto.data import Dataset 11 | from tests.test_setup import run_module 12 | 13 | 14 | path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling') 15 | path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner') # sequence labeling need to specify a sub task (pos, chunk, or ner) 16 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 17 | 18 | 19 | class Tests(unittest.TestCase): 20 | def test_api(self): 21 | embs = load_from_dir(path_emb) 22 | 23 | for method in ['lr', '2FFNN']: 24 | sequence_labeling = Sequence_labeling(method=method) 25 | for subtask in ['chunk', 'pos', 'ner']: # , 'chunk', 'pos', 'ner' 26 | dataset = Dataset(path.join(path_sequence_labeling_dataset, subtask)) 27 | result = sequence_labeling.run(embs, dataset) 28 | self.assertIsInstance(result[0], dict) 29 | print(result) 30 | 31 | def test_cli(self): 32 | sio = io.StringIO() 33 | with contextlib.redirect_stdout(sio): 34 | run_module("vecto", 35 | "benchmark", 36 | "sequence_labeling", 37 | path_emb, 38 | path_sequence_labeling_dataset_ner, 39 | "--path_out", "/tmp/vecto/benchmarks/") 40 | 41 | sio = io.StringIO() 42 | with contextlib.redirect_stdout(sio): 43 | run_module("vecto", 44 | "benchmark", 45 | "sequence_labeling", 46 | path_emb, 47 | path_sequence_labeling_dataset_ner, 48 | "--path_out", "/tmp/vecto/benchmarks/") 49 | 50 | with self.assertRaises(FileNotFoundError): 51 | sio = io.StringIO() 52 | with contextlib.redirect_stdout(sio): 53 | run_module("vecto", 54 | "benchmark", 55 | "sequence_labeling", 56 | path_emb + "NONEXISTING", 57 | path_sequence_labeling_dataset_ner, 58 | "--path_out", 59 | "/tmp/vecto/benchmarks/") 60 | 61 | from matplotlib import pyplot as plt 62 | # here the visualization only for the ner sub task. 63 | visualize.plot_accuracy("/tmp/vecto/benchmarks/sequence_labeling/ner", key_secondary="experiment_setup.dataset") 64 | plt.savefig("/tmp/vecto/benchmarks/sequence_labeling.pdf", bbox_inches="tight") 65 | -------------------------------------------------------------------------------- /tests/data/benchmarks/text_classification/test: -------------------------------------------------------------------------------- 1 | 0 a standard police-oriented drama that , were it not for de niro's participation , would have likely wound up a tnt original . 
2 | 0 afraid to pitch into farce , yet only half-hearted in its spy mechanics , all the queen's men is finally just one long drag . 3 | 0 with minimal imagination , you could restage the whole thing in your bathtub . 4 | 0 spousal abuse is a major problem in contemporary society , but the film reduces this domestic tragedy to florid melodrama . 5 | 0 too slick and manufactured to claim street credibility . 6 | 0 smothered by its own solemnity . 7 | 0 the 50-something lovebirds are too immature and unappealing to care about . 8 | 0 basically , it's pretty but dumb . 9 | 0 it's a deeply serious movie that cares passionately about its subject , but too often becomes ponderous in its teaching of history , or lost in the intricate connections and multiple timelines of its story . 10 | 0 i'm not suggesting that you actually see it , unless you're the kind of person who has seen every wim wenders film of the '70s . 11 | 0 kids who are into this thornberry stuff will probably be in wedgie heaven . anyone else who may , for whatever reason , be thinking about going to see this movie is hereby given fair warning . 12 | 0 as vulgar as it is banal . 13 | 0 return to neverland manages to straddle the line between another classic for the company and just another run-of-the-mill disney sequel intended for the home video market . 14 | 0 if you're not fans of the adventues of steve and terri , you should avoid this like the dreaded king brown snake . personally , i'd rather watch them on the animal planet . 15 | 0 femme fatale offers nothing more than a bait-and-switch that is beyond playing fair with the audience . are we dealing with dreams , visions or being told what actually happened as if it were the third ending of clue ? 16 | 0 life or something like it has its share of high points , but it misses too many opportunities . 17 | 1 will amuse and provoke adventurous adults in specialty venues . 18 | 1 great character interaction . 19 | 1 leave it to the french to truly capture the terrifying angst of the modern working man without turning the film into a cheap thriller , a dumb comedy or a sappy melodrama . 20 | 1 sits uneasily as a horror picture . . . but finds surprising depth in its look at the binds of a small family . 21 | 1 remarkably accessible and affecting . 22 | 1 a slick , well-oiled machine , exquisitely polished and upholstered . 23 | 1 a compelling film . 24 | 1 a refreshing korean film about five female high school friends who face an uphill battle when they try to take their relationships into deeper waters . 25 | 1 denis forges out of the theories of class- based rage and sisterly obsession a razor-sided tuning fork that rings with cultural , sexual and social discord . 26 | 1 mostly , [goldbacher] just lets her complicated characters be unruly , confusing and , through it all , human . 27 | 1 it is a challenging film , if not always a narratively cohesive one . 28 | 1 a worthy tribute to a great humanitarian and her vibrant 'co-stars . ' 29 | 1 . . . with " the bourne identity " we return to the more traditional action genre . 30 | 1 kaufman and jonze take huge risks to ponder the whole notion of passion -- our desire as human beings for passion in our lives and the emptiness one feels when it is missing . 31 | -------------------------------------------------------------------------------- /docs/source/tutorial/training_vectors.rst: -------------------------------------------------------------------------------- 1 | Training new models 2 | =================== 3 | 4 | .. 
currentmodule:: vsmlib 5 | 6 | 7 | This page describes how to train vectors with the models that are currently implemented in VSMlib. 8 | 9 | 10 | Word2vec 11 | -------- 12 | 13 | `Word2vec `_ is arguably the most popular word embedding model. 14 | We provide an implementation of an extended word2vec model, which can be trained on linear and dependency-based contexts, 15 | with bound and unbound context representations. 16 | 17 | Additionally, we provide an implementation which considers characters rather than words to be the minimal units. This enables it to take advantage of morphological information: as far as word-level models such as word2vec are concerned, "walk" and "walking" are completely unrelated, except through similarities in their distributions. 18 | 19 | To train word2vec embeddings, vsmlib can be invoked via the command line interface: 20 | 21 | >>> python3 -m vsmlib.embeddings.train_word2vec 22 | 23 | The command line parameters are as follows: 24 | 25 | --dimensions size of embeddings 26 | --context_type context type ['linear' or 'deps']; for the 'deps' context, an annotated corpus is required 27 | --context_representation context representation ['bound' or 'unbound'] 28 | --window window size 29 | --model base model type ['skipgram' or 'cbow'] 30 | --negative-size number of negative samples 31 | --out_type output model type ["hsm": hierarchical softmax, "ns": negative sampling, "original": no approximation] 32 | --subword specify if a subword-level approach should be used ["none", "rnn"] 33 | --batchsize learning minibatch size 34 | --gpu GPU ID (negative value indicates CPU) 35 | --epochs number of epochs to learn 36 | --maxWordLength max word length (only used for char-level subword) 37 | --path_vocab path to the vocabulary 38 | --path_corpus path to the corpus 39 | --path_out path to save embeddings 40 | --test run in test mode 41 | --verbose verbose mode 42 | 43 | 44 | Alternatively, word2vec training can be done through the vsmlib Python API. 45 | 46 | >>> vsmlib.embeddings.train_word2vec.train(args) 47 | 48 | The argument is an argparse.Namespace identical to the command line arguments. An instance of ModelDense is returned. 49 | 50 | Related papers: original w2v, Bofang, Mnih, subword. 
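For a concrete picture of how the API call above can be driven, here is a minimal sketch. It is only an illustration: the attribute names mirror the CLI flags listed above and are an assumption on our side (only a subset of the flags is shown), so check the argument parser of train_word2vec for the authoritative set. ::

    import argparse
    import vsmlib.embeddings.train_word2vec as train_word2vec

    # Assumed attribute names, mirroring the CLI flags above (not verified
    # against the actual parser); adjust to whatever the parser defines.
    args = argparse.Namespace(
        dimensions=100,
        context_type='linear',
        context_representation='unbound',
        window=5,
        model='skipgram',
        subword='none',
        batchsize=1000,
        gpu=-1,             # negative value -> run on CPU
        epochs=5,
        path_vocab='/path/to/vocab',
        path_corpus='/path/to/corpus',
        path_out='/tmp/vsm_out',
        test=False,
        verbose=True,
    )

    model = train_word2vec.train(args)   # returns a ModelDense instance

The BibTeX entries for the related papers mentioned above are given below.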
51 | 52 | :: 53 | 54 | @inproceedings{MikolovChenEtAl_2013_Efficient_estimation_of_word_representations_in_vector_space, 55 | title = {Efficient Estimation of Word Representations in Vector Space}, 56 | urldate = {2015-12-03}, 57 | booktitle = {Proceedings of International Conference on Learning Representations (ICLR)}, 58 | author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 59 | year = {2013}} 60 | 61 | :: 62 | 63 | @inproceedings{Li2017InvestigatingDS, 64 | title={Investigating Different Syntactic Context Types and Context Representations for Learning Word Embeddings}, 65 | author={Bofang Li and Tao Liu and Zhe Zhao and Buzhou Tang and Aleksandr Drozd and Anna Rogers and Xiaoyong Du}, 66 | booktitle={EMNLP}, 67 | year={2017}} 68 | 69 | 70 | -------------------------------------------------------------------------------- /tests/benchmarks/test_text_classification.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.text_classification import Benchmark as Text_classification 8 | from vecto.benchmarks.text_classification import load_model, predict, get_vectors 9 | from vecto.benchmarks import visualize 10 | from vecto.embeddings import load_from_dir 11 | from vecto.data import Dataset 12 | from tests.test_setup import run_module 13 | 14 | path_text_classification_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'text_classification') 15 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 16 | 17 | class Tests(unittest.TestCase): 18 | 19 | def test_api(self): 20 | embs = load_from_dir(path_emb) 21 | dataset = Dataset(path_text_classification_dataset) 22 | 23 | tc = Text_classification(model='cnn') 24 | result = tc.run(embs, dataset, 25 | "/tmp/vecto/benchmarks/text_classification_model/") 26 | self.assertIsInstance(result[0], dict) 27 | print(result) 28 | 29 | tc = Text_classification(model='rnn') 30 | result = tc.run(embs, dataset, 31 | "/tmp/vecto/benchmarks/text_classification_model/") 32 | self.assertIsInstance(result[0], dict) 33 | print(result) 34 | 35 | tc = Text_classification(model='bow') 36 | result = tc.run(embs, dataset, 37 | "/tmp/vecto/benchmarks/text_classification_model/") 38 | self.assertIsInstance(result[0], dict) 39 | print(result) 40 | 41 | model = load_model("/tmp/vecto/benchmarks/text_classification_model/args.json", 42 | embs.matrix) 43 | print(predict(model, "I like this")) 44 | print(get_vectors(model, ["I like this", "I hate this"])) 45 | 46 | def test_cli(self): 47 | sio = io.StringIO() 48 | with contextlib.redirect_stdout(sio): 49 | run_module("vecto", 50 | "benchmark", 51 | "text_classification", 52 | path_emb, 53 | path_text_classification_dataset, 54 | "--model", "cnn", 55 | "--path_out", "/tmp/vecto/benchmarks/") 56 | 57 | sio = io.StringIO() 58 | with contextlib.redirect_stdout(sio): 59 | run_module("vecto", 60 | "benchmark", 61 | "text_classification", 62 | path_emb, 63 | path_text_classification_dataset, 64 | "--model", "cnn", 65 | "--path_out", "/tmp/vecto/benchmarks/") 66 | 67 | with self.assertRaises(FileNotFoundError): 68 | sio = io.StringIO() 69 | with contextlib.redirect_stdout(sio): 70 | run_module("vecto", 71 | "benchmark", 72 | "text_classification", 73 | path_emb + "NONEXISTING", 74 | path_text_classification_dataset, 75 | "--path_out", "/tmp/vecto/benchmarks/") 76 | 77 | from matplotlib import pyplot 
as plt 78 | visualize.plot_accuracy("/tmp/vecto/benchmarks/text_classification", key_secondary="experiment_setup.dataset") 79 | plt.savefig("/tmp/vecto/benchmarks/text_classification.pdf", bbox_inches="tight") 80 | 81 | -------------------------------------------------------------------------------- /tests/test_embeddings.py: -------------------------------------------------------------------------------- 1 | """Tests for embeddings module.""" 2 | 3 | import unittest 4 | from unittest.mock import patch 5 | from os import path 6 | import numpy as np 7 | from vecto.embeddings.dense import WordEmbeddingsDense 8 | from vecto.embeddings.base import WordEmbeddings 9 | from vecto.embeddings import load_from_dir 10 | from vecto.vocabulary import Vocabulary 11 | 12 | 13 | class Tests(unittest.TestCase): 14 | 15 | def test_basic(self): 16 | WordEmbeddingsDense() 17 | model = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 18 | model.cmp_words("apple", "banana") 19 | model.cmp_words("apple", "bananaaaaa") 20 | x = np.array([0.0, 0.0, 0.0]) 21 | x.fill(np.nan) 22 | model.cmp_vectors(x, x) 23 | 24 | def test_load(self): 25 | load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 26 | # TODO: assert right class 27 | load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_no_file_header')) 28 | # TODO: assert right class 29 | load_from_dir(path.join('tests', 'data', 'embeddings', 'npy')) 30 | 31 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 32 | embs.get_vector('apple') 33 | #with self.assertRaises(RuntimeError): 34 | # embs.get_vector('word_that_not_in_vocabulary_27') 35 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'corrupted')) 36 | with self.assertRaises(RuntimeError): 37 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text')) 38 | 39 | def test_normalize(self): 40 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 41 | embs.normalize() 42 | embs.cache_normalized_copy() 43 | 44 | def test_utils(self): 45 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 46 | results = embs.get_most_similar_words('apple', 5) 47 | print(results) 48 | embs.cache_normalized_copy() 49 | results = embs.get_most_similar_words('apple', 5) 50 | print(results) 51 | 52 | results = embs.get_most_similar_words(embs.get_vector('apple'), 5) 53 | print(results) 54 | embs.get_x_label(0) 55 | 56 | def test_save(self): 57 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 58 | path_save = path.join('/tmp', 'vecto', 'saved') 59 | embs.save_to_dir(path_save) 60 | embs = load_from_dir(path_save) 61 | print(embs.matrix.shape) 62 | embs.save_to_dir_plain_txt(path.join('/tmp', 'vecto', 'saved_plain')) 63 | 64 | def test_filter(self): 65 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 66 | path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain') 67 | vocab = Vocabulary() 68 | vocab.load(path_vocab) 69 | embs.filter_by_vocab(["the", "apple"]) 70 | embs.filter_by_vocab([]) 71 | 72 | @patch.multiple(WordEmbeddings, __abstractmethods__=set()) 73 | def test_abc(self): 74 | obj = WordEmbeddings() 75 | obj.get_vector("banana") 76 | 77 | def test_viz(self): 78 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 79 | 
embs.viz_wordlist(["the", "apple"], colored=True, show_legend=True) 80 | embs.viz_wordlist(["the", "apple"], colored=False, show_legend=False) 81 | -------------------------------------------------------------------------------- /docs/source/tutorial/roadmap.rst: -------------------------------------------------------------------------------- 1 | Project roadmap 2 | ================ 3 | .. currentmodule:: vecto 4 | 5 | Vecto is work in progress. Everything that works at the moment is described in the present tutorial; feel free to get in touch if anything is not clear. Also, new functionality is coming in the nearest months, so check back for more features! 6 | 7 | +-------------------------------------------------------------+-----------------------------------------------------------+ 8 | | DONE | IN PROGRESS | 9 | +=============================================================+===========================================================+ 10 | +-------------------------------------------------------------+-----------------------------------------------------------+ 11 | | **General:** | 12 | +-------------------------------------------------------------+-----------------------------------------------------------+ 13 | | - Loading various vsm formats: plain text, npy, binary, h5p | - Pretty data downloader for benchmarks | 14 | | - Metadata generation | | 15 | | - Basic vector operations, efficient similarity search | | 16 | | - VSM visualization | | 17 | +-------------------------------------------------------------+-----------------------------------------------------------+ 18 | +-------------------------------------------------------------+-----------------------------------------------------------+ 19 | | **VSM generation:** | 20 | +-------------------------------------------------------------+-----------------------------------------------------------+ 21 | | - word2vec | - GloVe | 22 | | - - Character-level VSM | - SVD | 23 | | | | 24 | +-------------------------------------------------------------+-----------------------------------------------------------+ 25 | +-------------------------------------------------------------+-----------------------------------------------------------+ 26 | | **VSM evaluation:** | 27 | +-------------------------------------------------------------+-----------------------------------------------------------+ 28 | | - 6 methods of solving word analogies | - natural language inference | 29 | | - similarity and relatedness tests | - language modeling | 30 | | - text classification | - neural machine translation | 31 | | - sequence labeling (POS-tagging, chunking, NER) | - subjectivity classification | 32 | | | - and more! | 33 | +-------------------------------------------------------------+-----------------------------------------------------------+ 34 | -------------------------------------------------------------------------------- /vecto/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | """Loading and training for embeddings 2 | 3 | .. 
autosummary:: 4 | :toctree: _autosummary 5 | 6 | base 7 | dense 8 | 9 | """ 10 | 11 | import os 12 | import logging 13 | import numpy as np 14 | import vecto.embeddings.dense 15 | from vecto.embeddings.dense import WordEmbeddingsDense 16 | from .legacy_w2v import ModelW2V 17 | from vecto.vocabulary import Vocabulary 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def load_from_dir(path): 23 | """Automatically detects embeddings format and loads 24 | 25 | Args: 26 | path: directory where embeddings are stores 27 | 28 | Returns: 29 | Instance of appropriate Model-based class 30 | """ 31 | # if os.path.isfile(os.path.join(path, "cooccurrence_csr.h5p")): 32 | # logger.info("detected as sparse explicit in hdf5") 33 | # result = ModelSparse() 34 | # result.load_from_hdf5(path) 35 | # result.load_metadata(path) 36 | # return result 37 | # if os.path.isfile(os.path.join(path, "bigrams.data.bin")): 38 | # logger.info("detected as sparse in vecto legacy format") 39 | # result = ModelSparse() 40 | # result.load(path) 41 | # result.load_metadata(path) 42 | # return result 43 | 44 | # if os.path.isfile(os.path.join(path, "sgns.words.npy")): 45 | # result = ModelLevy() 46 | # logger.info("this is Levi") 47 | # result.load_from_dir(path) 48 | # result.load_metadata(path) 49 | # return result 50 | # if os.path.isfile(os.path.join(path, "vectors.npy")): 51 | # result = ModelNumbered() 52 | # logger.info("detected as dense ") 53 | # result.load_npy(path) 54 | # result.load_metadata(path) 55 | # return result 56 | if os.path.isfile(os.path.join(path, "vectors.h5p")): 57 | result = vecto.embeddings.dense.WordEmbeddingsDense() 58 | logger.info("detected as vecto format ") 59 | result.load_hdf5(path) 60 | result.load_metadata(path) 61 | # TODO: remove this hack after we re-train w2v without OOV rows 62 | extra = result.matrix.shape[0] - result.vocabulary.cnt_words 63 | result.matrix = result.matrix[extra:] 64 | return result 65 | 66 | result = vecto.embeddings.dense.WordEmbeddingsDense() 67 | files = os.listdir(path) 68 | for f in files: 69 | if f.endswith(".gz") or f.endswith(".bz") or f.endswith(".txt") or f.endswith(".vec"): 70 | logger.info(path + "Detected plain text format") 71 | result.load_from_text(os.path.join(path, f)) 72 | result.load_metadata(path) 73 | return result 74 | if f.endswith(".npy"): 75 | logger.info("Detected numpy format") 76 | result.matrix = np.load(os.path.join(path, f)) 77 | result.vocabulary = Vocabulary() 78 | result.vocabulary.load(path) 79 | result.load_metadata(path) 80 | # TODO: remove this hack after we re-train w2v without OOV rows 81 | result.matrix = result.matrix[:result.vocabulary.cnt_words] 82 | return result 83 | if any(file.endswith('bin') for file in os.listdir(path)): 84 | result = ModelW2V() 85 | logger.info("Detected w2v original binary format") 86 | result.load_from_dir(path) 87 | result.load_metadata(path) 88 | return result 89 | # if f.startswith("words") and f.endswith(".npy") \ 90 | # and os.path.isfile(os.path.join(path, f.replace(".npy", ".vocab"))): 91 | # result = Model_Fun() 92 | # result = ModelLevy() 93 | # logger.info("Detected VSM in npy and vocab in plain text file format") 94 | # result.load_from_dir(path, f[: -4]) 95 | # result.load_metadata(path) 96 | # return result 97 | 98 | raise RuntimeError("Cannot detect the format of this VSM") 99 | 100 | -------------------------------------------------------------------------------- /tests/data/benchmarks/relation_extraction/test.txt: 
-------------------------------------------------------------------------------- 1 | Component-Whole(e2,e1) 12 15 The system as described above has its greatest application in an arrayed configuration of antenna elements . 2 | Other 1 9 The child was carefully wrapped and bound into the cradle by means of a cord . 3 | Instrument-Agency(e2,e1) 1 7 The author of a keygen uses a disassembler to look at the raw assembly code . 4 | Other 2 6 A misty ridge uprises from the surge . 5 | Member-Collection(e1,e2) 1 2 The student association is the voice of the undergraduate student population of the State University of New York at Buffalo . 6 | Other 4 10 This is the sprawling complex that is Peru 's largest producer of silver . 7 | Cause-Effect(e2,e1) 7 19 The current view is that the chronic inflammation in the distal part of the stomach caused by Helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach . 8 | Entity-Destination(e1,e2) 0 6 People have been moving back into downtown . 9 | Content-Container(e1,e2) 1 6 The lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces . 10 | Entity-Destination(e1,e2) 12 20 The solute was placed inside a beaker and 5 mL of the solvent was pipetted into a 25 mL glass flask for each trial . 11 | Member-Collection(e1,e2) 2 6 The fifty essays collected in this volume testify to most of the prominent themes from Professor Quispel 's scholarly career . 12 | Other 1 5 Their composer has sunk into oblivion . 13 | Message-Topic(e1,e2) 6 9 The Pulitzer Committee issues an official citation explaining the reasons for the award . 14 | Cause-Effect(e2,e1) 1 8 The burst has been caused by water hammer pressure . 15 | Instrument-Agency(e2,e1) 2 6 Even commercial networks have moved into high-definition broadcast . 16 | Message-Topic(e1,e2) 4 10 It was a friendly call to remind them about the bill and make sure they have a copy of the invoice . 17 | Instrument-Agency(e2,e1) 1 8 Texas-born virtuoso finds harmony , sophistication in Appalachian instrument . 18 | Product-Producer(e2,e1) 1 14 The factory 's products have included flower pots , Finnish rooster-whistles , pans , trays , tea pots , ash trays and air moisturisers . 19 | Component-Whole(e2,e1) 7 8 The girl showed a photo of apple tree blossom on a fruit tree in the Central Valley . 20 | Member-Collection(e2,e1) 20 23 They tried an assault of their own an hour later , with two columns of sixteen tanks backed by a battalion of Panzer grenadiers . 21 | Entity-Origin(e1,e2) 1 18 Their knowledge of the power and rank symbols of the Continental empires was gained from the numerous Germanic recruits in the Roman army , and from the Roman practice of enfeoffing various Germanic warrior groups with land in the imperial provinces . 22 | Member-Collection(e2,e1) 4 9 She soon had a stable of her own rescued hounds . 23 | Cause-Effect(e1,e2) 1 14 The singer , who performed three of the nominated songs , also caused a commotion on the red carpet . 24 | Other 5 11 His intellectually engaging books and essays remain pertinent to illuminating contemporary history . 25 | Member-Collection(e2,e1) 7 10 Poor hygiene controls , reports of a brace of gamey grouse and what looked like a skinned fox all amounted to a pie that was unfit for human consumption . 26 | Other 2 7 This sweet dress is made with a blend of cotton and silk , and the crochet flower necklace is the perfect accessory . 
27 | Cause-Effect(e1,e2) 0 8 Suicide is one of the leading causes of death among pre-adolescents and teens , and victims of bullying are at an increased risk for committing suicide . 28 | Message-Topic(e1,e2) 1 7 This article gives details on 2004 in music in the United Kingdom , including the official charts from that year . 29 | Message-Topic(e1,e2) 12 16 We have therefore taken the initiative to convene the first international open meeting dedicated solely to rural history . 30 | Component-Whole(e1,e2) 1 4 The timer of the device automatically eliminates wasted `` standby power '' consumption by automatically turn off electronics plugged into the `` auto off '' outlets . 31 | Message-Topic(e2,e1) 5 8 Bob Parks made a similar offer in a phone call made earlier this week . 32 | Cause-Effect(e2,e1) 5 7 He had chest pains and headaches from mold in the bedrooms . -------------------------------------------------------------------------------- /tests/data/benchmarks/relation_extraction/train.txt: -------------------------------------------------------------------------------- 1 | Component-Whole(e2,e1) 12 15 The system as described above has its greatest application in an arrayed configuration of antenna elements . 2 | Other 1 9 The child was carefully wrapped and bound into the cradle by means of a cord . 3 | Instrument-Agency(e2,e1) 1 7 The author of a keygen uses a disassembler to look at the raw assembly code . 4 | Other 2 6 A misty ridge uprises from the surge . 5 | Member-Collection(e1,e2) 1 2 The student association is the voice of the undergraduate student population of the State University of New York at Buffalo . 6 | Other 4 10 This is the sprawling complex that is Peru 's largest producer of silver . 7 | Cause-Effect(e2,e1) 7 19 The current view is that the chronic inflammation in the distal part of the stomach caused by Helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach . 8 | Entity-Destination(e1,e2) 0 6 People have been moving back into downtown . 9 | Content-Container(e1,e2) 1 6 The lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces . 10 | Entity-Destination(e1,e2) 12 20 The solute was placed inside a beaker and 5 mL of the solvent was pipetted into a 25 mL glass flask for each trial . 11 | Member-Collection(e1,e2) 2 6 The fifty essays collected in this volume testify to most of the prominent themes from Professor Quispel 's scholarly career . 12 | Other 1 5 Their composer has sunk into oblivion . 13 | Message-Topic(e1,e2) 6 9 The Pulitzer Committee issues an official citation explaining the reasons for the award . 14 | Cause-Effect(e2,e1) 1 8 The burst has been caused by water hammer pressure . 15 | Instrument-Agency(e2,e1) 2 6 Even commercial networks have moved into high-definition broadcast . 16 | Message-Topic(e1,e2) 4 10 It was a friendly call to remind them about the bill and make sure they have a copy of the invoice . 17 | Instrument-Agency(e2,e1) 1 8 Texas-born virtuoso finds harmony , sophistication in Appalachian instrument . 18 | Product-Producer(e2,e1) 1 14 The factory 's products have included flower pots , Finnish rooster-whistles , pans , trays , tea pots , ash trays and air moisturisers . 19 | Component-Whole(e2,e1) 7 8 The girl showed a photo of apple tree blossom on a fruit tree in the Central Valley . 
20 | Member-Collection(e2,e1) 20 23 They tried an assault of their own an hour later , with two columns of sixteen tanks backed by a battalion of Panzer grenadiers . 21 | Entity-Origin(e1,e2) 1 18 Their knowledge of the power and rank symbols of the Continental empires was gained from the numerous Germanic recruits in the Roman army , and from the Roman practice of enfeoffing various Germanic warrior groups with land in the imperial provinces . 22 | Member-Collection(e2,e1) 4 9 She soon had a stable of her own rescued hounds . 23 | Cause-Effect(e1,e2) 1 14 The singer , who performed three of the nominated songs , also caused a commotion on the red carpet . 24 | Other 5 11 His intellectually engaging books and essays remain pertinent to illuminating contemporary history . 25 | Member-Collection(e2,e1) 7 10 Poor hygiene controls , reports of a brace of gamey grouse and what looked like a skinned fox all amounted to a pie that was unfit for human consumption . 26 | Other 2 7 This sweet dress is made with a blend of cotton and silk , and the crochet flower necklace is the perfect accessory . 27 | Cause-Effect(e1,e2) 0 8 Suicide is one of the leading causes of death among pre-adolescents and teens , and victims of bullying are at an increased risk for committing suicide . 28 | Message-Topic(e1,e2) 1 7 This article gives details on 2004 in music in the United Kingdom , including the official charts from that year . 29 | Message-Topic(e1,e2) 12 16 We have therefore taken the initiative to convene the first international open meeting dedicated solely to rural history . 30 | Component-Whole(e1,e2) 1 4 The timer of the device automatically eliminates wasted `` standby power '' consumption by automatically turn off electronics plugged into the `` auto off '' outlets . 31 | Message-Topic(e2,e1) 5 8 Bob Parks made a similar offer in a phone call made earlier this week . 32 | Cause-Effect(e2,e1) 5 7 He had chest pains and headaches from mold in the bedrooms . 
-------------------------------------------------------------------------------- /tests/benchmarks/test_analogy.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.analogy import Benchmark as Analogy 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from vecto.data import Dataset 11 | 12 | from ..test_setup import run_module 13 | 14 | 15 | path_analogy_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'analogy') 16 | 17 | 18 | class Tests(unittest.TestCase): 19 | 20 | def test_api(self): 21 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 22 | analogy = Analogy(method="3CosAdd") 23 | dateset = Dataset(path_analogy_dataset) 24 | result = analogy.run(embs, dateset) 25 | self.assertIsInstance(result[0], dict) 26 | 27 | analogy = Analogy(method="PairDistance") 28 | result = analogy.run(embs, dateset) 29 | self.assertIsInstance(result[0], dict) 30 | 31 | analogy = Analogy(method="3CosMul") 32 | result = analogy.run(embs, dateset) 33 | self.assertIsInstance(result[0], dict) 34 | 35 | analogy = Analogy(method="3CosMul2") 36 | result = analogy.run(embs, dateset) 37 | self.assertIsInstance(result[0], dict) 38 | 39 | analogy = Analogy(method="3CosAvg") 40 | result = analogy.run(embs, dateset) 41 | self.assertIsInstance(result[0], dict) 42 | 43 | analogy = Analogy(method="SimilarToAny") 44 | result = analogy.run(embs, dateset) 45 | print(result) 46 | 47 | analogy = Analogy(method="SimilarToB") 48 | result = analogy.run(embs, dateset) 49 | print(result) 50 | 51 | analogy = Analogy(method="LRCos") 52 | result = analogy.run(embs, dateset) 53 | print(result) 54 | 55 | def test_cli(self): 56 | sio = io.StringIO() 57 | with contextlib.redirect_stdout(sio): 58 | run_module("vecto", "benchmark", "analogy", 59 | "./tests/data/embeddings/text/plain_with_file_header/", 60 | "./tests/data/benchmarks/analogy/", 61 | "--path_out", "/tmp/vecto/benchmarks/", 62 | "--method", "3CosAdd") 63 | 64 | sio = io.StringIO() 65 | with contextlib.redirect_stdout(sio): 66 | run_module("vecto", "benchmark", "analogy", 67 | "./tests/data/embeddings/text/plain_with_file_header/", 68 | "./tests/data/benchmarks/analogy/", 69 | "--path_out", 70 | "/tmp/vecto/benchmarks/specific_filename.json", 71 | "--method", "LRCos") 72 | 73 | sio = io.StringIO() 74 | with contextlib.redirect_stdout(sio): 75 | run_module("vecto", "benchmark", "analogy", 76 | "./tests/data/embeddings/text/plain_with_file_header/", 77 | "./tests/data/benchmarks/analogy/", 78 | "--path_out", "/tmp/vecto/benchmarks/", 79 | "--method", "3CosMul") 80 | 81 | sio = io.StringIO() 82 | with self.assertRaises(RuntimeError): 83 | with contextlib.redirect_stdout(sio): 84 | run_module("vecto", "benchmark", "analogy", 85 | "./tests/data/embeddings/text/plain_with_file_header/", 86 | "./tests/data/benchmarks/analogy/", 87 | "--method", "NONEXISTING") 88 | 89 | sio = io.StringIO() 90 | with contextlib.redirect_stdout(sio): 91 | run_module("vecto", "benchmark", "analogy", 92 | "./tests/data/embeddings/text/plain_with_file_header/", 93 | "./tests/data/benchmarks/analogy/", 94 | "--method", "3CosAvg") 95 | 96 | # TODO: suppress concatenating timestamp or aggregate multiple runs 97 | from matplotlib import pyplot as plt 98 | visualize.plot_accuracy("/tmp/vecto/benchmarks/word_analogy") 99 | 
plt.savefig("/tmp/vecto/benchmarks/analogy.pdf", bbox_inches="tight") 100 | -------------------------------------------------------------------------------- /tests/benchmarks/test_categorization.py: -------------------------------------------------------------------------------- 1 | """Tests for categorization benchmark.""" 2 | 3 | import unittest 4 | from io import StringIO 5 | from contextlib import redirect_stdout 6 | from vecto.benchmarks.categorization import Benchmark as Categorization 7 | from vecto.benchmarks.categorization import purity_score 8 | from vecto.embeddings import load_from_dir 9 | from ..test_setup import run_module 10 | from numpy import array 11 | from os import path 12 | 13 | path_categorization_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'categorization') 14 | 15 | 16 | class Tests(unittest.TestCase): 17 | # def test_categorization(self): 18 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 19 | # categorization = KMeansCategorization() 20 | # result = categorization.get_result(embs, path_categorization_dataset) 21 | 22 | # def test_categorization_method_works(self): 23 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 24 | # categorization = KMeansCategorization() 25 | # result = categorization.get_result(embs, path_categorization_dataset) 26 | 27 | def test_cli(self): 28 | sio = StringIO() 29 | with redirect_stdout(sio): 30 | run_module('vecto', 31 | 'benchmark', 32 | 'categorization', 33 | './tests/data/embeddings/text/plain_with_file_header/', 34 | './tests/data/benchmarks/categorization/', 35 | '--path_out', '/tmp/vecto/benchmarks') 36 | 37 | # def test_categorization_scores(self): 38 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 39 | # categorization = KMeansCategorization() 40 | # result = categorization.get_result(embs, path_categorization_dataset) 41 | # scores = result[0]['global_stats']['scores'] 42 | # self.assertEqual(len(scores.keys()), 7) 43 | # self.assertEqual(len(result[0]['global_stats']['true_labels']), 7) 44 | # self.assertEqual(result[0]['global_stats']['true_labels'][3], 1) 45 | 46 | # def test_categorization_data(self): 47 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 48 | # categorization = KMeansCategorization() 49 | # result = categorization.get_result(embs, path_categorization_dataset) 50 | # word_stats = result[0]['word_stats'] 51 | # # self.assertEqual(word_stats['4. 
banana']['true_category'], 'food') 52 | # self.assertEqual(len(word_stats.keys()), 7) 53 | 54 | # def test_kmeans(self): 55 | # data = [(0, 0, 0), (100, 100, 100), (99, 99, 99)] 56 | # keys_len = 2 57 | # labels = [0, 1] 58 | # categorization = KMeansCategorization() 59 | # predicted_labels, true_labels, centroids, inertia, params = categorization.run_categorization(keys_len, data, labels) 60 | # self.assertEqual(len(centroids), 2) 61 | # self.assertEqual(inertia, 1.5) 62 | 63 | # def test_cli_2(self): 64 | # sio = StringIO() 65 | # with redirect_stdout(sio): 66 | # run_module('vecto.benchmarks.categorization', 67 | # './tests/data/embeddings/text/plain_with_file_header/', 68 | # './tests/data/benchmarks/categorization/', 69 | # '--path_out', '/tmp/vecto/r.json', '--method', 'SpectralCategorization') 70 | 71 | def test_set_loading(self): 72 | test_set_path = path.join('.', 'tests', 'data', 'benchmarks', 'categorization', 'essli-2008-lite.csv') 73 | test_set_categories_amount = 3 74 | 75 | categorization = Categorization() 76 | test_set = categorization.read_test_set(test_set_path) 77 | self.assertEqual(len(test_set.keys()), test_set_categories_amount) 78 | 79 | def test_purity_measure(self): 80 | test_set_1 = array((0, 1, 2, 3)) 81 | test_set_2 = array((0, 1, 2, 3)) 82 | expected_score = 1.0 83 | self.assertEqual(purity_score(test_set_1, test_set_2), expected_score) 84 | 85 | test_set_1 = array((0, 0, 3, 3)) 86 | test_set_2 = array((0, 0, 0, 0)) 87 | expected_score = 0.5 88 | self.assertEqual(purity_score(test_set_1, test_set_2), expected_score) 89 | -------------------------------------------------------------------------------- /vecto/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | """Collection of benchmarks and downstream tasks on embeddings 2 | 3 | .. 
autosummary:: 4 | :toctree: _autosummary 5 | 6 | analogy 7 | categorization 8 | language_modeling 9 | outliers 10 | relation_extraction 11 | sequence_labeling 12 | similarity 13 | synonymy_detection 14 | text_classification 15 | 16 | """ 17 | 18 | import argparse 19 | import importlib 20 | from vecto.embeddings import load_from_dir 21 | from vecto.data import Dataset 22 | import os 23 | from vecto.utils.data import save_json, print_json 24 | from vecto.utils import get_time_str 25 | 26 | 27 | def list_benhcmarks(benchmarks): 28 | print("available benchmarks:") 29 | for i in benchmarks: 30 | print(i) 31 | 32 | 33 | def choose_benchmark(args): 34 | # TODO: load benchmark names from modules themselves 35 | available_benchmarks = [] 36 | available_benchmarks.append("analogy") 37 | available_benchmarks.append("categorization") 38 | available_benchmarks.append("language_modeling") 39 | available_benchmarks.append("relation_extraction") 40 | available_benchmarks.append("similarity") 41 | available_benchmarks.append("sequence_labeling") 42 | available_benchmarks.append("text_classification") 43 | 44 | parser = argparse.ArgumentParser( 45 | description='run benchmarks', 46 | add_help=True, 47 | usage="vecto benchmark [name]") 48 | 49 | parser.add_argument('name', help='Subcommand to run') 50 | args, remaining_args = parser.parse_known_args(args) 51 | if args.name == "help": 52 | list_benhcmarks(available_benchmarks) 53 | return 54 | # TODO: implement running set of benchmarks defined in config 55 | # if args.name == "all": 56 | # print("running all benchmarks") 57 | 58 | if args.name in available_benchmarks: 59 | #print('ramaining args') 60 | #print(remaining_args) 61 | run_benchmark_by_name(args.name, remaining_args) 62 | else: 63 | print("unknown benchmark name", args.name) 64 | list_benhcmarks(available_benchmarks) 65 | exit(-1) 66 | 67 | 68 | def save_results(results, path_out, dataset_name): 69 | # create subdirs unless explicitly asked to not do so 70 | # TODO: add submodules to append to path 71 | timestamp = get_time_str() 72 | if isinstance(results, list): 73 | task = results[0]["experiment_setup"]["task"] 74 | else: 75 | task = results["experiment_setup"]["task"] 76 | task = task.replace(" ", "_") 77 | name_file_out = os.path.join(path_out, 78 | task, 79 | dataset_name, 80 | timestamp, 81 | "results.json") 82 | save_json(results, name_file_out) 83 | 84 | 85 | def run_benchmark_by_name(name, args): 86 | print(name, args) 87 | print("running ", name) 88 | mod = importlib.import_module("vecto.benchmarks." 
+ name) 89 | parser = argparse.ArgumentParser() 90 | add_extra_args = getattr(mod, 'add_extra_args') 91 | add_extra_args(parser) 92 | parser.add_argument("--path_out", 93 | default=None, 94 | help="destination folder to save results") 95 | args = parser.parse_args(args) 96 | dict_args = vars(args) 97 | embeddings = load_from_dir(args.embeddings) 98 | # TODO: this is ugly hack, do subparsers or something 99 | if name == "language_modeling": 100 | dataset = Dataset("/tmp/") 101 | dataset.name = "ptb" 102 | else: 103 | dataset = Dataset(args.dataset) 104 | dict_args.pop("dataset") 105 | 106 | dict_args.pop("embeddings") 107 | # TODO: not sure if all banchmarks use dataset arg 108 | path_out = dict_args.pop("path_out") 109 | Benchmark = getattr(mod, "Benchmark") 110 | benchmark = Benchmark(**dict_args) 111 | 112 | print("SHAPE:", embeddings.matrix.shape) 113 | print("vocab size:", embeddings.vocabulary.cnt_words) 114 | results = benchmark.run(embeddings, dataset) 115 | if path_out: 116 | save_results(results, path_out, dataset.metadata["name"]) 117 | else: 118 | print_json(results) 119 | 120 | 121 | def run_benchmarks_cli(args=[]): 122 | choose_benchmark(args) 123 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/chunk/train.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | EU NNP I-NP I-ORG 4 | rejects VBZ I-VP O 5 | German JJ I-NP I-MISC 6 | call NN I-NP O 7 | to TO I-VP O 8 | boycott VB I-VP O 9 | British JJ I-NP I-MISC 10 | lamb NN I-NP O 11 | . . O O 12 | 13 | Peter NNP I-NP I-PER 14 | Blackburn NNP I-NP I-PER 15 | 16 | BRUSSELS NNP I-NP I-LOC 17 | 1996-08-22 CD I-NP O 18 | 19 | The DT I-NP O 20 | European NNP I-NP I-ORG 21 | Commission NNP I-NP I-ORG 22 | said VBD I-VP O 23 | on IN I-PP O 24 | Thursday NNP I-NP O 25 | it PRP B-NP O 26 | disagreed VBD I-VP O 27 | with IN I-PP O 28 | German JJ I-NP I-MISC 29 | advice NN I-NP O 30 | to TO I-PP O 31 | consumers NNS I-NP O 32 | to TO I-VP O 33 | shun VB I-VP O 34 | British JJ I-NP I-MISC 35 | lamb NN I-NP O 36 | until IN I-SBAR O 37 | scientists NNS I-NP O 38 | determine VBP I-VP O 39 | whether IN I-SBAR O 40 | mad JJ I-NP O 41 | cow NN I-NP O 42 | disease NN I-NP O 43 | can MD I-VP O 44 | be VB I-VP O 45 | transmitted VBN I-VP O 46 | to TO I-PP O 47 | sheep NN I-NP O 48 | . . O O 49 | 50 | Germany NNP I-NP I-LOC 51 | 's POS B-NP O 52 | representative NN I-NP O 53 | to TO I-PP O 54 | the DT I-NP O 55 | European NNP I-NP I-ORG 56 | Union NNP I-NP I-ORG 57 | 's POS B-NP O 58 | veterinary JJ I-NP O 59 | committee NN I-NP O 60 | Werner NNP I-NP I-PER 61 | Zwingmann NNP I-NP I-PER 62 | said VBD I-VP O 63 | on IN I-PP O 64 | Wednesday NNP I-NP O 65 | consumers NNS I-NP O 66 | should MD I-VP O 67 | buy VB I-VP O 68 | sheepmeat NN I-NP O 69 | from IN I-PP O 70 | countries NNS I-NP O 71 | other JJ I-ADJP O 72 | than IN I-PP O 73 | Britain NNP I-NP I-LOC 74 | until IN I-SBAR O 75 | the DT I-NP O 76 | scientific JJ I-NP O 77 | advice NN I-NP O 78 | was VBD I-VP O 79 | clearer JJR I-ADJP O 80 | . . 
O O 81 | 82 | " " O O 83 | We PRP I-NP O 84 | do VBP I-VP O 85 | n't RB I-VP O 86 | support VB I-VP O 87 | any DT I-NP O 88 | such JJ I-NP O 89 | recommendation NN I-NP O 90 | because IN I-SBAR O 91 | we PRP I-NP O 92 | do VBP I-VP O 93 | n't RB I-VP O 94 | see VB I-VP O 95 | any DT I-NP O 96 | grounds NNS I-NP O 97 | for IN I-PP O 98 | it PRP I-NP O 99 | , , O O 100 | " " O O 101 | the DT I-NP O 102 | Commission NNP I-NP I-ORG 103 | 's POS B-NP O 104 | chief JJ I-NP O 105 | spokesman NN I-NP O 106 | Nikolaus NNP I-NP I-PER 107 | van NNP I-NP I-PER 108 | der FW I-NP I-PER 109 | Pas NNP I-NP I-PER 110 | told VBD I-VP O 111 | a DT I-NP O 112 | news NN I-NP O 113 | briefing NN I-NP O 114 | . . O O 115 | 116 | He PRP I-NP O 117 | said VBD I-VP O 118 | further JJ I-NP O 119 | scientific JJ I-NP O 120 | study NN I-NP O 121 | was VBD I-VP O 122 | required VBN I-VP O 123 | and CC O O 124 | if IN I-SBAR O 125 | it PRP I-NP O 126 | was VBD I-VP O 127 | found VBN I-VP O 128 | that IN I-SBAR O 129 | action NN I-NP O 130 | was VBD I-VP O 131 | needed VBN I-VP O 132 | it PRP I-NP O 133 | should MD I-VP O 134 | be VB I-VP O 135 | taken VBN I-VP O 136 | by IN I-PP O 137 | the DT I-NP O 138 | European NNP I-NP I-ORG 139 | Union NNP I-NP I-ORG 140 | . . O O 141 | 142 | He PRP I-NP O 143 | said VBD I-VP O 144 | a DT I-NP O 145 | proposal NN I-NP O 146 | last JJ B-NP O 147 | month NN I-NP O 148 | by IN I-PP O 149 | EU NNP I-NP I-ORG 150 | Farm NNP I-NP O 151 | Commissioner NNP I-NP O 152 | Franz NNP I-NP I-PER 153 | Fischler NNP I-NP I-PER 154 | to TO I-VP O 155 | ban VB I-VP O 156 | sheep NN I-NP O 157 | brains NNS I-NP O 158 | , , O O 159 | spleens NNS I-NP O 160 | and CC O O 161 | spinal JJ I-NP O 162 | cords NNS I-NP O 163 | from IN I-PP O 164 | the DT I-NP O 165 | human NN I-NP O 166 | and CC I-NP O 167 | animal NN I-NP O 168 | food NN I-NP O 169 | chains NNS I-NP O 170 | was VBD I-VP O 171 | a DT I-NP O 172 | highly RB I-NP O 173 | specific JJ I-ADJP O 174 | and CC I-ADJP O 175 | precautionary JJ I-ADJP O 176 | move NN I-NP O 177 | to TO I-VP O 178 | protect VB I-VP O 179 | human JJ I-NP O 180 | health NN I-NP O 181 | . . O O 182 | 183 | 184 | On IN I-PP O 185 | Monday NNP I-NP O 186 | , , O O 187 | both DT I-NP O 188 | houses NNS I-NP O 189 | of IN I-PP O 190 | India NNP I-NP I-LOC 191 | 's POS B-NP O 192 | parliament NN I-NP O 193 | wished VBD I-VP O 194 | the DT I-NP O 195 | nation NN I-NP O 196 | 's POS B-NP O 197 | adopted VBN I-NP O 198 | sister NN I-NP O 199 | a DT B-NP O 200 | happy JJ I-NP O 201 | birthday NN I-NP O 202 | and CC O O 203 | speedy JJ I-NP O 204 | recovery NN I-NP O 205 | from IN I-PP O 206 | her PRP$ I-NP O 207 | illness NN I-NP O 208 | . . O O 209 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/ner/train.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | EU NNP I-NP I-ORG 4 | rejects VBZ I-VP O 5 | German JJ I-NP I-MISC 6 | call NN I-NP O 7 | to TO I-VP O 8 | boycott VB I-VP O 9 | British JJ I-NP I-MISC 10 | lamb NN I-NP O 11 | . . 
O O 12 | 13 | Peter NNP I-NP I-PER 14 | Blackburn NNP I-NP I-PER 15 | 16 | BRUSSELS NNP I-NP I-LOC 17 | 1996-08-22 CD I-NP O 18 | 19 | The DT I-NP O 20 | European NNP I-NP I-ORG 21 | Commission NNP I-NP I-ORG 22 | said VBD I-VP O 23 | on IN I-PP O 24 | Thursday NNP I-NP O 25 | it PRP B-NP O 26 | disagreed VBD I-VP O 27 | with IN I-PP O 28 | German JJ I-NP I-MISC 29 | advice NN I-NP O 30 | to TO I-PP O 31 | consumers NNS I-NP O 32 | to TO I-VP O 33 | shun VB I-VP O 34 | British JJ I-NP I-MISC 35 | lamb NN I-NP O 36 | until IN I-SBAR O 37 | scientists NNS I-NP O 38 | determine VBP I-VP O 39 | whether IN I-SBAR O 40 | mad JJ I-NP O 41 | cow NN I-NP O 42 | disease NN I-NP O 43 | can MD I-VP O 44 | be VB I-VP O 45 | transmitted VBN I-VP O 46 | to TO I-PP O 47 | sheep NN I-NP O 48 | . . O O 49 | 50 | Germany NNP I-NP I-LOC 51 | 's POS B-NP O 52 | representative NN I-NP O 53 | to TO I-PP O 54 | the DT I-NP O 55 | European NNP I-NP I-ORG 56 | Union NNP I-NP I-ORG 57 | 's POS B-NP O 58 | veterinary JJ I-NP O 59 | committee NN I-NP O 60 | Werner NNP I-NP I-PER 61 | Zwingmann NNP I-NP I-PER 62 | said VBD I-VP O 63 | on IN I-PP O 64 | Wednesday NNP I-NP O 65 | consumers NNS I-NP O 66 | should MD I-VP O 67 | buy VB I-VP O 68 | sheepmeat NN I-NP O 69 | from IN I-PP O 70 | countries NNS I-NP O 71 | other JJ I-ADJP O 72 | than IN I-PP O 73 | Britain NNP I-NP I-LOC 74 | until IN I-SBAR O 75 | the DT I-NP O 76 | scientific JJ I-NP O 77 | advice NN I-NP O 78 | was VBD I-VP O 79 | clearer JJR I-ADJP O 80 | . . O O 81 | 82 | " " O O 83 | We PRP I-NP O 84 | do VBP I-VP O 85 | n't RB I-VP O 86 | support VB I-VP O 87 | any DT I-NP O 88 | such JJ I-NP O 89 | recommendation NN I-NP O 90 | because IN I-SBAR O 91 | we PRP I-NP O 92 | do VBP I-VP O 93 | n't RB I-VP O 94 | see VB I-VP O 95 | any DT I-NP O 96 | grounds NNS I-NP O 97 | for IN I-PP O 98 | it PRP I-NP O 99 | , , O O 100 | " " O O 101 | the DT I-NP O 102 | Commission NNP I-NP I-ORG 103 | 's POS B-NP O 104 | chief JJ I-NP O 105 | spokesman NN I-NP O 106 | Nikolaus NNP I-NP I-PER 107 | van NNP I-NP I-PER 108 | der FW I-NP I-PER 109 | Pas NNP I-NP I-PER 110 | told VBD I-VP O 111 | a DT I-NP O 112 | news NN I-NP O 113 | briefing NN I-NP O 114 | . . O O 115 | 116 | He PRP I-NP O 117 | said VBD I-VP O 118 | further JJ I-NP O 119 | scientific JJ I-NP O 120 | study NN I-NP O 121 | was VBD I-VP O 122 | required VBN I-VP O 123 | and CC O O 124 | if IN I-SBAR O 125 | it PRP I-NP O 126 | was VBD I-VP O 127 | found VBN I-VP O 128 | that IN I-SBAR O 129 | action NN I-NP O 130 | was VBD I-VP O 131 | needed VBN I-VP O 132 | it PRP I-NP O 133 | should MD I-VP O 134 | be VB I-VP O 135 | taken VBN I-VP O 136 | by IN I-PP O 137 | the DT I-NP O 138 | European NNP I-NP I-ORG 139 | Union NNP I-NP I-ORG 140 | . . 
O O 141 | 142 | He PRP I-NP O 143 | said VBD I-VP O 144 | a DT I-NP O 145 | proposal NN I-NP O 146 | last JJ B-NP O 147 | month NN I-NP O 148 | by IN I-PP O 149 | EU NNP I-NP I-ORG 150 | Farm NNP I-NP O 151 | Commissioner NNP I-NP O 152 | Franz NNP I-NP I-PER 153 | Fischler NNP I-NP I-PER 154 | to TO I-VP O 155 | ban VB I-VP O 156 | sheep NN I-NP O 157 | brains NNS I-NP O 158 | , , O O 159 | spleens NNS I-NP O 160 | and CC O O 161 | spinal JJ I-NP O 162 | cords NNS I-NP O 163 | from IN I-PP O 164 | the DT I-NP O 165 | human NN I-NP O 166 | and CC I-NP O 167 | animal NN I-NP O 168 | food NN I-NP O 169 | chains NNS I-NP O 170 | was VBD I-VP O 171 | a DT I-NP O 172 | highly RB I-NP O 173 | specific JJ I-ADJP O 174 | and CC I-ADJP O 175 | precautionary JJ I-ADJP O 176 | move NN I-NP O 177 | to TO I-VP O 178 | protect VB I-VP O 179 | human JJ I-NP O 180 | health NN I-NP O 181 | . . O O 182 | 183 | 184 | On IN I-PP O 185 | Monday NNP I-NP O 186 | , , O O 187 | both DT I-NP O 188 | houses NNS I-NP O 189 | of IN I-PP O 190 | India NNP I-NP I-LOC 191 | 's POS B-NP O 192 | parliament NN I-NP O 193 | wished VBD I-VP O 194 | the DT I-NP O 195 | nation NN I-NP O 196 | 's POS B-NP O 197 | adopted VBN I-NP O 198 | sister NN I-NP O 199 | a DT B-NP O 200 | happy JJ I-NP O 201 | birthday NN I-NP O 202 | and CC O O 203 | speedy JJ I-NP O 204 | recovery NN I-NP O 205 | from IN I-PP O 206 | her PRP$ I-NP O 207 | illness NN I-NP O 208 | . . O O 209 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/pos/train.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | EU NNP I-NP I-ORG 4 | rejects VBZ I-VP O 5 | German JJ I-NP I-MISC 6 | call NN I-NP O 7 | to TO I-VP O 8 | boycott VB I-VP O 9 | British JJ I-NP I-MISC 10 | lamb NN I-NP O 11 | . . O O 12 | 13 | Peter NNP I-NP I-PER 14 | Blackburn NNP I-NP I-PER 15 | 16 | BRUSSELS NNP I-NP I-LOC 17 | 1996-08-22 CD I-NP O 18 | 19 | The DT I-NP O 20 | European NNP I-NP I-ORG 21 | Commission NNP I-NP I-ORG 22 | said VBD I-VP O 23 | on IN I-PP O 24 | Thursday NNP I-NP O 25 | it PRP B-NP O 26 | disagreed VBD I-VP O 27 | with IN I-PP O 28 | German JJ I-NP I-MISC 29 | advice NN I-NP O 30 | to TO I-PP O 31 | consumers NNS I-NP O 32 | to TO I-VP O 33 | shun VB I-VP O 34 | British JJ I-NP I-MISC 35 | lamb NN I-NP O 36 | until IN I-SBAR O 37 | scientists NNS I-NP O 38 | determine VBP I-VP O 39 | whether IN I-SBAR O 40 | mad JJ I-NP O 41 | cow NN I-NP O 42 | disease NN I-NP O 43 | can MD I-VP O 44 | be VB I-VP O 45 | transmitted VBN I-VP O 46 | to TO I-PP O 47 | sheep NN I-NP O 48 | . . O O 49 | 50 | Germany NNP I-NP I-LOC 51 | 's POS B-NP O 52 | representative NN I-NP O 53 | to TO I-PP O 54 | the DT I-NP O 55 | European NNP I-NP I-ORG 56 | Union NNP I-NP I-ORG 57 | 's POS B-NP O 58 | veterinary JJ I-NP O 59 | committee NN I-NP O 60 | Werner NNP I-NP I-PER 61 | Zwingmann NNP I-NP I-PER 62 | said VBD I-VP O 63 | on IN I-PP O 64 | Wednesday NNP I-NP O 65 | consumers NNS I-NP O 66 | should MD I-VP O 67 | buy VB I-VP O 68 | sheepmeat NN I-NP O 69 | from IN I-PP O 70 | countries NNS I-NP O 71 | other JJ I-ADJP O 72 | than IN I-PP O 73 | Britain NNP I-NP I-LOC 74 | until IN I-SBAR O 75 | the DT I-NP O 76 | scientific JJ I-NP O 77 | advice NN I-NP O 78 | was VBD I-VP O 79 | clearer JJR I-ADJP O 80 | . . 
O O 81 | 82 | " " O O 83 | We PRP I-NP O 84 | do VBP I-VP O 85 | n't RB I-VP O 86 | support VB I-VP O 87 | any DT I-NP O 88 | such JJ I-NP O 89 | recommendation NN I-NP O 90 | because IN I-SBAR O 91 | we PRP I-NP O 92 | do VBP I-VP O 93 | n't RB I-VP O 94 | see VB I-VP O 95 | any DT I-NP O 96 | grounds NNS I-NP O 97 | for IN I-PP O 98 | it PRP I-NP O 99 | , , O O 100 | " " O O 101 | the DT I-NP O 102 | Commission NNP I-NP I-ORG 103 | 's POS B-NP O 104 | chief JJ I-NP O 105 | spokesman NN I-NP O 106 | Nikolaus NNP I-NP I-PER 107 | van NNP I-NP I-PER 108 | der FW I-NP I-PER 109 | Pas NNP I-NP I-PER 110 | told VBD I-VP O 111 | a DT I-NP O 112 | news NN I-NP O 113 | briefing NN I-NP O 114 | . . O O 115 | 116 | He PRP I-NP O 117 | said VBD I-VP O 118 | further JJ I-NP O 119 | scientific JJ I-NP O 120 | study NN I-NP O 121 | was VBD I-VP O 122 | required VBN I-VP O 123 | and CC O O 124 | if IN I-SBAR O 125 | it PRP I-NP O 126 | was VBD I-VP O 127 | found VBN I-VP O 128 | that IN I-SBAR O 129 | action NN I-NP O 130 | was VBD I-VP O 131 | needed VBN I-VP O 132 | it PRP I-NP O 133 | should MD I-VP O 134 | be VB I-VP O 135 | taken VBN I-VP O 136 | by IN I-PP O 137 | the DT I-NP O 138 | European NNP I-NP I-ORG 139 | Union NNP I-NP I-ORG 140 | . . O O 141 | 142 | He PRP I-NP O 143 | said VBD I-VP O 144 | a DT I-NP O 145 | proposal NN I-NP O 146 | last JJ B-NP O 147 | month NN I-NP O 148 | by IN I-PP O 149 | EU NNP I-NP I-ORG 150 | Farm NNP I-NP O 151 | Commissioner NNP I-NP O 152 | Franz NNP I-NP I-PER 153 | Fischler NNP I-NP I-PER 154 | to TO I-VP O 155 | ban VB I-VP O 156 | sheep NN I-NP O 157 | brains NNS I-NP O 158 | , , O O 159 | spleens NNS I-NP O 160 | and CC O O 161 | spinal JJ I-NP O 162 | cords NNS I-NP O 163 | from IN I-PP O 164 | the DT I-NP O 165 | human NN I-NP O 166 | and CC I-NP O 167 | animal NN I-NP O 168 | food NN I-NP O 169 | chains NNS I-NP O 170 | was VBD I-VP O 171 | a DT I-NP O 172 | highly RB I-NP O 173 | specific JJ I-ADJP O 174 | and CC I-ADJP O 175 | precautionary JJ I-ADJP O 176 | move NN I-NP O 177 | to TO I-VP O 178 | protect VB I-VP O 179 | human JJ I-NP O 180 | health NN I-NP O 181 | . . O O 182 | 183 | 184 | On IN I-PP O 185 | Monday NNP I-NP O 186 | , , O O 187 | both DT I-NP O 188 | houses NNS I-NP O 189 | of IN I-PP O 190 | India NNP I-NP I-LOC 191 | 's POS B-NP O 192 | parliament NN I-NP O 193 | wished VBD I-VP O 194 | the DT I-NP O 195 | nation NN I-NP O 196 | 's POS B-NP O 197 | adopted VBN I-NP O 198 | sister NN I-NP O 199 | a DT B-NP O 200 | happy JJ I-NP O 201 | birthday NN I-NP O 202 | and CC O O 203 | speedy JJ I-NP O 204 | recovery NN I-NP O 205 | from IN I-PP O 206 | her PRP$ I-NP O 207 | illness NN I-NP O 208 | . . 
O O 209 | -------------------------------------------------------------------------------- /vecto/benchmarks/visualize.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import pandas 5 | from pandas.io.json import json_normalize 6 | from vecto.utils.data import load_json 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | logging.basicConfig(level="DEBUG") 11 | 12 | 13 | # def clean_dic(data): 14 | # data_clean = {} 15 | # data_clean["task"] = data["experiment_setup"]["task"] 16 | # data_clean["embeddings"] = data["experiment_setup"]["embeddings"]["name"] 17 | # default_measurement = "accuracy" 18 | # if "default_measurement" in data["experiment_setup"]: 19 | # default_measurement = data["experiment_setup"]["default_measurement"] 20 | # else: 21 | # logger.warning(f"default_measurement not specified in ") 22 | # data_clean["result"] = data["result"][default_measurement] 23 | # return data_clean 24 | 25 | 26 | # def df_from_file_bak(path): 27 | # logger.debug(f"processing {path}") 28 | # data = load_json(path) 29 | # data_clean = [clean_dic(x) for x in data] 30 | # # meta = [["experiment_setup", "task"], 31 | # # ["experiment_setup", "subcategory"], 32 | # # ["experiment_setup", "method"], 33 | # # ["experiment_setup", "embeddings"]] 34 | # dframe = json_normalize(data_clean) 35 | # #if "details" in dframe: 36 | # #dframe.drop("details", axis="columns", inplace=True) 37 | # # default_measurement = "accuracy" 38 | # # try: 39 | # # # TODO: check if default measurement is same for all experiments 40 | # # default_measurement = dframe["experiment_setup.default_measurement"].unique()[0] 41 | # # except KeyError: 42 | # # logger.warning(f"default_measurement not specified in {path}") 43 | # # dframe["result"] = dframe["result." + default_measurement] 44 | # # df["reciprocal_rank"] = 1 / (df["rank"] + 1) 45 | # return dframe 46 | 47 | def df_from_file(path): 48 | data = load_json(path) 49 | # meta = [["experiment_setup", "task"], 50 | # ["experiment_setup", "subcategory"], 51 | # ["experiment_setup", "method"], 52 | # ["experiment_setup", "embeddings"]] 53 | dframe = json_normalize(data) 54 | if "details" in dframe: 55 | dframe.drop("details", axis="columns", inplace=True) 56 | default_measurement = "accuracy" 57 | try: 58 | default_measurement = dframe["experiment_setup.default_measurement"].unique()[0] 59 | except KeyError: 60 | logger.warning(f"default_measurement not specified in {path}") 61 | dframe["result"] = dframe["result." 
+ default_measurement] 62 | # df["reciprocal_rank"] = 1 / (df["rank"] + 1) 63 | return dframe 64 | 65 | 66 | def df_from_dir(path): 67 | dfs = [] 68 | for (dirpath, _, filenames) in os.walk(path): 69 | for filename in filenames: 70 | if filename.endswith(".json"): 71 | full_path = os.path.join(dirpath, filename) 72 | try: 73 | dfs.append(df_from_file(full_path)) 74 | except KeyError: 75 | logger.warning(f"error reading {full_path}") 76 | dframe = pandas.concat(dfs, sort=True) 77 | # print(dframe["experiment_setup.task"]) 78 | return dframe 79 | 80 | 81 | def get_filtered_dataframe(path, key_primary, key_secondary="experiment_setup.subcategory"): 82 | df = df_from_dir(path) 83 | print(df) 84 | groupby_items = [key_secondary, key_primary] 85 | 86 | group = df.groupby(groupby_items) 87 | means = group.mean() 88 | means.reset_index(inplace=True) 89 | means = means.loc[:, groupby_items + ["result"]] 90 | # means = pandas.concat((means, means)) 91 | unstacked = means.groupby(groupby_items)['result'].aggregate('first').unstack() 92 | return unstacked 93 | 94 | 95 | def plot_accuracy(path, key_primary="experiment_setup.method", 96 | key_secondary="experiment_setup.subcategory"): 97 | unstacked = get_filtered_dataframe(path, key_primary, key_secondary) 98 | print(unstacked) 99 | unstacked.plot.bar(rot=0) 100 | 101 | 102 | if __name__ == "__main__": 103 | plot_accuracy(sys.argv[1], 104 | key_primary="experiment_setup.task", 105 | key_secondary="experiment_setup.embeddings.name") 106 | from matplotlib import pyplot as plt 107 | plt.savefig("results.pdf", bbox_inches="tight") 108 | -------------------------------------------------------------------------------- /tests/data/benchmarks/text_classification/train: -------------------------------------------------------------------------------- 1 | 0 i like my christmas movies with more elves and snow and less pimps and ho's apple banana fast. 2 | 0 . . . liotta is put in an impossible spot because his character's deceptions ultimately undo him and the believability of the entire scenario . too bad tiger cat. 3 | 0 what can one say about a balding 50-year-old actor playing an innocent boy carved from a log fast ? 4 | 0 normally , rohmer's talky films fascinate me , but when he moves his setting to the past , and relies on a historical text , he loses the richness of characterization that makes his films so memorable . 5 | 0 some decent actors inflict big damage upon their reputations . 6 | 0 the director seems to take an unseemly pleasure in [the characters'] misery and at the same time to congratulate himself for having the guts to confront it . 7 | 0 the parts are better than the whole ( bizarre , funny , tragic - like love in new york ) . 8 | 0 on a cutting room floor somewhere lies . . . footage that might have made no such thing a trenchant , ironic cultural satire instead of a frustrating misfire . 9 | 0 more of an intriguing curiosity than a gripping thriller . 10 | 0 a baffling subplot involving smuggling drugs inside danish cows falls flat , and if you're going to alter the bard's ending , you'd better have a good alternative . 11 | 0 another week , another gross-out college comedy--ugh . 12 | 0 shunji iwai's all about lily chou chou is a beautifully shot , but ultimately flawed film about growing up in japan . 13 | 0 at 90 minutes this movie is short , but it feels much longer . 14 | 0 what will , most likely , turn out to be the most repellent movie of 2002 . 
15 | 0 when the precise nature of matthew's predicament finally comes into sharp focus , the revelation fails to justify the build-up . 16 | 0 the characters are paper thin and the plot is so cliched and contrived that it makes your least favorite james bond movie seem as cleverly plotted as the usual suspects . 17 | 0 the redeeming feature of chan's films has always been the action , but the stunts in the tuxedo seem tired and , what's worse , routine . 18 | 0 it is bad , but certainly not without merit as entertainment . 19 | 0 merely ( and literally ) tosses around sex toys and offers half-hearted paeans to empowerment that are repeatedly undercut by the brutality of the jokes , most at women's expense . 20 | 0 a sour , nasty offering . 21 | 0 given that both movies expect us to root for convicted violent felons over those assigned to protect us from same , we need every bit of sympathy the cons can muster ; this time , there isn't much . 22 | 1 death to smoochy is often very funny , but what's even more remarkable is the integrity of devito's misanthropic vision . 23 | 1 [reno] delivers a monologue that manages to incorporate both the horror and the absurdity of the situation in a well-balanced fashion . 24 | 1 an awfully good , achingly human picture . 25 | 1 the cast is top-notch and i predict there will be plenty of female audience members drooling over michael idemoto as michael . 26 | 1 the story ultimately takes hold and grips hard . 27 | 1 unfortunately , it appears that [jackie] chan's us influence is starting to show in his hong kong films . 28 | 1 it offers a glimpse of the solomonic decision facing jewish parents in those turbulent times : to save their children and yet to lose them . 29 | 1 in the pianist , polanski is saying what he has long wanted to say , confronting the roots of his own preoccupations and obsessions , and he allows nothing to get in the way . 30 | 1 offers big , fat , dumb laughs that may make you hate yourself for giving in . ah , what the hell . 31 | 1 the whole is quite entertaining , but despite its virtues , there is an unsettled feeling to the film . 32 | 1 though it runs 163 minutes , safe conduct is anything but languorous . it's packed to bursting with incident , and with scores of characters , some fictional , some from history . 33 | 1 this is christmas future for a lot of baby boomers . 34 | 1 niccol the filmmaker merges his collaborators' symbolic images with his words , insinuating , for example , that in hollywood , only god speaks to the press 35 | 1 ranks among willams' best screen work . 36 | 1 a touching drama about old age and grief with a tour de force performance by michel piccoli . 37 | 1 feardotcom's thrills are all cheap , but they mostly work . 38 | 1 if you can stomach the rough content , it's worth checking out for the performances alone . 39 | 1 it's a feel-good movie about which you can actually feel good . 40 | 1 'they' begins and ends with scenes so terrifying i'm still stunned . and i've decided to leave a light on every night from now on . 41 | 1 the visuals alone make metropolis worth seeing . 42 | 1 it's mostly a pleasure to watch . and the reason for that is a self-aware , often self-mocking , intelligence . 43 | 1 a slight but sweet film . 
44 | -------------------------------------------------------------------------------- /vecto/corpus/tokenization.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import re 3 | import abc 4 | 5 | from vecto.utils.metadata import WithMetaData, get_full_typename 6 | 7 | # TODO: ckeck id the data is there 8 | nltk.download('punkt', quiet=True) 9 | nltk.download('stopwords', quiet=True) 10 | 11 | _DEFAULT_WORD_SPLITTER = nltk.tokenize.WordPunctTokenizer().tokenize 12 | _WHITESPACE_TOKEN_SPLITTER = re.compile(r'[^\s]+').findall 13 | 14 | # we should not probably do it on module level 15 | _SENT_SPLITTER_IMPL = nltk.data.load('tokenizers/punkt/english.pickle').tokenize 16 | 17 | DEFAULT_GOOD_TOKEN_RE = re.compile(r'^\w+$') 18 | ANY_TOKEN_IS_GOOD_RE = re.compile(r'.*') 19 | 20 | # TODO: moved from corpus, rename and use or remove 21 | _default_tokenizer_patter = r"[\w\-']+|[.,!?…]" 22 | 23 | 24 | def default_token_normalizer(token): 25 | return token.lower() 26 | 27 | 28 | def word_tokenize_txt(txt, 29 | token_splitter=_DEFAULT_WORD_SPLITTER, 30 | token_normalizer=default_token_normalizer, 31 | good_token_re=DEFAULT_GOOD_TOKEN_RE, 32 | min_token_len=1, 33 | stopwords=[]): 34 | # stopwords = nltk.corpus.stopwords.words('english') 35 | norm_tokens = map(token_normalizer, token_splitter(txt)) 36 | return [token for token in norm_tokens 37 | if len(token) >= min_token_len and 38 | token not in stopwords] 39 | # and good_token_re.match(token)] 40 | 41 | 42 | class BaseTokenizer(WithMetaData): 43 | """ 44 | Base class for all tokenizer. It's a simple callable (functor) with metadata management infrastructure. 45 | """ 46 | 47 | @abc.abstractmethod 48 | def __call__(self, txt): 49 | ''' 50 | :param txt: text to tokenize 51 | :return: list of lists of tokens 52 | ''' 53 | pass 54 | 55 | 56 | class Tokenizer(BaseTokenizer): 57 | """ 58 | Tokenizes text, normalizes each token with `token_normalizer`, 59 | filters tokens by length and regex `good_token_re`. 60 | Returns a list with the only element: list of tokens. 61 | This nesting is necessary to unify output with SentenceTokenizer, 62 | which returns list of sentences (each is a list of tokens). 
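    Example (an illustrative sketch only; the exact tokens returned depend on the
    configured splitter and stopword list)::

        tokenizer = Tokenizer()
        tokenized = tokenizer("Mad cow disease can be transmitted to sheep.")
        # tokenized is a single-element list: [[token, token, ...]]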
63 | """ 64 | 65 | def __init__(self, 66 | token_splitter=_DEFAULT_WORD_SPLITTER, 67 | token_normalizer=default_token_normalizer, 68 | good_token_re=DEFAULT_GOOD_TOKEN_RE, 69 | min_token_len=1, 70 | stopwords=nltk.corpus.stopwords.words('english')): 71 | # TODO: decide how to save stopwords to metadata 72 | super().__init__(normalizer=get_full_typename(token_normalizer), 73 | good_token_re=good_token_re.pattern, 74 | min_token_len=min_token_len, 75 | stopwords='too long to be saved to metadata') 76 | self.token_splitter = token_splitter 77 | self.token_normalizer = token_normalizer 78 | self.good_token_re = good_token_re 79 | self.min_token_len = min_token_len 80 | self.stopwords = stopwords 81 | 82 | def __call__(self, txt): 83 | return [word_tokenize_txt(txt, 84 | self.token_splitter, 85 | self.token_normalizer, 86 | self.good_token_re, 87 | self.min_token_len, 88 | self.stopwords)] 89 | 90 | 91 | DEFAULT_TOKENIZER = Tokenizer() 92 | 93 | ANNOTATED_TEXT_TOKENIZER = Tokenizer(token_splitter=_WHITESPACE_TOKEN_SPLITTER, 94 | good_token_re=ANY_TOKEN_IS_GOOD_RE, 95 | min_token_len=0) 96 | 97 | DEFAULT_JAP_TOKENIZER = Tokenizer(min_token_len=0) 98 | 99 | 100 | class SentenceTokenizer(BaseTokenizer): 101 | """ 102 | Splits text into sentences, tokenizes each sentence, normalizes each token with `token_normalizer`, 103 | filters tokens by length and regex `good_token_re`. 104 | Returns a list of sentences (each is a list of tokens). 105 | """ 106 | 107 | def __init__(self, 108 | word_tokenizer=DEFAULT_TOKENIZER, 109 | sentence_splitter=_SENT_SPLITTER_IMPL, 110 | min_sent_words=2): 111 | super(SentenceTokenizer, self).__init__(word_tokenizer=word_tokenizer.metadata, 112 | sentence_splitter=get_full_typename(sentence_splitter), 113 | min_sent_words=min_sent_words) 114 | self.word_tokenizer = word_tokenizer 115 | self.sentence_splitter = sentence_splitter 116 | self.min_sent_words = min_sent_words 117 | 118 | def __call__(self, txt): 119 | for sent in self.sentence_splitter(txt.strip()): 120 | for sent_tokens in self.word_tokenizer(sent): 121 | if len(sent_tokens) >= self.min_sent_words: 122 | yield sent_tokens 123 | 124 | 125 | DEFAULT_SENT_TOKENIZER = SentenceTokenizer() 126 | -------------------------------------------------------------------------------- /vecto/benchmarks/relation_extraction/preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import numpy as np 4 | import gzip 5 | import os 6 | import sys 7 | import pickle as pkl 8 | 9 | 10 | 11 | 12 | 13 | #Mapping of the labels to integers 14 | labelsMapping = {'Other':0, 15 | 'Message-Topic(e1,e2)':1, 'Message-Topic(e2,e1)':2, 16 | 'Product-Producer(e1,e2)':3, 'Product-Producer(e2,e1)':4, 17 | 'Instrument-Agency(e1,e2)':5, 'Instrument-Agency(e2,e1)':6, 18 | 'Entity-Destination(e1,e2)':7, 'Entity-Destination(e2,e1)':8, 19 | 'Cause-Effect(e1,e2)':9, 'Cause-Effect(e2,e1)':10, 20 | 'Component-Whole(e1,e2)':11, 'Component-Whole(e2,e1)':12, 21 | 'Entity-Origin(e1,e2)':13, 'Entity-Origin(e2,e1)':14, 22 | 'Member-Collection(e1,e2)':15, 'Member-Collection(e2,e1)':16, 23 | 'Content-Container(e1,e2)':17, 'Content-Container(e2,e1)':18} 24 | 25 | 26 | 27 | 28 | words = {} 29 | maxSentenceLen = [0,0] 30 | 31 | 32 | distanceMapping = {'PADDING': 0, 'LowerMin': 1, 'GreaterMax': 2} 33 | minDistance = -30 34 | maxDistance = 30 35 | for dis in range(minDistance,maxDistance+1): 36 | distanceMapping[dis] = len(distanceMapping) 37 | print(distanceMapping) 38 | 39 | 40 | 
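# Note: the mapping built above assigns every relative offset in
# [minDistance, maxDistance] its own index (starting at 3), while offsets
# outside that window fall back to the 'LowerMin' / 'GreaterMax' buckets and
# index 0 is reserved for 'PADDING'. createTensor() below uses these indices
# to encode each token's distance to the two marked entities, and
# relation_extraction.py feeds them into separate position-embedding layers.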
def getWordIdx(token, word2Idx): 41 | """Returns from the word2Idex table the word index for a given token""" 42 | if token in word2Idx: 43 | return word2Idx[token] 44 | elif token.lower() in word2Idx: 45 | return word2Idx[token.lower()] 46 | return 0 47 | 48 | def createTensor(file, word2Idx, maxSentenceLen=100): 49 | """Creates matrices for the events and sentence for the given file""" 50 | labels = [] 51 | positionMatrix1 = [] 52 | positionMatrix2 = [] 53 | tokenMatrix = [] 54 | 55 | for line in open(file): 56 | splits = line.strip().split('\t') 57 | 58 | label = splits[0] 59 | pos1 = splits[1] 60 | pos2 = splits[2] 61 | sentence = splits[3] 62 | tokens = sentence.split(" ") 63 | 64 | #print(label, pos1, pos2, sentence, tokens) 65 | 66 | 67 | tokenIds = np.zeros(maxSentenceLen) 68 | positionValues1 = np.zeros(maxSentenceLen) 69 | positionValues2 = np.zeros(maxSentenceLen) 70 | 71 | for idx in range(0, min(maxSentenceLen, len(tokens))): 72 | tokenIds[idx] = getWordIdx(tokens[idx], word2Idx) 73 | 74 | distance1 = idx - int(pos1) 75 | distance2 = idx - int(pos2) 76 | #print(distance1, distance2) 77 | if distance1 in distanceMapping: 78 | #print('helo') 79 | positionValues1[idx] = distanceMapping[distance1] 80 | elif distance1 <= minDistance: 81 | positionValues1[idx] = distanceMapping['LowerMin'] 82 | else: 83 | positionValues1[idx] = distanceMapping['GreaterMax'] 84 | 85 | if distance2 in distanceMapping: 86 | positionValues2[idx] = distanceMapping[distance2] 87 | elif distance2 <= minDistance: 88 | positionValues2[idx] = distanceMapping['LowerMin'] 89 | else: 90 | positionValues2[idx] = distanceMapping['GreaterMax'] 91 | 92 | tokenMatrix.append(tokenIds) 93 | positionMatrix1.append(positionValues1) 94 | positionMatrix2.append(positionValues2) 95 | 96 | labels.append(labelsMapping[label]) 97 | 98 | 99 | 100 | return np.array(labels, dtype='int32'), np.array(tokenMatrix, dtype='int32'), np.array(positionMatrix1, dtype='int32'), np.array(positionMatrix2, dtype='int32'), 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | def load_data(embeddings, path_dataset): 109 | files = [os.path.join(path_dataset, 'train.txt'), os.path.join(path_dataset, 'test.txt')] 110 | for fileIdx in range(len(files)): 111 | file = files[fileIdx] 112 | for line in open(file): 113 | splits = line.strip().split('\t') 114 | 115 | label = splits[0] 116 | 117 | 118 | sentence = splits[3] 119 | tokens = sentence.split(" ") 120 | maxSentenceLen[fileIdx] = max(maxSentenceLen[fileIdx], len(tokens)) 121 | for token in tokens: 122 | words[token.lower()] = True 123 | 124 | 125 | print("Max Sentence Lengths: ", maxSentenceLen) 126 | 127 | # :: Read in word embeddings :: 128 | # :: Read in word embeddings :: 129 | word2Idx = embeddings.vocabulary.dic_words_ids 130 | wordEmbeddings = embeddings.matrix 131 | 132 | 133 | print("Embeddings shape: ", wordEmbeddings.shape) 134 | print("Len words: ", len(words)) 135 | 136 | 137 | 138 | # :: Create token matrix :: 139 | train_set = createTensor(files[0], word2Idx, max(maxSentenceLen)) 140 | test_set = createTensor(files[1], word2Idx, max(maxSentenceLen)) 141 | 142 | 143 | data = {'wordEmbeddings': wordEmbeddings, 'word2Idx': word2Idx, 144 | 'train_set': train_set, 'test_set': test_set} 145 | 146 | return data 147 | 148 | 149 | 150 | print("Data preprocessing done!") -------------------------------------------------------------------------------- /docs/source/tutorial/working_with_vectors.rst: -------------------------------------------------------------------------------- 1 | Basic 
operations 2 | ===================== 3 | 4 | .. currentmodule:: vecto 5 | 6 | Supported VSM formats 7 | ----------------------- 8 | 9 | At the moment the following data formats are supported: 10 | 11 | * .bin format of word2vec (the file has to be called "vectors.bin") 12 | * .npy arrays with separate vocab files 13 | * .txt plain-text vectors 14 | * sparse vectors in hp5 format 15 | 16 | :todo: fasttext .vec format? 17 | 18 | Importing vectors 19 | ------------------- 20 | 21 | Vecto assumes a one-folder-per-vsm folder structure. All files related to the same vsm - the metadata, vectors, vocab files, etc. - must all be stored in one directory. If the vector files has the correct extension (.npy, .txt, .bin, .hp5), the library will attempt to "guess" the correct module to load it with. 22 | 23 | >>> import vecto 24 | >>> path_to_vsm = "/path/to/your/model" 25 | >>> my_vsm = vecto.model.load_from_dir(path_to_vsm) 26 | 27 | The name of the model is the name directory in which the vector files are stored. For models generated with Vecto, interpretable folder names with parameters are generated automatically. 28 | 29 | >>> print(my_vsm.name) 30 | w2v_comb2_w8_n25_i6_d300_skip_300 31 | 32 | You can access the VSM metadata (recorded in metadata.json file located in the same directory as the VSM) as a Python dictionary: 33 | 34 | >>> print(my_vsm.metadata) 35 | {'size_dimensions': 300, 'dimensions': 300, 'size_window': '8'} 36 | 37 | Getting top similar neighbors of a word 38 | --------------------------------------- 39 | 40 | >>> my_vsm.get_most_similar_words("apple", cnt=5) 41 | [['apple', 1.0000000999898755], 42 | ['fruit', 0.61400752577032369], 43 | ['banana', 0.58657183882050712], 44 | ['plum', 0.5850951585421692], 45 | ['apples', 0.58464719369713347]] 46 | 47 | This method takes an optional ``cnt`` argument specifying how many top similar neighbors to output (the default is 10). Note that the top similar vector is always the target word itself. 48 | 49 | If you need to compute nearest neighbors for many words, this function works 50 | faster if the VSM is normalized. If it was generated with vecto, the 51 | normalization will be recorded in metadata, and can be checked with `:meth: 52 | .normalized()` method. Vecto will automatically check for normalization and use 53 | the faster routine if possible. If not, you can first normalize your model as 54 | follows: 55 | 56 | >>> my_embeddings.normalize() 57 | 58 | Please note that this changes the original embeddings, and to reverse this 59 | operation you will have to re-load them. 60 | 61 | If you're going to use the same normalized model several times, you can 62 | avoid re-doing the normalization with: 63 | 64 | >>> my_embeddings.cache_normalized_copy() 65 | 66 | In this case the original embeddings remain unchanged, but the neighbor 67 | retrieval will be performed with the cached normalized version. Please note 68 | that this will use additional memory. 69 | 70 | `.get_most_similar_vectors()` enables you to do the same as ``.get_most_similar_words()``, but searching the top neighbors by the vector representation rather than its label. 71 | 72 | Note: 73 | 74 | The speed of vector neighborhood computation depends on whether your numpy 75 | package has access 76 | to the right linear algebra library - MKL, OpenBLAS or whatever is available 77 | for your system. With the OpenBLAS and 4 Ghz Core i7-6700K processor in Ubuntu we're 78 | processing 900 words for 300K 500-dimensional embeddings in under three 79 | minutes. 
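As a rough sanity check you can time the retrieval yourself. The snippet below
is only a sketch and assumes that ``my_vsm`` has already been loaded (and,
ideally, normalized) as described above:

>>> import time
>>> words = ["apple", "banana", "cat"]
>>> start = time.time()
>>> for word in words:
...     neighbors = my_vsm.get_most_similar_words(word, cnt=10)
>>> print("seconds per word:", (time.time() - start) / len(words))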
80 | 81 | If you do have the library, but the neighbor extraction is still slow, 82 | check if it is actually used by numpy. This can be done as 83 | follows: 84 | 85 | >>> import numpy as np 86 | >>> np.show_config() 87 | 88 | 89 | 90 | Words to vectors and back 91 | ------------------------- 92 | 93 | First, you need to import your model from a directory that holds only that model (.npy, .bin, .hp5 or .txt formats) and any associated files. 94 | 95 | getting the vector representation of a word 96 | 97 | >>> my_vsm.get_row("apple") 98 | array([-0.17980662, 0.27027196, -0.33250481, ... -0.22577444], dtype=float32) 99 | 100 | You can use the above top-similar function to get the label of the vector most corresponding to your vector in your VSM vocabulary: 101 | 102 | >>> vsm.get_most_similar_vectors(vsm.get_row("apple")) 103 | 104 | Filtering the vocabulary of a VSM 105 | --------------------------------- 106 | 107 | In certain cases it may be useful to filter the vocabulary of a pre-trained VSM, e.g. to ensure that two models you are comparing have the same vocabulary. Vecto provides a ``.filter_by_vocab()`` method that returns a new model instance, the vocabulary of which contains only the words in the provided Python list of words. The list can be empty. 108 | 109 | >>> my_vsm.get_most_similar_words("cat", cnt=5) 110 | [['cat', 1.0], 111 | ['monkey', 0.95726192], 112 | ['dog', 0.95372206], 113 | ['koala', 0.94773519], 114 | ['puppy', 0.94360757]] 115 | >>> my_new_vsm = my_vsm.filter_by_vocab(["dog", "hotdog", "zoo", "hammer", "cat"]) 116 | >>> my_new_vsm.get_most_similar_words("cat", cnt=5) 117 | [['cat', 1.0], 118 | ['dog', 0.95372206], 119 | ['hotdog', 0.84262532], 120 | ['hammer', 0.80627602], 121 | ['zoo', 0.7463485]] 122 | -------------------------------------------------------------------------------- /vecto/benchmarks/synonymy_detection/synonymy_detection.py: -------------------------------------------------------------------------------- 1 | from ..base import Benchmark 2 | from collections import defaultdict 3 | from os import path, listdir 4 | import csv 5 | import numpy as np 6 | from scipy.spatial import distance 7 | 8 | OTHER_EXT = 'None' 9 | BENCHMARK = 'benchmark' 10 | 11 | 12 | class SynonymyDetection(Benchmark): 13 | def __init__(self, normalize=True, 14 | ignore_oov=True, 15 | do_top5=True, 16 | need_subsample=False, 17 | size_cv_test=1, 18 | set_aprimes_test=None, 19 | inverse_regularization_strength=1.0, 20 | exclude=True): 21 | self.normalize = normalize 22 | self.ignore_oov = ignore_oov 23 | self.do_top5 = do_top5 24 | self.need_subsample = need_subsample 25 | self.normalize = normalize 26 | self.size_cv_test = size_cv_test 27 | self.set_aprimes_test = set_aprimes_test 28 | self.inverse_regularization_strength = inverse_regularization_strength 29 | self.exclude = exclude 30 | 31 | self.stats = {} 32 | self.cnt_total_correct = 0 33 | self.cnt_total_total = 0 34 | 35 | # this are some hard-coded bits which will be implemented later 36 | self.result_miss = { 37 | 'rank': -1, 38 | 'reason': 'missing words' 39 | } 40 | 41 | @property 42 | def method(self): 43 | return type(self).__name__ 44 | 45 | @classmethod 46 | def read_test_set(self, path): 47 | data = defaultdict(lambda: []) 48 | if path.endswith('.csv'): 49 | with open(path, 'r') as csvfile: 50 | reader = csv.reader(csvfile) 51 | head = True 52 | for row in reader: 53 | if len(row) < 3: 54 | continue 55 | if not head: 56 | target_word = row[1] 57 | word = row[2] 58 | is_synonym = row[3] 59 | 
data[target_word].append([word, is_synonym]) 60 | head = False 61 | else: 62 | with open(path) as f: 63 | for line in f: 64 | _, target_word, word, is_synonym = line.strip().split() 65 | data[target_word].append([word, is_synonym]) 66 | return dict(data) 67 | 68 | def collect_stats(self, embs, data): 69 | corrected_data = defaultdict(lambda: []) 70 | for word, suspicious_words in data.items(): 71 | if not embs.has_word(word): 72 | continue 73 | for susp_word, is_synonym in suspicious_words: 74 | if embs.has_word(susp_word): 75 | corrected_data[word].append([susp_word, is_synonym]) 76 | result = self.run_synonym_finding(embs, dict(corrected_data)) 77 | return result 78 | 79 | def evaluate(self, embs, data): 80 | result = self.collect_stats(embs, data) 81 | return result 82 | 83 | def read_datasets_from_dir(self, path_to_dir): 84 | datasets = defaultdict(lambda: {}) 85 | for file in listdir(path_to_dir): 86 | dataset_name, dataset_data = self.read_single_dataset(path_to_dir, file) 87 | if type != OTHER_EXT: 88 | datasets[dataset_name] = dataset_data 89 | return datasets 90 | 91 | def read_single_dataset(self, path_to_dir, file_name): 92 | dataset_name, _ = path.splitext(file_name) 93 | data = self.read_test_set(path.join(path_to_dir, file_name)) 94 | return dataset_name, data 95 | 96 | def run(self, embeds, path_dataset): 97 | results = defaultdict(lambda: {}) 98 | datasets = self.read_datasets_from_dir(path_dataset) 99 | for dataset_name, dataset_data in datasets.items(): 100 | result = self.evaluate(embeds, dataset_data) 101 | results[dataset_name] = result 102 | return dict(results) 103 | 104 | def get_result(self, embeds, path_dataset): 105 | if self.normalize: 106 | embeds.normalize() 107 | 108 | results = self.run(embeds, path_dataset) 109 | return results 110 | 111 | 112 | class CosineDistance(SynonymyDetection): 113 | @classmethod 114 | def run_synonym_finding(self, embs, data): 115 | result = defaultdict(lambda: {}) 116 | for word, suspicious_words in data.items(): 117 | distances = [] 118 | for susp_word, _ in suspicious_words: 119 | distances.append(1 - distance.cosine(embs.get_vector(susp_word), embs.get_vector(word))) 120 | guessed_word_index = distances.index(np.min(distances)) 121 | results_for_word = [] 122 | for dist_id, cosine_distance in enumerate(distances): 123 | d = {} 124 | d['suspicious_word'] = suspicious_words[dist_id][0] 125 | d['is_synonym'] = suspicious_words[dist_id][1] 126 | if dist_id == guessed_word_index: 127 | d['hit'] = True 128 | else: 129 | d['hit'] = False 130 | d['distance'] = cosine_distance 131 | results_for_word.append(d) 132 | result[word] = results_for_word 133 | return dict(result) 134 | -------------------------------------------------------------------------------- /tests/test_training.py: -------------------------------------------------------------------------------- 1 | """Tests for embeddings module.""" 2 | 3 | import unittest 4 | import io 5 | import contextlib 6 | import sys 7 | import runpy 8 | import os 9 | 10 | 11 | def run_module(name: str, args, run_name: str = '__main__') -> None: 12 | backup_sys_argv = sys.argv 13 | sys.argv = [name + '.py'] + list(args) 14 | runpy.run_module(name, run_name=run_name) 15 | sys.argv = backup_sys_argv 16 | 17 | 18 | class Tests(unittest.TestCase): 19 | 20 | # def test_train_word2vec(self): 21 | # path_corpus = "./tests/data/corpora/plain/" 22 | # sio = io.StringIO() 23 | # with contextlib.redirect_stderr(sio): 24 | # run_module('vecto.embeddings.train_word2vec', 25 | # ['--path_corpus', path_corpus, 
'--path_out', '/tmp/vecto/embeddings/', '--out_type', 'ns']) 26 | # run_module('vecto.embeddings.train_word2vec', 27 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--out_type', 'hsm']) 28 | # run_module('vecto.embeddings.train_word2vec', 29 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--out_type', 'original']) 30 | # run_module('vecto.embeddings.train_word2vec', 31 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--out_type', 'ns', 32 | # '--model', 'cbow']) 33 | # with self.assertRaises(RuntimeError): 34 | # run_module('vecto.embeddings.train_word2vec', 35 | # ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 36 | # '--out_type', 'ns', 37 | # '--model', 'cbow']) 38 | 39 | # @unittest.skipIf(os.environ.get('APPVEYOR'), 'skipping Appveyor due to memory error') 40 | # def test_train_word2vec_subword_cnn1d(self): 41 | # path_corpus = "./tests/data/corpora/plain/" 42 | # run_module('vecto.embeddings.train_word2vec', 43 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 44 | # '--subword', 'cnn1d']) 45 | # with self.assertRaises(RuntimeError): 46 | # run_module('vecto.embeddings.train_word2vec', 47 | # ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 48 | # '--dimension', '5', 49 | # '--subword', 'cnn1d']) 50 | 51 | def test_train_word2vec_subword(self): 52 | path_corpus = "./tests/data/corpora/plain/" 53 | path_vocab = "./tests/data/vocabs/plain/" 54 | sio = io.StringIO() 55 | with contextlib.redirect_stderr(sio): 56 | run_module('vecto.embeddings.train_word2vec', 57 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 58 | '--subword', 'cnn1d_small']) 59 | 60 | run_module('vecto.embeddings.train_word2vec', 61 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 62 | '--subword', 'bilstm']) 63 | run_module('vecto.embeddings.train_word2vec', 64 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 65 | '--subword', 'sum']) 66 | run_module('vecto.embeddings.train_word2vec', 67 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 68 | '--subword', '_none', '--path_vocab', path_vocab]) 69 | run_module('vecto.embeddings.train_word2vec', 70 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 71 | '--subword', 'bilstm_sum']) 72 | with self.assertRaises(RuntimeError): 73 | run_module('vecto.embeddings.train_word2vec', 74 | ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 75 | '--dimension', '5', 76 | '--subword', 'bilstm_sum']) 77 | 78 | def test_train_word2vec_subword_jap(self): 79 | path_corpus = "./tests/data/corpora/jap/tokenized/" 80 | path_word2chars = "./tests/data/corpora/jap/char2radical/char2radical.txt" 81 | sio = io.StringIO() 82 | with contextlib.redirect_stderr(sio): 83 | run_module('vecto.embeddings.train_word2vec', 84 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 85 | '--subword', 'sum', '--language', 'jap', '--min_gram', '1', '--max_gram', '1']) 86 | run_module('vecto.embeddings.train_word2vec', 87 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 88 | '--subword', 'sum', '--language', 'jap', '--min_gram', '1', '--max_gram', '1', 89 | '--path_word2chars', 
path_word2chars]) 90 | 91 | with self.assertRaises(RuntimeError): 92 | run_module('vecto.embeddings.train_word2vec', 93 | ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 94 | '--dimension', '5', 95 | '--subword', 'sum', '--language', 'jap', '--min_gram', '1', '--max_gram', '1']) 96 | -------------------------------------------------------------------------------- /vecto/data/base.py: -------------------------------------------------------------------------------- 1 | 2 | import fnmatch 3 | import os 4 | from pathlib import Path 5 | import tarfile 6 | from zipfile import ZipFile 7 | import logging 8 | import tempfile 9 | import shutil 10 | # from vecto.config import load_config 11 | from vecto.utils.metadata import WithMetaData 12 | from vecto.utils.data import load_json 13 | from .io import fetch_file, read_first_col_is_label_format, read_tsv_label_last 14 | 15 | logger = logging.getLogger(__name__) 16 | # TODO: make config module-global 17 | # config = load_config() 18 | # TODO: get dataset dir from config 19 | # TODO: use pathlib everywhere 20 | dir_datasets = os.path.expanduser("~/.vecto/datasets") 21 | dir_temp = os.path.join(tempfile.gettempdir(), "vecto", "tmp") 22 | os.makedirs(dir_datasets, exist_ok=True) 23 | os.makedirs(dir_temp, exist_ok=True) 24 | resources = {} 25 | 26 | 27 | class Dataset(WithMetaData): 28 | """ 29 | Container class for stock datasets. 30 | Arguments: 31 | path (str): local path to place files 32 | """ 33 | 34 | def __init__(self, path): 35 | if not os.path.exists(path): 36 | raise FileNotFoundError("test dataset dir does not exist:" + path) 37 | super().__init__(path) 38 | self.path = path 39 | if "name" not in self.metadata: 40 | # TODO: use proper path magic 41 | self.metadata["name"] = path.split("/")[-1] 42 | 43 | def file_iterator(self): 44 | for root, _, filenames in os.walk(self.path): 45 | for filename in fnmatch.filter(sorted(filenames), '*'): 46 | if filename.endswith('json'): 47 | continue 48 | yield(os.path.join(root, filename)) 49 | 50 | def _load_tsv(self, names): 51 | # TODO: decide what to do with char_basrd 52 | char_based = False 53 | for candidate_name in names: 54 | path_full_candidate = os.path.join(self.path, candidate_name) 55 | print(path_full_candidate) 56 | if os.path.isfile(path_full_candidate): 57 | # train = read_first_col_is_label_format(path_full_candidate, char_based=char_based) 58 | train = read_first_col_is_label_format(path_full_candidate) 59 | return train 60 | # test = read_first_col_is_label_format(os.path.join(self.path, 'test'), 61 | # char_based=char_based) 62 | raise RuntimeError("can not find dataset") 63 | 64 | def get_train(self): 65 | return self._load_tsv(["train", "train.tsv"]) 66 | 67 | def get_test(self): 68 | return self._load_tsv(["dev", "dev.tsv", "test", "test.tsv"]) 69 | 70 | 71 | def download_index(): 72 | logger.info("downloading index of resources") 73 | path_tar = os.path.join(dir_temp, "resources.tar") 74 | url_resources = "https://github.com/vecto-ai/vecto-resources/tarball/master/" 75 | fetch_file(url_resources, path_tar) 76 | with tarfile.open(path_tar) as tar: 77 | for member in tar.getmembers(): 78 | parts = member.name.split("/") 79 | if len(parts) <= 1: 80 | continue 81 | if parts[1] != "resources": 82 | continue 83 | member.path = os.path.join(*parts[1:]) 84 | tar.extract(member, dir_datasets) 85 | 86 | 87 | def gen_metadata_snippets(path): 88 | for sub in path.iterdir(): 89 | if sub.name == "metadata.json": 90 | yield sub 91 | else: 92 | if 
sub.is_dir(): 93 | yield from gen_metadata_snippets(sub) 94 | 95 | 96 | def load_dataset_infos(): 97 | for f_meta in gen_metadata_snippets(Path(dir_datasets)): 98 | # print("visiting", f_meta.parent) 99 | metadata = load_json(f_meta) 100 | if "name" in metadata: 101 | metadata["local_path"] = f_meta.parent 102 | resources[metadata["name"]] = metadata 103 | 104 | 105 | def download_dataset_by_name(name, path_dataset): 106 | filename = resources[name]["url"].split("/")[-1] 107 | logger.debug("downloading ", filename) 108 | path_download_archive = Path(dir_temp) / filename 109 | if "url" not in resources[name]: 110 | raise RuntimeError(f"no URL to download dataset {name}") 111 | fetch_file(resources[name]["url"], path_download_archive) 112 | path_extracted = Path(dir_temp) / name 113 | with ZipFile(path_download_archive) as z: 114 | z.extractall(path_extracted) 115 | # TODO: make sure this returns topmost entry from the tree 116 | first_metadata_path = next(gen_metadata_snippets(path_extracted)).parent 117 | # print(first_metadata_path) 118 | for f in first_metadata_path.iterdir(): 119 | if not (path_dataset / f.name).exists(): 120 | shutil.move(str(f), str(path_dataset)) 121 | 122 | 123 | def is_dataset_downloaded(path_dataset): 124 | for f in path_dataset.iterdir(): 125 | if f.name.endswith("metadata.json"): 126 | continue 127 | return True 128 | return False 129 | 130 | 131 | def get_dataset_by_name(name): 132 | load_dataset_infos() 133 | if not resources: 134 | logger.info("index not found, forcing download") 135 | download_index() 136 | load_dataset_infos() 137 | # print(resources) 138 | if name in resources: 139 | path_dataset = resources[name]["local_path"] 140 | else: 141 | raise RuntimeError("Dataset %s not known" % name) 142 | if not is_dataset_downloaded(path_dataset): 143 | logger.info("only metadata is present, need to download") 144 | download_dataset_by_name(name, path_dataset) 145 | dataset = Dataset(path_dataset) 146 | return dataset 147 | -------------------------------------------------------------------------------- /vecto/benchmarks/relation_extraction/relation_extraction.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | np.random.seed(1337) # for reproducibility 5 | import gzip 6 | import sys 7 | import pickle as pkl 8 | from .preprocess import load_data 9 | from ..base import Benchmark 10 | import os 11 | 12 | from keras.models import Model 13 | from keras.layers import Input, Dense, Dropout, concatenate 14 | from keras.layers import Embedding 15 | from keras.layers import Convolution1D, GlobalMaxPooling1D 16 | 17 | 18 | # Function to calculate the precision 19 | def getPrecision(pred_test, yTest, targetLabel): 20 | # Precision for non-vague 21 | targetLabelCount = 0 22 | correctTargetLabelCount = 0 23 | 24 | for idx, prediction in enumerate(pred_test): 25 | if prediction == targetLabel: 26 | targetLabelCount += 1 27 | 28 | if prediction == yTest[idx]: 29 | correctTargetLabelCount += 1 30 | 31 | if correctTargetLabelCount == 0: 32 | return 0 33 | 34 | return float(correctTargetLabelCount) / targetLabelCount 35 | 36 | class Relation_extraction(Benchmark): 37 | 38 | def __init__(self, batchsize=16, nb_filter=100, filter_length=3, hidden_dims=100, epoch=1, position_dims=50): 39 | self.batchsize = batchsize 40 | self.nb_filter = nb_filter 41 | self.filter_length = filter_length 42 | self.hidden_dims = hidden_dims 43 | self.epoch = epoch 44 | self.position_dims = 
position_dims 45 | 46 | def run(self, embeddings, dataset): 47 | print("Load dataset") 48 | path_dataset = dataset.path 49 | data = load_data(embeddings, path_dataset) 50 | 51 | yTrain, sentenceTrain, positionTrain1, positionTrain2 = data['train_set'] 52 | yTest, sentenceTest, positionTest1, positionTest2 = data['test_set'] 53 | 54 | max_position = max(np.max(positionTrain1), np.max(positionTrain2)) + 1 55 | 56 | n_out = max(yTrain) + 1 57 | # train_y_cat = np_utils.to_categorical(yTrain, n_out) 58 | max_sentence_len = sentenceTrain.shape[1] 59 | 60 | print(sentenceTrain[10]) 61 | 62 | print("sentenceTrain: ", sentenceTrain.shape) 63 | print("positionTrain1: ", positionTrain1.shape) 64 | print("yTrain: ", yTrain.shape) 65 | 66 | print("sentenceTest: ", sentenceTest.shape) 67 | print("positionTest1: ", positionTest1.shape) 68 | print("yTest: ", yTest.shape) 69 | 70 | print("Embeddings: ", embeddings.matrix.shape) 71 | 72 | words_input = Input(shape=(max_sentence_len,), dtype='int32', name='words_input') 73 | words = Embedding(embeddings.matrix.shape[0], embeddings.matrix.shape[1], weights=[embeddings.matrix], 74 | trainable=False)(words_input) 75 | distance1_input = Input(shape=(max_sentence_len,), dtype='int32', name='distance1_input') 76 | distance1 = Embedding(max_position, self.position_dims)(distance1_input) 77 | distance2_input = Input(shape=(max_sentence_len,), dtype='int32', name='distance2_input') 78 | distance2 = Embedding(max_position, self.position_dims)(distance2_input) 79 | output = concatenate([words, distance1, distance2], -1) 80 | output = Convolution1D(filters=self.nb_filter, 81 | kernel_size=self.filter_length, 82 | padding='same', 83 | activation='tanh', 84 | strides=1)(output) 85 | # we use standard max over time pooling 86 | output = GlobalMaxPooling1D()(output) 87 | output = Dropout(0.25)(output) 88 | output = Dense(n_out, activation='softmax')(output) 89 | # create the model 90 | model = Model(inputs=[words_input, distance1_input, distance2_input], outputs=[output]) 91 | model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy']) 92 | model.summary() 93 | 94 | print("Start training") 95 | max_prec, max_rec, max_acc, max_f1 = 0, 0, 0, 0 96 | accs = [] 97 | 98 | def predict_classes(prediction): 99 | return prediction.argmax(axis=-1) 100 | 101 | # for epoch in range(nb_epoch): 102 | model.fit([sentenceTrain, positionTrain1, positionTrain2], yTrain, batch_size=self.batchsize, verbose=True, 103 | epochs=self.epoch) 104 | pred_test = predict_classes(model.predict([sentenceTest, positionTest1, positionTest2], verbose=False)) 105 | 106 | dctLabels = np.sum(pred_test) 107 | totalDCTLabels = np.sum(yTest) 108 | 109 | acc = np.sum(pred_test == yTest) / float(len(yTest)) 110 | max_acc = max(max_acc, acc) 111 | print("Accuracy: %.4f (max: %.4f)" % (acc, max_acc)) 112 | 113 | f1Sum = 0 114 | f1Count = 0 115 | for targetLabel in range(1, max(yTest)): 116 | prec = getPrecision(pred_test, yTest, targetLabel) 117 | recall = getPrecision(yTest, pred_test, targetLabel) 118 | f1 = 0 if (prec + recall) == 0 else 2 * prec * recall / (prec + recall) 119 | f1Sum += f1 120 | f1Count += 1 121 | accs.append(max_acc) 122 | macroF1 = f1Sum / float(f1Count) 123 | max_f1 = max(max_f1, macroF1) 124 | print("Non-other Macro-Averaged F1: %.4f (max: %.4f)\n" % (macroF1, max_f1)) 125 | 126 | experiment_setup = self.__dict__ 127 | experiment_setup["embeddings"] = embeddings.metadata 128 | experiment_setup["category"] = "default" 129 | experiment_setup["dataset"] = 
os.path.basename(path_dataset) 130 | experiment_setup["method"] = 'cnn' 131 | experiment_setup['task'] = 'relation_extraction' 132 | result = {} 133 | result['experiment_setup'] = experiment_setup 134 | result['experiment_setup']['default_measurement'] = 'macroF1' 135 | 136 | result['result'] = {"macroF1": macroF1, "max_f1": max_f1, "accuracy": acc, "max_accuracy": max_acc} 137 | return result 138 | 139 | def get_result(self, embeddings, dataset): 140 | results = self.run(embeddings, dataset) 141 | return [results] 142 | --------------------------------------------------------------------------------