├── tests ├── __init__.py ├── benchmarks │ ├── __init__.py │ ├── test_outliers.py │ ├── test_misc.py │ ├── test_relation_extraction.py │ ├── test_synonymy_detection.py │ ├── test_similarity.py │ ├── test_language_modeling.py │ ├── test_sequence_labeling.py │ ├── test_text_classification.py │ ├── test_analogy.py │ └── test_categorization.py ├── data │ ├── embeddings │ │ ├── npy │ │ │ ├── metadata.json │ │ │ ├── test.vocab │ │ │ └── test.npy │ │ └── text │ │ │ ├── corrupted │ │ │ ├── metadata.json │ │ │ └── emb.txt │ │ │ ├── plain_no_file_header │ │ │ ├── metadata.json │ │ │ └── emb.txt │ │ │ └── plain_with_file_header │ │ │ ├── metadata.json │ │ │ └── emb.txt │ ├── corpora │ │ ├── jap │ │ │ ├── char2radical │ │ │ │ └── char2radical.txt │ │ │ └── tokenized │ │ │ │ └── corpus.txt │ │ ├── bzipped │ │ │ └── sense_small.txt.bz2 │ │ ├── gzipped │ │ │ └── sense_small.txt.gz │ │ ├── xzipped │ │ │ └── sense_small.txt.xz │ │ └── multiple_small │ │ │ ├── one.txt │ │ │ └── two.txt │ ├── benchmarks │ │ ├── analogy │ │ │ ├── category1 │ │ │ │ └── subcategory_a.txt │ │ │ ├── category2 │ │ │ │ └── subcategory_b.txt │ │ │ └── metadata.json │ │ ├── similarity │ │ │ ├── ws.csv │ │ │ ├── ws.json │ │ │ ├── ws │ │ │ └── ws.txt │ │ ├── categorization │ │ │ └── essli-2008-lite.csv │ │ ├── synonymy_detection │ │ │ └── test.csv │ │ ├── outliers │ │ │ └── test.csv │ │ ├── sequence_labeling │ │ │ ├── chunk │ │ │ │ ├── valid.txt │ │ │ │ ├── test.txt │ │ │ │ └── train.txt │ │ │ ├── ner │ │ │ │ ├── valid.txt │ │ │ │ ├── test.txt │ │ │ │ └── train.txt │ │ │ └── pos │ │ │ │ ├── valid.txt │ │ │ │ ├── test.txt │ │ │ │ └── train.txt │ │ ├── text_classification │ │ │ ├── test │ │ │ └── train │ │ └── relation_extraction │ │ │ ├── test.txt │ │ │ └── train.txt │ ├── vocabs │ │ ├── one_column │ │ │ └── something.vocab │ │ ├── numbers │ │ │ ├── metadata.json │ │ │ └── vocab.tsv │ │ └── plain │ │ │ ├── metadata.json │ │ │ └── vocab.tsv │ └── benchmarks_results │ │ ├── text_classification │ │ ├── vocab.json │ │ ├── best_model.npz │ │ ├── args.json │ │ └── log │ │ ├── similarity │ │ ├── 1 │ │ └── 2 │ │ └── analogy │ │ ├── 1 │ │ ├── 2 │ │ ├── 3 │ │ └── 4 ├── test_datasets.py ├── test_cli_misc.py ├── test_misc.py ├── test_format.py ├── test_config.py ├── test_embeddings.py └── test_training.py ├── docs ├── requirements.txt ├── source │ ├── reference │ │ └── index.rst │ ├── tutorial │ │ ├── images │ │ │ ├── cat.png │ │ │ ├── pear.png │ │ │ ├── contexts.png │ │ │ ├── img_tips.png │ │ │ ├── std_to_img.png │ │ │ ├── draw_features.png │ │ │ └── draw_similarity.png │ │ ├── index.rst │ │ ├── installing.rst │ │ ├── visualization.rst │ │ ├── basic.rst │ │ ├── training_vectors.rst │ │ ├── roadmap.rst │ │ └── working_with_vectors.rst │ ├── index.rst │ └── contribution.rst ├── Makefile └── make.bat ├── .readthedocs.yml ├── vecto ├── benchmarks │ ├── outliers │ │ ├── __init__.py │ │ └── __main__.py │ ├── synonymy_detection │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── synonymy_detection.py │ ├── base.py │ ├── similarity │ │ └── __init__.py │ ├── sequence_labeling │ │ └── __init__.py │ ├── categorization │ │ ├── metrics.py │ │ └── __init__.py │ ├── relation_extraction │ │ ├── __init__.py │ │ ├── preprocess.py │ │ └── relation_extraction.py │ ├── analogy │ │ ├── io.py │ │ └── __init__.py │ ├── language_modeling │ │ └── __init__.py │ ├── text_classification │ │ ├── __init__.py │ │ └── nlp_utils.py │ ├── __init__.py │ └── visualize.py ├── data │ ├── __init__.py │ ├── io.py │ └── base.py ├── embeddings │ ├── utils │ │ └── __init__.py │ ├── base.py │ 
├── legacy_w2v.py │ └── __init__.py ├── _version.py ├── __main__.py ├── corpus │ ├── __init__.py │ ├── base.py │ └── tokenization.py ├── utils │ ├── blas.py │ ├── __init__.py │ ├── tqdm_utils.py │ ├── convert.py │ ├── fetch_benchmarks.py │ ├── formathelper.py │ ├── data.py │ └── metadata.py ├── __init__.py ├── vocabulary │ ├── __init__.py │ └── __main__.py ├── config.py └── cli.py ├── MANIFEST.in ├── requirements.txt ├── test_requirements.txt ├── check_ds.py ├── setup.py ├── .appveyor.yml ├── .travis.yml ├── corpus_test.py ├── examples ├── analogy.ipynb └── most_similar.ipynb ├── .gitignore └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | requirements_file: docs/requirements.txt 2 | -------------------------------------------------------------------------------- /tests/data/embeddings/npy/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test"} 2 | -------------------------------------------------------------------------------- /vecto/benchmarks/outliers/__init__.py: -------------------------------------------------------------------------------- 1 | from .outliers import * -------------------------------------------------------------------------------- /tests/data/corpora/jap/char2radical/char2radical.txt: -------------------------------------------------------------------------------- 1 | 仲 亻中 2 | 間 門日 -------------------------------------------------------------------------------- /vecto/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Dataset, get_dataset_by_name 2 | -------------------------------------------------------------------------------- /vecto/embeddings/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import word 2 | from . 
import subword -------------------------------------------------------------------------------- /tests/data/embeddings/npy/test.vocab: -------------------------------------------------------------------------------- 1 | the 2 | apple 3 | banana 4 | fast 5 | quick -------------------------------------------------------------------------------- /vecto/_version.py: -------------------------------------------------------------------------------- 1 | """Version of vecto package.""" 2 | 3 | VERSION = "0.2.16" 4 | -------------------------------------------------------------------------------- /vecto/benchmarks/synonymy_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .synonymy_detection import * -------------------------------------------------------------------------------- /tests/data/benchmarks/analogy/category1/subcategory_a.txt: -------------------------------------------------------------------------------- 1 | apple banana 2 | fast quick -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws.csv: -------------------------------------------------------------------------------- 1 | love,sex,6.77 2 | tiger,cat,7.35 3 | tiger,tiger,10 -------------------------------------------------------------------------------- /tests/data/corpora/jap/tokenized/corpus.txt: -------------------------------------------------------------------------------- 1 | いつも 忙しい 仲間 と やっと 会え た 2 | いつも 忙しい 仲間 と やっと 会え た -------------------------------------------------------------------------------- /tests/data/vocabs/one_column/something.vocab: -------------------------------------------------------------------------------- 1 | apple 2 | banana 3 | mango 4 | potato 5 | the -------------------------------------------------------------------------------- /vecto/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import CLI 2 | 3 | if __name__ == "__main__": 4 | CLI() 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include setup_boilerplate.py 2 | include requirements.txt 3 | include test_requirements.txt 4 | -------------------------------------------------------------------------------- /docs/source/reference/index.rst: -------------------------------------------------------------------------------- 1 | API reference 2 | ============= 3 | 4 | .. 
automodule:: vecto 5 | 6 | -------------------------------------------------------------------------------- /tests/data/embeddings/text/corrupted/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test","foldername":"plain_with_file_header"} 2 | -------------------------------------------------------------------------------- /tests/data/embeddings/npy/test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/embeddings/npy/test.npy -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_no_file_header/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test","foldername":"plain_no_file_header"} 2 | -------------------------------------------------------------------------------- /docs/source/tutorial/images/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/cat.png -------------------------------------------------------------------------------- /docs/source/tutorial/images/pear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/pear.png -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_with_file_header/metadata.json: -------------------------------------------------------------------------------- 1 | {"type":"test","foldername":"plain_with_file_header"} 2 | -------------------------------------------------------------------------------- /docs/source/tutorial/images/contexts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/contexts.png -------------------------------------------------------------------------------- /docs/source/tutorial/images/img_tips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/img_tips.png -------------------------------------------------------------------------------- /docs/source/tutorial/images/std_to_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/std_to_img.png -------------------------------------------------------------------------------- /tests/data/benchmarks/analogy/category2/subcategory_b.txt: -------------------------------------------------------------------------------- 1 | apple banana 2 | fast quick 3 | apple banana_missing 4 | apple_missing banana -------------------------------------------------------------------------------- /tests/data/vocabs/numbers/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "path_source": "./test/data/corpora/numbers", 3 | "vsmlib_version": "0.1.6" 4 | } -------------------------------------------------------------------------------- /docs/source/tutorial/images/draw_features.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/draw_features.png -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/vocab.json: -------------------------------------------------------------------------------- 1 | {"the": 0, "apple": 1, "banana": 2, "fast": 3, "quick": 4, "tiger": 5, "cat": 6} -------------------------------------------------------------------------------- /tests/data/corpora/bzipped/sense_small.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/corpora/bzipped/sense_small.txt.bz2 -------------------------------------------------------------------------------- /tests/data/corpora/gzipped/sense_small.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/corpora/gzipped/sense_small.txt.gz -------------------------------------------------------------------------------- /tests/data/corpora/xzipped/sense_small.txt.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/corpora/xzipped/sense_small.txt.xz -------------------------------------------------------------------------------- /docs/source/tutorial/images/draw_similarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/docs/source/tutorial/images/draw_similarity.png -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "en", 3 | "task": "similarity", 4 | "description": "TEST FILE", 5 | "version": "-", 6 | "cite": "-" 7 | } 8 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/best_model.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vecto-ai/vecto/HEAD/tests/data/benchmarks_results/text_classification/best_model.npz -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_no_file_header/emb.txt: -------------------------------------------------------------------------------- 1 | the 0.4 0.3 0.2 0.1 2 | apple 0.1 0.2 0.3 0.4 3 | banana 0.1 0.2 0.3 0.41 4 | fast 0.1 0.1 0.1 0.1 5 | quick 0.1 0.1 0.1 0.2 -------------------------------------------------------------------------------- /tests/data/vocabs/plain/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "cnt_words": 142, 3 | "min_frequency": 10, 4 | "path_source": "./test/data/corpora/plain", 5 | "vsmlib_version": "0.1.6" 6 | } -------------------------------------------------------------------------------- /tests/data/benchmarks/categorization/essli-2008-lite.csv: -------------------------------------------------------------------------------- 1 | ,category,word 2 | 0,cats,the 3 | 1,cats,tiger 4 | 2,cats,cat 5 | 3,food,apple 6 | 4,food,banana 7 | 5,motion,walk 8 | 6,motion,fly 9 | -------------------------------------------------------------------------------- /tests/data/benchmarks/synonymy_detection/test.csv: 
-------------------------------------------------------------------------------- 1 | ,target,word,is_solution 2 | 0,tiger,cat,yes 3 | 1,tiger,run,no 4 | 2,tiger,banana,no 5 | 3,run,walk,yes 6 | 4,run,cat,no 7 | 5,run,the,no -------------------------------------------------------------------------------- /tests/data/embeddings/text/corrupted/emb.txt: -------------------------------------------------------------------------------- 1 | 7 4 2 | the 0.4 0.3 0.2 3 | apple 0.1 0.2 0.3 0.4 4 | banana 0.1 0.2 0.3 0.41 5 | fast 0.1 0.1 0.1 0.1 6 | quick 0.1 0.1 0.1 0.2 7 | tiger 0.1 0.1 0.1 0.2 8 | cat 0.1 0.1 0.1 0.2 -------------------------------------------------------------------------------- /tests/data/vocabs/numbers/vocab.tsv: -------------------------------------------------------------------------------- 1 | #word frequency 2 | one 1 3 | two 406 4 | three 345 5 | four 330 6 | five 324 7 | six 271 8 | seven 184 9 | eight 177 10 | nine 176 11 | ten 10 12 | eleven 146 13 | twelve 170 14 | -------------------------------------------------------------------------------- /tests/data/benchmarks/outliers/test.csv: -------------------------------------------------------------------------------- 1 | ,category,word,is_outlier 2 | 0,cats,cat,false 3 | 1,cats,tiger,false 4 | 2,cats,run,true 5 | 3,cats,walk,true 6 | 4,fruits,apple,false 7 | 5,fruits,banana,false 8 | 6,fruits,the,true 9 | 7,fruits,fly,true -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # requrements for vecto 2 | brewer2mpl 3 | # gitpython 4 | numpy 5 | nltk 6 | pandas 7 | progressbar2 8 | matplotlib 9 | scipy 10 | scikit-learn 11 | system-query 12 | tables 13 | traitlets 14 | tqdm 15 | requests 16 | docutils 17 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | -rrequirements.txt 2 | pip >= 21.0 3 | setuptools >= 40.0 4 | docutils 5 | pygments 6 | wheel 7 | fabric3 8 | progressbar2 9 | sklearn 10 | pyyaml 11 | coveralls 12 | coverage 13 | pylint 14 | chainer 15 | keras 16 | tensorflow -------------------------------------------------------------------------------- /tests/data/corpora/multiple_small/one.txt: -------------------------------------------------------------------------------- 1 | line0 token token 2 | line1 token token 3 | line2 token token 4 | line3 token token 5 | line4 token token 6 | line5 token token 7 | line6 token token 8 | line7 token token 9 | line8 token token 10 | line9 token token 11 | -------------------------------------------------------------------------------- /docs/source/tutorial/index.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | basic 8 | installing 9 | metadata 10 | getting_vectors 11 | training_vectors 12 | working_with_vectors 13 | visualization 14 | evaluating 15 | roadmap -------------------------------------------------------------------------------- /vecto/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | from .corpus import FileCorpus, DirCorpus, CorpusView, Corpus 2 | from .corpus import corpus_chain, load_path_as_ids, DirSlidingWindowCorpus 3 | from .tokenization import DEFAULT_JAP_TOKENIZER, DEFAULT_TOKENIZER, DEFAULT_SENT_TOKENIZER, ANNOTATED_TEXT_TOKENIZER 4 | -------------------------------------------------------------------------------- /tests/data/embeddings/text/plain_with_file_header/emb.txt: -------------------------------------------------------------------------------- 1 | 9 4 2 | the 0.4 0.3 0.2 0.1 3 | apple 0.1 0.2 0.3 0.4 4 | banana 0.1 0.2 0.3 0.41 5 | fast 0.1 0.1 0.1 0.1 6 | quick 0.1 0.1 0.1 0.2 7 | tiger 0.1 0.1 0.1 0.2 8 | cat 0.1 0.1 0.1 0.2 9 | walk 0.9 0.5 0.6 0.3 10 | fly 0.7 0.1 0.6 0.2 -------------------------------------------------------------------------------- /vecto/embeddings/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from vecto.utils.metadata import WithMetaData 3 | 4 | 5 | class WordEmbeddings(WithMetaData, metaclass=abc.ABCMeta): 6 | # TODO: define proper interface 7 | 8 | @abc.abstractmethod 9 | def get_vector(self, w): 10 | pass 11 | -------------------------------------------------------------------------------- /check_ds.py: -------------------------------------------------------------------------------- 1 | from vecto.data import Dataset 2 | path = "/mnt/storage/data/NLP/datasets/text_classification/SST-2" 3 | #path = "/home/blackbird/Projects/NLP/datasets/STSA/binary" 4 | 5 | ds = Dataset(path) 6 | 7 | print(ds) 8 | print(ds.metadata) 9 | train = ds.get_train() 10 | print(train) 11 | -------------------------------------------------------------------------------- /vecto/utils/blas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # import scipy.sparse.linalg 3 | 4 | 5 | def normed(v): 6 | return v / np.linalg.norm(v) 7 | 8 | 9 | # def normalize_sparse(m): 10 | # norm = scipy.sparse.linalg.norm(m, axis=1)[:, None] 11 | # m.data /= norm.repeat(np.diff(m.indptr)) 12 | -------------------------------------------------------------------------------- /vecto/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Helpers for various things 2 | """ 3 | 4 | import datetime 5 | 6 | 7 | def get_time_str(): 8 | """ 9 | returs current time formatted nicely 10 | """ 11 | time_now = datetime.datetime.now() 12 | str_time = time_now.strftime("%y.%m.%d_%H.%M.%S") 13 | return str_time 14 | -------------------------------------------------------------------------------- /tests/data/corpora/multiple_small/two.txt: -------------------------------------------------------------------------------- 1 | file2-0 lalala la la la 2 | file2-1 lalala la la la 3 | file2-2 lalala la la la 4 | file2-3 lalala la la la 5 | file2-4 lalala la la la 6 | file2-5 lalala la la la 7 | file2-6 lalala la la la 8 | file2-7 lalala la la la 9 | file2-8 lalala la la la 10 | file2-9 lalala la la la 11 | -------------------------------------------------------------------------------- /vecto/benchmarks/base.py: 
-------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class Benchmark(): 5 | # TODO: define proper interface 6 | 7 | @abc.abstractmethod 8 | def __init__(self): 9 | raise NotImplementedError 10 | 11 | # @abc.abstractmethod 12 | # def get_result(self, embeddings, path_dataset): 13 | # raise NotImplementedError 14 | -------------------------------------------------------------------------------- /vecto/__init__.py: -------------------------------------------------------------------------------- 1 | """vecto is a library for all things related to vector space models in NLP 2 | 3 | Submodules 4 | ========== 5 | 6 | .. autosummary:: 7 | :toctree: _autosummary 8 | 9 | embeddings 10 | corpus 11 | vocabulary 12 | benchmarks 13 | """ 14 | 15 | from ._version import VERSION 16 | 17 | 18 | __version__ = VERSION 19 | -------------------------------------------------------------------------------- /vecto/utils/tqdm_utils.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | 3 | 4 | def is_in_jupyter(): 5 | try: 6 | get_ipython 7 | return True 8 | except: 9 | return False 10 | 11 | 12 | def get_tqdm(*args, **kwargs): 13 | if is_in_jupyter(): 14 | return tqdm.tqdm_notebook(*args, **kwargs) 15 | return tqdm.tqdm(*args, **kwargs) 16 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | """Tests for datasets""" 2 | import unittest 3 | from vecto.data import Dataset 4 | 5 | 6 | class Tests(unittest.TestCase): 7 | 8 | def test_datasets(self): 9 | Dataset("./") 10 | 11 | def test_dataset(self): 12 | with self.assertRaises(FileNotFoundError): 13 | Dataset("./path/does/not/exist/") 14 | -------------------------------------------------------------------------------- /vecto/benchmarks/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | from .similarity import Similarity as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | parser.add_argument("dataset") 7 | parser.add_argument('--normalize', dest='normalize', action='store_true') 8 | parser.add_argument('--ignore_oov', dest='ignore_oov', action='store_true') 9 | -------------------------------------------------------------------------------- /vecto/vocabulary/__init__.py: -------------------------------------------------------------------------------- 1 | """The model module that implements vocabulary. 2 | 3 | .. 
autosummary:: 4 | :toctree: _autosummary 5 | 6 | """ 7 | 8 | from .vocabulary import Vocabulary 9 | from .vocabulary import create_from_path, create_ngram_tokens_from_dir, create_from_annotated_dir 10 | 11 | 12 | def load(path): 13 | v = Vocabulary() 14 | v.load(path) 15 | return v 16 | -------------------------------------------------------------------------------- /vecto/utils/convert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def main(): 3 | path = sys.argv[1] 4 | print(path) 5 | with open(path, encoding='utf-8', errors='ignore') as f_in: 6 | with open(path + ".out", "w", encoding='utf-8') as f_out: 7 | for l in f_in: 8 | label, text = l.rstrip().split(None, 1) 9 | f_out.write(f"{label}\t{text}\n") 10 | 11 | 12 | if __name__ == '__main__': 13 | main() -------------------------------------------------------------------------------- /tests/test_cli_misc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import StringIO 3 | from contextlib import redirect_stdout 4 | from .test_setup import run_module 5 | 6 | 7 | class Tests(unittest.TestCase): 8 | 9 | def test_cli(self): 10 | with self.assertRaises(SystemExit): 11 | sio = StringIO() 12 | with redirect_stdout(sio): 13 | run_module('vecto', 14 | 'WRONG_COMMAND') 15 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | """Tests for misc""" 2 | import unittest 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class Tests(unittest.TestCase): 8 | 9 | def test_import(self): 10 | logger.info("testing deprecated") 11 | import vecto 12 | 13 | def test_utils(self): 14 | from vecto.utils.data import jsonify 15 | data = {"test": 1, "dict": {"i": 2}} 16 | res = jsonify(data) 17 | self.assertIsInstance(res, dict) 18 | -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.5 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | -------------------------------------------------------------------------------- /tests/data/benchmarks/similarity/ws.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.5 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | -------------------------------------------------------------------------------- /vecto/benchmarks/sequence_labeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_labeling 
import Sequence_labeling as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | parser.add_argument("dataset") 7 | parser.add_argument("--window_size", default=5, type=int) 8 | parser.add_argument("--method", default='lr', choices=['lr', '2FFNN'], 9 | help='name of method') 10 | parser.add_argument('--normalize', dest='normalize', action='store_true') 11 | -------------------------------------------------------------------------------- /vecto/utils/fetch_benchmarks.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | from git.exc import GitCommandError 3 | from os import path 4 | 5 | path_to_repo = 'https://github.com/vecto-ai/benchmarks.git' 6 | 7 | 8 | def fetch_benchmarks(path_to_local_dir=path.join('data', 'benchmarks')): 9 | try: 10 | Repo.clone_from('https://github.com/vecto-ai/benchmarks.git', path_to_local_dir) 11 | except GitCommandError: 12 | raise ValueError('Directory exists') 13 | 14 | if __name__ == "__main__": 15 | fetch_benchmarks() 16 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/args.json: -------------------------------------------------------------------------------- 1 | {"current_datetime": "2018-05-04 11:39:50.824318", "batchsize": 64, "epoch": 5, "gpu": -1, "layer": 1, "dropout": 0, "model": "bow", "char_based": false, "out": "./tests/data/benchmarks_results/text_classification/", "unit": 4, "dataset": "./tests/data/benchmarks/text_classification/", "vocab_path": "./tests/data/benchmarks_results/text_classification/vocab.json", "model_path": "./tests/data/benchmarks_results/text_classification/best_model.npz", "n_class": 2, "datetime": "2018-05-04 11:39:50.824318"} -------------------------------------------------------------------------------- /tests/test_format.py: -------------------------------------------------------------------------------- 1 | """Tests for format module.""" 2 | 3 | import unittest 4 | from vecto.utils.formathelper import sizeof_fmt, countof_fmt 5 | 6 | 7 | class Tests(unittest.TestCase): 8 | 9 | def test_sizeof(self): 10 | val = 12345667 11 | print("sizeof:", sizeof_fmt(val)) 12 | val = 10.0 ** 32 13 | print("sizeof:", sizeof_fmt(val)) 14 | 15 | def test_countof(self): 16 | val = 12345667 17 | print("countof:", countof_fmt(val)) 18 | val = 10.0 ** 32 19 | print("countof:", countof_fmt(val)) 20 | -------------------------------------------------------------------------------- /vecto/config.py: -------------------------------------------------------------------------------- 1 | """Configuration support for vecto 2 | 3 | Config files are expected to be found in the .vecto folder in user's home. 
4 | The format is the same as jupyter notebooks 5 | """ 6 | 7 | from traitlets.config.loader import load_pyconfig_files 8 | import os.path 9 | 10 | 11 | def load_config(): 12 | default_dir = os.path.expanduser("~/.vecto/") 13 | if os.path.isfile(os.path.join(default_dir, 'config.py')): 14 | c = load_pyconfig_files(['config.py'], default_dir) 15 | return c 16 | else: 17 | # TODO: create default config 18 | raise RuntimeError('configuration file not found, please create one in ~/.vecto/config.py') 19 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from vecto.config import load_config 4 | 5 | 6 | class Tests(unittest.TestCase): 7 | 8 | @unittest.skipUnless(os.environ.get('CI'), 'skipping as local config likely exists') 9 | def test_file_corpus(self): 10 | default_dir = os.path.expanduser("~/.vecto/") 11 | os.makedirs(default_dir, exist_ok=True) 12 | path_config = os.path.join(default_dir, 'config.py') 13 | with self.assertRaises(RuntimeError): 14 | load_config() 15 | if not os.path.isfile(path_config): 16 | with open(path_config, "w") as f: 17 | f.write("test=1") 18 | load_config() 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = vecto 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /tests/data/benchmarks_results/similarity/2: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "default", 6 | "cnt_found_pairs_total": 0, 7 | "cnt_pairs_total": 20, 8 | "dataset": "ws", 9 | "embeddings": { 10 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 11 | "foldername": "plain_no_file_header", 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "spearman", 16 | "method": "cosine_distance", 17 | "task": "word_similarity", 18 | "timestamp": "2018-05-03T00:16:05.143078" 19 | }, 20 | "result": -1 21 | } 22 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/similarity/1: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "default", 6 | "cnt_found_pairs_total": 2, 7 | "cnt_pairs_total": 20, 8 | "dataset": "ws", 9 | "embeddings": { 10 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 11 | "foldername": "plain_with_file_header", 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "spearman", 16 | "method": "cosine_distance", 17 | "task": "word_similarity", 18 | "timestamp": "2018-05-03T00:16:05.141905" 19 | }, 20 | "result": -1 21 | } 22 | ] -------------------------------------------------------------------------------- /docs/source/tutorial/installing.rst: -------------------------------------------------------------------------------- 1 | Installing Vecto 2 | ================= 3 | 4 | .. currentmodule:: vecto 5 | 6 | 7 | System requirements 8 | ------------------- 9 | 10 | - Python 3.5 or later 11 | 12 | Method 1: Pip-install 13 | --------------------- 14 | 15 | The latest stable version: 16 | 17 | 18 | >>> pip3 install vecto 19 | 20 | The latest development version: 21 | 22 | >>> pip3 install git+https://github.com/vecto-ai/vecto.git 23 | 24 | 25 | Method 2: Clone or download the github repo 26 | ------------------------------------------- 27 | 28 | You can avoid installing vecto system-wide. Simply download and unpack the github repo into your project's working directory. 29 | 30 | Either way, you can access vecto's modules by issuing 31 | 32 | >>> import vecto 33 | 34 | at the beginning of your code.
35 | -------------------------------------------------------------------------------- /vecto/utils/formathelper.py: -------------------------------------------------------------------------------- 1 | class bcolors: 2 | HEADER = '\033[95m' 3 | OKBLUE = '\033[94m' 4 | OKGREEN = '\033[92m' 5 | WARNING = '\033[93m' 6 | FAIL = '\033[91m' 7 | ENDC = '\033[0m' 8 | BOLD = '\033[1m' 9 | UNDERLINE = '\033[4m' 10 | 11 | 12 | def sizeof_fmt(num, suffix='B'): 13 | for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: 14 | if abs(num) < 1024.0: 15 | return "%3.1f%s%s" % (num, unit, suffix) 16 | num /= 1024.0 17 | return "%.1f%s%s" % (num, 'Yi', suffix) 18 | 19 | 20 | def countof_fmt(num, suffix=''): 21 | for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: 22 | if abs(num) < 1000.0: 23 | return "%3.1f%s%s" % (num, unit, suffix) 24 | num /= 1000.0 25 | return "%.1f%s%s" % (num, 'Y', suffix) 26 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | vecto - Python library for vector space models 3 | ============================================== 4 | 5 | Vecto is an open-source Python library for working with vector space models (VSMs), 6 | including various word embeddings such as word2vec. Vecto can load various popular 7 | formats of VSMs and perform a set of basic operations like dimensionality reduction, search for nearest neighbors etc. It includes a growing 8 | list of benchmarks with which VSMs are evaluated in most current research, and a few visualization tools. 9 | It also includes a growing list of modules for creating VSMs, both explicit and based on neural networks. 10 | 11 | ..
toctree:: 12 | :maxdepth: 1 13 | 14 | tutorial/index 15 | reference/index 16 | contribution 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | -------------------------------------------------------------------------------- /vecto/benchmarks/categorization/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from sklearn.metrics import adjusted_rand_score, v_measure_score, homogeneity_score, completeness_score, \ 3 | mutual_info_score, fowlkes_mallows_score, silhouette_score, calinski_harabasz_score 4 | import numpy as np 5 | 6 | 7 | def purity_score(y_true, y_pred): 8 | y_voted_labels = np.zeros(y_true.shape) 9 | labels = np.unique(y_true) 10 | ordered_labels = np.arange(labels.shape[0]) 11 | for k in range(labels.shape[0]): 12 | y_true[y_true == labels[k]] = ordered_labels[k] 13 | labels = np.unique(y_true) 14 | bins = np.concatenate((labels, [np.max(labels) + 1]), axis=0) 15 | for cluster in np.unique(y_pred): 16 | hist, _ = np.histogram(y_true[y_pred == cluster], bins=bins) 17 | winner = np.argmax(hist) 18 | y_voted_labels[y_pred == cluster] = winner 19 | return accuracy_score(y_true, y_voted_labels) 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=vecto 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup script for vecto package.""" 2 | 3 | import setup_boilerplate 4 | 5 | 6 | class Package(setup_boilerplate.Package): 7 | 8 | """Package metadata.""" 9 | 10 | name = 'vecto' 11 | description = 'toolbox for various tasks in the area of vector space models of computational linguistics' 12 | url = "http://vecto.space" 13 | classifiers = [ 14 | 'Development Status :: 3 - Alpha', 15 | 'Environment :: Console', 16 | 'Intended Audience :: Science/Research', 17 | 'License :: OSI Approved :: Apache Software License', 18 | 'Natural Language :: English', 19 | 'Operating System :: POSIX', 20 | 'Programming Language :: Python :: 3.5', 21 | 'Programming Language :: Python :: 3.6', 22 | 'Programming Language :: Python :: 3 :: Only', 23 | 'Topic :: Text Processing :: Linguistic'] 24 | keywords = ['NLP', 'linguistics', 'language'] 25 | 26 | 27 | if __name__ == '__main__': 28 | Package.setup() 29 | -------------------------------------------------------------------------------- /vecto/benchmarks/relation_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | from .relation_extraction import Relation_extraction as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | parser.add_argument("dataset") 7 | 8 | parser.add_argument('--batchsize', '-b', type=int, default=64, 9 | help='Number of images in each mini-batch') 10 | parser.add_argument('--epoch', '-e', type=int, default=1, 11 | help='Number of sweeps over the dataset to train') 12 | parser.add_argument('--nb_filter', '-nf', type=int, default=100, 13 | help='filter number') 14 | parser.add_argument('--filter_length', '-fl', type=int, default=3, 15 | help='filter length') 16 | parser.add_argument('--hidden_dims', '-hd', type=int, default=100, 17 | help='D') 18 | parser.add_argument('--position_dims', '-pd', type=int, default=100, 19 | help='D') 20 | -------------------------------------------------------------------------------- /docs/source/contribution.rst: -------------------------------------------------------------------------------- 1 | .. _contrib: 2 | 3 | Contribution Guide 4 | ================== 5 | 6 | This is a guide for all contributions to vecto. 7 | The development of vecto is happening on `the official repository at GitHub `_. 8 | 9 | Some quick notes: 10 | ----------------- 11 | 12 | Please send pull requests to the ``dev`` branch. 13 | 14 | Pull requests must not lower test coverage score. 15 | 16 | If you send a pull request, please make sure your code is pep8-compliant. 17 | 18 | If you want to raise an issue, please first do a quick search to see if it has already been reported. If so, it's often better to just leave a comment on an existing issue, rather than creating a new one. 19 | 20 | Issues are for bug reports, feature requests etc. For usage-related questions please consult the tutorial; if something is not covered, raise an issue, and we will update the tutorial. 21 | 22 | If there's an issue you would like to fix - this is very welcome, please get in touch.
23 | -------------------------------------------------------------------------------- /vecto/benchmarks/analogy/io.py: -------------------------------------------------------------------------------- 1 | def get_pairs(fname): 2 | pairs = [] 3 | with open(fname) as file_in: 4 | id_line = 0 5 | for line in file_in: 6 | if line.strip() == '': 7 | continue 8 | try: 9 | id_line += 1 10 | if "\t" in line: 11 | parts = line.lower().split("\t") 12 | else: 13 | parts = line.lower().split() 14 | left = parts[0] 15 | right = parts[1] 16 | right = right.strip() 17 | if "/" in right: 18 | right = [i.strip() for i in right.split("/")] 19 | else: 20 | right = [i.strip() for i in right.split(",")] 21 | pairs.append([left, right]) 22 | except: 23 | print("error reading pairs") 24 | print("in file", fname) 25 | print("in line", id_line, line) 26 | exit(-1) 27 | return pairs 28 | -------------------------------------------------------------------------------- /tests/data/benchmarks/analogy/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "class": "dataset", 3 | "task": "analogy", 4 | "language": ["english"], 5 | "name": "dummy_analogy", 6 | "description": "Test Analogy Set", 7 | "domain": "general", 8 | "date": "2016", 9 | "source": "original", 10 | "project_page": "http://vecto.space/", 11 | "version": "3.0", 12 | "size": "small", 13 | "cite": 14 | { 15 | "title": "Analogy-based detection of morphological and semantic relations with word embeddings: what works and what doesn't", 16 | "author": "Gladkova, Anna and Drozd, Aleksandr and Matsuoka, Satoshi", 17 | "doi": "10.18653/v1/N16-2002", 18 | "url": "https://www.aclweb.org/anthology/N/N16/N16-2002.pdf", 19 | "booktitle": "Proceedings of the NAACL-HLT SRW", 20 | "publisher": "ACL", 21 | "year": 2016, 22 | "pages": "47-54", 23 | "type": "inproceedings", 24 | "id":"GladkovaDrozdEtAl_2016" 25 | } 26 | } -------------------------------------------------------------------------------- /vecto/benchmarks/analogy/__init__.py: -------------------------------------------------------------------------------- 1 | """Benchmark on word analogy 2 | 3 | .. autosummary:: 4 | :toctree: _autosummary 5 | 6 | analogy 7 | """ 8 | 9 | # import logging 10 | from .analogy import Analogy as Benchmark 11 | import numpy as np 12 | 13 | # logging.basicConfig(level=logging.DEBUG) 14 | 15 | 16 | def add_extra_args(parser): 17 | parser.add_argument("embeddings") 18 | parser.add_argument("dataset") 19 | parser.add_argument("--method", 20 | help="analogy solving method", 21 | default="LRCos") 22 | 23 | 24 | # TODO: move this to proper location, reuse between senchmarks 25 | def get_mean_reciprocal_rank(results): 26 | mean_reciprocal_rank=np.mean([(lambda r : 0 if r<=0 else 1/r) (experiment["rank"]) for category in results for experiment in category["details"] ]) 27 | return mean_reciprocal_rank 28 | 29 | 30 | def get_mean_accuracy(results): 31 | mean_accuracy=np.mean([experiment["rank"]==0 for category in results for experiment in category["details"] ]) 32 | return mean_accuracy 33 | -------------------------------------------------------------------------------- /vecto/corpus/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from vecto.utils.metadata import WithMetaData 4 | from vecto.utils.tqdm_utils import get_tqdm 5 | 6 | 7 | class BaseIterator(WithMetaData): 8 | """ 9 | Base class for all corpora and iterators. 
10 | Responsible for base logic like metadata collection, __len__, 11 | iteration, tqdm progressbar etc. 12 | """ 13 | 14 | def __init__(self, verbose=False, **metadata_kwargs): 15 | super(BaseIterator, self).__init__(**metadata_kwargs) 16 | self._verbose = verbose 17 | 18 | def __iter__(self): 19 | for elem in self._generate_samples_outer(): 20 | yield elem 21 | 22 | def __len__(self): 23 | return self.metadata.get('samples_count', 0) 24 | 25 | def _generate_samples_outer(self): 26 | gen = self._generate_samples() 27 | if self._verbose > 0: 28 | cur_len = len(self) 29 | if cur_len is None: 30 | return get_tqdm(gen) 31 | else: 32 | return get_tqdm(gen, total=cur_len) 33 | return gen 34 | -------------------------------------------------------------------------------- /vecto/benchmarks/language_modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_modeling import Language_modeling as Benchmark 2 | 3 | 4 | def add_extra_args(parser): 5 | parser.add_argument("embeddings") 6 | # parser.add_argument("dataset", default="ptb") 7 | parser.add_argument("--window_size", default=5, type=int) 8 | parser.add_argument("--test", default=True, 9 | help='use small test dataset') 10 | parser.add_argument("--method", 11 | default='lstm', 12 | choices=['lr', '2FFNN', 'lstm'], 13 | help='name of method') 14 | parser.add_argument('--normalize', dest='normalize', action='store_true') 15 | 16 | # args = parser.parse_args(extra_args) 17 | # TODO: add warning that other datasets not supported 18 | #args.dataset = "ptb" 19 | #language_modeling = Language_modeling(normalize=args.normalize, 20 | # window_size=args.window_size, 21 | # method=args.method, 22 | # test=args.test) 23 | #language_modeling.run_with_args(args) 24 | -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | global: 3 | CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd" 4 | 5 | matrix: 6 | - ARCHITECTURE: "x64" 7 | PYTHON_VERSION: "3.7" 8 | PYTHON: "C:\\Python37-x64" 9 | 10 | init: 11 | - set PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH% 12 | 13 | install: 14 | - "python --version" 15 | - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" 16 | 17 | - "python -m pip install --upgrade pip" 18 | # - "python -m pip install --upgrade setuptools" 19 | # - "python -m pip install --upgrade cython" 20 | - "python -m pip install -r test_requirements.txt" 21 | 22 | build_script: 23 | - "python setup.py build" 24 | 25 | test_script: 26 | # - "%CMD_IN_ENV% python -m unittest discover" 27 | - "python -m coverage run --branch --source . -m unittest discover --verbose" 28 | # - "%CMD_IN_ENV% python -m coverage run --source . 
setup.py test" 29 | 30 | after_test: 31 | # - "python setup.py bdist_wheel" 32 | # - "%CMD_IN_ENV% python setup.py bdist_wininst" 33 | # - "%CMD_IN_ENV% python setup.py bdist_msi" 34 | - ps: "ls" 35 | 36 | notifications: 37 | - provider: Webhook 38 | url: https://webhooks.gitter.im/e/25b43ed5bc5e1d3a0772 39 | on_build_success: true 40 | on_build_failure: true 41 | on_build_status_changed: true 42 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: focal 2 | 3 | language: python 4 | 5 | sudo: false 6 | 7 | python: 8 | - '3.8' 9 | 10 | os: 11 | - linux 12 | 13 | install: 14 | - pip install -U coveralls 15 | - pip install -U coverage 16 | - pip install -U pylint 17 | - pip install -U -r test_requirements.txt 18 | 19 | # before_script: # configure a headless display to test plot generation 20 | # - "export DISPLAY=:99.0" 21 | # - "sh -e /etc/init.d/xvfb start" 22 | # - sleep 1 # give xvfb some time to start 23 | 24 | script: 25 | - python -m coverage run --source . -m unittest discover --verbose 26 | # - python -m coverage run --source . setup.py test 27 | 28 | after_success: 29 | # - python -m pylint --load-plugins=pylint.extensions.mccabe --docstring-min-length 5 --no-docstring-rgx "^(test)?_|.*Tests$" --unsafe-load-any-extension y --output-format colorized --reports y $(find . -name "*.py") 30 | - python -m coverage report --show-missing 31 | - coveralls 32 | 33 | notifications: 34 | webhooks: 35 | urls: 36 | - https://webhooks.gitter.im/e/a75d423f7dff38862a1a 37 | on_success: always # options: [always|never|change] default: always 38 | on_failure: always # options: [always|never|change] default: always 39 | on_start: never # options: [always|never|change] default: always 40 | 41 | email: false 42 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/text_classification/log: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "main/loss": 0.7636308670043945, 4 | "main/accuracy": 0.46875, 5 | "validation/main/loss": 0.7174736261367798, 6 | "validation/main/accuracy": 0.5333333611488342, 7 | "epoch": 1, 8 | "iteration": 1, 9 | "elapsed_time": 0.00541385097312741 10 | }, 11 | { 12 | "main/loss": 0.7468587160110474, 13 | "main/accuracy": 0.484375, 14 | "validation/main/loss": 0.716312050819397, 15 | "validation/main/accuracy": 0.5333333611488342, 16 | "epoch": 2, 17 | "iteration": 2, 18 | "elapsed_time": 0.012854741973569617 19 | }, 20 | { 21 | "main/loss": 0.7709426283836365, 22 | "main/accuracy": 0.4545454680919647, 23 | "validation/main/loss": 0.7152009010314941, 24 | "validation/main/accuracy": 0.5333333611488342, 25 | "epoch": 3, 26 | "iteration": 3, 27 | "elapsed_time": 0.020706564973806962 28 | }, 29 | { 30 | "main/loss": 0.7399059534072876, 31 | "main/accuracy": 0.5348837375640869, 32 | "validation/main/loss": 0.7141532897949219, 33 | "validation/main/accuracy": 0.5333333611488342, 34 | "epoch": 4, 35 | "iteration": 4, 36 | "elapsed_time": 0.026553130999673158 37 | } 38 | ] -------------------------------------------------------------------------------- /vecto/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import vecto 3 | 4 | 5 | class CLI(object): 6 | 7 | def __init__(self): 8 | parser = argparse.ArgumentParser( 9 | prog="vecto", 10 | description='vecto commad line interface', 11 | 
add_help=True, 12 | epilog="\n", 13 | usage='''vecto [], 14 | 15 | The most commonly used vecto commands are: 16 | benchmark Run benchmarks 17 | create_vocab Create vocabulary from a folder 18 | ''') 19 | 20 | parser.add_argument('--version', action='version', 21 | version=f'Vecto version {vecto.__version__}') 22 | parser.add_argument('command', help='Subcommand to run') 23 | args, self.unknownargs = parser.parse_known_args() 24 | if not hasattr(self, args.command): 25 | print('Unrecognized command') 26 | parser.print_help() 27 | exit(1) 28 | # use dispatch pattern to invoke method with same name 29 | getattr(self, args.command)() 30 | 31 | def benchmark(self): 32 | from vecto.benchmarks import run_benchmarks_cli 33 | run_benchmarks_cli(self.unknownargs) 34 | 35 | def create_vocab(self): 36 | print("CLI for vocabulary routines not implemented yet") 37 | 38 | 39 | def main(): 40 | CLI() 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/chunk/valid.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/ner/valid.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . 
O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/pos/valid.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . 
O O 69 | -------------------------------------------------------------------------------- /corpus_test.py: -------------------------------------------------------------------------------- 1 | from vecto.corpus import ViewCorpus 2 | 3 | path = "./tests/data/corpora/multiple_small" 4 | corpus = ViewCorpus(path) 5 | corpus.load_dir_strucute() 6 | print("three is ", corpus.tree) 7 | 8 | # TODO: move all this to unittests 9 | print("testing get offset") 10 | for q in [9, 11]: 11 | for start in [True, False]: 12 | print("search ", q, " with start=", start) 13 | pos, offset = corpus.get_file_and_offset(q, start_of_range=start, epsilon=2) 14 | print("pos", pos, ", offset", offset, "\n") 15 | pos, offset = corpus.get_file_and_offset(15, start_of_range=False, epsilon=2) 16 | print("pos", pos, ", offset", offset, "\n") 17 | 18 | 19 | print("testing get get_line_iterator") 20 | cnt_workers = 2 21 | for i in range(cnt_workers): 22 | iterator = corpus.get_line_iterator(i, cnt_workers) 23 | print("worker", i, iterator) 24 | for line in iterator: 25 | print("line", line) 26 | 27 | # rank 0 creates corpus from dir 28 | # corpus has inside all file list and sizes 29 | # use manually splits sends metadata of corpus : tree of dirs and files with uncompressed sizes to all workers 30 | # otehr workers create corpora from that metadata using special service method like __from_metadata 31 | # to avoid exessive file IO 32 | 33 | # for time being - everybody just reads from FS 34 | 35 | # # view = corpus.view(start_percent, end_pecent) 36 | # print(corpus) 37 | # iter_token = corpus.get_line_iterator() 38 | # for s in iter_token: 39 | # print(s) 40 | 41 | -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/1: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_no_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LRCos", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.608651" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_no_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LRCos", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.616280" 41 | }, 42 | "result": -1 43 | } 44 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/4: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_with_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": 
"0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LRCos", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.590643" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_with_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LRCos", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.598640" 41 | }, 42 | "result": -1 43 | } 44 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/3: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_no_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LinearOffset", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.620866" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_no_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LinearOffset", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.621908" 41 | }, 42 | "result": -1 43 | } 44 | ] -------------------------------------------------------------------------------- /tests/data/benchmarks_results/analogy/2: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": [], 4 | "experiment_setup": { 5 | "category": "analogy", 6 | "cnt_questions_correct": 0, 7 | "cnt_questions_total": 0, 8 | "embeddings": { 9 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 10 | "foldername": "plain_with_file_header", 11 | "normalized": true, 12 | "type": "test", 13 | "vecto_version": "0.1.2" 14 | }, 15 | "measurement": "accuracy", 16 | "method": "LinearOffset", 17 | "subcategory": "I01 [noun - plural_reg].txt", 18 | "task": "word_analogy", 19 | "timestamp": "2018-05-03T00:17:17.618278" 20 | }, 21 | "result": -1 22 | }, 23 | { 24 | "details": [], 25 | "experiment_setup": { 26 | "category": "analogy", 27 | "cnt_questions_correct": 0, 28 | "cnt_questions_total": 0, 29 | "embeddings": { 30 | "_class": "vecto.embeddings.dense.WordEmbeddingsDense", 31 | "foldername": "plain_with_file_header", 32 | "normalized": true, 33 | "type": "test", 34 | "vecto_version": "0.1.2" 35 | }, 36 | "measurement": "accuracy", 37 | "method": "LinearOffset", 38 | "subcategory": "I02 [noun - plural_irreg].txt", 39 | "task": "word_analogy", 40 | "timestamp": "2018-05-03T00:17:17.619183" 41 | }, 42 | "result": -1 43 | } 44 | ] 
-------------------------------------------------------------------------------- /vecto/benchmarks/outliers/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | 5 | from vecto.utils.data import save_json 6 | from vecto.benchmarks.outliers import * 7 | from vecto.embeddings import load_from_dir 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | 11 | 12 | def print_json(data): 13 | print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) 14 | 15 | 16 | def select_method(key): 17 | options = {} 18 | if key == 'AveragePairwiseCosine': 19 | method = AveragePairwiseCosine(options) 20 | else: 21 | raise RuntimeError('The method name was not recognized.') 22 | return method 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('embeddings') 28 | parser.add_argument('dataset') 29 | parser.add_argument('--method', help='Outlier detection method', default='AveragePairwiseCosine') 30 | parser.add_argument('--path_out', help='Destination folder to save the results') 31 | args = parser.parse_args() 32 | embeddings = load_from_dir(args.embeddings) 33 | benchmark = select_method(args.method) 34 | results = benchmark.get_result(embeddings, args.dataset) 35 | if args.path_out: 36 | if path.isdir(args.path_out) or args.path_out.endswith('/'): 37 | dataset = path.basename(path.normpath(args.dataset)) 38 | name_file_out = path.join(args.path_out, dataset, args.method, 'results.json') 39 | save_json(results, name_file_out) 40 | else: 41 | save_json(results, args.path_out) 42 | else: 43 | print_json(results) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /vecto/benchmarks/synonymy_detection/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | 5 | from vecto.utils.data import save_json 6 | from vecto.benchmarks.synonymy_detection import * 7 | from vecto.embeddings import load_from_dir 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | 11 | 12 | def print_json(data): 13 | print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) 14 | 15 | 16 | def select_method(key): 17 | options = {} 18 | if key == 'CosineDistance': 19 | method = CosineDistance(options) 20 | else: 21 | raise RuntimeError('The method name was not recognized.') 22 | return method 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('embeddings') 28 | parser.add_argument('dataset') 29 | parser.add_argument('--method', help='Synonymy detection method', default='CosineDistance') 30 | parser.add_argument('--path_out', help='Destination folder to save the results') 31 | args = parser.parse_args() 32 | embeddings = load_from_dir(args.embeddings) 33 | benchmark = select_method(args.method) 34 | results = benchmark.get_result(embeddings, args.dataset) 35 | if args.path_out: 36 | if path.isdir(args.path_out) or args.path_out.endswith('/'): 37 | dataset = path.basename(path.normpath(args.dataset)) 38 | name_file_out = path.join(args.path_out, dataset, args.method, 'results.json') 39 | save_json(results, name_file_out) 40 | else: 41 | save_json(results, args.path_out) 42 | else: 43 | print_json(results) 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /vecto/utils/data.py: 
-------------------------------------------------------------------------------- 1 | import bz2 2 | import gzip 3 | import json 4 | import lzma 5 | import os 6 | 7 | 8 | def detect_archive_format_and_open(path): 9 | if path.endswith(".xz"): 10 | return lzma.open(path, mode="rt", encoding="utf-8", errors="replace") 11 | if path.endswith(".bz2"): 12 | return bz2.open(path, mode="rt", encoding="utf-8", errors="replace") 13 | if path.endswith(".gz"): 14 | return gzip.open(path, mode="rt", encoding="utf-8", errors="replace") 15 | return open(path, encoding="utf8", errors="replace") 16 | 17 | 18 | def get_uncompressed_size(path): 19 | with detect_archive_format_and_open(path) as f: 20 | size = f.seek(0, 2) 21 | return size 22 | 23 | 24 | def print_json(data): 25 | print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) 26 | 27 | 28 | def save_json(data, path): 29 | basedir = os.path.dirname(path) 30 | os.makedirs(basedir, exist_ok=True) 31 | str_data = json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False) 32 | file_out = open(path, "w") 33 | file_out.write(str_data) 34 | file_out.close() 35 | 36 | 37 | def load_json(path): 38 | f = open(path) 39 | s_data = f.read() 40 | data = json.loads(s_data) 41 | f.close() 42 | return data 43 | 44 | 45 | def jsonify(data): 46 | if isinstance(data, list): 47 | return [jsonify(item) for item in data] 48 | if isinstance(data, dict): 49 | return {jsonify(key): jsonify(value) for key, value in data.items()} 50 | if isinstance(data, int): 51 | return str(data) 52 | if type(data).__module__ == "numpy": 53 | return data.tolist() 54 | return str(data) 55 | -------------------------------------------------------------------------------- /tests/benchmarks/test_outliers.py: -------------------------------------------------------------------------------- 1 | """Tests for outliers benchmark.""" 2 | 3 | import unittest 4 | from io import StringIO 5 | from contextlib import redirect_stdout 6 | from vecto.benchmarks.outliers import * 7 | from vecto.embeddings import load_from_dir 8 | from ..test_setup import run_module 9 | 10 | path_outliers_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'outliers') 11 | 12 | 13 | class Tests(unittest.TestCase): 14 | @classmethod 15 | def test_outliers(self): 16 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 17 | outliers = AveragePairwiseCosine() 18 | outliers.get_result(embs, path_outliers_dataset) 19 | 20 | @classmethod 21 | def test_cli(self): 22 | sio = StringIO() 23 | with redirect_stdout(sio): 24 | run_module('vecto.benchmarks.outliers', 25 | './tests/data/embeddings/text/plain_with_file_header/', 26 | './tests/data/benchmarks/outliers/', 27 | '--path_out', '/tmp/vecto/benchmarks', '--method', 'AveragePairwiseCosine') 28 | 29 | def test_outliers_results(self): 30 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 31 | outliers = AveragePairwiseCosine() 32 | result = outliers.get_result(embs, path_outliers_dataset)['test'] 33 | amount_of_categories = 2 34 | # TODO: refactor to be understandable, check if ok after covab to UNK 35 | amount_of_word_in_cats = 4 36 | 37 | self.assertEqual(len(result.keys()), amount_of_categories) 38 | self.assertEqual(len(result['cats']), amount_of_word_in_cats) -------------------------------------------------------------------------------- /examples/analogy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from vecto.benchmarks.analogy import Analogy\n", 10 | "import vecto.embeddings" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 6, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "embeddings = vecto.embeddings.load_from_dir(\"/storage/data/NLP/embeddings/6b.wiki_giga\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "analogy = Analogy()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "analogy.get_result()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.6.3" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /vecto/embeddings/legacy_w2v.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from vecto.vocabulary import Vocabulary 4 | from .dense import WordEmbeddingsDense 5 | 6 | 7 | class ModelW2V(WordEmbeddingsDense): 8 | """extends dense embeddings to support loading 9 | of original binary format from Mikolov's w2v""" 10 | 11 | @staticmethod 12 | def _load_word(file): 13 | result = b'' 14 | w = b'' 15 | while w != b' ': 16 | w = file.read(1) 17 | result = result + w 18 | return result[:-1] 19 | 20 | def load_from_file(self, filename): 21 | self.vocabulary = Vocabulary() 22 | f = open(filename, "rb") 23 | header = f.readline().split() 24 | cnt_rows = int(header[0]) 25 | size_row = int(header[1]) 26 | # self.name += "_{}".format(size_row) 27 | self.matrix = np.zeros((cnt_rows, size_row), dtype=np.float32) 28 | # logger.debug("cnt rows = {}, size row = {}".format(cnt_rows, size_row)) 29 | for i in range(cnt_rows): 30 | word = ModelW2V._load_word(f).decode( 31 | 'UTF-8', errors="ignore").strip() 32 | self.vocabulary.dic_words_ids[word] = i 33 | self.vocabulary.lst_words.append(word) 34 | s_row = f.read(size_row * 4) 35 | row = np.fromstring(s_row, dtype=np.float32) 36 | # row = row / np.linalg.norm(row) 37 | self.matrix[i] = row 38 | f.close() 39 | 40 | def load_from_dir(self, path): 41 | # self.name += "w2v_" + os.path.basename(os.path.normpath(path)) 42 | filename = [file for file in os.listdir(path) if file.endswith("bin")][0] 43 | self.load_from_file(os.path.join(path, filename)) 44 | # self.load_from_file(os.path.join(path, "vectors.bin")) 45 | # self.load_provenance(path) 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project-specific 2 | 3 | _autosummary 4 | 5 | 6 | # Byte-compiled / optimized / DLL files 7 | 
__pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # idea 112 | .idea/ 113 | -------------------------------------------------------------------------------- /tests/benchmarks/test_misc.py: -------------------------------------------------------------------------------- 1 | """Tests for embeddings module.""" 2 | 3 | import unittest 4 | import io 5 | import contextlib 6 | from tests.test_setup import run_module 7 | import vecto 8 | import vecto.benchmarks 9 | import vecto.benchmarks.base 10 | from os import path 11 | 12 | # from shutil import rmtree 13 | 14 | path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity') 15 | path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling') 16 | path_language_modeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'language_modeling') 17 | 18 | 19 | class Tests(unittest.TestCase): 20 | 21 | # def test_fetcher(self): 22 | # if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')): 23 | # return 24 | # fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test')) 25 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 26 | # similarity = Similarity() 27 | # path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en') 28 | # similarity.get_result(embs, path_similarity_dataset) 29 | 30 | def test_abc(self): 31 | with self.assertRaises(NotImplementedError): 32 | vecto.benchmarks.base.Benchmark() 33 | # base.get_result(1, 2) 34 | 35 | def test_cli(self): 36 | with self.assertRaises(SystemExit): 37 | sio = io.StringIO() 38 | with contextlib.redirect_stdout(sio): 39 | run_module("vecto", 40 | "benchmark", 41 | "WRONG_NAME", 42 | "path_embs") 43 | 44 | sio = io.StringIO() 45 | with contextlib.redirect_stdout(sio): 46 | run_module("vecto", 47 | "benchmark", 48 | "help") 49 | 
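The test modules in this listing import a ``run_module`` helper from ``tests/test_setup.py``, which is not included in this dump. As a hedged sketch (an assumption about its behaviour, not the project's actual implementation), an equivalent helper only needs to emulate ``python -m <module> <args...>`` in-process so that ``redirect_stdout`` can capture the output:

import runpy
import sys

def run_module(name, *args):
    # Run `name` as if invoked with `python -m`, passing `args` via argv.
    argv_backup = sys.argv
    sys.argv = [name] + list(args)
    try:
        runpy.run_module(name, run_name="__main__")
    finally:
        sys.argv = argv_backup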
-------------------------------------------------------------------------------- /vecto/benchmarks/text_classification/__init__.py: -------------------------------------------------------------------------------- 1 | """Text classification benchmark. 2 | 3 | One of the pre-defined models is trained to convergence 4 | to predict labels for text fragments in a provided dataset. 5 | Sentiment analysis is an example of text classification task. 6 | 7 | .. autosummary:: 8 | :toctree: _autosummary 9 | 10 | text_classification 11 | """ 12 | 13 | import argparse 14 | from .text_classification import Text_classification as Benchmark 15 | # TODO: figure out where to put it better 16 | from .text_classification import load_model, predict, get_vectors 17 | 18 | 19 | def add_extra_args(parser): 20 | parser.add_argument("embeddings") 21 | parser.add_argument("dataset") 22 | parser.add_argument('--batchsize', '-b', type=int, default=64, 23 | help='Number of images in each mini-batch') 24 | parser.add_argument('--epoch', '-e', type=int, default=30, 25 | help='Number of sweeps over the dataset to train') 26 | parser.add_argument('--gpu', '-g', type=int, default=-1, 27 | help='GPU ID (negative value indicates CPU)') 28 | parser.add_argument('--layer', '-l', type=int, default=1, 29 | help='Number of layers of RNN or MLP following CNN') 30 | parser.add_argument('--dropout', '-d', type=float, default=0.4, 31 | help='Dropout rate') 32 | parser.add_argument('--model', '-model', default='cnn', 33 | choices=['cnn', 'rnn', 'bow'], 34 | help='Name of encoder model type') 35 | # args = parser.parse_args(extra_args) 36 | # embeddings = load_from_dir(args.embeddings) 37 | # text_classification = Text_classification(batchsize=args.batchsize, epoch=args.epoch, gpu=args.gpu, 38 | # layer=args.layer, dropout=args.dropout, model=args.model) 39 | # text_classification.run_with_args(args) 40 | -------------------------------------------------------------------------------- /vecto/benchmarks/categorization/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from .categorization import KMeansCategorization as Benchmark 3 | from .categorization import purity_score 4 | from vecto.embeddings import load_from_dir 5 | from vecto.utils.data import save_json, print_json 6 | from vecto.utils import get_time_str 7 | 8 | 9 | def select_method(key): 10 | options = {} 11 | # if key == 'SpectralCategorization': 12 | # method = SpectralCategorization(options) 13 | if key == 'KMeansCategorization': 14 | method = KMeansCategorization(options) 15 | else: 16 | raise RuntimeError('The method name was not recognized.') 17 | return method 18 | 19 | 20 | def add_extra_args(parser): 21 | parser.add_argument('embeddings') 22 | parser.add_argument('dataset') 23 | # TODO: move method selection to benchmark class 24 | # parser.add_argument('--method', help='Categorization method', default='KMeansCategorization') 25 | # args = parser.parse_args(extra_args) 26 | # embeddings = load_from_dir(args.embeddings) 27 | # benchmark = select_method(args.method) 28 | # results = benchmark.get_result(embeddings, args.dataset) 29 | # if args.path_out: 30 | # # TODO: this does not seem to work if the dir does not exist 31 | # # let us always assume dir, clean this up later if no better idea 32 | # # if path.isdir(args.path_out) or args.path_out.endswith('/'): 33 | # dataset = path.basename(path.normpath(args.dataset)) 34 | # timestamp = get_time_str() 35 | # name_file_out = path.join(args.path_out, 36 | 
# dataset, 37 | # args.method, 38 | # timestamp, 39 | # 'results.json') 40 | # save_json(results, name_file_out) 41 | # # else: 42 | # # save_json(results, args.path_out) 43 | # else: 44 | # print_json(results) 45 | -------------------------------------------------------------------------------- /tests/data/vocabs/plain/vocab.tsv: -------------------------------------------------------------------------------- 1 | #word frequency 2 | , 671 3 | . 406 4 | of 345 5 | the 330 6 | to 324 7 | and 271 8 | her 184 9 | was 177 10 | his 176 11 | a 170 12 | it 146 13 | in 145 14 | for 119 15 | be 118 16 | she 107 17 | he 101 18 | as 97 19 | i 95 20 | that 92 21 | not 82 22 | their 70 23 | him 69 24 | by 65 25 | had 63 26 | which 63 27 | but 62 28 | at 60 29 | them 60 30 | no 60 31 | have 58 32 | with 57 33 | so 56 34 | on 54 35 | you 54 36 | is 50 37 | from 47 38 | would 47 39 | they 45 40 | could 45 41 | will 44 42 | dashwood 42 43 | ! 42 44 | my 39 45 | were 38 46 | more 38 47 | than 37 48 | very 36 49 | mrs 35 50 | all 34 51 | any 34 52 | mother 33 53 | house 32 54 | such 31 55 | every 29 56 | elinor 27 57 | this 26 58 | do 26 59 | norland 25 60 | own 25 61 | what 25 62 | if 25 63 | who 24 64 | an 24 65 | been 23 66 | one 23 67 | much 23 68 | or 23 69 | john 21 70 | your 21 71 | might 20 72 | pounds 19 73 | when 19 74 | think 19 75 | said 19 76 | himself 18 77 | too 18 78 | should 18 79 | great 17 80 | only 17 81 | how 17 82 | must 17 83 | may 17 84 | are 16 85 | there 16 86 | can 16 87 | far 15 88 | make 15 89 | though 15 90 | marianne 15 91 | soon 14 92 | father 14 93 | thousand 14 94 | well 14 95 | did 14 96 | some 14 97 | we 14 98 | man 13 99 | sister 13 100 | mr 13 101 | present 13 102 | first 13 103 | other 13 104 | time 13 105 | give 13 106 | now 13 107 | herself 13 108 | sure 13 109 | shall 13 110 | edward 13 111 | many 12 112 | opinion 12 113 | into 12 114 | fortune 12 115 | half 12 116 | really 12 117 | sisters 12 118 | thing 12 119 | enough 12 120 | day 12 121 | me 12 122 | say 12 123 | taste 12 124 | good 11 125 | years 11 126 | three 11 127 | comfortable 11 128 | handsome 11 129 | little 11 130 | love 11 131 | ? 11 132 | am 11 133 | barton 11 134 | before 10 135 | heart 10 136 | gave 10 137 | child 10 138 | most 10 139 | then 10 140 | feel 10 141 | ever 10 142 | beyond 10 143 | see 10 144 | -------------------------------------------------------------------------------- /docs/source/tutorial/visualization.rst: -------------------------------------------------------------------------------- 1 | Visualization 2 | ============= 3 | 4 | .. currentmodule:: vecto 5 | 6 | When you have the numerical vectors for the units you are interested in, you can use all the goodies of matplotlib to create any kind of visualizaion you like. The visualize module of Vecto provides a few simple examples to get you started and/or quickly explore your model as you go. 7 | 8 | The `visualize` module of vecto comes with several functions to quickly explore the representations. 9 | 10 | Drawing features 11 | ---------------- 12 | 13 | >>> from vecto import visualize as vz 14 | >>> vs.draw_features(vsm, ["apple", "pear", "cat", "dog"], num_features=20) 15 | 16 | .. image:: images/draw_features.png 17 | 18 | TODO: how to interpret this. 19 | 20 | Visualizing similarity between certain words. 21 | --------------------------------------------- 22 | 23 | >>> vs.draw_features_and_similarity(vsm, ["apple", "pear", "cat", "dog"]) 24 | 25 | .. 
image:: images/draw_similarity.png 26 | 27 | The color intensity indicates the degree of similarity. We can see that apple is more similar to pear than to cat or dog, and the other way round. 28 | 29 | Visualizing dimensions 30 | ---------------------- 31 | 32 | In a dense VSM, each dimension on its own is `not likely to be an interpretable semantic feature on its own `_. Still, it is the overall pattern of the dimensions that encodes the meaning of any given language unit, and so it may be useful to visually inspect them. 33 | 34 | >>> vs.std_to_img(vsm.get_row("apple")) 35 | 36 | .. image:: images/std_to_img.png 37 | 38 | >>> vs.std_to_img(vsm.get_row("cat")) 39 | 40 | .. image:: images/cat.png 41 | 42 | 43 | The `rows_to_img_tips` function displays only the end points of all dimensions in a given collection of vectors. 44 | 45 | >>> vectors = vs.wordlist_to_rows(vsm, ["apple", "pear", "cat", "dog"]) 46 | >>> vs.rows_to_img_tips(vectors, max_y=0.8) 47 | 48 | .. image:: images/img_tips.png 49 | -------------------------------------------------------------------------------- /vecto/data/io.py: -------------------------------------------------------------------------------- 1 | from requests import get 2 | from vecto.corpus.tokenization import word_tokenize_txt 3 | 4 | 5 | # TODO: move this to corpus module 6 | def normalize_text(text): 7 | return text.strip().lower() 8 | 9 | 10 | def read_first_col_is_label_format(path, char_based=False): 11 | dataset = [] 12 | with open(path, encoding='utf-8', errors='ignore') as f: 13 | for i, l in enumerate(f): 14 | if i == 0: 15 | continue 16 | if len(l.strip()) < 3: 17 | continue 18 | label, text = l.strip().split("\t", 1) 19 | # TODO: make lower-casing optional 20 | text = normalize_text(text) 21 | label = int(label) 22 | # if char_based: 23 | # tokens = list(text) 24 | # else: 25 | # tokens = word_tokenize_txt(text) 26 | dataset.append((text, label)) 27 | return dataset 28 | 29 | # TODO: detect where the label is or specify format 30 | def read_tsv_label_last(path): 31 | dataset = [] 32 | with open(path, encoding='utf-8', errors='ignore') as f: 33 | for i, l in enumerate(f): 34 | if len(l.strip()) < 3: 35 | continue 36 | text, label = l.strip().split("\t", 1) 37 | # print(label) 38 | if label == "label": 39 | continue 40 | # TODO: make lower-casing optional 41 | text = normalize_text(text) 42 | label = int(label) 43 | # TODO: move tokenization to another layer 44 | # if char_based: 45 | # tokens = list(text) 46 | # else: 47 | # tokens = word_tokenize_txt(text) 48 | dataset.append((text, label)) 49 | return dataset 50 | 51 | 52 | def fetch_file(url, path, chunk_size=512): 53 | myfile = get(url, allow_redirects=True) 54 | open(path, 'wb').write(myfile.content) 55 | #response = get(url, stream=True) 56 | #handle = open(path, 'wb') 57 | #for chunk in response.iter_content(chunk_size=chunk_size): 58 | # if chunk: 59 | # handle.write(chunk) 60 | #handle.close() -------------------------------------------------------------------------------- /examples/most_similar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import vecto\n", 10 | "import vecto.embeddings" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 8, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "embeddings = vecto.embeddings.load_from_dir(\"/storage/data/NLP/embeddings/6b.wiki_giga\")" 20 | ]
21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 10, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "embeddings.cache_normalized_copy()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 11, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "[['apple', 1.0],\n", 40 | " ['iphone', 0.79935205],\n", 41 | " ['macintosh', 0.79181653],\n", 42 | " ['ipod', 0.78805625],\n", 43 | " ['microsoft', 0.7831917],\n", 44 | " ['ipad', 0.781405],\n", 45 | " ['intel', 0.77287817],\n", 46 | " ['ibm', 0.7643097],\n", 47 | " ['google', 0.7641237],\n", 48 | " ['imac', 0.753626]]" 49 | ] 50 | }, 51 | "execution_count": 11, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "embeddings.get_most_similar_words(\"apple\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.6.3" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /tests/benchmarks/test_relation_extraction.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks import visualize 8 | from vecto.embeddings import load_from_dir 9 | from vecto.data import Dataset 10 | from tests.test_setup import run_module 11 | 12 | 13 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 14 | path_dataset = path.join('tests', 'data', 'benchmarks', 'relation_extraction') 15 | 16 | 17 | class Tests(unittest.TestCase): 18 | # def test_api(self): 19 | # embs = load_from_dir(path_emb) 20 | 21 | # for method in ['lr', '2FFNN']: 22 | # sequence_labeling = Sequence_labeling(method=method) 23 | # for subtask in ['chunk', 'pos', 'ner']: # , 'chunk', 'pos', 'ner' 24 | # result = sequence_labeling.get_result(embs, path.join(path_sequence_labeling_dataset, subtask)) 25 | # self.assertIsInstance(result[0], dict) 26 | # print(result) 27 | 28 | def test_cli(self): 29 | sio = io.StringIO() 30 | with contextlib.redirect_stdout(sio): 31 | run_module("vecto", 32 | "benchmark", 33 | "relation_extraction", 34 | path_emb, 35 | path_dataset, 36 | "--path_out", "/tmp/vecto/benchmarks/") 37 | 38 | with self.assertRaises(FileNotFoundError): 39 | sio = io.StringIO() 40 | with contextlib.redirect_stdout(sio): 41 | run_module("vecto", 42 | "benchmark", 43 | "relation_extraction", 44 | path_emb + "NONEXISTING", 45 | path_dataset, 46 | "--path_out", 47 | "/tmp/vecto/benchmarks/") 48 | 49 | from matplotlib import pyplot as plt 50 | visualize.plot_accuracy("/tmp/vecto/benchmarks/relation_extraction", key_secondary="experiment_setup.dataset") 51 | plt.savefig("/tmp/vecto/benchmarks/relation_extraction.pdf", bbox_inches="tight") 52 | -------------------------------------------------------------------------------- /vecto/vocabulary/__main__.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from .vocabulary import create_ngram_tokens_from_dir, create_from_annotated_dir 4 | from .vocabulary import create_from_path 5 | 6 | 7 | def parse_args(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--type', '-t', choices=['normal', 'annotated', 'ngram_tokens'], 10 | default='normal', 11 | help='vocab type') 12 | parser.add_argument('--min_ngram', '-minn', default=2, type=int, 13 | help='minimal number of ngrams') 14 | parser.add_argument('--max_ngram', '-maxn', default=3, type=int, 15 | help='maximal number of ngrams') 16 | parser.add_argument('--min_frequency', '-minf', default=100, type=int, 17 | help='minimal token frequency') 18 | parser.add_argument('--context_representation', '-cp', choices=['word', 'deps', 'ne', ], 19 | default='word', 20 | help='context representation; ' 21 | 'the annotated corpus is required') 22 | parser.add_argument('--path_corpus', help='path to the corpus', required=True) 23 | parser.add_argument('--path_out', help='path to save vocab', required=True) 24 | 25 | args = parser.parse_args() 26 | return args 27 | 28 | 29 | def run(args): 30 | print(args.type) 31 | if args.type == "normal": 32 | v = create_from_path(args.path_corpus, args.min_frequency) 33 | v.save_to_dir(os.path.join(args.path_out, args.type)) 34 | if args.type == "annotated": 35 | v = create_from_annotated_dir(args.path_corpus, args.min_frequency, args.context_representation) 36 | v.save_to_dir(os.path.join(args.path_out, args.type, args.context_representation)) 37 | if args.type == "ngram_tokens": 38 | v = create_ngram_tokens_from_dir(args.path_corpus, args.min_ngram, args.max_ngram, args.min_frequency) 39 | v.save_to_dir(os.path.join(args.path_out, args.type, str(args.min_ngram), str(args.max_ngram))) 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | # print(args) 45 | run(args) 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /tests/benchmarks/test_synonymy_detection.py: -------------------------------------------------------------------------------- 1 | """Tests for synonymy detection benchmark.""" 2 | 3 | import unittest 4 | from io import StringIO 5 | from contextlib import redirect_stdout 6 | from vecto.benchmarks.synonymy_detection import * 7 | from vecto.embeddings import load_from_dir 8 | from ..test_setup import run_module 9 | 10 | path_synonymy_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'synonymy_detection') 11 | 12 | 13 | class Tests(unittest.TestCase): 14 | @classmethod 15 | def test_synonymy(self): 16 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 17 | synonymy = CosineDistance() 18 | synonymy.get_result(embs, path_synonymy_dataset) 19 | 20 | @classmethod 21 | def test_cli(self): 22 | sio = StringIO() 23 | with redirect_stdout(sio): 24 | run_module('vecto.benchmarks.synonymy_detection', 25 | './tests/data/embeddings/text/plain_with_file_header/', 26 | './tests/data/benchmarks/synonymy_detection', 27 | '--path_out', '/tmp/vecto/benchmarks', '--method', 'CosineDistance') 28 | 29 | def test_synonymy_results(self): 30 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 31 | synonymy = CosineDistance() 32 | result = synonymy.get_result(embs, path_synonymy_dataset)['test'] 33 | cat_is_synonym = 'yes' 34 | cat_is_hit = False 35 | distance_to_cat
= 1.0 36 | 37 | self.assertEqual(result['tiger'][0]['is_synonym'], cat_is_synonym) 38 | self.assertEqual(result['tiger'][0]['hit'], cat_is_hit) 39 | self.assertEqual(result['tiger'][0]['distance'], distance_to_cat) 40 | 41 | def test_synonymy_reader(self): 42 | synonymy = CosineDistance() 43 | test_set = synonymy.read_test_set(path.join(path_synonymy_dataset, 'test.csv')) 44 | expected_amount_of_keys = 2 45 | expected_amount_of_tiger_suspicious = 3 46 | cat_is_synonym_with_tiger = 'yes' 47 | 48 | self.assertEqual(len(test_set.keys()), expected_amount_of_keys) 49 | self.assertEqual(len(test_set['tiger']), expected_amount_of_tiger_suspicious) 50 | self.assertEqual(test_set['tiger'][0][1], cat_is_synonym_with_tiger) 51 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/chunk/test.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | 70 | Spanish NNP I-NP I-MISC 71 | Farm NNP I-NP O 72 | Minister NNP I-NP O 73 | Loyola NNP I-NP I-PER 74 | de NNP I-NP I-PER 75 | Palacio NNP I-NP I-PER 76 | had VBD I-VP O 77 | earlier RBR I-VP O 78 | accused VBN I-VP O 79 | Fischler NNP I-NP I-PER 80 | at IN I-PP O 81 | an DT I-NP O 82 | EU JJ I-NP I-ORG 83 | farm NN I-NP O 84 | ministers NNS I-NP O 85 | ' POS B-NP O 86 | meeting NN I-NP O 87 | of IN I-PP O 88 | causing VBG I-VP O 89 | unjustified JJ I-ADJP O 90 | alarm NN I-NP O 91 | through IN I-PP O 92 | " " O O 93 | dangerous JJ I-NP O 94 | generalisation NN I-NP O 95 | . . O O 96 | " " O O 97 | 98 | . . O O 99 | 100 | Only RB I-NP O 101 | France NNP I-NP I-LOC 102 | and CC I-NP O 103 | Britain NNP I-NP I-LOC 104 | backed VBD I-VP O 105 | Fischler NNP I-NP I-PER 106 | 's POS B-NP O 107 | proposal NN I-NP O 108 | . . 
O O 109 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/ner/test.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | 70 | Spanish NNP I-NP I-MISC 71 | Farm NNP I-NP O 72 | Minister NNP I-NP O 73 | Loyola NNP I-NP I-PER 74 | de NNP I-NP I-PER 75 | Palacio NNP I-NP I-PER 76 | had VBD I-VP O 77 | earlier RBR I-VP O 78 | accused VBN I-VP O 79 | Fischler NNP I-NP I-PER 80 | at IN I-PP O 81 | an DT I-NP O 82 | EU JJ I-NP I-ORG 83 | farm NN I-NP O 84 | ministers NNS I-NP O 85 | ' POS B-NP O 86 | meeting NN I-NP O 87 | of IN I-PP O 88 | causing VBG I-VP O 89 | unjustified JJ I-ADJP O 90 | alarm NN I-NP O 91 | through IN I-PP O 92 | " " O O 93 | dangerous JJ I-NP O 94 | generalisation NN I-NP O 95 | . . O O 96 | " " O O 97 | 98 | . . O O 99 | 100 | Only RB I-NP O 101 | France NNP I-NP I-LOC 102 | and CC I-NP O 103 | Britain NNP I-NP I-LOC 104 | backed VBD I-VP O 105 | Fischler NNP I-NP I-PER 106 | 's POS B-NP O 107 | proposal NN I-NP O 108 | . . O O 109 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/pos/test.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | Fischler JJR I-NP I-PER 4 | proposed VBN I-NP O 5 | EU-wide NNP I-NP I-MISC 6 | measures VBZ I-VP O 7 | after IN I-PP O 8 | reports NNS I-NP O 9 | from IN I-PP O 10 | Britain NNP I-NP I-LOC 11 | and CC I-NP O 12 | France NNP I-NP I-LOC 13 | that WDT B-NP O 14 | under IN I-PP O 15 | laboratory NN I-NP O 16 | conditions NNS B-NP O 17 | sheep NN I-NP O 18 | could MD I-VP O 19 | contract VB I-VP O 20 | Bovine NNP I-NP I-MISC 21 | Spongiform NNP I-NP I-MISC 22 | Encephalopathy NNP I-NP I-MISC 23 | ( ( O O 24 | BSE NNP I-NP I-MISC 25 | ) ) O O 26 | -- : O O 27 | mad JJ I-NP O 28 | cow NN I-NP O 29 | disease NN I-NP O 30 | . . 
O O 31 | 32 | But CC O O 33 | Fischler NNP I-NP I-PER 34 | agreed VBD I-VP O 35 | to TO I-VP O 36 | review VB I-VP O 37 | his PRP$ I-NP O 38 | proposal NN I-NP O 39 | after IN I-PP O 40 | the DT I-NP O 41 | EU NNP I-NP I-ORG 42 | 's POS B-NP O 43 | standing NN I-NP O 44 | veterinary JJ I-NP O 45 | committee NN I-NP O 46 | , , O O 47 | mational JJ I-NP O 48 | animal NN I-NP O 49 | health NN I-NP O 50 | officials NNS I-NP O 51 | , , O O 52 | questioned VBD I-VP O 53 | if IN I-SBAR O 54 | such JJ I-NP O 55 | action NN I-NP O 56 | was VBD I-VP O 57 | justified VBN I-VP O 58 | as IN I-PP O 59 | there RB I-ADVP O 60 | was VBD I-VP O 61 | only RB I-ADVP O 62 | a DT I-NP O 63 | slight JJ I-NP O 64 | risk NN I-NP O 65 | to TO I-PP O 66 | human JJ I-NP O 67 | health NN I-NP O 68 | . . O O 69 | 70 | Spanish NNP I-NP I-MISC 71 | Farm NNP I-NP O 72 | Minister NNP I-NP O 73 | Loyola NNP I-NP I-PER 74 | de NNP I-NP I-PER 75 | Palacio NNP I-NP I-PER 76 | had VBD I-VP O 77 | earlier RBR I-VP O 78 | accused VBN I-VP O 79 | Fischler NNP I-NP I-PER 80 | at IN I-PP O 81 | an DT I-NP O 82 | EU JJ I-NP I-ORG 83 | farm NN I-NP O 84 | ministers NNS I-NP O 85 | ' POS B-NP O 86 | meeting NN I-NP O 87 | of IN I-PP O 88 | causing VBG I-VP O 89 | unjustified JJ I-ADJP O 90 | alarm NN I-NP O 91 | through IN I-PP O 92 | " " O O 93 | dangerous JJ I-NP O 94 | generalisation NN I-NP O 95 | . . O O 96 | " " O O 97 | 98 | . . O O 99 | 100 | Only RB I-NP O 101 | France NNP I-NP I-LOC 102 | and CC I-NP O 103 | Britain NNP I-NP I-LOC 104 | backed VBD I-VP O 105 | Fischler NNP I-NP I-PER 106 | 's POS B-NP O 107 | proposal NN I-NP O 108 | . . O O 109 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | vecto 2 | ***** 3 | 4 | .. image:: https://api.travis-ci.com/vecto-ai/vecto.svg?branch=master 5 | :target: https://travis-ci.com/vecto-ai/vecto 6 | :alt: build status from Travis CI 7 | 8 | .. image:: https://ci.appveyor.com/api/projects/status/github/vecto-ai/vecto?branch=master&svg=true 9 | :target: https://ci.appveyor.com/project/undertherain/vecto 10 | :alt: build status from AppVeyor 11 | 12 | .. image:: https://coveralls.io/repos/github/vecto-ai/vecto/badge.svg?branch=master 13 | :target: https://coveralls.io/github/vecto-ai/vecto?branch=master 14 | :alt: coveralls badge 15 | 16 | .. image:: https://api.codacy.com/project/badge/Grade/65aabe10113d45819091d005414462ca 17 | :target: https://www.codacy.com/app/undertherain/vecto 18 | :alt: grade from Codacy 19 | 20 | .. image:: https://badge.fury.io/py/vecto.svg 21 | :target: https://badge.fury.io/py/vecto 22 | :alt: pypi version 23 | 24 | .. image:: https://badges.gitter.im/badge.svg 25 | :alt: Join the chat at https://gitter.im/vecto-ai/Lobby 26 | :target: https://gitter.im/vecto-ai/Lobby 27 | 28 | Vecto helps to perform a range of tasks within the framework of vector space models of computational linguistics. 29 | 30 | What functionality is included 31 | ============================== 32 | 33 | * creating word embeddings by counting and neural-based methods, including sub-word-level models; 34 | * importing and exporting from a number of popular formats of word embeddings and providing unified access to word vectors; 35 | * perfroming a range of downstream tasks / benchmarks; 36 | * visualising embeddings. 37 | 38 | How do I get set up? 
39 | ==================== 40 | 41 | * ``pip3 install vecto`` for stable version 42 | * ``pip3 install git+https://github.com/vecto-ai/vecto.git`` for latest dev version 43 | * Python 3.6 or later is required 44 | 45 | 📖 Documentation 46 | ================ 47 | 48 | =================== === 49 | `Tutorial`_ vecto overview and end-to-end examples. 50 | `API Reference`_ The detailed reference for vecto API. 51 | `Contribute`_ How to contribute to the vecto project and code base. 52 | =================== === 53 | 54 | .. _Tutorial: http://vecto.readthedocs.io/en/docs/tutorial/index.html 55 | .. _API Reference: http://vecto.readthedocs.io/en/docs/reference/index.html 56 | .. _Contribute: http://vecto.readthedocs.io/en/docs/contribution.html 57 | -------------------------------------------------------------------------------- /vecto/utils/metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .data import load_json, save_json 3 | from vecto._version import VERSION 4 | 5 | METADATA_SUFFIX = 'metadata.json' 6 | 7 | 8 | def make_metadata_path(fname): 9 | if os.path.isdir(fname): 10 | return os.path.join(fname, METADATA_SUFFIX) 11 | return '{}.{}'.format(fname, METADATA_SUFFIX) 12 | 13 | 14 | def save_metadata(data, base_path): 15 | save_json(data, make_metadata_path(base_path)) 16 | 17 | 18 | def try_load_metadata(base_path): 19 | try: 20 | return load_json(make_metadata_path(base_path)) 21 | except IOError: 22 | return {} 23 | 24 | 25 | def get_full_typename(obj): 26 | # cls = type(obj) 27 | if obj.__class__.__name__ == 'function': 28 | clsname = obj.__name__ 29 | else: 30 | clsname = obj.__class__.__name__ 31 | return '{}.{}'.format(obj.__module__, clsname) 32 | 33 | 34 | class WithMetaData(object): 35 | """ 36 | Base object for all objects with metadata. Contains utilities for metadata loading from files, storing to files, 37 | collecting/merging etc. 38 | 39 | User of this class is responsible for calling __init__ or init_metadata and save_metadata 40 | in proper places of inheritor. 
41 | """ 42 | 43 | def __init__(self, base_path=None, **other_metadata): 44 | """ 45 | see init_metadata 46 | """ 47 | self.metadata = {} 48 | self.init_metadata(base_path=base_path, **other_metadata) 49 | 50 | def init_metadata(self, base_path=None, **other_metadata): 51 | """ 52 | :param base_path: path from which metadata.json path will be constructed 53 | :param other_metadata: anything json serializable 54 | """ 55 | # self._metadata = {"vecto_version": VERSION} 56 | if base_path is not None: 57 | self.metadata['_base_path'] = base_path 58 | self.load_metadata(base_path) 59 | self.metadata.update(other_metadata) 60 | self.metadata['_class'] = get_full_typename(self) 61 | 62 | def save_metadata(self, base_path): 63 | """ 64 | :param base_path: path from which metadata.json path will be constructed 65 | """ 66 | save_metadata(self.metadata, base_path) 67 | 68 | def load_metadata(self, base_path): 69 | """ 70 | :param base_path: path from which metadata.json path will be constructed 71 | """ 72 | self.metadata.update(try_load_metadata(base_path)) 73 | -------------------------------------------------------------------------------- /vecto/benchmarks/text_classification/nlp_utils.py: -------------------------------------------------------------------------------- 1 | # import collections 2 | # import io 3 | 4 | import numpy 5 | 6 | import chainer 7 | from chainer.backends import cuda 8 | 9 | 10 | def normalize_text(text): 11 | return text.strip().lower() 12 | 13 | 14 | # def make_vocab(dataset, max_vocab_size=20000, min_freq=2): 15 | # counts = collections.defaultdict(int) 16 | # for tokens, _ in dataset: 17 | # for token in tokens: 18 | # counts[token] += 1 19 | # 20 | # vocab = {'': 0, '': 1} 21 | # for w, c in sorted(counts.items(), key=lambda x: (-x[1], x[0])): 22 | # if len(vocab) >= max_vocab_size or c < min_freq: 23 | # break 24 | # vocab[w] = len(vocab) 25 | # return vocab 26 | 27 | 28 | # def read_vocab_list(path, max_vocab_size=20000): 29 | # vocab = {'': 0, '': 1} 30 | # with io.open(path, encoding='utf-8', errors='ignore') as f: 31 | # for l in f: 32 | # w = l.strip() 33 | # if w not in vocab and w: 34 | # vocab[w] = len(vocab) 35 | # if len(vocab) >= max_vocab_size: 36 | # break 37 | # return vocab 38 | 39 | 40 | def make_array(tokens, vocab, add_eos=True): 41 | ids = [vocab[token] for token in tokens if token in vocab ] 42 | ids.append(0) 43 | return numpy.array(ids, numpy.int32) 44 | 45 | 46 | def transform_to_array(dataset, vocab, with_label=True): 47 | if with_label: 48 | return [(make_array(tokens, vocab), numpy.array([cls], numpy.int32)) 49 | for tokens, cls in dataset] 50 | else: 51 | return [make_array(tokens, vocab) 52 | for tokens in dataset] 53 | 54 | 55 | def convert_seq(batch, device=None, with_label=True): 56 | def to_device_batch(batch): 57 | if device is None: 58 | return batch 59 | elif device < 0: 60 | return [chainer.dataset.to_device(device, x) for x in batch] 61 | else: 62 | xp = cuda.cupy.get_array_module(*batch) 63 | concat = xp.concatenate(batch, axis=0) 64 | sections = numpy.cumsum([len(x) 65 | for x in batch[:-1]], dtype=numpy.int32) 66 | concat_dev = chainer.dataset.to_device(device, concat) 67 | batch_dev = cuda.cupy.split(concat_dev, sections) 68 | return batch_dev 69 | 70 | if with_label: 71 | return {'xs': to_device_batch([x for x, _ in batch]), 72 | 'ys': to_device_batch([y for _, y in batch])} 73 | else: 74 | return to_device_batch([x for x in batch]) 75 | -------------------------------------------------------------------------------- 
/tests/benchmarks/test_similarity.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.similarity import Benchmark as Similarity 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from vecto.data import Dataset 11 | from tests.test_setup import run_module 12 | 13 | 14 | path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity') 15 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 16 | 17 | 18 | class Tests(unittest.TestCase): 19 | 20 | def test_api(self): 21 | embs = load_from_dir(path_emb) 22 | dataset = Dataset(path_similarity_dataset) 23 | similarity = Similarity() 24 | result = similarity.run(embs, dataset) 25 | self.assertIsInstance(result[0], dict) 26 | print(result) 27 | 28 | similarity = Similarity(ignore_oov=False) 29 | result = similarity.run(embs, dataset) 30 | self.assertIsInstance(result[0], dict) 31 | print(result) 32 | 33 | similarity = Similarity(normalize=False) 34 | result = similarity.run(embs, dataset) 35 | self.assertIsInstance(result[0], dict) 36 | print(result) 37 | 38 | def test_cli(self): 39 | sio = io.StringIO() 40 | with contextlib.redirect_stdout(sio): 41 | run_module("vecto", 42 | "benchmark", 43 | "similarity", 44 | path_emb, 45 | path_similarity_dataset, 46 | "--path_out", "/tmp/vecto/benchmarks/") 47 | 48 | sio = io.StringIO() 49 | with contextlib.redirect_stdout(sio): 50 | run_module("vecto", 51 | "benchmark", 52 | "similarity", 53 | path_emb, 54 | path_similarity_dataset, 55 | "--path_out", "/tmp/vecto/benchmarks/tmp") 56 | 57 | with self.assertRaises(FileNotFoundError): 58 | sio = io.StringIO() 59 | with contextlib.redirect_stdout(sio): 60 | run_module("vecto", 61 | "benchmark", 62 | "similarity", 63 | path_emb + "NONEXISTING", 64 | path_similarity_dataset, 65 | "--path_out", "/tmp/vecto/benchmarks/") 66 | 67 | from matplotlib import pyplot as plt 68 | visualize.plot_accuracy("/tmp/vecto/benchmarks/word_similarity", key_secondary="experiment_setup.dataset") 69 | plt.savefig("/tmp/vecto/benchmarks/similarity.pdf", bbox_inches="tight") 70 | -------------------------------------------------------------------------------- /docs/source/tutorial/basic.rst: -------------------------------------------------------------------------------- 1 | Introduction to Vecto 2 | ====================== 3 | 4 | .. currentmodule:: vecto 5 | 6 | This is the tutorial for Vecto. It describes: 7 | 8 | * What it is, and why we are developing it. 9 | * what you can do with Vecto. 10 | * the roadmap of the project. 11 | 12 | Both the library and the documentation are actively developed, check back for more! If you have questions, or would like to contribute, feel free to get in touch on `github `_. 13 | 14 | What is Vecto? 15 | ------------------- 16 | 17 | Vecto is an open-source Python library for working with vector space models (VSMs), including various word embeddings such as word2vec. Vecto can load various popular formats of VSMs and retrieve nearest neighbors of a given vector. It includes a growing list of benchmarks with which VSMs are evaluated in most current research, and a few visualization tools. It also includes a growing list of modules for creating VSMs, both explicit and based on neural networks. 18 | 19 | Why do you bother? 
20 | -------------------- 21 | 22 | There are a few other libraries for working with VSMs, including gensim and spacy. Vecto differs from them in that its primary goal is to facilitate principled, systematic research in providing **a framework for reproducible experiments** on VSMs. 23 | 24 | From the academic perspective, this matters because this is the only way to understand more about what VSMs are and what kind of meaning representation they offer. 25 | 26 | From the practical perspective, this matters because otherwise we can not tell which VSM would be the best to use for what task. Existing extrinsic evaluations of VSMs such as popular word similarity, relatedness, analogy and intrusion tasks have methodological problems and do not correlate well with performance on all extrinsic tasks. Therefore basically to pick the best representation for a task you have to try different kinds of VSMs until you find the best-performing one. 27 | 28 | Furthermore, there is the important and unpleasant part of parameter tuning and optimizing for a particular task. `Levy et al. (2015) `_ showed that the choice of hyperparameters may make more of a difference than the choice of model itself. Even more frustratingly, when you have a relatively comprehensive task covering a wide range of linguistic relations, you may find that the parameters beneficial to a part of the task are detrimental for another part `(Gladkova et al. 2016) `_. 29 | 30 | The neural parts of Vecto is implemented in `Chainer `_, a new deep learning framework that is friendly to high-performance multi-GPU environments. This should make Vecto useful in both academic and industrial settings. 31 | -------------------------------------------------------------------------------- /tests/benchmarks/test_language_modeling.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.language_modeling import Benchmark as Language_modeling 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from tests.test_setup import run_module 11 | 12 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 13 | 14 | 15 | class Tests(unittest.TestCase): 16 | 17 | def test_api(self): 18 | embs = load_from_dir(path_emb) 19 | language_modeling = Language_modeling(method='lstm') 20 | result = language_modeling.run(embs) 21 | self.assertIsInstance(result[0], dict) 22 | print(result) 23 | 24 | language_modeling = Language_modeling(method='lr') 25 | result = language_modeling.run(embs) 26 | self.assertIsInstance(result[0], dict) 27 | print(result) 28 | 29 | language_modeling = Language_modeling(method='2FFNN') 30 | result = language_modeling.run(embs) 31 | self.assertIsInstance(result[0], dict) 32 | print(result) 33 | 34 | language_modeling = Language_modeling(method='rnn') 35 | result = language_modeling.run(embs) 36 | self.assertIsInstance(result[0], dict) 37 | print(result) 38 | 39 | def test_cli(self): 40 | sio = io.StringIO() 41 | with contextlib.redirect_stdout(sio): 42 | run_module("vecto", 43 | "benchmark", 44 | "language_modeling", 45 | path_emb, 46 | "--window_size", "5", 47 | "--path_out", "/tmp/vecto/benchmarks/") 48 | 49 | sio = io.StringIO() 50 | with contextlib.redirect_stdout(sio): 51 | run_module("vecto", 52 | "benchmark", 53 | "language_modeling", 54 | path_emb, 55 | "--method", "lr", 56 | "--path_out", 
"/tmp/vecto/benchmarks/tmp") 57 | 58 | with self.assertRaises(FileNotFoundError): 59 | sio = io.StringIO() 60 | with contextlib.redirect_stdout(sio): 61 | run_module("vecto", 62 | "benchmark", 63 | "language_modeling", 64 | path_emb + "NONEXISTING", 65 | "--path_out", "/tmp/vecto/benchmarks/") 66 | 67 | from matplotlib import pyplot as plt 68 | visualize.plot_accuracy("/tmp/vecto/benchmarks/language_modeling", 69 | key_secondary="experiment_setup.dataset") 70 | plt.savefig("/tmp/vecto/benchmarks/language_modeling.pdf", 71 | bbox_inches="tight") 72 | 73 | 74 | # Tests().test_cli() 75 | -------------------------------------------------------------------------------- /tests/benchmarks/test_sequence_labeling.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.sequence_labeling import Benchmark as Sequence_labeling 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from vecto.data import Dataset 11 | from tests.test_setup import run_module 12 | 13 | 14 | path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling') 15 | path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner') # sequence labeling need to specify a sub task (pos, chunk, or ner) 16 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 17 | 18 | 19 | class Tests(unittest.TestCase): 20 | def test_api(self): 21 | embs = load_from_dir(path_emb) 22 | 23 | for method in ['lr', '2FFNN']: 24 | sequence_labeling = Sequence_labeling(method=method) 25 | for subtask in ['chunk', 'pos', 'ner']: # , 'chunk', 'pos', 'ner' 26 | dataset = Dataset(path.join(path_sequence_labeling_dataset, subtask)) 27 | result = sequence_labeling.run(embs, dataset) 28 | self.assertIsInstance(result[0], dict) 29 | print(result) 30 | 31 | def test_cli(self): 32 | sio = io.StringIO() 33 | with contextlib.redirect_stdout(sio): 34 | run_module("vecto", 35 | "benchmark", 36 | "sequence_labeling", 37 | path_emb, 38 | path_sequence_labeling_dataset_ner, 39 | "--path_out", "/tmp/vecto/benchmarks/") 40 | 41 | sio = io.StringIO() 42 | with contextlib.redirect_stdout(sio): 43 | run_module("vecto", 44 | "benchmark", 45 | "sequence_labeling", 46 | path_emb, 47 | path_sequence_labeling_dataset_ner, 48 | "--path_out", "/tmp/vecto/benchmarks/") 49 | 50 | with self.assertRaises(FileNotFoundError): 51 | sio = io.StringIO() 52 | with contextlib.redirect_stdout(sio): 53 | run_module("vecto", 54 | "benchmark", 55 | "sequence_labeling", 56 | path_emb + "NONEXISTING", 57 | path_sequence_labeling_dataset_ner, 58 | "--path_out", 59 | "/tmp/vecto/benchmarks/") 60 | 61 | from matplotlib import pyplot as plt 62 | # here the visualization only for the ner sub task. 63 | visualize.plot_accuracy("/tmp/vecto/benchmarks/sequence_labeling/ner", key_secondary="experiment_setup.dataset") 64 | plt.savefig("/tmp/vecto/benchmarks/sequence_labeling.pdf", bbox_inches="tight") 65 | -------------------------------------------------------------------------------- /tests/data/benchmarks/text_classification/test: -------------------------------------------------------------------------------- 1 | 0 a standard police-oriented drama that , were it not for de niro's participation , would have likely wound up a tnt original . 
2 | 0 afraid to pitch into farce , yet only half-hearted in its spy mechanics , all the queen's men is finally just one long drag . 3 | 0 with minimal imagination , you could restage the whole thing in your bathtub . 4 | 0 spousal abuse is a major problem in contemporary society , but the film reduces this domestic tragedy to florid melodrama . 5 | 0 too slick and manufactured to claim street credibility . 6 | 0 smothered by its own solemnity . 7 | 0 the 50-something lovebirds are too immature and unappealing to care about . 8 | 0 basically , it's pretty but dumb . 9 | 0 it's a deeply serious movie that cares passionately about its subject , but too often becomes ponderous in its teaching of history , or lost in the intricate connections and multiple timelines of its story . 10 | 0 i'm not suggesting that you actually see it , unless you're the kind of person who has seen every wim wenders film of the '70s . 11 | 0 kids who are into this thornberry stuff will probably be in wedgie heaven . anyone else who may , for whatever reason , be thinking about going to see this movie is hereby given fair warning . 12 | 0 as vulgar as it is banal . 13 | 0 return to neverland manages to straddle the line between another classic for the company and just another run-of-the-mill disney sequel intended for the home video market . 14 | 0 if you're not fans of the adventues of steve and terri , you should avoid this like the dreaded king brown snake . personally , i'd rather watch them on the animal planet . 15 | 0 femme fatale offers nothing more than a bait-and-switch that is beyond playing fair with the audience . are we dealing with dreams , visions or being told what actually happened as if it were the third ending of clue ? 16 | 0 life or something like it has its share of high points , but it misses too many opportunities . 17 | 1 will amuse and provoke adventurous adults in specialty venues . 18 | 1 great character interaction . 19 | 1 leave it to the french to truly capture the terrifying angst of the modern working man without turning the film into a cheap thriller , a dumb comedy or a sappy melodrama . 20 | 1 sits uneasily as a horror picture . . . but finds surprising depth in its look at the binds of a small family . 21 | 1 remarkably accessible and affecting . 22 | 1 a slick , well-oiled machine , exquisitely polished and upholstered . 23 | 1 a compelling film . 24 | 1 a refreshing korean film about five female high school friends who face an uphill battle when they try to take their relationships into deeper waters . 25 | 1 denis forges out of the theories of class- based rage and sisterly obsession a razor-sided tuning fork that rings with cultural , sexual and social discord . 26 | 1 mostly , [goldbacher] just lets her complicated characters be unruly , confusing and , through it all , human . 27 | 1 it is a challenging film , if not always a narratively cohesive one . 28 | 1 a worthy tribute to a great humanitarian and her vibrant 'co-stars . ' 29 | 1 . . . with " the bourne identity " we return to the more traditional action genre . 30 | 1 kaufman and jonze take huge risks to ponder the whole notion of passion -- our desire as human beings for passion in our lives and the emptiness one feels when it is missing . 31 | -------------------------------------------------------------------------------- /docs/source/tutorial/training_vectors.rst: -------------------------------------------------------------------------------- 1 | Training new models 2 | =================== 3 | 4 | .. 
currentmodule:: vsmlib 5 | 6 | 7 | This page describes how to train vectors with the models that are currently implemented in VSMlib. 8 | 9 | 10 | Word2vec 11 | -------- 12 | 13 | `Word2vec `_ is arguably the most popular word embedding model. 14 | We provide an implementation of an extended word2vec model, which can be trained on linear and dependency-based contexts, 15 | with bound and unbound context representations. 16 | 17 | Additionally, we provide an implementation which considers characters rather than words to be the minimal units. This enables it to take advantage of morphological information: as far as word-level models such as word2vec are concerned, "walk" and "walking" are completely unrelated, except through similarities in their distributions. 18 | 19 | To train word2vec embeddings, vsmlib can be invoked via the command line interface: 20 | 21 | >>> python3 -m vsmlib.embeddings.train_word2vec 22 | 23 | The command line parameters are as follows: 24 | 25 | --dimensions size of embeddings 26 | --context_type context type ['linear' or 'deps']; for the 'deps' context, an annotated corpus is required 27 | --context_representation context representation ['bound' or 'unbound'] 28 | --window window size 29 | --model base model type ['skipgram' or 'cbow'] 30 | --negative-size number of negative samples 31 | --out_type output model type ["hsm": hierarchical softmax, "ns": negative sampling, "original": no approximation] 32 | --subword specify if a subword-level approach should be used ["none", "rnn"] 33 | --batchsize learning minibatch size 34 | --gpu GPU ID (negative value indicates CPU) 35 | --epochs number of epochs to learn 36 | --maxWordLength max word length (only used for char-level subword) 37 | --path_vocab path to the vocabulary 38 | --path_corpus path to the corpus 39 | --path_out path to save embeddings 40 | --test run in test mode 41 | --verbose verbose mode 42 | 43 | 44 | Alternatively, word2vec training can be done through the vsmlib Python API. 45 | 46 | >>> vsmlib.embeddings.train_word2vec.train(args) 47 | 48 | The argument is an argparse.Namespace identical to the command line arguments. An instance of ModelDense is returned. 49 | 50 | Related papers: original w2v, Bofang, Mnih, subword. 
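For a concrete picture of how the API call above can be driven, here is a minimal sketch. It is only an illustration: the attribute names mirror the CLI flags listed above and are an assumption on our side (only a subset of the flags is shown), so check the argument parser of train_word2vec for the authoritative set. ::

    import argparse
    import vsmlib.embeddings.train_word2vec as train_word2vec

    # Assumed attribute names, mirroring the CLI flags above (not verified
    # against the actual parser); adjust to whatever the parser defines.
    args = argparse.Namespace(
        dimensions=100,
        context_type='linear',
        context_representation='unbound',
        window=5,
        model='skipgram',
        subword='none',
        batchsize=1000,
        gpu=-1,             # negative value -> run on CPU
        epochs=5,
        path_vocab='/path/to/vocab',
        path_corpus='/path/to/corpus',
        path_out='/tmp/vsm_out',
        test=False,
        verbose=True,
    )

    model = train_word2vec.train(args)   # returns a ModelDense instance

The BibTeX entries for the related papers mentioned above are given below.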
51 | 52 | :: 53 | 54 | @inproceedings{MikolovChenEtAl_2013_Efficient_estimation_of_word_representations_in_vector_space, 55 | title = {Efficient Estimation of Word Representations in Vector Space}, 56 | urldate = {2015-12-03}, 57 | booktitle = {Proceedings of International Conference on Learning Representations (ICLR)}, 58 | author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 59 | year = {2013}} 60 | 61 | :: 62 | 63 | @inproceedings{Li2017InvestigatingDS, 64 | title={Investigating Different Syntactic Context Types and Context Representations for Learning Word Embeddings}, 65 | author={Bofang Li and Tao Liu and Zhe Zhao and Buzhou Tang and Aleksandr Drozd and Anna Rogers and Xiaoyong Du}, 66 | booktitle={EMNLP}, 67 | year={2017}} 68 | 69 | 70 | -------------------------------------------------------------------------------- /tests/benchmarks/test_text_classification.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.text_classification import Benchmark as Text_classification 8 | from vecto.benchmarks.text_classification import load_model, predict, get_vectors 9 | from vecto.benchmarks import visualize 10 | from vecto.embeddings import load_from_dir 11 | from vecto.data import Dataset 12 | from tests.test_setup import run_module 13 | 14 | path_text_classification_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'text_classification') 15 | path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') 16 | 17 | class Tests(unittest.TestCase): 18 | 19 | def test_api(self): 20 | embs = load_from_dir(path_emb) 21 | dataset = Dataset(path_text_classification_dataset) 22 | 23 | tc = Text_classification(model='cnn') 24 | result = tc.run(embs, dataset, 25 | "/tmp/vecto/benchmarks/text_classification_model/") 26 | self.assertIsInstance(result[0], dict) 27 | print(result) 28 | 29 | tc = Text_classification(model='rnn') 30 | result = tc.run(embs, dataset, 31 | "/tmp/vecto/benchmarks/text_classification_model/") 32 | self.assertIsInstance(result[0], dict) 33 | print(result) 34 | 35 | tc = Text_classification(model='bow') 36 | result = tc.run(embs, dataset, 37 | "/tmp/vecto/benchmarks/text_classification_model/") 38 | self.assertIsInstance(result[0], dict) 39 | print(result) 40 | 41 | model = load_model("/tmp/vecto/benchmarks/text_classification_model/args.json", 42 | embs.matrix) 43 | print(predict(model, "I like this")) 44 | print(get_vectors(model, ["I like this", "I hate this"])) 45 | 46 | def test_cli(self): 47 | sio = io.StringIO() 48 | with contextlib.redirect_stdout(sio): 49 | run_module("vecto", 50 | "benchmark", 51 | "text_classification", 52 | path_emb, 53 | path_text_classification_dataset, 54 | "--model", "cnn", 55 | "--path_out", "/tmp/vecto/benchmarks/") 56 | 57 | sio = io.StringIO() 58 | with contextlib.redirect_stdout(sio): 59 | run_module("vecto", 60 | "benchmark", 61 | "text_classification", 62 | path_emb, 63 | path_text_classification_dataset, 64 | "--model", "cnn", 65 | "--path_out", "/tmp/vecto/benchmarks/") 66 | 67 | with self.assertRaises(FileNotFoundError): 68 | sio = io.StringIO() 69 | with contextlib.redirect_stdout(sio): 70 | run_module("vecto", 71 | "benchmark", 72 | "text_classification", 73 | path_emb + "NONEXISTING", 74 | path_text_classification_dataset, 75 | "--path_out", "/tmp/vecto/benchmarks/") 76 | 77 | from matplotlib import pyplot 
as plt 78 | visualize.plot_accuracy("/tmp/vecto/benchmarks/text_classification", key_secondary="experiment_setup.dataset") 79 | plt.savefig("/tmp/vecto/benchmarks/text_classification.pdf", bbox_inches="tight") 80 | 81 | -------------------------------------------------------------------------------- /tests/test_embeddings.py: -------------------------------------------------------------------------------- 1 | """Tests for embeddings module.""" 2 | 3 | import unittest 4 | from unittest.mock import patch 5 | from os import path 6 | import numpy as np 7 | from vecto.embeddings.dense import WordEmbeddingsDense 8 | from vecto.embeddings.base import WordEmbeddings 9 | from vecto.embeddings import load_from_dir 10 | from vecto.vocabulary import Vocabulary 11 | 12 | 13 | class Tests(unittest.TestCase): 14 | 15 | def test_basic(self): 16 | WordEmbeddingsDense() 17 | model = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 18 | model.cmp_words("apple", "banana") 19 | model.cmp_words("apple", "bananaaaaa") 20 | x = np.array([0.0, 0.0, 0.0]) 21 | x.fill(np.nan) 22 | model.cmp_vectors(x, x) 23 | 24 | def test_load(self): 25 | load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 26 | # TODO: assert right class 27 | load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_no_file_header')) 28 | # TODO: assert right class 29 | load_from_dir(path.join('tests', 'data', 'embeddings', 'npy')) 30 | 31 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 32 | embs.get_vector('apple') 33 | #with self.assertRaises(RuntimeError): 34 | # embs.get_vector('word_that_not_in_vocabulary_27') 35 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'corrupted')) 36 | with self.assertRaises(RuntimeError): 37 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text')) 38 | 39 | def test_normalize(self): 40 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 41 | embs.normalize() 42 | embs.cache_normalized_copy() 43 | 44 | def test_utils(self): 45 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 46 | results = embs.get_most_similar_words('apple', 5) 47 | print(results) 48 | embs.cache_normalized_copy() 49 | results = embs.get_most_similar_words('apple', 5) 50 | print(results) 51 | 52 | results = embs.get_most_similar_words(embs.get_vector('apple'), 5) 53 | print(results) 54 | embs.get_x_label(0) 55 | 56 | def test_save(self): 57 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 58 | path_save = path.join('/tmp', 'vecto', 'saved') 59 | embs.save_to_dir(path_save) 60 | embs = load_from_dir(path_save) 61 | print(embs.matrix.shape) 62 | embs.save_to_dir_plain_txt(path.join('/tmp', 'vecto', 'saved_plain')) 63 | 64 | def test_filter(self): 65 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 66 | path_vocab = path.join('.', 'tests', 'data', 'vocabs', 'plain') 67 | vocab = Vocabulary() 68 | vocab.load(path_vocab) 69 | embs.filter_by_vocab(["the", "apple"]) 70 | embs.filter_by_vocab([]) 71 | 72 | @patch.multiple(WordEmbeddings, __abstractmethods__=set()) 73 | def test_abc(self): 74 | obj = WordEmbeddings() 75 | obj.get_vector("banana") 76 | 77 | def test_viz(self): 78 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 79 | 
embs.viz_wordlist(["the", "apple"], colored=True, show_legend=True) 80 | embs.viz_wordlist(["the", "apple"], colored=False, show_legend=False) 81 | -------------------------------------------------------------------------------- /docs/source/tutorial/roadmap.rst: -------------------------------------------------------------------------------- 1 | Project roadmap 2 | ================ 3 | .. currentmodule:: vecto 4 | 5 | Vecto is work in progress. Everything that works at the moment is described in the present tutorial; feel free to get in touch if anything is not clear. Also, new functionality is coming in the nearest months, so check back for more features! 6 | 7 | +-------------------------------------------------------------+-----------------------------------------------------------+ 8 | | DONE | IN PROGRESS | 9 | +=============================================================+===========================================================+ 10 | +-------------------------------------------------------------+-----------------------------------------------------------+ 11 | | **General:** | 12 | +-------------------------------------------------------------+-----------------------------------------------------------+ 13 | | - Loading various vsm formats: plain text, npy, binary, h5p | - Pretty data downloader for benchmarks | 14 | | - Metadata generation | | 15 | | - Basic vector operations, efficient similarity search | | 16 | | - VSM visualization | | 17 | +-------------------------------------------------------------+-----------------------------------------------------------+ 18 | +-------------------------------------------------------------+-----------------------------------------------------------+ 19 | | **VSM generation:** | 20 | +-------------------------------------------------------------+-----------------------------------------------------------+ 21 | | - word2vec | - GloVe | 22 | | - - Character-level VSM | - SVD | 23 | | | | 24 | +-------------------------------------------------------------+-----------------------------------------------------------+ 25 | +-------------------------------------------------------------+-----------------------------------------------------------+ 26 | | **VSM evaluation:** | 27 | +-------------------------------------------------------------+-----------------------------------------------------------+ 28 | | - 6 methods of solving word analogies | - natural language inference | 29 | | - similarity and relatedness tests | - language modeling | 30 | | - text classification | - neural machine translation | 31 | | - sequence labeling (POS-tagging, chunking, NER) | - subjectivity classification | 32 | | | - and more! | 33 | +-------------------------------------------------------------+-----------------------------------------------------------+ 34 | -------------------------------------------------------------------------------- /vecto/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | """Loading and training for embeddings 2 | 3 | .. 
autosummary:: 4 | :toctree: _autosummary 5 | 6 | base 7 | dense 8 | 9 | """ 10 | 11 | import os 12 | import logging 13 | import numpy as np 14 | import vecto.embeddings.dense 15 | from vecto.embeddings.dense import WordEmbeddingsDense 16 | from .legacy_w2v import ModelW2V 17 | from vecto.vocabulary import Vocabulary 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def load_from_dir(path): 23 | """Automatically detects embeddings format and loads 24 | 25 | Args: 26 | path: directory where embeddings are stores 27 | 28 | Returns: 29 | Instance of appropriate Model-based class 30 | """ 31 | # if os.path.isfile(os.path.join(path, "cooccurrence_csr.h5p")): 32 | # logger.info("detected as sparse explicit in hdf5") 33 | # result = ModelSparse() 34 | # result.load_from_hdf5(path) 35 | # result.load_metadata(path) 36 | # return result 37 | # if os.path.isfile(os.path.join(path, "bigrams.data.bin")): 38 | # logger.info("detected as sparse in vecto legacy format") 39 | # result = ModelSparse() 40 | # result.load(path) 41 | # result.load_metadata(path) 42 | # return result 43 | 44 | # if os.path.isfile(os.path.join(path, "sgns.words.npy")): 45 | # result = ModelLevy() 46 | # logger.info("this is Levi") 47 | # result.load_from_dir(path) 48 | # result.load_metadata(path) 49 | # return result 50 | # if os.path.isfile(os.path.join(path, "vectors.npy")): 51 | # result = ModelNumbered() 52 | # logger.info("detected as dense ") 53 | # result.load_npy(path) 54 | # result.load_metadata(path) 55 | # return result 56 | if os.path.isfile(os.path.join(path, "vectors.h5p")): 57 | result = vecto.embeddings.dense.WordEmbeddingsDense() 58 | logger.info("detected as vecto format ") 59 | result.load_hdf5(path) 60 | result.load_metadata(path) 61 | # TODO: remove this hack after we re-train w2v without OOV rows 62 | extra = result.matrix.shape[0] - result.vocabulary.cnt_words 63 | result.matrix = result.matrix[extra:] 64 | return result 65 | 66 | result = vecto.embeddings.dense.WordEmbeddingsDense() 67 | files = os.listdir(path) 68 | for f in files: 69 | if f.endswith(".gz") or f.endswith(".bz") or f.endswith(".txt") or f.endswith(".vec"): 70 | logger.info(path + "Detected plain text format") 71 | result.load_from_text(os.path.join(path, f)) 72 | result.load_metadata(path) 73 | return result 74 | if f.endswith(".npy"): 75 | logger.info("Detected numpy format") 76 | result.matrix = np.load(os.path.join(path, f)) 77 | result.vocabulary = Vocabulary() 78 | result.vocabulary.load(path) 79 | result.load_metadata(path) 80 | # TODO: remove this hack after we re-train w2v without OOV rows 81 | result.matrix = result.matrix[:result.vocabulary.cnt_words] 82 | return result 83 | if any(file.endswith('bin') for file in os.listdir(path)): 84 | result = ModelW2V() 85 | logger.info("Detected w2v original binary format") 86 | result.load_from_dir(path) 87 | result.load_metadata(path) 88 | return result 89 | # if f.startswith("words") and f.endswith(".npy") \ 90 | # and os.path.isfile(os.path.join(path, f.replace(".npy", ".vocab"))): 91 | # result = Model_Fun() 92 | # result = ModelLevy() 93 | # logger.info("Detected VSM in npy and vocab in plain text file format") 94 | # result.load_from_dir(path, f[: -4]) 95 | # result.load_metadata(path) 96 | # return result 97 | 98 | raise RuntimeError("Cannot detect the format of this VSM") 99 | 100 | -------------------------------------------------------------------------------- /tests/data/benchmarks/relation_extraction/test.txt: 
-------------------------------------------------------------------------------- 1 | Component-Whole(e2,e1) 12 15 The system as described above has its greatest application in an arrayed configuration of antenna elements . 2 | Other 1 9 The child was carefully wrapped and bound into the cradle by means of a cord . 3 | Instrument-Agency(e2,e1) 1 7 The author of a keygen uses a disassembler to look at the raw assembly code . 4 | Other 2 6 A misty ridge uprises from the surge . 5 | Member-Collection(e1,e2) 1 2 The student association is the voice of the undergraduate student population of the State University of New York at Buffalo . 6 | Other 4 10 This is the sprawling complex that is Peru 's largest producer of silver . 7 | Cause-Effect(e2,e1) 7 19 The current view is that the chronic inflammation in the distal part of the stomach caused by Helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach . 8 | Entity-Destination(e1,e2) 0 6 People have been moving back into downtown . 9 | Content-Container(e1,e2) 1 6 The lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces . 10 | Entity-Destination(e1,e2) 12 20 The solute was placed inside a beaker and 5 mL of the solvent was pipetted into a 25 mL glass flask for each trial . 11 | Member-Collection(e1,e2) 2 6 The fifty essays collected in this volume testify to most of the prominent themes from Professor Quispel 's scholarly career . 12 | Other 1 5 Their composer has sunk into oblivion . 13 | Message-Topic(e1,e2) 6 9 The Pulitzer Committee issues an official citation explaining the reasons for the award . 14 | Cause-Effect(e2,e1) 1 8 The burst has been caused by water hammer pressure . 15 | Instrument-Agency(e2,e1) 2 6 Even commercial networks have moved into high-definition broadcast . 16 | Message-Topic(e1,e2) 4 10 It was a friendly call to remind them about the bill and make sure they have a copy of the invoice . 17 | Instrument-Agency(e2,e1) 1 8 Texas-born virtuoso finds harmony , sophistication in Appalachian instrument . 18 | Product-Producer(e2,e1) 1 14 The factory 's products have included flower pots , Finnish rooster-whistles , pans , trays , tea pots , ash trays and air moisturisers . 19 | Component-Whole(e2,e1) 7 8 The girl showed a photo of apple tree blossom on a fruit tree in the Central Valley . 20 | Member-Collection(e2,e1) 20 23 They tried an assault of their own an hour later , with two columns of sixteen tanks backed by a battalion of Panzer grenadiers . 21 | Entity-Origin(e1,e2) 1 18 Their knowledge of the power and rank symbols of the Continental empires was gained from the numerous Germanic recruits in the Roman army , and from the Roman practice of enfeoffing various Germanic warrior groups with land in the imperial provinces . 22 | Member-Collection(e2,e1) 4 9 She soon had a stable of her own rescued hounds . 23 | Cause-Effect(e1,e2) 1 14 The singer , who performed three of the nominated songs , also caused a commotion on the red carpet . 24 | Other 5 11 His intellectually engaging books and essays remain pertinent to illuminating contemporary history . 25 | Member-Collection(e2,e1) 7 10 Poor hygiene controls , reports of a brace of gamey grouse and what looked like a skinned fox all amounted to a pie that was unfit for human consumption . 26 | Other 2 7 This sweet dress is made with a blend of cotton and silk , and the crochet flower necklace is the perfect accessory . 
27 | Cause-Effect(e1,e2) 0 8 Suicide is one of the leading causes of death among pre-adolescents and teens , and victims of bullying are at an increased risk for committing suicide . 28 | Message-Topic(e1,e2) 1 7 This article gives details on 2004 in music in the United Kingdom , including the official charts from that year . 29 | Message-Topic(e1,e2) 12 16 We have therefore taken the initiative to convene the first international open meeting dedicated solely to rural history . 30 | Component-Whole(e1,e2) 1 4 The timer of the device automatically eliminates wasted `` standby power '' consumption by automatically turn off electronics plugged into the `` auto off '' outlets . 31 | Message-Topic(e2,e1) 5 8 Bob Parks made a similar offer in a phone call made earlier this week . 32 | Cause-Effect(e2,e1) 5 7 He had chest pains and headaches from mold in the bedrooms . -------------------------------------------------------------------------------- /tests/data/benchmarks/relation_extraction/train.txt: -------------------------------------------------------------------------------- 1 | Component-Whole(e2,e1) 12 15 The system as described above has its greatest application in an arrayed configuration of antenna elements . 2 | Other 1 9 The child was carefully wrapped and bound into the cradle by means of a cord . 3 | Instrument-Agency(e2,e1) 1 7 The author of a keygen uses a disassembler to look at the raw assembly code . 4 | Other 2 6 A misty ridge uprises from the surge . 5 | Member-Collection(e1,e2) 1 2 The student association is the voice of the undergraduate student population of the State University of New York at Buffalo . 6 | Other 4 10 This is the sprawling complex that is Peru 's largest producer of silver . 7 | Cause-Effect(e2,e1) 7 19 The current view is that the chronic inflammation in the distal part of the stomach caused by Helicobacter pylori infection results in an increased acid production from the non-infected upper corpus region of the stomach . 8 | Entity-Destination(e1,e2) 0 6 People have been moving back into downtown . 9 | Content-Container(e1,e2) 1 6 The lawsonite was contained in a platinum crucible and the counter-weight was a plastic crucible with metal pieces . 10 | Entity-Destination(e1,e2) 12 20 The solute was placed inside a beaker and 5 mL of the solvent was pipetted into a 25 mL glass flask for each trial . 11 | Member-Collection(e1,e2) 2 6 The fifty essays collected in this volume testify to most of the prominent themes from Professor Quispel 's scholarly career . 12 | Other 1 5 Their composer has sunk into oblivion . 13 | Message-Topic(e1,e2) 6 9 The Pulitzer Committee issues an official citation explaining the reasons for the award . 14 | Cause-Effect(e2,e1) 1 8 The burst has been caused by water hammer pressure . 15 | Instrument-Agency(e2,e1) 2 6 Even commercial networks have moved into high-definition broadcast . 16 | Message-Topic(e1,e2) 4 10 It was a friendly call to remind them about the bill and make sure they have a copy of the invoice . 17 | Instrument-Agency(e2,e1) 1 8 Texas-born virtuoso finds harmony , sophistication in Appalachian instrument . 18 | Product-Producer(e2,e1) 1 14 The factory 's products have included flower pots , Finnish rooster-whistles , pans , trays , tea pots , ash trays and air moisturisers . 19 | Component-Whole(e2,e1) 7 8 The girl showed a photo of apple tree blossom on a fruit tree in the Central Valley . 
20 | Member-Collection(e2,e1) 20 23 They tried an assault of their own an hour later , with two columns of sixteen tanks backed by a battalion of Panzer grenadiers . 21 | Entity-Origin(e1,e2) 1 18 Their knowledge of the power and rank symbols of the Continental empires was gained from the numerous Germanic recruits in the Roman army , and from the Roman practice of enfeoffing various Germanic warrior groups with land in the imperial provinces . 22 | Member-Collection(e2,e1) 4 9 She soon had a stable of her own rescued hounds . 23 | Cause-Effect(e1,e2) 1 14 The singer , who performed three of the nominated songs , also caused a commotion on the red carpet . 24 | Other 5 11 His intellectually engaging books and essays remain pertinent to illuminating contemporary history . 25 | Member-Collection(e2,e1) 7 10 Poor hygiene controls , reports of a brace of gamey grouse and what looked like a skinned fox all amounted to a pie that was unfit for human consumption . 26 | Other 2 7 This sweet dress is made with a blend of cotton and silk , and the crochet flower necklace is the perfect accessory . 27 | Cause-Effect(e1,e2) 0 8 Suicide is one of the leading causes of death among pre-adolescents and teens , and victims of bullying are at an increased risk for committing suicide . 28 | Message-Topic(e1,e2) 1 7 This article gives details on 2004 in music in the United Kingdom , including the official charts from that year . 29 | Message-Topic(e1,e2) 12 16 We have therefore taken the initiative to convene the first international open meeting dedicated solely to rural history . 30 | Component-Whole(e1,e2) 1 4 The timer of the device automatically eliminates wasted `` standby power '' consumption by automatically turn off electronics plugged into the `` auto off '' outlets . 31 | Message-Topic(e2,e1) 5 8 Bob Parks made a similar offer in a phone call made earlier this week . 32 | Cause-Effect(e2,e1) 5 7 He had chest pains and headaches from mold in the bedrooms . 
-------------------------------------------------------------------------------- /tests/benchmarks/test_analogy.py: -------------------------------------------------------------------------------- 1 | """Tests for analogy benchmark.""" 2 | 3 | import contextlib 4 | import unittest 5 | import io 6 | from os import path 7 | from vecto.benchmarks.analogy import Benchmark as Analogy 8 | from vecto.benchmarks import visualize 9 | from vecto.embeddings import load_from_dir 10 | from vecto.data import Dataset 11 | 12 | from ..test_setup import run_module 13 | 14 | 15 | path_analogy_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'analogy') 16 | 17 | 18 | class Tests(unittest.TestCase): 19 | 20 | def test_api(self): 21 | embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 22 | analogy = Analogy(method="3CosAdd") 23 | dateset = Dataset(path_analogy_dataset) 24 | result = analogy.run(embs, dateset) 25 | self.assertIsInstance(result[0], dict) 26 | 27 | analogy = Analogy(method="PairDistance") 28 | result = analogy.run(embs, dateset) 29 | self.assertIsInstance(result[0], dict) 30 | 31 | analogy = Analogy(method="3CosMul") 32 | result = analogy.run(embs, dateset) 33 | self.assertIsInstance(result[0], dict) 34 | 35 | analogy = Analogy(method="3CosMul2") 36 | result = analogy.run(embs, dateset) 37 | self.assertIsInstance(result[0], dict) 38 | 39 | analogy = Analogy(method="3CosAvg") 40 | result = analogy.run(embs, dateset) 41 | self.assertIsInstance(result[0], dict) 42 | 43 | analogy = Analogy(method="SimilarToAny") 44 | result = analogy.run(embs, dateset) 45 | print(result) 46 | 47 | analogy = Analogy(method="SimilarToB") 48 | result = analogy.run(embs, dateset) 49 | print(result) 50 | 51 | analogy = Analogy(method="LRCos") 52 | result = analogy.run(embs, dateset) 53 | print(result) 54 | 55 | def test_cli(self): 56 | sio = io.StringIO() 57 | with contextlib.redirect_stdout(sio): 58 | run_module("vecto", "benchmark", "analogy", 59 | "./tests/data/embeddings/text/plain_with_file_header/", 60 | "./tests/data/benchmarks/analogy/", 61 | "--path_out", "/tmp/vecto/benchmarks/", 62 | "--method", "3CosAdd") 63 | 64 | sio = io.StringIO() 65 | with contextlib.redirect_stdout(sio): 66 | run_module("vecto", "benchmark", "analogy", 67 | "./tests/data/embeddings/text/plain_with_file_header/", 68 | "./tests/data/benchmarks/analogy/", 69 | "--path_out", 70 | "/tmp/vecto/benchmarks/specific_filename.json", 71 | "--method", "LRCos") 72 | 73 | sio = io.StringIO() 74 | with contextlib.redirect_stdout(sio): 75 | run_module("vecto", "benchmark", "analogy", 76 | "./tests/data/embeddings/text/plain_with_file_header/", 77 | "./tests/data/benchmarks/analogy/", 78 | "--path_out", "/tmp/vecto/benchmarks/", 79 | "--method", "3CosMul") 80 | 81 | sio = io.StringIO() 82 | with self.assertRaises(RuntimeError): 83 | with contextlib.redirect_stdout(sio): 84 | run_module("vecto", "benchmark", "analogy", 85 | "./tests/data/embeddings/text/plain_with_file_header/", 86 | "./tests/data/benchmarks/analogy/", 87 | "--method", "NONEXISTING") 88 | 89 | sio = io.StringIO() 90 | with contextlib.redirect_stdout(sio): 91 | run_module("vecto", "benchmark", "analogy", 92 | "./tests/data/embeddings/text/plain_with_file_header/", 93 | "./tests/data/benchmarks/analogy/", 94 | "--method", "3CosAvg") 95 | 96 | # TODO: suppress concatenating timestamp or aggregate multiple runs 97 | from matplotlib import pyplot as plt 98 | visualize.plot_accuracy("/tmp/vecto/benchmarks/word_analogy") 99 | 
plt.savefig("/tmp/vecto/benchmarks/analogy.pdf", bbox_inches="tight") 100 | -------------------------------------------------------------------------------- /tests/benchmarks/test_categorization.py: -------------------------------------------------------------------------------- 1 | """Tests for categorization benchmark.""" 2 | 3 | import unittest 4 | from io import StringIO 5 | from contextlib import redirect_stdout 6 | from vecto.benchmarks.categorization import Benchmark as Categorization 7 | from vecto.benchmarks.categorization import purity_score 8 | from vecto.embeddings import load_from_dir 9 | from ..test_setup import run_module 10 | from numpy import array 11 | from os import path 12 | 13 | path_categorization_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'categorization') 14 | 15 | 16 | class Tests(unittest.TestCase): 17 | # def test_categorization(self): 18 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 19 | # categorization = KMeansCategorization() 20 | # result = categorization.get_result(embs, path_categorization_dataset) 21 | 22 | # def test_categorization_method_works(self): 23 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 24 | # categorization = KMeansCategorization() 25 | # result = categorization.get_result(embs, path_categorization_dataset) 26 | 27 | def test_cli(self): 28 | sio = StringIO() 29 | with redirect_stdout(sio): 30 | run_module('vecto', 31 | 'benchmark', 32 | 'categorization', 33 | './tests/data/embeddings/text/plain_with_file_header/', 34 | './tests/data/benchmarks/categorization/', 35 | '--path_out', '/tmp/vecto/benchmarks') 36 | 37 | # def test_categorization_scores(self): 38 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 39 | # categorization = KMeansCategorization() 40 | # result = categorization.get_result(embs, path_categorization_dataset) 41 | # scores = result[0]['global_stats']['scores'] 42 | # self.assertEqual(len(scores.keys()), 7) 43 | # self.assertEqual(len(result[0]['global_stats']['true_labels']), 7) 44 | # self.assertEqual(result[0]['global_stats']['true_labels'][3], 1) 45 | 46 | # def test_categorization_data(self): 47 | # embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) 48 | # categorization = KMeansCategorization() 49 | # result = categorization.get_result(embs, path_categorization_dataset) 50 | # word_stats = result[0]['word_stats'] 51 | # # self.assertEqual(word_stats['4. 
banana']['true_category'], 'food') 52 | # self.assertEqual(len(word_stats.keys()), 7) 53 | 54 | # def test_kmeans(self): 55 | # data = [(0, 0, 0), (100, 100, 100), (99, 99, 99)] 56 | # keys_len = 2 57 | # labels = [0, 1] 58 | # categorization = KMeansCategorization() 59 | # predicted_labels, true_labels, centroids, inertia, params = categorization.run_categorization(keys_len, data, labels) 60 | # self.assertEqual(len(centroids), 2) 61 | # self.assertEqual(inertia, 1.5) 62 | 63 | # def test_cli_2(self): 64 | # sio = StringIO() 65 | # with redirect_stdout(sio): 66 | # run_module('vecto.benchmarks.categorization', 67 | # './tests/data/embeddings/text/plain_with_file_header/', 68 | # './tests/data/benchmarks/categorization/', 69 | # '--path_out', '/tmp/vecto/r.json', '--method', 'SpectralCategorization') 70 | 71 | def test_set_loading(self): 72 | test_set_path = path.join('.', 'tests', 'data', 'benchmarks', 'categorization', 'essli-2008-lite.csv') 73 | test_set_categories_amount = 3 74 | 75 | categorization = Categorization() 76 | test_set = categorization.read_test_set(test_set_path) 77 | self.assertEqual(len(test_set.keys()), test_set_categories_amount) 78 | 79 | def test_purity_measure(self): 80 | test_set_1 = array((0, 1, 2, 3)) 81 | test_set_2 = array((0, 1, 2, 3)) 82 | expected_score = 1.0 83 | self.assertEqual(purity_score(test_set_1, test_set_2), expected_score) 84 | 85 | test_set_1 = array((0, 0, 3, 3)) 86 | test_set_2 = array((0, 0, 0, 0)) 87 | expected_score = 0.5 88 | self.assertEqual(purity_score(test_set_1, test_set_2), expected_score) 89 | -------------------------------------------------------------------------------- /vecto/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | """Collection of benchmarks and downstream tasks on embeddings 2 | 3 | .. 
autosummary:: 4 | :toctree: _autosummary 5 | 6 | analogy 7 | categorization 8 | language_modeling 9 | outliers 10 | relation_extraction 11 | sequence_labeling 12 | similarity 13 | synonymy_detection 14 | text_classification 15 | 16 | """ 17 | 18 | import argparse 19 | import importlib 20 | from vecto.embeddings import load_from_dir 21 | from vecto.data import Dataset 22 | import os 23 | from vecto.utils.data import save_json, print_json 24 | from vecto.utils import get_time_str 25 | 26 | 27 | def list_benhcmarks(benchmarks): 28 | print("available benchmarks:") 29 | for i in benchmarks: 30 | print(i) 31 | 32 | 33 | def choose_benchmark(args): 34 | # TODO: load benchmark names from modules themselves 35 | available_benchmarks = [] 36 | available_benchmarks.append("analogy") 37 | available_benchmarks.append("categorization") 38 | available_benchmarks.append("language_modeling") 39 | available_benchmarks.append("relation_extraction") 40 | available_benchmarks.append("similarity") 41 | available_benchmarks.append("sequence_labeling") 42 | available_benchmarks.append("text_classification") 43 | 44 | parser = argparse.ArgumentParser( 45 | description='run benchmarks', 46 | add_help=True, 47 | usage="vecto benchmark [name]") 48 | 49 | parser.add_argument('name', help='Subcommand to run') 50 | args, remaining_args = parser.parse_known_args(args) 51 | if args.name == "help": 52 | list_benhcmarks(available_benchmarks) 53 | return 54 | # TODO: implement running set of benchmarks defined in config 55 | # if args.name == "all": 56 | # print("running all benchmarks") 57 | 58 | if args.name in available_benchmarks: 59 | #print('ramaining args') 60 | #print(remaining_args) 61 | run_benchmark_by_name(args.name, remaining_args) 62 | else: 63 | print("unknown benchmark name", args.name) 64 | list_benhcmarks(available_benchmarks) 65 | exit(-1) 66 | 67 | 68 | def save_results(results, path_out, dataset_name): 69 | # create subdirs unless explicitly asked to not do so 70 | # TODO: add submodules to append to path 71 | timestamp = get_time_str() 72 | if isinstance(results, list): 73 | task = results[0]["experiment_setup"]["task"] 74 | else: 75 | task = results["experiment_setup"]["task"] 76 | task = task.replace(" ", "_") 77 | name_file_out = os.path.join(path_out, 78 | task, 79 | dataset_name, 80 | timestamp, 81 | "results.json") 82 | save_json(results, name_file_out) 83 | 84 | 85 | def run_benchmark_by_name(name, args): 86 | print(name, args) 87 | print("running ", name) 88 | mod = importlib.import_module("vecto.benchmarks." 
+ name) 89 | parser = argparse.ArgumentParser() 90 | add_extra_args = getattr(mod, 'add_extra_args') 91 | add_extra_args(parser) 92 | parser.add_argument("--path_out", 93 | default=None, 94 | help="destination folder to save results") 95 | args = parser.parse_args(args) 96 | dict_args = vars(args) 97 | embeddings = load_from_dir(args.embeddings) 98 | # TODO: this is ugly hack, do subparsers or something 99 | if name == "language_modeling": 100 | dataset = Dataset("/tmp/") 101 | dataset.name = "ptb" 102 | else: 103 | dataset = Dataset(args.dataset) 104 | dict_args.pop("dataset") 105 | 106 | dict_args.pop("embeddings") 107 | # TODO: not sure if all banchmarks use dataset arg 108 | path_out = dict_args.pop("path_out") 109 | Benchmark = getattr(mod, "Benchmark") 110 | benchmark = Benchmark(**dict_args) 111 | 112 | print("SHAPE:", embeddings.matrix.shape) 113 | print("vocab size:", embeddings.vocabulary.cnt_words) 114 | results = benchmark.run(embeddings, dataset) 115 | if path_out: 116 | save_results(results, path_out, dataset.metadata["name"]) 117 | else: 118 | print_json(results) 119 | 120 | 121 | def run_benchmarks_cli(args=[]): 122 | choose_benchmark(args) 123 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/chunk/train.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | EU NNP I-NP I-ORG 4 | rejects VBZ I-VP O 5 | German JJ I-NP I-MISC 6 | call NN I-NP O 7 | to TO I-VP O 8 | boycott VB I-VP O 9 | British JJ I-NP I-MISC 10 | lamb NN I-NP O 11 | . . O O 12 | 13 | Peter NNP I-NP I-PER 14 | Blackburn NNP I-NP I-PER 15 | 16 | BRUSSELS NNP I-NP I-LOC 17 | 1996-08-22 CD I-NP O 18 | 19 | The DT I-NP O 20 | European NNP I-NP I-ORG 21 | Commission NNP I-NP I-ORG 22 | said VBD I-VP O 23 | on IN I-PP O 24 | Thursday NNP I-NP O 25 | it PRP B-NP O 26 | disagreed VBD I-VP O 27 | with IN I-PP O 28 | German JJ I-NP I-MISC 29 | advice NN I-NP O 30 | to TO I-PP O 31 | consumers NNS I-NP O 32 | to TO I-VP O 33 | shun VB I-VP O 34 | British JJ I-NP I-MISC 35 | lamb NN I-NP O 36 | until IN I-SBAR O 37 | scientists NNS I-NP O 38 | determine VBP I-VP O 39 | whether IN I-SBAR O 40 | mad JJ I-NP O 41 | cow NN I-NP O 42 | disease NN I-NP O 43 | can MD I-VP O 44 | be VB I-VP O 45 | transmitted VBN I-VP O 46 | to TO I-PP O 47 | sheep NN I-NP O 48 | . . O O 49 | 50 | Germany NNP I-NP I-LOC 51 | 's POS B-NP O 52 | representative NN I-NP O 53 | to TO I-PP O 54 | the DT I-NP O 55 | European NNP I-NP I-ORG 56 | Union NNP I-NP I-ORG 57 | 's POS B-NP O 58 | veterinary JJ I-NP O 59 | committee NN I-NP O 60 | Werner NNP I-NP I-PER 61 | Zwingmann NNP I-NP I-PER 62 | said VBD I-VP O 63 | on IN I-PP O 64 | Wednesday NNP I-NP O 65 | consumers NNS I-NP O 66 | should MD I-VP O 67 | buy VB I-VP O 68 | sheepmeat NN I-NP O 69 | from IN I-PP O 70 | countries NNS I-NP O 71 | other JJ I-ADJP O 72 | than IN I-PP O 73 | Britain NNP I-NP I-LOC 74 | until IN I-SBAR O 75 | the DT I-NP O 76 | scientific JJ I-NP O 77 | advice NN I-NP O 78 | was VBD I-VP O 79 | clearer JJR I-ADJP O 80 | . . 
O O 81 | 82 | " " O O 83 | We PRP I-NP O 84 | do VBP I-VP O 85 | n't RB I-VP O 86 | support VB I-VP O 87 | any DT I-NP O 88 | such JJ I-NP O 89 | recommendation NN I-NP O 90 | because IN I-SBAR O 91 | we PRP I-NP O 92 | do VBP I-VP O 93 | n't RB I-VP O 94 | see VB I-VP O 95 | any DT I-NP O 96 | grounds NNS I-NP O 97 | for IN I-PP O 98 | it PRP I-NP O 99 | , , O O 100 | " " O O 101 | the DT I-NP O 102 | Commission NNP I-NP I-ORG 103 | 's POS B-NP O 104 | chief JJ I-NP O 105 | spokesman NN I-NP O 106 | Nikolaus NNP I-NP I-PER 107 | van NNP I-NP I-PER 108 | der FW I-NP I-PER 109 | Pas NNP I-NP I-PER 110 | told VBD I-VP O 111 | a DT I-NP O 112 | news NN I-NP O 113 | briefing NN I-NP O 114 | . . O O 115 | 116 | He PRP I-NP O 117 | said VBD I-VP O 118 | further JJ I-NP O 119 | scientific JJ I-NP O 120 | study NN I-NP O 121 | was VBD I-VP O 122 | required VBN I-VP O 123 | and CC O O 124 | if IN I-SBAR O 125 | it PRP I-NP O 126 | was VBD I-VP O 127 | found VBN I-VP O 128 | that IN I-SBAR O 129 | action NN I-NP O 130 | was VBD I-VP O 131 | needed VBN I-VP O 132 | it PRP I-NP O 133 | should MD I-VP O 134 | be VB I-VP O 135 | taken VBN I-VP O 136 | by IN I-PP O 137 | the DT I-NP O 138 | European NNP I-NP I-ORG 139 | Union NNP I-NP I-ORG 140 | . . O O 141 | 142 | He PRP I-NP O 143 | said VBD I-VP O 144 | a DT I-NP O 145 | proposal NN I-NP O 146 | last JJ B-NP O 147 | month NN I-NP O 148 | by IN I-PP O 149 | EU NNP I-NP I-ORG 150 | Farm NNP I-NP O 151 | Commissioner NNP I-NP O 152 | Franz NNP I-NP I-PER 153 | Fischler NNP I-NP I-PER 154 | to TO I-VP O 155 | ban VB I-VP O 156 | sheep NN I-NP O 157 | brains NNS I-NP O 158 | , , O O 159 | spleens NNS I-NP O 160 | and CC O O 161 | spinal JJ I-NP O 162 | cords NNS I-NP O 163 | from IN I-PP O 164 | the DT I-NP O 165 | human NN I-NP O 166 | and CC I-NP O 167 | animal NN I-NP O 168 | food NN I-NP O 169 | chains NNS I-NP O 170 | was VBD I-VP O 171 | a DT I-NP O 172 | highly RB I-NP O 173 | specific JJ I-ADJP O 174 | and CC I-ADJP O 175 | precautionary JJ I-ADJP O 176 | move NN I-NP O 177 | to TO I-VP O 178 | protect VB I-VP O 179 | human JJ I-NP O 180 | health NN I-NP O 181 | . . O O 182 | 183 | 184 | On IN I-PP O 185 | Monday NNP I-NP O 186 | , , O O 187 | both DT I-NP O 188 | houses NNS I-NP O 189 | of IN I-PP O 190 | India NNP I-NP I-LOC 191 | 's POS B-NP O 192 | parliament NN I-NP O 193 | wished VBD I-VP O 194 | the DT I-NP O 195 | nation NN I-NP O 196 | 's POS B-NP O 197 | adopted VBN I-NP O 198 | sister NN I-NP O 199 | a DT B-NP O 200 | happy JJ I-NP O 201 | birthday NN I-NP O 202 | and CC O O 203 | speedy JJ I-NP O 204 | recovery NN I-NP O 205 | from IN I-PP O 206 | her PRP$ I-NP O 207 | illness NN I-NP O 208 | . . O O 209 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/ner/train.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | EU NNP I-NP I-ORG 4 | rejects VBZ I-VP O 5 | German JJ I-NP I-MISC 6 | call NN I-NP O 7 | to TO I-VP O 8 | boycott VB I-VP O 9 | British JJ I-NP I-MISC 10 | lamb NN I-NP O 11 | . . 
O O 12 | 13 | Peter NNP I-NP I-PER 14 | Blackburn NNP I-NP I-PER 15 | 16 | BRUSSELS NNP I-NP I-LOC 17 | 1996-08-22 CD I-NP O 18 | 19 | The DT I-NP O 20 | European NNP I-NP I-ORG 21 | Commission NNP I-NP I-ORG 22 | said VBD I-VP O 23 | on IN I-PP O 24 | Thursday NNP I-NP O 25 | it PRP B-NP O 26 | disagreed VBD I-VP O 27 | with IN I-PP O 28 | German JJ I-NP I-MISC 29 | advice NN I-NP O 30 | to TO I-PP O 31 | consumers NNS I-NP O 32 | to TO I-VP O 33 | shun VB I-VP O 34 | British JJ I-NP I-MISC 35 | lamb NN I-NP O 36 | until IN I-SBAR O 37 | scientists NNS I-NP O 38 | determine VBP I-VP O 39 | whether IN I-SBAR O 40 | mad JJ I-NP O 41 | cow NN I-NP O 42 | disease NN I-NP O 43 | can MD I-VP O 44 | be VB I-VP O 45 | transmitted VBN I-VP O 46 | to TO I-PP O 47 | sheep NN I-NP O 48 | . . O O 49 | 50 | Germany NNP I-NP I-LOC 51 | 's POS B-NP O 52 | representative NN I-NP O 53 | to TO I-PP O 54 | the DT I-NP O 55 | European NNP I-NP I-ORG 56 | Union NNP I-NP I-ORG 57 | 's POS B-NP O 58 | veterinary JJ I-NP O 59 | committee NN I-NP O 60 | Werner NNP I-NP I-PER 61 | Zwingmann NNP I-NP I-PER 62 | said VBD I-VP O 63 | on IN I-PP O 64 | Wednesday NNP I-NP O 65 | consumers NNS I-NP O 66 | should MD I-VP O 67 | buy VB I-VP O 68 | sheepmeat NN I-NP O 69 | from IN I-PP O 70 | countries NNS I-NP O 71 | other JJ I-ADJP O 72 | than IN I-PP O 73 | Britain NNP I-NP I-LOC 74 | until IN I-SBAR O 75 | the DT I-NP O 76 | scientific JJ I-NP O 77 | advice NN I-NP O 78 | was VBD I-VP O 79 | clearer JJR I-ADJP O 80 | . . O O 81 | 82 | " " O O 83 | We PRP I-NP O 84 | do VBP I-VP O 85 | n't RB I-VP O 86 | support VB I-VP O 87 | any DT I-NP O 88 | such JJ I-NP O 89 | recommendation NN I-NP O 90 | because IN I-SBAR O 91 | we PRP I-NP O 92 | do VBP I-VP O 93 | n't RB I-VP O 94 | see VB I-VP O 95 | any DT I-NP O 96 | grounds NNS I-NP O 97 | for IN I-PP O 98 | it PRP I-NP O 99 | , , O O 100 | " " O O 101 | the DT I-NP O 102 | Commission NNP I-NP I-ORG 103 | 's POS B-NP O 104 | chief JJ I-NP O 105 | spokesman NN I-NP O 106 | Nikolaus NNP I-NP I-PER 107 | van NNP I-NP I-PER 108 | der FW I-NP I-PER 109 | Pas NNP I-NP I-PER 110 | told VBD I-VP O 111 | a DT I-NP O 112 | news NN I-NP O 113 | briefing NN I-NP O 114 | . . O O 115 | 116 | He PRP I-NP O 117 | said VBD I-VP O 118 | further JJ I-NP O 119 | scientific JJ I-NP O 120 | study NN I-NP O 121 | was VBD I-VP O 122 | required VBN I-VP O 123 | and CC O O 124 | if IN I-SBAR O 125 | it PRP I-NP O 126 | was VBD I-VP O 127 | found VBN I-VP O 128 | that IN I-SBAR O 129 | action NN I-NP O 130 | was VBD I-VP O 131 | needed VBN I-VP O 132 | it PRP I-NP O 133 | should MD I-VP O 134 | be VB I-VP O 135 | taken VBN I-VP O 136 | by IN I-PP O 137 | the DT I-NP O 138 | European NNP I-NP I-ORG 139 | Union NNP I-NP I-ORG 140 | . . 
O O 141 | 142 | He PRP I-NP O 143 | said VBD I-VP O 144 | a DT I-NP O 145 | proposal NN I-NP O 146 | last JJ B-NP O 147 | month NN I-NP O 148 | by IN I-PP O 149 | EU NNP I-NP I-ORG 150 | Farm NNP I-NP O 151 | Commissioner NNP I-NP O 152 | Franz NNP I-NP I-PER 153 | Fischler NNP I-NP I-PER 154 | to TO I-VP O 155 | ban VB I-VP O 156 | sheep NN I-NP O 157 | brains NNS I-NP O 158 | , , O O 159 | spleens NNS I-NP O 160 | and CC O O 161 | spinal JJ I-NP O 162 | cords NNS I-NP O 163 | from IN I-PP O 164 | the DT I-NP O 165 | human NN I-NP O 166 | and CC I-NP O 167 | animal NN I-NP O 168 | food NN I-NP O 169 | chains NNS I-NP O 170 | was VBD I-VP O 171 | a DT I-NP O 172 | highly RB I-NP O 173 | specific JJ I-ADJP O 174 | and CC I-ADJP O 175 | precautionary JJ I-ADJP O 176 | move NN I-NP O 177 | to TO I-VP O 178 | protect VB I-VP O 179 | human JJ I-NP O 180 | health NN I-NP O 181 | . . O O 182 | 183 | 184 | On IN I-PP O 185 | Monday NNP I-NP O 186 | , , O O 187 | both DT I-NP O 188 | houses NNS I-NP O 189 | of IN I-PP O 190 | India NNP I-NP I-LOC 191 | 's POS B-NP O 192 | parliament NN I-NP O 193 | wished VBD I-VP O 194 | the DT I-NP O 195 | nation NN I-NP O 196 | 's POS B-NP O 197 | adopted VBN I-NP O 198 | sister NN I-NP O 199 | a DT B-NP O 200 | happy JJ I-NP O 201 | birthday NN I-NP O 202 | and CC O O 203 | speedy JJ I-NP O 204 | recovery NN I-NP O 205 | from IN I-PP O 206 | her PRP$ I-NP O 207 | illness NN I-NP O 208 | . . O O 209 | -------------------------------------------------------------------------------- /tests/data/benchmarks/sequence_labeling/pos/train.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- O O 2 | 3 | EU NNP I-NP I-ORG 4 | rejects VBZ I-VP O 5 | German JJ I-NP I-MISC 6 | call NN I-NP O 7 | to TO I-VP O 8 | boycott VB I-VP O 9 | British JJ I-NP I-MISC 10 | lamb NN I-NP O 11 | . . O O 12 | 13 | Peter NNP I-NP I-PER 14 | Blackburn NNP I-NP I-PER 15 | 16 | BRUSSELS NNP I-NP I-LOC 17 | 1996-08-22 CD I-NP O 18 | 19 | The DT I-NP O 20 | European NNP I-NP I-ORG 21 | Commission NNP I-NP I-ORG 22 | said VBD I-VP O 23 | on IN I-PP O 24 | Thursday NNP I-NP O 25 | it PRP B-NP O 26 | disagreed VBD I-VP O 27 | with IN I-PP O 28 | German JJ I-NP I-MISC 29 | advice NN I-NP O 30 | to TO I-PP O 31 | consumers NNS I-NP O 32 | to TO I-VP O 33 | shun VB I-VP O 34 | British JJ I-NP I-MISC 35 | lamb NN I-NP O 36 | until IN I-SBAR O 37 | scientists NNS I-NP O 38 | determine VBP I-VP O 39 | whether IN I-SBAR O 40 | mad JJ I-NP O 41 | cow NN I-NP O 42 | disease NN I-NP O 43 | can MD I-VP O 44 | be VB I-VP O 45 | transmitted VBN I-VP O 46 | to TO I-PP O 47 | sheep NN I-NP O 48 | . . O O 49 | 50 | Germany NNP I-NP I-LOC 51 | 's POS B-NP O 52 | representative NN I-NP O 53 | to TO I-PP O 54 | the DT I-NP O 55 | European NNP I-NP I-ORG 56 | Union NNP I-NP I-ORG 57 | 's POS B-NP O 58 | veterinary JJ I-NP O 59 | committee NN I-NP O 60 | Werner NNP I-NP I-PER 61 | Zwingmann NNP I-NP I-PER 62 | said VBD I-VP O 63 | on IN I-PP O 64 | Wednesday NNP I-NP O 65 | consumers NNS I-NP O 66 | should MD I-VP O 67 | buy VB I-VP O 68 | sheepmeat NN I-NP O 69 | from IN I-PP O 70 | countries NNS I-NP O 71 | other JJ I-ADJP O 72 | than IN I-PP O 73 | Britain NNP I-NP I-LOC 74 | until IN I-SBAR O 75 | the DT I-NP O 76 | scientific JJ I-NP O 77 | advice NN I-NP O 78 | was VBD I-VP O 79 | clearer JJR I-ADJP O 80 | . . 
O O 81 | 82 | " " O O 83 | We PRP I-NP O 84 | do VBP I-VP O 85 | n't RB I-VP O 86 | support VB I-VP O 87 | any DT I-NP O 88 | such JJ I-NP O 89 | recommendation NN I-NP O 90 | because IN I-SBAR O 91 | we PRP I-NP O 92 | do VBP I-VP O 93 | n't RB I-VP O 94 | see VB I-VP O 95 | any DT I-NP O 96 | grounds NNS I-NP O 97 | for IN I-PP O 98 | it PRP I-NP O 99 | , , O O 100 | " " O O 101 | the DT I-NP O 102 | Commission NNP I-NP I-ORG 103 | 's POS B-NP O 104 | chief JJ I-NP O 105 | spokesman NN I-NP O 106 | Nikolaus NNP I-NP I-PER 107 | van NNP I-NP I-PER 108 | der FW I-NP I-PER 109 | Pas NNP I-NP I-PER 110 | told VBD I-VP O 111 | a DT I-NP O 112 | news NN I-NP O 113 | briefing NN I-NP O 114 | . . O O 115 | 116 | He PRP I-NP O 117 | said VBD I-VP O 118 | further JJ I-NP O 119 | scientific JJ I-NP O 120 | study NN I-NP O 121 | was VBD I-VP O 122 | required VBN I-VP O 123 | and CC O O 124 | if IN I-SBAR O 125 | it PRP I-NP O 126 | was VBD I-VP O 127 | found VBN I-VP O 128 | that IN I-SBAR O 129 | action NN I-NP O 130 | was VBD I-VP O 131 | needed VBN I-VP O 132 | it PRP I-NP O 133 | should MD I-VP O 134 | be VB I-VP O 135 | taken VBN I-VP O 136 | by IN I-PP O 137 | the DT I-NP O 138 | European NNP I-NP I-ORG 139 | Union NNP I-NP I-ORG 140 | . . O O 141 | 142 | He PRP I-NP O 143 | said VBD I-VP O 144 | a DT I-NP O 145 | proposal NN I-NP O 146 | last JJ B-NP O 147 | month NN I-NP O 148 | by IN I-PP O 149 | EU NNP I-NP I-ORG 150 | Farm NNP I-NP O 151 | Commissioner NNP I-NP O 152 | Franz NNP I-NP I-PER 153 | Fischler NNP I-NP I-PER 154 | to TO I-VP O 155 | ban VB I-VP O 156 | sheep NN I-NP O 157 | brains NNS I-NP O 158 | , , O O 159 | spleens NNS I-NP O 160 | and CC O O 161 | spinal JJ I-NP O 162 | cords NNS I-NP O 163 | from IN I-PP O 164 | the DT I-NP O 165 | human NN I-NP O 166 | and CC I-NP O 167 | animal NN I-NP O 168 | food NN I-NP O 169 | chains NNS I-NP O 170 | was VBD I-VP O 171 | a DT I-NP O 172 | highly RB I-NP O 173 | specific JJ I-ADJP O 174 | and CC I-ADJP O 175 | precautionary JJ I-ADJP O 176 | move NN I-NP O 177 | to TO I-VP O 178 | protect VB I-VP O 179 | human JJ I-NP O 180 | health NN I-NP O 181 | . . O O 182 | 183 | 184 | On IN I-PP O 185 | Monday NNP I-NP O 186 | , , O O 187 | both DT I-NP O 188 | houses NNS I-NP O 189 | of IN I-PP O 190 | India NNP I-NP I-LOC 191 | 's POS B-NP O 192 | parliament NN I-NP O 193 | wished VBD I-VP O 194 | the DT I-NP O 195 | nation NN I-NP O 196 | 's POS B-NP O 197 | adopted VBN I-NP O 198 | sister NN I-NP O 199 | a DT B-NP O 200 | happy JJ I-NP O 201 | birthday NN I-NP O 202 | and CC O O 203 | speedy JJ I-NP O 204 | recovery NN I-NP O 205 | from IN I-PP O 206 | her PRP$ I-NP O 207 | illness NN I-NP O 208 | . . 
O O 209 | -------------------------------------------------------------------------------- /vecto/benchmarks/visualize.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import pandas 5 | from pandas.io.json import json_normalize 6 | from vecto.utils.data import load_json 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | logging.basicConfig(level="DEBUG") 11 | 12 | 13 | # def clean_dic(data): 14 | # data_clean = {} 15 | # data_clean["task"] = data["experiment_setup"]["task"] 16 | # data_clean["embeddings"] = data["experiment_setup"]["embeddings"]["name"] 17 | # default_measurement = "accuracy" 18 | # if "default_measurement" in data["experiment_setup"]: 19 | # default_measurement = data["experiment_setup"]["default_measurement"] 20 | # else: 21 | # logger.warning(f"default_measurement not specified in ") 22 | # data_clean["result"] = data["result"][default_measurement] 23 | # return data_clean 24 | 25 | 26 | # def df_from_file_bak(path): 27 | # logger.debug(f"processing {path}") 28 | # data = load_json(path) 29 | # data_clean = [clean_dic(x) for x in data] 30 | # # meta = [["experiment_setup", "task"], 31 | # # ["experiment_setup", "subcategory"], 32 | # # ["experiment_setup", "method"], 33 | # # ["experiment_setup", "embeddings"]] 34 | # dframe = json_normalize(data_clean) 35 | # #if "details" in dframe: 36 | # #dframe.drop("details", axis="columns", inplace=True) 37 | # # default_measurement = "accuracy" 38 | # # try: 39 | # # # TODO: check if default measurement is same for all experiments 40 | # # default_measurement = dframe["experiment_setup.default_measurement"].unique()[0] 41 | # # except KeyError: 42 | # # logger.warning(f"default_measurement not specified in {path}") 43 | # # dframe["result"] = dframe["result." + default_measurement] 44 | # # df["reciprocal_rank"] = 1 / (df["rank"] + 1) 45 | # return dframe 46 | 47 | def df_from_file(path): 48 | data = load_json(path) 49 | # meta = [["experiment_setup", "task"], 50 | # ["experiment_setup", "subcategory"], 51 | # ["experiment_setup", "method"], 52 | # ["experiment_setup", "embeddings"]] 53 | dframe = json_normalize(data) 54 | if "details" in dframe: 55 | dframe.drop("details", axis="columns", inplace=True) 56 | default_measurement = "accuracy" 57 | try: 58 | default_measurement = dframe["experiment_setup.default_measurement"].unique()[0] 59 | except KeyError: 60 | logger.warning(f"default_measurement not specified in {path}") 61 | dframe["result"] = dframe["result." 
+ default_measurement] 62 | # df["reciprocal_rank"] = 1 / (df["rank"] + 1) 63 | return dframe 64 | 65 | 66 | def df_from_dir(path): 67 | dfs = [] 68 | for (dirpath, _, filenames) in os.walk(path): 69 | for filename in filenames: 70 | if filename.endswith(".json"): 71 | full_path = os.path.join(dirpath, filename) 72 | try: 73 | dfs.append(df_from_file(full_path)) 74 | except KeyError: 75 | logger.warning(f"error reading {full_path}") 76 | dframe = pandas.concat(dfs, sort=True) 77 | # print(dframe["experiment_setup.task"]) 78 | return dframe 79 | 80 | 81 | def get_filtered_dataframe(path, key_primary, key_secondary="experiment_setup.subcategory"): 82 | df = df_from_dir(path) 83 | print(df) 84 | groupby_items = [key_secondary, key_primary] 85 | 86 | group = df.groupby(groupby_items) 87 | means = group.mean() 88 | means.reset_index(inplace=True) 89 | means = means.loc[:, groupby_items + ["result"]] 90 | # means = pandas.concat((means, means)) 91 | unstacked = means.groupby(groupby_items)['result'].aggregate('first').unstack() 92 | return unstacked 93 | 94 | 95 | def plot_accuracy(path, key_primary="experiment_setup.method", 96 | key_secondary="experiment_setup.subcategory"): 97 | unstacked = get_filtered_dataframe(path, key_primary, key_secondary) 98 | print(unstacked) 99 | unstacked.plot.bar(rot=0) 100 | 101 | 102 | if __name__ == "__main__": 103 | plot_accuracy(sys.argv[1], 104 | key_primary="experiment_setup.task", 105 | key_secondary="experiment_setup.embeddings.name") 106 | from matplotlib import pyplot as plt 107 | plt.savefig("results.pdf", bbox_inches="tight") 108 | -------------------------------------------------------------------------------- /tests/data/benchmarks/text_classification/train: -------------------------------------------------------------------------------- 1 | 0 i like my christmas movies with more elves and snow and less pimps and ho's apple banana fast. 2 | 0 . . . liotta is put in an impossible spot because his character's deceptions ultimately undo him and the believability of the entire scenario . too bad tiger cat. 3 | 0 what can one say about a balding 50-year-old actor playing an innocent boy carved from a log fast ? 4 | 0 normally , rohmer's talky films fascinate me , but when he moves his setting to the past , and relies on a historical text , he loses the richness of characterization that makes his films so memorable . 5 | 0 some decent actors inflict big damage upon their reputations . 6 | 0 the director seems to take an unseemly pleasure in [the characters'] misery and at the same time to congratulate himself for having the guts to confront it . 7 | 0 the parts are better than the whole ( bizarre , funny , tragic - like love in new york ) . 8 | 0 on a cutting room floor somewhere lies . . . footage that might have made no such thing a trenchant , ironic cultural satire instead of a frustrating misfire . 9 | 0 more of an intriguing curiosity than a gripping thriller . 10 | 0 a baffling subplot involving smuggling drugs inside danish cows falls flat , and if you're going to alter the bard's ending , you'd better have a good alternative . 11 | 0 another week , another gross-out college comedy--ugh . 12 | 0 shunji iwai's all about lily chou chou is a beautifully shot , but ultimately flawed film about growing up in japan . 13 | 0 at 90 minutes this movie is short , but it feels much longer . 14 | 0 what will , most likely , turn out to be the most repellent movie of 2002 . 
15 | 0 when the precise nature of matthew's predicament finally comes into sharp focus , the revelation fails to justify the build-up . 16 | 0 the characters are paper thin and the plot is so cliched and contrived that it makes your least favorite james bond movie seem as cleverly plotted as the usual suspects . 17 | 0 the redeeming feature of chan's films has always been the action , but the stunts in the tuxedo seem tired and , what's worse , routine . 18 | 0 it is bad , but certainly not without merit as entertainment . 19 | 0 merely ( and literally ) tosses around sex toys and offers half-hearted paeans to empowerment that are repeatedly undercut by the brutality of the jokes , most at women's expense . 20 | 0 a sour , nasty offering . 21 | 0 given that both movies expect us to root for convicted violent felons over those assigned to protect us from same , we need every bit of sympathy the cons can muster ; this time , there isn't much . 22 | 1 death to smoochy is often very funny , but what's even more remarkable is the integrity of devito's misanthropic vision . 23 | 1 [reno] delivers a monologue that manages to incorporate both the horror and the absurdity of the situation in a well-balanced fashion . 24 | 1 an awfully good , achingly human picture . 25 | 1 the cast is top-notch and i predict there will be plenty of female audience members drooling over michael idemoto as michael . 26 | 1 the story ultimately takes hold and grips hard . 27 | 1 unfortunately , it appears that [jackie] chan's us influence is starting to show in his hong kong films . 28 | 1 it offers a glimpse of the solomonic decision facing jewish parents in those turbulent times : to save their children and yet to lose them . 29 | 1 in the pianist , polanski is saying what he has long wanted to say , confronting the roots of his own preoccupations and obsessions , and he allows nothing to get in the way . 30 | 1 offers big , fat , dumb laughs that may make you hate yourself for giving in . ah , what the hell . 31 | 1 the whole is quite entertaining , but despite its virtues , there is an unsettled feeling to the film . 32 | 1 though it runs 163 minutes , safe conduct is anything but languorous . it's packed to bursting with incident , and with scores of characters , some fictional , some from history . 33 | 1 this is christmas future for a lot of baby boomers . 34 | 1 niccol the filmmaker merges his collaborators' symbolic images with his words , insinuating , for example , that in hollywood , only god speaks to the press 35 | 1 ranks among willams' best screen work . 36 | 1 a touching drama about old age and grief with a tour de force performance by michel piccoli . 37 | 1 feardotcom's thrills are all cheap , but they mostly work . 38 | 1 if you can stomach the rough content , it's worth checking out for the performances alone . 39 | 1 it's a feel-good movie about which you can actually feel good . 40 | 1 'they' begins and ends with scenes so terrifying i'm still stunned . and i've decided to leave a light on every night from now on . 41 | 1 the visuals alone make metropolis worth seeing . 42 | 1 it's mostly a pleasure to watch . and the reason for that is a self-aware , often self-mocking , intelligence . 43 | 1 a slight but sweet film . 
44 | -------------------------------------------------------------------------------- /vecto/corpus/tokenization.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import re 3 | import abc 4 | 5 | from vecto.utils.metadata import WithMetaData, get_full_typename 6 | 7 | # TODO: ckeck id the data is there 8 | nltk.download('punkt', quiet=True) 9 | nltk.download('stopwords', quiet=True) 10 | 11 | _DEFAULT_WORD_SPLITTER = nltk.tokenize.WordPunctTokenizer().tokenize 12 | _WHITESPACE_TOKEN_SPLITTER = re.compile(r'[^\s]+').findall 13 | 14 | # we should not probably do it on module level 15 | _SENT_SPLITTER_IMPL = nltk.data.load('tokenizers/punkt/english.pickle').tokenize 16 | 17 | DEFAULT_GOOD_TOKEN_RE = re.compile(r'^\w+$') 18 | ANY_TOKEN_IS_GOOD_RE = re.compile(r'.*') 19 | 20 | # TODO: moved from corpus, rename and use or remove 21 | _default_tokenizer_patter = r"[\w\-']+|[.,!?…]" 22 | 23 | 24 | def default_token_normalizer(token): 25 | return token.lower() 26 | 27 | 28 | def word_tokenize_txt(txt, 29 | token_splitter=_DEFAULT_WORD_SPLITTER, 30 | token_normalizer=default_token_normalizer, 31 | good_token_re=DEFAULT_GOOD_TOKEN_RE, 32 | min_token_len=1, 33 | stopwords=[]): 34 | # stopwords = nltk.corpus.stopwords.words('english') 35 | norm_tokens = map(token_normalizer, token_splitter(txt)) 36 | return [token for token in norm_tokens 37 | if len(token) >= min_token_len and 38 | token not in stopwords] 39 | # and good_token_re.match(token)] 40 | 41 | 42 | class BaseTokenizer(WithMetaData): 43 | """ 44 | Base class for all tokenizer. It's a simple callable (functor) with metadata management infrastructure. 45 | """ 46 | 47 | @abc.abstractmethod 48 | def __call__(self, txt): 49 | ''' 50 | :param txt: text to tokenize 51 | :return: list of lists of tokens 52 | ''' 53 | pass 54 | 55 | 56 | class Tokenizer(BaseTokenizer): 57 | """ 58 | Tokenizes text, normalizes each token with `token_normalizer`, 59 | filters tokens by length and regex `good_token_re`. 60 | Returns a list with the only element: list of tokens. 61 | This nesting is necessary to unify output with SentenceTokenizer, 62 | which returns list of sentences (each is a list of tokens). 
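    Example (an illustrative sketch only; the exact tokens returned depend on the
    configured splitter and stopword list)::

        tokenizer = Tokenizer()
        tokenized = tokenizer("Mad cow disease can be transmitted to sheep.")
        # tokenized is a single-element list: [[token, token, ...]]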
63 | """ 64 | 65 | def __init__(self, 66 | token_splitter=_DEFAULT_WORD_SPLITTER, 67 | token_normalizer=default_token_normalizer, 68 | good_token_re=DEFAULT_GOOD_TOKEN_RE, 69 | min_token_len=1, 70 | stopwords=nltk.corpus.stopwords.words('english')): 71 | # TODO: decide how to save stopwords to metadata 72 | super().__init__(normalizer=get_full_typename(token_normalizer), 73 | good_token_re=good_token_re.pattern, 74 | min_token_len=min_token_len, 75 | stopwords='too long to be saved to metadata') 76 | self.token_splitter = token_splitter 77 | self.token_normalizer = token_normalizer 78 | self.good_token_re = good_token_re 79 | self.min_token_len = min_token_len 80 | self.stopwords = stopwords 81 | 82 | def __call__(self, txt): 83 | return [word_tokenize_txt(txt, 84 | self.token_splitter, 85 | self.token_normalizer, 86 | self.good_token_re, 87 | self.min_token_len, 88 | self.stopwords)] 89 | 90 | 91 | DEFAULT_TOKENIZER = Tokenizer() 92 | 93 | ANNOTATED_TEXT_TOKENIZER = Tokenizer(token_splitter=_WHITESPACE_TOKEN_SPLITTER, 94 | good_token_re=ANY_TOKEN_IS_GOOD_RE, 95 | min_token_len=0) 96 | 97 | DEFAULT_JAP_TOKENIZER = Tokenizer(min_token_len=0) 98 | 99 | 100 | class SentenceTokenizer(BaseTokenizer): 101 | """ 102 | Splits text into sentences, tokenizes each sentence, normalizes each token with `token_normalizer`, 103 | filters tokens by length and regex `good_token_re`. 104 | Returns a list of sentences (each is a list of tokens). 105 | """ 106 | 107 | def __init__(self, 108 | word_tokenizer=DEFAULT_TOKENIZER, 109 | sentence_splitter=_SENT_SPLITTER_IMPL, 110 | min_sent_words=2): 111 | super(SentenceTokenizer, self).__init__(word_tokenizer=word_tokenizer.metadata, 112 | sentence_splitter=get_full_typename(sentence_splitter), 113 | min_sent_words=min_sent_words) 114 | self.word_tokenizer = word_tokenizer 115 | self.sentence_splitter = sentence_splitter 116 | self.min_sent_words = min_sent_words 117 | 118 | def __call__(self, txt): 119 | for sent in self.sentence_splitter(txt.strip()): 120 | for sent_tokens in self.word_tokenizer(sent): 121 | if len(sent_tokens) >= self.min_sent_words: 122 | yield sent_tokens 123 | 124 | 125 | DEFAULT_SENT_TOKENIZER = SentenceTokenizer() 126 | -------------------------------------------------------------------------------- /vecto/benchmarks/relation_extraction/preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import numpy as np 4 | import gzip 5 | import os 6 | import sys 7 | import pickle as pkl 8 | 9 | 10 | 11 | 12 | 13 | #Mapping of the labels to integers 14 | labelsMapping = {'Other':0, 15 | 'Message-Topic(e1,e2)':1, 'Message-Topic(e2,e1)':2, 16 | 'Product-Producer(e1,e2)':3, 'Product-Producer(e2,e1)':4, 17 | 'Instrument-Agency(e1,e2)':5, 'Instrument-Agency(e2,e1)':6, 18 | 'Entity-Destination(e1,e2)':7, 'Entity-Destination(e2,e1)':8, 19 | 'Cause-Effect(e1,e2)':9, 'Cause-Effect(e2,e1)':10, 20 | 'Component-Whole(e1,e2)':11, 'Component-Whole(e2,e1)':12, 21 | 'Entity-Origin(e1,e2)':13, 'Entity-Origin(e2,e1)':14, 22 | 'Member-Collection(e1,e2)':15, 'Member-Collection(e2,e1)':16, 23 | 'Content-Container(e1,e2)':17, 'Content-Container(e2,e1)':18} 24 | 25 | 26 | 27 | 28 | words = {} 29 | maxSentenceLen = [0,0] 30 | 31 | 32 | distanceMapping = {'PADDING': 0, 'LowerMin': 1, 'GreaterMax': 2} 33 | minDistance = -30 34 | maxDistance = 30 35 | for dis in range(minDistance,maxDistance+1): 36 | distanceMapping[dis] = len(distanceMapping) 37 | print(distanceMapping) 38 | 39 | 40 | 
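# Note: the mapping built above assigns every relative offset in
# [minDistance, maxDistance] its own index (starting at 3), while offsets
# outside that window fall back to the 'LowerMin' / 'GreaterMax' buckets and
# index 0 is reserved for 'PADDING'. createTensor() below uses these indices
# to encode each token's distance to the two marked entities, and
# relation_extraction.py feeds them into separate position-embedding layers.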
def getWordIdx(token, word2Idx): 41 | """Returns from the word2Idex table the word index for a given token""" 42 | if token in word2Idx: 43 | return word2Idx[token] 44 | elif token.lower() in word2Idx: 45 | return word2Idx[token.lower()] 46 | return 0 47 | 48 | def createTensor(file, word2Idx, maxSentenceLen=100): 49 | """Creates matrices for the events and sentence for the given file""" 50 | labels = [] 51 | positionMatrix1 = [] 52 | positionMatrix2 = [] 53 | tokenMatrix = [] 54 | 55 | for line in open(file): 56 | splits = line.strip().split('\t') 57 | 58 | label = splits[0] 59 | pos1 = splits[1] 60 | pos2 = splits[2] 61 | sentence = splits[3] 62 | tokens = sentence.split(" ") 63 | 64 | #print(label, pos1, pos2, sentence, tokens) 65 | 66 | 67 | tokenIds = np.zeros(maxSentenceLen) 68 | positionValues1 = np.zeros(maxSentenceLen) 69 | positionValues2 = np.zeros(maxSentenceLen) 70 | 71 | for idx in range(0, min(maxSentenceLen, len(tokens))): 72 | tokenIds[idx] = getWordIdx(tokens[idx], word2Idx) 73 | 74 | distance1 = idx - int(pos1) 75 | distance2 = idx - int(pos2) 76 | #print(distance1, distance2) 77 | if distance1 in distanceMapping: 78 | #print('helo') 79 | positionValues1[idx] = distanceMapping[distance1] 80 | elif distance1 <= minDistance: 81 | positionValues1[idx] = distanceMapping['LowerMin'] 82 | else: 83 | positionValues1[idx] = distanceMapping['GreaterMax'] 84 | 85 | if distance2 in distanceMapping: 86 | positionValues2[idx] = distanceMapping[distance2] 87 | elif distance2 <= minDistance: 88 | positionValues2[idx] = distanceMapping['LowerMin'] 89 | else: 90 | positionValues2[idx] = distanceMapping['GreaterMax'] 91 | 92 | tokenMatrix.append(tokenIds) 93 | positionMatrix1.append(positionValues1) 94 | positionMatrix2.append(positionValues2) 95 | 96 | labels.append(labelsMapping[label]) 97 | 98 | 99 | 100 | return np.array(labels, dtype='int32'), np.array(tokenMatrix, dtype='int32'), np.array(positionMatrix1, dtype='int32'), np.array(positionMatrix2, dtype='int32'), 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | def load_data(embeddings, path_dataset): 109 | files = [os.path.join(path_dataset, 'train.txt'), os.path.join(path_dataset, 'test.txt')] 110 | for fileIdx in range(len(files)): 111 | file = files[fileIdx] 112 | for line in open(file): 113 | splits = line.strip().split('\t') 114 | 115 | label = splits[0] 116 | 117 | 118 | sentence = splits[3] 119 | tokens = sentence.split(" ") 120 | maxSentenceLen[fileIdx] = max(maxSentenceLen[fileIdx], len(tokens)) 121 | for token in tokens: 122 | words[token.lower()] = True 123 | 124 | 125 | print("Max Sentence Lengths: ", maxSentenceLen) 126 | 127 | # :: Read in word embeddings :: 128 | # :: Read in word embeddings :: 129 | word2Idx = embeddings.vocabulary.dic_words_ids 130 | wordEmbeddings = embeddings.matrix 131 | 132 | 133 | print("Embeddings shape: ", wordEmbeddings.shape) 134 | print("Len words: ", len(words)) 135 | 136 | 137 | 138 | # :: Create token matrix :: 139 | train_set = createTensor(files[0], word2Idx, max(maxSentenceLen)) 140 | test_set = createTensor(files[1], word2Idx, max(maxSentenceLen)) 141 | 142 | 143 | data = {'wordEmbeddings': wordEmbeddings, 'word2Idx': word2Idx, 144 | 'train_set': train_set, 'test_set': test_set} 145 | 146 | return data 147 | 148 | 149 | 150 | print("Data preprocessing done!") -------------------------------------------------------------------------------- /docs/source/tutorial/working_with_vectors.rst: -------------------------------------------------------------------------------- 1 | Basic 
operations 2 | ===================== 3 | 4 | .. currentmodule:: vecto 5 | 6 | Supported VSM formats 7 | ----------------------- 8 | 9 | At the moment the following data formats are supported: 10 | 11 | * .bin format of word2vec (the file has to be called "vectors.bin") 12 | * .npy arrays with separate vocab files 13 | * .txt plain-text vectors 14 | * sparse vectors in hp5 format 15 | 16 | :todo: fasttext .vec format? 17 | 18 | Importing vectors 19 | ------------------- 20 | 21 | Vecto assumes a one-folder-per-vsm folder structure. All files related to the same vsm - the metadata, vectors, vocab files, etc. - must all be stored in one directory. If the vector files has the correct extension (.npy, .txt, .bin, .hp5), the library will attempt to "guess" the correct module to load it with. 22 | 23 | >>> import vecto 24 | >>> path_to_vsm = "/path/to/your/model" 25 | >>> my_vsm = vecto.model.load_from_dir(path_to_vsm) 26 | 27 | The name of the model is the name directory in which the vector files are stored. For models generated with Vecto, interpretable folder names with parameters are generated automatically. 28 | 29 | >>> print(my_vsm.name) 30 | w2v_comb2_w8_n25_i6_d300_skip_300 31 | 32 | You can access the VSM metadata (recorded in metadata.json file located in the same directory as the VSM) as a Python dictionary: 33 | 34 | >>> print(my_vsm.metadata) 35 | {'size_dimensions': 300, 'dimensions': 300, 'size_window': '8'} 36 | 37 | Getting top similar neighbors of a word 38 | --------------------------------------- 39 | 40 | >>> my_vsm.get_most_similar_words("apple", cnt=5) 41 | [['apple', 1.0000000999898755], 42 | ['fruit', 0.61400752577032369], 43 | ['banana', 0.58657183882050712], 44 | ['plum', 0.5850951585421692], 45 | ['apples', 0.58464719369713347]] 46 | 47 | This method takes an optional ``cnt`` argument specifying how many top similar neighbors to output (the default is 10). Note that the top similar vector is always the target word itself. 48 | 49 | If you need to compute nearest neighbors for many words, this function works 50 | faster if the VSM is normalized. If it was generated with vecto, the 51 | normalization will be recorded in metadata, and can be checked with `:meth: 52 | .normalized()` method. Vecto will automatically check for normalization and use 53 | the faster routine if possible. If not, you can first normalize your model as 54 | follows: 55 | 56 | >>> my_embeddings.normalize() 57 | 58 | Please note that this changes the original embeddings, and to reverse this 59 | operation you will have to re-load them. 60 | 61 | If you're going to use the same normalized model several times, you can 62 | avoid re-doing the normalization with: 63 | 64 | >>> my_embeddings.cache_normalized_copy() 65 | 66 | In this case the original embeddings remain unchanged, but the neighbor 67 | retrieval will be performed with the cached normalized version. Please note 68 | that this will use additional memory. 69 | 70 | `.get_most_similar_vectors()` enables you to do the same as ``.get_most_similar_words()``, but searching the top neighbors by the vector representation rather than its label. 71 | 72 | Note: 73 | 74 | The speed of vector neighborhood computation depends on whether your numpy 75 | package has access 76 | to the right linear algebra library - MKL, OpenBLAS or whatever is available 77 | for your system. With the OpenBLAS and 4 Ghz Core i7-6700K processor in Ubuntu we're 78 | processing 900 words for 300K 500-dimensional embeddings in under three 79 | minutes. 
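As a rough sanity check you can time the retrieval yourself. The snippet below
is only a sketch and assumes that ``my_vsm`` has already been loaded (and,
ideally, normalized) as described above:

>>> import time
>>> words = ["apple", "banana", "cat"]
>>> start = time.time()
>>> for word in words:
...     neighbors = my_vsm.get_most_similar_words(word, cnt=10)
>>> print("seconds per word:", (time.time() - start) / len(words))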
80 | 81 | If you do have the library, but the neighbor extraction is still slow, 82 | check if it is actually used by numpy. This can be done as 83 | follows: 84 | 85 | >>> import numpy as np 86 | >>> np.show_config() 87 | 88 | 89 | 90 | Words to vectors and back 91 | ------------------------- 92 | 93 | First, you need to import your model from a directory that holds only that model (.npy, .bin, .hp5 or .txt formats) and any associated files. 94 | 95 | getting the vector representation of a word 96 | 97 | >>> my_vsm.get_row("apple") 98 | array([-0.17980662, 0.27027196, -0.33250481, ... -0.22577444], dtype=float32) 99 | 100 | You can use the above top-similar function to get the label of the vector most corresponding to your vector in your VSM vocabulary: 101 | 102 | >>> vsm.get_most_similar_vectors(vsm.get_row("apple")) 103 | 104 | Filtering the vocabulary of a VSM 105 | --------------------------------- 106 | 107 | In certain cases it may be useful to filter the vocabulary of a pre-trained VSM, e.g. to ensure that two models you are comparing have the same vocabulary. Vecto provides a ``.filter_by_vocab()`` method that returns a new model instance, the vocabulary of which contains only the words in the provided Python list of words. The list can be empty. 108 | 109 | >>> my_vsm.get_most_similar_words("cat", cnt=5) 110 | [['cat', 1.0], 111 | ['monkey', 0.95726192], 112 | ['dog', 0.95372206], 113 | ['koala', 0.94773519], 114 | ['puppy', 0.94360757]] 115 | >>> my_new_vsm = my_vsm.filter_by_vocab(["dog", "hotdog", "zoo", "hammer", "cat"]) 116 | >>> my_new_vsm.get_most_similar_words("cat", cnt=5) 117 | [['cat', 1.0], 118 | ['dog', 0.95372206], 119 | ['hotdog', 0.84262532], 120 | ['hammer', 0.80627602], 121 | ['zoo', 0.7463485]] 122 | -------------------------------------------------------------------------------- /vecto/benchmarks/synonymy_detection/synonymy_detection.py: -------------------------------------------------------------------------------- 1 | from ..base import Benchmark 2 | from collections import defaultdict 3 | from os import path, listdir 4 | import csv 5 | import numpy as np 6 | from scipy.spatial import distance 7 | 8 | OTHER_EXT = 'None' 9 | BENCHMARK = 'benchmark' 10 | 11 | 12 | class SynonymyDetection(Benchmark): 13 | def __init__(self, normalize=True, 14 | ignore_oov=True, 15 | do_top5=True, 16 | need_subsample=False, 17 | size_cv_test=1, 18 | set_aprimes_test=None, 19 | inverse_regularization_strength=1.0, 20 | exclude=True): 21 | self.normalize = normalize 22 | self.ignore_oov = ignore_oov 23 | self.do_top5 = do_top5 24 | self.need_subsample = need_subsample 25 | self.normalize = normalize 26 | self.size_cv_test = size_cv_test 27 | self.set_aprimes_test = set_aprimes_test 28 | self.inverse_regularization_strength = inverse_regularization_strength 29 | self.exclude = exclude 30 | 31 | self.stats = {} 32 | self.cnt_total_correct = 0 33 | self.cnt_total_total = 0 34 | 35 | # this are some hard-coded bits which will be implemented later 36 | self.result_miss = { 37 | 'rank': -1, 38 | 'reason': 'missing words' 39 | } 40 | 41 | @property 42 | def method(self): 43 | return type(self).__name__ 44 | 45 | @classmethod 46 | def read_test_set(self, path): 47 | data = defaultdict(lambda: []) 48 | if path.endswith('.csv'): 49 | with open(path, 'r') as csvfile: 50 | reader = csv.reader(csvfile) 51 | head = True 52 | for row in reader: 53 | if len(row) < 3: 54 | continue 55 | if not head: 56 | target_word = row[1] 57 | word = row[2] 58 | is_synonym = row[3] 59 | 
data[target_word].append([word, is_synonym]) 60 | head = False 61 | else: 62 | with open(path) as f: 63 | for line in f: 64 | _, target_word, word, is_synonym = line.strip().split() 65 | data[target_word].append([word, is_synonym]) 66 | return dict(data) 67 | 68 | def collect_stats(self, embs, data): 69 | corrected_data = defaultdict(lambda: []) 70 | for word, suspicious_words in data.items(): 71 | if not embs.has_word(word): 72 | continue 73 | for susp_word, is_synonym in suspicious_words: 74 | if embs.has_word(susp_word): 75 | corrected_data[word].append([susp_word, is_synonym]) 76 | result = self.run_synonym_finding(embs, dict(corrected_data)) 77 | return result 78 | 79 | def evaluate(self, embs, data): 80 | result = self.collect_stats(embs, data) 81 | return result 82 | 83 | def read_datasets_from_dir(self, path_to_dir): 84 | datasets = defaultdict(lambda: {}) 85 | for file in listdir(path_to_dir): 86 | dataset_name, dataset_data = self.read_single_dataset(path_to_dir, file) 87 | if type != OTHER_EXT: 88 | datasets[dataset_name] = dataset_data 89 | return datasets 90 | 91 | def read_single_dataset(self, path_to_dir, file_name): 92 | dataset_name, _ = path.splitext(file_name) 93 | data = self.read_test_set(path.join(path_to_dir, file_name)) 94 | return dataset_name, data 95 | 96 | def run(self, embeds, path_dataset): 97 | results = defaultdict(lambda: {}) 98 | datasets = self.read_datasets_from_dir(path_dataset) 99 | for dataset_name, dataset_data in datasets.items(): 100 | result = self.evaluate(embeds, dataset_data) 101 | results[dataset_name] = result 102 | return dict(results) 103 | 104 | def get_result(self, embeds, path_dataset): 105 | if self.normalize: 106 | embeds.normalize() 107 | 108 | results = self.run(embeds, path_dataset) 109 | return results 110 | 111 | 112 | class CosineDistance(SynonymyDetection): 113 | @classmethod 114 | def run_synonym_finding(self, embs, data): 115 | result = defaultdict(lambda: {}) 116 | for word, suspicious_words in data.items(): 117 | distances = [] 118 | for susp_word, _ in suspicious_words: 119 | distances.append(1 - distance.cosine(embs.get_vector(susp_word), embs.get_vector(word))) 120 | guessed_word_index = distances.index(np.min(distances)) 121 | results_for_word = [] 122 | for dist_id, cosine_distance in enumerate(distances): 123 | d = {} 124 | d['suspicious_word'] = suspicious_words[dist_id][0] 125 | d['is_synonym'] = suspicious_words[dist_id][1] 126 | if dist_id == guessed_word_index: 127 | d['hit'] = True 128 | else: 129 | d['hit'] = False 130 | d['distance'] = cosine_distance 131 | results_for_word.append(d) 132 | result[word] = results_for_word 133 | return dict(result) 134 | -------------------------------------------------------------------------------- /tests/test_training.py: -------------------------------------------------------------------------------- 1 | """Tests for embeddings module.""" 2 | 3 | import unittest 4 | import io 5 | import contextlib 6 | import sys 7 | import runpy 8 | import os 9 | 10 | 11 | def run_module(name: str, args, run_name: str = '__main__') -> None: 12 | backup_sys_argv = sys.argv 13 | sys.argv = [name + '.py'] + list(args) 14 | runpy.run_module(name, run_name=run_name) 15 | sys.argv = backup_sys_argv 16 | 17 | 18 | class Tests(unittest.TestCase): 19 | 20 | # def test_train_word2vec(self): 21 | # path_corpus = "./tests/data/corpora/plain/" 22 | # sio = io.StringIO() 23 | # with contextlib.redirect_stderr(sio): 24 | # run_module('vecto.embeddings.train_word2vec', 25 | # ['--path_corpus', path_corpus, 
'--path_out', '/tmp/vecto/embeddings/', '--out_type', 'ns']) 26 | # run_module('vecto.embeddings.train_word2vec', 27 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--out_type', 'hsm']) 28 | # run_module('vecto.embeddings.train_word2vec', 29 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--out_type', 'original']) 30 | # run_module('vecto.embeddings.train_word2vec', 31 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--out_type', 'ns', 32 | # '--model', 'cbow']) 33 | # with self.assertRaises(RuntimeError): 34 | # run_module('vecto.embeddings.train_word2vec', 35 | # ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 36 | # '--out_type', 'ns', 37 | # '--model', 'cbow']) 38 | 39 | # @unittest.skipIf(os.environ.get('APPVEYOR'), 'skipping Appveyor due to memory error') 40 | # def test_train_word2vec_subword_cnn1d(self): 41 | # path_corpus = "./tests/data/corpora/plain/" 42 | # run_module('vecto.embeddings.train_word2vec', 43 | # ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 44 | # '--subword', 'cnn1d']) 45 | # with self.assertRaises(RuntimeError): 46 | # run_module('vecto.embeddings.train_word2vec', 47 | # ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 48 | # '--dimension', '5', 49 | # '--subword', 'cnn1d']) 50 | 51 | def test_train_word2vec_subword(self): 52 | path_corpus = "./tests/data/corpora/plain/" 53 | path_vocab = "./tests/data/vocabs/plain/" 54 | sio = io.StringIO() 55 | with contextlib.redirect_stderr(sio): 56 | run_module('vecto.embeddings.train_word2vec', 57 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 58 | '--subword', 'cnn1d_small']) 59 | 60 | run_module('vecto.embeddings.train_word2vec', 61 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 62 | '--subword', 'bilstm']) 63 | run_module('vecto.embeddings.train_word2vec', 64 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 65 | '--subword', 'sum']) 66 | run_module('vecto.embeddings.train_word2vec', 67 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 68 | '--subword', '_none', '--path_vocab', path_vocab]) 69 | run_module('vecto.embeddings.train_word2vec', 70 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 71 | '--subword', 'bilstm_sum']) 72 | with self.assertRaises(RuntimeError): 73 | run_module('vecto.embeddings.train_word2vec', 74 | ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 75 | '--dimension', '5', 76 | '--subword', 'bilstm_sum']) 77 | 78 | def test_train_word2vec_subword_jap(self): 79 | path_corpus = "./tests/data/corpora/jap/tokenized/" 80 | path_word2chars = "./tests/data/corpora/jap/char2radical/char2radical.txt" 81 | sio = io.StringIO() 82 | with contextlib.redirect_stderr(sio): 83 | run_module('vecto.embeddings.train_word2vec', 84 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 85 | '--subword', 'sum', '--language', 'jap', '--min_gram', '1', '--max_gram', '1']) 86 | run_module('vecto.embeddings.train_word2vec', 87 | ['--path_corpus', path_corpus, '--path_out', '/tmp/vecto/embeddings/', '--dimension', '5', 88 | '--subword', 'sum', '--language', 'jap', '--min_gram', '1', '--max_gram', '1', 89 | '--path_word2chars', 
path_word2chars]) 90 | 91 | with self.assertRaises(RuntimeError): 92 | run_module('vecto.embeddings.train_word2vec', 93 | ['--path_corpus', path_corpus + "NONEXISTING", '--path_out', '/tmp/vecto/embeddings/', 94 | '--dimension', '5', 95 | '--subword', 'sum', '--language', 'jap', '--min_gram', '1', '--max_gram', '1']) 96 | -------------------------------------------------------------------------------- /vecto/data/base.py: -------------------------------------------------------------------------------- 1 | 2 | import fnmatch 3 | import os 4 | from pathlib import Path 5 | import tarfile 6 | from zipfile import ZipFile 7 | import logging 8 | import tempfile 9 | import shutil 10 | # from vecto.config import load_config 11 | from vecto.utils.metadata import WithMetaData 12 | from vecto.utils.data import load_json 13 | from .io import fetch_file, read_first_col_is_label_format, read_tsv_label_last 14 | 15 | logger = logging.getLogger(__name__) 16 | # TODO: make config module-global 17 | # config = load_config() 18 | # TODO: get dataset dir from config 19 | # TODO: use pathlib everywhere 20 | dir_datasets = os.path.expanduser("~/.vecto/datasets") 21 | dir_temp = os.path.join(tempfile.gettempdir(), "vecto", "tmp") 22 | os.makedirs(dir_datasets, exist_ok=True) 23 | os.makedirs(dir_temp, exist_ok=True) 24 | resources = {} 25 | 26 | 27 | class Dataset(WithMetaData): 28 | """ 29 | Container class for stock datasets. 30 | Arguments: 31 | path (str): local path to place files 32 | """ 33 | 34 | def __init__(self, path): 35 | if not os.path.exists(path): 36 | raise FileNotFoundError("test dataset dir does not exist:" + path) 37 | super().__init__(path) 38 | self.path = path 39 | if "name" not in self.metadata: 40 | # TODO: use proper path magic 41 | self.metadata["name"] = path.split("/")[-1] 42 | 43 | def file_iterator(self): 44 | for root, _, filenames in os.walk(self.path): 45 | for filename in fnmatch.filter(sorted(filenames), '*'): 46 | if filename.endswith('json'): 47 | continue 48 | yield(os.path.join(root, filename)) 49 | 50 | def _load_tsv(self, names): 51 | # TODO: decide what to do with char_basrd 52 | char_based = False 53 | for candidate_name in names: 54 | path_full_candidate = os.path.join(self.path, candidate_name) 55 | print(path_full_candidate) 56 | if os.path.isfile(path_full_candidate): 57 | # train = read_first_col_is_label_format(path_full_candidate, char_based=char_based) 58 | train = read_first_col_is_label_format(path_full_candidate) 59 | return train 60 | # test = read_first_col_is_label_format(os.path.join(self.path, 'test'), 61 | # char_based=char_based) 62 | raise RuntimeError("can not find dataset") 63 | 64 | def get_train(self): 65 | return self._load_tsv(["train", "train.tsv"]) 66 | 67 | def get_test(self): 68 | return self._load_tsv(["dev", "dev.tsv", "test", "test.tsv"]) 69 | 70 | 71 | def download_index(): 72 | logger.info("downloading index of resources") 73 | path_tar = os.path.join(dir_temp, "resources.tar") 74 | url_resources = "https://github.com/vecto-ai/vecto-resources/tarball/master/" 75 | fetch_file(url_resources, path_tar) 76 | with tarfile.open(path_tar) as tar: 77 | for member in tar.getmembers(): 78 | parts = member.name.split("/") 79 | if len(parts) <= 1: 80 | continue 81 | if parts[1] != "resources": 82 | continue 83 | member.path = os.path.join(*parts[1:]) 84 | tar.extract(member, dir_datasets) 85 | 86 | 87 | def gen_metadata_snippets(path): 88 | for sub in path.iterdir(): 89 | if sub.name == "metadata.json": 90 | yield sub 91 | else: 92 | if 
sub.is_dir(): 93 | yield from gen_metadata_snippets(sub) 94 | 95 | 96 | def load_dataset_infos(): 97 | for f_meta in gen_metadata_snippets(Path(dir_datasets)): 98 | # print("visiting", f_meta.parent) 99 | metadata = load_json(f_meta) 100 | if "name" in metadata: 101 | metadata["local_path"] = f_meta.parent 102 | resources[metadata["name"]] = metadata 103 | 104 | 105 | def download_dataset_by_name(name, path_dataset): 106 | filename = resources[name]["url"].split("/")[-1] 107 | logger.debug("downloading ", filename) 108 | path_download_archive = Path(dir_temp) / filename 109 | if "url" not in resources[name]: 110 | raise RuntimeError(f"no URL to download dataset {name}") 111 | fetch_file(resources[name]["url"], path_download_archive) 112 | path_extracted = Path(dir_temp) / name 113 | with ZipFile(path_download_archive) as z: 114 | z.extractall(path_extracted) 115 | # TODO: make sure this returns topmost entry from the tree 116 | first_metadata_path = next(gen_metadata_snippets(path_extracted)).parent 117 | # print(first_metadata_path) 118 | for f in first_metadata_path.iterdir(): 119 | if not (path_dataset / f.name).exists(): 120 | shutil.move(str(f), str(path_dataset)) 121 | 122 | 123 | def is_dataset_downloaded(path_dataset): 124 | for f in path_dataset.iterdir(): 125 | if f.name.endswith("metadata.json"): 126 | continue 127 | return True 128 | return False 129 | 130 | 131 | def get_dataset_by_name(name): 132 | load_dataset_infos() 133 | if not resources: 134 | logger.info("index not found, forcing download") 135 | download_index() 136 | load_dataset_infos() 137 | # print(resources) 138 | if name in resources: 139 | path_dataset = resources[name]["local_path"] 140 | else: 141 | raise RuntimeError("Dataset %s not known" % name) 142 | if not is_dataset_downloaded(path_dataset): 143 | logger.info("only metadata is present, need to download") 144 | download_dataset_by_name(name, path_dataset) 145 | dataset = Dataset(path_dataset) 146 | return dataset 147 | -------------------------------------------------------------------------------- /vecto/benchmarks/relation_extraction/relation_extraction.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | np.random.seed(1337) # for reproducibility 5 | import gzip 6 | import sys 7 | import pickle as pkl 8 | from .preprocess import load_data 9 | from ..base import Benchmark 10 | import os 11 | 12 | from keras.models import Model 13 | from keras.layers import Input, Dense, Dropout, concatenate 14 | from keras.layers import Embedding 15 | from keras.layers import Convolution1D, GlobalMaxPooling1D 16 | 17 | 18 | # Function to calculate the precision 19 | def getPrecision(pred_test, yTest, targetLabel): 20 | # Precision for non-vague 21 | targetLabelCount = 0 22 | correctTargetLabelCount = 0 23 | 24 | for idx, prediction in enumerate(pred_test): 25 | if prediction == targetLabel: 26 | targetLabelCount += 1 27 | 28 | if prediction == yTest[idx]: 29 | correctTargetLabelCount += 1 30 | 31 | if correctTargetLabelCount == 0: 32 | return 0 33 | 34 | return float(correctTargetLabelCount) / targetLabelCount 35 | 36 | class Relation_extraction(Benchmark): 37 | 38 | def __init__(self, batchsize=16, nb_filter=100, filter_length=3, hidden_dims=100, epoch=1, position_dims=50): 39 | self.batchsize = batchsize 40 | self.nb_filter = nb_filter 41 | self.filter_length = filter_length 42 | self.hidden_dims = hidden_dims 43 | self.epoch = epoch 44 | self.position_dims = 
position_dims 45 | 46 | def run(self, embeddings, dataset): 47 | print("Load dataset") 48 | path_dataset = dataset.path 49 | data = load_data(embeddings, path_dataset) 50 | 51 | yTrain, sentenceTrain, positionTrain1, positionTrain2 = data['train_set'] 52 | yTest, sentenceTest, positionTest1, positionTest2 = data['test_set'] 53 | 54 | max_position = max(np.max(positionTrain1), np.max(positionTrain2)) + 1 55 | 56 | n_out = max(yTrain) + 1 57 | # train_y_cat = np_utils.to_categorical(yTrain, n_out) 58 | max_sentence_len = sentenceTrain.shape[1] 59 | 60 | print(sentenceTrain[10]) 61 | 62 | print("sentenceTrain: ", sentenceTrain.shape) 63 | print("positionTrain1: ", positionTrain1.shape) 64 | print("yTrain: ", yTrain.shape) 65 | 66 | print("sentenceTest: ", sentenceTest.shape) 67 | print("positionTest1: ", positionTest1.shape) 68 | print("yTest: ", yTest.shape) 69 | 70 | print("Embeddings: ", embeddings.matrix.shape) 71 | 72 | words_input = Input(shape=(max_sentence_len,), dtype='int32', name='words_input') 73 | words = Embedding(embeddings.matrix.shape[0], embeddings.matrix.shape[1], weights=[embeddings.matrix], 74 | trainable=False)(words_input) 75 | distance1_input = Input(shape=(max_sentence_len,), dtype='int32', name='distance1_input') 76 | distance1 = Embedding(max_position, self.position_dims)(distance1_input) 77 | distance2_input = Input(shape=(max_sentence_len,), dtype='int32', name='distance2_input') 78 | distance2 = Embedding(max_position, self.position_dims)(distance2_input) 79 | output = concatenate([words, distance1, distance2], -1) 80 | output = Convolution1D(filters=self.nb_filter, 81 | kernel_size=self.filter_length, 82 | padding='same', 83 | activation='tanh', 84 | strides=1)(output) 85 | # we use standard max over time pooling 86 | output = GlobalMaxPooling1D()(output) 87 | output = Dropout(0.25)(output) 88 | output = Dense(n_out, activation='softmax')(output) 89 | # create the model 90 | model = Model(inputs=[words_input, distance1_input, distance2_input], outputs=[output]) 91 | model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy']) 92 | model.summary() 93 | 94 | print("Start training") 95 | max_prec, max_rec, max_acc, max_f1 = 0, 0, 0, 0 96 | accs = [] 97 | 98 | def predict_classes(prediction): 99 | return prediction.argmax(axis=-1) 100 | 101 | # for epoch in range(nb_epoch): 102 | model.fit([sentenceTrain, positionTrain1, positionTrain2], yTrain, batch_size=self.batchsize, verbose=True, 103 | epochs=self.epoch) 104 | pred_test = predict_classes(model.predict([sentenceTest, positionTest1, positionTest2], verbose=False)) 105 | 106 | dctLabels = np.sum(pred_test) 107 | totalDCTLabels = np.sum(yTest) 108 | 109 | acc = np.sum(pred_test == yTest) / float(len(yTest)) 110 | max_acc = max(max_acc, acc) 111 | print("Accuracy: %.4f (max: %.4f)" % (acc, max_acc)) 112 | 113 | f1Sum = 0 114 | f1Count = 0 115 | for targetLabel in range(1, max(yTest)): 116 | prec = getPrecision(pred_test, yTest, targetLabel) 117 | recall = getPrecision(yTest, pred_test, targetLabel) 118 | f1 = 0 if (prec + recall) == 0 else 2 * prec * recall / (prec + recall) 119 | f1Sum += f1 120 | f1Count += 1 121 | accs.append(max_acc) 122 | macroF1 = f1Sum / float(f1Count) 123 | max_f1 = max(max_f1, macroF1) 124 | print("Non-other Macro-Averaged F1: %.4f (max: %.4f)\n" % (macroF1, max_f1)) 125 | 126 | experiment_setup = self.__dict__ 127 | experiment_setup["embeddings"] = embeddings.metadata 128 | experiment_setup["category"] = "default" 129 | experiment_setup["dataset"] = 
os.path.basename(path_dataset) 130 | experiment_setup["method"] = 'cnn' 131 | experiment_setup['task'] = 'relation_extraction' 132 | result = {} 133 | result['experiment_setup'] = experiment_setup 134 | result['experiment_setup']['default_measurement'] = 'macroF1' 135 | 136 | result['result'] = {"macroF1": macroF1, "max_f1": max_f1, "accuracy": acc, "max_accuracy": max_acc} 137 | return result 138 | 139 | def get_result(self, embeddings, dataset): 140 | results = self.run(embeddings, dataset) 141 | return [results] 142 | --------------------------------------------------------------------------------