├── .gitignore ├── .travis.yml ├── CHANGES.rst ├── LICENSE ├── README.rst ├── examples ├── evaluate_similarity.ipynb ├── evaluate_similarity.py ├── solve_analogy.ipynb └── solve_analogy.py ├── requirements.txt ├── scripts ├── evaluate_embeddings.py ├── evaluate_on_all.py └── word2vec_wikipedia │ ├── process_wiki.py │ └── train.py ├── setup.py └── web ├── __init__.py ├── _utils ├── __init__.py └── compat.py ├── analogy.py ├── datasets ├── __init__.py ├── analogy.py ├── categorization.py ├── similarity.py └── utils.py ├── embedding.py ├── embeddings.py ├── evaluate.py ├── tests ├── test_analogy.py ├── test_categorization.py ├── test_embedding.py ├── test_fetchers.py ├── test_similarity.py ├── test_transform_words.py └── test_vocabulary.py ├── utils.py ├── version.py └── vocabulary.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Pycharm 62 | .idea/ 63 | /scripts/*.csv 64 | 65 | 66 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - 2.7 7 | - 3.4 8 | 9 | addons: 10 | apt: 11 | packages: 12 | - libblas-dev 13 | - liblapack-dev 14 | - gfortran 15 | - python-numpy 16 | - python-scipy 17 | 18 | before_install: 19 | - pip install -U pip 20 | 21 | install: 22 | - travis_wait travis_retry pip install -r requirements.txt 23 | - travis_retry python setup.py install 24 | 25 | script: python setup.py test -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kudkudak/word-embeddings-benchmarks/c78272b8c1374e5e518915a240ab2b348b59f44e/CHANGES.rst -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010-2015 Google, Inc. 
http://angularjs.org
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | Word Embeddings Benchmarks
2 | ==========================
3 | 
4 | .. image:: https://travis-ci.org/kudkudak/word-embeddings-benchmarks.svg?branch=master
5 | 
6 | The Word Embeddings Benchmarks (web) package provides methods for easily evaluating and reporting
7 | results on common benchmarks (analogy, similarity and categorization).
8 | 
9 | The research goal of the package is to help drive word-embedding research by making reproducible
10 | results easily accessible (the literature currently contains many contradictory results).
11 | It should also help answer the question of whether we need new methods for evaluating word embeddings.
12 | 
13 | To evaluate your embedding (converted to word2vec format or a Python dict pickle)
14 | on all fast-running benchmarks, run ``./scripts/evaluate_on_all.py`` with the path to your embedding file.
15 | See `here `_ for results on the embeddings available in the package.
16 | 
17 | Warnings and Disclaimers:
18 | 
19 | * The analogy test does not internally normalize word embeddings.
20 | * **The package is currently under development; we expect an official release within the next few months**. The main issue you may hit at the moment is rather long embedding loading times (especially if you use the fetchers).
21 | 
22 | Please also refer to our recent publication on evaluation methods: https://arxiv.org/abs/1702.02170.
23 | 
24 | Features:
25 | 
26 | * scikit-learn API and conventions
27 | * 18 popular datasets
28 | * 11 word embeddings (word2vec, HPCA, morphoRNNLM, GloVe, LexVec, ConceptNet, HDC/PDC and others)
29 | * methods to solve analogy, similarity and categorization tasks
30 | 
31 | Included datasets:
32 | 
33 | * TR9856
34 | * WordRep
35 | * Google Analogy
36 | * MSR Analogy
37 | * SemEval2012
38 | * AP
39 | * BLESS
40 | * Battig
41 | * ESSLI (1a, 2b, 2c)
42 | * WS353
43 | * MTurk
44 | * RG65
45 | * RW
46 | * SimLex999
47 | * MEN
48 | 
49 | Note: the embeddings are currently not hosted on a proper server; if the download is too slow, consider downloading the embeddings manually from the original sources referenced in the docstrings.
50 | 
51 | Dependencies
52 | ============
53 | 
54 | Please see ``requirements.txt``.
55 | 
56 | Install
57 | =======
58 | 
59 | This package uses setuptools.
You can install it running:: 60 | 61 | python setup.py install 62 | 63 | If you have problems during this installation. First you may need to install the dependencies:: 64 | 65 | pip install -r requirements.txt 66 | 67 | If you already have the dependencies listed in ``requirements.txt`` installed, 68 | to install in your home directory, use:: 69 | 70 | python setup.py install --user 71 | 72 | To install for all users on Unix/Linux:: 73 | 74 | python setup.py build 75 | sudo python setup.py install 76 | 77 | You can also install it in development mode with:: 78 | 79 | python setup.py develop 80 | 81 | 82 | Examples 83 | ======== 84 | See `examples` folder. 85 | 86 | License 87 | ======= 88 | Code is licensed under MIT, however available embeddings distributed within package might be under different license. If you are unsure please reach to authors (references are included in docstrings) 89 | -------------------------------------------------------------------------------- /examples/evaluate_similarity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "from six import iteritems\n", 11 | "from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999\n", 12 | "from web.embeddings import fetch_GloVe\n", 13 | "from web.evaluate import evaluate_similarity" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# Configure logging\n", 25 | "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "File already downloaded, skipping\n" 38 | ] 39 | }, 40 | { 41 | "name": "stderr", 42 | "output_type": "stream", 43 | "text": [ 44 | "05:49:40 INFO:Tranformed 400000 into 381871 words\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# Fetch GloVe embedding (warning: it might take few minutes)\n", 50 | "w_glove = fetch_GloVe(corpus=\"wiki-6B\", dim=300)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Define tasks\n", 60 | "tasks = {\n", 61 | " \"MEN\": fetch_MEN(),\n", 62 | " \"WS353\": fetch_WS353(),\n", 63 | " \"SIMLEX999\": fetch_SimLex999()\n", 64 | "}" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 7, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Sample data from SIMLEX999: pair \"old\" and \"new\" is assigned score 1.58\n", 77 | "Sample data from MEN: pair \"sun\" and \"sunlight\" is assigned score [ 10.]\n", 78 | "Sample data from WS353: pair \"love\" and \"sex\" is assigned score 6.77\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "# Print sample data\n", 84 | "for name, data in iteritems(tasks):\n", 85 | " print(\"Sample data from {}: pair \\\"{}\\\" and \\\"{}\\\" is assigned score {}\".format(name, data.X[0][0], data.X[0][1], data.y[0]))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stderr", 95 | "output_type": "stream", 96 | "text": [ 97 | "05:51:18 
WARNING:Missing 24 words. Will replace them with mean vector\n" 98 | ] 99 | }, 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Spearman correlation of scores on SIMLEX999 0.370500357109\n", 105 | "Spearman correlation of scores on MEN 0.737464696981\n", 106 | "Spearman correlation of scores on WS353 0.521712569525\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# Calculate results using helper function\n", 112 | "for name, data in iteritems(tasks):\n", 113 | " print \"Spearman correlation of scores on {} {}\".format(name, evaluate_similarity(w_glove, data.X, data.y))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python [default]", 129 | "language": "python", 130 | "name": "python2" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 2 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython2", 142 | "version": "2.7.13" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 1 147 | } 148 | -------------------------------------------------------------------------------- /examples/evaluate_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Simple example showing evaluating embedding on similarity datasets 5 | """ 6 | import logging 7 | from six import iteritems 8 | from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999 9 | from web.embeddings import fetch_GloVe 10 | from web.evaluate import evaluate_similarity 11 | 12 | # Configure logging 13 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 14 | 15 | # Fetch GloVe embedding (warning: it might take few minutes) 16 | w_glove = fetch_GloVe(corpus="wiki-6B", dim=300) 17 | 18 | # Define tasks 19 | tasks = { 20 | "MEN": fetch_MEN(), 21 | "WS353": fetch_WS353(), 22 | "SIMLEX999": fetch_SimLex999() 23 | } 24 | 25 | # Print sample data 26 | for name, data in iteritems(tasks): 27 | print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0])) 28 | 29 | # Calculate results using helper function 30 | for name, data in iteritems(tasks): 31 | print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_glove, data.X, data.y))) 32 | -------------------------------------------------------------------------------- /examples/solve_analogy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import logging\n", 12 | "from web.datasets.analogy import fetch_google_analogy\n", 13 | "from web.embeddings import fetch_SG_GoogleNews" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# Configure logging\n", 25 | "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | 
"execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stderr", 35 | "output_type": "stream", 36 | "text": [ 37 | "05:53:11 INFO:loading projection weights from /home/pocha/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz\n", 38 | "05:53:11 INFO:Loading #3000000 words with 300 dim\n" 39 | ] 40 | }, 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "File already downloaded, skipping\n" 46 | ] 47 | }, 48 | { 49 | "name": "stderr", 50 | "output_type": "stream", 51 | "text": [ 52 | "05:55:25 INFO:Tranformed 3000000 into 2665071 words\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "# Fetch skip-gram trained on GoogleNews corpus and clean it slightly\n", 58 | "w = fetch_SG_GoogleNews(lower=True, clean_words=True)\n", 59 | "\n", 60 | "# Fetch analogy dataset\n", 61 | "data = fetch_google_analogy()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "gram3-comparative\n", 74 | "gram8-plural\n", 75 | "capital-common-countries\n", 76 | "city-in-state\n", 77 | "family\n", 78 | "gram9-plural-verbs\n", 79 | "gram2-opposite\n", 80 | "currency\n", 81 | "gram4-superlative\n", 82 | "gram6-nationality-adjective\n", 83 | "gram7-past-tense\n", 84 | "gram5-present-participle\n", 85 | "capital-world\n", 86 | "gram1-adjective-to-adverb\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "for cat in (set(data.category)):\n", 92 | " print(cat)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Question: bangkok is to thailand as havana is to ?\n", 105 | "Answer: cuba\n", 106 | "Predicted: asi\n", 107 | "Question: baku is to azerbaijan as dushanbe is to ?\n", 108 | "Answer: tajikistan\n", 109 | "Predicted: tajikistan\n", 110 | "Question: rome is to italy as windhoek is to ?\n", 111 | "Answer: namibia\n", 112 | "Predicted: otjiwarongo\n", 113 | "Question: comfortable is to uncomfortable as clear is to ?\n", 114 | "Answer: unclear\n", 115 | "Predicted: abundantly_clear\n", 116 | "Question: slow is to slowing as describe is to ?\n", 117 | "Answer: describing\n", 118 | "Predicted: describing\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# Pick a sample of data and calculate answers\n", 124 | "subset = [50, 1000, 4000, 10000, 14000]\n", 125 | "for id in subset:\n", 126 | " w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]\n", 127 | " print(\"Question: {} is to {} as {} is to ?\".format(w1, w2, w3))\n", 128 | " print(\"Answer: \" + data.y[id])\n", 129 | " print(\"Predicted: \" + \" \".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python [default]", 145 | "language": "python", 146 | "name": "python2" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 2 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython2", 158 | "version": "2.7.13" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 1 163 | } 164 | 
-------------------------------------------------------------------------------- /examples/solve_analogy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Simple example showing answering analogy questions 5 | """ 6 | import logging 7 | from web.datasets.analogy import fetch_google_analogy 8 | from web.embeddings import fetch_SG_GoogleNews 9 | 10 | # Configure logging 11 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 12 | 13 | # Fetch skip-gram trained on GoogleNews corpus and clean it slightly 14 | w = fetch_SG_GoogleNews(lower=True, clean_words=True) 15 | 16 | # Fetch analogy dataset 17 | data = fetch_google_analogy() 18 | 19 | for cat in (set(data.category)): 20 | print(cat) 21 | 22 | # Pick a sample of data and calculate answers 23 | subset = [50, 1000, 4000, 10000, 14000] 24 | for id in subset: 25 | w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2] 26 | print("Question: {} is to {} as {} is to ?".format(w1, w2, w3)) 27 | print("Answer: " + data.y[id]) 28 | print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3]))) 29 | 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=1.5.0 2 | numpy>=1.10.0 3 | Cython 4 | pandas==0.19 5 | pytest>=2.8.3 6 | scipy>=0.9 7 | scikit-learn>=0.16.1 8 | seaborn>=0.6.0 9 | futures 10 | tqdm 11 | docopt 12 | -------------------------------------------------------------------------------- /scripts/evaluate_embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script evaluates all embeddings available in the package 5 | and saves .csv results 6 | 7 | Usage: 8 | 9 | ./evaluate_embeddings 10 | """ 11 | from web.evaluate import evaluate_on_all 12 | from web import embeddings 13 | from six import iteritems 14 | from multiprocessing import Pool 15 | from os import path 16 | import logging 17 | import optparse 18 | import multiprocessing 19 | 20 | parser = optparse.OptionParser() 21 | parser.add_option("-j", "--n_jobs", type="int", default=4) 22 | parser.add_option("-o", "--output_dir", type="str", default="") 23 | (opts, args) = parser.parse_args() 24 | 25 | # Configure logging 26 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 27 | logger = logging.getLogger(__name__) 28 | 29 | jobs = [] 30 | 31 | ## GloVe 32 | 33 | for dim in [50, 100, 200, 300]: 34 | jobs.append(["fetch_GloVe", {"dim": dim, "corpus": "wiki-6B"}]) 35 | 36 | for dim in [25, 50, 100, 200]: 37 | jobs.append(["fetch_GloVe", {"dim": dim, "corpus": "twitter-27B"}]) 38 | 39 | 40 | for corpus in ["common-crawl-42B", "common-crawl-840B"]: 41 | jobs.append(["fetch_GloVe", {"dim": 300, "corpus": corpus}]) 42 | 43 | ## NMT 44 | 45 | jobs.append(["fetch_NMT", {"which": "FR"}]) 46 | jobs.append(["fetch_NMT", {"which": "DE"}]) 47 | 48 | ## PDC and HDC 49 | 50 | for dim in [50, 100, 300]: 51 | jobs.append(["fetch_PDC", {"dim": dim}]) 52 | jobs.append(["fetch_HDC", {"dim": dim}]) 53 | 54 | ## SG 55 | 56 | jobs.append(["fetch_SG_GoogleNews", {}]) 57 | 58 | ## LexVec 59 | 60 | jobs.append(["fetch_LexVec", {}]) 61 | 62 | ## ConceptNet Numberbatch 63 | jobs.append(["fetch_conceptnet_numberbatch", {}]) 64 | 65 | 
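# Each entry in `jobs` is just a [fetcher_name, kwargs] pair; run_job() at the
# bottom of this script resolves the fetcher from web.embeddings by name, roughly:
#
#     w = getattr(embeddings, "fetch_GloVe")(dim=300, corpus="wiki-6B")
#     evaluate_on_all(w).to_csv(...)
#
# so adding another embedding to the sweep only requires appending one more pair.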
## FastText 66 | jobs.append(["fetch_FastText", {}]) 67 | 68 | 69 | def run_job(j): 70 | fn, kwargs = j 71 | outf = path.join(opts.output_dir, fn + "_" + "_".join(str(k) + "=" + str(v) for k, v in iteritems(kwargs))) + ".csv" 72 | logger.info("Processing " + outf) 73 | if not path.exists(outf): 74 | w = getattr(embeddings, fn)(**kwargs) 75 | res = evaluate_on_all(w) 76 | res.to_csv(outf) 77 | 78 | 79 | if __name__ == "__main__": 80 | Pool(opts.n_jobs).map(run_job, jobs) 81 | -------------------------------------------------------------------------------- /scripts/evaluate_on_all.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script calculates embedding results against all available fast running 5 | benchmarks in the repository and saves results as single row csv table. 6 | 7 | Usage: ./evaluate_on_all -f -o 8 | 9 | NOTE: 10 | * script doesn't evaluate on WordRep (nor its subset) as it is non standard 11 | for now and long running (unless some nearest neighbor approximation is used). 12 | 13 | * script is using CosAdd for calculating analogy answer. 14 | 15 | * script is not reporting results per category (for instance semantic/syntactic) in analogy benchmarks. 16 | It is easy to change it by passing category parameter to evaluate_analogy function (see help). 17 | """ 18 | from optparse import OptionParser 19 | import logging 20 | import os 21 | from web.embeddings import fetch_GloVe, load_embedding 22 | from web.datasets.utils import _get_dataset_dir 23 | 24 | from web.evaluate import evaluate_on_all 25 | 26 | 27 | # Configure logging 28 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 29 | logger = logging.getLogger(__name__) 30 | 31 | parser = OptionParser() 32 | parser.add_option("-f", "--file", dest="filename", 33 | help="Path to the file with embedding. If relative will load from data directory.", 34 | default=None) 35 | 36 | parser.add_option("-p", "--format", dest="format", 37 | help="Format of the embedding, possible values are: word2vec, word2vec_bin, dict and glove.", 38 | default=None) 39 | 40 | parser.add_option("-o", "--output", dest="output", 41 | help="Path where to save results.", 42 | default=None) 43 | 44 | parser.add_option("-c", "--clean_words", dest="clean_words", 45 | help="Clean_words argument passed to load_embedding function. 
If set to True will remove" 46 | "most of the non-alphanumeric characters, which should speed up evaluation.", 47 | default=False) 48 | 49 | if __name__ == "__main__": 50 | (options, args) = parser.parse_args() 51 | 52 | # Load embeddings 53 | fname = options.filename 54 | if not fname: 55 | w = fetch_GloVe(corpus="wiki-6B", dim=300) 56 | else: 57 | if not os.path.isabs(fname): 58 | fname = os.path.join(_get_dataset_dir(), fname) 59 | 60 | format = options.format 61 | 62 | if not format: 63 | _, ext = os.path.splitext(fname) 64 | if ext == ".bin": 65 | format = "word2vec_bin" 66 | elif ext == ".txt": 67 | format = "word2vec" 68 | elif ext == ".pkl": 69 | format = "dict" 70 | 71 | assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format" 72 | 73 | load_kwargs = {} 74 | if format == "glove": 75 | load_kwargs['vocab_size'] = sum(1 for line in open(fname)) 76 | load_kwargs['dim'] = len(next(open(fname)).split()) - 1 77 | 78 | w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words, 79 | load_kwargs=load_kwargs) 80 | 81 | out_fname = options.output if options.output else "results.csv" 82 | 83 | results = evaluate_on_all(w) 84 | 85 | logger.info("Saving results...") 86 | print(results) 87 | results.to_csv(out_fname) 88 | -------------------------------------------------------------------------------- /scripts/word2vec_wikipedia/process_wiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Usage: ./process_wiki.py 6 | 7 | Adapted from http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim 8 | 9 | TODO: add shuffle? 10 | """ 11 | 12 | import logging 13 | import os.path 14 | import sys 15 | import tqdm 16 | 17 | from gensim.corpora import WikiCorpus 18 | 19 | if __name__ == '__main__': 20 | program = os.path.basename(sys.argv[0]) 21 | logger = logging.getLogger(program) 22 | 23 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 24 | logging.root.setLevel(level=logging.INFO) 25 | logger.info("running %s" % ' '.join(sys.argv)) 26 | 27 | # check and process input arguments 28 | if len(sys.argv) < 3: 29 | print globals()['__doc__'] % locals() 30 | sys.exit(1) 31 | inp, outp = sys.argv[1:3] 32 | space = " " 33 | i = 0 34 | 35 | output = open(outp, 'w') 36 | wiki = WikiCorpus(inp, lemmatize=False, dictionary={}) 37 | for text in wiki.get_texts(): 38 | output.write(space.join(text) + "\n") 39 | i = i + 1 40 | if (i % 10000 == 0): 41 | logger.info("Saved " + str(i) + " articles") 42 | 43 | output.close() 44 | logger.info("Finished Saved " + str(i) + " articles") 45 | 46 | -------------------------------------------------------------------------------- /scripts/word2vec_wikipedia/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Usage: ./train.py 6 | 7 | Adapted from http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim 8 | """ 9 | 10 | import logging 11 | import os.path 12 | import sys 13 | from gensim.models import Word2Vec 14 | from gensim.models.word2vec import LineSentence 15 | 16 | if __name__ == '__main__': 17 | program = os.path.basename(sys.argv[0]) 18 | logger = logging.getLogger(program) 19 | 20 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 21 | logging.root.setLevel(level=logging.INFO) 22 | 
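    # Note: the Word2Vec call below targets the gensim API that was current when
    # this script was written; in gensim >= 4.0 the `size` argument is named
    # `vector_size` and the text-format export is done via
    # `model.wv.save_word2vec_format(...)` rather than `model.save_word2vec_format(...)`.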
logger.info("running %s" % ' '.join(sys.argv)) 23 | 24 | # check and process input arguments 25 | if len(sys.argv) < 3: 26 | print globals()['__doc__'] % locals() 27 | sys.exit(1) 28 | inp, outp1 = sys.argv[1:4] 29 | 30 | # NOTE: it doesn't shuffle data between passes, which might degrade performance 31 | model = Word2Vec(LineSentence(inp), 32 | size=300, 33 | negative=5, 34 | workers=5) 35 | 36 | model.save_word2vec_format(outp1, binary=False) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | web 5 | ---- 6 | 7 | Word Embeddings Benchmarks 8 | """ 9 | 10 | from __future__ import print_function 11 | 12 | import os.path as op 13 | import io 14 | import sys 15 | 16 | from setuptools import setup, find_packages 17 | from setuptools.command.test import test as TestCommand 18 | 19 | 20 | # long description 21 | def read(*filenames, **kwargs): 22 | encoding = kwargs.get('encoding', 'utf-8') 23 | sep = kwargs.get('sep', '\n') 24 | buf = [] 25 | for filename in filenames: 26 | with io.open(filename, encoding=encoding) as f: 27 | buf.append(f.read()) 28 | return sep.join(buf) 29 | 30 | 31 | # Get version without importing, which avoids dependency issues 32 | MODULE_NAME = find_packages(exclude=['tests'])[0] 33 | VERSION_PYFILE = op.join(MODULE_NAME, 'version.py') 34 | # set __version__ variable 35 | exec (compile(read(VERSION_PYFILE), VERSION_PYFILE, 'exec')) 36 | 37 | # INSTALL_REQUIRES = list(parse_requirements('requirements.txt')) 38 | # req_files = ['requirements.txt', 'pip_requirements.txt'] 39 | 40 | LICENSE = 'New BSD' 41 | 42 | setup_dict = dict( 43 | name=MODULE_NAME, 44 | version=__version__, 45 | description='Word Embedding Benchmarks', 46 | 47 | license='New BSD', 48 | author='Stanislaw Jastrzebski', 49 | author_email='grimghil@gmail.com', 50 | maintainer='Stanislaw Jastrzebski', 51 | maintainer_email='grimghil@gmail.com', 52 | 53 | packages=find_packages(), 54 | 55 | install_requires=['numpy', 56 | 'scipy', 57 | 'scikit-learn'], 58 | 59 | extra_files=['CHANGES.rst', 'COPYING', 'README.rst'], 60 | 61 | scripts=[], 62 | 63 | long_description=read('README.rst', 'CHANGES.rst'), 64 | 65 | platforms='Linux/MacOSX', 66 | 67 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 68 | classifiers=[ 69 | 'Programming Language :: Python', 70 | 'Development Status :: 1 - Alpha', 71 | 'Natural Language :: English', 72 | 'Environment :: Console', 73 | 'Intended Audience :: Machine Learning Research', 74 | 'License :: OSI Approved ::' + LICENSE, 75 | 'Operating System :: OS Independent', 76 | 'Topic :: Software Development :: Libraries :: Python Modules', 77 | 'Topic :: Scientific/Engineering :: Machine Learning', 78 | 'Topic :: Scientific/Engineering :: Information Analysis', 79 | 'Operating System :: POSIX', 80 | 'Operating System :: Unix', 81 | 'Operating System :: MacOS', 82 | 'Programming Language :: Python :: 2', 83 | 'Programming Language :: Python :: 2.7', 84 | 'Programming Language :: Python :: 3', 85 | 'Programming Language :: Python :: 3.5', 86 | ], 87 | 88 | extras_require={ 89 | 'testing': ['pytest', 'pytest-cov'], 90 | } 91 | ) 92 | 93 | 94 | # Python3 support keywords 95 | if sys.version_info >= (3,): 96 | setup_dict['use_2to3'] = False 97 | setup_dict['convert_2to3_doctests'] = [''] 98 | setup_dict['use_2to3_fixers'] = [''] 99 | 100 | 101 | class PyTest(TestCommand): 102 | user_options = [('pytest-args=', 'a', 
"Arguments to pass to py.test")] 103 | 104 | def initialize_options(self): 105 | TestCommand.initialize_options(self) 106 | self.pytest_args = [] 107 | 108 | def finalize_options(self): 109 | TestCommand.finalize_options(self) 110 | self.test_args = [] 111 | self.test_suite = True 112 | 113 | def run_tests(self): 114 | # import here, cause outside the eggs aren't loaded 115 | import pytest 116 | errno = pytest.main(self.pytest_args) 117 | sys.exit(errno) 118 | 119 | 120 | setup_dict.update(dict(tests_require=['pytest'], 121 | cmdclass={'test': PyTest})) 122 | 123 | if __name__ == '__main__': 124 | setup(**setup_dict) 125 | -------------------------------------------------------------------------------- /web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kudkudak/word-embeddings-benchmarks/c78272b8c1374e5e518915a240ab2b348b59f44e/web/__init__.py -------------------------------------------------------------------------------- /web/_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /web/_utils/compat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility layer for Python 3/Python 2 single codebase 3 | """ 4 | import sys 5 | import hashlib 6 | 7 | 8 | if sys.version_info[0] == 3: 9 | import pickle 10 | import io 11 | import urllib 12 | 13 | _basestring = str 14 | cPickle = pickle 15 | StringIO = io.StringIO 16 | BytesIO = io.BytesIO 17 | _urllib = urllib 18 | izip = zip 19 | 20 | def md5_hash(string): 21 | m = hashlib.md5() 22 | m.update(string.encode('utf-8')) 23 | return m.hexdigest() 24 | else: 25 | import cPickle 26 | import StringIO 27 | import urllib 28 | import urllib2 29 | import urlparse 30 | import types 31 | import itertools 32 | 33 | _basestring = basestring 34 | cPickle = cPickle 35 | StringIO = BytesIO = StringIO.StringIO 36 | izip = itertools.izip 37 | 38 | class _module_lookup(object): 39 | modules = [urlparse, urllib2, urllib] 40 | 41 | def __getattr__(self, name): 42 | for module in self.modules: 43 | if hasattr(module, name): 44 | attr = getattr(module, name) 45 | if not isinstance(attr, types.ModuleType): 46 | return attr 47 | raise NotImplementedError( 48 | 'This function has not been imported properly') 49 | 50 | module_lookup = _module_lookup() 51 | 52 | class _urllib(): 53 | request = module_lookup 54 | error = module_lookup 55 | parse = module_lookup 56 | 57 | def md5_hash(string): 58 | m = hashlib.md5() 59 | m.update(string) 60 | return m.hexdigest() 61 | -------------------------------------------------------------------------------- /web/analogy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes and function for answering analogy questions 3 | """ 4 | 5 | import logging 6 | from collections import OrderedDict 7 | import six 8 | from six.moves import range 9 | import scipy 10 | import pandas as pd 11 | from itertools import product 12 | 13 | logger = logging.getLogger(__name__) 14 | import sklearn 15 | from .datasets.analogy import * 16 | from .utils import batched 17 | from web.embedding import Embedding 18 | 19 | class SimpleAnalogySolver(sklearn.base.BaseEstimator): 20 | """ 21 | Answer analogy questions 22 | 23 | Parameters 24 | ---------- 25 | w : Embedding instance 26 | 27 | method : {"add", "mul"} 28 | Method to use when finding analogy 
answer, see "Improving Distributional Similarity 29 | with Lessons Learned from Word Embeddings" O. Levy et al. 2014. 30 | 31 | batch_size : int 32 | Batch size to use while computing accuracy. This is because of extensive memory usage. 33 | 34 | k: int 35 | If not None will select k top most frequent words from embedding before doing analogy prediction 36 | (this can offer significant speedups) 37 | 38 | Note 39 | ---- 40 | It is suggested to normalize and standardize embedding before passing it to SimpleAnalogySolver. 41 | To speed up code consider installing OpenBLAS and setting OMP_NUM_THREADS. 42 | """ 43 | 44 | def __init__(self, w, method="add", batch_size=300, k=None): 45 | self.w = w 46 | self.batch_size = batch_size 47 | self.method = method 48 | self.k = k 49 | 50 | def score(self, X, y): 51 | """ 52 | Calculate accuracy on analogy questions dataset 53 | 54 | Parameters 55 | ---------- 56 | X : array-like, shape (n_samples, 3) 57 | Analogy questions. 58 | 59 | y : array-like, shape (n_samples, ) 60 | Analogy answers. 61 | 62 | Returns 63 | ------- 64 | acc : float 65 | Accuracy 66 | """ 67 | return np.mean(y == self.predict(X)) 68 | 69 | def predict(self, X): 70 | """ 71 | Answer analogy questions 72 | 73 | Parameters 74 | ---------- 75 | X : array-like, shape (n_samples, 3) 76 | Analogy questions. 77 | 78 | Returns 79 | ------- 80 | y_pred : array-like, shape (n_samples, ) 81 | Predicted words. 82 | """ 83 | w = self.w.most_frequent(self.k) if self.k else self.w 84 | words = self.w.vocabulary.words 85 | word_id = self.w.vocabulary.word_id 86 | mean_vector = np.mean(w.vectors, axis=0) 87 | output = [] 88 | 89 | missing_words = 0 90 | for query in X: 91 | for query_word in query: 92 | if query_word not in word_id: 93 | missing_words += 1 94 | if missing_words > 0: 95 | logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words)) 96 | 97 | # Batch due to memory constaints (in dot operation) 98 | for id_batch, batch in enumerate(batched(range(len(X)), self.batch_size)): 99 | ids = list(batch) 100 | X_b = X[ids] 101 | if id_batch % np.floor(len(X) / (10. 
* self.batch_size)) == 0: 102 | logger.info("Processing {}/{} batch".format(int(np.ceil(ids[1] / float(self.batch_size))), 103 | int(np.ceil(X.shape[0] / float(self.batch_size))))) 104 | 105 | A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \ 106 | np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \ 107 | np.vstack(w.get(word, mean_vector) for word in X_b[:, 2]) 108 | 109 | if self.method == "add": 110 | D = np.dot(w.vectors, (B - A + C).T) 111 | elif self.method == "mul": 112 | D_A = np.log((1.0 + np.dot(w.vectors, A.T)) / 2.0 + 1e-5) 113 | D_B = np.log((1.0 + np.dot(w.vectors, B.T)) / 2.0 + 1e-5) 114 | D_C = np.log((1.0 + np.dot(w.vectors, C.T)) / 2.0 + 1e-5) 115 | D = D_B - D_A + D_C 116 | else: 117 | raise RuntimeError("Unrecognized method parameter") 118 | 119 | # Remove words that were originally in the query 120 | for id, row in enumerate(X_b): 121 | D[[w.vocabulary.word_id[r] for r in row if r in 122 | w.vocabulary.word_id], id] = np.finfo(np.float32).min 123 | 124 | output.append([words[id] for id in D.argmax(axis=0)]) 125 | 126 | return np.array([item for sublist in output for item in sublist]) -------------------------------------------------------------------------------- /web/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'kudkudak' 2 | -------------------------------------------------------------------------------- /web/datasets/analogy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Functions for fetching analogy datasets 5 | """ 6 | 7 | from collections import defaultdict 8 | import glob 9 | import os 10 | import numpy as np 11 | 12 | from sklearn.utils import check_random_state 13 | 14 | from sklearn.datasets.base import Bunch 15 | from .utils import _get_dataset_dir, _fetch_file, _change_list_to_np 16 | from ..utils import standardize_string 17 | 18 | 19 | def fetch_wordrep(subsample=None, rng=None): 20 | """ 21 | Fetch MSR WordRep dataset for testing both syntactic and semantic dataset 22 | 23 | Returns 24 | ------- 25 | data : sklearn.datasets.base.Bunch 26 | dictionary-like object. 
Keys of interest: 27 | 'X': matrix of word pairs 28 | 'y': vector of answers 29 | 'category': name of category 30 | 'category_high_level': name of high level category (semantic/syntactic) 31 | 32 | References 33 | ---------- 34 | Gao, Bin and Bian, Jiang and Liu, Tie-Yan, 35 | "Wordrep: A benchmark for research on learning word representations", 2014 36 | 37 | 38 | Notes 39 | ----- 40 | This dataset is too big to calculate and store all word analogy quadruples, this is 41 | why it returns word paris 42 | 43 | """ 44 | path = _fetch_file(url="https://www.dropbox.com/sh/5k78h9gllvc44vt/AAALLQq-Bge605OIMlmGBbNJa?dl=1", 45 | data_dir="analogy", 46 | uncompress=True, 47 | move="EN-WORDREP/EN-WORDREP.zip", 48 | verbose=0) 49 | 50 | wikipedia_dict = glob.glob(os.path.join(path, "Pairs_from_Wikipedia_and_Dictionary/*.txt")) 51 | wordnet = glob.glob(os.path.join(path, "Pairs_from_WordNet/*.txt")) 52 | 53 | # This dataset is too big to calculate and store all word analogy quadruples 54 | word_pairs = [] 55 | category = [] 56 | category_high_level = [] 57 | 58 | files = wikipedia_dict + wordnet 59 | 60 | for file_name in files: 61 | c = os.path.basename(file_name).split(".")[0] 62 | c = c[c.index("-")+1:] 63 | with open(file_name, "r") as f: 64 | for l in f.read().splitlines(): 65 | word_pairs.append(standardize_string(l).split()) 66 | category.append(c) 67 | category_high_level.append("wikipedia-dict" if file_name in wikipedia_dict else "wordnet") 68 | 69 | if subsample: 70 | assert 0 <= subsample <= 1.0 71 | rng = check_random_state(rng) 72 | ids = rng.choice(range(len(word_pairs)), int(subsample * len(word_pairs)), replace=False) 73 | word_pairs = [word_pairs[i] for i in ids] 74 | category = [category[i] for i in ids] 75 | category_high_level = [category_high_level[i] for i in ids] 76 | 77 | wordnet_categories = {'Antonym', 78 | 'Attribute', 79 | 'Causes', 80 | 'DerivedFrom', 81 | 'Entails', 82 | 'HasContext', 83 | 'InstanceOf', 84 | 'IsA', 85 | 'MadeOf', 86 | 'MemberOf', 87 | 'PartOf', 88 | 'RelatedTo', 89 | 'SimilarTo'} 90 | 91 | wikipedia_categories = {'adjective-to-adverb', 92 | 'all-capital-cities', 93 | 'city-in-state', 94 | 'comparative', 95 | 'currency', 96 | 'man-woman', 97 | 'nationality-adjective', 98 | 'past-tense', 99 | 'plural-nouns', 100 | 'plural-verbs', 101 | 'present-participle', 102 | 'superlative'} 103 | 104 | return Bunch(category_high_level=np.array(category_high_level), 105 | X=np.array(word_pairs), 106 | category=np.array(category), 107 | wikipedia_categories=wordnet_categories, 108 | wordnet_categories=wikipedia_categories) 109 | 110 | 111 | def fetch_google_analogy(): 112 | """ 113 | Fetch Google dataset for testing both semantic and syntactic analogies. 114 | 115 | Returns 116 | ------- 117 | data : sklearn.datasets.base.Bunch 118 | dictionary-like object. Keys of interest: 119 | 'X': matrix of word questions 120 | 'y': vector of answers 121 | 'category': name of category 122 | 'category_high_level': name of high level category (semantic/syntactic) 123 | 124 | References 125 | ---------- 126 | Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff, 127 | "Distributed representations of words and phrases and their compositionality", 2013 128 | 129 | Notes 130 | ----- 131 | This dataset is a subset of WordRep dataset. 
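Examples
--------
A minimal usage sketch (the file is downloaded on first call; outputs are omitted
here because they depend on the fetched data)::

    >>> data = fetch_google_analogy()                              # doctest: +SKIP
    >>> question, answer = data.X[0], data.y[0]                    # three query words and the expected fourth
    >>> semantic = data.X[data.category_high_level == "semantic"]  # boolean masks work on the Bunch arrays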
132 | 133 | """ 134 | 135 | url = "https://www.dropbox.com/s/eujtyfb5zem1mim/EN-GOOGLE.txt?dl=1" 136 | with open(_fetch_file(url, "analogy/EN-GOOGLE", verbose=0), "r") as f: 137 | L = f.read().splitlines() 138 | 139 | # Simple 4 word analogy questions with categories 140 | questions = [] 141 | answers = [] 142 | category = [] 143 | cat = None 144 | for l in L: 145 | if l.startswith(":"): 146 | cat =l.lower().split()[1] 147 | else: 148 | words = standardize_string(l).split() 149 | questions.append(words[0:3]) 150 | answers.append(words[3]) 151 | category.append(cat) 152 | 153 | assert set(category) == set(['gram3-comparative', 'gram8-plural', 'capital-common-countries', 154 | 'city-in-state', 'family', 'gram9-plural-verbs', 'gram2-opposite', 155 | 'currency', 'gram4-superlative', 'gram6-nationality-adjective', 156 | 'gram7-past-tense', 157 | 'gram5-present-participle', 'capital-world', 'gram1-adjective-to-adverb']) 158 | 159 | 160 | syntactic = set([c for c in set(category) if c.startswith("gram")]) 161 | category_high_level = [] 162 | for cat in category: 163 | category_high_level.append("syntactic" if cat in syntactic else "semantic") 164 | 165 | # dtype=object for memory efficiency 166 | return Bunch(X=np.vstack(questions).astype("object"), 167 | y=np.hstack(answers).astype("object"), 168 | category=np.hstack(category).astype("object"), 169 | category_high_level=np.hstack(category_high_level).astype("object")) 170 | 171 | 172 | 173 | def fetch_msr_analogy(): 174 | """ 175 | Fetch MSR dataset for testing performance on syntactic analogies 176 | 177 | Returns 178 | ------- 179 | data : sklearn.datasets.base.Bunch 180 | dictionary-like object. Keys of interest: 181 | 'X': matrix of word questions 182 | 'y': vector of answers 183 | 'category': name of category 184 | 'category_high_level': name of high level category (noun/adjective/verb) 185 | 186 | References 187 | ---------- 188 | Originally published at http://research.microsoft.com/en-us/projects/rnn/. 189 | 190 | Notes 191 | ----- 192 | Authors description: "more precisely, we tagged 267M words of newspaper text 193 | with Treebank POS tags (Marcus et al., 1993). We then selected 100 of the most frequent comparative adjectives 194 | (words labeled JJR); 100 of the most frequent plural nouns (NNS); 100 of the most frequent possessive nouns 195 | (NN POS); and 100 of the most frequent base form verbs (VB). 196 | We then systematically generated analogy questions by randomly matching each of the 100 words with 5 other words 197 | from the same category, and creating variants. 
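Examples
--------
Illustrative only (the file is downloaded on first call)::

    >>> data = fetch_msr_analogy()              # doctest: +SKIP
    >>> data.X[0], data.y[0]                    # three query words and the expected answer
    >>> sorted(set(data.category_high_level))   # ['adjective', 'noun', 'verb']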
198 | """ 199 | url = "https://www.dropbox.com/s/ne0fib302jqbatw/EN-MSR.txt?dl=1" 200 | with open(_fetch_file(url, "analogy/EN-MSR", verbose=0), "r") as f: 201 | L = f.read().splitlines() 202 | 203 | # Typical 4 words analogy questions 204 | questions = [] 205 | answers = [] 206 | category = [] 207 | for l in L: 208 | words = standardize_string(l).split() 209 | questions.append(words[0:3]) 210 | answers.append(words[4]) 211 | category.append(words[3]) 212 | 213 | verb = set([c for c in set(category) if c.startswith("VB")]) 214 | noun = set([c for c in set(category) if c.startswith("NN")]) 215 | category_high_level = [] 216 | for cat in category: 217 | if cat in verb: 218 | category_high_level.append("verb") 219 | elif cat in noun: 220 | category_high_level.append("noun") 221 | else: 222 | category_high_level.append("adjective") 223 | 224 | assert set([c.upper() for c in category]) == set(['VBD_VBZ', 'VB_VBD', 'VBZ_VBD', 225 | 'VBZ_VB', 'NNPOS_NN', 'JJR_JJS', 'JJS_JJR', 'NNS_NN', 'JJR_JJ', 226 | 'NN_NNS', 'VB_VBZ', 'VBD_VB', 'JJS_JJ', 'NN_NNPOS', 'JJ_JJS', 'JJ_JJR']) 227 | 228 | return Bunch(X=np.vstack(questions).astype("object"), 229 | y=np.hstack(answers).astype("object"), 230 | category=np.hstack(category).astype("object"), 231 | category_high_level=np.hstack(category_high_level).astype("object")) 232 | 233 | 234 | # TODO: rewrite to a more standarized version 235 | def fetch_semeval_2012_2(which="all", which_scoring="golden"): 236 | """ 237 | Fetch dataset used for SEMEVAL 2012 task 2 competition 238 | 239 | Parameters 240 | ------- 241 | which : "all", "train" or "test" 242 | which_scoring: "golden" or "platinium" (see Notes) 243 | 244 | Returns 245 | ------- 246 | data : sklearn.datasets.base.Bunch 247 | dictionary-like object. Keys of interest: 248 | 'X_prot': dictionary keyed on category. Each entry is a matrix of prototype word pairs (see Notes) 249 | 'X': dictionary keyed on category. Each entry is a matrix of question word pairs 250 | 'y': dictionary keyed on category. Each entry is a dictionary word pair -> score 251 | 252 | 'categories_names': dictionary keyed on category. Each entry is a human readable name of 253 | category. 254 | 'categories_descriptions': dictionary keyed on category. Each entry is a human readable description of 255 | category. 256 | 257 | References 258 | ---------- 259 | DA Jurgens et al., 260 | "Measuring degrees of relational similarity. In *SEM 2012: The First Joint Conference on Lexical 261 | and Computational Semantics", 2012 262 | 263 | Notes 264 | ----- 265 | Dataset used in competition was scored as in golden scoring (which_scoring) parameter, however 266 | organiser have release improved labels afterwards (platinium scoring) 267 | 268 | The task is, given two pairs of words, A:B and C:D, determine the degree to which the semantic relations between 269 | A and B are similar to those between C and D. Unlike the more familiar task of semantic relation identification, 270 | which assigns each word pair to a discrete semantic relation class, this task recognizes the continuous range of 271 | degrees of relational similarity. The challenge is to determine the degrees of relational similarity between a 272 | given reference word pair and a variety of other pairs, mostly in the same general semantic relation class as the 273 | reference pair. 
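Examples
--------
A sketch of how the per-category dictionaries line up (illustrative only)::

    >>> bunch = fetch_semeval_2012_2(which="train")          # doctest: +SKIP
    >>> for cat, prototypes in bunch.X_prot.items():         # 3 prototype pairs per category
    ...     questions, scores = bunch.X[cat], bunch.y[cat]   # aligned question pairs and their scores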
274 | """ 275 | assert which in ['all', 'train', 'test'] 276 | assert which_scoring in ['golden', 'platinium'] 277 | 278 | path = _fetch_file(url="https://www.dropbox.com/sh/aarqsfnumx3d8ds/AAB05Mu2HdypP0pudGrNjooaa?dl=1", 279 | data_dir="analogy", 280 | uncompress=True, 281 | move="EN-SEMVAL-2012-2/EN-SEMVAL-2012-2.zip", 282 | verbose=0) 283 | 284 | train_files = set(glob.glob(os.path.join(path, "train*.txt"))) - \ 285 | set(glob.glob(os.path.join(path, "train*_meta.txt"))) 286 | test_files = set(glob.glob(os.path.join(path, "test*.txt"))) - \ 287 | set(glob.glob(os.path.join(path, "test*_meta.txt"))) 288 | 289 | if which == "train": 290 | files = train_files 291 | elif which == "test": 292 | files = test_files 293 | elif which == "all": 294 | files = train_files.union(test_files) 295 | 296 | # Every question is formed as similarity to analogy category that is 297 | # posed as a list of 3 prototype word pairs 298 | questions = defaultdict(list) 299 | prototypes = {} 300 | golden_scores = {} 301 | platinium_scores = {} 302 | scores = {"platinium": platinium_scores, "golden": golden_scores} 303 | categories_names = {} 304 | categories_descriptions = {} 305 | for f in files: 306 | with open(f[0:-4] + "_meta.txt") as meta_f: 307 | meta = meta_f.read().splitlines()[1].split(",") 308 | 309 | with open(os.path.dirname(f) + "/pl-" + os.path.basename(f)) as f_pl: 310 | platinium = f_pl.read().splitlines() 311 | 312 | with open(f) as f_gl: 313 | golden = f_gl.read().splitlines() 314 | 315 | assert platinium[0] == golden[0], ("Incorrect file for ", f) 316 | 317 | c = meta[0] + "_" + meta[1] 318 | categories_names[c] = meta[2] + "_" + meta[3] 319 | categories_descriptions[c] = meta[4] 320 | 321 | prototypes[c] = [l.split(":") for l in \ 322 | platinium[0].replace(": ", ":").replace(" ", ",").replace(".", "").split(",")] 323 | golden_scores[c] = {} 324 | platinium_scores[c] = {} 325 | questions_raw = [] 326 | for line_pl in platinium[1:]: 327 | word_pair, score = line_pl.split() 328 | questions_raw.append(word_pair) 329 | questions[c].append([standardize_string(w) for w in word_pair.split(":")]) 330 | platinium_scores[c][word_pair] = score 331 | 332 | for line_g in golden[1:]: 333 | word_pair, score = line_g.split() 334 | golden_scores[c][word_pair] = score 335 | 336 | # Make scores a list 337 | platinium_scores[c] = [platinium_scores[c][w] for w in questions_raw] 338 | golden_scores[c] = [golden_scores[c][w] for w in questions_raw] 339 | 340 | return Bunch(X_prot=_change_list_to_np(prototypes), 341 | X=_change_list_to_np(questions), 342 | y=scores[which_scoring], 343 | categories_names=categories_names, 344 | categories_descriptions=categories_descriptions) 345 | 346 | 347 | -------------------------------------------------------------------------------- /web/datasets/categorization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Functions for fetching categorization datasets 5 | """ 6 | 7 | from sklearn.datasets.base import Bunch 8 | from .utils import _get_cluster_assignments 9 | 10 | 11 | def fetch_AP(): 12 | """ 13 | Fetch Almuhareb and Abdulrahman categorization dataset 14 | 15 | Returns 16 | ------- 17 | data : sklearn.datasets.base.Bunch 18 | dictionary-like object. 
Keys of interest: 19 | 'clusters': dict of arrays of words representing 20 | 21 | References 22 | ---------- 23 | Almuhareb et al., "Concept learning and categorization from the web", 2005 24 | 25 | Notes 26 | ----- 27 | Authors description: 28 | Our goal was to create a dataset balanced with respect to 29 | three factors: class type, frequency, and ambiguity. 30 | First of all, we aimed to include one class of nouns for 31 | each of the 21 unique beginners of the WordNet noun 32 | hierarchy4 33 | . We chose subclasses for each of these 21 34 | beginners that would represent a reasonably natural cluster: 35 | e.g., the hyponym social occasion for the unique beginner 36 | event. From each such class, we selected between 13 and 21 37 | nouns to be representative concepts for the class (e.g., 38 | ceremony, feast, and graduation for the class social 39 | occasion). 40 | Secondly, we aimed to include about 1/3 high frequency 41 | nouns, 1/3 medium frequency, and 1/3 low frequency. Noun 42 | frequencies where estimated using the British National 43 | Corpus. We considered as highly frequent those nouns with 44 | frequency 1,000 or more; as medium frequent the nouns 45 | with between 1,000 and 100 occurrences; and those between 46 | 100 and 5 as low frequent. 47 | Thirdly, we wanted the dataset to be balanced as to 48 | ambiguity, estimated on the basis of the number of senses in 49 | WordNet. Nouns with 4 or more senses were considered 50 | highly ambiguous; nouns with 2 or 3 senses medium 51 | ambiguous; and nouns with a single sense as not ambiguous. 52 | """ 53 | return _get_cluster_assignments(dataset_name="EN-AP", 54 | url="https://www.dropbox.com/sh/6xu1c1aan8f83p3/AACMyoLwncNhRkUkqvGurYB6a?dl=1") 55 | 56 | 57 | def fetch_BLESS(): 58 | """ 59 | Fetch Baroni and Marco categorization dataset 60 | 61 | Parameters 62 | ------- 63 | 64 | Returns 65 | ------- 66 | data : sklearn.datasets.base.Bunch 67 | dictionary-like object. Keys of interest: 68 | 'X': words 69 | 'y': cluster assignment 70 | 71 | References 72 | ---------- 73 | Baroni et al. "How we BLESSed distributional semantic evaluation", 2011 74 | 75 | Notes 76 | ----- 77 | Data set includes 200 concrete nouns (100 animate and 100 inanimate nouns) 78 | from different classes (e.g., tools, clothing, vehicles, animals, etc.). 79 | """ 80 | return _get_cluster_assignments(dataset_name="EN-BLESS", 81 | url="https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1") 82 | 83 | 84 | def fetch_battig(): 85 | """ 86 | Fetch 1969 Battig dataset 87 | 88 | Returns 89 | ------- 90 | data : sklearn.datasets.base.Bunch 91 | dictionary-like object. Keys of interest: 92 | 'X': words 93 | 'y': cluster assignment 94 | 'freq': frequency of response 95 | 'frequency': Kucera-Francis word frequency 96 | 'rank': rank of frequence within response 97 | 'rfreq': rated frequency 98 | 99 | References 100 | ---------- 101 | W.F Battig & W.E Montague (1968). Category norms for verbal items in 56 categories: A replication 102 | and extension of the Connecticut norms using University of Maryland and Illinois students 103 | (Tech. Rep.) University of Colorado, Boulder, CO (1968) 104 | 105 | Notes 106 | ----- 107 | This dataset comprises a ranked list of 5231 words listed in 56 taxonomic categories by people 108 | who were asked to list as many exemplars of a given category ("a precious stone", "a unit of time", 109 | "a fruit", "a color", etc.). 
Participants had 30s to generate as many responses to each category as 110 | possible, after which time the next category name was presented. 111 | Included in this dataset are all words from the Battig and Montague (1969) norms listed with 112 | freq > 1. 113 | 114 | This is not the same dataset as 'battig' in Baroni et al. "Don’t count, predict! A systematic comparison of 115 | context-counting vs. context-predicting semantic vectors" 116 | """ 117 | data = _get_cluster_assignments(dataset_name="EN-BATTIG", 118 | url="https://www.dropbox.com/sh/ckp4yu7k7xl7u2a/AABhmpgU3ake3T9liA9BR8EBa?dl=1", 119 | sep=",", skip_header=True) 120 | return Bunch(X=data.X[:, 0], y=data.y, 121 | freq=data.X[:, 1], frequency=data.X[:, 2], rank=data.X[:, 3], rfreq=data.X[:, 4]) 122 | 123 | 124 | 125 | def fetch_ESSLI_2c(): 126 | """ 127 | Fetch ESSLI 2c task categorization dataset 128 | 129 | Returns 130 | ------- 131 | data : sklearn.datasets.base.Bunch 132 | dictionary-like object. Keys of interest: 133 | 'X': words 134 | 'y': cluster assignment 135 | 136 | References 137 | ---------- 138 | Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:verb_categorization 139 | 140 | Notes 141 | ----- 142 | The goal of the sub-task is to group verbs into semantic categories. The data set consists of 45 verbs, 143 | belonging to 9 semantic classes. The classification scheme is inspired by P. Vinson & G. Vigliocco (2007), 144 | “Semantic Feature Production Norms for a Large Set of Objects and Events”, Behavior Research Methods, 145 | which in turn closely follows the classification proposed in Levin (1993). The data set consists of 44 concrete 146 | nouns, belonging to 6 semantic categories (four animates and two inanimates). The nouns are included in the 147 | feature norms described in McRae et al. (2005) 148 | """ 149 | return _get_cluster_assignments(dataset_name="EN-ESSLI-2c", 150 | url="https://www.dropbox.com/sh/d3mcyl3b5mawfhm/AAABygW1rguhI4L0XSw_I68ta?dl=1") 151 | 152 | 153 | def fetch_ESSLI_2b(): 154 | """ 155 | Fetch ESSLI 2c task categorization dataset 156 | 157 | Parameters 158 | ------- 159 | 160 | Returns 161 | ------- 162 | data : sklearn.datasets.base.Bunch 163 | dictionary-like object. Keys of interest: 164 | 'X': words 165 | 'y': cluster assignment 166 | 167 | References 168 | ---------- 169 | Originally published at 170 | http://wordspace.collocations.de/doku.php/data:esslli2008:abstract_concrete_nouns_discrimination. 171 | 172 | Notes 173 | ----- 174 | The data set consists of 40 nouns extracted from the MRC Psycholinguistic Database, with ratings by human subjects 175 | on the concreteness scale. The nouns have been classified into three classes: HI, LO and ME being highly, 176 | low and medium abstract nouns. 177 | """ 178 | return _get_cluster_assignments(dataset_name="EN-ESSLI-2b", 179 | url="https://www.dropbox.com/sh/7gdv52gy9vb4mf2/AACExLgHdbvbBrRZBP6CcdDaa?dl=1") 180 | 181 | 182 | def fetch_ESSLI_1a(): 183 | """ 184 | Fetch ESSLI 1a task categorization dataset. 185 | 186 | Returns 187 | ------- 188 | data : sklearn.datasets.base.Bunch 189 | dictionary-like object. Keys of interest: 190 | 'X': words 191 | 'y': cluster assignment 192 | 193 | References 194 | ---------- 195 | Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization. 196 | 197 | Notes 198 | ----- 199 | The goal of the sub-task is to group concrete nouns into semantic categories. 
200 | The data set consists of 44 concrete nouns, belonging to 6 semantic categories (four animates and two inanimates). 201 | The nouns are included in the feature norms described in McRae et al. (2005) 202 | """ 203 | return _get_cluster_assignments(dataset_name="EN-ESSLI-1a", 204 | url="https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1") 205 | -------------------------------------------------------------------------------- /web/datasets/similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Functions for fetching similarity datasets 5 | """ 6 | 7 | import os 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.datasets.base import Bunch 12 | 13 | from .utils import _get_as_pd, _fetch_file 14 | 15 | 16 | def fetch_MTurk(): 17 | """ 18 | Fetch MTurk dataset for testing attributional similarity 19 | 20 | Returns 21 | ------- 22 | data : sklearn.datasets.base.Bunch 23 | dictionary-like object. Keys of interest: 24 | 'X': matrix of 2 words per column, 25 | 'y': vector with scores, 26 | 27 | References 28 | ---------- 29 | Radinsky, Kira et al., "A Word at a Time: Computing Word Relatedness Using Temporal Semantic Analysis", 2011 30 | 31 | Notes 32 | ----- 33 | Human labeled examples of word semantic relatedness. The data pairs were generated using an algorithm as 34 | described in the paper by [K. Radinsky, E. Agichtein, E. Gabrilovich, S. Markovitch.]. 35 | Each pair of words was evaluated by 10 people on a scale of 1-5. 36 | 37 | Additionally scores were multiplied by factor of 2. 38 | """ 39 | data = _get_as_pd('https://www.dropbox.com/s/f1v4ve495mmd9pw/EN-TRUK.txt?dl=1', 40 | 'similarity', header=None, sep=" ").values 41 | return Bunch(X=data[:, 0:2].astype("object"), 42 | y=2 * data[:, 2].astype(np.float)) 43 | 44 | 45 | def fetch_MEN(which="all", form="natural"): 46 | """ 47 | Fetch MEN dataset for testing similarity and relatedness 48 | 49 | Parameters 50 | ---------- 51 | which : "all", "test" or "dev" 52 | form : "lem" or "natural" 53 | 54 | Returns 55 | ------- 56 | data : sklearn.datasets.base.Bunch 57 | dictionary-like object. Keys of interest: 58 | 'X': matrix of 2 words per column, 59 | 'y': vector with scores 60 | 61 | References 62 | ---------- 63 | Published at http://clic.cimec.unitn.it/~elia.bruni/MEN.html. 64 | 65 | Notes 66 | ----- 67 | Scores for MEN are calculated differently than in WS353 or SimLex999. 68 | Furthermore scores where rescaled to 0 - 10 scale to match standard scaling. 69 | 70 | The MEN Test Collection contains two sets of English word pairs (one for training and one for testing) 71 | together with human-assigned similarity judgments, obtained by crowdsourcing using Amazon Mechanical 72 | Turk via the CrowdFlower interface. The collection can be used to train and/or test computer algorithms 73 | implementing semantic similarity and relatedness measures. 
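The similarity fetchers that follow all share one Bunch layout: X is an (n, 2) array of word pairs and y a vector of human scores. A minimal sketch of scoring an embedding against MTurk with Spearman correlation (the standard recipe, not a verbatim copy of web/evaluate.py); the 50-dimensional GloVe vectors are used only to keep loading time down:

    import numpy as np
    from scipy.stats import spearmanr

    from web.datasets.similarity import fetch_MTurk
    from web.embeddings import fetch_GloVe

    def cosine(a, b):
        return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    w = fetch_GloVe(corpus="wiki-6B", dim=50)
    data = fetch_MTurk()                   # .X: word pairs, .y: human scores (already doubled, see above)

    mean = np.mean(w.vectors, axis=0)      # crude fallback for out-of-vocabulary words
    predicted = [cosine(w.get(a, mean), w.get(b, mean)) for a, b in data.X]
    print("MTurk Spearman rho: %.3f" % spearmanr(predicted, data.y).correlation)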
74 | """ 75 | if which == "dev": 76 | data = _get_as_pd('https://www.dropbox.com/s/c0hm5dd95xapenf/EN-MEN-LEM-DEV.txt?dl=1', 77 | 'similarity', header=None, sep=" ") 78 | elif which == "test": 79 | data = _get_as_pd('https://www.dropbox.com/s/vdmqgvn65smm2ah/EN-MEN-LEM-TEST.txt?dl=1', 80 | 'similarity/EN-MEN-LEM-TEST', header=None, sep=" ") 81 | elif which == "all": 82 | data = _get_as_pd('https://www.dropbox.com/s/b9rv8s7l32ni274/EN-MEN-LEM.txt?dl=1', 83 | 'similarity', header=None, sep=" ") 84 | else: 85 | raise RuntimeError("Not recognized which parameter") 86 | 87 | if form == "natural": 88 | # Remove last two chars from first two columns 89 | data = data.apply(lambda x: [y if isinstance(y, float) else y[0:-2] for y in x]) 90 | elif form != "lem": 91 | raise RuntimeError("Not recognized form argument") 92 | 93 | return Bunch(X=data.values[:, 0:2].astype("object"), y=data.values[:, 2:].astype(np.float) / 5.0) 94 | 95 | 96 | def fetch_WS353(which="all"): 97 | """ 98 | Fetch WS353 dataset for testing attributional and 99 | relatedness similarity 100 | 101 | Parameters 102 | ---------- 103 | which : 'all': for both relatedness and attributional similarity, 104 | 'relatedness': for relatedness similarity 105 | 'similarity': for attributional similarity 106 | 'set1': as divided by authors 107 | 'set2': as divided by authors 108 | 109 | References 110 | ---------- 111 | Finkelstein, Gabrilovich, "Placing Search in Context: The Concept Revisited†", 2002 112 | Agirre, Eneko et al., "A Study on Similarity and Relatedness Using Distributional and WordNet-based Approaches", 113 | 2009 114 | 115 | Returns 116 | ------- 117 | data : sklearn.datasets.base.Bunch 118 | dictionary-like object. Keys of interest: 119 | 'X': matrix of 2 words per column, 120 | 'y': vector with scores, 121 | 'sd': vector of std of scores if available (for set1 and set2) 122 | """ 123 | if which == "all": 124 | data = _get_as_pd('https://www.dropbox.com/s/eqal5qj97ajaycz/EN-WS353.txt?dl=1', 125 | 'similarity', header=0, sep="\t") 126 | elif which == "relatedness": 127 | data = _get_as_pd('https://www.dropbox.com/s/x94ob9zg0kj67xg/EN-WSR353.txt?dl=1', 128 | 'similarity', header=None, sep="\t") 129 | elif which == "similarity": 130 | data = _get_as_pd('https://www.dropbox.com/s/ohbamierd2kt1kp/EN-WSS353.txt?dl=1', 131 | 'similarity', header=None, sep="\t") 132 | elif which == "set1": 133 | data = _get_as_pd('https://www.dropbox.com/s/opj6uxzh5ov8gha/EN-WS353-SET1.txt?dl=1', 134 | 'similarity', header=0, sep="\t") 135 | elif which == "set2": 136 | data = _get_as_pd('https://www.dropbox.com/s/w03734er70wyt5o/EN-WS353-SET2.txt?dl=1', 137 | 'similarity', header=0, sep="\t") 138 | else: 139 | raise RuntimeError("Not recognized which parameter") 140 | 141 | # We basically select all the columns available 142 | X = data.values[:, 0:2] 143 | y = data.values[:, 2].astype(np.float) 144 | 145 | # We have also scores 146 | if data.values.shape[1] > 3: 147 | sd = np.std(data.values[:, 2:15].astype(np.float), axis=1).flatten() 148 | return Bunch(X=X.astype("object"), y=y, sd=sd) 149 | else: 150 | return Bunch(X=X.astype("object"), y=y) 151 | 152 | 153 | def fetch_RG65(): 154 | """ 155 | Fetch Rubenstein and Goodenough dataset for testing attributional and 156 | relatedness similarity 157 | 158 | Returns 159 | ------- 160 | data : sklearn.datasets.base.Bunch 161 | dictionary-like object. 
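fetch_WS353 above accepts five values for `which`, and the per-pair standard deviation is attached whenever the downloaded file carries the individual rater columns. A small sketch that just enumerates the variants:

    from web.datasets.similarity import fetch_WS353

    for which in ("all", "relatedness", "similarity", "set1", "set2"):
        data = fetch_WS353(which=which)
        print("%-12s %4d pairs, per-pair sd available: %s" % (which, len(data.y), "sd" in data))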
Keys of interest: 162 | 'X': matrix of 2 words per column, 163 | 'y': vector with scores, 164 | 'sd': vector of std of scores if available (for set1 and set2) 165 | 166 | References 167 | ---------- 168 | Rubenstein, Goodenough, "Contextual correlates of synonymy", 1965 169 | 170 | Notes 171 | ----- 172 | Scores were scaled by factor 10/4 173 | """ 174 | data = _get_as_pd('https://www.dropbox.com/s/chopke5zqly228d/EN-RG-65.txt?dl=1', 175 | 'similarity', header=None, sep="\t").values 176 | 177 | return Bunch(X=data[:, 0:2].astype("object"), 178 | y=data[:, 2].astype(np.float) * 10.0 / 4.0) 179 | 180 | 181 | def fetch_RW(): 182 | """ 183 | Fetch Rare Words dataset for testing attributional similarity 184 | 185 | Returns 186 | ------- 187 | data : sklearn.datasets.base.Bunch 188 | dictionary-like object. Keys of interest: 189 | 'X': matrix of 2 words per column, 190 | 'y': vector with scores, 191 | 'sd': vector of std of scores 192 | 193 | References 194 | ---------- 195 | Published at http://www-nlp.stanford.edu/~lmthang/morphoNLM/. 196 | 197 | Notes 198 | ----- 199 | 2034 word pairs that are relatively rare with human similarity scores. Rare word selection: our choices of 200 | rare words (word1) are based on their frequencies – based on five bins (5, 10], (10, 100], (100, 1000], 201 | (1000, 10000], and the affixes they possess. To create a diverse set of candidates, we randomly 202 | select 15 words for each configuration (a frequency bin, an affix). At the scale of Wikipedia, 203 | a word with frequency of 1-5 is most likely a junk word, and even restricted to words with 204 | frequencies above five, there are still many non-English words. To counter such problems, 205 | each word selected is required to have a non-zero number of synsets in WordNet(Miller, 1995). 206 | """ 207 | data = _get_as_pd('https://www.dropbox.com/s/xhimnr51kcla62k/EN-RW.txt?dl=1', 208 | 'similarity', header=None, sep="\t").values 209 | return Bunch(X=data[:, 0:2].astype("object"), 210 | y=data[:, 2].astype(np.float), 211 | sd=np.std(data[:, 3:].astype(np.float))) 212 | 213 | 214 | def fetch_multilingual_SimLex999(which="EN"): 215 | """ 216 | Fetch Multilingual SimLex999 dataset for testing attributional similarity 217 | 218 | Parameters 219 | ------- 220 | which : "EN", "RU", "IT" or "DE" for language 221 | 222 | Returns 223 | ------- 224 | data : sklearn.datasets.base.Bunch 225 | dictionary-like object. Keys of interest: 226 | 'X': matrix of 2 words per column, 227 | 'y': vector with scores, 228 | 'sd': vector of sd of scores, 229 | 230 | References 231 | ---------- 232 | Published at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html. 233 | 234 | Notes 235 | ----- 236 | Scores for EN are different than the original SimLex999 dataset. 237 | 238 | Authors description: 239 | Multilingual SimLex999 resource consists of translations of the SimLex999 word similarity data set to 240 | three languages: German, Italian and Russian. Each of the translated datasets is scored by 241 | 13 human judges (crowdworkers) - all fluent speakers of its language. For consistency, we 242 | also collected human judgments for the original English corpus according to the same protocol 243 | applied to the other languages. This dataset allows to explore the impact of the "judgement language" 244 | (the language in which word pairs are presented to the human judges) on the resulted similarity scores 245 | and to evaluate vector space models on a truly multilingual setup (i.e. 
when both the training and the 246 | test data are multilingual). 247 | """ 248 | if which == "EN": 249 | data = _get_as_pd('https://www.dropbox.com/s/nczc4ao6koqq7qm/EN-MSIM999.txt?dl=1', 250 | 'similarity', header=None, encoding='utf-8', sep=" ") 251 | elif which == "DE": 252 | data = _get_as_pd('https://www.dropbox.com/s/ucpwrp0ahawsdtf/DE-MSIM999.txt?dl=1', 253 | 'similarity', header=None, encoding='utf-8', sep=" ") 254 | elif which == "IT": 255 | data = _get_as_pd('https://www.dropbox.com/s/siqjagyz8dkjb9q/IT-MSIM999.txt?dl=1', 256 | 'similarity', header=None, encoding='utf-8', sep=" ") 257 | elif which == "RU": 258 | data = _get_as_pd('https://www.dropbox.com/s/3v26edm9a31klko/RU-MSIM999.txt?dl=1', 259 | 'similarity', header=None, encoding='utf-8', sep=" ") 260 | else: 261 | raise RuntimeError("Not recognized which parameter") 262 | 263 | # We basically select all the columns available 264 | X = data.values[:, 0:2] 265 | scores = data.values[:, 2:].astype(np.float) 266 | y = np.mean(scores, axis=1) 267 | sd = np.std(scores, axis=1) 268 | 269 | return Bunch(X=X.astype("object"), y=y, sd=sd) 270 | 271 | 272 | def fetch_SimLex999(): 273 | """ 274 | Fetch SimLex999 dataset for testing attributional similarity 275 | 276 | Returns 277 | ------- 278 | data : sklearn.datasets.base.Bunch 279 | dictionary-like object. Keys of interest: 280 | 'X': matrix of 2 words per column, 281 | 'y': vector with scores, 282 | 'sd': vector of sd of scores, 283 | 'conc': matrix with columns conc(w1), conc(w2) and concQ the from dataset 284 | 'POS': vector with POS tag 285 | 'assoc': matrix with columns denoting free association: Assoc(USF) and SimAssoc333 286 | 287 | References 288 | ---------- 289 | Hill, Felix et al., "Simlex-999: Evaluating semantic models with (genuine) similarity estimation", 2014 290 | 291 | Notes 292 | ----- 293 | SimLex-999 is a gold standard resource for the evaluation of models that learn the meaning of words and concepts. 294 | SimLex-999 provides a way of measuring how well models capture similarity, rather than relatedness or 295 | association. The scores in SimLex-999 therefore differ from other well-known evaluation datasets 296 | such as WordSim-353 (Finkelstein et al. 2002). The following two example pairs illustrate the 297 | difference - note that clothes are not similar to closets (different materials, function etc.), 298 | even though they are very much related: coast - shore 9.00 9.10, clothes - closet 1.96 8.00 299 | """ 300 | data = _get_as_pd('https://www.dropbox.com/s/0jpa1x8vpmk3ych/EN-SIM999.txt?dl=1', 301 | 'similarity', sep="\t") 302 | 303 | # We basically select all the columns available 304 | X = data[['word1', 'word2']].values 305 | y = data['SimLex999'].values 306 | sd = data['SD(SimLex)'].values 307 | conc = data[['conc(w1)', 'conc(w2)', 'concQ']].values 308 | POS = data[['POS']].values 309 | assoc = data[['Assoc(USF)', 'SimAssoc333']].values 310 | 311 | return Bunch(X=X.astype("object"), y=y, sd=sd, conc=conc, POS=POS, assoc=assoc) 312 | 313 | 314 | def fetch_TR9856(): 315 | """ 316 | Fetch TR9856 dataset for testing multi-word term relatedness 317 | 318 | Returns 319 | ------- 320 | data : sklearn.datasets.base.Bunch 321 | dictionary-like object. Keys of interest: 322 | 'X': matrix of 2 words per column, 323 | 'y': vector with scores, 324 | 'topic': vector of topics providing context for each pair of terms 325 | 326 | References 327 | ---------- 328 | Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015. 
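Besides X and y, fetch_SimLex999 above exposes the auxiliary columns of the original file, which makes it easy to score sub-slices. A sketch under two assumptions about the upstream file (not guaranteed by this code): the POS column uses tags such as 'N', and SimAssoc333 is a 0/1 flag marking the strongly associated subset.

    from web.datasets.similarity import fetch_SimLex999

    data = fetch_SimLex999()

    nouns = data.POS.flatten() == "N"       # assumed tag value; inspect the column if unsure
    assoc333 = data.assoc[:, 1] == 1        # SimAssoc333 assumed to be a 0/1 flag

    print("all pairs:  ", len(data.y))
    print("noun pairs: ", int(nouns.sum()))
    print("SimAssoc333:", int(assoc333.sum()))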
329 | 330 | Notes 331 | ----- 332 | """ 333 | data = pd.read_csv(os.path.join(_fetch_file( 334 | 'https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip', 335 | 'similarity', uncompress=True, verbose=0), 336 | 'IBM_Debater_(R)_TR9856.v0.2', 'TermRelatednessResults.csv'), encoding="iso-8859-1") 337 | 338 | # We basically select all the columns available 339 | X = data[['term1', 'term2']].values 340 | y = data['score'].values 341 | topic = data['topic'].values 342 | 343 | return Bunch(X=X.astype("object"), y=y, topic=topic) 344 | -------------------------------------------------------------------------------- /web/datasets/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloading datasets: utility functions 3 | 4 | This is a copy of nilearn.datasets. 5 | """ 6 | 7 | import errno 8 | import os 9 | import numpy as np 10 | import base64 11 | import collections 12 | import contextlib 13 | import fnmatch 14 | import hashlib 15 | import shutil 16 | import tempfile 17 | import time 18 | import sys 19 | import tarfile 20 | import warnings 21 | import zipfile 22 | import glob 23 | import pandas as pd 24 | from tqdm import tqdm 25 | from sklearn.datasets.base import Bunch 26 | from .._utils.compat import _basestring, cPickle, _urllib, md5_hash 27 | 28 | 29 | TEMP = tempfile.gettempdir() 30 | 31 | 32 | def _makedirs(path): # https://stackoverflow.com/a/600612/223267 33 | try: 34 | os.makedirs(path) 35 | except OSError as e: 36 | if e.errno == errno.EEXIST and os.path.isdir(path): 37 | pass 38 | else: 39 | raise 40 | 41 | 42 | 43 | def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False): 44 | data_dir = _get_dataset_dir("categorization", verbose=0) 45 | _fetch_file(url=url, 46 | data_dir=data_dir, 47 | uncompress=True, 48 | move="{0}/{0}.txt".format(dataset_name), 49 | verbose=0) 50 | files = glob.glob(os.path.join(data_dir, dataset_name + "/*.txt")) 51 | X = [] 52 | y = [] 53 | names = [] 54 | for cluster_id, file_name in enumerate(files): 55 | with open(file_name) as f: 56 | lines = f.read().splitlines()[(int(skip_header)):] 57 | 58 | X += [l.split(sep) for l in lines] 59 | y += [os.path.basename(file_name).split(".")[0]] * len(lines) 60 | return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object")) 61 | 62 | def _get_as_pd(url, dataset_name, **read_csv_kwargs): 63 | return pd.read_csv(_fetch_file(url, dataset_name, verbose=0), **read_csv_kwargs) 64 | 65 | def _change_list_to_np(dict): 66 | return {k: np.array(dict[k], dtype="object") for k in dict} 67 | 68 | def _format_time(t): 69 | if t > 60: 70 | return "%4.1fmin" % (t / 60.) 71 | else: 72 | return " %5.1fs" % (t) 73 | 74 | 75 | def _md5_sum_file(path): 76 | """ Calculates the MD5 sum of a file. 77 | """ 78 | with open(path, 'rb') as f: 79 | m = hashlib.md5() 80 | while True: 81 | data = f.read(8192) 82 | if not data: 83 | break 84 | m.update(data) 85 | return m.hexdigest() 86 | 87 | 88 | def _read_md5_sum_file(path): 89 | """ Reads a MD5 checksum file and returns hashes as a dictionary. 
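A small sketch of how these two checksum helpers fit together: _md5_sum_file hashes a local file, while _read_md5_sum_file parses a listing of "hash name" lines into a dict keyed by filename. The file names and hash value below are placeholders:

    from web.datasets.utils import _md5_sum_file, _read_md5_sum_file

    expected = _read_md5_sum_file("checksums.md5")    # e.g. {'EN-RG-65.txt': '9d3f...'} (illustrative)
    actual = _md5_sum_file("EN-RG-65.txt")
    if expected.get("EN-RG-65.txt") != actual:
        raise ValueError("Checksum mismatch - the download may be corrupted")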
90 | """ 91 | with open(path, "r") as f: 92 | hashes = {} 93 | while True: 94 | line = f.readline() 95 | if not line: 96 | break 97 | h, name = line.rstrip().split(' ', 1) 98 | hashes[name] = h 99 | return hashes 100 | 101 | 102 | def readlinkabs(link): 103 | """ 104 | Return an absolute path for the destination 105 | of a symlink 106 | """ 107 | path = os.readlink(link) 108 | if os.path.isabs(path): 109 | return path 110 | return os.path.join(os.path.dirname(link), path) 111 | 112 | 113 | 114 | def _chunk_report_(bytes_so_far, total_size, initial_size, t0): 115 | """Show downloading percentage. 116 | 117 | Parameters 118 | ---------- 119 | bytes_so_far: int 120 | Number of downloaded bytes 121 | 122 | total_size: int 123 | Total size of the file (may be 0/None, depending on download method). 124 | 125 | t0: int 126 | The time in seconds (as returned by time.time()) at which the 127 | download was resumed / started. 128 | 129 | initial_size: int 130 | If resuming, indicate the initial size of the file. 131 | If not resuming, set to zero. 132 | """ 133 | 134 | if not total_size: 135 | sys.stderr.write("Downloaded %d of ? bytes\r" % (bytes_so_far)) 136 | 137 | else: 138 | # Estimate remaining download time 139 | total_percent = float(bytes_so_far) / total_size 140 | 141 | current_download_size = bytes_so_far - initial_size 142 | bytes_remaining = total_size - bytes_so_far 143 | dt = time.time() - t0 144 | download_rate = current_download_size / max(1e-8, float(dt)) 145 | # Minimum rate of 0.01 bytes/s, to avoid dividing by zero. 146 | time_remaining = bytes_remaining / max(0.01, download_rate) 147 | 148 | # Trailing whitespace is to erase extra char when message length 149 | # varies 150 | sys.stderr.write( 151 | "Downloaded %d of %d bytes (%0.2f%%, %s remaining) \r" 152 | % (bytes_so_far, total_size, total_percent * 100, 153 | _format_time(time_remaining))) 154 | 155 | 156 | def _chunk_read_(response, local_file, chunk_size=8192, report_hook=None, 157 | initial_size=0, total_size=None, verbose=1): 158 | """Download a file chunk by chunk and show advancement 159 | 160 | Parameters 161 | ---------- 162 | response: _urllib.response.addinfourl 163 | Response to the download request in order to get file size 164 | 165 | local_file: file 166 | Hard disk file where data should be written 167 | 168 | chunk_size: int, optional 169 | Size of downloaded chunks. Default: 8192 170 | 171 | report_hook: bool 172 | Whether or not to show downloading advancement. Default: None 173 | 174 | initial_size: int, optional 175 | If resuming, indicate the initial size of the file 176 | 177 | total_size: int, optional 178 | Expected final size of download (None means it is unknown). 179 | 180 | verbose: int, optional 181 | verbosity level (0 means no message). 182 | 183 | Returns 184 | ------- 185 | data: string 186 | The downloaded file. 
187 | 188 | """ 189 | 190 | 191 | try: 192 | if total_size is None: 193 | total_size = response.info().get('Content-Length').strip() 194 | total_size = int(total_size) + initial_size 195 | except Exception as e: 196 | if verbose > 1: 197 | print("Warning: total size could not be determined.") 198 | if verbose > 2: 199 | print("Full stack trace: %s" % e) 200 | total_size = None 201 | bytes_so_far = initial_size 202 | 203 | # t0 = time.time() 204 | if report_hook: 205 | pbar = tqdm(total=total_size, unit="b", unit_scale=True) 206 | 207 | while True: 208 | chunk = response.read(chunk_size) 209 | bytes_so_far += len(chunk) 210 | 211 | if not chunk: 212 | if report_hook: 213 | # sys.stderr.write('\n') 214 | pbar.close() 215 | break 216 | 217 | local_file.write(chunk) 218 | if report_hook: 219 | pbar.update(len(chunk)) # This is better because works in ipython 220 | # _chunk_report_(bytes_so_far, total_size, initial_size, t0) 221 | 222 | if report_hook: 223 | pbar.close() 224 | 225 | return 226 | 227 | 228 | def _get_dataset_dir(sub_dir=None, data_dir=None, default_paths=None, 229 | verbose=1): 230 | """ Create if necessary and returns data directory of given dataset. 231 | 232 | Parameters 233 | ---------- 234 | sub_dir: string 235 | Name of sub-dir 236 | 237 | data_dir: string, optional 238 | Path of the data directory. Used to force data storage in a specified 239 | location. Default: None 240 | 241 | default_paths: list of string, optional 242 | Default system paths in which the dataset may already have been 243 | installed by a third party software. They will be checked first. 244 | 245 | verbose: int, optional 246 | verbosity level (0 means no message). 247 | 248 | Returns 249 | ------- 250 | data_dir: string 251 | Path of the given dataset directory. 252 | 253 | Notes 254 | ----- 255 | This function retrieves the datasets directory (or data directory) using 256 | the following priority : 257 | 1. defaults system paths 258 | 2. the keyword argument data_dir 259 | 3. the global environment variable WEB_SHARED_DATA 260 | 4. the user environment variable WEB_DATA 261 | 5. web_data in the user home folder 262 | """ 263 | # We build an array of successive paths by priority 264 | # The boolean indicates if it is a pre_dir: in that case, we won't add the 265 | # dataset name to the path. 
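In practice this priority list means every download location can be redirected with a single environment variable before the fetchers are used. A minimal sketch; the directory name is arbitrary:

    import os
    os.environ["WEB_DATA"] = os.path.expanduser("~/my_benchmark_data")   # hypothetical location

    from web.datasets.utils import _get_dataset_dir
    print(_get_dataset_dir("similarity", verbose=2))
    # -> ~/my_benchmark_data/similarity (created on first use), unless a 'similarity'
    #    folder already exists in another search location such as ~/web_data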
266 | paths = [] 267 | 268 | 269 | # Search given environment variables 270 | if default_paths is not None: 271 | for default_path in default_paths: 272 | paths.extend([(d, True) for d in default_path.split(':')]) 273 | 274 | # Check data_dir which force storage in a specific location 275 | if data_dir is not None: 276 | paths.extend([(d, False) for d in data_dir.split(':')]) 277 | else: 278 | global_data = os.getenv('WEB_SHARED_DATA') 279 | if global_data is not None: 280 | paths.extend([(d, False) for d in global_data.split(':')]) 281 | 282 | local_data = os.getenv('WEB_DATA') 283 | if local_data is not None: 284 | paths.extend([(d, False) for d in local_data.split(':')]) 285 | 286 | paths.append((os.path.expanduser('~/web_data'), False)) 287 | 288 | if verbose > 2: 289 | print('Dataset search paths: %s' % paths) 290 | 291 | # Check if the dataset exists somewhere 292 | for path, is_pre_dir in paths: 293 | if not is_pre_dir and sub_dir: 294 | path = os.path.join(path, sub_dir) 295 | if os.path.islink(path): 296 | # Resolve path 297 | path = readlinkabs(path) 298 | if os.path.exists(path) and os.path.isdir(path): 299 | if verbose > 1: 300 | print('\nDataset found in %s\n' % path) 301 | return path 302 | 303 | # If not, create a folder in the first writeable directory 304 | errors = [] 305 | for (path, is_pre_dir) in paths: 306 | if not is_pre_dir and sub_dir: 307 | path = os.path.join(path, sub_dir) 308 | if not os.path.exists(path): 309 | try: 310 | _makedirs(path) 311 | if verbose > 0: 312 | print('\nDataset created in %s\n' % path) 313 | return path 314 | except Exception as exc: 315 | short_error_message = getattr(exc, 'strerror', str(exc)) 316 | errors.append('\n -{0} ({1})'.format( 317 | path, short_error_message)) 318 | 319 | raise OSError('Web tried to store the dataset in the following ' 320 | 'directories, but:' + ''.join(errors)) 321 | 322 | 323 | def _uncompress_file(file_, delete_archive=True, verbose=1): 324 | """Uncompress files contained in a data_set. 325 | 326 | Parameters 327 | ---------- 328 | file: string 329 | path of file to be uncompressed. 330 | 331 | delete_archive: bool, optional 332 | Wheteher or not to delete archive once it is uncompressed. 333 | Default: True 334 | 335 | verbose: int, optional 336 | verbosity level (0 means no message). 337 | 338 | Notes 339 | ----- 340 | This handles zip, tar, gzip and bzip files only. 341 | """ 342 | if verbose > 0: 343 | print('Extracting data from %s...' 
% file_) 344 | data_dir = os.path.dirname(file_) 345 | # We first try to see if it is a zip file 346 | try: 347 | filename, ext = os.path.splitext(file_) 348 | with open(file_, "rb") as fd: 349 | header = fd.read(4) 350 | processed = False 351 | if zipfile.is_zipfile(file_): 352 | z = zipfile.ZipFile(file_) 353 | z.extractall(data_dir) 354 | z.close() 355 | processed = True 356 | elif ext == '.gz' or header.startswith(b'\x1f\x8b'): 357 | import gzip 358 | gz = gzip.open(file_) 359 | if ext == '.tgz': 360 | filename = filename + '.tar' 361 | out = open(filename, 'wb') 362 | shutil.copyfileobj(gz, out, 8192) 363 | gz.close() 364 | out.close() 365 | # If file is .tar.gz, this will be handle in the next case 366 | if delete_archive: 367 | os.remove(file_) 368 | file_ = filename 369 | filename, ext = os.path.splitext(file_) 370 | processed = True 371 | if tarfile.is_tarfile(file_): 372 | with contextlib.closing(tarfile.open(file_, "r")) as tar: 373 | tar.extractall(path=data_dir) 374 | processed = True 375 | if not processed: 376 | raise IOError( 377 | "[Uncompress] unknown archive file format: %s" % file_) 378 | if delete_archive: 379 | os.remove(file_) 380 | if verbose > 0: 381 | print(' ...done.') 382 | except Exception as e: 383 | if verbose > 0: 384 | print('Error uncompressing file: %s' % e) 385 | raise 386 | 387 | 388 | def _filter_column(array, col, criteria): 389 | """ Return index array matching criteria 390 | 391 | Parameters 392 | ---------- 393 | 394 | array: numpy array with columns 395 | Array in which data will be filtered 396 | 397 | col: string 398 | Name of the column 399 | 400 | criteria: integer (or float), pair of integers, string or list of these 401 | if integer, select elements in column matching integer 402 | if a tuple, select elements between the limits given by the tuple 403 | if a string, select elements that match the string 404 | """ 405 | # Raise an error if the column does not exist. This is the only way to 406 | # test it across all possible types (pandas, recarray...) 407 | try: 408 | array[col] 409 | except: 410 | raise KeyError('Filtering criterion %s does not exist' % col) 411 | 412 | if (not isinstance(criteria, _basestring) and 413 | not isinstance(criteria, bytes) and 414 | not isinstance(criteria, tuple) and 415 | isinstance(criteria, collections.Iterable)): 416 | 417 | filter = np.zeros(array.shape[0], dtype=np.bool) 418 | for criterion in criteria: 419 | filter = np.logical_or(filter, 420 | _filter_column(array, col, criterion)) 421 | return filter 422 | 423 | if isinstance(criteria, tuple): 424 | if len(criteria) != 2: 425 | raise ValueError("An interval must have 2 values") 426 | if criteria[0] is None: 427 | return array[col] <= criteria[1] 428 | if criteria[1] is None: 429 | return array[col] >= criteria[0] 430 | filter = array[col] <= criteria[1] 431 | return np.logical_and(filter, array[col] >= criteria[0]) 432 | 433 | return array[col] == criteria 434 | 435 | 436 | def _filter_columns(array, filters, combination='and'): 437 | """ Return indices of recarray entries that match criteria. 438 | 439 | Parameters 440 | ---------- 441 | 442 | array: numpy array with columns 443 | Array in which data will be filtered 444 | 445 | filters: list of criteria 446 | See _filter_column 447 | 448 | combination: string, optional 449 | String describing the combination operator. Possible values are "and" 450 | and "or". 
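A toy sketch of what these filter helpers accept: a criterion may be a scalar (exact match), a 2-tuple (inclusive interval, with None leaving one end open) or a list of such criteria combined with OR, and multiple columns are joined with the `combination` operator. The record array here is made up for illustration:

    import numpy as np
    from web.datasets.utils import _filter_columns

    rows = np.array([("dog", 3, 9.1), ("cat", 7, 4.2), ("emu", 1, 6.5)],
                    dtype=[("word", "U8"), ("freq", "i4"), ("score", "f4")])

    mask = _filter_columns(rows, {"freq": (2, None), "word": ["dog", "cat"]}, combination="and")
    print(rows[mask])   # rows with freq >= 2 AND word in {dog, cat}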
451 | """ 452 | if combination == 'and': 453 | fcomb = np.logical_and 454 | mask = np.ones(array.shape[0], dtype=np.bool) 455 | elif combination == 'or': 456 | fcomb = np.logical_or 457 | mask = np.zeros(array.shape[0], dtype=np.bool) 458 | else: 459 | raise ValueError('Combination mode not known: %s' % combination) 460 | 461 | for column in filters: 462 | mask = fcomb(mask, _filter_column(array, column, filters[column])) 463 | return mask 464 | 465 | 466 | 467 | 468 | 469 | def _get_dataset_descr(ds_name): 470 | module_path = os.path.dirname(os.path.abspath(__file__)) 471 | 472 | fname = ds_name 473 | 474 | try: 475 | with open(os.path.join(module_path, 'description', fname + '.rst'))\ 476 | as rst_file: 477 | descr = rst_file.read() 478 | except IOError: 479 | descr = '' 480 | 481 | if descr == '': 482 | print("Warning: Could not find dataset description.") 483 | 484 | return descr 485 | 486 | 487 | def movetree(src, dst): 488 | """Move an entire tree to another directory. Any existing file is 489 | overwritten""" 490 | names = os.listdir(src) 491 | 492 | # Create destination dir if it does not exist 493 | _makedirs(dst) 494 | errors = [] 495 | 496 | for name in names: 497 | srcname = os.path.join(src, name) 498 | dstname = os.path.join(dst, name) 499 | try: 500 | if os.path.isdir(srcname) and os.path.isdir(dstname): 501 | movetree(srcname, dstname) 502 | os.rmdir(srcname) 503 | else: 504 | shutil.move(srcname, dstname) 505 | except (IOError, os.error) as why: 506 | errors.append((srcname, dstname, str(why))) 507 | # catch the Error from the recursive movetree so that we can 508 | # continue with other files 509 | except Exception as err: 510 | errors.extend(err.args[0]) 511 | if errors: 512 | raise Exception(errors) 513 | 514 | 515 | # TODO: refactor, this function is a mess, it was adapted from other project 516 | # and it might have not been an optimal choice 517 | def _fetch_file(url, data_dir=TEMP, uncompress=False, move=False,md5sum=None, 518 | username=None, password=None, mock=False, handlers=[], resume=True, verbose=0): 519 | """Load requested dataset, downloading it if needed or requested. 520 | 521 | This function retrieves files from the hard drive or download them from 522 | the given urls. Note to developpers: All the files will be first 523 | downloaded in a sandbox and, if everything goes well, they will be moved 524 | into the folder of the dataset. This prevents corrupting previously 525 | downloaded data. In case of a big dataset, do not hesitate to make several 526 | calls if needed. 527 | 528 | Parameters 529 | ---------- 530 | dataset_name: string 531 | Unique dataset name 532 | 533 | resume: bool, optional 534 | If true, try to resume partially downloaded files 535 | 536 | uncompress: bool, optional 537 | If true, will uncompress zip 538 | 539 | move: str, optional 540 | If True, will move downloaded file to given relative path. 541 | NOTE: common usage is zip_file_id/zip_file.zip together 542 | with uncompress set to True 543 | 544 | md5sum: string, optional 545 | MD5 sum of the file. Checked if download of the file is required 546 | 547 | username: string, optional 548 | Username used for basic HTTP authentication 549 | 550 | password: string, optional 551 | Password used for basic HTTP authentication 552 | 553 | handlers: list of BaseHandler, optional 554 | urllib handlers passed to urllib.request.build_opener. Used by 555 | advanced users to customize request handling. 556 | 557 | data_dir: string, optional 558 | Path of the data directory. 
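_get_cluster_assignments and the embedding fetchers later in this dump all reduce to a call of the shape below: download into the chosen data directory, optionally move the archive under a stable sub-folder, and unzip next to it so that re-runs detect the data as already present. A sketch with a placeholder URL and archive name:

    from web.datasets.utils import _fetch_file, _get_dataset_dir

    path = _fetch_file(url="https://example.com/my-dataset.zip",        # placeholder
                       data_dir=_get_dataset_dir("categorization", verbose=0),
                       uncompress=True,
                       move="my-dataset/my-dataset.zip",                # keep the archive in its own folder
                       verbose=1)
    print(path)   # the directory the archive was extracted into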
Used to force data storage in a specified 559 | location. Default: None 560 | 561 | resume: bool, optional 562 | If true, try resuming download if possible 563 | 564 | verbose: int, optional 565 | verbosity level (0 means no message). 566 | 567 | Returns 568 | ------- 569 | files: list of string 570 | Absolute paths of downloaded files on disk 571 | """ 572 | 573 | # TODO: move to global scope and rename 574 | def _fetch_helper(url, data_dir=TEMP, resume=True, overwrite=False, 575 | md5sum=None, username=None, password=None, handlers=[], 576 | verbose=1): 577 | if not os.path.isabs(data_dir): 578 | data_dir = _get_dataset_dir(data_dir) 579 | 580 | # Determine data path 581 | _makedirs(data_dir) 582 | 583 | # Determine filename using URL 584 | parse = _urllib.parse.urlparse(url) 585 | file_name = os.path.basename(parse.path) 586 | if file_name == '': 587 | file_name = md5_hash(parse.path) 588 | 589 | temp_file_name = file_name + ".part" 590 | full_name = os.path.join(data_dir, file_name) 591 | temp_full_name = os.path.join(data_dir, temp_file_name) 592 | if os.path.exists(full_name): 593 | if overwrite: 594 | os.remove(full_name) 595 | else: 596 | return full_name 597 | if os.path.exists(temp_full_name): 598 | if overwrite: 599 | os.remove(temp_full_name) 600 | t0 = time.time() 601 | local_file = None 602 | initial_size = 0 603 | 604 | try: 605 | # Download data 606 | url_opener = _urllib.request.build_opener(*handlers) 607 | request = _urllib.request.Request(url) 608 | request.add_header('Connection', 'Keep-Alive') 609 | if username is not None and password is not None: 610 | if not url.startswith('https'): 611 | raise ValueError( 612 | 'Authentication was requested on a non secured URL (%s).' 613 | 'Request has been blocked for security reasons.' % url) 614 | # Note: HTTPBasicAuthHandler is not fitted here because it relies 615 | # on the fact that the server will return a 401 error with proper 616 | # www-authentication header, which is not the case of most 617 | # servers. 618 | encoded_auth = base64.b64encode( 619 | (username + ':' + password).encode()) 620 | request.add_header(b'Authorization', b'Basic ' + encoded_auth) 621 | if verbose > 0: 622 | displayed_url = url.split('?')[0] if verbose == 1 else url 623 | print('Downloading data from %s ...' % displayed_url) 624 | if resume and os.path.exists(temp_full_name): 625 | # Download has been interrupted, we try to resume it. 626 | local_file_size = os.path.getsize(temp_full_name) 627 | # If the file exists, then only download the remainder 628 | request.add_header("Range", "bytes=%s-" % (local_file_size)) 629 | try: 630 | data = url_opener.open(request) 631 | content_range = data.info().get('Content-Range') 632 | if (content_range is None or not content_range.startswith( 633 | 'bytes %s-' % local_file_size)): 634 | raise IOError('Server does not support resuming') 635 | except Exception: 636 | # A wide number of errors can be raised here. HTTPError, 637 | # URLError... I prefer to catch them all and rerun without 638 | # resuming. 
639 | if verbose > 0: 640 | print('Resuming failed, try to download the whole file.') 641 | return _fetch_helper( 642 | url, data_dir, resume=False, overwrite=overwrite, 643 | md5sum=md5sum, username=username, password=password, 644 | handlers=handlers, verbose=verbose) 645 | local_file = open(temp_full_name, "ab") 646 | initial_size = local_file_size 647 | else: 648 | data = url_opener.open(request) 649 | local_file = open(temp_full_name, "wb") 650 | _chunk_read_(data, local_file, report_hook=(verbose > 0), 651 | initial_size=initial_size, verbose=verbose) 652 | # temp file must be closed prior to the move 653 | if not local_file.closed: 654 | local_file.close() 655 | shutil.move(temp_full_name, full_name) 656 | dt = time.time() - t0 657 | if verbose > 0: 658 | print('...done. (%i seconds, %i min)' % (dt, dt // 60)) 659 | except _urllib.error.HTTPError as e: 660 | if verbose > 0: 661 | print('Error while fetching file %s. Dataset fetching aborted.' % 662 | (file_name)) 663 | if verbose > 1: 664 | print("HTTP Error: %s, %s" % (e, url)) 665 | raise 666 | except _urllib.error.URLError as e: 667 | if verbose > 0: 668 | print('Error while fetching file %s. Dataset fetching aborted.' % 669 | (file_name)) 670 | if verbose > 1: 671 | print("URL Error: %s, %s" % (e, url)) 672 | raise 673 | finally: 674 | if local_file is not None: 675 | if not local_file.closed: 676 | local_file.close() 677 | if md5sum is not None: 678 | if (_md5_sum_file(full_name) != md5sum): 679 | raise ValueError("File %s checksum verification has failed." 680 | " Dataset fetching aborted." % local_file) 681 | return full_name 682 | 683 | if not os.path.isabs(data_dir): 684 | data_dir = _get_dataset_dir(data_dir) 685 | 686 | 687 | # There are two working directories here: 688 | # - data_dir is the destination directory of the dataset 689 | # - temp_dir is a temporary directory dedicated to this fetching call. All 690 | # files that must be downloaded will be in this directory. If a corrupted 691 | # file is found, or a file is missing, this working directory will be 692 | # deleted. 693 | parse = _urllib.parse.urlparse(url) 694 | file_name = os.path.basename(parse.path) 695 | 696 | files_pickle = cPickle.dumps([(file_, url) for file_, url in zip([file_name], [url])]) 697 | files_md5 = hashlib.md5(files_pickle).hexdigest() 698 | temp_dir = os.path.join(data_dir, files_md5) 699 | 700 | # Create destination dir 701 | _makedirs(data_dir) 702 | 703 | # Abortion flag, in case of error 704 | abort = None 705 | 706 | # 2 possibilities: 707 | # - the file exists in data_dir, nothing to do (we have to account for move parameter here) 708 | # - the file does not exists: we download it in temp_dir 709 | 710 | # Target file in the data_dir 711 | target_file = os.path.join(data_dir, file_name) 712 | 713 | # Change move so we always uncompress to some folder (this is important for 714 | # detecting already downloaded files) 715 | # Ex. glove.4B.zip -> glove.4B/glove.4B.zip 716 | if uncompress and not move: 717 | dirname, _ = os.path.splitext(file_name) 718 | move = os.path.join(dirname, os.path.basename(file_name)) 719 | 720 | if (abort is None 721 | and not os.path.exists(target_file) 722 | and (not move or (move and uncompress and not os.path.exists(os.path.dirname(os.path.join(data_dir, move))))) 723 | or (move and not uncompress and not os.path.exists(os.path.join(data_dir, move)))): 724 | 725 | # Target file in temp dir 726 | temp_target_file = os.path.join(temp_dir, file_name) 727 | # We may be in a global read-only repository. 
If so, we cannot 728 | # download files. 729 | if not os.access(data_dir, os.W_OK): 730 | raise ValueError('Dataset files are missing but dataset' 731 | ' repository is read-only. Contact your data' 732 | ' administrator to solve the problem') 733 | 734 | if not os.path.exists(temp_dir): 735 | os.mkdir(temp_dir) 736 | 737 | dl_file = _fetch_helper(url, temp_dir, resume=resume, 738 | verbose=verbose, md5sum=md5sum, 739 | username=username, 740 | password=password, 741 | handlers=handlers) 742 | 743 | if (abort is None and not os.path.exists(target_file) and not 744 | os.path.exists(temp_target_file)): 745 | if not mock: 746 | warnings.warn('An error occured while fetching %s' % file_) 747 | abort = ("Dataset has been downloaded but requested file was " 748 | "not provided:\nURL:%s\nFile:%s" % 749 | (url, target_file)) 750 | else: 751 | _makedirs(os.path.dirname(temp_target_file)) 752 | open(temp_target_file, 'w').close() 753 | 754 | if move: 755 | move = os.path.join(data_dir, move) 756 | move_dir = os.path.dirname(move) 757 | _makedirs(move_dir) 758 | shutil.move(dl_file, move) 759 | dl_file = move 760 | target_file = dl_file 761 | 762 | if uncompress: 763 | try: 764 | if os.path.getsize(dl_file) != 0: 765 | _uncompress_file(dl_file, verbose=verbose) 766 | else: 767 | os.remove(dl_file) 768 | target_file = os.path.dirname(target_file) 769 | except Exception as e: 770 | abort = str(e) 771 | else: 772 | if verbose > 0: 773 | print("File already downloaded, skipping") 774 | 775 | if move: 776 | target_file = os.path.join(data_dir, move) 777 | 778 | if uncompress: 779 | target_file = os.path.dirname(target_file) 780 | 781 | if abort is not None: 782 | if os.path.exists(temp_dir): 783 | shutil.rmtree(temp_dir) 784 | raise IOError('Fetching aborted: ' + abort) 785 | # If needed, move files from temps directory to final directory. 786 | if os.path.exists(temp_dir): 787 | # XXX We could only moved the files requested 788 | # XXX Movetree can go wrong 789 | movetree(temp_dir, data_dir) 790 | shutil.rmtree(temp_dir) 791 | return target_file 792 | 793 | def _tree(path, pattern=None, dictionary=False): 794 | """ Return a directory tree under the form of a dictionaries and list 795 | 796 | Parameters: 797 | ----------- 798 | path: string 799 | Path browsed 800 | 801 | pattern: string, optional 802 | Pattern used to filter files (see fnmatch) 803 | 804 | dictionary: boolean, optional 805 | If True, the function will return a dict instead of a list 806 | """ 807 | files = [] 808 | dirs = [] if not dictionary else {} 809 | for file_ in os.listdir(path): 810 | file_path = os.path.join(path, file_) 811 | if os.path.isdir(file_path): 812 | if not dictionary: 813 | dirs.append((file_, _tree(file_path, pattern))) 814 | else: 815 | dirs[file_] = _tree(file_path, pattern) 816 | else: 817 | if pattern is None or fnmatch.fnmatch(file_, pattern): 818 | files.append(file_path) 819 | files = sorted(files) 820 | if not dictionary: 821 | return sorted(dirs) + files 822 | if len(dirs) == 0: 823 | return files 824 | if len(files) > 0: 825 | dirs['.'] = files 826 | return dirs 827 | -------------------------------------------------------------------------------- /web/embedding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for embedding. 
3 | 4 | NOTE: This file was adapted from the polyglot package 5 | """ 6 | 7 | import logging 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | import sys 12 | 13 | from six import text_type 14 | from six import PY2 15 | from six import iteritems 16 | from six import string_types 17 | from .utils import _open 18 | from .vocabulary import Vocabulary, CountedVocabulary, OrderedVocabulary 19 | from six.moves import cPickle as pickle 20 | from six.moves import range 21 | from functools import partial 22 | from .utils import standardize_string, to_utf8 23 | 24 | from sklearn.metrics import pairwise_distances 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | class Embedding(object): 30 | """ Mapping a vocabulary to a d-dimensional points.""" 31 | 32 | def __init__(self, vocabulary, vectors): 33 | self.vocabulary = vocabulary 34 | self.vectors = np.asarray(vectors) 35 | if len(self.vocabulary) != self.vectors.shape[0]: 36 | raise ValueError("Vocabulary has {} items but we have {} " 37 | "vectors." 38 | .format(len(vocabulary), self.vectors.shape[0])) 39 | 40 | if len(self.vocabulary.words) != len(set(self.vocabulary.words)): 41 | logger.warning("Vocabulary has duplicates.") 42 | 43 | def __getitem__(self, k): 44 | return self.vectors[self.vocabulary[k]] 45 | 46 | def __setitem__(self, k, v): 47 | if not v.shape[0] == self.vectors.shape[1]: 48 | raise RuntimeError("Please pass vector of len {}".format(self.vectors.shape[1])) 49 | 50 | if k not in self.vocabulary: 51 | self.vocabulary.add(k) 52 | self.vectors = np.vstack([self.vectors, v.reshape(1, -1)]) 53 | else: 54 | self.vectors[self.vocabulary[k]] = v 55 | 56 | def __contains__(self, k): 57 | return k in self.vocabulary 58 | 59 | def __delitem__(self, k): 60 | """Remove the word and its vector from the embedding. 61 | 62 | Note: 63 | This operation costs \\theta(n). Be careful putting it in a loop. 64 | """ 65 | index = self.vocabulary[k] 66 | del self.vocabulary[k] 67 | self.vectors = np.delete(self.vectors, index, 0) 68 | 69 | def __len__(self): 70 | return len(self.vocabulary) 71 | 72 | def __iter__(self): 73 | for w in self.vocabulary: 74 | yield w, self[w] 75 | 76 | @property 77 | def words(self): 78 | return self.vocabulary.words 79 | 80 | @property 81 | def shape(self): 82 | return self.vectors.shape 83 | 84 | def get(self, k, default=None): 85 | try: 86 | return self[k] 87 | except KeyError as e: 88 | return default 89 | 90 | def standardize_words(self, lower=False, clean_words=False, inplace=False): 91 | tw = self.transform_words(partial(standardize_string, lower=lower, clean_words=clean_words), inplace=inplace, 92 | lower=lower) 93 | 94 | if clean_words: 95 | tw = tw.transform_words(partial(lambda w: w.strip(" ")), inplace=inplace, lower=lower) 96 | return tw 97 | 98 | def transform_words(self, f, inplace=False, lower=False): 99 | """ 100 | Transform words in vocabulary according to following strategy. 101 | Prefer shortest and most often occurring words- after transforming by some (lambda f) function. 102 | 103 | This allow eliminate noisy and wrong coded words. 104 | 105 | Strategy is implemented for all types of Vocabulary- they can be polymorphicaly extended. 106 | 107 | Parameters 108 | ---------- 109 | f: lambda 110 | Function called on each word- for transformation it. 
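A compact sketch of the Embedding container itself and of transform_words, built from a throwaway three-word dictionary; the lambda simply lowercases, which is the typical use behind standardize_words:

    import numpy as np
    from web.embedding import Embedding

    d = {"Dog": np.array([1.0, 0.0]),
         "dog": np.array([0.0, 1.0]),
         "cat": np.array([1.0, 1.0])}
    w = Embedding.from_dict(d)

    print(len(w), w.shape)          # 3 words, 2-dimensional vectors
    print("cat" in w, w["cat"])     # dict-style membership test and lookup

    w2 = w.transform_words(lambda word: word.lower(), inplace=False, lower=True)
    print(w2.words)                 # case duplicates collapsed: only 'dog' and 'cat' remain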
111 | 112 | inplace: bool, default: False 113 | Return new Embedding instance or modify existing 114 | 115 | lower: bool, default: False 116 | If true, will convert all words to lowercase 117 | 118 | Returns 119 | ------- 120 | e: Embedding 121 | Instance of Embedding class with this same Vocabulary type as previous. 122 | """ 123 | id_map = OrderedDict() 124 | word_count = len(self.vectors) 125 | # store max word length before f(w)- in corpora 126 | words_len = {} 127 | # store max occurrence count of word 128 | counts = {} 129 | is_vocab_generic = False 130 | 131 | curr_words = self.vocabulary.words 132 | curr_vec = self.vectors 133 | 134 | if isinstance(self.vocabulary, CountedVocabulary): 135 | _, counter_of_words = self.vocabulary.getstate() 136 | elif isinstance(self.vocabulary, OrderedVocabulary): 137 | # range in python3 is lazy 138 | counter_of_words = range(len(self.vocabulary.words) - 1, -1, -1) 139 | 140 | elif isinstance(self.vocabulary, Vocabulary): 141 | is_vocab_generic = True 142 | # if corpora contain lowercase version of word i- for case Vocabulary 143 | lowered_words = {} 144 | 145 | if lower: 146 | 147 | for w, v in zip(self.vocabulary.words, self.vectors): 148 | wl = w.lower() 149 | if wl == w: 150 | lowered_words[wl] = v 151 | elif wl != w and wl not in lowered_words: 152 | lowered_words[wl] = v 153 | 154 | curr_words = list(lowered_words.keys()) 155 | curr_vec = np.asanyarray(list(lowered_words.values())) 156 | 157 | else: 158 | raise NotImplementedError( 159 | 'This kind of Vocabulary is not implemented in transform_words strategy and can not be matched') 160 | 161 | for id, w in enumerate(curr_words): 162 | 163 | fw = f(w) 164 | if len(fw) and fw not in id_map: 165 | id_map[fw] = id 166 | 167 | if not is_vocab_generic: 168 | counts[fw] = counter_of_words[id] 169 | words_len[fw] = len(w) 170 | 171 | # overwrite 172 | elif len(fw) and fw in id_map: 173 | if not is_vocab_generic and counter_of_words[id] > counts[fw]: 174 | id_map[fw] = id 175 | 176 | counts[fw] = counter_of_words[id] 177 | words_len[fw] = len(w) 178 | elif is_vocab_generic and len(w) < words_len[fw]: 179 | # for generic Vocabulary 180 | id_map[fw] = id 181 | 182 | words_len[fw] = len(w) 183 | elif not is_vocab_generic and counter_of_words[id] == counts[fw] and len(w) < words_len[fw]: 184 | id_map[fw] = id 185 | 186 | counts[fw] = counter_of_words[id] 187 | words_len[fw] = len(w) 188 | 189 | logger.warning("Overwriting {}".format(fw)) 190 | 191 | if isinstance(self.vocabulary, CountedVocabulary): 192 | words_only = id_map.keys() 193 | vectors = curr_vec[[id_map[w] for w in words_only]] 194 | words = {w: counter_of_words[id_map[w]] for w in words_only} 195 | 196 | elif isinstance(self.vocabulary, OrderedVocabulary): 197 | words = sorted(id_map.keys(), key=lambda x: id_map[x]) 198 | vectors = curr_vec[[id_map[w] for w in words]] 199 | 200 | elif isinstance(self.vocabulary, Vocabulary): 201 | words = sorted(id_map.keys(), key=lambda x: id_map[x]) 202 | vectors = curr_vec[[id_map[w] for w in words]] 203 | 204 | logger.info("Transformed {} into {} words".format(word_count, len(words))) 205 | 206 | if inplace: 207 | self.vectors = vectors 208 | self.vocabulary = self.vocabulary.__class__(words) 209 | 210 | return self 211 | else: 212 | return Embedding(vectors=vectors, vocabulary=self.vocabulary.__class__(words)) 213 | 214 | def most_frequent(self, k, inplace=False): 215 | """Only most frequent k words to be included in the embeddings.""" 216 | 217 | assert isinstance(self.vocabulary, OrderedVocabulary), \ 
218 | "most_frequent can be called only on Embedding with OrderedVocabulary" 219 | 220 | vocabulary = self.vocabulary.most_frequent(k) 221 | vectors = np.asarray([self[w] for w in vocabulary]) 222 | if inplace: 223 | self.vocabulary = vocabulary 224 | self.vectors = vectors 225 | return self 226 | return Embedding(vectors=vectors, vocabulary=vocabulary) 227 | 228 | def normalize_words(self, ord=2, inplace=False): 229 | """Normalize embeddings matrix row-wise. 230 | 231 | Parameters 232 | ---------- 233 | ord: normalization order. Possible values {1, 2, 'inf', '-inf'} 234 | """ 235 | if ord == 2: 236 | ord = None # numpy uses this flag to indicate l2. 237 | vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1) 238 | if inplace: 239 | self.vectors = vectors.T 240 | return self 241 | return Embedding(vectors=vectors.T, vocabulary=self.vocabulary) 242 | 243 | def nearest_neighbors(self, word, k=1, exclude=[], metric="cosine"): 244 | """ 245 | Find nearest neighbor of given word 246 | 247 | Parameters 248 | ---------- 249 | word: string or vector 250 | Query word or vector. 251 | 252 | k: int, default: 1 253 | Number of nearest neighbours to return. 254 | 255 | metric: string, default: 'cosine' 256 | Metric to use. 257 | 258 | exclude: list, default: [] 259 | Words to omit in answer 260 | 261 | Returns 262 | ------- 263 | n: list 264 | Nearest neighbors. 265 | """ 266 | if isinstance(word, string_types): 267 | assert word in self, "Word not found in the vocabulary" 268 | v = self[word] 269 | else: 270 | v = word 271 | 272 | D = pairwise_distances(self.vectors, v.reshape(1, -1), metric=metric) 273 | 274 | if isinstance(word, string_types): 275 | D[self.vocabulary.word_id[word]] = D.max() 276 | 277 | for w in exclude: 278 | D[self.vocabulary.word_id[w]] = D.max() 279 | 280 | return [self.vocabulary.id_word[id] for id in D.argsort(axis=0).flatten()[0:k]] 281 | 282 | @staticmethod 283 | def from_gensim(model): 284 | word_count = {} 285 | vectors = [] 286 | for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count): 287 | word = standardize_string(word) 288 | if word: 289 | vectors.append(model.syn0[vocab.index]) 290 | word_count[word] = vocab.count 291 | vocab = CountedVocabulary(word_count=word_count) 292 | vectors = np.asarray(vectors) 293 | return Embedding(vocabulary=vocab, vectors=vectors) 294 | 295 | @staticmethod 296 | def from_word2vec_vocab(fvocab): 297 | counts = {} 298 | with _open(fvocab) as fin: 299 | for line in fin: 300 | 301 | word, count = standardize_string(line).split() 302 | if word: 303 | counts[word] = int(count) 304 | return CountedVocabulary(word_count=counts) 305 | 306 | @staticmethod 307 | def _from_word2vec_binary(fname): 308 | with _open(fname, 'rb') as fin: 309 | words = [] 310 | header = fin.readline() 311 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 312 | logger.info("Loading #{} words with {} dim".format(vocab_size, layer1_size)) 313 | vectors = np.zeros((vocab_size, layer1_size), dtype=np.float32) 314 | binary_len = np.dtype("float32").itemsize * layer1_size 315 | for line_no in range(vocab_size): 316 | # mixed text and binary: read text first, then binary 317 | word = [] 318 | while True: 319 | ch = fin.read(1) 320 | if ch == b' ': 321 | break 322 | if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't) 323 | word.append(ch) 324 | 325 | words.append(b''.join(word).decode("latin-1")) 326 | vectors[line_no, :] = 
np.fromstring(fin.read(binary_len), dtype=np.float32) 327 | 328 | if len(words) < vocab_size: 329 | logger.warning("Omitted {} words".format(vocab_size - len(words))) 330 | elif len(words) > vocab_size: 331 | raise RuntimeError("Read too many words, incorrect file") 332 | 333 | return words, vectors 334 | 335 | @staticmethod 336 | def _from_word2vec_text(fname): 337 | with _open(fname, 'r') as fin: 338 | words = [] 339 | 340 | header = fin.readline() 341 | ignored = 0 342 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 343 | vectors = np.zeros(shape=(vocab_size, layer1_size), dtype=np.float32) 344 | for line_no, line in enumerate(fin): 345 | try: 346 | parts = text_type(line, encoding="utf-8").split(' ') 347 | w = parts[0] 348 | parts = list(map(lambda x: x.strip(), parts[1:])) 349 | parts.insert(0, w) 350 | 351 | except TypeError as e: 352 | parts = line.split(' ') 353 | w = parts[0] 354 | parts = list(map(lambda x: x.strip(), parts[1:])) 355 | parts.insert(0, w) 356 | 357 | except Exception as e: 358 | logger.warning("We ignored line number {} because of errors in parsing" 359 | "\n{}".format(line_no, e)) 360 | continue 361 | 362 | # We differ from Gensim implementation. 363 | # Our assumption that a difference of one happens because of having a 364 | # space in the word. 365 | if len(parts) == layer1_size + 1: 366 | word, vectors[line_no - ignored] = parts[0], list(map(np.float32, parts[1:])) 367 | elif len(parts) == layer1_size + 2 and parts[-1]: 368 | # last element after splitting is not empty- some glove corpora have additional space 369 | word, vectors[line_no - ignored] = parts[:2], list(map(np.float32, parts[2:])) 370 | word = u" ".join(word) 371 | elif not parts[-1]: 372 | # omit last value - empty string 373 | word, vectors[line_no - ignored] = parts[0], list(map(np.float32, parts[1:-1])) 374 | else: 375 | ignored += 1 376 | logger.warning("We ignored line number {} because of unrecognized " 377 | "number of columns {}".format(line_no, parts[:-layer1_size])) 378 | continue 379 | 380 | words.append(word) 381 | 382 | if ignored: 383 | vectors = vectors[0:-ignored] 384 | 385 | if len(words) < vocab_size: 386 | logger.warning("Omitted {} words".format(vocab_size - len(words))) 387 | elif len(words) > vocab_size: 388 | raise RuntimeError("Read too many words, incorrect file") 389 | 390 | return words, vectors 391 | 392 | @staticmethod 393 | def from_glove(fname, vocab_size, dim): 394 | with _open(fname, 'r') as fin: 395 | 396 | words = [] 397 | words_uniq = set() 398 | 399 | ignored = 0 400 | vectors = np.zeros(shape=(vocab_size, dim), dtype=np.float32) 401 | for line_no, line in enumerate(fin): 402 | try: 403 | parts = text_type(line, encoding="utf-8").split(' ') 404 | parts[1:] = map(lambda x: np.float32(x.strip()), parts[1:]) 405 | except TypeError as e: 406 | 407 | parts = line.split(' ') 408 | parts[1:] = map(lambda x: np.float32(x.strip()), parts[1:]) 409 | 410 | except Exception as e: 411 | ignored += 1 412 | 413 | logger.warning("We ignored line number {} because of errors in parsing" 414 | "\n{}".format(line_no, e)) 415 | continue 416 | 417 | try: 418 | if parts[0] not in words_uniq: 419 | word, vectors[line_no - ignored] = parts[0], list(parts[len(parts) - dim:]) 420 | words.append(word) 421 | words_uniq.add(word) 422 | else: 423 | ignored += 1 424 | logger.warning( 425 | "We ignored line number {} - following word is duplicated in file:\n{}\n".format(line_no, 426 | parts[0])) 427 | 428 | except Exception as e: 429 | ignored += 1 
430 | logger.warning("We ignored line number {} because of errors in parsing" 431 | "\n{}".format(line_no, e)) 432 | 433 | return Embedding(vocabulary=OrderedVocabulary(words), vectors=vectors[0:len(words)]) 434 | 435 | @staticmethod 436 | def from_dict(d): 437 | for k in d: # Standardize 438 | d[k] = np.array(d[k]).flatten() 439 | return Embedding(vectors=list(d.values()), vocabulary=Vocabulary(d.keys())) 440 | 441 | @staticmethod 442 | def to_word2vec(w, fname, binary=False): 443 | """ 444 | Store the input-hidden weight matrix in the same format used by the original 445 | C word2vec-tool, for compatibility. 446 | 447 | Parameters 448 | ---------- 449 | w: Embedding instance 450 | 451 | fname: string 452 | Destination file 453 | """ 454 | logger.info("storing %sx%s projection weights into %s" % (w.vectors.shape[0], w.vectors.shape[1], fname)) 455 | with _open(fname, 'wb') as fout: 456 | fout.write(to_utf8("%s %s\n" % w.vectors.shape)) 457 | # store in sorted order: most frequent words at the top 458 | for word, vector in zip(w.vocabulary.words, w.vectors): 459 | if binary: 460 | fout.write(to_utf8(word) + b" " + vector.astype("float32").tostring()) 461 | else: 462 | fout.write(to_utf8("%s %s\n" % (word, ' '.join("%.15f" % val for val in vector)))) 463 | 464 | @staticmethod 465 | def from_word2vec(fname, fvocab=None, binary=False): 466 | """ 467 | Load the input-hidden weight matrix from the original C word2vec-tool format. 468 | 469 | `binary` is a boolean indicating whether the data is in binary word2vec format. 470 | Word counts are read from `fvocab` filename, if set (this is the file generated 471 | by `-save-vocab` flag of the original C tool). 472 | """ 473 | vocabulary = None 474 | if fvocab is not None: 475 | logger.info("loading word counts from %s" % (fvocab)) 476 | vocabulary = Embedding.from_word2vec_vocab(fvocab) 477 | 478 | logger.info("loading projection weights from %s" % (fname)) 479 | if binary: 480 | words, vectors = Embedding._from_word2vec_binary(fname) 481 | else: 482 | words, vectors = Embedding._from_word2vec_text(fname) 483 | 484 | if not vocabulary: 485 | vocabulary = OrderedVocabulary(words=words) 486 | 487 | if len(words) != len(set(words)): 488 | raise RuntimeError("Vocabulary has duplicates") 489 | 490 | e = Embedding(vocabulary=vocabulary, vectors=vectors) 491 | 492 | return e 493 | 494 | @staticmethod 495 | def load(fname): 496 | """Load an embedding dump generated by `save`""" 497 | 498 | content = _open(fname).read() 499 | if PY2: 500 | state = pickle.loads(content, encoding='latin1') 501 | else: 502 | state = pickle.loads(content, encoding='latin1') 503 | voc, vec = state 504 | if len(voc) == 2: 505 | words, counts = voc 506 | word_count = dict(zip(words, counts)) 507 | vocab = CountedVocabulary(word_count=word_count) 508 | else: 509 | vocab = OrderedVocabulary(voc) 510 | return Embedding(vocabulary=vocab, vectors=vec) 511 | 512 | def save(self, fname): 513 | """Save a pickled version of the embedding into `fname`.""" 514 | 515 | vec = self.vectors 516 | voc = self.vocabulary.getstate() 517 | state = (voc, vec) 518 | with open(fname, 'wb') as f: 519 | pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL) 520 | -------------------------------------------------------------------------------- /web/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Fetchers for publicly available pretrained embeddings 4 | """ 5 | from six.moves import cPickle as pickle 6 | from os import 
path 7 | from .datasets.utils import _get_dataset_dir, _fetch_file 8 | from .embedding import Embedding 9 | 10 | def load_embedding(fname, format="word2vec_bin", normalize=True, 11 | lower=False, clean_words=False, load_kwargs={}): 12 | """ 13 | Loads embeddings from file 14 | 15 | Parameters 16 | ---------- 17 | fname: string 18 | Path to file containing embedding 19 | 20 | format: string 21 | Format of the embedding. Possible values are: 22 | 'word2vec_bin', 'word2vec', 'glove', 'dict' 23 | 24 | normalize: bool, default: True 25 | If true will normalize all vector to unit length 26 | 27 | clean_words: bool, default: True 28 | If true will only keep alphanumeric characters and "_", "-" 29 | Warning: shouldn't be applied to embeddings with non-ascii characters 30 | 31 | load_kwargs: 32 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 33 | should pass vocab_size and dim. 34 | """ 35 | assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format" 36 | if format == "word2vec_bin": 37 | w = Embedding.from_word2vec(fname, binary=True) 38 | elif format == "word2vec": 39 | w = Embedding.from_word2vec(fname, binary=False) 40 | elif format == "glove": 41 | w = Embedding.from_glove(fname, **load_kwargs) 42 | elif format == "dict": 43 | d = pickle.load(open(fname, "rb"), encoding='latin1') 44 | w = Embedding.from_dict(d) 45 | if normalize: 46 | w.normalize_words(inplace=True) 47 | if lower or clean_words: 48 | w.standardize_words(lower=lower, clean_words=clean_words, inplace=True) 49 | return w 50 | 51 | 52 | 53 | def fetch_GloVe(dim=300, corpus="wiki-6B", normalize=True, lower=False, clean_words=False): 54 | """ 55 | Fetches GloVe embeddings. 56 | 57 | Parameters 58 | ---------- 59 | dim: int, default: 300 60 | Dimensionality of embedding (usually performance increases with dimensionality). 61 | Available dimensionalities: 62 | * wiki-6B: 50, 100, 200, 300 63 | * common-crawl-42B: 300 64 | * common-crawl-840B: 300 65 | * twitter: 25, 50, 100, 200 66 | 67 | corpus: string, default: "wiki-6B" 68 | Corpus that GloVe vector were trained on. 69 | Available corpuses: "wiki-6B", "common-crawl-42B", "common-crawl-840B", "twitter-27B" 70 | 71 | normalize: bool, default: True 72 | If true will normalize all vector to unit length 73 | 74 | clean_words: bool, default: True 75 | If true will only keep alphanumeric characters and "_", "-" 76 | Warning: shouldn't be applied to embeddings with non-ascii characters 77 | 78 | load_kwargs: 79 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 80 | should pass vocab_size and dim. 
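A minimal illustrative call (the archive is downloaded and uncompressed on first use, so expect a long first run)::

        w = fetch_GloVe(corpus="wiki-6B", dim=50)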
81 | 82 | Returns 83 | ------- 84 | w: Embedding 85 | Embedding instance 86 | 87 | References 88 | ---------- 89 | Project website: http://nlp.stanford.edu/projects/glove/ 90 | 91 | Notes 92 | ----- 93 | Loading GloVe format can take a while 94 | """ 95 | download_file = { 96 | "wiki-6B": "http://nlp.stanford.edu/data/glove.6B.zip", 97 | "common-crawl-42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", 98 | "common-crawl-840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", 99 | "twitter-27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip" 100 | } 101 | 102 | embedding_file = { 103 | "wiki-6B": { 104 | 50: "glove.6B/glove.6B.50d.txt", 105 | 100: "glove.6B/glove.6B.100d.txt", 106 | 200: "glove.6B/glove.6B.200d.txt", 107 | 300: "glove.6B/glove.6B.300d.txt" 108 | }, 109 | "common-crawl-42B": { 110 | 300: "glove.42B.300d/glove.42B.300d.txt" 111 | }, 112 | "common-crawl-840B": { 113 | 300: "glove.840B.300d/glove.840B.300d.txt" 114 | }, 115 | "twitter-27B": { 116 | 25: "glove.twitter.27B/glove.twitter.27B.25d.txt", 117 | 50: "glove.twitter.27B/glove.twitter.27B.50d.txt", 118 | 100: "glove.twitter.27B/glove.twitter.27B.100d.txt", 119 | 200: "glove.twitter.27B/glove.twitter.27B.200d.txt", 120 | } 121 | } 122 | 123 | vocab_size = { 124 | "wiki-6B": 400000, 125 | "common-crawl-42B": 1917494, 126 | "common-crawl-840B": 2196017, 127 | "twitter-27B": 1193514 128 | } 129 | 130 | assert corpus in download_file, "Unrecognized corpus" 131 | assert dim in embedding_file[corpus], "Not available dimensionality" 132 | 133 | _ = _fetch_file(url=download_file[corpus], 134 | data_dir="embeddings", 135 | uncompress=True, 136 | verbose=1) 137 | 138 | return load_embedding(path.join(_get_dataset_dir("embeddings"), embedding_file[corpus][dim]), 139 | format="glove", 140 | normalize=normalize, 141 | lower=lower, clean_words=clean_words,\ 142 | load_kwargs={"vocab_size": vocab_size[corpus], "dim": dim}) 143 | 144 | 145 | 146 | def fetch_HPCA(which, normalize=True, lower=False, clean_words=False): 147 | """ 148 | Fetches Hellinger PCA based embeddings 149 | 150 | Parameters 151 | ---------- 152 | which: str, default: "autoencoder_phrase_hpca" 153 | Can choose between "hpca" and "autoencoder_phrase_hpca" (from "The Sum of Its Parts") 154 | 155 | normalize: bool, default: True 156 | If true will normalize all vector to unit length 157 | 158 | clean_words: bool, default: True 159 | If true will only keep alphanumeric characters and "_", "-" 160 | Warning: shouldn't be applied to embeddings with non-ascii characters 161 | 162 | load_kwargs: 163 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 164 | should pass vocab_size and dim. 
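A minimal illustrative call (downloads the pretrained binary on first use)::

        w = fetch_HPCA(which="hpca")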
165 | 166 | Returns 167 | ------- 168 | w: Embedding 169 | Instance of Embedding class 170 | 171 | References 172 | ---------- 173 | Published at http://lebret.ch/words/ 174 | Reference paper: Lebret, Collobert et al., “The Sum of Its Parts”: Joint Learning of Word and Phrase Representations with Autoencoders", 2015 175 | """ 176 | download_file = { 177 | "autoencoder_phrase_hpca": "https://www.dropbox.com/s/6dyf48crdmjbw1a/AHPCA.bin.gz?dl=1", 178 | "hpca": "https://www.dropbox.com/s/5y5l6vyn8yn11dv/HPCA.bin.gz?dl=1" 179 | } 180 | 181 | path = _fetch_file(url=download_file[which], 182 | data_dir="embeddings", 183 | uncompress=False, 184 | verbose=1) 185 | 186 | return load_embedding(path, format="word2vec_bin", normalize=normalize, lower=lower, clean_words=clean_words) 187 | 188 | 189 | 190 | def fetch_morphoRNNLM(which, normalize=True, lower=False, clean_words=False): 191 | """ 192 | Fetches recursive morphological neural network embeddings 193 | 194 | Parameters 195 | ---------- 196 | which: str, default: "CW" 197 | Can choose between CW and HSMN 198 | 199 | normalize: bool, default: True 200 | If true will normalize all vector to unit length 201 | 202 | clean_words: bool, default: True 203 | If true will only keep alphanumeric characters and "_", "-" 204 | Warning: shouldn't be applied to embeddings with non-ascii characters 205 | 206 | load_kwargs: 207 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 208 | should pass vocab_size and dim. 209 | 210 | Returns 211 | ------- 212 | w: Embedding 213 | Instance of Embedding class 214 | 215 | References 216 | ---------- 217 | Published at http://stanford.edu/~lmthang/morphoNLM/ 218 | Reference paper: Luong, Socher et al., "Better Word Representations with Recursive Neural Networks for Morphology", 2013 219 | """ 220 | download_file = { 221 | "CW": "https://www.dropbox.com/s/7fdj2666iqv4xbu/cwCsmRNN.bin.gz?dl=1", 222 | "HSMN": "https://www.dropbox.com/s/okw1i6kc6e2jd1q/hsmnCsmRNN.bin.gz?dl=1" 223 | } 224 | 225 | path = _fetch_file(url=download_file[which], 226 | data_dir="embeddings", 227 | uncompress=False, 228 | verbose=1) 229 | 230 | return load_embedding(path, format="word2vec_bin", normalize=normalize, lower=lower, clean_words=clean_words) 231 | 232 | 233 | 234 | 235 | 236 | def fetch_NMT(which="DE", normalize=True, lower=False, clean_words=False): 237 | """ 238 | Fetches word embeddings induced by Neural Translation Machine 239 | 240 | Parameters 241 | ---------- 242 | which: str, default: "DE" 243 | Can choose between DE and FR, which fetches accordingly EN -> DE or EN -> FR translation 244 | induced word embeddings 245 | 246 | normalize: bool, default: True 247 | If true will normalize all vector to unit length 248 | 249 | clean_words: bool, default: True 250 | If true will only keep alphanumeric characters and "_", "-" 251 | Warning: shouldn't be applied to embeddings with non-ascii characters 252 | 253 | load_kwargs: 254 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 255 | should pass vocab_size and dim. 256 | 257 | Returns 258 | ------- 259 | w: Embedding 260 | Instance of Embedding class 261 | 262 | References 263 | ---------- 264 | Published at https://www.cl.cam.ac.uk/~fh295/. 
265 | Reference paper: Hill, Cho et al., "Embedding Word Similarity With Neural Machine Translation", 2014 266 | """ 267 | dirname = _fetch_file(url="https://www.cl.cam.ac.uk/~fh295/TEmbz.tar.gz", 268 | data_dir="embeddings", 269 | uncompress=True, 270 | verbose=1) 271 | 272 | assert which in ["DE", "FR"], "Unrecognized which parameter" 273 | 274 | fname = {"FR": "Trans_embds/D_RNN_500k_144h.pkl", "DE": "Trans_embds/D_german_50k_500k_168h.pkl"} 275 | 276 | return load_embedding(path.join(dirname, fname[which]), 277 | format="dict", 278 | normalize=normalize, 279 | lower=lower, clean_words=clean_words) 280 | 281 | 282 | 283 | def fetch_PDC(dim=300, normalize=True, lower=False, clean_words=False): 284 | """ 285 | Fetches PDC embeddings trained on wiki by Fei Sun 286 | 287 | Parameters 288 | ---------- 289 | dim: int, default:300 290 | Dimensionality of embedding 291 | 292 | normalize: bool, default: True 293 | If true will normalize all vector to unit length 294 | 295 | clean_words: bool, default: True 296 | If true will only keep alphanumeric characters and "_", "-" 297 | Warning: shouldn't be applied to embeddings with non-ascii characters 298 | 299 | load_kwargs: 300 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 301 | should pass vocab_size and dim. 302 | 303 | Returns 304 | ------- 305 | w: Embedding 306 | Embedding instance 307 | 308 | References 309 | ---------- 310 | Embeddings were published on http://ofey.me/projects/wordrep/. 311 | Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng. 312 | "Learning word representations by jointly modeling syntagmatic and paradigmatic relations" 313 | """ 314 | 315 | url = { 316 | 50: "https://www.dropbox.com/s/0ofi1glri8l42y1/wikicorp.201004-pdc-" 317 | "iter-20-alpha-0.05-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1", 318 | 100: "https://www.dropbox.com/s/fmvegh4j62hulr0/wikicorp.201004-pdc-" 319 | "iter-20-alpha-0.05-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1", 320 | 300: "https://www.dropbox.com/s/jppkd6j2xxb9v48/wikicorp.201004-pdc-" 321 | "iter-20-alpha-0.05-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1" 322 | } 323 | assert dim in url, "Unavailable dimensionality" 324 | 325 | path = _fetch_file(url=url[dim], 326 | data_dir="embeddings", 327 | uncompress=False, 328 | move="pdc/pdc{}.txt.bz2".format(dim), 329 | verbose=1) 330 | 331 | return load_embedding(path, format="word2vec", normalize=normalize, lower=lower, clean_words=clean_words) 332 | 333 | 334 | def fetch_HDC(dim=300, normalize=True, lower=False, clean_words=False): 335 | """ 336 | Fetches PDC embeddings trained on wiki by Fei Sun 337 | 338 | Parameters 339 | ---------- 340 | dim: int, default:300 341 | Dimensionality of embedding 342 | 343 | normalize: bool, default: True 344 | If true will normalize all vector to unit length 345 | 346 | clean_words: bool, default: True 347 | If true will only keep alphanumeric characters and "_", "-" 348 | Warning: shouldn't be applied to embeddings with non-ascii characters 349 | 350 | load_kwargs: 351 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 352 | should pass vocab_size and dim. 353 | 354 | Returns 355 | ------- 356 | w: Embedding 357 | Embedding instance 358 | 359 | References 360 | ---------- 361 | Embeddings were published on http://ofey.me/projects/wordrep/. 362 | Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng. 
363 | "Learning word representations by jointly modeling syntagmatic and paradigmatic relations" 364 | """ 365 | 366 | url = { 367 | 50: "https://www.dropbox.com/s/q22ssy8055loknz/wikicorp.201004-hdc-" 368 | "iter-20-alpha-0.025-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1", 369 | 100: "https://www.dropbox.com/s/13226et55fi6g50/wikicorp.201004-hdc-" 370 | "iter-20-alpha-0.025-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1", 371 | 300: "https://www.dropbox.com/s/jrfwel32yd8w0lu/wikicorp.201004-hdc-" 372 | "iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1" 373 | } 374 | assert dim in url, "Unavailable dimensionality" 375 | 376 | path = _fetch_file(url=url[dim], 377 | data_dir="embeddings", 378 | uncompress=False, 379 | move="hdc/hdc{}.txt.bz2".format(dim), 380 | verbose=1) 381 | 382 | return load_embedding(path, format="word2vec", normalize=normalize, lower=lower, clean_words=clean_words) 383 | 384 | 385 | 386 | def fetch_SG_GoogleNews(normalize=True, lower=False, clean_words=False): 387 | """ 388 | Fetches SG (skip-gram with negative sampling) 389 | embeddings trained on GoogleNews dataset published on word2vec website 390 | 391 | Parameters 392 | ---------- 393 | normalize: bool, default: True 394 | If true will normalize all vector to unit length 395 | 396 | clean_words: bool, default: True 397 | If true will only keep alphanumeric characters and "_", "-" 398 | Warning: shouldn't be applied to embeddings with non-ascii characters 399 | 400 | load_kwargs: 401 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 402 | should pass vocab_size and dim. 403 | 404 | Returns 405 | ------- 406 | w: Embedding 407 | Instance of Embedding class 408 | 409 | References 410 | ---------- 411 | Original source: https://code.google.com/p/word2vec/ 412 | """ 413 | path = _fetch_file(url="https://www.dropbox.com/s/bnm0trligffakd9/GoogleNews-vectors-negative300.bin.gz?dl=1", 414 | data_dir="embeddings", 415 | verbose=1) 416 | return load_embedding(path, format="word2vec_bin", normalize=normalize, lower=lower, clean_words=clean_words) 417 | 418 | def fetch_LexVec(which="commoncrawl-W+C", normalize=True, lower=False, clean_words=False): 419 | """ 420 | Fetches LexVec embeddings 421 | 422 | Parameters 423 | ---------- 424 | which: str, default: "commoncrawl-W+C" 425 | Can choose between "commoncrawl-W", "commoncrawl-W+C", "wikipedia+newscrawl-W", "wikipedia+newscrawl-W+C", "commoncrawl-ngramsubwords-W" 426 | 427 | normalize: bool, default: True 428 | If true will normalize all vector to unit length 429 | 430 | lower: bool, default: False 431 | If true, will convert string to lowercase 432 | 433 | clean_words: bool, default: False 434 | If true will only keep alphanumeric characters and "_", "-" 435 | Warning: shouldn't be applied to embeddings with non-ascii characters 436 | 437 | Returns 438 | ------- 439 | w: Embedding 440 | Instance of Embedding class 441 | 442 | References 443 | ---------- 444 | Published at https://github.com/alexandres/lexvec 445 | Reference paper: Salle, Alexandre, Marco Idiart, and Aline Villavicencio. Matrix Factorization using Window Sampling and Negative Sampling for Improved Word Representations. The 54th Annual Meeting of the Association for Computational Linguistics. 2016. 
446 | """ 447 | download_file = { 448 | "commoncrawl-W": "https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1", 449 | "commoncrawl-W+C": "https://www.dropbox.com/s/zkiajh6fj0hm0m7/lexvec.commoncrawl.300d.W%2BC.pos.vectors.gz?dl=1", 450 | "wikipedia+newscrawl-W": "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1", 451 | "wikipedia+newscrawl-W+C": "https://www.dropbox.com/s/u320t9bw6tzlwma/lexvec.enwiki%2Bnewscrawl.300d.W%2BC.pos.vectors.gz?dl=1", 452 | "commoncrawl-ngramsubwords-W": "https://www.dropbox.com/s/mrxn933chn5u37z/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors.gz?dl=1" 453 | } 454 | 455 | path = _fetch_file(url=download_file[which], 456 | data_dir="embeddings", 457 | verbose=1) 458 | 459 | return load_embedding(path, format="word2vec", normalize=normalize, lower=lower, clean_words=clean_words) 460 | 461 | 462 | def fetch_conceptnet_numberbatch(clean_words=False): 463 | """ 464 | Fetches ConceptNetNumberbatch embeddings. Embeddings are normalized to unit length, 465 | and the vocabulary terms are lowercase. 466 | 467 | Parameters 468 | ---------- 469 | clean_words: bool, default: False 470 | If true will only keep alphanumeric characters and "_", "-" 471 | Warning: shouldn't be applied to embeddings with non-ascii characters 472 | 473 | Returns 474 | ------- 475 | w: Embedding 476 | Instance of Embedding class 477 | 478 | References 479 | ---------- 480 | Published at https://github.com/commonsense/conceptnet-numberbatch 481 | Reference paper: Robert Speer, Joshua Chin, and Catherine Havasi (2017). "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." In proceedings of AAAI 2017. 482 | """ 483 | path = _fetch_file(url='https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz', 484 | data_dir='embeddings', 485 | uncompress=False, 486 | verbose=1) 487 | return load_embedding(path, format='word2vec', normalize=False, clean_words=clean_words) 488 | 489 | 490 | def fetch_FastText(lang="en", normalize=True, lower=False, clean_words=False): 491 | """ 492 | Fetches fastText embeddings 493 | 494 | Parameters 495 | ---------- 496 | lang: str, default: "en" 497 | Can choose between all accessible language on page: 498 | https://fasttext.cc/docs/en/pretrained-vectors.html#content 499 | 500 | normalize: bool, default: True 501 | If true will normalize all vector to unit length 502 | 503 | lower: bool, default: False 504 | If true, will convert string to lowercase 505 | 506 | clean_words: bool, default: False 507 | If true will only keep alphanumeric characters and "_", "-" 508 | Warning: shouldn't be applied to embeddings with non-ascii characters 509 | 510 | Returns 511 | ------- 512 | w: Embedding 513 | Instance of Embedding class 514 | 515 | References 516 | ---------- 517 | Published at https://fasttext.cc/ 518 | """ 519 | 520 | url_vec = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.vec'.format(lang) 521 | 522 | path = _fetch_file(url=url_vec, data_dir='embeddings', 523 | uncompress=False, 524 | verbose=1) 525 | 526 | return load_embedding(path, format='word2vec', normalize=normalize, lower=lower, clean_words=clean_words) 527 | 528 | 529 | # TODO: uncomment after training is finished 530 | # def fetch_SG_wiki(normalize=True, lower=False, clean_words=True): 531 | # """ 532 | # Fetches SG (skip-gram) embeddings trained on recent (12.2015) Wiki corpus using gensim 533 | # 534 | # Note 535 | # ---- 536 | # Doesn't distinguish between lower and 
capital letters in embedding. 537 | # See scripts used for training on github in scripts/wikipedia/ 538 | # """ 539 | # fname = path.join(_get_dataset_dir('embeddings'), "sg-wiki-en-400.bin") 540 | # return _load_embedding(fname, format="word2vec_binary", normalize=normalize, 541 | # lower=lower, clean_words=clean_words) 542 | -------------------------------------------------------------------------------- /web/evaluate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Evaluation functions 4 | """ 5 | import logging 6 | import numpy as np 7 | from sklearn.cluster import AgglomerativeClustering, KMeans 8 | from .datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856 9 | from .datasets.categorization import fetch_AP, fetch_battig, fetch_BLESS, fetch_ESSLI_1a, fetch_ESSLI_2b, \ 10 | fetch_ESSLI_2c 11 | from web.analogy import * 12 | from six import iteritems 13 | from web.embedding import Embedding 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def calculate_purity(y_true, y_pred): 18 | """ 19 | Calculate purity for given true and predicted cluster labels. 20 | 21 | Parameters 22 | ---------- 23 | y_true: array, shape: (n_samples, 1) 24 | True cluster labels 25 | 26 | y_pred: array, shape: (n_samples, 1) 27 | Cluster assingment. 28 | 29 | Returns 30 | ------- 31 | purity: float 32 | Calculated purity. 33 | """ 34 | assert len(y_true) == len(y_pred) 35 | true_clusters = np.zeros(shape=(len(set(y_true)), len(y_true))) 36 | pred_clusters = np.zeros_like(true_clusters) 37 | for id, cl in enumerate(set(y_true)): 38 | true_clusters[id] = (y_true == cl).astype("int") 39 | for id, cl in enumerate(set(y_pred)): 40 | pred_clusters[id] = (y_pred == cl).astype("int") 41 | 42 | M = pred_clusters.dot(true_clusters.T) 43 | return 1. / len(y_true) * np.sum(np.max(M, axis=1)) 44 | 45 | 46 | def evaluate_categorization(w, X, y, method="all", seed=None): 47 | """ 48 | Evaluate embeddings on categorization task. 49 | 50 | Parameters 51 | ---------- 52 | w: Embedding or dict 53 | Embedding to test. 54 | 55 | X: vector, shape: (n_samples, ) 56 | Vector of words. 57 | 58 | y: vector, shape: (n_samples, ) 59 | Vector of cluster assignments. 60 | 61 | method: string, default: "all" 62 | What method to use. Possible values are "agglomerative", "kmeans", "all. 63 | If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude 64 | hyperparameter tuning to avoid overfitting). 65 | If "kmeans" is passed, method will fit KMeans. 66 | In both cases number of clusters is preset to the correct value. 67 | 68 | seed: int, default: None 69 | Seed passed to KMeans. 70 | 71 | Returns 72 | ------- 73 | purity: float 74 | Purity of the best obtained clustering. 75 | 76 | Notes 77 | ----- 78 | KMedoids method was excluded as empirically didn't improve over KMeans (for categorization 79 | tasks available in the package). 
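Examples
    --------
    A toy run with an in-memory dict embedding (illustrative; the vectors and labels below are made up)::

        import numpy as np
        w = {"cat": [1.0, 0.0], "dog": [0.9, 0.1], "car": [0.0, 1.0], "bus": [0.1, 0.9]}
        X = np.array(["cat", "dog", "car", "bus"])
        y = np.array([0, 0, 1, 1])
        evaluate_categorization(w, X, y, seed=0)  # expected to be 1.0 for this separable toy data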
80 | """ 81 | 82 | if isinstance(w, dict): 83 | w = Embedding.from_dict(w) 84 | 85 | assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method" 86 | 87 | mean_vector = np.mean(w.vectors, axis=0, keepdims=True) 88 | words = np.vstack(w.get(word, mean_vector) for word in X.flatten()) 89 | ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False) 90 | 91 | # Evaluate clustering on several hyperparameters of AgglomerativeClustering and 92 | # KMeans 93 | best_purity = 0 94 | 95 | if method == "all" or method == "agglomerative": 96 | best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)), 97 | affinity="euclidean", 98 | linkage="ward").fit_predict(words[ids])) 99 | logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward')) 100 | for affinity in ["cosine", "euclidean"]: 101 | for linkage in ["average", "complete"]: 102 | purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)), 103 | affinity=affinity, 104 | linkage=linkage).fit_predict(words[ids])) 105 | logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage)) 106 | best_purity = max(best_purity, purity) 107 | 108 | if method == "all" or method == "kmeans": 109 | purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=len(set(y))). 110 | fit_predict(words[ids])) 111 | logger.debug("Purity={:.3f} using KMeans".format(purity)) 112 | best_purity = max(purity, best_purity) 113 | 114 | return best_purity 115 | 116 | 117 | 118 | def evaluate_on_semeval_2012_2(w): 119 | """ 120 | Simple method to score embedding using SimpleAnalogySolver 121 | 122 | Parameters 123 | ---------- 124 | w : Embedding or dict 125 | Embedding or dict instance. 
126 | 127 | Returns 128 | ------- 129 | result: pandas.DataFrame 130 | Results with spearman correlation per broad category with special key "all" for summary 131 | spearman correlation 132 | """ 133 | if isinstance(w, dict): 134 | w = Embedding.from_dict(w) 135 | 136 | data = fetch_semeval_2012_2() 137 | mean_vector = np.mean(w.vectors, axis=0, keepdims=True) 138 | categories = data.y.keys() 139 | results = defaultdict(list) 140 | for c in categories: 141 | # Get mean of left and right vector 142 | prototypes = data.X_prot[c] 143 | prot_left = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 0]), axis=0) 144 | prot_right = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 1]), axis=0) 145 | 146 | questions = data.X[c] 147 | question_left, question_right = np.vstack(w.get(word, mean_vector) for word in questions[:, 0]), \ 148 | np.vstack(w.get(word, mean_vector) for word in questions[:, 1]) 149 | 150 | scores = np.dot(prot_left - prot_right, (question_left - question_right).T) 151 | 152 | c_name = data.categories_names[c].split("_")[0] 153 | # NaN happens when there are only 0s, which might happen for very rare words or 154 | # very insufficient word vocabulary 155 | cor = scipy.stats.spearmanr(scores, data.y[c]).correlation 156 | results[c_name].append(0 if np.isnan(cor) else cor) 157 | 158 | final_results = OrderedDict() 159 | final_results['all'] = sum(sum(v) for v in results.values()) / len(categories) 160 | for k in results: 161 | final_results[k] = sum(results[k]) / len(results[k]) 162 | return pd.Series(final_results) 163 | 164 | 165 | def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100): 166 | """ 167 | Simple method to score embedding using SimpleAnalogySolver 168 | 169 | Parameters 170 | ---------- 171 | w : Embedding or dict 172 | Embedding or dict instance. 173 | 174 | method : {"add", "mul"} 175 | Method to use when finding analogy answer, see "Improving Distributional Similarity 176 | with Lessons Learned from Word Embeddings" 177 | 178 | X : array-like, shape (n_samples, 3) 179 | Analogy questions. 180 | 181 | y : array-like, shape (n_samples, ) 182 | Analogy answers. 183 | 184 | k : int, default: None 185 | If not None will select k top most frequent words from embedding 186 | 187 | batch_size : int, default: 100 188 | Increase to increase memory consumption and decrease running time 189 | 190 | category : list, default: None 191 | Category of each example, if passed function returns accuracy per category 192 | in addition to the overall performance. 193 | Analogy datasets have "category" field that can be supplied here. 
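Each row of X holds the three query words of one analogy question and y holds the expected fourth word, e.g. (illustrative)::

        X = np.array([["man", "woman", "king"]])
        y = np.array(["queen"])
        evaluate_analogy(w, X, y)  # overall accuracy when `category` is None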
194 | 195 | Returns 196 | ------- 197 | result: dict 198 | Results, where each key is for given category and special empty key "" stores 199 | summarized accuracy across categories 200 | """ 201 | if isinstance(w, dict): 202 | w = Embedding.from_dict(w) 203 | 204 | assert category is None or len(category) == y.shape[0], "Passed incorrect category list" 205 | 206 | solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k) 207 | y_pred = solver.predict(X) 208 | 209 | if category is not None: 210 | results = OrderedDict({"all": np.mean(y_pred == y)}) 211 | count = OrderedDict({"all": len(y_pred)}) 212 | correct = OrderedDict({"all": np.sum(y_pred == y)}) 213 | for cat in set(category): 214 | results[cat] = np.mean(y_pred[category == cat] == y[category == cat]) 215 | count[cat] = np.sum(category == cat) 216 | correct[cat] = np.sum(y_pred[category == cat] == y[category == cat]) 217 | 218 | return pd.concat([pd.Series(results, name="accuracy"), 219 | pd.Series(correct, name="correct"), 220 | pd.Series(count, name="count")], 221 | axis=1) 222 | else: 223 | return np.mean(y_pred == y) 224 | 225 | 226 | def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs={}): 227 | """ 228 | Evaluate on WordRep dataset 229 | 230 | Parameters 231 | ---------- 232 | w : Embedding or dict 233 | Embedding or dict instance. 234 | 235 | max_pairs: int, default: 1000 236 | Each category will be constrained to maximum of max_pairs pairs 237 | (which results in max_pair * (max_pairs - 1) examples) 238 | 239 | solver_kwargs: dict, default: {} 240 | Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words 241 | in the dictionary. 242 | 243 | References 244 | ---------- 245 | Bin Gao, Jiang Bian, Tie-Yan Liu (2015) 246 | "WordRep: A Benchmark for Research on Learning Word Representations" 247 | """ 248 | if isinstance(w, dict): 249 | w = Embedding.from_dict(w) 250 | 251 | data = fetch_wordrep() 252 | categories = set(data.category) 253 | 254 | accuracy = {} 255 | correct = {} 256 | count = {} 257 | for cat in categories: 258 | X_cat = data.X[data.category == cat] 259 | X_cat = X_cat[0:max_pairs] 260 | 261 | logger.info("Processing {} with {} pairs, {} questions".format(cat, X_cat.shape[0] 262 | , X_cat.shape[0] * (X_cat.shape[0] - 1))) 263 | 264 | # For each category construct question-answer pairs 265 | size = X_cat.shape[0] * (X_cat.shape[0] - 1) 266 | X = np.zeros(shape=(size, 3), dtype="object") 267 | y = np.zeros(shape=(size,), dtype="object") 268 | id = 0 269 | for left, right in product(X_cat, X_cat): 270 | if not np.array_equal(left, right): 271 | X[id, 0:2] = left 272 | X[id, 2] = right[0] 273 | y[id] = right[1] 274 | id += 1 275 | 276 | # Run solver 277 | solver = SimpleAnalogySolver(w=w, **solver_kwargs) 278 | y_pred = solver.predict(X) 279 | correct[cat] = float(np.sum(y_pred == y)) 280 | count[cat] = size 281 | accuracy[cat] = float(np.sum(y_pred == y)) / size 282 | 283 | # Add summary results 284 | correct['wikipedia'] = sum(correct[c] for c in categories if c in data.wikipedia_categories) 285 | correct['all'] = sum(correct[c] for c in categories) 286 | correct['wordnet'] = sum(correct[c] for c in categories if c in data.wordnet_categories) 287 | 288 | count['wikipedia'] = sum(count[c] for c in categories if c in data.wikipedia_categories) 289 | count['all'] = sum(count[c] for c in categories) 290 | count['wordnet'] = sum(count[c] for c in categories if c in data.wordnet_categories) 291 | 292 | accuracy['wikipedia'] = correct['wikipedia'] / count['wikipedia'] 293 | 
accuracy['all'] = correct['all'] / count['all'] 294 | accuracy['wordnet'] = correct['wordnet'] / count['wordnet'] 295 | 296 | return pd.concat([pd.Series(accuracy, name="accuracy"), 297 | pd.Series(correct, name="correct"), 298 | pd.Series(count, name="count")], axis=1) 299 | 300 | 301 | def evaluate_similarity(w, X, y): 302 | """ 303 | Calculate Spearman correlation between cosine similarity of the model 304 | and human rated similarity of word pairs 305 | 306 | Parameters 307 | ---------- 308 | w : Embedding or dict 309 | Embedding or dict instance. 310 | 311 | X: array, shape: (n_samples, 2) 312 | Word pairs 313 | 314 | y: vector, shape: (n_samples,) 315 | Human ratings 316 | 317 | Returns 318 | ------- 319 | cor: float 320 | Spearman correlation 321 | """ 322 | if isinstance(w, dict): 323 | w = Embedding.from_dict(w) 324 | 325 | missing_words = 0 326 | words = w.vocabulary.word_id 327 | for query in X: 328 | for query_word in query: 329 | if query_word not in words: 330 | missing_words += 1 331 | if missing_words > 0: 332 | logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words)) 333 | 334 | 335 | mean_vector = np.mean(w.vectors, axis=0, keepdims=True) 336 | A = np.vstack(w.get(word, mean_vector) for word in X[:, 0]) 337 | B = np.vstack(w.get(word, mean_vector) for word in X[:, 1]) 338 | scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)]) 339 | return scipy.stats.spearmanr(scores, y).correlation 340 | 341 | 342 | def evaluate_on_all(w): 343 | """ 344 | Evaluate Embedding on all fast-running benchmarks 345 | 346 | Parameters 347 | ---------- 348 | w: Embedding or dict 349 | Embedding to evaluate. 350 | 351 | Returns 352 | ------- 353 | results: pandas.DataFrame 354 | DataFrame with results, one per column. 
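Examples
    --------
    Illustrative end-to-end run (fetches the embedding and all benchmark datasets on first use, so it is slow and network-bound)::

        from web.embeddings import fetch_GloVe
        w = fetch_GloVe(corpus="wiki-6B", dim=50)
        results = evaluate_on_all(w)
        print(results.T)  # one benchmark per row after transposing the single-row frame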
355 | """ 356 | if isinstance(w, dict): 357 | w = Embedding.from_dict(w) 358 | 359 | # Calculate results on similarity 360 | logger.info("Calculating similarity benchmarks") 361 | similarity_tasks = { 362 | "MEN": fetch_MEN(), 363 | "WS353": fetch_WS353(), 364 | "WS353R": fetch_WS353(which="relatedness"), 365 | "WS353S": fetch_WS353(which="similarity"), 366 | "SimLex999": fetch_SimLex999(), 367 | "RW": fetch_RW(), 368 | "RG65": fetch_RG65(), 369 | "MTurk": fetch_MTurk(), 370 | } 371 | 372 | similarity_results = {} 373 | 374 | for name, data in iteritems(similarity_tasks): 375 | similarity_results[name] = evaluate_similarity(w, data.X, data.y) 376 | logger.info("Spearman correlation of scores on {} {}".format(name, similarity_results[name])) 377 | 378 | # Calculate results on analogy 379 | logger.info("Calculating analogy benchmarks") 380 | analogy_tasks = { 381 | "Google": fetch_google_analogy(), 382 | "MSR": fetch_msr_analogy() 383 | } 384 | 385 | analogy_results = {} 386 | 387 | for name, data in iteritems(analogy_tasks): 388 | analogy_results[name] = evaluate_analogy(w, data.X, data.y) 389 | logger.info("Analogy prediction accuracy on {} {}".format(name, analogy_results[name])) 390 | 391 | analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all'] 392 | logger.info("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"])) 393 | 394 | # Calculate results on categorization 395 | logger.info("Calculating categorization benchmarks") 396 | categorization_tasks = { 397 | "AP": fetch_AP(), 398 | "BLESS": fetch_BLESS(), 399 | "Battig": fetch_battig(), 400 | "ESSLI_2c": fetch_ESSLI_2c(), 401 | "ESSLI_2b": fetch_ESSLI_2b(), 402 | "ESSLI_1a": fetch_ESSLI_1a() 403 | } 404 | 405 | categorization_results = {} 406 | 407 | # Calculate results using helper function 408 | for name, data in iteritems(categorization_tasks): 409 | categorization_results[name] = evaluate_categorization(w, data.X, data.y) 410 | logger.info("Cluster purity on {} {}".format(name, categorization_results[name])) 411 | 412 | # Construct pd table 413 | cat = pd.DataFrame([categorization_results]) 414 | analogy = pd.DataFrame([analogy_results]) 415 | sim = pd.DataFrame([similarity_results]) 416 | results = cat.join(sim).join(analogy) 417 | 418 | return results 419 | -------------------------------------------------------------------------------- /web/tests/test_analogy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for analogy solvers 5 | """ 6 | import numpy as np 7 | 8 | from web.datasets.utils import _fetch_file 9 | from web.embedding import Embedding 10 | from web.datasets.analogy import fetch_google_analogy 11 | from web.evaluate import evaluate_analogy, evaluate_on_semeval_2012_2, evaluate_on_WordRep 12 | 13 | 14 | # TODO: takes too long 15 | def test_semeval_solver(): 16 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 17 | file_name = _fetch_file(url, "test") 18 | w = Embedding.from_word2vec(file_name, binary=True) 19 | results = evaluate_on_semeval_2012_2(w) 20 | assert results['all'] >= 0, "Should have some results on SemEval2012" 21 | 22 | 23 | def test_wordrep_solver(): 24 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 25 | file_name = _fetch_file(url, "test") 26 | w = Embedding.from_word2vec(file_name, binary=True) 27 | P = evaluate_on_WordRep(w, max_pairs=2) 28 | assert P['accuracy']['all'] >= 0 29 | 30 | 
31 | def test_analogy_solver(): 32 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 33 | file_name = _fetch_file(url, "test") 34 | 35 | w = Embedding.from_word2vec(file_name, binary=True) 36 | data = fetch_google_analogy() 37 | ids = np.random.RandomState(777).choice(range(data.X.shape[0]), 1000, replace=False) 38 | X, y = data.X[ids], data.y[ids] 39 | category = data.category_high_level[ids] 40 | 41 | results = evaluate_analogy(w=w, X=X, y=y, category=category) 42 | assert results['accuracy']['all'] >= 0.65 43 | assert results['accuracy']['semantic'] >= 0.7 44 | assert results['accuracy']['syntactic'] >= 0.63 45 | 46 | results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul") 47 | assert results['accuracy']['all'] >= 0.7 48 | assert results['accuracy']['semantic'] >= 0.75 49 | assert results['accuracy']['syntactic'] >= 0.64 50 | 51 | results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400) 52 | results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400) 53 | assert results_mul['accuracy']['all'] >= results_add['accuracy']['all'] 54 | assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic'] 55 | assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic'] 56 | -------------------------------------------------------------------------------- /web/tests/test_categorization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from web.evaluate import calculate_purity, evaluate_categorization 3 | from web.embedding import Embedding 4 | from web.datasets.utils import _fetch_file 5 | from web.datasets.categorization import fetch_ESSLI_2c 6 | 7 | def test_purity(): 8 | y_true = np.array([1,1,2,2,3]) 9 | y_pred = np.array([2,2,2,2,1]) 10 | assert abs(0.6 - calculate_purity(y_true, y_pred)) < 1e-10 11 | 12 | def test_categorization(): 13 | data = fetch_ESSLI_2c() 14 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 15 | file_name = _fetch_file(url, "test") 16 | w = Embedding.from_word2vec(file_name, binary=True) 17 | assert evaluate_categorization(w, data.X, data.y, seed=777, method="all") >= 0.2 -------------------------------------------------------------------------------- /web/tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for embedding 5 | """ 6 | import tempfile 7 | from os import path 8 | 9 | import numpy as np 10 | 11 | from web.datasets.utils import _fetch_file 12 | from web.embedding import Embedding 13 | from web.utils import standardize_string 14 | from web.vocabulary import Vocabulary 15 | 16 | def test_standardize(): 17 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 18 | file_name = _fetch_file(url, "test") 19 | 20 | w = Embedding.from_word2vec(file_name, binary=True) 21 | w2 = w.standardize_words(inplace=False, lower=False, clean_words=True) 22 | w3 = Embedding.from_word2vec(file_name, binary=True) 23 | assert len(w2.words) == 95 24 | for word in w.vocabulary.words: 25 | if standardize_string(word, lower=False, clean_words=True): 26 | assert np.array_equal(w[word], w2[standardize_string(word, lower=False, clean_words=True)]) 27 | 28 | w3.standardize_words(inplace=True, clean_words=True, lower=False) 29 | assert len(w3.words) == 95 30 | for word in w.vocabulary.words: 31 | if 
standardize_string(word, lower=False): 32 | assert np.array_equal(w[word], w3[standardize_string(word, lower=False, clean_words=True)]) 33 | 34 | 35 | def test_standardize_preserve_identity(): 36 | d = {"Spider": [3, 4, 5], "spider": [1, 2, 3], "spideR": [3, 2, 4]} 37 | w3 = Embedding.from_dict(d) 38 | w4 = w3.standardize_words(inplace=False, lower=True) 39 | assert w4['spider'][0] == 1 40 | w3.standardize_words(inplace=True, lower=True) 41 | assert w3['spider'][0] == 1 42 | 43 | def test_save_2(): 44 | dirpath = tempfile.mkdtemp() 45 | w = ["a", "b", "c"] 46 | vectors = np.array([[1.,2.] ,[2.,3.], [3.,4.]]) 47 | e = Embedding(Vocabulary(w), vectors) 48 | Embedding.to_word2vec(e, path.join(dirpath, "test.bin"), binary=True) 49 | e2 = Embedding.from_word2vec(path.join(dirpath, "test.bin"), binary=True) 50 | assert np.array_equal(e2.vectors, vectors) 51 | 52 | def test_save(): 53 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 54 | file_name = _fetch_file(url, "test") 55 | w = Embedding.from_word2vec(file_name, binary=True) 56 | 57 | dirpath = tempfile.mkdtemp() 58 | w.to_word2vec(w, path.join(dirpath, "tmp.bin"), binary=True) 59 | w.to_word2vec(w, path.join(dirpath, "tmp.txt"), binary=False) 60 | w2 = Embedding.from_word2vec(path.join(dirpath, "tmp.bin"), binary=True) 61 | w3 = Embedding.from_word2vec(path.join(dirpath, "tmp.txt"), binary=False) 62 | assert np.array_equal(w.vectors, w2.vectors) 63 | assert w.vocabulary.words == w2.vocabulary.words 64 | assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5 65 | assert w.vocabulary.words == w3.vocabulary.words 66 | -------------------------------------------------------------------------------- /web/tests/test_fetchers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for data fetchers 5 | """ 6 | 7 | from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy, fetch_semeval_2012_2, \ 8 | fetch_wordrep 9 | 10 | from web.datasets.similarity import fetch_SimLex999, fetch_WS353, fetch_multilingual_SimLex999, \ 11 | fetch_MEN, fetch_MTurk, fetch_RW, fetch_RG65 12 | 13 | from web.datasets.categorization import fetch_AP, fetch_BLESS, fetch_battig,\ 14 | fetch_ESSLI_1a, fetch_ESSLI_2b, fetch_ESSLI_2c 15 | 16 | from itertools import product 17 | from six import iteritems 18 | 19 | def test_categorization_fetchers(): 20 | data = fetch_battig() 21 | assert data.X.shape[0] == 5231 22 | 23 | data = fetch_BLESS() 24 | assert data.X.shape[0] == 200 25 | 26 | data = fetch_AP() 27 | assert len(set(data.y)) == 21 28 | 29 | data = fetch_ESSLI_2c() 30 | assert data.X.shape[0] == 45 31 | assert len(set(data.y)) == 9 32 | 33 | data = fetch_ESSLI_2b() 34 | assert data.X.shape[0] == 40 35 | assert len(set(data.y)) == 3 36 | 37 | data = fetch_ESSLI_1a() 38 | assert data.X.shape[0] == 44 39 | assert len(set(data.y)) == 6 40 | 41 | def test_MTurk_fetcher(): 42 | data = fetch_MTurk() 43 | assert (len(data.y) == len(data.X) == 287) 44 | assert (10.0 >= data.y.max() >= 9) 45 | 46 | 47 | def test_RW_fetcher(): 48 | data = fetch_RW() 49 | assert (len(data.y) == len(data.X) == 2034) 50 | assert (10.0 >= data.y.max() >= 9.8) 51 | 52 | 53 | def test_RG65_fetcher(): 54 | data = fetch_RG65() 55 | assert (len(data.y) == len(data.X) == 65) 56 | assert (10.0 >= data.y.max() >= 9.8) 57 | 58 | 59 | def test_MEN_fetcher(): 60 | params = product(["all", "dev", "test"], ["natural", "lem"]) 61 | data, V = {}, {} 62 | for which, form in params: 63 | 
fetched = fetch_MEN(which=which, form=form) 64 | data[which + ":" + form] = fetched 65 | V[which + ":" + form] = set([" ".join(sorted(x)) for x in data[which + ":" + form].X]) 66 | assert fetched.y.max() <= 10.0 67 | 68 | assert V["dev:natural"].union(V["test:natural"]) == V["all:natural"] 69 | assert V["dev:lem"].union(V["test:lem"]) == V["all:lem"] 70 | assert data['all:natural'] 71 | 72 | 73 | def test_ws353_fetcher(): 74 | data1 = fetch_WS353(which="set1") 75 | data2 = fetch_WS353(which="set2") 76 | data3 = fetch_WS353(which="similarity") 77 | data4 = fetch_WS353(which="relatedness") 78 | data5 = fetch_WS353(which="all") 79 | V5 = set([" ".join(sorted(x)) for x in data5.X]) 80 | V1 = set([" ".join(sorted(x)) for x in data1.X]) 81 | V2 = set([" ".join(sorted(x)) for x in data2.X]) 82 | V3 = set([" ".join(sorted(x)) for x in data3.X]) 83 | V4 = set([" ".join(sorted(x)) for x in data4.X]) 84 | 85 | # sd and scores have same length 86 | assert data1.sd.shape[0] == data1.y.shape[0] 87 | assert data2.sd.shape[0] == data2.y.shape[0] 88 | 89 | # WSR = WSR-SET1 u WSR-SET2 90 | assert data5.X.shape[0] == 353 91 | assert V5 == V2.union(V1) 92 | 93 | assert V5 == V3.union(V4) 94 | 95 | # Two word pairs reoccurr 96 | assert len(V5) == 351 97 | 98 | 99 | def test_simlex999_fetchers(): 100 | data = fetch_SimLex999() 101 | assert data.X.shape == (999, 2) 102 | 103 | for lang in ["EN", "RU", "IT", "DE"]: 104 | data = fetch_multilingual_SimLex999(which=lang) 105 | assert data.y.shape[0] == data.sd.shape[0] 106 | assert data.X.shape[0] == 999 107 | 108 | 109 | def test_analogy_fetchers(): 110 | data = fetch_msr_analogy() 111 | assert len(set(data.category)) == 16 112 | 113 | data = fetch_google_analogy() 114 | assert len(set(data.category)) == 14 115 | assert len(set(data.category_high_level)) == 2 116 | 117 | data = fetch_semeval_2012_2() 118 | assert len(data.X) == len(data.y) == 79 119 | for k, val in iteritems(data.X_prot): 120 | assert len(val.shape) == 2, "Failed parsing prototypes for " + k 121 | 122 | data = fetch_wordrep(subsample=0.7) 123 | assert len(set(data.category)) == 25 124 | assert len(data.X[0]) == 2 125 | assert "all-capital-cities" in set(data.category) 126 | assert len(set(data.category_high_level)) == 2 -------------------------------------------------------------------------------- /web/tests/test_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for similarity solvers 5 | """ 6 | from web.datasets.utils import _fetch_file 7 | from web.embedding import Embedding 8 | from web.datasets.similarity import fetch_SimLex999 9 | from web.evaluate import evaluate_similarity 10 | 11 | def test_similarity(): 12 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 13 | file_name = _fetch_file(url, "test") 14 | w = Embedding.from_word2vec(file_name, binary=True) 15 | data = fetch_SimLex999() 16 | 17 | result_1 = evaluate_similarity(w, data.X, data.y) 18 | result_2 = evaluate_similarity(dict(zip(w.vocabulary.words, w.vectors)), data.X, data.y) 19 | 20 | assert result_2 > 0 21 | assert result_1 == result_2, "evaluate_similarity should return same result for dict and Embedding instance" 22 | 23 | def test_similarity_norm(): 24 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 25 | file_name = _fetch_file(url, "test") 26 | w = Embedding.from_word2vec(file_name, binary=True) 27 | w_norm = w.normalize_words() 28 | data = 
fetch_SimLex999() 29 | 30 | result_1 = evaluate_similarity(w, data.X, data.y) 31 | result_2 = evaluate_similarity(w_norm, data.X, data.y) 32 | 33 | assert result_2 > 0 34 | assert result_1 == result_2, "evaluate_similarity should return same result for normalized and unnormalized words" -------------------------------------------------------------------------------- /web/tests/test_transform_words.py: -------------------------------------------------------------------------------- 1 | from web.embedding import Embedding 2 | from web.vocabulary import * 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | 8 | 9 | # COUNTEDVOCABULARY 10 | 11 | def test_noinplace_transform_word_CountedVocabulary(): 12 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 13 | 14 | cw = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)]) 15 | 16 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 17 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 18 | 19 | assert len(pe.vocabulary) == 2 20 | assert len(pe.vectors) == 2 21 | 22 | # 'dog' 23 | assert [0, 0, 11] in pe.vectors.tolist() 24 | # 'cat' 25 | assert [0, 11, 12] in pe.vectors.tolist() 26 | 27 | assert 'cat' in pe.vocabulary.words 28 | assert 'dog' in pe.vocabulary.words 29 | 30 | l = pe.vocabulary.getstate() 31 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 32 | 33 | # dog 34 | assert pe.vocabulary.words[0] == 'dog' 35 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 36 | assert d['dog'] == 60 37 | 38 | # cat 39 | assert pe.vocabulary.words[1] == 'cat' 40 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 41 | assert d['cat'] == 50 42 | 43 | assert type(pe.vocabulary) == CountedVocabulary 44 | 45 | 46 | def test_noinplace_transform_word_prefer_occurences_CountedVocabulary(): 47 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 48 | 49 | cw = CountedVocabulary(word_count=[(' cat ', 5), ('pikatchu ', 10), ('cat', 50), ('dog', 60), ('pikatchu', 200)]) 50 | 51 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 52 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 53 | 54 | assert len(pe.vocabulary) == 3 55 | assert len(pe.vectors) == 3 56 | 57 | l = pe.vocabulary.getstate() 58 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 59 | 60 | # 'dog' 61 | assert [0, 1, 11] in pe.vectors.tolist() 62 | # 'cat' 63 | assert [0, 11, 12] in pe.vectors.tolist() 64 | # pikatchu 65 | assert [0, 0, 1] in pe.vectors.tolist() 66 | 67 | assert 'cat' in pe.vocabulary.words 68 | assert 'dog' in pe.vocabulary.words 69 | assert 'pikatchu' in pe.vocabulary.words 70 | 71 | # pikatchu 72 | assert pe.vocabulary.words[0] == 'pikatchu' 73 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 74 | assert d['pikatchu'] == 200 75 | # dog 76 | assert pe.vocabulary.words[1] == 'dog' 77 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 78 | assert d['dog'] == 60 79 | 80 | # cat 81 | assert pe.vocabulary.words[2] == 'cat' 82 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 83 | assert d['cat'] == 50 84 | 85 | assert type(pe.vocabulary) == CountedVocabulary 86 | 87 | 88 | def test_noinplace_transform_word_prefer_shortestword_CountedVocabulary(): 89 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 90 | 91 | cw = CountedVocabulary( 92 | word_count=[('dog', 60), ('cat', 50), (' pikatchu ', 10), ('pikatchu', 10), (' cat ', 5)]) 93 | 94 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 
1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 95 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 96 | 97 | assert len(pe.vocabulary) == 3 98 | assert len(pe.vectors) == 3 99 | 100 | # 'dog' 101 | assert [0, 0, 1] in pe.vectors.tolist() 102 | # 'cat' 103 | assert [0, 1, 11] in pe.vectors.tolist() 104 | # pikatchu 105 | assert [0, 12, 13] in pe.vectors.tolist() 106 | 107 | assert 'cat' in pe.vocabulary.words 108 | assert 'dog' in pe.vocabulary.words 109 | assert 'pikatchu' in pe.vocabulary.words 110 | 111 | l = pe.vocabulary.getstate() 112 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 113 | 114 | # pikatchu 115 | assert pe.vocabulary.words[2] == 'pikatchu' 116 | assert np.array_equal(pe.vectors[2], [0, 12, 13]) 117 | assert d['pikatchu'] == 10 118 | 119 | # dog 120 | assert pe.vocabulary.words[0] == 'dog' 121 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 122 | assert d['dog'] == 60 123 | 124 | # cat 125 | assert pe.vocabulary.words[1] == 'cat' 126 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 127 | assert d['cat'] == 50 128 | 129 | assert type(pe.vocabulary) == CountedVocabulary 130 | 131 | 132 | # ORDERDVOCABULARY 133 | 134 | def test_noinplace_transform_word_OrderedVocabulary(): 135 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 136 | 137 | cw = OrderedVocabulary(words=['dog', 'cat', ' cat']) 138 | 139 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 140 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 141 | 142 | assert len(pe.vocabulary) == 2 143 | assert len(pe.vectors) == 2 144 | 145 | # 'dog' 146 | assert [0, 0, 11] in pe.vectors.tolist() 147 | # 'cat' 148 | assert [0, 11, 12] in pe.vectors.tolist() 149 | 150 | assert 'cat' in pe.vocabulary.words 151 | assert 'dog' in pe.vocabulary.words 152 | 153 | # dog 154 | assert pe.vocabulary.words[0] == 'dog' 155 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 156 | 157 | # cat 158 | assert pe.vocabulary.words[1] == 'cat' 159 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 160 | 161 | assert type(pe.vocabulary) == OrderedVocabulary 162 | 163 | 164 | def test_noinplace_transform_word_prefer_occurences_OrderedVocabulary(): 165 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 166 | 167 | cw = OrderedVocabulary(words=['pikatchu', 'dog', 'cat', 'pikatchu ', ' cat ']) 168 | 169 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 170 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 171 | 172 | assert len(pe.vocabulary) == 3 173 | assert len(pe.vectors) == 3 174 | 175 | # 'dog' 176 | assert [0, 1, 11] in pe.vectors.tolist() 177 | # 'cat' 178 | assert [0, 11, 12] in pe.vectors.tolist() 179 | # pikatchu 180 | assert [0, 0, 1] in pe.vectors.tolist() 181 | 182 | assert 'cat' in pe.vocabulary.words 183 | assert 'dog' in pe.vocabulary.words 184 | assert 'pikatchu' in pe.vocabulary.words 185 | 186 | # pikatchu 187 | assert pe.vocabulary.words[0] == 'pikatchu' 188 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 189 | # dog 190 | assert pe.vocabulary.words[1] == 'dog' 191 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 192 | 193 | # cat 194 | assert pe.vocabulary.words[2] == 'cat' 195 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 196 | 197 | assert type(pe.vocabulary) == OrderedVocabulary 198 | 199 | 200 | def test_noinplace_transform_word_prefer_shortestword_OrderedVocabulary(): 201 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 
202 | 203 | cw = OrderedVocabulary(words=['dog', 'cat', ' pikatchu ', 'pikatchu', ' cat ']) 204 | 205 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 206 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 207 | 208 | assert len(pe.vocabulary) == 3 209 | assert len(pe.vectors) == 3 210 | 211 | # 'dog' 212 | assert [0, 0, 1] in pe.vectors.tolist() 213 | # 'cat' 214 | assert [0, 1, 11] in pe.vectors.tolist() 215 | # pikatchu 216 | assert [0, 11, 12] in pe.vectors.tolist() 217 | 218 | assert 'cat' in pe.vocabulary.words 219 | assert 'dog' in pe.vocabulary.words 220 | assert 'pikatchu' in pe.vocabulary.words 221 | 222 | # pikatchu 223 | assert pe.vocabulary.words[2] == 'pikatchu' 224 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 225 | 226 | # dog 227 | assert pe.vocabulary.words[0] == 'dog' 228 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 229 | 230 | # cat 231 | assert pe.vocabulary.words[1] == 'cat' 232 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 233 | 234 | assert type(pe.vocabulary) == OrderedVocabulary 235 | 236 | 237 | # VOCABULARY 238 | 239 | def test_noinplace_transform_word_Vocabulary(): 240 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 241 | 242 | cw = Vocabulary(words=['dog', 'cat', ' cat ']) 243 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 244 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 245 | 246 | assert len(pe.vocabulary) == 2 247 | assert len(pe.vectors) == 2 248 | 249 | # 'dog' 250 | assert [0, 0, 11] in pe.vectors.tolist() 251 | # 'cat' 252 | assert [0, 11, 12] in pe.vectors.tolist() 253 | 254 | assert 'cat' in pe.vocabulary.words 255 | assert 'dog' in pe.vocabulary.words 256 | 257 | # dog 258 | assert pe.vocabulary.words[0] == 'dog' 259 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 260 | 261 | # cat 262 | assert pe.vocabulary.words[1] == 'cat' 263 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 264 | 265 | assert type(pe.vocabulary) == Vocabulary 266 | 267 | 268 | def test_noinplace_transform_word_prefer_shortest_ord1_Vocabulary(): 269 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 270 | 271 | cw = Vocabulary(words=['pikatchu ', 'dog', 'cat', 'pikatchu', ' cat ']) 272 | 273 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 12, 13], [0, 1, 11], [0, 11, 12], [0, 0, 1], [0, 13, 14]])) 274 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 275 | 276 | assert len(pe.vocabulary) == 3 277 | assert len(pe.vectors) == 3 278 | 279 | # 'dog' 280 | assert [0, 1, 11] in pe.vectors.tolist() 281 | # 'cat' 282 | assert [0, 11, 12] in pe.vectors.tolist() 283 | # pikatchu 284 | assert [0, 0, 1] in pe.vectors.tolist() 285 | 286 | assert 'cat' in pe.vocabulary.words 287 | assert 'dog' in pe.vocabulary.words 288 | assert 'pikatchu' in pe.vocabulary.words 289 | 290 | # pikatchu 291 | assert pe.vocabulary.words[2] == 'pikatchu' 292 | assert np.array_equal(pe.vectors[2], [0, 0, 1]) 293 | 294 | # dog 295 | assert pe.vocabulary.words[0] == 'dog' 296 | assert np.array_equal(pe.vectors[0], [0, 1, 11]) 297 | 298 | # cat 299 | assert pe.vocabulary.words[1] == 'cat' 300 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 301 | 302 | assert type(pe.vocabulary) == Vocabulary 303 | 304 | 305 | def test_noinplace_transform_word_prefer_shortestword2_Vocabulary(): 306 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 307 | 308 | cw = Vocabulary(words=['dog', 'cat', ' pikatchu ', 
'pikatchu', ' cat ']) 309 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 310 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 311 | 312 | assert len(pe.vocabulary) == 3 313 | assert len(pe.vectors) == 3 314 | 315 | # 'dog' 316 | assert [0, 0, 1] in pe.vectors.tolist() 317 | # 'cat' 318 | assert [0, 1, 11] in pe.vectors.tolist() 319 | # pikatchu 320 | assert [0, 12, 13] in pe.vectors.tolist() 321 | 322 | assert 'cat' in pe.vocabulary.words 323 | assert 'dog' in pe.vocabulary.words 324 | assert 'pikatchu' in pe.vocabulary.words 325 | 326 | # pikatchu 327 | assert pe.vocabulary.words[2] == 'pikatchu' 328 | assert np.array_equal(pe.vectors[2], [0, 12, 13]) 329 | 330 | # dog 331 | assert pe.vocabulary.words[0] == 'dog' 332 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 333 | 334 | # cat 335 | assert pe.vocabulary.words[1] == 'cat' 336 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 337 | 338 | assert type(pe.vocabulary) == Vocabulary 339 | 340 | ####################### inplace= True ####################### 341 | 342 | # COUNTEDVOCABULARY 343 | 344 | def test_inplace_transform_word_CountedVocabulary(): 345 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 346 | 347 | cw = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)]) 348 | 349 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 350 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 351 | 352 | assert pe is e and pe == e 353 | 354 | assert len(pe.vocabulary) == 2 355 | assert len(pe.vectors) == 2 356 | 357 | # 'dog' 358 | assert [0, 0, 11] in pe.vectors.tolist() 359 | # 'cat' 360 | assert [0, 11, 12] in pe.vectors.tolist() 361 | 362 | assert 'cat' in pe.vocabulary.words 363 | assert 'dog' in pe.vocabulary.words 364 | 365 | l = pe.vocabulary.getstate() 366 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 367 | 368 | # dog 369 | assert pe.vocabulary.words[0] == 'dog' 370 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 371 | assert d['dog'] == 60 372 | 373 | # cat 374 | assert pe.vocabulary.words[1] == 'cat' 375 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 376 | assert d['cat'] == 50 377 | 378 | assert type(pe.vocabulary) == CountedVocabulary 379 | 380 | 381 | def test_inplace_transform_word_prefer_occurences_CountedVocabulary(): 382 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 383 | 384 | cw = CountedVocabulary(word_count=[(' cat ', 5), ('pikatchu ', 10), ('cat', 50), ('dog', 60), ('pikatchu', 200)]) 385 | 386 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 387 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 388 | 389 | assert pe is e and pe == e 390 | 391 | assert len(pe.vocabulary) == 3 392 | assert len(pe.vectors) == 3 393 | 394 | l = pe.vocabulary.getstate() 395 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 396 | 397 | # 'dog' 398 | assert [0, 1, 11] in pe.vectors.tolist() 399 | # 'cat' 400 | assert [0, 11, 12] in pe.vectors.tolist() 401 | # pikatchu 402 | assert [0, 0, 1] in pe.vectors.tolist() 403 | 404 | assert 'cat' in pe.vocabulary.words 405 | assert 'dog' in pe.vocabulary.words 406 | assert 'pikatchu' in pe.vocabulary.words 407 | 408 | # pikatchu 409 | assert pe.vocabulary.words[0] == 'pikatchu' 410 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 411 | assert d['pikatchu'] == 200 412 | # dog 413 | assert pe.vocabulary.words[1] == 'dog' 414 | assert 
np.array_equal(pe.vectors[1], [0, 1, 11]) 415 | assert d['dog'] == 60 416 | 417 | # cat 418 | assert pe.vocabulary.words[2] == 'cat' 419 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 420 | assert d['cat'] == 50 421 | 422 | assert type(pe.vocabulary) == CountedVocabulary 423 | 424 | 425 | def test_inplace_transform_word_prefer_shortestword_CountedVocabulary(): 426 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 427 | 428 | cw = CountedVocabulary( 429 | word_count=[('dog', 60), ('cat', 50), (' pikatchu ', 10), ('pikatchu', 10), (' cat ', 5)]) 430 | 431 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 432 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 433 | 434 | assert pe is e and pe == e 435 | 436 | assert len(pe.vocabulary) == 3 437 | assert len(pe.vectors) == 3 438 | 439 | # 'dog' 440 | assert [0, 0, 1] in pe.vectors.tolist() 441 | # 'cat' 442 | assert [0, 1, 11] in pe.vectors.tolist() 443 | # pikatchu 444 | assert [0, 12, 13] in pe.vectors.tolist() 445 | 446 | assert 'cat' in pe.vocabulary.words 447 | assert 'dog' in pe.vocabulary.words 448 | assert 'pikatchu' in pe.vocabulary.words 449 | 450 | l = pe.vocabulary.getstate() 451 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 452 | 453 | # pikatchu 454 | assert pe.vocabulary.words[2] == 'pikatchu' 455 | assert np.array_equal(pe.vectors[2], [0, 12, 13]) 456 | assert d['pikatchu'] == 10 457 | 458 | # dog 459 | assert pe.vocabulary.words[0] == 'dog' 460 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 461 | assert d['dog'] == 60 462 | 463 | # cat 464 | assert pe.vocabulary.words[1] == 'cat' 465 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 466 | assert d['cat'] == 50 467 | 468 | assert type(pe.vocabulary) == CountedVocabulary 469 | 470 | # ORDERDVOCABULARY 471 | 472 | def test_inplace_transform_word_OrderedVocabulary(): 473 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 474 | 475 | cw = OrderedVocabulary(words=['dog', 'cat', ' cat']) 476 | 477 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 478 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 479 | 480 | assert pe is e and pe == e 481 | 482 | assert len(pe.vocabulary) == 2 483 | assert len(pe.vectors) == 2 484 | 485 | # 'dog' 486 | assert [0, 0, 11] in pe.vectors.tolist() 487 | # 'cat' 488 | assert [0, 11, 12] in pe.vectors.tolist() 489 | 490 | assert 'cat' in pe.vocabulary.words 491 | assert 'dog' in pe.vocabulary.words 492 | 493 | # dog 494 | assert pe.vocabulary.words[0] == 'dog' 495 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 496 | 497 | # cat 498 | assert pe.vocabulary.words[1] == 'cat' 499 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 500 | 501 | assert type(pe.vocabulary) == OrderedVocabulary 502 | 503 | 504 | def test_inplace_transform_word_prefer_occurences_OrderedVocabulary(): 505 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 506 | 507 | cw = OrderedVocabulary(words=['pikatchu', 'dog', 'cat', 'pikatchu ', ' cat ']) 508 | 509 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 510 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 511 | 512 | assert pe is e and pe == e 513 | 514 | assert len(pe.vocabulary) == 3 515 | assert len(pe.vectors) == 3 516 | 517 | # 'dog' 518 | assert [0, 1, 11] in pe.vectors.tolist() 519 | # 'cat' 520 | assert [0, 11, 12] in pe.vectors.tolist() 521 | # pikatchu 522 | assert 
[0, 0, 1] in pe.vectors.tolist() 523 | 524 | assert 'cat' in pe.vocabulary.words 525 | assert 'dog' in pe.vocabulary.words 526 | assert 'pikatchu' in pe.vocabulary.words 527 | 528 | # pikatchu 529 | assert pe.vocabulary.words[0] == 'pikatchu' 530 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 531 | # dog 532 | assert pe.vocabulary.words[1] == 'dog' 533 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 534 | 535 | # cat 536 | assert pe.vocabulary.words[2] == 'cat' 537 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 538 | 539 | assert type(pe.vocabulary) == OrderedVocabulary 540 | 541 | 542 | def test_inplace_transform_word_prefer_shortestword_OrderedVocabulary(): 543 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 544 | 545 | cw = OrderedVocabulary(words=['dog', 'cat', ' pikatchu ', 'pikatchu', ' cat ']) 546 | 547 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 548 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 549 | 550 | assert pe is e and pe == e 551 | 552 | assert len(pe.vocabulary) == 3 553 | assert len(pe.vectors) == 3 554 | 555 | # 'dog' 556 | assert [0, 0, 1] in pe.vectors.tolist() 557 | # 'cat' 558 | assert [0, 1, 11] in pe.vectors.tolist() 559 | # pikatchu 560 | assert [0, 11, 12] in pe.vectors.tolist() 561 | 562 | assert 'cat' in pe.vocabulary.words 563 | assert 'dog' in pe.vocabulary.words 564 | assert 'pikatchu' in pe.vocabulary.words 565 | 566 | # pikatchu 567 | assert pe.vocabulary.words[2] == 'pikatchu' 568 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 569 | 570 | # dog 571 | assert pe.vocabulary.words[0] == 'dog' 572 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 573 | 574 | # cat 575 | assert pe.vocabulary.words[1] == 'cat' 576 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 577 | 578 | assert type(pe.vocabulary) == OrderedVocabulary 579 | 580 | 581 | -------------------------------------------------------------------------------- /web/tests/test_vocabulary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for vocabulary 5 | """ 6 | -------------------------------------------------------------------------------- /web/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Utilities for package""" 5 | 6 | import bz2 7 | import gzip 8 | from os import path 9 | import tarfile 10 | import io 11 | from itertools import islice, chain 12 | from six import string_types, text_type 13 | 14 | 15 | def any2utf8(text, errors='strict', encoding='utf8'): 16 | """Convert a string (unicode or a bytestring in `encoding`) to a utf8-encoded bytestring.""" 17 | if isinstance(text, text_type): 18 | return text.encode('utf8') 19 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 20 | return text_type(text, encoding, errors=errors).encode('utf8') 21 | 22 | 23 | to_utf8 = any2utf8 24 | 25 | # Works just as well with unicode characters 26 | _delchars = [chr(c) for c in range(256)] 27 | _delchars = [x for x in _delchars if not x.isalnum()] 28 | _delchars.remove('\t') 29 | _delchars.remove(' ') 30 | _delchars.remove('-') 31 | _delchars.remove('_') # word2vec, for instance, joins phrases with this character 32 | _delchars = ''.join(_delchars) 33 | _delchars_table = dict((ord(char), None) for char in _delchars) 34 | 35 | 36 | def standardize_string(s, clean_words=True, lower=True, 
language="english"): 37 | """ 38 | Ensures a common string convention across the code base. Converts to unicode and optionally lowercases the string and removes non-alphanumeric characters 39 | 40 | Parameters 41 | ---------- 42 | language: only "english" is currently supported. For "english", non-alphanumeric characters are removed 43 | 44 | lower: if True, the string is lowercased 45 | 46 | clean_words: if True, non-alphanumeric characters (for instance '$', '#' or 'ł') are removed 47 | 48 | Returns 49 | ------- 50 | string: processed string 51 | """ 52 | 53 | assert isinstance(s, string_types) 54 | 55 | if not isinstance(s, text_type): 56 | s = text_type(s, "utf-8") 57 | 58 | if language == "english": 59 | s = (s.lower() if lower else s) 60 | s = (s.translate(_delchars_table) if clean_words else s) 61 | return s 62 | else: 63 | raise NotImplementedError("Standardization is not implemented for languages other than English") 64 | 65 | # Yield `iterable` in chunks of at most `size` items; each chunk is itself an iterator. 66 | def batched(iterable, size): 67 | sourceiter = iter(iterable) 68 | while True: 69 | batchiter = islice(sourceiter, size) 70 | try: 71 | yield chain([next(batchiter)], batchiter) 72 | except StopIteration: 73 | return 74 | 75 | 76 | def _open(file_, mode='r'): 77 | """Open a file object given a filename, an open file or even an archive.""" 78 | if isinstance(file_, string_types): 79 | _, ext = path.splitext(file_) 80 | if ext in {'.gz'}: 81 | if mode == "r" or mode == "rb": 82 | # gzip is extremely slow 83 | return io.BufferedReader(gzip.GzipFile(file_, mode=mode)) 84 | else: 85 | return gzip.GzipFile(file_, mode=mode) 86 | if ext in {'.bz2'}: 87 | return bz2.BZ2File(file_, mode=mode) 88 | else: 89 | return io.open(file_, mode, **({"encoding": "utf-8"} if "b" not in mode else {})) 90 | return file_ 91 | -------------------------------------------------------------------------------- /web/version.py: -------------------------------------------------------------------------------- 1 | # License: MIT 2 | """ 3 | Single place for version information 4 | """ 5 | 6 | __version__ = "0.0.1" 7 | VERSION = tuple(int(x) for x in __version__.split(".")) 8 | -------------------------------------------------------------------------------- /web/vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Vocabulary classes supporting word embeddings. 5 | 6 | NOTE: This file was adapted from the polyglot package 7 | """ 8 | 9 | from io import open, StringIO 10 | from collections import Counter, OrderedDict 11 | import os 12 | from concurrent.futures import ProcessPoolExecutor 13 | 14 | import six 15 | from six.moves import zip 16 | from six import iteritems 17 | from six import text_type as unicode 18 | from six import string_types 19 | from .utils import _open 20 | 21 | 22 | def count(lines): 23 | """ Counts the word frequencies in a list of sentences. 24 | 25 | Note: 26 | This is a helper function for parallel execution of `Vocabulary.from_text` 27 | method. 28 | """ 29 | words = [w for l in lines for w in l.strip().split()] 30 | return Counter(words) 31 | 32 | 33 | class Vocabulary(object): 34 | """ A set of words/tokens that have consistent IDs. 35 | 36 | Attributes: 37 | word_id (dictionary): Mapping from words to IDs. 38 | id_word (dictionary): A reverse map of `word_id`. 39 | """ 40 | 41 | def __init__(self, words=None): 42 | """ Build attributes word_id and id_word from input. 43 | 44 | Args: 45 | words (list/set): list or set of words. 
46 | """ 47 | words = self.sanitize_words(words) 48 | self.word_id = {w: i for i, w in enumerate(words)} 49 | self.id_word = {i: w for w, i in iteritems(self.word_id)} 50 | 51 | def __iter__(self): 52 | """Iterate over the words in a vocabulary.""" 53 | for w, i in sorted(iteritems(self.word_id), key=lambda wc: wc[1]): 54 | yield w 55 | 56 | @property 57 | def words(self): 58 | """ Ordered list of words according to their IDs.""" 59 | return list(self) 60 | 61 | def __unicode__(self): 62 | return u"\n".join(self.words) 63 | 64 | def __str__(self): 65 | if six.PY3: 66 | return self.__unicode__() 67 | return self.__unicode__().encode("utf-8") 68 | 69 | def __getitem__(self, key): 70 | if isinstance(key, string_types) and not isinstance(key, unicode): 71 | key = unicode(key, encoding="utf-8") 72 | return self.word_id[key] 73 | 74 | def add(self, word): 75 | if isinstance(word, string_types) and not isinstance(word, unicode): 76 | word = unicode(word, encoding="utf-8") 77 | 78 | if word in self.word_id: 79 | raise RuntimeError("Word already exists in the vocabulary") 80 | 81 | idx = len(self.word_id) 82 | self.word_id[word] = idx 83 | self.id_word[idx] = word 84 | 85 | def __contains__(self, key): 86 | return key in self.word_id 87 | 88 | def __delitem__(self, key): 89 | """Delete a word from vocabulary. 90 | 91 | Note: 92 | To maintain consecutive IDs, this operation is implemented 93 | with a complexity of \\theta(n). 94 | """ 95 | del self.word_id[key] 96 | self.id_word = dict(enumerate(self.words)) 97 | self.word_id = {w: i for i, w in iteritems(self.id_word)} 98 | 99 | def __len__(self): 100 | return len(self.word_id) 101 | 102 | def sanitize_words(self, words): 103 | """Guarantees that all textual symbols are unicode. 104 | Note: 105 | We do not convert numbers, only strings, to unicode. 106 | We assume that the strings are encoded in utf-8. 107 | """ 108 | _words = [] 109 | for w in words: 110 | if isinstance(w, string_types) and not isinstance(w, unicode): 111 | _words.append(unicode(w, encoding="utf-8")) 112 | else: 113 | _words.append(w) 114 | return _words 115 | 116 | def get(self, k, default=None): 117 | try: 118 | return self[k] 119 | except KeyError: 120 | return default 121 | 122 | def getstate(self): 123 | return list(self.words) 124 | 125 | @classmethod 126 | def from_vocabfile(cls, filename): 127 | """ Construct a Vocabulary out of a vocabulary file. 128 | 129 | Note: 130 | File has the following format (one word per line): word1 131 | word2 132 | """ 133 | words = [x.strip() for x in _open(filename, 'r').read().splitlines()] 134 | return cls(words=words) 135 | 136 | 137 | class OrderedVocabulary(Vocabulary): 138 | """ An ordered list of words/tokens according to their frequency. 139 | 140 | Note: 141 | The word order is assumed to follow word frequency: 142 | the most frequent words appear first in the list. 143 | 144 | Attributes: 145 | word_id (dictionary): Mapping from words to IDs. 146 | id_word (dictionary): A reverse map of `word_id`. 147 | """ 148 | 149 | def __init__(self, words=None): 150 | """ Build attributes word_id and id_word from input. 151 | 152 | Args: 153 | words (list): list of words sorted by frequency (most frequent first). 154 | """ 155 | 156 | words = self.sanitize_words(words) 157 | self.word_id = {w: i for i, w in enumerate(words)} 158 | self.id_word = {i: w for w, i in iteritems(self.word_id)} 159 | 160 | def most_frequent(self, k): 161 | """ Returns a vocabulary with the most frequent `k` words. 
162 | 163 | Args: 164 | k (integer): specifies the top k most frequent words to be returned. 165 | """ 166 | return OrderedVocabulary(words=self.words[:k]) 167 | 168 | 169 | class CountedVocabulary(OrderedVocabulary): 170 | """ List of words and counts sorted according to word count. 171 | """ 172 | 173 | def __init__(self, word_count=None): 174 | """ Build attributes word_id and id_word from input. 175 | 176 | Args: 177 | word_count (dictionary): A dictionary of the type word:count or 178 | list of tuples of the type (word, count). 179 | """ 180 | 181 | if isinstance(word_count, dict): 182 | word_count = iteritems(word_count) 183 | sorted_counts = list(sorted(word_count, key=lambda wc: wc[1], reverse=True)) 184 | words = [w for w, c in sorted_counts] 185 | super(CountedVocabulary, self).__init__(words=words) 186 | self.word_count = OrderedDict(sorted_counts) 187 | 188 | def most_frequent(self, k): 189 | """ Returns a vocabulary with the most frequent `k` words. 190 | 191 | Args: 192 | k (integer): specifies the top k most frequent words to be returned. 193 | """ 194 | word_count = [(w, self.word_count[w]) for w in self.words[:k]] 195 | return CountedVocabulary(word_count=word_count) 196 | 197 | def min_count(self, n=1): 198 | """ Returns a vocabulary after eliminating the words that appear < `n`. 199 | 200 | Args: 201 | n (integer): specifies the minimum word frequency allowed. 202 | """ 203 | word_count = [(w, c) for w, c in iteritems(self.word_count) if c >= n] 204 | return CountedVocabulary(word_count=word_count) 205 | 206 | def __unicode__(self): 207 | return u"\n".join([u"{}\t{}".format(w, self.word_count[w]) for w in self.words]) 208 | 209 | def __delitem__(self, key): 210 | super(CountedVocabulary, self).__delitem__(key) 211 | self.word_count = OrderedDict([(w, self.word_count[w]) for w in self]) 212 | 213 | def getstate(self): 214 | words = list(self.words) 215 | counts = [self.word_count[w] for w in words] 216 | return (words, counts) 217 | 218 | @staticmethod 219 | def from_vocabfile(filename): 220 | """ Construct a CountedVocabulary out of a vocabulary file. 221 | 222 | Note: 223 | File has the following format word1 count1 224 | word2 count2 225 | """ 226 | word_count = [x.strip().split() for x in _open(filename, 'r').read().splitlines()] 227 | word_count = OrderedDict([(w, int(c)) for w, c in word_count]) 228 | return CountedVocabulary(word_count=word_count) 229 | 230 | 231 | --------------------------------------------------------------------------------
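The vocabulary classes above are consumed by the ``Embedding`` class exercised in ``web/tests/test_transform_words.py``. The following is a minimal usage sketch, mirroring the behaviour asserted in those tests; it assumes the package is importable as ``web`` (i.e. ``web.vocabulary`` and ``web.embedding``)::

    import numpy as np

    from web.embedding import Embedding
    from web.vocabulary import CountedVocabulary

    # Counts may contain near-duplicate surface forms, e.g. an untrimmed ' cat '.
    vocab = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)])

    # One vector per entry; CountedVocabulary orders words by decreasing count,
    # so the rows below correspond to 'dog', 'cat' and ' cat ' respectively.
    vectors = np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])
    emb = Embedding(vocabulary=vocab, vectors=vectors)

    # Normalize surface forms; when two words collide after stripping, the entry
    # with the higher count is kept (see the "prefer_occurences" tests above).
    clean = emb.transform_words(lambda w: w.strip(), inplace=False)

    print(clean.vocabulary.words)                   # ['dog', 'cat']
    print(clean.vocabulary.getstate())              # (['dog', 'cat'], [60, 50])
    print(clean.vocabulary.most_frequent(1).words)  # ['dog']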