├── .gitignore ├── .travis.yml ├── CHANGES.rst ├── LICENSE ├── README.rst ├── examples ├── evaluate_similarity.ipynb ├── evaluate_similarity.py ├── solve_analogy.ipynb └── solve_analogy.py ├── requirements.txt ├── scripts ├── evaluate_embeddings.py ├── evaluate_on_all.py └── word2vec_wikipedia │ ├── process_wiki.py │ └── train.py ├── setup.py └── web ├── __init__.py ├── _utils ├── __init__.py └── compat.py ├── analogy.py ├── datasets ├── __init__.py ├── analogy.py ├── categorization.py ├── similarity.py └── utils.py ├── embedding.py ├── embeddings.py ├── evaluate.py ├── tests ├── test_analogy.py ├── test_categorization.py ├── test_embedding.py ├── test_fetchers.py ├── test_similarity.py ├── test_transform_words.py └── test_vocabulary.py ├── utils.py ├── version.py └── vocabulary.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Pycharm 62 | .idea/ 63 | /scripts/*.csv 64 | 65 | 66 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | 5 | python: 6 | - 2.7 7 | - 3.4 8 | 9 | addons: 10 | apt: 11 | packages: 12 | - libblas-dev 13 | - liblapack-dev 14 | - gfortran 15 | - python-numpy 16 | - python-scipy 17 | 18 | before_install: 19 | - pip install -U pip 20 | 21 | install: 22 | - travis_wait travis_retry pip install -r requirements.txt 23 | - travis_retry python setup.py install 24 | 25 | script: python setup.py test -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kudkudak/word-embeddings-benchmarks/c78272b8c1374e5e518915a240ab2b348b59f44e/CHANGES.rst -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010-2015 Google, Inc. 
http://angularjs.org
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | Word Embeddings Benchmarks
2 | ==========================
3 | 
4 | .. image:: https://travis-ci.org/kudkudak/word-embeddings-benchmarks.svg?branch=master
5 | 
6 | The Word Embeddings Benchmarks (web) package provides methods for easily evaluating and reporting
7 | results on common benchmarks (analogy, similarity and categorization).
8 | 
9 | The research goal of the package is to help drive word-embedding research by making reproducible
10 | results easily accessible (the literature currently contains many contradictory results).
11 | It should also help answer the question of whether we need new methods for evaluating word embeddings.
12 | 
13 | To evaluate your embedding (converted to word2vec format or a Python dict pickle)
14 | on all fast-running benchmarks, run ``./scripts/evaluate_on_all.py`` with the path to your embedding file.
15 | See `here `_ for results on the embeddings available in the package.
16 | 
17 | Warnings and Disclaimers:
18 | 
19 | * The analogy test does not internally normalize word embeddings.
20 | * **The package is currently under development; we expect an official release within the next few months**. The main issue you may hit at the moment is rather long embedding loading times (especially if you use the fetchers).
21 | 
22 | Please also refer to our recent publication on evaluation methods: https://arxiv.org/abs/1702.02170.
23 | 
24 | Features:
25 | 
26 | * scikit-learn API and conventions
27 | * 18 popular datasets
28 | * 11 word embeddings (word2vec, HPCA, morphoRNNLM, GloVe, LexVec, ConceptNet, HDC/PDC and others)
29 | * methods to solve analogy, similarity and categorization tasks
30 | 
31 | Included datasets:
32 | 
33 | * TR9856
34 | * WordRep
35 | * Google Analogy
36 | * MSR Analogy
37 | * SemEval2012
38 | * AP
39 | * BLESS
40 | * Battig
41 | * ESSLI (1a, 2b, 2c)
42 | * WS353
43 | * MTurk
44 | * RG65
45 | * RW
46 | * SimLex999
47 | * MEN
48 | 
49 | Note: the embeddings are currently not hosted on a proper server; if the download is too slow, consider downloading the embeddings manually from the original sources referenced in the docstrings.
50 | 
51 | Dependencies
52 | ============
53 | 
54 | Please see ``requirements.txt``.
55 | 
56 | Install
57 | =======
58 | 
59 | This package uses setuptools.
You can install it running:: 60 | 61 | python setup.py install 62 | 63 | If you have problems during this installation. First you may need to install the dependencies:: 64 | 65 | pip install -r requirements.txt 66 | 67 | If you already have the dependencies listed in ``requirements.txt`` installed, 68 | to install in your home directory, use:: 69 | 70 | python setup.py install --user 71 | 72 | To install for all users on Unix/Linux:: 73 | 74 | python setup.py build 75 | sudo python setup.py install 76 | 77 | You can also install it in development mode with:: 78 | 79 | python setup.py develop 80 | 81 | 82 | Examples 83 | ======== 84 | See `examples` folder. 85 | 86 | License 87 | ======= 88 | Code is licensed under MIT, however available embeddings distributed within package might be under different license. If you are unsure please reach to authors (references are included in docstrings) 89 | -------------------------------------------------------------------------------- /examples/evaluate_similarity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "from six import iteritems\n", 11 | "from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999\n", 12 | "from web.embeddings import fetch_GloVe\n", 13 | "from web.evaluate import evaluate_similarity" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# Configure logging\n", 25 | "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "File already downloaded, skipping\n" 38 | ] 39 | }, 40 | { 41 | "name": "stderr", 42 | "output_type": "stream", 43 | "text": [ 44 | "05:49:40 INFO:Tranformed 400000 into 381871 words\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# Fetch GloVe embedding (warning: it might take few minutes)\n", 50 | "w_glove = fetch_GloVe(corpus=\"wiki-6B\", dim=300)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Define tasks\n", 60 | "tasks = {\n", 61 | " \"MEN\": fetch_MEN(),\n", 62 | " \"WS353\": fetch_WS353(),\n", 63 | " \"SIMLEX999\": fetch_SimLex999()\n", 64 | "}" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 7, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Sample data from SIMLEX999: pair \"old\" and \"new\" is assigned score 1.58\n", 77 | "Sample data from MEN: pair \"sun\" and \"sunlight\" is assigned score [ 10.]\n", 78 | "Sample data from WS353: pair \"love\" and \"sex\" is assigned score 6.77\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "# Print sample data\n", 84 | "for name, data in iteritems(tasks):\n", 85 | " print(\"Sample data from {}: pair \\\"{}\\\" and \\\"{}\\\" is assigned score {}\".format(name, data.X[0][0], data.X[0][1], data.y[0]))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stderr", 95 | "output_type": "stream", 96 | "text": [ 97 | "05:51:18 
WARNING:Missing 24 words. Will replace them with mean vector\n" 98 | ] 99 | }, 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Spearman correlation of scores on SIMLEX999 0.370500357109\n", 105 | "Spearman correlation of scores on MEN 0.737464696981\n", 106 | "Spearman correlation of scores on WS353 0.521712569525\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# Calculate results using helper function\n", 112 | "for name, data in iteritems(tasks):\n", 113 | " print \"Spearman correlation of scores on {} {}\".format(name, evaluate_similarity(w_glove, data.X, data.y))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python [default]", 129 | "language": "python", 130 | "name": "python2" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 2 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython2", 142 | "version": "2.7.13" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 1 147 | } 148 | -------------------------------------------------------------------------------- /examples/evaluate_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Simple example showing evaluating embedding on similarity datasets 5 | """ 6 | import logging 7 | from six import iteritems 8 | from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999 9 | from web.embeddings import fetch_GloVe 10 | from web.evaluate import evaluate_similarity 11 | 12 | # Configure logging 13 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 14 | 15 | # Fetch GloVe embedding (warning: it might take few minutes) 16 | w_glove = fetch_GloVe(corpus="wiki-6B", dim=300) 17 | 18 | # Define tasks 19 | tasks = { 20 | "MEN": fetch_MEN(), 21 | "WS353": fetch_WS353(), 22 | "SIMLEX999": fetch_SimLex999() 23 | } 24 | 25 | # Print sample data 26 | for name, data in iteritems(tasks): 27 | print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0])) 28 | 29 | # Calculate results using helper function 30 | for name, data in iteritems(tasks): 31 | print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_glove, data.X, data.y))) 32 | -------------------------------------------------------------------------------- /examples/solve_analogy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import logging\n", 12 | "from web.datasets.analogy import fetch_google_analogy\n", 13 | "from web.embeddings import fetch_SG_GoogleNews" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# Configure logging\n", 25 | "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | 
"execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stderr", 35 | "output_type": "stream", 36 | "text": [ 37 | "05:53:11 INFO:loading projection weights from /home/pocha/web_data/embeddings/GoogleNews-vectors-negative300.bin.gz\n", 38 | "05:53:11 INFO:Loading #3000000 words with 300 dim\n" 39 | ] 40 | }, 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "File already downloaded, skipping\n" 46 | ] 47 | }, 48 | { 49 | "name": "stderr", 50 | "output_type": "stream", 51 | "text": [ 52 | "05:55:25 INFO:Tranformed 3000000 into 2665071 words\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "# Fetch skip-gram trained on GoogleNews corpus and clean it slightly\n", 58 | "w = fetch_SG_GoogleNews(lower=True, clean_words=True)\n", 59 | "\n", 60 | "# Fetch analogy dataset\n", 61 | "data = fetch_google_analogy()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "gram3-comparative\n", 74 | "gram8-plural\n", 75 | "capital-common-countries\n", 76 | "city-in-state\n", 77 | "family\n", 78 | "gram9-plural-verbs\n", 79 | "gram2-opposite\n", 80 | "currency\n", 81 | "gram4-superlative\n", 82 | "gram6-nationality-adjective\n", 83 | "gram7-past-tense\n", 84 | "gram5-present-participle\n", 85 | "capital-world\n", 86 | "gram1-adjective-to-adverb\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "for cat in (set(data.category)):\n", 92 | " print(cat)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Question: bangkok is to thailand as havana is to ?\n", 105 | "Answer: cuba\n", 106 | "Predicted: asi\n", 107 | "Question: baku is to azerbaijan as dushanbe is to ?\n", 108 | "Answer: tajikistan\n", 109 | "Predicted: tajikistan\n", 110 | "Question: rome is to italy as windhoek is to ?\n", 111 | "Answer: namibia\n", 112 | "Predicted: otjiwarongo\n", 113 | "Question: comfortable is to uncomfortable as clear is to ?\n", 114 | "Answer: unclear\n", 115 | "Predicted: abundantly_clear\n", 116 | "Question: slow is to slowing as describe is to ?\n", 117 | "Answer: describing\n", 118 | "Predicted: describing\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# Pick a sample of data and calculate answers\n", 124 | "subset = [50, 1000, 4000, 10000, 14000]\n", 125 | "for id in subset:\n", 126 | " w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2]\n", 127 | " print(\"Question: {} is to {} as {} is to ?\".format(w1, w2, w3))\n", 128 | " print(\"Answer: \" + data.y[id])\n", 129 | " print(\"Predicted: \" + \" \".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3])))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python [default]", 145 | "language": "python", 146 | "name": "python2" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 2 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython2", 158 | "version": "2.7.13" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 1 163 | } 164 | 
-------------------------------------------------------------------------------- /examples/solve_analogy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Simple example showing answering analogy questions 5 | """ 6 | import logging 7 | from web.datasets.analogy import fetch_google_analogy 8 | from web.embeddings import fetch_SG_GoogleNews 9 | 10 | # Configure logging 11 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 12 | 13 | # Fetch skip-gram trained on GoogleNews corpus and clean it slightly 14 | w = fetch_SG_GoogleNews(lower=True, clean_words=True) 15 | 16 | # Fetch analogy dataset 17 | data = fetch_google_analogy() 18 | 19 | for cat in (set(data.category)): 20 | print(cat) 21 | 22 | # Pick a sample of data and calculate answers 23 | subset = [50, 1000, 4000, 10000, 14000] 24 | for id in subset: 25 | w1, w2, w3 = data.X[id][0], data.X[id][1], data.X[id][2] 26 | print("Question: {} is to {} as {} is to ?".format(w1, w2, w3)) 27 | print("Answer: " + data.y[id]) 28 | print("Predicted: " + " ".join(w.nearest_neighbors(w[w2] - w[w1] + w[w3], exclude=[w1, w2, w3]))) 29 | 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=1.5.0 2 | numpy>=1.10.0 3 | Cython 4 | pandas==0.19 5 | pytest>=2.8.3 6 | scipy>=0.9 7 | scikit-learn>=0.16.1 8 | seaborn>=0.6.0 9 | futures 10 | tqdm 11 | docopt 12 | -------------------------------------------------------------------------------- /scripts/evaluate_embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script evaluates all embeddings available in the package 5 | and saves .csv results 6 | 7 | Usage: 8 | 9 | ./evaluate_embeddings 10 | """ 11 | from web.evaluate import evaluate_on_all 12 | from web import embeddings 13 | from six import iteritems 14 | from multiprocessing import Pool 15 | from os import path 16 | import logging 17 | import optparse 18 | import multiprocessing 19 | 20 | parser = optparse.OptionParser() 21 | parser.add_option("-j", "--n_jobs", type="int", default=4) 22 | parser.add_option("-o", "--output_dir", type="str", default="") 23 | (opts, args) = parser.parse_args() 24 | 25 | # Configure logging 26 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 27 | logger = logging.getLogger(__name__) 28 | 29 | jobs = [] 30 | 31 | ## GloVe 32 | 33 | for dim in [50, 100, 200, 300]: 34 | jobs.append(["fetch_GloVe", {"dim": dim, "corpus": "wiki-6B"}]) 35 | 36 | for dim in [25, 50, 100, 200]: 37 | jobs.append(["fetch_GloVe", {"dim": dim, "corpus": "twitter-27B"}]) 38 | 39 | 40 | for corpus in ["common-crawl-42B", "common-crawl-840B"]: 41 | jobs.append(["fetch_GloVe", {"dim": 300, "corpus": corpus}]) 42 | 43 | ## NMT 44 | 45 | jobs.append(["fetch_NMT", {"which": "FR"}]) 46 | jobs.append(["fetch_NMT", {"which": "DE"}]) 47 | 48 | ## PDC and HDC 49 | 50 | for dim in [50, 100, 300]: 51 | jobs.append(["fetch_PDC", {"dim": dim}]) 52 | jobs.append(["fetch_HDC", {"dim": dim}]) 53 | 54 | ## SG 55 | 56 | jobs.append(["fetch_SG_GoogleNews", {}]) 57 | 58 | ## LexVec 59 | 60 | jobs.append(["fetch_LexVec", {}]) 61 | 62 | ## ConceptNet Numberbatch 63 | jobs.append(["fetch_conceptnet_numberbatch", {}]) 64 | 65 | 
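# Each entry in `jobs` is just a [fetcher_name, kwargs] pair; run_job() at the
# bottom of this script resolves the fetcher from web.embeddings by name, roughly:
#
#     w = getattr(embeddings, "fetch_GloVe")(dim=300, corpus="wiki-6B")
#     evaluate_on_all(w).to_csv(...)
#
# so adding another embedding to the sweep only requires appending one more pair.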
## FastText 66 | jobs.append(["fetch_FastText", {}]) 67 | 68 | 69 | def run_job(j): 70 | fn, kwargs = j 71 | outf = path.join(opts.output_dir, fn + "_" + "_".join(str(k) + "=" + str(v) for k, v in iteritems(kwargs))) + ".csv" 72 | logger.info("Processing " + outf) 73 | if not path.exists(outf): 74 | w = getattr(embeddings, fn)(**kwargs) 75 | res = evaluate_on_all(w) 76 | res.to_csv(outf) 77 | 78 | 79 | if __name__ == "__main__": 80 | Pool(opts.n_jobs).map(run_job, jobs) 81 | -------------------------------------------------------------------------------- /scripts/evaluate_on_all.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script calculates embedding results against all available fast running 5 | benchmarks in the repository and saves results as single row csv table. 6 | 7 | Usage: ./evaluate_on_all -f -o 8 | 9 | NOTE: 10 | * script doesn't evaluate on WordRep (nor its subset) as it is non standard 11 | for now and long running (unless some nearest neighbor approximation is used). 12 | 13 | * script is using CosAdd for calculating analogy answer. 14 | 15 | * script is not reporting results per category (for instance semantic/syntactic) in analogy benchmarks. 16 | It is easy to change it by passing category parameter to evaluate_analogy function (see help). 17 | """ 18 | from optparse import OptionParser 19 | import logging 20 | import os 21 | from web.embeddings import fetch_GloVe, load_embedding 22 | from web.datasets.utils import _get_dataset_dir 23 | 24 | from web.evaluate import evaluate_on_all 25 | 26 | 27 | # Configure logging 28 | logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S') 29 | logger = logging.getLogger(__name__) 30 | 31 | parser = OptionParser() 32 | parser.add_option("-f", "--file", dest="filename", 33 | help="Path to the file with embedding. If relative will load from data directory.", 34 | default=None) 35 | 36 | parser.add_option("-p", "--format", dest="format", 37 | help="Format of the embedding, possible values are: word2vec, word2vec_bin, dict and glove.", 38 | default=None) 39 | 40 | parser.add_option("-o", "--output", dest="output", 41 | help="Path where to save results.", 42 | default=None) 43 | 44 | parser.add_option("-c", "--clean_words", dest="clean_words", 45 | help="Clean_words argument passed to load_embedding function. 
If set to True will remove" 46 | "most of the non-alphanumeric characters, which should speed up evaluation.", 47 | default=False) 48 | 49 | if __name__ == "__main__": 50 | (options, args) = parser.parse_args() 51 | 52 | # Load embeddings 53 | fname = options.filename 54 | if not fname: 55 | w = fetch_GloVe(corpus="wiki-6B", dim=300) 56 | else: 57 | if not os.path.isabs(fname): 58 | fname = os.path.join(_get_dataset_dir(), fname) 59 | 60 | format = options.format 61 | 62 | if not format: 63 | _, ext = os.path.splitext(fname) 64 | if ext == ".bin": 65 | format = "word2vec_bin" 66 | elif ext == ".txt": 67 | format = "word2vec" 68 | elif ext == ".pkl": 69 | format = "dict" 70 | 71 | assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format" 72 | 73 | load_kwargs = {} 74 | if format == "glove": 75 | load_kwargs['vocab_size'] = sum(1 for line in open(fname)) 76 | load_kwargs['dim'] = len(next(open(fname)).split()) - 1 77 | 78 | w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words, 79 | load_kwargs=load_kwargs) 80 | 81 | out_fname = options.output if options.output else "results.csv" 82 | 83 | results = evaluate_on_all(w) 84 | 85 | logger.info("Saving results...") 86 | print(results) 87 | results.to_csv(out_fname) 88 | -------------------------------------------------------------------------------- /scripts/word2vec_wikipedia/process_wiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Usage: ./process_wiki.py 6 | 7 | Adapted from http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim 8 | 9 | TODO: add shuffle? 10 | """ 11 | 12 | import logging 13 | import os.path 14 | import sys 15 | import tqdm 16 | 17 | from gensim.corpora import WikiCorpus 18 | 19 | if __name__ == '__main__': 20 | program = os.path.basename(sys.argv[0]) 21 | logger = logging.getLogger(program) 22 | 23 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 24 | logging.root.setLevel(level=logging.INFO) 25 | logger.info("running %s" % ' '.join(sys.argv)) 26 | 27 | # check and process input arguments 28 | if len(sys.argv) < 3: 29 | print globals()['__doc__'] % locals() 30 | sys.exit(1) 31 | inp, outp = sys.argv[1:3] 32 | space = " " 33 | i = 0 34 | 35 | output = open(outp, 'w') 36 | wiki = WikiCorpus(inp, lemmatize=False, dictionary={}) 37 | for text in wiki.get_texts(): 38 | output.write(space.join(text) + "\n") 39 | i = i + 1 40 | if (i % 10000 == 0): 41 | logger.info("Saved " + str(i) + " articles") 42 | 43 | output.close() 44 | logger.info("Finished Saved " + str(i) + " articles") 45 | 46 | -------------------------------------------------------------------------------- /scripts/word2vec_wikipedia/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Usage: ./train.py 6 | 7 | Adapted from http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim 8 | """ 9 | 10 | import logging 11 | import os.path 12 | import sys 13 | from gensim.models import Word2Vec 14 | from gensim.models.word2vec import LineSentence 15 | 16 | if __name__ == '__main__': 17 | program = os.path.basename(sys.argv[0]) 18 | logger = logging.getLogger(program) 19 | 20 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 21 | logging.root.setLevel(level=logging.INFO) 22 | 
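    # Note: the Word2Vec call below targets the gensim API that was current when
    # this script was written; in gensim >= 4.0 the `size` argument is named
    # `vector_size` and the text-format export is done via
    # `model.wv.save_word2vec_format(...)` rather than `model.save_word2vec_format(...)`.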
logger.info("running %s" % ' '.join(sys.argv)) 23 | 24 | # check and process input arguments 25 | if len(sys.argv) < 3: 26 | print globals()['__doc__'] % locals() 27 | sys.exit(1) 28 | inp, outp1 = sys.argv[1:4] 29 | 30 | # NOTE: it doesn't shuffle data between passes, which might degrade performance 31 | model = Word2Vec(LineSentence(inp), 32 | size=300, 33 | negative=5, 34 | workers=5) 35 | 36 | model.save_word2vec_format(outp1, binary=False) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | web 5 | ---- 6 | 7 | Word Embeddings Benchmarks 8 | """ 9 | 10 | from __future__ import print_function 11 | 12 | import os.path as op 13 | import io 14 | import sys 15 | 16 | from setuptools import setup, find_packages 17 | from setuptools.command.test import test as TestCommand 18 | 19 | 20 | # long description 21 | def read(*filenames, **kwargs): 22 | encoding = kwargs.get('encoding', 'utf-8') 23 | sep = kwargs.get('sep', '\n') 24 | buf = [] 25 | for filename in filenames: 26 | with io.open(filename, encoding=encoding) as f: 27 | buf.append(f.read()) 28 | return sep.join(buf) 29 | 30 | 31 | # Get version without importing, which avoids dependency issues 32 | MODULE_NAME = find_packages(exclude=['tests'])[0] 33 | VERSION_PYFILE = op.join(MODULE_NAME, 'version.py') 34 | # set __version__ variable 35 | exec (compile(read(VERSION_PYFILE), VERSION_PYFILE, 'exec')) 36 | 37 | # INSTALL_REQUIRES = list(parse_requirements('requirements.txt')) 38 | # req_files = ['requirements.txt', 'pip_requirements.txt'] 39 | 40 | LICENSE = 'New BSD' 41 | 42 | setup_dict = dict( 43 | name=MODULE_NAME, 44 | version=__version__, 45 | description='Word Embedding Benchmarks', 46 | 47 | license='New BSD', 48 | author='Stanislaw Jastrzebski', 49 | author_email='grimghil@gmail.com', 50 | maintainer='Stanislaw Jastrzebski', 51 | maintainer_email='grimghil@gmail.com', 52 | 53 | packages=find_packages(), 54 | 55 | install_requires=['numpy', 56 | 'scipy', 57 | 'scikit-learn'], 58 | 59 | extra_files=['CHANGES.rst', 'COPYING', 'README.rst'], 60 | 61 | scripts=[], 62 | 63 | long_description=read('README.rst', 'CHANGES.rst'), 64 | 65 | platforms='Linux/MacOSX', 66 | 67 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 68 | classifiers=[ 69 | 'Programming Language :: Python', 70 | 'Development Status :: 1 - Alpha', 71 | 'Natural Language :: English', 72 | 'Environment :: Console', 73 | 'Intended Audience :: Machine Learning Research', 74 | 'License :: OSI Approved ::' + LICENSE, 75 | 'Operating System :: OS Independent', 76 | 'Topic :: Software Development :: Libraries :: Python Modules', 77 | 'Topic :: Scientific/Engineering :: Machine Learning', 78 | 'Topic :: Scientific/Engineering :: Information Analysis', 79 | 'Operating System :: POSIX', 80 | 'Operating System :: Unix', 81 | 'Operating System :: MacOS', 82 | 'Programming Language :: Python :: 2', 83 | 'Programming Language :: Python :: 2.7', 84 | 'Programming Language :: Python :: 3', 85 | 'Programming Language :: Python :: 3.5', 86 | ], 87 | 88 | extras_require={ 89 | 'testing': ['pytest', 'pytest-cov'], 90 | } 91 | ) 92 | 93 | 94 | # Python3 support keywords 95 | if sys.version_info >= (3,): 96 | setup_dict['use_2to3'] = False 97 | setup_dict['convert_2to3_doctests'] = [''] 98 | setup_dict['use_2to3_fixers'] = [''] 99 | 100 | 101 | class PyTest(TestCommand): 102 | user_options = [('pytest-args=', 'a', 
"Arguments to pass to py.test")] 103 | 104 | def initialize_options(self): 105 | TestCommand.initialize_options(self) 106 | self.pytest_args = [] 107 | 108 | def finalize_options(self): 109 | TestCommand.finalize_options(self) 110 | self.test_args = [] 111 | self.test_suite = True 112 | 113 | def run_tests(self): 114 | # import here, cause outside the eggs aren't loaded 115 | import pytest 116 | errno = pytest.main(self.pytest_args) 117 | sys.exit(errno) 118 | 119 | 120 | setup_dict.update(dict(tests_require=['pytest'], 121 | cmdclass={'test': PyTest})) 122 | 123 | if __name__ == '__main__': 124 | setup(**setup_dict) 125 | -------------------------------------------------------------------------------- /web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kudkudak/word-embeddings-benchmarks/c78272b8c1374e5e518915a240ab2b348b59f44e/web/__init__.py -------------------------------------------------------------------------------- /web/_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /web/_utils/compat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compatibility layer for Python 3/Python 2 single codebase 3 | """ 4 | import sys 5 | import hashlib 6 | 7 | 8 | if sys.version_info[0] == 3: 9 | import pickle 10 | import io 11 | import urllib 12 | 13 | _basestring = str 14 | cPickle = pickle 15 | StringIO = io.StringIO 16 | BytesIO = io.BytesIO 17 | _urllib = urllib 18 | izip = zip 19 | 20 | def md5_hash(string): 21 | m = hashlib.md5() 22 | m.update(string.encode('utf-8')) 23 | return m.hexdigest() 24 | else: 25 | import cPickle 26 | import StringIO 27 | import urllib 28 | import urllib2 29 | import urlparse 30 | import types 31 | import itertools 32 | 33 | _basestring = basestring 34 | cPickle = cPickle 35 | StringIO = BytesIO = StringIO.StringIO 36 | izip = itertools.izip 37 | 38 | class _module_lookup(object): 39 | modules = [urlparse, urllib2, urllib] 40 | 41 | def __getattr__(self, name): 42 | for module in self.modules: 43 | if hasattr(module, name): 44 | attr = getattr(module, name) 45 | if not isinstance(attr, types.ModuleType): 46 | return attr 47 | raise NotImplementedError( 48 | 'This function has not been imported properly') 49 | 50 | module_lookup = _module_lookup() 51 | 52 | class _urllib(): 53 | request = module_lookup 54 | error = module_lookup 55 | parse = module_lookup 56 | 57 | def md5_hash(string): 58 | m = hashlib.md5() 59 | m.update(string) 60 | return m.hexdigest() 61 | -------------------------------------------------------------------------------- /web/analogy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes and function for answering analogy questions 3 | """ 4 | 5 | import logging 6 | from collections import OrderedDict 7 | import six 8 | from six.moves import range 9 | import scipy 10 | import pandas as pd 11 | from itertools import product 12 | 13 | logger = logging.getLogger(__name__) 14 | import sklearn 15 | from .datasets.analogy import * 16 | from .utils import batched 17 | from web.embedding import Embedding 18 | 19 | class SimpleAnalogySolver(sklearn.base.BaseEstimator): 20 | """ 21 | Answer analogy questions 22 | 23 | Parameters 24 | ---------- 25 | w : Embedding instance 26 | 27 | method : {"add", "mul"} 28 | Method to use when finding analogy 
answer, see "Improving Distributional Similarity 29 | with Lessons Learned from Word Embeddings" O. Levy et al. 2014. 30 | 31 | batch_size : int 32 | Batch size to use while computing accuracy. This is because of extensive memory usage. 33 | 34 | k: int 35 | If not None will select k top most frequent words from embedding before doing analogy prediction 36 | (this can offer significant speedups) 37 | 38 | Note 39 | ---- 40 | It is suggested to normalize and standardize embedding before passing it to SimpleAnalogySolver. 41 | To speed up code consider installing OpenBLAS and setting OMP_NUM_THREADS. 42 | """ 43 | 44 | def __init__(self, w, method="add", batch_size=300, k=None): 45 | self.w = w 46 | self.batch_size = batch_size 47 | self.method = method 48 | self.k = k 49 | 50 | def score(self, X, y): 51 | """ 52 | Calculate accuracy on analogy questions dataset 53 | 54 | Parameters 55 | ---------- 56 | X : array-like, shape (n_samples, 3) 57 | Analogy questions. 58 | 59 | y : array-like, shape (n_samples, ) 60 | Analogy answers. 61 | 62 | Returns 63 | ------- 64 | acc : float 65 | Accuracy 66 | """ 67 | return np.mean(y == self.predict(X)) 68 | 69 | def predict(self, X): 70 | """ 71 | Answer analogy questions 72 | 73 | Parameters 74 | ---------- 75 | X : array-like, shape (n_samples, 3) 76 | Analogy questions. 77 | 78 | Returns 79 | ------- 80 | y_pred : array-like, shape (n_samples, ) 81 | Predicted words. 82 | """ 83 | w = self.w.most_frequent(self.k) if self.k else self.w 84 | words = self.w.vocabulary.words 85 | word_id = self.w.vocabulary.word_id 86 | mean_vector = np.mean(w.vectors, axis=0) 87 | output = [] 88 | 89 | missing_words = 0 90 | for query in X: 91 | for query_word in query: 92 | if query_word not in word_id: 93 | missing_words += 1 94 | if missing_words > 0: 95 | logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words)) 96 | 97 | # Batch due to memory constaints (in dot operation) 98 | for id_batch, batch in enumerate(batched(range(len(X)), self.batch_size)): 99 | ids = list(batch) 100 | X_b = X[ids] 101 | if id_batch % np.floor(len(X) / (10. 
* self.batch_size)) == 0: 102 | logger.info("Processing {}/{} batch".format(int(np.ceil(ids[1] / float(self.batch_size))), 103 | int(np.ceil(X.shape[0] / float(self.batch_size))))) 104 | 105 | A, B, C = np.vstack(w.get(word, mean_vector) for word in X_b[:, 0]), \ 106 | np.vstack(w.get(word, mean_vector) for word in X_b[:, 1]), \ 107 | np.vstack(w.get(word, mean_vector) for word in X_b[:, 2]) 108 | 109 | if self.method == "add": 110 | D = np.dot(w.vectors, (B - A + C).T) 111 | elif self.method == "mul": 112 | D_A = np.log((1.0 + np.dot(w.vectors, A.T)) / 2.0 + 1e-5) 113 | D_B = np.log((1.0 + np.dot(w.vectors, B.T)) / 2.0 + 1e-5) 114 | D_C = np.log((1.0 + np.dot(w.vectors, C.T)) / 2.0 + 1e-5) 115 | D = D_B - D_A + D_C 116 | else: 117 | raise RuntimeError("Unrecognized method parameter") 118 | 119 | # Remove words that were originally in the query 120 | for id, row in enumerate(X_b): 121 | D[[w.vocabulary.word_id[r] for r in row if r in 122 | w.vocabulary.word_id], id] = np.finfo(np.float32).min 123 | 124 | output.append([words[id] for id in D.argmax(axis=0)]) 125 | 126 | return np.array([item for sublist in output for item in sublist]) -------------------------------------------------------------------------------- /web/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'kudkudak' 2 | -------------------------------------------------------------------------------- /web/datasets/analogy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Functions for fetching analogy datasets 5 | """ 6 | 7 | from collections import defaultdict 8 | import glob 9 | import os 10 | import numpy as np 11 | 12 | from sklearn.utils import check_random_state 13 | 14 | from sklearn.datasets.base import Bunch 15 | from .utils import _get_dataset_dir, _fetch_file, _change_list_to_np 16 | from ..utils import standardize_string 17 | 18 | 19 | def fetch_wordrep(subsample=None, rng=None): 20 | """ 21 | Fetch MSR WordRep dataset for testing both syntactic and semantic dataset 22 | 23 | Returns 24 | ------- 25 | data : sklearn.datasets.base.Bunch 26 | dictionary-like object. 
Keys of interest: 27 | 'X': matrix of word pairs 28 | 'y': vector of answers 29 | 'category': name of category 30 | 'category_high_level': name of high level category (semantic/syntactic) 31 | 32 | References 33 | ---------- 34 | Gao, Bin and Bian, Jiang and Liu, Tie-Yan, 35 | "Wordrep: A benchmark for research on learning word representations", 2014 36 | 37 | 38 | Notes 39 | ----- 40 | This dataset is too big to calculate and store all word analogy quadruples, this is 41 | why it returns word paris 42 | 43 | """ 44 | path = _fetch_file(url="https://www.dropbox.com/sh/5k78h9gllvc44vt/AAALLQq-Bge605OIMlmGBbNJa?dl=1", 45 | data_dir="analogy", 46 | uncompress=True, 47 | move="EN-WORDREP/EN-WORDREP.zip", 48 | verbose=0) 49 | 50 | wikipedia_dict = glob.glob(os.path.join(path, "Pairs_from_Wikipedia_and_Dictionary/*.txt")) 51 | wordnet = glob.glob(os.path.join(path, "Pairs_from_WordNet/*.txt")) 52 | 53 | # This dataset is too big to calculate and store all word analogy quadruples 54 | word_pairs = [] 55 | category = [] 56 | category_high_level = [] 57 | 58 | files = wikipedia_dict + wordnet 59 | 60 | for file_name in files: 61 | c = os.path.basename(file_name).split(".")[0] 62 | c = c[c.index("-")+1:] 63 | with open(file_name, "r") as f: 64 | for l in f.read().splitlines(): 65 | word_pairs.append(standardize_string(l).split()) 66 | category.append(c) 67 | category_high_level.append("wikipedia-dict" if file_name in wikipedia_dict else "wordnet") 68 | 69 | if subsample: 70 | assert 0 <= subsample <= 1.0 71 | rng = check_random_state(rng) 72 | ids = rng.choice(range(len(word_pairs)), int(subsample * len(word_pairs)), replace=False) 73 | word_pairs = [word_pairs[i] for i in ids] 74 | category = [category[i] for i in ids] 75 | category_high_level = [category_high_level[i] for i in ids] 76 | 77 | wordnet_categories = {'Antonym', 78 | 'Attribute', 79 | 'Causes', 80 | 'DerivedFrom', 81 | 'Entails', 82 | 'HasContext', 83 | 'InstanceOf', 84 | 'IsA', 85 | 'MadeOf', 86 | 'MemberOf', 87 | 'PartOf', 88 | 'RelatedTo', 89 | 'SimilarTo'} 90 | 91 | wikipedia_categories = {'adjective-to-adverb', 92 | 'all-capital-cities', 93 | 'city-in-state', 94 | 'comparative', 95 | 'currency', 96 | 'man-woman', 97 | 'nationality-adjective', 98 | 'past-tense', 99 | 'plural-nouns', 100 | 'plural-verbs', 101 | 'present-participle', 102 | 'superlative'} 103 | 104 | return Bunch(category_high_level=np.array(category_high_level), 105 | X=np.array(word_pairs), 106 | category=np.array(category), 107 | wikipedia_categories=wordnet_categories, 108 | wordnet_categories=wikipedia_categories) 109 | 110 | 111 | def fetch_google_analogy(): 112 | """ 113 | Fetch Google dataset for testing both semantic and syntactic analogies. 114 | 115 | Returns 116 | ------- 117 | data : sklearn.datasets.base.Bunch 118 | dictionary-like object. Keys of interest: 119 | 'X': matrix of word questions 120 | 'y': vector of answers 121 | 'category': name of category 122 | 'category_high_level': name of high level category (semantic/syntactic) 123 | 124 | References 125 | ---------- 126 | Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and Dean, Jeff, 127 | "Distributed representations of words and phrases and their compositionality", 2013 128 | 129 | Notes 130 | ----- 131 | This dataset is a subset of WordRep dataset. 
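Examples
--------
A minimal usage sketch (the file is downloaded on first call; outputs are omitted
here because they depend on the fetched data)::

    >>> data = fetch_google_analogy()                              # doctest: +SKIP
    >>> question, answer = data.X[0], data.y[0]                    # three query words and the expected fourth
    >>> semantic = data.X[data.category_high_level == "semantic"]  # boolean masks work on the Bunch arrays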
132 | 133 | """ 134 | 135 | url = "https://www.dropbox.com/s/eujtyfb5zem1mim/EN-GOOGLE.txt?dl=1" 136 | with open(_fetch_file(url, "analogy/EN-GOOGLE", verbose=0), "r") as f: 137 | L = f.read().splitlines() 138 | 139 | # Simple 4 word analogy questions with categories 140 | questions = [] 141 | answers = [] 142 | category = [] 143 | cat = None 144 | for l in L: 145 | if l.startswith(":"): 146 | cat =l.lower().split()[1] 147 | else: 148 | words = standardize_string(l).split() 149 | questions.append(words[0:3]) 150 | answers.append(words[3]) 151 | category.append(cat) 152 | 153 | assert set(category) == set(['gram3-comparative', 'gram8-plural', 'capital-common-countries', 154 | 'city-in-state', 'family', 'gram9-plural-verbs', 'gram2-opposite', 155 | 'currency', 'gram4-superlative', 'gram6-nationality-adjective', 156 | 'gram7-past-tense', 157 | 'gram5-present-participle', 'capital-world', 'gram1-adjective-to-adverb']) 158 | 159 | 160 | syntactic = set([c for c in set(category) if c.startswith("gram")]) 161 | category_high_level = [] 162 | for cat in category: 163 | category_high_level.append("syntactic" if cat in syntactic else "semantic") 164 | 165 | # dtype=object for memory efficiency 166 | return Bunch(X=np.vstack(questions).astype("object"), 167 | y=np.hstack(answers).astype("object"), 168 | category=np.hstack(category).astype("object"), 169 | category_high_level=np.hstack(category_high_level).astype("object")) 170 | 171 | 172 | 173 | def fetch_msr_analogy(): 174 | """ 175 | Fetch MSR dataset for testing performance on syntactic analogies 176 | 177 | Returns 178 | ------- 179 | data : sklearn.datasets.base.Bunch 180 | dictionary-like object. Keys of interest: 181 | 'X': matrix of word questions 182 | 'y': vector of answers 183 | 'category': name of category 184 | 'category_high_level': name of high level category (noun/adjective/verb) 185 | 186 | References 187 | ---------- 188 | Originally published at http://research.microsoft.com/en-us/projects/rnn/. 189 | 190 | Notes 191 | ----- 192 | Authors description: "more precisely, we tagged 267M words of newspaper text 193 | with Treebank POS tags (Marcus et al., 1993). We then selected 100 of the most frequent comparative adjectives 194 | (words labeled JJR); 100 of the most frequent plural nouns (NNS); 100 of the most frequent possessive nouns 195 | (NN POS); and 100 of the most frequent base form verbs (VB). 196 | We then systematically generated analogy questions by randomly matching each of the 100 words with 5 other words 197 | from the same category, and creating variants. 
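Examples
--------
Illustrative only (the file is downloaded on first call)::

    >>> data = fetch_msr_analogy()              # doctest: +SKIP
    >>> data.X[0], data.y[0]                    # three query words and the expected answer
    >>> sorted(set(data.category_high_level))   # ['adjective', 'noun', 'verb']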
198 | """ 199 | url = "https://www.dropbox.com/s/ne0fib302jqbatw/EN-MSR.txt?dl=1" 200 | with open(_fetch_file(url, "analogy/EN-MSR", verbose=0), "r") as f: 201 | L = f.read().splitlines() 202 | 203 | # Typical 4 words analogy questions 204 | questions = [] 205 | answers = [] 206 | category = [] 207 | for l in L: 208 | words = standardize_string(l).split() 209 | questions.append(words[0:3]) 210 | answers.append(words[4]) 211 | category.append(words[3]) 212 | 213 | verb = set([c for c in set(category) if c.startswith("VB")]) 214 | noun = set([c for c in set(category) if c.startswith("NN")]) 215 | category_high_level = [] 216 | for cat in category: 217 | if cat in verb: 218 | category_high_level.append("verb") 219 | elif cat in noun: 220 | category_high_level.append("noun") 221 | else: 222 | category_high_level.append("adjective") 223 | 224 | assert set([c.upper() for c in category]) == set(['VBD_VBZ', 'VB_VBD', 'VBZ_VBD', 225 | 'VBZ_VB', 'NNPOS_NN', 'JJR_JJS', 'JJS_JJR', 'NNS_NN', 'JJR_JJ', 226 | 'NN_NNS', 'VB_VBZ', 'VBD_VB', 'JJS_JJ', 'NN_NNPOS', 'JJ_JJS', 'JJ_JJR']) 227 | 228 | return Bunch(X=np.vstack(questions).astype("object"), 229 | y=np.hstack(answers).astype("object"), 230 | category=np.hstack(category).astype("object"), 231 | category_high_level=np.hstack(category_high_level).astype("object")) 232 | 233 | 234 | # TODO: rewrite to a more standarized version 235 | def fetch_semeval_2012_2(which="all", which_scoring="golden"): 236 | """ 237 | Fetch dataset used for SEMEVAL 2012 task 2 competition 238 | 239 | Parameters 240 | ------- 241 | which : "all", "train" or "test" 242 | which_scoring: "golden" or "platinium" (see Notes) 243 | 244 | Returns 245 | ------- 246 | data : sklearn.datasets.base.Bunch 247 | dictionary-like object. Keys of interest: 248 | 'X_prot': dictionary keyed on category. Each entry is a matrix of prototype word pairs (see Notes) 249 | 'X': dictionary keyed on category. Each entry is a matrix of question word pairs 250 | 'y': dictionary keyed on category. Each entry is a dictionary word pair -> score 251 | 252 | 'categories_names': dictionary keyed on category. Each entry is a human readable name of 253 | category. 254 | 'categories_descriptions': dictionary keyed on category. Each entry is a human readable description of 255 | category. 256 | 257 | References 258 | ---------- 259 | DA Jurgens et al., 260 | "Measuring degrees of relational similarity. In *SEM 2012: The First Joint Conference on Lexical 261 | and Computational Semantics", 2012 262 | 263 | Notes 264 | ----- 265 | Dataset used in competition was scored as in golden scoring (which_scoring) parameter, however 266 | organiser have release improved labels afterwards (platinium scoring) 267 | 268 | The task is, given two pairs of words, A:B and C:D, determine the degree to which the semantic relations between 269 | A and B are similar to those between C and D. Unlike the more familiar task of semantic relation identification, 270 | which assigns each word pair to a discrete semantic relation class, this task recognizes the continuous range of 271 | degrees of relational similarity. The challenge is to determine the degrees of relational similarity between a 272 | given reference word pair and a variety of other pairs, mostly in the same general semantic relation class as the 273 | reference pair. 
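Examples
--------
A sketch of how the per-category dictionaries line up (illustrative only)::

    >>> bunch = fetch_semeval_2012_2(which="train")          # doctest: +SKIP
    >>> for cat, prototypes in bunch.X_prot.items():         # 3 prototype pairs per category
    ...     questions, scores = bunch.X[cat], bunch.y[cat]   # aligned question pairs and their scores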
274 | """ 275 | assert which in ['all', 'train', 'test'] 276 | assert which_scoring in ['golden', 'platinium'] 277 | 278 | path = _fetch_file(url="https://www.dropbox.com/sh/aarqsfnumx3d8ds/AAB05Mu2HdypP0pudGrNjooaa?dl=1", 279 | data_dir="analogy", 280 | uncompress=True, 281 | move="EN-SEMVAL-2012-2/EN-SEMVAL-2012-2.zip", 282 | verbose=0) 283 | 284 | train_files = set(glob.glob(os.path.join(path, "train*.txt"))) - \ 285 | set(glob.glob(os.path.join(path, "train*_meta.txt"))) 286 | test_files = set(glob.glob(os.path.join(path, "test*.txt"))) - \ 287 | set(glob.glob(os.path.join(path, "test*_meta.txt"))) 288 | 289 | if which == "train": 290 | files = train_files 291 | elif which == "test": 292 | files = test_files 293 | elif which == "all": 294 | files = train_files.union(test_files) 295 | 296 | # Every question is formed as similarity to analogy category that is 297 | # posed as a list of 3 prototype word pairs 298 | questions = defaultdict(list) 299 | prototypes = {} 300 | golden_scores = {} 301 | platinium_scores = {} 302 | scores = {"platinium": platinium_scores, "golden": golden_scores} 303 | categories_names = {} 304 | categories_descriptions = {} 305 | for f in files: 306 | with open(f[0:-4] + "_meta.txt") as meta_f: 307 | meta = meta_f.read().splitlines()[1].split(",") 308 | 309 | with open(os.path.dirname(f) + "/pl-" + os.path.basename(f)) as f_pl: 310 | platinium = f_pl.read().splitlines() 311 | 312 | with open(f) as f_gl: 313 | golden = f_gl.read().splitlines() 314 | 315 | assert platinium[0] == golden[0], ("Incorrect file for ", f) 316 | 317 | c = meta[0] + "_" + meta[1] 318 | categories_names[c] = meta[2] + "_" + meta[3] 319 | categories_descriptions[c] = meta[4] 320 | 321 | prototypes[c] = [l.split(":") for l in \ 322 | platinium[0].replace(": ", ":").replace(" ", ",").replace(".", "").split(",")] 323 | golden_scores[c] = {} 324 | platinium_scores[c] = {} 325 | questions_raw = [] 326 | for line_pl in platinium[1:]: 327 | word_pair, score = line_pl.split() 328 | questions_raw.append(word_pair) 329 | questions[c].append([standardize_string(w) for w in word_pair.split(":")]) 330 | platinium_scores[c][word_pair] = score 331 | 332 | for line_g in golden[1:]: 333 | word_pair, score = line_g.split() 334 | golden_scores[c][word_pair] = score 335 | 336 | # Make scores a list 337 | platinium_scores[c] = [platinium_scores[c][w] for w in questions_raw] 338 | golden_scores[c] = [golden_scores[c][w] for w in questions_raw] 339 | 340 | return Bunch(X_prot=_change_list_to_np(prototypes), 341 | X=_change_list_to_np(questions), 342 | y=scores[which_scoring], 343 | categories_names=categories_names, 344 | categories_descriptions=categories_descriptions) 345 | 346 | 347 | -------------------------------------------------------------------------------- /web/datasets/categorization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Functions for fetching categorization datasets 5 | """ 6 | 7 | from sklearn.datasets.base import Bunch 8 | from .utils import _get_cluster_assignments 9 | 10 | 11 | def fetch_AP(): 12 | """ 13 | Fetch Almuhareb and Abdulrahman categorization dataset 14 | 15 | Returns 16 | ------- 17 | data : sklearn.datasets.base.Bunch 18 | dictionary-like object. 
Keys of interest: 19 | 'clusters': dict of arrays of words representing 20 | 21 | References 22 | ---------- 23 | Almuhareb et al., "Concept learning and categorization from the web", 2005 24 | 25 | Notes 26 | ----- 27 | Authors description: 28 | Our goal was to create a dataset balanced with respect to 29 | three factors: class type, frequency, and ambiguity. 30 | First of all, we aimed to include one class of nouns for 31 | each of the 21 unique beginners of the WordNet noun 32 | hierarchy4 33 | . We chose subclasses for each of these 21 34 | beginners that would represent a reasonably natural cluster: 35 | e.g., the hyponym social occasion for the unique beginner 36 | event. From each such class, we selected between 13 and 21 37 | nouns to be representative concepts for the class (e.g., 38 | ceremony, feast, and graduation for the class social 39 | occasion). 40 | Secondly, we aimed to include about 1/3 high frequency 41 | nouns, 1/3 medium frequency, and 1/3 low frequency. Noun 42 | frequencies where estimated using the British National 43 | Corpus. We considered as highly frequent those nouns with 44 | frequency 1,000 or more; as medium frequent the nouns 45 | with between 1,000 and 100 occurrences; and those between 46 | 100 and 5 as low frequent. 47 | Thirdly, we wanted the dataset to be balanced as to 48 | ambiguity, estimated on the basis of the number of senses in 49 | WordNet. Nouns with 4 or more senses were considered 50 | highly ambiguous; nouns with 2 or 3 senses medium 51 | ambiguous; and nouns with a single sense as not ambiguous. 52 | """ 53 | return _get_cluster_assignments(dataset_name="EN-AP", 54 | url="https://www.dropbox.com/sh/6xu1c1aan8f83p3/AACMyoLwncNhRkUkqvGurYB6a?dl=1") 55 | 56 | 57 | def fetch_BLESS(): 58 | """ 59 | Fetch Baroni and Marco categorization dataset 60 | 61 | Parameters 62 | ------- 63 | 64 | Returns 65 | ------- 66 | data : sklearn.datasets.base.Bunch 67 | dictionary-like object. Keys of interest: 68 | 'X': words 69 | 'y': cluster assignment 70 | 71 | References 72 | ---------- 73 | Baroni et al. "How we BLESSed distributional semantic evaluation", 2011 74 | 75 | Notes 76 | ----- 77 | Data set includes 200 concrete nouns (100 animate and 100 inanimate nouns) 78 | from different classes (e.g., tools, clothing, vehicles, animals, etc.). 79 | """ 80 | return _get_cluster_assignments(dataset_name="EN-BLESS", 81 | url="https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1") 82 | 83 | 84 | def fetch_battig(): 85 | """ 86 | Fetch 1969 Battig dataset 87 | 88 | Returns 89 | ------- 90 | data : sklearn.datasets.base.Bunch 91 | dictionary-like object. Keys of interest: 92 | 'X': words 93 | 'y': cluster assignment 94 | 'freq': frequency of response 95 | 'frequency': Kucera-Francis word frequency 96 | 'rank': rank of frequence within response 97 | 'rfreq': rated frequency 98 | 99 | References 100 | ---------- 101 | W.F Battig & W.E Montague (1968). Category norms for verbal items in 56 categories: A replication 102 | and extension of the Connecticut norms using University of Maryland and Illinois students 103 | (Tech. Rep.) University of Colorado, Boulder, CO (1968) 104 | 105 | Notes 106 | ----- 107 | This dataset comprises a ranked list of 5231 words listed in 56 taxonomic categories by people 108 | who were asked to list as many exemplars of a given category ("a precious stone", "a unit of time", 109 | "a fruit", "a color", etc.). 
Participants had 30s to generate as many responses to each category as 110 | possible, after which time the next category name was presented. 111 | Included in this dataset are all words from the Battig and Montague (1969) norms listed with 112 | freq > 1. 113 | 114 | This is not the same dataset as 'battig' in Baroni et al. "Don’t count, predict! A systematic comparison of 115 | context-counting vs. context-predicting semantic vectors" 116 | """ 117 | data = _get_cluster_assignments(dataset_name="EN-BATTIG", 118 | url="https://www.dropbox.com/sh/ckp4yu7k7xl7u2a/AABhmpgU3ake3T9liA9BR8EBa?dl=1", 119 | sep=",", skip_header=True) 120 | return Bunch(X=data.X[:, 0], y=data.y, 121 | freq=data.X[:, 1], frequency=data.X[:, 2], rank=data.X[:, 3], rfreq=data.X[:, 4]) 122 | 123 | 124 | 125 | def fetch_ESSLI_2c(): 126 | """ 127 | Fetch ESSLI 2c task categorization dataset 128 | 129 | Returns 130 | ------- 131 | data : sklearn.datasets.base.Bunch 132 | dictionary-like object. Keys of interest: 133 | 'X': words 134 | 'y': cluster assignment 135 | 136 | References 137 | ---------- 138 | Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:verb_categorization 139 | 140 | Notes 141 | ----- 142 | The goal of the sub-task is to group verbs into semantic categories. The data set consists of 45 verbs, 143 | belonging to 9 semantic classes. The classification scheme is inspired by P. Vinson & G. Vigliocco (2007), 144 | “Semantic Feature Production Norms for a Large Set of Objects and Events”, Behavior Research Methods, 145 | which in turn closely follows the classification proposed in Levin (1993). The data set consists of 44 concrete 146 | nouns, belonging to 6 semantic categories (four animates and two inanimates). The nouns are included in the 147 | feature norms described in McRae et al. (2005) 148 | """ 149 | return _get_cluster_assignments(dataset_name="EN-ESSLI-2c", 150 | url="https://www.dropbox.com/sh/d3mcyl3b5mawfhm/AAABygW1rguhI4L0XSw_I68ta?dl=1") 151 | 152 | 153 | def fetch_ESSLI_2b(): 154 | """ 155 | Fetch ESSLI 2c task categorization dataset 156 | 157 | Parameters 158 | ------- 159 | 160 | Returns 161 | ------- 162 | data : sklearn.datasets.base.Bunch 163 | dictionary-like object. Keys of interest: 164 | 'X': words 165 | 'y': cluster assignment 166 | 167 | References 168 | ---------- 169 | Originally published at 170 | http://wordspace.collocations.de/doku.php/data:esslli2008:abstract_concrete_nouns_discrimination. 171 | 172 | Notes 173 | ----- 174 | The data set consists of 40 nouns extracted from the MRC Psycholinguistic Database, with ratings by human subjects 175 | on the concreteness scale. The nouns have been classified into three classes: HI, LO and ME being highly, 176 | low and medium abstract nouns. 177 | """ 178 | return _get_cluster_assignments(dataset_name="EN-ESSLI-2b", 179 | url="https://www.dropbox.com/sh/7gdv52gy9vb4mf2/AACExLgHdbvbBrRZBP6CcdDaa?dl=1") 180 | 181 | 182 | def fetch_ESSLI_1a(): 183 | """ 184 | Fetch ESSLI 1a task categorization dataset. 185 | 186 | Returns 187 | ------- 188 | data : sklearn.datasets.base.Bunch 189 | dictionary-like object. Keys of interest: 190 | 'X': words 191 | 'y': cluster assignment 192 | 193 | References 194 | ---------- 195 | Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization. 196 | 197 | Notes 198 | ----- 199 | The goal of the sub-task is to group concrete nouns into semantic categories. 
200 | The data set consists of 44 concrete nouns, belonging to 6 semantic categories (four animates and two inanimates). 201 | The nouns are included in the feature norms described in McRae et al. (2005) 202 | """ 203 | return _get_cluster_assignments(dataset_name="EN-ESSLI-1a", 204 | url="https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1") 205 | -------------------------------------------------------------------------------- /web/datasets/similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Functions for fetching similarity datasets 5 | """ 6 | 7 | import os 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from sklearn.datasets.base import Bunch 12 | 13 | from .utils import _get_as_pd, _fetch_file 14 | 15 | 16 | def fetch_MTurk(): 17 | """ 18 | Fetch MTurk dataset for testing attributional similarity 19 | 20 | Returns 21 | ------- 22 | data : sklearn.datasets.base.Bunch 23 | dictionary-like object. Keys of interest: 24 | 'X': matrix of 2 words per column, 25 | 'y': vector with scores, 26 | 27 | References 28 | ---------- 29 | Radinsky, Kira et al., "A Word at a Time: Computing Word Relatedness Using Temporal Semantic Analysis", 2011 30 | 31 | Notes 32 | ----- 33 | Human labeled examples of word semantic relatedness. The data pairs were generated using an algorithm as 34 | described in the paper by [K. Radinsky, E. Agichtein, E. Gabrilovich, S. Markovitch.]. 35 | Each pair of words was evaluated by 10 people on a scale of 1-5. 36 | 37 | Additionally scores were multiplied by factor of 2. 38 | """ 39 | data = _get_as_pd('https://www.dropbox.com/s/f1v4ve495mmd9pw/EN-TRUK.txt?dl=1', 40 | 'similarity', header=None, sep=" ").values 41 | return Bunch(X=data[:, 0:2].astype("object"), 42 | y=2 * data[:, 2].astype(np.float)) 43 | 44 | 45 | def fetch_MEN(which="all", form="natural"): 46 | """ 47 | Fetch MEN dataset for testing similarity and relatedness 48 | 49 | Parameters 50 | ---------- 51 | which : "all", "test" or "dev" 52 | form : "lem" or "natural" 53 | 54 | Returns 55 | ------- 56 | data : sklearn.datasets.base.Bunch 57 | dictionary-like object. Keys of interest: 58 | 'X': matrix of 2 words per column, 59 | 'y': vector with scores 60 | 61 | References 62 | ---------- 63 | Published at http://clic.cimec.unitn.it/~elia.bruni/MEN.html. 64 | 65 | Notes 66 | ----- 67 | Scores for MEN are calculated differently than in WS353 or SimLex999. 68 | Furthermore scores where rescaled to 0 - 10 scale to match standard scaling. 69 | 70 | The MEN Test Collection contains two sets of English word pairs (one for training and one for testing) 71 | together with human-assigned similarity judgments, obtained by crowdsourcing using Amazon Mechanical 72 | Turk via the CrowdFlower interface. The collection can be used to train and/or test computer algorithms 73 | implementing semantic similarity and relatedness measures. 
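The similarity fetchers that follow all share one Bunch layout: X is an (n, 2) array of word pairs and y a vector of human scores. A minimal sketch of scoring an embedding against MTurk with Spearman correlation (the standard recipe, not a verbatim copy of web/evaluate.py); the 50-dimensional GloVe vectors are used only to keep loading time down:

    import numpy as np
    from scipy.stats import spearmanr

    from web.datasets.similarity import fetch_MTurk
    from web.embeddings import fetch_GloVe

    def cosine(a, b):
        return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    w = fetch_GloVe(corpus="wiki-6B", dim=50)
    data = fetch_MTurk()                   # .X: word pairs, .y: human scores (already doubled, see above)

    mean = np.mean(w.vectors, axis=0)      # crude fallback for out-of-vocabulary words
    predicted = [cosine(w.get(a, mean), w.get(b, mean)) for a, b in data.X]
    print("MTurk Spearman rho: %.3f" % spearmanr(predicted, data.y).correlation)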
74 | """ 75 | if which == "dev": 76 | data = _get_as_pd('https://www.dropbox.com/s/c0hm5dd95xapenf/EN-MEN-LEM-DEV.txt?dl=1', 77 | 'similarity', header=None, sep=" ") 78 | elif which == "test": 79 | data = _get_as_pd('https://www.dropbox.com/s/vdmqgvn65smm2ah/EN-MEN-LEM-TEST.txt?dl=1', 80 | 'similarity/EN-MEN-LEM-TEST', header=None, sep=" ") 81 | elif which == "all": 82 | data = _get_as_pd('https://www.dropbox.com/s/b9rv8s7l32ni274/EN-MEN-LEM.txt?dl=1', 83 | 'similarity', header=None, sep=" ") 84 | else: 85 | raise RuntimeError("Not recognized which parameter") 86 | 87 | if form == "natural": 88 | # Remove last two chars from first two columns 89 | data = data.apply(lambda x: [y if isinstance(y, float) else y[0:-2] for y in x]) 90 | elif form != "lem": 91 | raise RuntimeError("Not recognized form argument") 92 | 93 | return Bunch(X=data.values[:, 0:2].astype("object"), y=data.values[:, 2:].astype(np.float) / 5.0) 94 | 95 | 96 | def fetch_WS353(which="all"): 97 | """ 98 | Fetch WS353 dataset for testing attributional and 99 | relatedness similarity 100 | 101 | Parameters 102 | ---------- 103 | which : 'all': for both relatedness and attributional similarity, 104 | 'relatedness': for relatedness similarity 105 | 'similarity': for attributional similarity 106 | 'set1': as divided by authors 107 | 'set2': as divided by authors 108 | 109 | References 110 | ---------- 111 | Finkelstein, Gabrilovich, "Placing Search in Context: The Concept Revisited†", 2002 112 | Agirre, Eneko et al., "A Study on Similarity and Relatedness Using Distributional and WordNet-based Approaches", 113 | 2009 114 | 115 | Returns 116 | ------- 117 | data : sklearn.datasets.base.Bunch 118 | dictionary-like object. Keys of interest: 119 | 'X': matrix of 2 words per column, 120 | 'y': vector with scores, 121 | 'sd': vector of std of scores if available (for set1 and set2) 122 | """ 123 | if which == "all": 124 | data = _get_as_pd('https://www.dropbox.com/s/eqal5qj97ajaycz/EN-WS353.txt?dl=1', 125 | 'similarity', header=0, sep="\t") 126 | elif which == "relatedness": 127 | data = _get_as_pd('https://www.dropbox.com/s/x94ob9zg0kj67xg/EN-WSR353.txt?dl=1', 128 | 'similarity', header=None, sep="\t") 129 | elif which == "similarity": 130 | data = _get_as_pd('https://www.dropbox.com/s/ohbamierd2kt1kp/EN-WSS353.txt?dl=1', 131 | 'similarity', header=None, sep="\t") 132 | elif which == "set1": 133 | data = _get_as_pd('https://www.dropbox.com/s/opj6uxzh5ov8gha/EN-WS353-SET1.txt?dl=1', 134 | 'similarity', header=0, sep="\t") 135 | elif which == "set2": 136 | data = _get_as_pd('https://www.dropbox.com/s/w03734er70wyt5o/EN-WS353-SET2.txt?dl=1', 137 | 'similarity', header=0, sep="\t") 138 | else: 139 | raise RuntimeError("Not recognized which parameter") 140 | 141 | # We basically select all the columns available 142 | X = data.values[:, 0:2] 143 | y = data.values[:, 2].astype(np.float) 144 | 145 | # We have also scores 146 | if data.values.shape[1] > 3: 147 | sd = np.std(data.values[:, 2:15].astype(np.float), axis=1).flatten() 148 | return Bunch(X=X.astype("object"), y=y, sd=sd) 149 | else: 150 | return Bunch(X=X.astype("object"), y=y) 151 | 152 | 153 | def fetch_RG65(): 154 | """ 155 | Fetch Rubenstein and Goodenough dataset for testing attributional and 156 | relatedness similarity 157 | 158 | Returns 159 | ------- 160 | data : sklearn.datasets.base.Bunch 161 | dictionary-like object. 
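fetch_WS353 above accepts five values for `which`, and the per-pair standard deviation is attached whenever the downloaded file carries the individual rater columns. A small sketch that just enumerates the variants:

    from web.datasets.similarity import fetch_WS353

    for which in ("all", "relatedness", "similarity", "set1", "set2"):
        data = fetch_WS353(which=which)
        print("%-12s %4d pairs, per-pair sd available: %s" % (which, len(data.y), "sd" in data))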
Keys of interest: 162 | 'X': matrix of 2 words per column, 163 | 'y': vector with scores, 164 | 'sd': vector of std of scores if available (for set1 and set2) 165 | 166 | References 167 | ---------- 168 | Rubenstein, Goodenough, "Contextual correlates of synonymy", 1965 169 | 170 | Notes 171 | ----- 172 | Scores were scaled by factor 10/4 173 | """ 174 | data = _get_as_pd('https://www.dropbox.com/s/chopke5zqly228d/EN-RG-65.txt?dl=1', 175 | 'similarity', header=None, sep="\t").values 176 | 177 | return Bunch(X=data[:, 0:2].astype("object"), 178 | y=data[:, 2].astype(np.float) * 10.0 / 4.0) 179 | 180 | 181 | def fetch_RW(): 182 | """ 183 | Fetch Rare Words dataset for testing attributional similarity 184 | 185 | Returns 186 | ------- 187 | data : sklearn.datasets.base.Bunch 188 | dictionary-like object. Keys of interest: 189 | 'X': matrix of 2 words per column, 190 | 'y': vector with scores, 191 | 'sd': vector of std of scores 192 | 193 | References 194 | ---------- 195 | Published at http://www-nlp.stanford.edu/~lmthang/morphoNLM/. 196 | 197 | Notes 198 | ----- 199 | 2034 word pairs that are relatively rare with human similarity scores. Rare word selection: our choices of 200 | rare words (word1) are based on their frequencies – based on five bins (5, 10], (10, 100], (100, 1000], 201 | (1000, 10000], and the affixes they possess. To create a diverse set of candidates, we randomly 202 | select 15 words for each configuration (a frequency bin, an affix). At the scale of Wikipedia, 203 | a word with frequency of 1-5 is most likely a junk word, and even restricted to words with 204 | frequencies above five, there are still many non-English words. To counter such problems, 205 | each word selected is required to have a non-zero number of synsets in WordNet(Miller, 1995). 206 | """ 207 | data = _get_as_pd('https://www.dropbox.com/s/xhimnr51kcla62k/EN-RW.txt?dl=1', 208 | 'similarity', header=None, sep="\t").values 209 | return Bunch(X=data[:, 0:2].astype("object"), 210 | y=data[:, 2].astype(np.float), 211 | sd=np.std(data[:, 3:].astype(np.float))) 212 | 213 | 214 | def fetch_multilingual_SimLex999(which="EN"): 215 | """ 216 | Fetch Multilingual SimLex999 dataset for testing attributional similarity 217 | 218 | Parameters 219 | ------- 220 | which : "EN", "RU", "IT" or "DE" for language 221 | 222 | Returns 223 | ------- 224 | data : sklearn.datasets.base.Bunch 225 | dictionary-like object. Keys of interest: 226 | 'X': matrix of 2 words per column, 227 | 'y': vector with scores, 228 | 'sd': vector of sd of scores, 229 | 230 | References 231 | ---------- 232 | Published at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html. 233 | 234 | Notes 235 | ----- 236 | Scores for EN are different than the original SimLex999 dataset. 237 | 238 | Authors description: 239 | Multilingual SimLex999 resource consists of translations of the SimLex999 word similarity data set to 240 | three languages: German, Italian and Russian. Each of the translated datasets is scored by 241 | 13 human judges (crowdworkers) - all fluent speakers of its language. For consistency, we 242 | also collected human judgments for the original English corpus according to the same protocol 243 | applied to the other languages. This dataset allows to explore the impact of the "judgement language" 244 | (the language in which word pairs are presented to the human judges) on the resulted similarity scores 245 | and to evaluate vector space models on a truly multilingual setup (i.e. 
when both the training and the 246 | test data are multilingual). 247 | """ 248 | if which == "EN": 249 | data = _get_as_pd('https://www.dropbox.com/s/nczc4ao6koqq7qm/EN-MSIM999.txt?dl=1', 250 | 'similarity', header=None, encoding='utf-8', sep=" ") 251 | elif which == "DE": 252 | data = _get_as_pd('https://www.dropbox.com/s/ucpwrp0ahawsdtf/DE-MSIM999.txt?dl=1', 253 | 'similarity', header=None, encoding='utf-8', sep=" ") 254 | elif which == "IT": 255 | data = _get_as_pd('https://www.dropbox.com/s/siqjagyz8dkjb9q/IT-MSIM999.txt?dl=1', 256 | 'similarity', header=None, encoding='utf-8', sep=" ") 257 | elif which == "RU": 258 | data = _get_as_pd('https://www.dropbox.com/s/3v26edm9a31klko/RU-MSIM999.txt?dl=1', 259 | 'similarity', header=None, encoding='utf-8', sep=" ") 260 | else: 261 | raise RuntimeError("Not recognized which parameter") 262 | 263 | # We basically select all the columns available 264 | X = data.values[:, 0:2] 265 | scores = data.values[:, 2:].astype(np.float) 266 | y = np.mean(scores, axis=1) 267 | sd = np.std(scores, axis=1) 268 | 269 | return Bunch(X=X.astype("object"), y=y, sd=sd) 270 | 271 | 272 | def fetch_SimLex999(): 273 | """ 274 | Fetch SimLex999 dataset for testing attributional similarity 275 | 276 | Returns 277 | ------- 278 | data : sklearn.datasets.base.Bunch 279 | dictionary-like object. Keys of interest: 280 | 'X': matrix of 2 words per column, 281 | 'y': vector with scores, 282 | 'sd': vector of sd of scores, 283 | 'conc': matrix with columns conc(w1), conc(w2) and concQ the from dataset 284 | 'POS': vector with POS tag 285 | 'assoc': matrix with columns denoting free association: Assoc(USF) and SimAssoc333 286 | 287 | References 288 | ---------- 289 | Hill, Felix et al., "Simlex-999: Evaluating semantic models with (genuine) similarity estimation", 2014 290 | 291 | Notes 292 | ----- 293 | SimLex-999 is a gold standard resource for the evaluation of models that learn the meaning of words and concepts. 294 | SimLex-999 provides a way of measuring how well models capture similarity, rather than relatedness or 295 | association. The scores in SimLex-999 therefore differ from other well-known evaluation datasets 296 | such as WordSim-353 (Finkelstein et al. 2002). The following two example pairs illustrate the 297 | difference - note that clothes are not similar to closets (different materials, function etc.), 298 | even though they are very much related: coast - shore 9.00 9.10, clothes - closet 1.96 8.00 299 | """ 300 | data = _get_as_pd('https://www.dropbox.com/s/0jpa1x8vpmk3ych/EN-SIM999.txt?dl=1', 301 | 'similarity', sep="\t") 302 | 303 | # We basically select all the columns available 304 | X = data[['word1', 'word2']].values 305 | y = data['SimLex999'].values 306 | sd = data['SD(SimLex)'].values 307 | conc = data[['conc(w1)', 'conc(w2)', 'concQ']].values 308 | POS = data[['POS']].values 309 | assoc = data[['Assoc(USF)', 'SimAssoc333']].values 310 | 311 | return Bunch(X=X.astype("object"), y=y, sd=sd, conc=conc, POS=POS, assoc=assoc) 312 | 313 | 314 | def fetch_TR9856(): 315 | """ 316 | Fetch TR9856 dataset for testing multi-word term relatedness 317 | 318 | Returns 319 | ------- 320 | data : sklearn.datasets.base.Bunch 321 | dictionary-like object. Keys of interest: 322 | 'X': matrix of 2 words per column, 323 | 'y': vector with scores, 324 | 'topic': vector of topics providing context for each pair of terms 325 | 326 | References 327 | ---------- 328 | Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015. 
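Besides X and y, fetch_SimLex999 above exposes the auxiliary columns of the original file, which makes it easy to score sub-slices. A sketch under two assumptions about the upstream file (not guaranteed by this code): the POS column uses tags such as 'N', and SimAssoc333 is a 0/1 flag marking the strongly associated subset.

    from web.datasets.similarity import fetch_SimLex999

    data = fetch_SimLex999()

    nouns = data.POS.flatten() == "N"       # assumed tag value; inspect the column if unsure
    assoc333 = data.assoc[:, 1] == 1        # SimAssoc333 assumed to be a 0/1 flag

    print("all pairs:  ", len(data.y))
    print("noun pairs: ", int(nouns.sum()))
    print("SimAssoc333:", int(assoc333.sum()))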
329 | 330 | Notes 331 | ----- 332 | """ 333 | data = pd.read_csv(os.path.join(_fetch_file( 334 | 'https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip', 335 | 'similarity', uncompress=True, verbose=0), 336 | 'IBM_Debater_(R)_TR9856.v0.2', 'TermRelatednessResults.csv'), encoding="iso-8859-1") 337 | 338 | # We basically select all the columns available 339 | X = data[['term1', 'term2']].values 340 | y = data['score'].values 341 | topic = data['topic'].values 342 | 343 | return Bunch(X=X.astype("object"), y=y, topic=topic) 344 | -------------------------------------------------------------------------------- /web/datasets/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloading datasets: utility functions 3 | 4 | This is a copy of nilearn.datasets. 5 | """ 6 | 7 | import errno 8 | import os 9 | import numpy as np 10 | import base64 11 | import collections 12 | import contextlib 13 | import fnmatch 14 | import hashlib 15 | import shutil 16 | import tempfile 17 | import time 18 | import sys 19 | import tarfile 20 | import warnings 21 | import zipfile 22 | import glob 23 | import pandas as pd 24 | from tqdm import tqdm 25 | from sklearn.datasets.base import Bunch 26 | from .._utils.compat import _basestring, cPickle, _urllib, md5_hash 27 | 28 | 29 | TEMP = tempfile.gettempdir() 30 | 31 | 32 | def _makedirs(path): # https://stackoverflow.com/a/600612/223267 33 | try: 34 | os.makedirs(path) 35 | except OSError as e: 36 | if e.errno == errno.EEXIST and os.path.isdir(path): 37 | pass 38 | else: 39 | raise 40 | 41 | 42 | 43 | def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False): 44 | data_dir = _get_dataset_dir("categorization", verbose=0) 45 | _fetch_file(url=url, 46 | data_dir=data_dir, 47 | uncompress=True, 48 | move="{0}/{0}.txt".format(dataset_name), 49 | verbose=0) 50 | files = glob.glob(os.path.join(data_dir, dataset_name + "/*.txt")) 51 | X = [] 52 | y = [] 53 | names = [] 54 | for cluster_id, file_name in enumerate(files): 55 | with open(file_name) as f: 56 | lines = f.read().splitlines()[(int(skip_header)):] 57 | 58 | X += [l.split(sep) for l in lines] 59 | y += [os.path.basename(file_name).split(".")[0]] * len(lines) 60 | return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object")) 61 | 62 | def _get_as_pd(url, dataset_name, **read_csv_kwargs): 63 | return pd.read_csv(_fetch_file(url, dataset_name, verbose=0), **read_csv_kwargs) 64 | 65 | def _change_list_to_np(dict): 66 | return {k: np.array(dict[k], dtype="object") for k in dict} 67 | 68 | def _format_time(t): 69 | if t > 60: 70 | return "%4.1fmin" % (t / 60.) 71 | else: 72 | return " %5.1fs" % (t) 73 | 74 | 75 | def _md5_sum_file(path): 76 | """ Calculates the MD5 sum of a file. 77 | """ 78 | with open(path, 'rb') as f: 79 | m = hashlib.md5() 80 | while True: 81 | data = f.read(8192) 82 | if not data: 83 | break 84 | m.update(data) 85 | return m.hexdigest() 86 | 87 | 88 | def _read_md5_sum_file(path): 89 | """ Reads a MD5 checksum file and returns hashes as a dictionary. 
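A small sketch of how these two checksum helpers fit together: _md5_sum_file hashes a local file, while _read_md5_sum_file parses a listing of "hash name" lines into a dict keyed by filename. The file names and hash value below are placeholders:

    from web.datasets.utils import _md5_sum_file, _read_md5_sum_file

    expected = _read_md5_sum_file("checksums.md5")    # e.g. {'EN-RG-65.txt': '9d3f...'} (illustrative)
    actual = _md5_sum_file("EN-RG-65.txt")
    if expected.get("EN-RG-65.txt") != actual:
        raise ValueError("Checksum mismatch - the download may be corrupted")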
90 | """ 91 | with open(path, "r") as f: 92 | hashes = {} 93 | while True: 94 | line = f.readline() 95 | if not line: 96 | break 97 | h, name = line.rstrip().split(' ', 1) 98 | hashes[name] = h 99 | return hashes 100 | 101 | 102 | def readlinkabs(link): 103 | """ 104 | Return an absolute path for the destination 105 | of a symlink 106 | """ 107 | path = os.readlink(link) 108 | if os.path.isabs(path): 109 | return path 110 | return os.path.join(os.path.dirname(link), path) 111 | 112 | 113 | 114 | def _chunk_report_(bytes_so_far, total_size, initial_size, t0): 115 | """Show downloading percentage. 116 | 117 | Parameters 118 | ---------- 119 | bytes_so_far: int 120 | Number of downloaded bytes 121 | 122 | total_size: int 123 | Total size of the file (may be 0/None, depending on download method). 124 | 125 | t0: int 126 | The time in seconds (as returned by time.time()) at which the 127 | download was resumed / started. 128 | 129 | initial_size: int 130 | If resuming, indicate the initial size of the file. 131 | If not resuming, set to zero. 132 | """ 133 | 134 | if not total_size: 135 | sys.stderr.write("Downloaded %d of ? bytes\r" % (bytes_so_far)) 136 | 137 | else: 138 | # Estimate remaining download time 139 | total_percent = float(bytes_so_far) / total_size 140 | 141 | current_download_size = bytes_so_far - initial_size 142 | bytes_remaining = total_size - bytes_so_far 143 | dt = time.time() - t0 144 | download_rate = current_download_size / max(1e-8, float(dt)) 145 | # Minimum rate of 0.01 bytes/s, to avoid dividing by zero. 146 | time_remaining = bytes_remaining / max(0.01, download_rate) 147 | 148 | # Trailing whitespace is to erase extra char when message length 149 | # varies 150 | sys.stderr.write( 151 | "Downloaded %d of %d bytes (%0.2f%%, %s remaining) \r" 152 | % (bytes_so_far, total_size, total_percent * 100, 153 | _format_time(time_remaining))) 154 | 155 | 156 | def _chunk_read_(response, local_file, chunk_size=8192, report_hook=None, 157 | initial_size=0, total_size=None, verbose=1): 158 | """Download a file chunk by chunk and show advancement 159 | 160 | Parameters 161 | ---------- 162 | response: _urllib.response.addinfourl 163 | Response to the download request in order to get file size 164 | 165 | local_file: file 166 | Hard disk file where data should be written 167 | 168 | chunk_size: int, optional 169 | Size of downloaded chunks. Default: 8192 170 | 171 | report_hook: bool 172 | Whether or not to show downloading advancement. Default: None 173 | 174 | initial_size: int, optional 175 | If resuming, indicate the initial size of the file 176 | 177 | total_size: int, optional 178 | Expected final size of download (None means it is unknown). 179 | 180 | verbose: int, optional 181 | verbosity level (0 means no message). 182 | 183 | Returns 184 | ------- 185 | data: string 186 | The downloaded file. 
187 | 188 | """ 189 | 190 | 191 | try: 192 | if total_size is None: 193 | total_size = response.info().get('Content-Length').strip() 194 | total_size = int(total_size) + initial_size 195 | except Exception as e: 196 | if verbose > 1: 197 | print("Warning: total size could not be determined.") 198 | if verbose > 2: 199 | print("Full stack trace: %s" % e) 200 | total_size = None 201 | bytes_so_far = initial_size 202 | 203 | # t0 = time.time() 204 | if report_hook: 205 | pbar = tqdm(total=total_size, unit="b", unit_scale=True) 206 | 207 | while True: 208 | chunk = response.read(chunk_size) 209 | bytes_so_far += len(chunk) 210 | 211 | if not chunk: 212 | if report_hook: 213 | # sys.stderr.write('\n') 214 | pbar.close() 215 | break 216 | 217 | local_file.write(chunk) 218 | if report_hook: 219 | pbar.update(len(chunk)) # This is better because works in ipython 220 | # _chunk_report_(bytes_so_far, total_size, initial_size, t0) 221 | 222 | if report_hook: 223 | pbar.close() 224 | 225 | return 226 | 227 | 228 | def _get_dataset_dir(sub_dir=None, data_dir=None, default_paths=None, 229 | verbose=1): 230 | """ Create if necessary and returns data directory of given dataset. 231 | 232 | Parameters 233 | ---------- 234 | sub_dir: string 235 | Name of sub-dir 236 | 237 | data_dir: string, optional 238 | Path of the data directory. Used to force data storage in a specified 239 | location. Default: None 240 | 241 | default_paths: list of string, optional 242 | Default system paths in which the dataset may already have been 243 | installed by a third party software. They will be checked first. 244 | 245 | verbose: int, optional 246 | verbosity level (0 means no message). 247 | 248 | Returns 249 | ------- 250 | data_dir: string 251 | Path of the given dataset directory. 252 | 253 | Notes 254 | ----- 255 | This function retrieves the datasets directory (or data directory) using 256 | the following priority : 257 | 1. defaults system paths 258 | 2. the keyword argument data_dir 259 | 3. the global environment variable WEB_SHARED_DATA 260 | 4. the user environment variable WEB_DATA 261 | 5. web_data in the user home folder 262 | """ 263 | # We build an array of successive paths by priority 264 | # The boolean indicates if it is a pre_dir: in that case, we won't add the 265 | # dataset name to the path. 
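In practice this priority list means every download location can be redirected with a single environment variable before the fetchers are used. A minimal sketch; the directory name is arbitrary:

    import os
    os.environ["WEB_DATA"] = os.path.expanduser("~/my_benchmark_data")   # hypothetical location

    from web.datasets.utils import _get_dataset_dir
    print(_get_dataset_dir("similarity", verbose=2))
    # -> ~/my_benchmark_data/similarity (created on first use), unless a 'similarity'
    #    folder already exists in another search location such as ~/web_data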
266 | paths = [] 267 | 268 | 269 | # Search given environment variables 270 | if default_paths is not None: 271 | for default_path in default_paths: 272 | paths.extend([(d, True) for d in default_path.split(':')]) 273 | 274 | # Check data_dir which force storage in a specific location 275 | if data_dir is not None: 276 | paths.extend([(d, False) for d in data_dir.split(':')]) 277 | else: 278 | global_data = os.getenv('WEB_SHARED_DATA') 279 | if global_data is not None: 280 | paths.extend([(d, False) for d in global_data.split(':')]) 281 | 282 | local_data = os.getenv('WEB_DATA') 283 | if local_data is not None: 284 | paths.extend([(d, False) for d in local_data.split(':')]) 285 | 286 | paths.append((os.path.expanduser('~/web_data'), False)) 287 | 288 | if verbose > 2: 289 | print('Dataset search paths: %s' % paths) 290 | 291 | # Check if the dataset exists somewhere 292 | for path, is_pre_dir in paths: 293 | if not is_pre_dir and sub_dir: 294 | path = os.path.join(path, sub_dir) 295 | if os.path.islink(path): 296 | # Resolve path 297 | path = readlinkabs(path) 298 | if os.path.exists(path) and os.path.isdir(path): 299 | if verbose > 1: 300 | print('\nDataset found in %s\n' % path) 301 | return path 302 | 303 | # If not, create a folder in the first writeable directory 304 | errors = [] 305 | for (path, is_pre_dir) in paths: 306 | if not is_pre_dir and sub_dir: 307 | path = os.path.join(path, sub_dir) 308 | if not os.path.exists(path): 309 | try: 310 | _makedirs(path) 311 | if verbose > 0: 312 | print('\nDataset created in %s\n' % path) 313 | return path 314 | except Exception as exc: 315 | short_error_message = getattr(exc, 'strerror', str(exc)) 316 | errors.append('\n -{0} ({1})'.format( 317 | path, short_error_message)) 318 | 319 | raise OSError('Web tried to store the dataset in the following ' 320 | 'directories, but:' + ''.join(errors)) 321 | 322 | 323 | def _uncompress_file(file_, delete_archive=True, verbose=1): 324 | """Uncompress files contained in a data_set. 325 | 326 | Parameters 327 | ---------- 328 | file: string 329 | path of file to be uncompressed. 330 | 331 | delete_archive: bool, optional 332 | Wheteher or not to delete archive once it is uncompressed. 333 | Default: True 334 | 335 | verbose: int, optional 336 | verbosity level (0 means no message). 337 | 338 | Notes 339 | ----- 340 | This handles zip, tar, gzip and bzip files only. 341 | """ 342 | if verbose > 0: 343 | print('Extracting data from %s...' 
% file_) 344 | data_dir = os.path.dirname(file_) 345 | # We first try to see if it is a zip file 346 | try: 347 | filename, ext = os.path.splitext(file_) 348 | with open(file_, "rb") as fd: 349 | header = fd.read(4) 350 | processed = False 351 | if zipfile.is_zipfile(file_): 352 | z = zipfile.ZipFile(file_) 353 | z.extractall(data_dir) 354 | z.close() 355 | processed = True 356 | elif ext == '.gz' or header.startswith(b'\x1f\x8b'): 357 | import gzip 358 | gz = gzip.open(file_) 359 | if ext == '.tgz': 360 | filename = filename + '.tar' 361 | out = open(filename, 'wb') 362 | shutil.copyfileobj(gz, out, 8192) 363 | gz.close() 364 | out.close() 365 | # If file is .tar.gz, this will be handle in the next case 366 | if delete_archive: 367 | os.remove(file_) 368 | file_ = filename 369 | filename, ext = os.path.splitext(file_) 370 | processed = True 371 | if tarfile.is_tarfile(file_): 372 | with contextlib.closing(tarfile.open(file_, "r")) as tar: 373 | tar.extractall(path=data_dir) 374 | processed = True 375 | if not processed: 376 | raise IOError( 377 | "[Uncompress] unknown archive file format: %s" % file_) 378 | if delete_archive: 379 | os.remove(file_) 380 | if verbose > 0: 381 | print(' ...done.') 382 | except Exception as e: 383 | if verbose > 0: 384 | print('Error uncompressing file: %s' % e) 385 | raise 386 | 387 | 388 | def _filter_column(array, col, criteria): 389 | """ Return index array matching criteria 390 | 391 | Parameters 392 | ---------- 393 | 394 | array: numpy array with columns 395 | Array in which data will be filtered 396 | 397 | col: string 398 | Name of the column 399 | 400 | criteria: integer (or float), pair of integers, string or list of these 401 | if integer, select elements in column matching integer 402 | if a tuple, select elements between the limits given by the tuple 403 | if a string, select elements that match the string 404 | """ 405 | # Raise an error if the column does not exist. This is the only way to 406 | # test it across all possible types (pandas, recarray...) 407 | try: 408 | array[col] 409 | except: 410 | raise KeyError('Filtering criterion %s does not exist' % col) 411 | 412 | if (not isinstance(criteria, _basestring) and 413 | not isinstance(criteria, bytes) and 414 | not isinstance(criteria, tuple) and 415 | isinstance(criteria, collections.Iterable)): 416 | 417 | filter = np.zeros(array.shape[0], dtype=np.bool) 418 | for criterion in criteria: 419 | filter = np.logical_or(filter, 420 | _filter_column(array, col, criterion)) 421 | return filter 422 | 423 | if isinstance(criteria, tuple): 424 | if len(criteria) != 2: 425 | raise ValueError("An interval must have 2 values") 426 | if criteria[0] is None: 427 | return array[col] <= criteria[1] 428 | if criteria[1] is None: 429 | return array[col] >= criteria[0] 430 | filter = array[col] <= criteria[1] 431 | return np.logical_and(filter, array[col] >= criteria[0]) 432 | 433 | return array[col] == criteria 434 | 435 | 436 | def _filter_columns(array, filters, combination='and'): 437 | """ Return indices of recarray entries that match criteria. 438 | 439 | Parameters 440 | ---------- 441 | 442 | array: numpy array with columns 443 | Array in which data will be filtered 444 | 445 | filters: list of criteria 446 | See _filter_column 447 | 448 | combination: string, optional 449 | String describing the combination operator. Possible values are "and" 450 | and "or". 
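A toy sketch of what these filter helpers accept: a criterion may be a scalar (exact match), a 2-tuple (inclusive interval, with None leaving one end open) or a list of such criteria combined with OR, and multiple columns are joined with the `combination` operator. The record array here is made up for illustration:

    import numpy as np
    from web.datasets.utils import _filter_columns

    rows = np.array([("dog", 3, 9.1), ("cat", 7, 4.2), ("emu", 1, 6.5)],
                    dtype=[("word", "U8"), ("freq", "i4"), ("score", "f4")])

    mask = _filter_columns(rows, {"freq": (2, None), "word": ["dog", "cat"]}, combination="and")
    print(rows[mask])   # rows with freq >= 2 AND word in {dog, cat}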
451 | """ 452 | if combination == 'and': 453 | fcomb = np.logical_and 454 | mask = np.ones(array.shape[0], dtype=np.bool) 455 | elif combination == 'or': 456 | fcomb = np.logical_or 457 | mask = np.zeros(array.shape[0], dtype=np.bool) 458 | else: 459 | raise ValueError('Combination mode not known: %s' % combination) 460 | 461 | for column in filters: 462 | mask = fcomb(mask, _filter_column(array, column, filters[column])) 463 | return mask 464 | 465 | 466 | 467 | 468 | 469 | def _get_dataset_descr(ds_name): 470 | module_path = os.path.dirname(os.path.abspath(__file__)) 471 | 472 | fname = ds_name 473 | 474 | try: 475 | with open(os.path.join(module_path, 'description', fname + '.rst'))\ 476 | as rst_file: 477 | descr = rst_file.read() 478 | except IOError: 479 | descr = '' 480 | 481 | if descr == '': 482 | print("Warning: Could not find dataset description.") 483 | 484 | return descr 485 | 486 | 487 | def movetree(src, dst): 488 | """Move an entire tree to another directory. Any existing file is 489 | overwritten""" 490 | names = os.listdir(src) 491 | 492 | # Create destination dir if it does not exist 493 | _makedirs(dst) 494 | errors = [] 495 | 496 | for name in names: 497 | srcname = os.path.join(src, name) 498 | dstname = os.path.join(dst, name) 499 | try: 500 | if os.path.isdir(srcname) and os.path.isdir(dstname): 501 | movetree(srcname, dstname) 502 | os.rmdir(srcname) 503 | else: 504 | shutil.move(srcname, dstname) 505 | except (IOError, os.error) as why: 506 | errors.append((srcname, dstname, str(why))) 507 | # catch the Error from the recursive movetree so that we can 508 | # continue with other files 509 | except Exception as err: 510 | errors.extend(err.args[0]) 511 | if errors: 512 | raise Exception(errors) 513 | 514 | 515 | # TODO: refactor, this function is a mess, it was adapted from other project 516 | # and it might have not been an optimal choice 517 | def _fetch_file(url, data_dir=TEMP, uncompress=False, move=False,md5sum=None, 518 | username=None, password=None, mock=False, handlers=[], resume=True, verbose=0): 519 | """Load requested dataset, downloading it if needed or requested. 520 | 521 | This function retrieves files from the hard drive or download them from 522 | the given urls. Note to developpers: All the files will be first 523 | downloaded in a sandbox and, if everything goes well, they will be moved 524 | into the folder of the dataset. This prevents corrupting previously 525 | downloaded data. In case of a big dataset, do not hesitate to make several 526 | calls if needed. 527 | 528 | Parameters 529 | ---------- 530 | dataset_name: string 531 | Unique dataset name 532 | 533 | resume: bool, optional 534 | If true, try to resume partially downloaded files 535 | 536 | uncompress: bool, optional 537 | If true, will uncompress zip 538 | 539 | move: str, optional 540 | If True, will move downloaded file to given relative path. 541 | NOTE: common usage is zip_file_id/zip_file.zip together 542 | with uncompress set to True 543 | 544 | md5sum: string, optional 545 | MD5 sum of the file. Checked if download of the file is required 546 | 547 | username: string, optional 548 | Username used for basic HTTP authentication 549 | 550 | password: string, optional 551 | Password used for basic HTTP authentication 552 | 553 | handlers: list of BaseHandler, optional 554 | urllib handlers passed to urllib.request.build_opener. Used by 555 | advanced users to customize request handling. 556 | 557 | data_dir: string, optional 558 | Path of the data directory. 
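_get_cluster_assignments and the embedding fetchers later in this dump all reduce to a call of the shape below: download into the chosen data directory, optionally move the archive under a stable sub-folder, and unzip next to it so that re-runs detect the data as already present. A sketch with a placeholder URL and archive name:

    from web.datasets.utils import _fetch_file, _get_dataset_dir

    path = _fetch_file(url="https://example.com/my-dataset.zip",        # placeholder
                       data_dir=_get_dataset_dir("categorization", verbose=0),
                       uncompress=True,
                       move="my-dataset/my-dataset.zip",                # keep the archive in its own folder
                       verbose=1)
    print(path)   # the directory the archive was extracted into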
Used to force data storage in a specified 559 | location. Default: None 560 | 561 | resume: bool, optional 562 | If true, try resuming download if possible 563 | 564 | verbose: int, optional 565 | verbosity level (0 means no message). 566 | 567 | Returns 568 | ------- 569 | files: list of string 570 | Absolute paths of downloaded files on disk 571 | """ 572 | 573 | # TODO: move to global scope and rename 574 | def _fetch_helper(url, data_dir=TEMP, resume=True, overwrite=False, 575 | md5sum=None, username=None, password=None, handlers=[], 576 | verbose=1): 577 | if not os.path.isabs(data_dir): 578 | data_dir = _get_dataset_dir(data_dir) 579 | 580 | # Determine data path 581 | _makedirs(data_dir) 582 | 583 | # Determine filename using URL 584 | parse = _urllib.parse.urlparse(url) 585 | file_name = os.path.basename(parse.path) 586 | if file_name == '': 587 | file_name = md5_hash(parse.path) 588 | 589 | temp_file_name = file_name + ".part" 590 | full_name = os.path.join(data_dir, file_name) 591 | temp_full_name = os.path.join(data_dir, temp_file_name) 592 | if os.path.exists(full_name): 593 | if overwrite: 594 | os.remove(full_name) 595 | else: 596 | return full_name 597 | if os.path.exists(temp_full_name): 598 | if overwrite: 599 | os.remove(temp_full_name) 600 | t0 = time.time() 601 | local_file = None 602 | initial_size = 0 603 | 604 | try: 605 | # Download data 606 | url_opener = _urllib.request.build_opener(*handlers) 607 | request = _urllib.request.Request(url) 608 | request.add_header('Connection', 'Keep-Alive') 609 | if username is not None and password is not None: 610 | if not url.startswith('https'): 611 | raise ValueError( 612 | 'Authentication was requested on a non secured URL (%s).' 613 | 'Request has been blocked for security reasons.' % url) 614 | # Note: HTTPBasicAuthHandler is not fitted here because it relies 615 | # on the fact that the server will return a 401 error with proper 616 | # www-authentication header, which is not the case of most 617 | # servers. 618 | encoded_auth = base64.b64encode( 619 | (username + ':' + password).encode()) 620 | request.add_header(b'Authorization', b'Basic ' + encoded_auth) 621 | if verbose > 0: 622 | displayed_url = url.split('?')[0] if verbose == 1 else url 623 | print('Downloading data from %s ...' % displayed_url) 624 | if resume and os.path.exists(temp_full_name): 625 | # Download has been interrupted, we try to resume it. 626 | local_file_size = os.path.getsize(temp_full_name) 627 | # If the file exists, then only download the remainder 628 | request.add_header("Range", "bytes=%s-" % (local_file_size)) 629 | try: 630 | data = url_opener.open(request) 631 | content_range = data.info().get('Content-Range') 632 | if (content_range is None or not content_range.startswith( 633 | 'bytes %s-' % local_file_size)): 634 | raise IOError('Server does not support resuming') 635 | except Exception: 636 | # A wide number of errors can be raised here. HTTPError, 637 | # URLError... I prefer to catch them all and rerun without 638 | # resuming. 
639 | if verbose > 0: 640 | print('Resuming failed, try to download the whole file.') 641 | return _fetch_helper( 642 | url, data_dir, resume=False, overwrite=overwrite, 643 | md5sum=md5sum, username=username, password=password, 644 | handlers=handlers, verbose=verbose) 645 | local_file = open(temp_full_name, "ab") 646 | initial_size = local_file_size 647 | else: 648 | data = url_opener.open(request) 649 | local_file = open(temp_full_name, "wb") 650 | _chunk_read_(data, local_file, report_hook=(verbose > 0), 651 | initial_size=initial_size, verbose=verbose) 652 | # temp file must be closed prior to the move 653 | if not local_file.closed: 654 | local_file.close() 655 | shutil.move(temp_full_name, full_name) 656 | dt = time.time() - t0 657 | if verbose > 0: 658 | print('...done. (%i seconds, %i min)' % (dt, dt // 60)) 659 | except _urllib.error.HTTPError as e: 660 | if verbose > 0: 661 | print('Error while fetching file %s. Dataset fetching aborted.' % 662 | (file_name)) 663 | if verbose > 1: 664 | print("HTTP Error: %s, %s" % (e, url)) 665 | raise 666 | except _urllib.error.URLError as e: 667 | if verbose > 0: 668 | print('Error while fetching file %s. Dataset fetching aborted.' % 669 | (file_name)) 670 | if verbose > 1: 671 | print("URL Error: %s, %s" % (e, url)) 672 | raise 673 | finally: 674 | if local_file is not None: 675 | if not local_file.closed: 676 | local_file.close() 677 | if md5sum is not None: 678 | if (_md5_sum_file(full_name) != md5sum): 679 | raise ValueError("File %s checksum verification has failed." 680 | " Dataset fetching aborted." % local_file) 681 | return full_name 682 | 683 | if not os.path.isabs(data_dir): 684 | data_dir = _get_dataset_dir(data_dir) 685 | 686 | 687 | # There are two working directories here: 688 | # - data_dir is the destination directory of the dataset 689 | # - temp_dir is a temporary directory dedicated to this fetching call. All 690 | # files that must be downloaded will be in this directory. If a corrupted 691 | # file is found, or a file is missing, this working directory will be 692 | # deleted. 693 | parse = _urllib.parse.urlparse(url) 694 | file_name = os.path.basename(parse.path) 695 | 696 | files_pickle = cPickle.dumps([(file_, url) for file_, url in zip([file_name], [url])]) 697 | files_md5 = hashlib.md5(files_pickle).hexdigest() 698 | temp_dir = os.path.join(data_dir, files_md5) 699 | 700 | # Create destination dir 701 | _makedirs(data_dir) 702 | 703 | # Abortion flag, in case of error 704 | abort = None 705 | 706 | # 2 possibilities: 707 | # - the file exists in data_dir, nothing to do (we have to account for move parameter here) 708 | # - the file does not exists: we download it in temp_dir 709 | 710 | # Target file in the data_dir 711 | target_file = os.path.join(data_dir, file_name) 712 | 713 | # Change move so we always uncompress to some folder (this is important for 714 | # detecting already downloaded files) 715 | # Ex. glove.4B.zip -> glove.4B/glove.4B.zip 716 | if uncompress and not move: 717 | dirname, _ = os.path.splitext(file_name) 718 | move = os.path.join(dirname, os.path.basename(file_name)) 719 | 720 | if (abort is None 721 | and not os.path.exists(target_file) 722 | and (not move or (move and uncompress and not os.path.exists(os.path.dirname(os.path.join(data_dir, move))))) 723 | or (move and not uncompress and not os.path.exists(os.path.join(data_dir, move)))): 724 | 725 | # Target file in temp dir 726 | temp_target_file = os.path.join(temp_dir, file_name) 727 | # We may be in a global read-only repository. 
If so, we cannot 728 | # download files. 729 | if not os.access(data_dir, os.W_OK): 730 | raise ValueError('Dataset files are missing but dataset' 731 | ' repository is read-only. Contact your data' 732 | ' administrator to solve the problem') 733 | 734 | if not os.path.exists(temp_dir): 735 | os.mkdir(temp_dir) 736 | 737 | dl_file = _fetch_helper(url, temp_dir, resume=resume, 738 | verbose=verbose, md5sum=md5sum, 739 | username=username, 740 | password=password, 741 | handlers=handlers) 742 | 743 | if (abort is None and not os.path.exists(target_file) and not 744 | os.path.exists(temp_target_file)): 745 | if not mock: 746 | warnings.warn('An error occured while fetching %s' % file_) 747 | abort = ("Dataset has been downloaded but requested file was " 748 | "not provided:\nURL:%s\nFile:%s" % 749 | (url, target_file)) 750 | else: 751 | _makedirs(os.path.dirname(temp_target_file)) 752 | open(temp_target_file, 'w').close() 753 | 754 | if move: 755 | move = os.path.join(data_dir, move) 756 | move_dir = os.path.dirname(move) 757 | _makedirs(move_dir) 758 | shutil.move(dl_file, move) 759 | dl_file = move 760 | target_file = dl_file 761 | 762 | if uncompress: 763 | try: 764 | if os.path.getsize(dl_file) != 0: 765 | _uncompress_file(dl_file, verbose=verbose) 766 | else: 767 | os.remove(dl_file) 768 | target_file = os.path.dirname(target_file) 769 | except Exception as e: 770 | abort = str(e) 771 | else: 772 | if verbose > 0: 773 | print("File already downloaded, skipping") 774 | 775 | if move: 776 | target_file = os.path.join(data_dir, move) 777 | 778 | if uncompress: 779 | target_file = os.path.dirname(target_file) 780 | 781 | if abort is not None: 782 | if os.path.exists(temp_dir): 783 | shutil.rmtree(temp_dir) 784 | raise IOError('Fetching aborted: ' + abort) 785 | # If needed, move files from temps directory to final directory. 786 | if os.path.exists(temp_dir): 787 | # XXX We could only moved the files requested 788 | # XXX Movetree can go wrong 789 | movetree(temp_dir, data_dir) 790 | shutil.rmtree(temp_dir) 791 | return target_file 792 | 793 | def _tree(path, pattern=None, dictionary=False): 794 | """ Return a directory tree under the form of a dictionaries and list 795 | 796 | Parameters: 797 | ----------- 798 | path: string 799 | Path browsed 800 | 801 | pattern: string, optional 802 | Pattern used to filter files (see fnmatch) 803 | 804 | dictionary: boolean, optional 805 | If True, the function will return a dict instead of a list 806 | """ 807 | files = [] 808 | dirs = [] if not dictionary else {} 809 | for file_ in os.listdir(path): 810 | file_path = os.path.join(path, file_) 811 | if os.path.isdir(file_path): 812 | if not dictionary: 813 | dirs.append((file_, _tree(file_path, pattern))) 814 | else: 815 | dirs[file_] = _tree(file_path, pattern) 816 | else: 817 | if pattern is None or fnmatch.fnmatch(file_, pattern): 818 | files.append(file_path) 819 | files = sorted(files) 820 | if not dictionary: 821 | return sorted(dirs) + files 822 | if len(dirs) == 0: 823 | return files 824 | if len(files) > 0: 825 | dirs['.'] = files 826 | return dirs 827 | -------------------------------------------------------------------------------- /web/embedding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for embedding. 
3 | 4 | NOTE: This file was adapted from the polyglot package 5 | """ 6 | 7 | import logging 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | import sys 12 | 13 | from six import text_type 14 | from six import PY2 15 | from six import iteritems 16 | from six import string_types 17 | from .utils import _open 18 | from .vocabulary import Vocabulary, CountedVocabulary, OrderedVocabulary 19 | from six.moves import cPickle as pickle 20 | from six.moves import range 21 | from functools import partial 22 | from .utils import standardize_string, to_utf8 23 | 24 | from sklearn.metrics import pairwise_distances 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | class Embedding(object): 30 | """ Mapping a vocabulary to a d-dimensional points.""" 31 | 32 | def __init__(self, vocabulary, vectors): 33 | self.vocabulary = vocabulary 34 | self.vectors = np.asarray(vectors) 35 | if len(self.vocabulary) != self.vectors.shape[0]: 36 | raise ValueError("Vocabulary has {} items but we have {} " 37 | "vectors." 38 | .format(len(vocabulary), self.vectors.shape[0])) 39 | 40 | if len(self.vocabulary.words) != len(set(self.vocabulary.words)): 41 | logger.warning("Vocabulary has duplicates.") 42 | 43 | def __getitem__(self, k): 44 | return self.vectors[self.vocabulary[k]] 45 | 46 | def __setitem__(self, k, v): 47 | if not v.shape[0] == self.vectors.shape[1]: 48 | raise RuntimeError("Please pass vector of len {}".format(self.vectors.shape[1])) 49 | 50 | if k not in self.vocabulary: 51 | self.vocabulary.add(k) 52 | self.vectors = np.vstack([self.vectors, v.reshape(1, -1)]) 53 | else: 54 | self.vectors[self.vocabulary[k]] = v 55 | 56 | def __contains__(self, k): 57 | return k in self.vocabulary 58 | 59 | def __delitem__(self, k): 60 | """Remove the word and its vector from the embedding. 61 | 62 | Note: 63 | This operation costs \\theta(n). Be careful putting it in a loop. 64 | """ 65 | index = self.vocabulary[k] 66 | del self.vocabulary[k] 67 | self.vectors = np.delete(self.vectors, index, 0) 68 | 69 | def __len__(self): 70 | return len(self.vocabulary) 71 | 72 | def __iter__(self): 73 | for w in self.vocabulary: 74 | yield w, self[w] 75 | 76 | @property 77 | def words(self): 78 | return self.vocabulary.words 79 | 80 | @property 81 | def shape(self): 82 | return self.vectors.shape 83 | 84 | def get(self, k, default=None): 85 | try: 86 | return self[k] 87 | except KeyError as e: 88 | return default 89 | 90 | def standardize_words(self, lower=False, clean_words=False, inplace=False): 91 | tw = self.transform_words(partial(standardize_string, lower=lower, clean_words=clean_words), inplace=inplace, 92 | lower=lower) 93 | 94 | if clean_words: 95 | tw = tw.transform_words(partial(lambda w: w.strip(" ")), inplace=inplace, lower=lower) 96 | return tw 97 | 98 | def transform_words(self, f, inplace=False, lower=False): 99 | """ 100 | Transform words in vocabulary according to following strategy. 101 | Prefer shortest and most often occurring words- after transforming by some (lambda f) function. 102 | 103 | This allow eliminate noisy and wrong coded words. 104 | 105 | Strategy is implemented for all types of Vocabulary- they can be polymorphicaly extended. 106 | 107 | Parameters 108 | ---------- 109 | f: lambda 110 | Function called on each word- for transformation it. 
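A compact sketch of the Embedding container itself and of transform_words, built from a throwaway three-word dictionary; the lambda simply lowercases, which is the typical use behind standardize_words:

    import numpy as np
    from web.embedding import Embedding

    d = {"Dog": np.array([1.0, 0.0]),
         "dog": np.array([0.0, 1.0]),
         "cat": np.array([1.0, 1.0])}
    w = Embedding.from_dict(d)

    print(len(w), w.shape)          # 3 words, 2-dimensional vectors
    print("cat" in w, w["cat"])     # dict-style membership test and lookup

    w2 = w.transform_words(lambda word: word.lower(), inplace=False, lower=True)
    print(w2.words)                 # case duplicates collapsed: only 'dog' and 'cat' remain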
111 | 112 | inplace: bool, default: False 113 | Return new Embedding instance or modify existing 114 | 115 | lower: bool, default: False 116 | If true, will convert all words to lowercase 117 | 118 | Returns 119 | ------- 120 | e: Embedding 121 | Instance of Embedding class with this same Vocabulary type as previous. 122 | """ 123 | id_map = OrderedDict() 124 | word_count = len(self.vectors) 125 | # store max word length before f(w)- in corpora 126 | words_len = {} 127 | # store max occurrence count of word 128 | counts = {} 129 | is_vocab_generic = False 130 | 131 | curr_words = self.vocabulary.words 132 | curr_vec = self.vectors 133 | 134 | if isinstance(self.vocabulary, CountedVocabulary): 135 | _, counter_of_words = self.vocabulary.getstate() 136 | elif isinstance(self.vocabulary, OrderedVocabulary): 137 | # range in python3 is lazy 138 | counter_of_words = range(len(self.vocabulary.words) - 1, -1, -1) 139 | 140 | elif isinstance(self.vocabulary, Vocabulary): 141 | is_vocab_generic = True 142 | # if corpora contain lowercase version of word i- for case Vocabulary 143 | lowered_words = {} 144 | 145 | if lower: 146 | 147 | for w, v in zip(self.vocabulary.words, self.vectors): 148 | wl = w.lower() 149 | if wl == w: 150 | lowered_words[wl] = v 151 | elif wl != w and wl not in lowered_words: 152 | lowered_words[wl] = v 153 | 154 | curr_words = list(lowered_words.keys()) 155 | curr_vec = np.asanyarray(list(lowered_words.values())) 156 | 157 | else: 158 | raise NotImplementedError( 159 | 'This kind of Vocabulary is not implemented in transform_words strategy and can not be matched') 160 | 161 | for id, w in enumerate(curr_words): 162 | 163 | fw = f(w) 164 | if len(fw) and fw not in id_map: 165 | id_map[fw] = id 166 | 167 | if not is_vocab_generic: 168 | counts[fw] = counter_of_words[id] 169 | words_len[fw] = len(w) 170 | 171 | # overwrite 172 | elif len(fw) and fw in id_map: 173 | if not is_vocab_generic and counter_of_words[id] > counts[fw]: 174 | id_map[fw] = id 175 | 176 | counts[fw] = counter_of_words[id] 177 | words_len[fw] = len(w) 178 | elif is_vocab_generic and len(w) < words_len[fw]: 179 | # for generic Vocabulary 180 | id_map[fw] = id 181 | 182 | words_len[fw] = len(w) 183 | elif not is_vocab_generic and counter_of_words[id] == counts[fw] and len(w) < words_len[fw]: 184 | id_map[fw] = id 185 | 186 | counts[fw] = counter_of_words[id] 187 | words_len[fw] = len(w) 188 | 189 | logger.warning("Overwriting {}".format(fw)) 190 | 191 | if isinstance(self.vocabulary, CountedVocabulary): 192 | words_only = id_map.keys() 193 | vectors = curr_vec[[id_map[w] for w in words_only]] 194 | words = {w: counter_of_words[id_map[w]] for w in words_only} 195 | 196 | elif isinstance(self.vocabulary, OrderedVocabulary): 197 | words = sorted(id_map.keys(), key=lambda x: id_map[x]) 198 | vectors = curr_vec[[id_map[w] for w in words]] 199 | 200 | elif isinstance(self.vocabulary, Vocabulary): 201 | words = sorted(id_map.keys(), key=lambda x: id_map[x]) 202 | vectors = curr_vec[[id_map[w] for w in words]] 203 | 204 | logger.info("Transformed {} into {} words".format(word_count, len(words))) 205 | 206 | if inplace: 207 | self.vectors = vectors 208 | self.vocabulary = self.vocabulary.__class__(words) 209 | 210 | return self 211 | else: 212 | return Embedding(vectors=vectors, vocabulary=self.vocabulary.__class__(words)) 213 | 214 | def most_frequent(self, k, inplace=False): 215 | """Only most frequent k words to be included in the embeddings.""" 216 | 217 | assert isinstance(self.vocabulary, OrderedVocabulary), \ 
218 | "most_frequent can be called only on Embedding with OrderedVocabulary" 219 | 220 | vocabulary = self.vocabulary.most_frequent(k) 221 | vectors = np.asarray([self[w] for w in vocabulary]) 222 | if inplace: 223 | self.vocabulary = vocabulary 224 | self.vectors = vectors 225 | return self 226 | return Embedding(vectors=vectors, vocabulary=vocabulary) 227 | 228 | def normalize_words(self, ord=2, inplace=False): 229 | """Normalize embeddings matrix row-wise. 230 | 231 | Parameters 232 | ---------- 233 | ord: normalization order. Possible values {1, 2, 'inf', '-inf'} 234 | """ 235 | if ord == 2: 236 | ord = None # numpy uses this flag to indicate l2. 237 | vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1) 238 | if inplace: 239 | self.vectors = vectors.T 240 | return self 241 | return Embedding(vectors=vectors.T, vocabulary=self.vocabulary) 242 | 243 | def nearest_neighbors(self, word, k=1, exclude=[], metric="cosine"): 244 | """ 245 | Find nearest neighbor of given word 246 | 247 | Parameters 248 | ---------- 249 | word: string or vector 250 | Query word or vector. 251 | 252 | k: int, default: 1 253 | Number of nearest neighbours to return. 254 | 255 | metric: string, default: 'cosine' 256 | Metric to use. 257 | 258 | exclude: list, default: [] 259 | Words to omit in answer 260 | 261 | Returns 262 | ------- 263 | n: list 264 | Nearest neighbors. 265 | """ 266 | if isinstance(word, string_types): 267 | assert word in self, "Word not found in the vocabulary" 268 | v = self[word] 269 | else: 270 | v = word 271 | 272 | D = pairwise_distances(self.vectors, v.reshape(1, -1), metric=metric) 273 | 274 | if isinstance(word, string_types): 275 | D[self.vocabulary.word_id[word]] = D.max() 276 | 277 | for w in exclude: 278 | D[self.vocabulary.word_id[w]] = D.max() 279 | 280 | return [self.vocabulary.id_word[id] for id in D.argsort(axis=0).flatten()[0:k]] 281 | 282 | @staticmethod 283 | def from_gensim(model): 284 | word_count = {} 285 | vectors = [] 286 | for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count): 287 | word = standardize_string(word) 288 | if word: 289 | vectors.append(model.syn0[vocab.index]) 290 | word_count[word] = vocab.count 291 | vocab = CountedVocabulary(word_count=word_count) 292 | vectors = np.asarray(vectors) 293 | return Embedding(vocabulary=vocab, vectors=vectors) 294 | 295 | @staticmethod 296 | def from_word2vec_vocab(fvocab): 297 | counts = {} 298 | with _open(fvocab) as fin: 299 | for line in fin: 300 | 301 | word, count = standardize_string(line).split() 302 | if word: 303 | counts[word] = int(count) 304 | return CountedVocabulary(word_count=counts) 305 | 306 | @staticmethod 307 | def _from_word2vec_binary(fname): 308 | with _open(fname, 'rb') as fin: 309 | words = [] 310 | header = fin.readline() 311 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 312 | logger.info("Loading #{} words with {} dim".format(vocab_size, layer1_size)) 313 | vectors = np.zeros((vocab_size, layer1_size), dtype=np.float32) 314 | binary_len = np.dtype("float32").itemsize * layer1_size 315 | for line_no in range(vocab_size): 316 | # mixed text and binary: read text first, then binary 317 | word = [] 318 | while True: 319 | ch = fin.read(1) 320 | if ch == b' ': 321 | break 322 | if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't) 323 | word.append(ch) 324 | 325 | words.append(b''.join(word).decode("latin-1")) 326 | vectors[line_no, :] = 
np.fromstring(fin.read(binary_len), dtype=np.float32) 327 | 328 | if len(words) < vocab_size: 329 | logger.warning("Omitted {} words".format(vocab_size - len(words))) 330 | elif len(words) > vocab_size: 331 | raise RuntimeError("Read too many words, incorrect file") 332 | 333 | return words, vectors 334 | 335 | @staticmethod 336 | def _from_word2vec_text(fname): 337 | with _open(fname, 'r') as fin: 338 | words = [] 339 | 340 | header = fin.readline() 341 | ignored = 0 342 | vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format 343 | vectors = np.zeros(shape=(vocab_size, layer1_size), dtype=np.float32) 344 | for line_no, line in enumerate(fin): 345 | try: 346 | parts = text_type(line, encoding="utf-8").split(' ') 347 | w = parts[0] 348 | parts = list(map(lambda x: x.strip(), parts[1:])) 349 | parts.insert(0, w) 350 | 351 | except TypeError as e: 352 | parts = line.split(' ') 353 | w = parts[0] 354 | parts = list(map(lambda x: x.strip(), parts[1:])) 355 | parts.insert(0, w) 356 | 357 | except Exception as e: 358 | logger.warning("We ignored line number {} because of errors in parsing" 359 | "\n{}".format(line_no, e)) 360 | continue 361 | 362 | # We differ from Gensim implementation. 363 | # Our assumption that a difference of one happens because of having a 364 | # space in the word. 365 | if len(parts) == layer1_size + 1: 366 | word, vectors[line_no - ignored] = parts[0], list(map(np.float32, parts[1:])) 367 | elif len(parts) == layer1_size + 2 and parts[-1]: 368 | # last element after splitting is not empty- some glove corpora have additional space 369 | word, vectors[line_no - ignored] = parts[:2], list(map(np.float32, parts[2:])) 370 | word = u" ".join(word) 371 | elif not parts[-1]: 372 | # omit last value - empty string 373 | word, vectors[line_no - ignored] = parts[0], list(map(np.float32, parts[1:-1])) 374 | else: 375 | ignored += 1 376 | logger.warning("We ignored line number {} because of unrecognized " 377 | "number of columns {}".format(line_no, parts[:-layer1_size])) 378 | continue 379 | 380 | words.append(word) 381 | 382 | if ignored: 383 | vectors = vectors[0:-ignored] 384 | 385 | if len(words) < vocab_size: 386 | logger.warning("Omitted {} words".format(vocab_size - len(words))) 387 | elif len(words) > vocab_size: 388 | raise RuntimeError("Read too many words, incorrect file") 389 | 390 | return words, vectors 391 | 392 | @staticmethod 393 | def from_glove(fname, vocab_size, dim): 394 | with _open(fname, 'r') as fin: 395 | 396 | words = [] 397 | words_uniq = set() 398 | 399 | ignored = 0 400 | vectors = np.zeros(shape=(vocab_size, dim), dtype=np.float32) 401 | for line_no, line in enumerate(fin): 402 | try: 403 | parts = text_type(line, encoding="utf-8").split(' ') 404 | parts[1:] = map(lambda x: np.float32(x.strip()), parts[1:]) 405 | except TypeError as e: 406 | 407 | parts = line.split(' ') 408 | parts[1:] = map(lambda x: np.float32(x.strip()), parts[1:]) 409 | 410 | except Exception as e: 411 | ignored += 1 412 | 413 | logger.warning("We ignored line number {} because of errors in parsing" 414 | "\n{}".format(line_no, e)) 415 | continue 416 | 417 | try: 418 | if parts[0] not in words_uniq: 419 | word, vectors[line_no - ignored] = parts[0], list(parts[len(parts) - dim:]) 420 | words.append(word) 421 | words_uniq.add(word) 422 | else: 423 | ignored += 1 424 | logger.warning( 425 | "We ignored line number {} - following word is duplicated in file:\n{}\n".format(line_no, 426 | parts[0])) 427 | 428 | except Exception as e: 429 | ignored += 1 
430 | logger.warning("We ignored line number {} because of errors in parsing" 431 | "\n{}".format(line_no, e)) 432 | 433 | return Embedding(vocabulary=OrderedVocabulary(words), vectors=vectors[0:len(words)]) 434 | 435 | @staticmethod 436 | def from_dict(d): 437 | for k in d: # Standardize 438 | d[k] = np.array(d[k]).flatten() 439 | return Embedding(vectors=list(d.values()), vocabulary=Vocabulary(d.keys())) 440 | 441 | @staticmethod 442 | def to_word2vec(w, fname, binary=False): 443 | """ 444 | Store the input-hidden weight matrix in the same format used by the original 445 | C word2vec-tool, for compatibility. 446 | 447 | Parameters 448 | ---------- 449 | w: Embedding instance 450 | 451 | fname: string 452 | Destination file 453 | """ 454 | logger.info("storing %sx%s projection weights into %s" % (w.vectors.shape[0], w.vectors.shape[1], fname)) 455 | with _open(fname, 'wb') as fout: 456 | fout.write(to_utf8("%s %s\n" % w.vectors.shape)) 457 | # store in sorted order: most frequent words at the top 458 | for word, vector in zip(w.vocabulary.words, w.vectors): 459 | if binary: 460 | fout.write(to_utf8(word) + b" " + vector.astype("float32").tostring()) 461 | else: 462 | fout.write(to_utf8("%s %s\n" % (word, ' '.join("%.15f" % val for val in vector)))) 463 | 464 | @staticmethod 465 | def from_word2vec(fname, fvocab=None, binary=False): 466 | """ 467 | Load the input-hidden weight matrix from the original C word2vec-tool format. 468 | 469 | `binary` is a boolean indicating whether the data is in binary word2vec format. 470 | Word counts are read from `fvocab` filename, if set (this is the file generated 471 | by `-save-vocab` flag of the original C tool). 472 | """ 473 | vocabulary = None 474 | if fvocab is not None: 475 | logger.info("loading word counts from %s" % (fvocab)) 476 | vocabulary = Embedding.from_word2vec_vocab(fvocab) 477 | 478 | logger.info("loading projection weights from %s" % (fname)) 479 | if binary: 480 | words, vectors = Embedding._from_word2vec_binary(fname) 481 | else: 482 | words, vectors = Embedding._from_word2vec_text(fname) 483 | 484 | if not vocabulary: 485 | vocabulary = OrderedVocabulary(words=words) 486 | 487 | if len(words) != len(set(words)): 488 | raise RuntimeError("Vocabulary has duplicates") 489 | 490 | e = Embedding(vocabulary=vocabulary, vectors=vectors) 491 | 492 | return e 493 | 494 | @staticmethod 495 | def load(fname): 496 | """Load an embedding dump generated by `save`""" 497 | 498 | content = _open(fname).read() 499 | if PY2: 500 | state = pickle.loads(content, encoding='latin1') 501 | else: 502 | state = pickle.loads(content, encoding='latin1') 503 | voc, vec = state 504 | if len(voc) == 2: 505 | words, counts = voc 506 | word_count = dict(zip(words, counts)) 507 | vocab = CountedVocabulary(word_count=word_count) 508 | else: 509 | vocab = OrderedVocabulary(voc) 510 | return Embedding(vocabulary=vocab, vectors=vec) 511 | 512 | def save(self, fname): 513 | """Save a pickled version of the embedding into `fname`.""" 514 | 515 | vec = self.vectors 516 | voc = self.vocabulary.getstate() 517 | state = (voc, vec) 518 | with open(fname, 'wb') as f: 519 | pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL) 520 | -------------------------------------------------------------------------------- /web/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Fetchers for publicly available pretrained embeddings 4 | """ 5 | from six.moves import cPickle as pickle 6 | from os import 
path 7 | from .datasets.utils import _get_dataset_dir, _fetch_file 8 | from .embedding import Embedding 9 | 10 | def load_embedding(fname, format="word2vec_bin", normalize=True, 11 | lower=False, clean_words=False, load_kwargs={}): 12 | """ 13 | Loads embeddings from file 14 | 15 | Parameters 16 | ---------- 17 | fname: string 18 | Path to file containing embedding 19 | 20 | format: string 21 | Format of the embedding. Possible values are: 22 | 'word2vec_bin', 'word2vec', 'glove', 'dict' 23 | 24 | normalize: bool, default: True 25 | If true will normalize all vector to unit length 26 | 27 | clean_words: bool, default: True 28 | If true will only keep alphanumeric characters and "_", "-" 29 | Warning: shouldn't be applied to embeddings with non-ascii characters 30 | 31 | load_kwargs: 32 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 33 | should pass vocab_size and dim. 34 | """ 35 | assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format" 36 | if format == "word2vec_bin": 37 | w = Embedding.from_word2vec(fname, binary=True) 38 | elif format == "word2vec": 39 | w = Embedding.from_word2vec(fname, binary=False) 40 | elif format == "glove": 41 | w = Embedding.from_glove(fname, **load_kwargs) 42 | elif format == "dict": 43 | d = pickle.load(open(fname, "rb"), encoding='latin1') 44 | w = Embedding.from_dict(d) 45 | if normalize: 46 | w.normalize_words(inplace=True) 47 | if lower or clean_words: 48 | w.standardize_words(lower=lower, clean_words=clean_words, inplace=True) 49 | return w 50 | 51 | 52 | 53 | def fetch_GloVe(dim=300, corpus="wiki-6B", normalize=True, lower=False, clean_words=False): 54 | """ 55 | Fetches GloVe embeddings. 56 | 57 | Parameters 58 | ---------- 59 | dim: int, default: 300 60 | Dimensionality of embedding (usually performance increases with dimensionality). 61 | Available dimensionalities: 62 | * wiki-6B: 50, 100, 200, 300 63 | * common-crawl-42B: 300 64 | * common-crawl-840B: 300 65 | * twitter: 25, 50, 100, 200 66 | 67 | corpus: string, default: "wiki-6B" 68 | Corpus that GloVe vector were trained on. 69 | Available corpuses: "wiki-6B", "common-crawl-42B", "common-crawl-840B", "twitter-27B" 70 | 71 | normalize: bool, default: True 72 | If true will normalize all vector to unit length 73 | 74 | clean_words: bool, default: True 75 | If true will only keep alphanumeric characters and "_", "-" 76 | Warning: shouldn't be applied to embeddings with non-ascii characters 77 | 78 | load_kwargs: 79 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 80 | should pass vocab_size and dim. 
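A minimal illustrative call (the archive is downloaded and uncompressed on first use, so expect a long first run)::

        w = fetch_GloVe(corpus="wiki-6B", dim=50)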
81 | 82 | Returns 83 | ------- 84 | w: Embedding 85 | Embedding instance 86 | 87 | References 88 | ---------- 89 | Project website: http://nlp.stanford.edu/projects/glove/ 90 | 91 | Notes 92 | ----- 93 | Loading GloVe format can take a while 94 | """ 95 | download_file = { 96 | "wiki-6B": "http://nlp.stanford.edu/data/glove.6B.zip", 97 | "common-crawl-42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", 98 | "common-crawl-840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", 99 | "twitter-27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip" 100 | } 101 | 102 | embedding_file = { 103 | "wiki-6B": { 104 | 50: "glove.6B/glove.6B.50d.txt", 105 | 100: "glove.6B/glove.6B.100d.txt", 106 | 200: "glove.6B/glove.6B.200d.txt", 107 | 300: "glove.6B/glove.6B.300d.txt" 108 | }, 109 | "common-crawl-42B": { 110 | 300: "glove.42B.300d/glove.42B.300d.txt" 111 | }, 112 | "common-crawl-840B": { 113 | 300: "glove.840B.300d/glove.840B.300d.txt" 114 | }, 115 | "twitter-27B": { 116 | 25: "glove.twitter.27B/glove.twitter.27B.25d.txt", 117 | 50: "glove.twitter.27B/glove.twitter.27B.50d.txt", 118 | 100: "glove.twitter.27B/glove.twitter.27B.100d.txt", 119 | 200: "glove.twitter.27B/glove.twitter.27B.200d.txt", 120 | } 121 | } 122 | 123 | vocab_size = { 124 | "wiki-6B": 400000, 125 | "common-crawl-42B": 1917494, 126 | "common-crawl-840B": 2196017, 127 | "twitter-27B": 1193514 128 | } 129 | 130 | assert corpus in download_file, "Unrecognized corpus" 131 | assert dim in embedding_file[corpus], "Not available dimensionality" 132 | 133 | _ = _fetch_file(url=download_file[corpus], 134 | data_dir="embeddings", 135 | uncompress=True, 136 | verbose=1) 137 | 138 | return load_embedding(path.join(_get_dataset_dir("embeddings"), embedding_file[corpus][dim]), 139 | format="glove", 140 | normalize=normalize, 141 | lower=lower, clean_words=clean_words,\ 142 | load_kwargs={"vocab_size": vocab_size[corpus], "dim": dim}) 143 | 144 | 145 | 146 | def fetch_HPCA(which, normalize=True, lower=False, clean_words=False): 147 | """ 148 | Fetches Hellinger PCA based embeddings 149 | 150 | Parameters 151 | ---------- 152 | which: str, default: "autoencoder_phrase_hpca" 153 | Can choose between "hpca" and "autoencoder_phrase_hpca" (from "The Sum of Its Parts") 154 | 155 | normalize: bool, default: True 156 | If true will normalize all vector to unit length 157 | 158 | clean_words: bool, default: True 159 | If true will only keep alphanumeric characters and "_", "-" 160 | Warning: shouldn't be applied to embeddings with non-ascii characters 161 | 162 | load_kwargs: 163 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 164 | should pass vocab_size and dim. 
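A minimal illustrative call (downloads the pretrained binary on first use)::

        w = fetch_HPCA(which="hpca")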
165 | 166 | Returns 167 | ------- 168 | w: Embedding 169 | Instance of Embedding class 170 | 171 | References 172 | ---------- 173 | Published at http://lebret.ch/words/ 174 | Reference paper: Lebret, Collobert et al., “The Sum of Its Parts”: Joint Learning of Word and Phrase Representations with Autoencoders", 2015 175 | """ 176 | download_file = { 177 | "autoencoder_phrase_hpca": "https://www.dropbox.com/s/6dyf48crdmjbw1a/AHPCA.bin.gz?dl=1", 178 | "hpca": "https://www.dropbox.com/s/5y5l6vyn8yn11dv/HPCA.bin.gz?dl=1" 179 | } 180 | 181 | path = _fetch_file(url=download_file[which], 182 | data_dir="embeddings", 183 | uncompress=False, 184 | verbose=1) 185 | 186 | return load_embedding(path, format="word2vec_bin", normalize=normalize, lower=lower, clean_words=clean_words) 187 | 188 | 189 | 190 | def fetch_morphoRNNLM(which, normalize=True, lower=False, clean_words=False): 191 | """ 192 | Fetches recursive morphological neural network embeddings 193 | 194 | Parameters 195 | ---------- 196 | which: str, default: "CW" 197 | Can choose between CW and HSMN 198 | 199 | normalize: bool, default: True 200 | If true will normalize all vector to unit length 201 | 202 | clean_words: bool, default: True 203 | If true will only keep alphanumeric characters and "_", "-" 204 | Warning: shouldn't be applied to embeddings with non-ascii characters 205 | 206 | load_kwargs: 207 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 208 | should pass vocab_size and dim. 209 | 210 | Returns 211 | ------- 212 | w: Embedding 213 | Instance of Embedding class 214 | 215 | References 216 | ---------- 217 | Published at http://stanford.edu/~lmthang/morphoNLM/ 218 | Reference paper: Luong, Socher et al., "Better Word Representations with Recursive Neural Networks for Morphology", 2013 219 | """ 220 | download_file = { 221 | "CW": "https://www.dropbox.com/s/7fdj2666iqv4xbu/cwCsmRNN.bin.gz?dl=1", 222 | "HSMN": "https://www.dropbox.com/s/okw1i6kc6e2jd1q/hsmnCsmRNN.bin.gz?dl=1" 223 | } 224 | 225 | path = _fetch_file(url=download_file[which], 226 | data_dir="embeddings", 227 | uncompress=False, 228 | verbose=1) 229 | 230 | return load_embedding(path, format="word2vec_bin", normalize=normalize, lower=lower, clean_words=clean_words) 231 | 232 | 233 | 234 | 235 | 236 | def fetch_NMT(which="DE", normalize=True, lower=False, clean_words=False): 237 | """ 238 | Fetches word embeddings induced by Neural Translation Machine 239 | 240 | Parameters 241 | ---------- 242 | which: str, default: "DE" 243 | Can choose between DE and FR, which fetches accordingly EN -> DE or EN -> FR translation 244 | induced word embeddings 245 | 246 | normalize: bool, default: True 247 | If true will normalize all vector to unit length 248 | 249 | clean_words: bool, default: True 250 | If true will only keep alphanumeric characters and "_", "-" 251 | Warning: shouldn't be applied to embeddings with non-ascii characters 252 | 253 | load_kwargs: 254 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 255 | should pass vocab_size and dim. 256 | 257 | Returns 258 | ------- 259 | w: Embedding 260 | Instance of Embedding class 261 | 262 | References 263 | ---------- 264 | Published at https://www.cl.cam.ac.uk/~fh295/. 
265 | Reference paper: Hill, Cho et al., "Embedding Word Similarity With Neural Machine Translation", 2014 266 | """ 267 | dirname = _fetch_file(url="https://www.cl.cam.ac.uk/~fh295/TEmbz.tar.gz", 268 | data_dir="embeddings", 269 | uncompress=True, 270 | verbose=1) 271 | 272 | assert which in ["DE", "FR"], "Unrecognized which parameter" 273 | 274 | fname = {"FR": "Trans_embds/D_RNN_500k_144h.pkl", "DE": "Trans_embds/D_german_50k_500k_168h.pkl"} 275 | 276 | return load_embedding(path.join(dirname, fname[which]), 277 | format="dict", 278 | normalize=normalize, 279 | lower=lower, clean_words=clean_words) 280 | 281 | 282 | 283 | def fetch_PDC(dim=300, normalize=True, lower=False, clean_words=False): 284 | """ 285 | Fetches PDC embeddings trained on wiki by Fei Sun 286 | 287 | Parameters 288 | ---------- 289 | dim: int, default:300 290 | Dimensionality of embedding 291 | 292 | normalize: bool, default: True 293 | If true will normalize all vector to unit length 294 | 295 | clean_words: bool, default: True 296 | If true will only keep alphanumeric characters and "_", "-" 297 | Warning: shouldn't be applied to embeddings with non-ascii characters 298 | 299 | load_kwargs: 300 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 301 | should pass vocab_size and dim. 302 | 303 | Returns 304 | ------- 305 | w: Embedding 306 | Embedding instance 307 | 308 | References 309 | ---------- 310 | Embeddings were published on http://ofey.me/projects/wordrep/. 311 | Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng. 312 | "Learning word representations by jointly modeling syntagmatic and paradigmatic relations" 313 | """ 314 | 315 | url = { 316 | 50: "https://www.dropbox.com/s/0ofi1glri8l42y1/wikicorp.201004-pdc-" 317 | "iter-20-alpha-0.05-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1", 318 | 100: "https://www.dropbox.com/s/fmvegh4j62hulr0/wikicorp.201004-pdc-" 319 | "iter-20-alpha-0.05-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1", 320 | 300: "https://www.dropbox.com/s/jppkd6j2xxb9v48/wikicorp.201004-pdc-" 321 | "iter-20-alpha-0.05-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1" 322 | } 323 | assert dim in url, "Unavailable dimensionality" 324 | 325 | path = _fetch_file(url=url[dim], 326 | data_dir="embeddings", 327 | uncompress=False, 328 | move="pdc/pdc{}.txt.bz2".format(dim), 329 | verbose=1) 330 | 331 | return load_embedding(path, format="word2vec", normalize=normalize, lower=lower, clean_words=clean_words) 332 | 333 | 334 | def fetch_HDC(dim=300, normalize=True, lower=False, clean_words=False): 335 | """ 336 | Fetches PDC embeddings trained on wiki by Fei Sun 337 | 338 | Parameters 339 | ---------- 340 | dim: int, default:300 341 | Dimensionality of embedding 342 | 343 | normalize: bool, default: True 344 | If true will normalize all vector to unit length 345 | 346 | clean_words: bool, default: True 347 | If true will only keep alphanumeric characters and "_", "-" 348 | Warning: shouldn't be applied to embeddings with non-ascii characters 349 | 350 | load_kwargs: 351 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 352 | should pass vocab_size and dim. 353 | 354 | Returns 355 | ------- 356 | w: Embedding 357 | Embedding instance 358 | 359 | References 360 | ---------- 361 | Embeddings were published on http://ofey.me/projects/wordrep/. 362 | Reference paper: Fei Sun, Jiafeng Guo, Yanyan Lan, Jun Xu, and Xueqi Cheng. 
363 | "Learning word representations by jointly modeling syntagmatic and paradigmatic relations" 364 | """ 365 | 366 | url = { 367 | 50: "https://www.dropbox.com/s/q22ssy8055loknz/wikicorp.201004-hdc-" 368 | "iter-20-alpha-0.025-window-10-dim-50-neg-10-subsample-0.0001.txt.bz2?dl=1", 369 | 100: "https://www.dropbox.com/s/13226et55fi6g50/wikicorp.201004-hdc-" 370 | "iter-20-alpha-0.025-window-10-dim-100-neg-10-subsample-0.0001.txt.bz2?dl=1", 371 | 300: "https://www.dropbox.com/s/jrfwel32yd8w0lu/wikicorp.201004-hdc-" 372 | "iter-20-alpha-0.025-window-10-dim-300-neg-10-subsample-0.0001.txt.bz2?dl=1" 373 | } 374 | assert dim in url, "Unavailable dimensionality" 375 | 376 | path = _fetch_file(url=url[dim], 377 | data_dir="embeddings", 378 | uncompress=False, 379 | move="hdc/hdc{}.txt.bz2".format(dim), 380 | verbose=1) 381 | 382 | return load_embedding(path, format="word2vec", normalize=normalize, lower=lower, clean_words=clean_words) 383 | 384 | 385 | 386 | def fetch_SG_GoogleNews(normalize=True, lower=False, clean_words=False): 387 | """ 388 | Fetches SG (skip-gram with negative sampling) 389 | embeddings trained on GoogleNews dataset published on word2vec website 390 | 391 | Parameters 392 | ---------- 393 | normalize: bool, default: True 394 | If true will normalize all vector to unit length 395 | 396 | clean_words: bool, default: True 397 | If true will only keep alphanumeric characters and "_", "-" 398 | Warning: shouldn't be applied to embeddings with non-ascii characters 399 | 400 | load_kwargs: 401 | Additional parameters passed to load function. Mostly useful for 'glove' format where you 402 | should pass vocab_size and dim. 403 | 404 | Returns 405 | ------- 406 | w: Embedding 407 | Instance of Embedding class 408 | 409 | References 410 | ---------- 411 | Original source: https://code.google.com/p/word2vec/ 412 | """ 413 | path = _fetch_file(url="https://www.dropbox.com/s/bnm0trligffakd9/GoogleNews-vectors-negative300.bin.gz?dl=1", 414 | data_dir="embeddings", 415 | verbose=1) 416 | return load_embedding(path, format="word2vec_bin", normalize=normalize, lower=lower, clean_words=clean_words) 417 | 418 | def fetch_LexVec(which="commoncrawl-W+C", normalize=True, lower=False, clean_words=False): 419 | """ 420 | Fetches LexVec embeddings 421 | 422 | Parameters 423 | ---------- 424 | which: str, default: "commoncrawl-W+C" 425 | Can choose between "commoncrawl-W", "commoncrawl-W+C", "wikipedia+newscrawl-W", "wikipedia+newscrawl-W+C", "commoncrawl-ngramsubwords-W" 426 | 427 | normalize: bool, default: True 428 | If true will normalize all vector to unit length 429 | 430 | lower: bool, default: False 431 | If true, will convert string to lowercase 432 | 433 | clean_words: bool, default: False 434 | If true will only keep alphanumeric characters and "_", "-" 435 | Warning: shouldn't be applied to embeddings with non-ascii characters 436 | 437 | Returns 438 | ------- 439 | w: Embedding 440 | Instance of Embedding class 441 | 442 | References 443 | ---------- 444 | Published at https://github.com/alexandres/lexvec 445 | Reference paper: Salle, Alexandre, Marco Idiart, and Aline Villavicencio. Matrix Factorization using Window Sampling and Negative Sampling for Improved Word Representations. The 54th Annual Meeting of the Association for Computational Linguistics. 2016. 
446 | """ 447 | download_file = { 448 | "commoncrawl-W": "https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1", 449 | "commoncrawl-W+C": "https://www.dropbox.com/s/zkiajh6fj0hm0m7/lexvec.commoncrawl.300d.W%2BC.pos.vectors.gz?dl=1", 450 | "wikipedia+newscrawl-W": "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1", 451 | "wikipedia+newscrawl-W+C": "https://www.dropbox.com/s/u320t9bw6tzlwma/lexvec.enwiki%2Bnewscrawl.300d.W%2BC.pos.vectors.gz?dl=1", 452 | "commoncrawl-ngramsubwords-W": "https://www.dropbox.com/s/mrxn933chn5u37z/lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors.gz?dl=1" 453 | } 454 | 455 | path = _fetch_file(url=download_file[which], 456 | data_dir="embeddings", 457 | verbose=1) 458 | 459 | return load_embedding(path, format="word2vec", normalize=normalize, lower=lower, clean_words=clean_words) 460 | 461 | 462 | def fetch_conceptnet_numberbatch(clean_words=False): 463 | """ 464 | Fetches ConceptNetNumberbatch embeddings. Embeddings are normalized to unit length, 465 | and the vocabulary terms are lowercase. 466 | 467 | Parameters 468 | ---------- 469 | clean_words: bool, default: False 470 | If true will only keep alphanumeric characters and "_", "-" 471 | Warning: shouldn't be applied to embeddings with non-ascii characters 472 | 473 | Returns 474 | ------- 475 | w: Embedding 476 | Instance of Embedding class 477 | 478 | References 479 | ---------- 480 | Published at https://github.com/commonsense/conceptnet-numberbatch 481 | Reference paper: Robert Speer, Joshua Chin, and Catherine Havasi (2017). "ConceptNet 5.5: An Open Multilingual Graph of General Knowledge." In proceedings of AAAI 2017. 482 | """ 483 | path = _fetch_file(url='https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz', 484 | data_dir='embeddings', 485 | uncompress=False, 486 | verbose=1) 487 | return load_embedding(path, format='word2vec', normalize=False, clean_words=clean_words) 488 | 489 | 490 | def fetch_FastText(lang="en", normalize=True, lower=False, clean_words=False): 491 | """ 492 | Fetches fastText embeddings 493 | 494 | Parameters 495 | ---------- 496 | lang: str, default: "en" 497 | Can choose between all accessible language on page: 498 | https://fasttext.cc/docs/en/pretrained-vectors.html#content 499 | 500 | normalize: bool, default: True 501 | If true will normalize all vector to unit length 502 | 503 | lower: bool, default: False 504 | If true, will convert string to lowercase 505 | 506 | clean_words: bool, default: False 507 | If true will only keep alphanumeric characters and "_", "-" 508 | Warning: shouldn't be applied to embeddings with non-ascii characters 509 | 510 | Returns 511 | ------- 512 | w: Embedding 513 | Instance of Embedding class 514 | 515 | References 516 | ---------- 517 | Published at https://fasttext.cc/ 518 | """ 519 | 520 | url_vec = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.vec'.format(lang) 521 | 522 | path = _fetch_file(url=url_vec, data_dir='embeddings', 523 | uncompress=False, 524 | verbose=1) 525 | 526 | return load_embedding(path, format='word2vec', normalize=normalize, lower=lower, clean_words=clean_words) 527 | 528 | 529 | # TODO: uncomment after training is finished 530 | # def fetch_SG_wiki(normalize=True, lower=False, clean_words=True): 531 | # """ 532 | # Fetches SG (skip-gram) embeddings trained on recent (12.2015) Wiki corpus using gensim 533 | # 534 | # Note 535 | # ---- 536 | # Doesn't distinguish between lower and 
capital letters in embedding. 537 | # See scripts used for training on github in scripts/wikipedia/ 538 | # """ 539 | # fname = path.join(_get_dataset_dir('embeddings'), "sg-wiki-en-400.bin") 540 | # return _load_embedding(fname, format="word2vec_binary", normalize=normalize, 541 | # lower=lower, clean_words=clean_words) 542 | -------------------------------------------------------------------------------- /web/evaluate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Evaluation functions 4 | """ 5 | import logging 6 | import numpy as np 7 | from sklearn.cluster import AgglomerativeClustering, KMeans 8 | from .datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856 9 | from .datasets.categorization import fetch_AP, fetch_battig, fetch_BLESS, fetch_ESSLI_1a, fetch_ESSLI_2b, \ 10 | fetch_ESSLI_2c 11 | from web.analogy import * 12 | from six import iteritems 13 | from web.embedding import Embedding 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def calculate_purity(y_true, y_pred): 18 | """ 19 | Calculate purity for given true and predicted cluster labels. 20 | 21 | Parameters 22 | ---------- 23 | y_true: array, shape: (n_samples, 1) 24 | True cluster labels 25 | 26 | y_pred: array, shape: (n_samples, 1) 27 | Cluster assingment. 28 | 29 | Returns 30 | ------- 31 | purity: float 32 | Calculated purity. 33 | """ 34 | assert len(y_true) == len(y_pred) 35 | true_clusters = np.zeros(shape=(len(set(y_true)), len(y_true))) 36 | pred_clusters = np.zeros_like(true_clusters) 37 | for id, cl in enumerate(set(y_true)): 38 | true_clusters[id] = (y_true == cl).astype("int") 39 | for id, cl in enumerate(set(y_pred)): 40 | pred_clusters[id] = (y_pred == cl).astype("int") 41 | 42 | M = pred_clusters.dot(true_clusters.T) 43 | return 1. / len(y_true) * np.sum(np.max(M, axis=1)) 44 | 45 | 46 | def evaluate_categorization(w, X, y, method="all", seed=None): 47 | """ 48 | Evaluate embeddings on categorization task. 49 | 50 | Parameters 51 | ---------- 52 | w: Embedding or dict 53 | Embedding to test. 54 | 55 | X: vector, shape: (n_samples, ) 56 | Vector of words. 57 | 58 | y: vector, shape: (n_samples, ) 59 | Vector of cluster assignments. 60 | 61 | method: string, default: "all" 62 | What method to use. Possible values are "agglomerative", "kmeans", "all. 63 | If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude 64 | hyperparameter tuning to avoid overfitting). 65 | If "kmeans" is passed, method will fit KMeans. 66 | In both cases number of clusters is preset to the correct value. 67 | 68 | seed: int, default: None 69 | Seed passed to KMeans. 70 | 71 | Returns 72 | ------- 73 | purity: float 74 | Purity of the best obtained clustering. 75 | 76 | Notes 77 | ----- 78 | KMedoids method was excluded as empirically didn't improve over KMeans (for categorization 79 | tasks available in the package). 
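Examples
    --------
    A toy run with an in-memory dict embedding (illustrative; the vectors and labels below are made up)::

        import numpy as np
        w = {"cat": [1.0, 0.0], "dog": [0.9, 0.1], "car": [0.0, 1.0], "bus": [0.1, 0.9]}
        X = np.array(["cat", "dog", "car", "bus"])
        y = np.array([0, 0, 1, 1])
        evaluate_categorization(w, X, y, seed=0)  # expected to be 1.0 for this separable toy data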
80 | """ 81 | 82 | if isinstance(w, dict): 83 | w = Embedding.from_dict(w) 84 | 85 | assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method" 86 | 87 | mean_vector = np.mean(w.vectors, axis=0, keepdims=True) 88 | words = np.vstack(w.get(word, mean_vector) for word in X.flatten()) 89 | ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False) 90 | 91 | # Evaluate clustering on several hyperparameters of AgglomerativeClustering and 92 | # KMeans 93 | best_purity = 0 94 | 95 | if method == "all" or method == "agglomerative": 96 | best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)), 97 | affinity="euclidean", 98 | linkage="ward").fit_predict(words[ids])) 99 | logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward')) 100 | for affinity in ["cosine", "euclidean"]: 101 | for linkage in ["average", "complete"]: 102 | purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)), 103 | affinity=affinity, 104 | linkage=linkage).fit_predict(words[ids])) 105 | logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage)) 106 | best_purity = max(best_purity, purity) 107 | 108 | if method == "all" or method == "kmeans": 109 | purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=len(set(y))). 110 | fit_predict(words[ids])) 111 | logger.debug("Purity={:.3f} using KMeans".format(purity)) 112 | best_purity = max(purity, best_purity) 113 | 114 | return best_purity 115 | 116 | 117 | 118 | def evaluate_on_semeval_2012_2(w): 119 | """ 120 | Simple method to score embedding using SimpleAnalogySolver 121 | 122 | Parameters 123 | ---------- 124 | w : Embedding or dict 125 | Embedding or dict instance. 
126 | 127 | Returns 128 | ------- 129 | result: pandas.DataFrame 130 | Results with spearman correlation per broad category with special key "all" for summary 131 | spearman correlation 132 | """ 133 | if isinstance(w, dict): 134 | w = Embedding.from_dict(w) 135 | 136 | data = fetch_semeval_2012_2() 137 | mean_vector = np.mean(w.vectors, axis=0, keepdims=True) 138 | categories = data.y.keys() 139 | results = defaultdict(list) 140 | for c in categories: 141 | # Get mean of left and right vector 142 | prototypes = data.X_prot[c] 143 | prot_left = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 0]), axis=0) 144 | prot_right = np.mean(np.vstack(w.get(word, mean_vector) for word in prototypes[:, 1]), axis=0) 145 | 146 | questions = data.X[c] 147 | question_left, question_right = np.vstack(w.get(word, mean_vector) for word in questions[:, 0]), \ 148 | np.vstack(w.get(word, mean_vector) for word in questions[:, 1]) 149 | 150 | scores = np.dot(prot_left - prot_right, (question_left - question_right).T) 151 | 152 | c_name = data.categories_names[c].split("_")[0] 153 | # NaN happens when there are only 0s, which might happen for very rare words or 154 | # very insufficient word vocabulary 155 | cor = scipy.stats.spearmanr(scores, data.y[c]).correlation 156 | results[c_name].append(0 if np.isnan(cor) else cor) 157 | 158 | final_results = OrderedDict() 159 | final_results['all'] = sum(sum(v) for v in results.values()) / len(categories) 160 | for k in results: 161 | final_results[k] = sum(results[k]) / len(results[k]) 162 | return pd.Series(final_results) 163 | 164 | 165 | def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100): 166 | """ 167 | Simple method to score embedding using SimpleAnalogySolver 168 | 169 | Parameters 170 | ---------- 171 | w : Embedding or dict 172 | Embedding or dict instance. 173 | 174 | method : {"add", "mul"} 175 | Method to use when finding analogy answer, see "Improving Distributional Similarity 176 | with Lessons Learned from Word Embeddings" 177 | 178 | X : array-like, shape (n_samples, 3) 179 | Analogy questions. 180 | 181 | y : array-like, shape (n_samples, ) 182 | Analogy answers. 183 | 184 | k : int, default: None 185 | If not None will select k top most frequent words from embedding 186 | 187 | batch_size : int, default: 100 188 | Increase to increase memory consumption and decrease running time 189 | 190 | category : list, default: None 191 | Category of each example, if passed function returns accuracy per category 192 | in addition to the overall performance. 193 | Analogy datasets have "category" field that can be supplied here. 
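Each row of X holds the three query words of one analogy question and y holds the expected fourth word, e.g. (illustrative)::

        X = np.array([["man", "woman", "king"]])
        y = np.array(["queen"])
        evaluate_analogy(w, X, y)  # overall accuracy when `category` is None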
194 | 195 | Returns 196 | ------- 197 | result: dict 198 | Results, where each key is for given category and special empty key "" stores 199 | summarized accuracy across categories 200 | """ 201 | if isinstance(w, dict): 202 | w = Embedding.from_dict(w) 203 | 204 | assert category is None or len(category) == y.shape[0], "Passed incorrect category list" 205 | 206 | solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k) 207 | y_pred = solver.predict(X) 208 | 209 | if category is not None: 210 | results = OrderedDict({"all": np.mean(y_pred == y)}) 211 | count = OrderedDict({"all": len(y_pred)}) 212 | correct = OrderedDict({"all": np.sum(y_pred == y)}) 213 | for cat in set(category): 214 | results[cat] = np.mean(y_pred[category == cat] == y[category == cat]) 215 | count[cat] = np.sum(category == cat) 216 | correct[cat] = np.sum(y_pred[category == cat] == y[category == cat]) 217 | 218 | return pd.concat([pd.Series(results, name="accuracy"), 219 | pd.Series(correct, name="correct"), 220 | pd.Series(count, name="count")], 221 | axis=1) 222 | else: 223 | return np.mean(y_pred == y) 224 | 225 | 226 | def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs={}): 227 | """ 228 | Evaluate on WordRep dataset 229 | 230 | Parameters 231 | ---------- 232 | w : Embedding or dict 233 | Embedding or dict instance. 234 | 235 | max_pairs: int, default: 1000 236 | Each category will be constrained to maximum of max_pairs pairs 237 | (which results in max_pair * (max_pairs - 1) examples) 238 | 239 | solver_kwargs: dict, default: {} 240 | Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words 241 | in the dictionary. 242 | 243 | References 244 | ---------- 245 | Bin Gao, Jiang Bian, Tie-Yan Liu (2015) 246 | "WordRep: A Benchmark for Research on Learning Word Representations" 247 | """ 248 | if isinstance(w, dict): 249 | w = Embedding.from_dict(w) 250 | 251 | data = fetch_wordrep() 252 | categories = set(data.category) 253 | 254 | accuracy = {} 255 | correct = {} 256 | count = {} 257 | for cat in categories: 258 | X_cat = data.X[data.category == cat] 259 | X_cat = X_cat[0:max_pairs] 260 | 261 | logger.info("Processing {} with {} pairs, {} questions".format(cat, X_cat.shape[0] 262 | , X_cat.shape[0] * (X_cat.shape[0] - 1))) 263 | 264 | # For each category construct question-answer pairs 265 | size = X_cat.shape[0] * (X_cat.shape[0] - 1) 266 | X = np.zeros(shape=(size, 3), dtype="object") 267 | y = np.zeros(shape=(size,), dtype="object") 268 | id = 0 269 | for left, right in product(X_cat, X_cat): 270 | if not np.array_equal(left, right): 271 | X[id, 0:2] = left 272 | X[id, 2] = right[0] 273 | y[id] = right[1] 274 | id += 1 275 | 276 | # Run solver 277 | solver = SimpleAnalogySolver(w=w, **solver_kwargs) 278 | y_pred = solver.predict(X) 279 | correct[cat] = float(np.sum(y_pred == y)) 280 | count[cat] = size 281 | accuracy[cat] = float(np.sum(y_pred == y)) / size 282 | 283 | # Add summary results 284 | correct['wikipedia'] = sum(correct[c] for c in categories if c in data.wikipedia_categories) 285 | correct['all'] = sum(correct[c] for c in categories) 286 | correct['wordnet'] = sum(correct[c] for c in categories if c in data.wordnet_categories) 287 | 288 | count['wikipedia'] = sum(count[c] for c in categories if c in data.wikipedia_categories) 289 | count['all'] = sum(count[c] for c in categories) 290 | count['wordnet'] = sum(count[c] for c in categories if c in data.wordnet_categories) 291 | 292 | accuracy['wikipedia'] = correct['wikipedia'] / count['wikipedia'] 293 | 
accuracy['all'] = correct['all'] / count['all'] 294 | accuracy['wordnet'] = correct['wordnet'] / count['wordnet'] 295 | 296 | return pd.concat([pd.Series(accuracy, name="accuracy"), 297 | pd.Series(correct, name="correct"), 298 | pd.Series(count, name="count")], axis=1) 299 | 300 | 301 | def evaluate_similarity(w, X, y): 302 | """ 303 | Calculate Spearman correlation between cosine similarity of the model 304 | and human rated similarity of word pairs 305 | 306 | Parameters 307 | ---------- 308 | w : Embedding or dict 309 | Embedding or dict instance. 310 | 311 | X: array, shape: (n_samples, 2) 312 | Word pairs 313 | 314 | y: vector, shape: (n_samples,) 315 | Human ratings 316 | 317 | Returns 318 | ------- 319 | cor: float 320 | Spearman correlation 321 | """ 322 | if isinstance(w, dict): 323 | w = Embedding.from_dict(w) 324 | 325 | missing_words = 0 326 | words = w.vocabulary.word_id 327 | for query in X: 328 | for query_word in query: 329 | if query_word not in words: 330 | missing_words += 1 331 | if missing_words > 0: 332 | logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words)) 333 | 334 | 335 | mean_vector = np.mean(w.vectors, axis=0, keepdims=True) 336 | A = np.vstack(w.get(word, mean_vector) for word in X[:, 0]) 337 | B = np.vstack(w.get(word, mean_vector) for word in X[:, 1]) 338 | scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)]) 339 | return scipy.stats.spearmanr(scores, y).correlation 340 | 341 | 342 | def evaluate_on_all(w): 343 | """ 344 | Evaluate Embedding on all fast-running benchmarks 345 | 346 | Parameters 347 | ---------- 348 | w: Embedding or dict 349 | Embedding to evaluate. 350 | 351 | Returns 352 | ------- 353 | results: pandas.DataFrame 354 | DataFrame with results, one per column. 
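Examples
    --------
    Illustrative end-to-end run (fetches the embedding and all benchmark datasets on first use, so it is slow and network-bound)::

        from web.embeddings import fetch_GloVe
        w = fetch_GloVe(corpus="wiki-6B", dim=50)
        results = evaluate_on_all(w)
        print(results.T)  # one benchmark per row after transposing the single-row frame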
355 | """ 356 | if isinstance(w, dict): 357 | w = Embedding.from_dict(w) 358 | 359 | # Calculate results on similarity 360 | logger.info("Calculating similarity benchmarks") 361 | similarity_tasks = { 362 | "MEN": fetch_MEN(), 363 | "WS353": fetch_WS353(), 364 | "WS353R": fetch_WS353(which="relatedness"), 365 | "WS353S": fetch_WS353(which="similarity"), 366 | "SimLex999": fetch_SimLex999(), 367 | "RW": fetch_RW(), 368 | "RG65": fetch_RG65(), 369 | "MTurk": fetch_MTurk(), 370 | } 371 | 372 | similarity_results = {} 373 | 374 | for name, data in iteritems(similarity_tasks): 375 | similarity_results[name] = evaluate_similarity(w, data.X, data.y) 376 | logger.info("Spearman correlation of scores on {} {}".format(name, similarity_results[name])) 377 | 378 | # Calculate results on analogy 379 | logger.info("Calculating analogy benchmarks") 380 | analogy_tasks = { 381 | "Google": fetch_google_analogy(), 382 | "MSR": fetch_msr_analogy() 383 | } 384 | 385 | analogy_results = {} 386 | 387 | for name, data in iteritems(analogy_tasks): 388 | analogy_results[name] = evaluate_analogy(w, data.X, data.y) 389 | logger.info("Analogy prediction accuracy on {} {}".format(name, analogy_results[name])) 390 | 391 | analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all'] 392 | logger.info("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"])) 393 | 394 | # Calculate results on categorization 395 | logger.info("Calculating categorization benchmarks") 396 | categorization_tasks = { 397 | "AP": fetch_AP(), 398 | "BLESS": fetch_BLESS(), 399 | "Battig": fetch_battig(), 400 | "ESSLI_2c": fetch_ESSLI_2c(), 401 | "ESSLI_2b": fetch_ESSLI_2b(), 402 | "ESSLI_1a": fetch_ESSLI_1a() 403 | } 404 | 405 | categorization_results = {} 406 | 407 | # Calculate results using helper function 408 | for name, data in iteritems(categorization_tasks): 409 | categorization_results[name] = evaluate_categorization(w, data.X, data.y) 410 | logger.info("Cluster purity on {} {}".format(name, categorization_results[name])) 411 | 412 | # Construct pd table 413 | cat = pd.DataFrame([categorization_results]) 414 | analogy = pd.DataFrame([analogy_results]) 415 | sim = pd.DataFrame([similarity_results]) 416 | results = cat.join(sim).join(analogy) 417 | 418 | return results 419 | -------------------------------------------------------------------------------- /web/tests/test_analogy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for analogy solvers 5 | """ 6 | import numpy as np 7 | 8 | from web.datasets.utils import _fetch_file 9 | from web.embedding import Embedding 10 | from web.datasets.analogy import fetch_google_analogy 11 | from web.evaluate import evaluate_analogy, evaluate_on_semeval_2012_2, evaluate_on_WordRep 12 | 13 | 14 | # TODO: takes too long 15 | def test_semeval_solver(): 16 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 17 | file_name = _fetch_file(url, "test") 18 | w = Embedding.from_word2vec(file_name, binary=True) 19 | results = evaluate_on_semeval_2012_2(w) 20 | assert results['all'] >= 0, "Should have some results on SemEval2012" 21 | 22 | 23 | def test_wordrep_solver(): 24 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 25 | file_name = _fetch_file(url, "test") 26 | w = Embedding.from_word2vec(file_name, binary=True) 27 | P = evaluate_on_WordRep(w, max_pairs=2) 28 | assert P['accuracy']['all'] >= 0 29 | 30 | 
31 | def test_analogy_solver(): 32 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 33 | file_name = _fetch_file(url, "test") 34 | 35 | w = Embedding.from_word2vec(file_name, binary=True) 36 | data = fetch_google_analogy() 37 | ids = np.random.RandomState(777).choice(range(data.X.shape[0]), 1000, replace=False) 38 | X, y = data.X[ids], data.y[ids] 39 | category = data.category_high_level[ids] 40 | 41 | results = evaluate_analogy(w=w, X=X, y=y, category=category) 42 | assert results['accuracy']['all'] >= 0.65 43 | assert results['accuracy']['semantic'] >= 0.7 44 | assert results['accuracy']['syntactic'] >= 0.63 45 | 46 | results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul") 47 | assert results['accuracy']['all'] >= 0.7 48 | assert results['accuracy']['semantic'] >= 0.75 49 | assert results['accuracy']['syntactic'] >= 0.64 50 | 51 | results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400) 52 | results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400) 53 | assert results_mul['accuracy']['all'] >= results_add['accuracy']['all'] 54 | assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic'] 55 | assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic'] 56 | -------------------------------------------------------------------------------- /web/tests/test_categorization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from web.evaluate import calculate_purity, evaluate_categorization 3 | from web.embedding import Embedding 4 | from web.datasets.utils import _fetch_file 5 | from web.datasets.categorization import fetch_ESSLI_2c 6 | 7 | def test_purity(): 8 | y_true = np.array([1,1,2,2,3]) 9 | y_pred = np.array([2,2,2,2,1]) 10 | assert abs(0.6 - calculate_purity(y_true, y_pred)) < 1e-10 11 | 12 | def test_categorization(): 13 | data = fetch_ESSLI_2c() 14 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 15 | file_name = _fetch_file(url, "test") 16 | w = Embedding.from_word2vec(file_name, binary=True) 17 | assert evaluate_categorization(w, data.X, data.y, seed=777, method="all") >= 0.2 -------------------------------------------------------------------------------- /web/tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for embedding 5 | """ 6 | import tempfile 7 | from os import path 8 | 9 | import numpy as np 10 | 11 | from web.datasets.utils import _fetch_file 12 | from web.embedding import Embedding 13 | from web.utils import standardize_string 14 | from web.vocabulary import Vocabulary 15 | 16 | def test_standardize(): 17 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 18 | file_name = _fetch_file(url, "test") 19 | 20 | w = Embedding.from_word2vec(file_name, binary=True) 21 | w2 = w.standardize_words(inplace=False, lower=False, clean_words=True) 22 | w3 = Embedding.from_word2vec(file_name, binary=True) 23 | assert len(w2.words) == 95 24 | for word in w.vocabulary.words: 25 | if standardize_string(word, lower=False, clean_words=True): 26 | assert np.array_equal(w[word], w2[standardize_string(word, lower=False, clean_words=True)]) 27 | 28 | w3.standardize_words(inplace=True, clean_words=True, lower=False) 29 | assert len(w3.words) == 95 30 | for word in w.vocabulary.words: 31 | if 
standardize_string(word, lower=False): 32 | assert np.array_equal(w[word], w3[standardize_string(word, lower=False, clean_words=True)]) 33 | 34 | 35 | def test_standardize_preserve_identity(): 36 | d = {"Spider": [3, 4, 5], "spider": [1, 2, 3], "spideR": [3, 2, 4]} 37 | w3 = Embedding.from_dict(d) 38 | w4 = w3.standardize_words(inplace=False, lower=True) 39 | assert w4['spider'][0] == 1 40 | w3.standardize_words(inplace=True, lower=True) 41 | assert w3['spider'][0] == 1 42 | 43 | def test_save_2(): 44 | dirpath = tempfile.mkdtemp() 45 | w = ["a", "b", "c"] 46 | vectors = np.array([[1.,2.] ,[2.,3.], [3.,4.]]) 47 | e = Embedding(Vocabulary(w), vectors) 48 | Embedding.to_word2vec(e, path.join(dirpath, "test.bin"), binary=True) 49 | e2 = Embedding.from_word2vec(path.join(dirpath, "test.bin"), binary=True) 50 | assert np.array_equal(e2.vectors, vectors) 51 | 52 | def test_save(): 53 | url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1" 54 | file_name = _fetch_file(url, "test") 55 | w = Embedding.from_word2vec(file_name, binary=True) 56 | 57 | dirpath = tempfile.mkdtemp() 58 | w.to_word2vec(w, path.join(dirpath, "tmp.bin"), binary=True) 59 | w.to_word2vec(w, path.join(dirpath, "tmp.txt"), binary=False) 60 | w2 = Embedding.from_word2vec(path.join(dirpath, "tmp.bin"), binary=True) 61 | w3 = Embedding.from_word2vec(path.join(dirpath, "tmp.txt"), binary=False) 62 | assert np.array_equal(w.vectors, w2.vectors) 63 | assert w.vocabulary.words == w2.vocabulary.words 64 | assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5 65 | assert w.vocabulary.words == w3.vocabulary.words 66 | -------------------------------------------------------------------------------- /web/tests/test_fetchers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for data fetchers 5 | """ 6 | 7 | from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy, fetch_semeval_2012_2, \ 8 | fetch_wordrep 9 | 10 | from web.datasets.similarity import fetch_SimLex999, fetch_WS353, fetch_multilingual_SimLex999, \ 11 | fetch_MEN, fetch_MTurk, fetch_RW, fetch_RG65 12 | 13 | from web.datasets.categorization import fetch_AP, fetch_BLESS, fetch_battig,\ 14 | fetch_ESSLI_1a, fetch_ESSLI_2b, fetch_ESSLI_2c 15 | 16 | from itertools import product 17 | from six import iteritems 18 | 19 | def test_categorization_fetchers(): 20 | data = fetch_battig() 21 | assert data.X.shape[0] == 5231 22 | 23 | data = fetch_BLESS() 24 | assert data.X.shape[0] == 200 25 | 26 | data = fetch_AP() 27 | assert len(set(data.y)) == 21 28 | 29 | data = fetch_ESSLI_2c() 30 | assert data.X.shape[0] == 45 31 | assert len(set(data.y)) == 9 32 | 33 | data = fetch_ESSLI_2b() 34 | assert data.X.shape[0] == 40 35 | assert len(set(data.y)) == 3 36 | 37 | data = fetch_ESSLI_1a() 38 | assert data.X.shape[0] == 44 39 | assert len(set(data.y)) == 6 40 | 41 | def test_MTurk_fetcher(): 42 | data = fetch_MTurk() 43 | assert (len(data.y) == len(data.X) == 287) 44 | assert (10.0 >= data.y.max() >= 9) 45 | 46 | 47 | def test_RW_fetcher(): 48 | data = fetch_RW() 49 | assert (len(data.y) == len(data.X) == 2034) 50 | assert (10.0 >= data.y.max() >= 9.8) 51 | 52 | 53 | def test_RG65_fetcher(): 54 | data = fetch_RG65() 55 | assert (len(data.y) == len(data.X) == 65) 56 | assert (10.0 >= data.y.max() >= 9.8) 57 | 58 | 59 | def test_MEN_fetcher(): 60 | params = product(["all", "dev", "test"], ["natural", "lem"]) 61 | data, V = {}, {} 62 | for which, form in params: 63 | 
fetched = fetch_MEN(which=which, form=form) 64 | data[which + ":" + form] = fetched 65 | V[which + ":" + form] = set([" ".join(sorted(x)) for x in data[which + ":" + form].X]) 66 | assert fetched.y.max() <= 10.0 67 | 68 | assert V["dev:natural"].union(V["test:natural"]) == V["all:natural"] 69 | assert V["dev:lem"].union(V["test:lem"]) == V["all:lem"] 70 | assert data['all:natural'] 71 | 72 | 73 | def test_ws353_fetcher(): 74 | data1 = fetch_WS353(which="set1") 75 | data2 = fetch_WS353(which="set2") 76 | data3 = fetch_WS353(which="similarity") 77 | data4 = fetch_WS353(which="relatedness") 78 | data5 = fetch_WS353(which="all") 79 | V5 = set([" ".join(sorted(x)) for x in data5.X]) 80 | V1 = set([" ".join(sorted(x)) for x in data1.X]) 81 | V2 = set([" ".join(sorted(x)) for x in data2.X]) 82 | V3 = set([" ".join(sorted(x)) for x in data3.X]) 83 | V4 = set([" ".join(sorted(x)) for x in data4.X]) 84 | 85 | # sd and scores have same length 86 | assert data1.sd.shape[0] == data1.y.shape[0] 87 | assert data2.sd.shape[0] == data2.y.shape[0] 88 | 89 | # WSR = WSR-SET1 u WSR-SET2 90 | assert data5.X.shape[0] == 353 91 | assert V5 == V2.union(V1) 92 | 93 | assert V5 == V3.union(V4) 94 | 95 | # Two word pairs reoccurr 96 | assert len(V5) == 351 97 | 98 | 99 | def test_simlex999_fetchers(): 100 | data = fetch_SimLex999() 101 | assert data.X.shape == (999, 2) 102 | 103 | for lang in ["EN", "RU", "IT", "DE"]: 104 | data = fetch_multilingual_SimLex999(which=lang) 105 | assert data.y.shape[0] == data.sd.shape[0] 106 | assert data.X.shape[0] == 999 107 | 108 | 109 | def test_analogy_fetchers(): 110 | data = fetch_msr_analogy() 111 | assert len(set(data.category)) == 16 112 | 113 | data = fetch_google_analogy() 114 | assert len(set(data.category)) == 14 115 | assert len(set(data.category_high_level)) == 2 116 | 117 | data = fetch_semeval_2012_2() 118 | assert len(data.X) == len(data.y) == 79 119 | for k, val in iteritems(data.X_prot): 120 | assert len(val.shape) == 2, "Failed parsing prototypes for " + k 121 | 122 | data = fetch_wordrep(subsample=0.7) 123 | assert len(set(data.category)) == 25 124 | assert len(data.X[0]) == 2 125 | assert "all-capital-cities" in set(data.category) 126 | assert len(set(data.category_high_level)) == 2 -------------------------------------------------------------------------------- /web/tests/test_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for similarity solvers 5 | """ 6 | from web.datasets.utils import _fetch_file 7 | from web.embedding import Embedding 8 | from web.datasets.similarity import fetch_SimLex999 9 | from web.evaluate import evaluate_similarity 10 | 11 | def test_similarity(): 12 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 13 | file_name = _fetch_file(url, "test") 14 | w = Embedding.from_word2vec(file_name, binary=True) 15 | data = fetch_SimLex999() 16 | 17 | result_1 = evaluate_similarity(w, data.X, data.y) 18 | result_2 = evaluate_similarity(dict(zip(w.vocabulary.words, w.vectors)), data.X, data.y) 19 | 20 | assert result_2 > 0 21 | assert result_1 == result_2, "evaluate_similarity should return same result for dict and Embedding instance" 22 | 23 | def test_similarity_norm(): 24 | url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1" 25 | file_name = _fetch_file(url, "test") 26 | w = Embedding.from_word2vec(file_name, binary=True) 27 | w_norm = w.normalize_words() 28 | data = 
fetch_SimLex999() 29 | 30 | result_1 = evaluate_similarity(w, data.X, data.y) 31 | result_2 = evaluate_similarity(w_norm, data.X, data.y) 32 | 33 | assert result_2 > 0 34 | assert result_1 == result_2, "evaluate_similarity should return same result for normalized and unnormalized words" -------------------------------------------------------------------------------- /web/tests/test_transform_words.py: -------------------------------------------------------------------------------- 1 | from web.embedding import Embedding 2 | from web.vocabulary import * 3 | 4 | import numpy as np 5 | import logging 6 | import sys 7 | 8 | 9 | # COUNTEDVOCABULARY 10 | 11 | def test_noinplace_transform_word_CountedVocabulary(): 12 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 13 | 14 | cw = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)]) 15 | 16 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 17 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 18 | 19 | assert len(pe.vocabulary) == 2 20 | assert len(pe.vectors) == 2 21 | 22 | # 'dog' 23 | assert [0, 0, 11] in pe.vectors.tolist() 24 | # 'cat' 25 | assert [0, 11, 12] in pe.vectors.tolist() 26 | 27 | assert 'cat' in pe.vocabulary.words 28 | assert 'dog' in pe.vocabulary.words 29 | 30 | l = pe.vocabulary.getstate() 31 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 32 | 33 | # dog 34 | assert pe.vocabulary.words[0] == 'dog' 35 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 36 | assert d['dog'] == 60 37 | 38 | # cat 39 | assert pe.vocabulary.words[1] == 'cat' 40 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 41 | assert d['cat'] == 50 42 | 43 | assert type(pe.vocabulary) == CountedVocabulary 44 | 45 | 46 | def test_noinplace_transform_word_prefer_occurences_CountedVocabulary(): 47 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 48 | 49 | cw = CountedVocabulary(word_count=[(' cat ', 5), ('pikatchu ', 10), ('cat', 50), ('dog', 60), ('pikatchu', 200)]) 50 | 51 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 52 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 53 | 54 | assert len(pe.vocabulary) == 3 55 | assert len(pe.vectors) == 3 56 | 57 | l = pe.vocabulary.getstate() 58 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 59 | 60 | # 'dog' 61 | assert [0, 1, 11] in pe.vectors.tolist() 62 | # 'cat' 63 | assert [0, 11, 12] in pe.vectors.tolist() 64 | # pikatchu 65 | assert [0, 0, 1] in pe.vectors.tolist() 66 | 67 | assert 'cat' in pe.vocabulary.words 68 | assert 'dog' in pe.vocabulary.words 69 | assert 'pikatchu' in pe.vocabulary.words 70 | 71 | # pikatchu 72 | assert pe.vocabulary.words[0] == 'pikatchu' 73 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 74 | assert d['pikatchu'] == 200 75 | # dog 76 | assert pe.vocabulary.words[1] == 'dog' 77 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 78 | assert d['dog'] == 60 79 | 80 | # cat 81 | assert pe.vocabulary.words[2] == 'cat' 82 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 83 | assert d['cat'] == 50 84 | 85 | assert type(pe.vocabulary) == CountedVocabulary 86 | 87 | 88 | def test_noinplace_transform_word_prefer_shortestword_CountedVocabulary(): 89 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 90 | 91 | cw = CountedVocabulary( 92 | word_count=[('dog', 60), ('cat', 50), (' pikatchu ', 10), ('pikatchu', 10), (' cat ', 5)]) 93 | 94 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 
1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 95 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 96 | 97 | assert len(pe.vocabulary) == 3 98 | assert len(pe.vectors) == 3 99 | 100 | # 'dog' 101 | assert [0, 0, 1] in pe.vectors.tolist() 102 | # 'cat' 103 | assert [0, 1, 11] in pe.vectors.tolist() 104 | # pikatchu 105 | assert [0, 12, 13] in pe.vectors.tolist() 106 | 107 | assert 'cat' in pe.vocabulary.words 108 | assert 'dog' in pe.vocabulary.words 109 | assert 'pikatchu' in pe.vocabulary.words 110 | 111 | l = pe.vocabulary.getstate() 112 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 113 | 114 | # pikatchu 115 | assert pe.vocabulary.words[2] == 'pikatchu' 116 | assert np.array_equal(pe.vectors[2], [0, 12, 13]) 117 | assert d['pikatchu'] == 10 118 | 119 | # dog 120 | assert pe.vocabulary.words[0] == 'dog' 121 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 122 | assert d['dog'] == 60 123 | 124 | # cat 125 | assert pe.vocabulary.words[1] == 'cat' 126 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 127 | assert d['cat'] == 50 128 | 129 | assert type(pe.vocabulary) == CountedVocabulary 130 | 131 | 132 | # ORDERDVOCABULARY 133 | 134 | def test_noinplace_transform_word_OrderedVocabulary(): 135 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 136 | 137 | cw = OrderedVocabulary(words=['dog', 'cat', ' cat']) 138 | 139 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 140 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 141 | 142 | assert len(pe.vocabulary) == 2 143 | assert len(pe.vectors) == 2 144 | 145 | # 'dog' 146 | assert [0, 0, 11] in pe.vectors.tolist() 147 | # 'cat' 148 | assert [0, 11, 12] in pe.vectors.tolist() 149 | 150 | assert 'cat' in pe.vocabulary.words 151 | assert 'dog' in pe.vocabulary.words 152 | 153 | # dog 154 | assert pe.vocabulary.words[0] == 'dog' 155 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 156 | 157 | # cat 158 | assert pe.vocabulary.words[1] == 'cat' 159 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 160 | 161 | assert type(pe.vocabulary) == OrderedVocabulary 162 | 163 | 164 | def test_noinplace_transform_word_prefer_occurences_OrderedVocabulary(): 165 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 166 | 167 | cw = OrderedVocabulary(words=['pikatchu', 'dog', 'cat', 'pikatchu ', ' cat ']) 168 | 169 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 170 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 171 | 172 | assert len(pe.vocabulary) == 3 173 | assert len(pe.vectors) == 3 174 | 175 | # 'dog' 176 | assert [0, 1, 11] in pe.vectors.tolist() 177 | # 'cat' 178 | assert [0, 11, 12] in pe.vectors.tolist() 179 | # pikatchu 180 | assert [0, 0, 1] in pe.vectors.tolist() 181 | 182 | assert 'cat' in pe.vocabulary.words 183 | assert 'dog' in pe.vocabulary.words 184 | assert 'pikatchu' in pe.vocabulary.words 185 | 186 | # pikatchu 187 | assert pe.vocabulary.words[0] == 'pikatchu' 188 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 189 | # dog 190 | assert pe.vocabulary.words[1] == 'dog' 191 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 192 | 193 | # cat 194 | assert pe.vocabulary.words[2] == 'cat' 195 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 196 | 197 | assert type(pe.vocabulary) == OrderedVocabulary 198 | 199 | 200 | def test_noinplace_transform_word_prefer_shortestword_OrderedVocabulary(): 201 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 
202 | 203 | cw = OrderedVocabulary(words=['dog', 'cat', ' pikatchu ', 'pikatchu', ' cat ']) 204 | 205 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 206 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 207 | 208 | assert len(pe.vocabulary) == 3 209 | assert len(pe.vectors) == 3 210 | 211 | # 'dog' 212 | assert [0, 0, 1] in pe.vectors.tolist() 213 | # 'cat' 214 | assert [0, 1, 11] in pe.vectors.tolist() 215 | # pikatchu 216 | assert [0, 11, 12] in pe.vectors.tolist() 217 | 218 | assert 'cat' in pe.vocabulary.words 219 | assert 'dog' in pe.vocabulary.words 220 | assert 'pikatchu' in pe.vocabulary.words 221 | 222 | # pikatchu 223 | assert pe.vocabulary.words[2] == 'pikatchu' 224 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 225 | 226 | # dog 227 | assert pe.vocabulary.words[0] == 'dog' 228 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 229 | 230 | # cat 231 | assert pe.vocabulary.words[1] == 'cat' 232 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 233 | 234 | assert type(pe.vocabulary) == OrderedVocabulary 235 | 236 | 237 | # VOCABULARY 238 | 239 | def test_noinplace_transform_word_Vocabulary(): 240 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 241 | 242 | cw = Vocabulary(words=['dog', 'cat', ' cat ']) 243 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 244 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 245 | 246 | assert len(pe.vocabulary) == 2 247 | assert len(pe.vectors) == 2 248 | 249 | # 'dog' 250 | assert [0, 0, 11] in pe.vectors.tolist() 251 | # 'cat' 252 | assert [0, 11, 12] in pe.vectors.tolist() 253 | 254 | assert 'cat' in pe.vocabulary.words 255 | assert 'dog' in pe.vocabulary.words 256 | 257 | # dog 258 | assert pe.vocabulary.words[0] == 'dog' 259 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 260 | 261 | # cat 262 | assert pe.vocabulary.words[1] == 'cat' 263 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 264 | 265 | assert type(pe.vocabulary) == Vocabulary 266 | 267 | 268 | def test_noinplace_transform_word_prefer_shortest_ord1_Vocabulary(): 269 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 270 | 271 | cw = Vocabulary(words=['pikatchu ', 'dog', 'cat', 'pikatchu', ' cat ']) 272 | 273 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 12, 13], [0, 1, 11], [0, 11, 12], [0, 0, 1], [0, 13, 14]])) 274 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 275 | 276 | assert len(pe.vocabulary) == 3 277 | assert len(pe.vectors) == 3 278 | 279 | # 'dog' 280 | assert [0, 1, 11] in pe.vectors.tolist() 281 | # 'cat' 282 | assert [0, 11, 12] in pe.vectors.tolist() 283 | # pikatchu 284 | assert [0, 0, 1] in pe.vectors.tolist() 285 | 286 | assert 'cat' in pe.vocabulary.words 287 | assert 'dog' in pe.vocabulary.words 288 | assert 'pikatchu' in pe.vocabulary.words 289 | 290 | # pikatchu 291 | assert pe.vocabulary.words[2] == 'pikatchu' 292 | assert np.array_equal(pe.vectors[2], [0, 0, 1]) 293 | 294 | # dog 295 | assert pe.vocabulary.words[0] == 'dog' 296 | assert np.array_equal(pe.vectors[0], [0, 1, 11]) 297 | 298 | # cat 299 | assert pe.vocabulary.words[1] == 'cat' 300 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 301 | 302 | assert type(pe.vocabulary) == Vocabulary 303 | 304 | 305 | def test_noinplace_transform_word_prefer_shortestword2_Vocabulary(): 306 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 307 | 308 | cw = Vocabulary(words=['dog', 'cat', ' pikatchu ', 
'pikatchu', ' cat ']) 309 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 310 | pe = e.transform_words(lambda x: x.strip(), inplace=False) 311 | 312 | assert len(pe.vocabulary) == 3 313 | assert len(pe.vectors) == 3 314 | 315 | # 'dog' 316 | assert [0, 0, 1] in pe.vectors.tolist() 317 | # 'cat' 318 | assert [0, 1, 11] in pe.vectors.tolist() 319 | # pikatchu 320 | assert [0, 12, 13] in pe.vectors.tolist() 321 | 322 | assert 'cat' in pe.vocabulary.words 323 | assert 'dog' in pe.vocabulary.words 324 | assert 'pikatchu' in pe.vocabulary.words 325 | 326 | # pikatchu 327 | assert pe.vocabulary.words[2] == 'pikatchu' 328 | assert np.array_equal(pe.vectors[2], [0, 12, 13]) 329 | 330 | # dog 331 | assert pe.vocabulary.words[0] == 'dog' 332 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 333 | 334 | # cat 335 | assert pe.vocabulary.words[1] == 'cat' 336 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 337 | 338 | assert type(pe.vocabulary) == Vocabulary 339 | 340 | ####################### inplace= True ####################### 341 | 342 | # COUNTEDVOCABULARY 343 | 344 | def test_inplace_transform_word_CountedVocabulary(): 345 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 346 | 347 | cw = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)]) 348 | 349 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 350 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 351 | 352 | assert pe is e and pe == e 353 | 354 | assert len(pe.vocabulary) == 2 355 | assert len(pe.vectors) == 2 356 | 357 | # 'dog' 358 | assert [0, 0, 11] in pe.vectors.tolist() 359 | # 'cat' 360 | assert [0, 11, 12] in pe.vectors.tolist() 361 | 362 | assert 'cat' in pe.vocabulary.words 363 | assert 'dog' in pe.vocabulary.words 364 | 365 | l = pe.vocabulary.getstate() 366 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 367 | 368 | # dog 369 | assert pe.vocabulary.words[0] == 'dog' 370 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 371 | assert d['dog'] == 60 372 | 373 | # cat 374 | assert pe.vocabulary.words[1] == 'cat' 375 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 376 | assert d['cat'] == 50 377 | 378 | assert type(pe.vocabulary) == CountedVocabulary 379 | 380 | 381 | def test_inplace_transform_word_prefer_occurences_CountedVocabulary(): 382 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 383 | 384 | cw = CountedVocabulary(word_count=[(' cat ', 5), ('pikatchu ', 10), ('cat', 50), ('dog', 60), ('pikatchu', 200)]) 385 | 386 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 387 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 388 | 389 | assert pe is e and pe == e 390 | 391 | assert len(pe.vocabulary) == 3 392 | assert len(pe.vectors) == 3 393 | 394 | l = pe.vocabulary.getstate() 395 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 396 | 397 | # 'dog' 398 | assert [0, 1, 11] in pe.vectors.tolist() 399 | # 'cat' 400 | assert [0, 11, 12] in pe.vectors.tolist() 401 | # pikatchu 402 | assert [0, 0, 1] in pe.vectors.tolist() 403 | 404 | assert 'cat' in pe.vocabulary.words 405 | assert 'dog' in pe.vocabulary.words 406 | assert 'pikatchu' in pe.vocabulary.words 407 | 408 | # pikatchu 409 | assert pe.vocabulary.words[0] == 'pikatchu' 410 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 411 | assert d['pikatchu'] == 200 412 | # dog 413 | assert pe.vocabulary.words[1] == 'dog' 414 | assert 
np.array_equal(pe.vectors[1], [0, 1, 11]) 415 | assert d['dog'] == 60 416 | 417 | # cat 418 | assert pe.vocabulary.words[2] == 'cat' 419 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 420 | assert d['cat'] == 50 421 | 422 | assert type(pe.vocabulary) == CountedVocabulary 423 | 424 | 425 | def test_inplace_transform_word_prefer_shortestword_CountedVocabulary(): 426 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 427 | 428 | cw = CountedVocabulary( 429 | word_count=[('dog', 60), ('cat', 50), (' pikatchu ', 10), ('pikatchu', 10), (' cat ', 5)]) 430 | 431 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 432 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 433 | 434 | assert pe is e and pe == e 435 | 436 | assert len(pe.vocabulary) == 3 437 | assert len(pe.vectors) == 3 438 | 439 | # 'dog' 440 | assert [0, 0, 1] in pe.vectors.tolist() 441 | # 'cat' 442 | assert [0, 1, 11] in pe.vectors.tolist() 443 | # pikatchu 444 | assert [0, 12, 13] in pe.vectors.tolist() 445 | 446 | assert 'cat' in pe.vocabulary.words 447 | assert 'dog' in pe.vocabulary.words 448 | assert 'pikatchu' in pe.vocabulary.words 449 | 450 | l = pe.vocabulary.getstate() 451 | d = {l[0][i]: l[1][i] for i in range(len(l[0]))} 452 | 453 | # pikatchu 454 | assert pe.vocabulary.words[2] == 'pikatchu' 455 | assert np.array_equal(pe.vectors[2], [0, 12, 13]) 456 | assert d['pikatchu'] == 10 457 | 458 | # dog 459 | assert pe.vocabulary.words[0] == 'dog' 460 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 461 | assert d['dog'] == 60 462 | 463 | # cat 464 | assert pe.vocabulary.words[1] == 'cat' 465 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 466 | assert d['cat'] == 50 467 | 468 | assert type(pe.vocabulary) == CountedVocabulary 469 | 470 | # ORDERDVOCABULARY 471 | 472 | def test_inplace_transform_word_OrderedVocabulary(): 473 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 474 | 475 | cw = OrderedVocabulary(words=['dog', 'cat', ' cat']) 476 | 477 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])) 478 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 479 | 480 | assert pe is e and pe == e 481 | 482 | assert len(pe.vocabulary) == 2 483 | assert len(pe.vectors) == 2 484 | 485 | # 'dog' 486 | assert [0, 0, 11] in pe.vectors.tolist() 487 | # 'cat' 488 | assert [0, 11, 12] in pe.vectors.tolist() 489 | 490 | assert 'cat' in pe.vocabulary.words 491 | assert 'dog' in pe.vocabulary.words 492 | 493 | # dog 494 | assert pe.vocabulary.words[0] == 'dog' 495 | assert np.array_equal(pe.vectors[0], [0, 0, 11]) 496 | 497 | # cat 498 | assert pe.vocabulary.words[1] == 'cat' 499 | assert np.array_equal(pe.vectors[1], [0, 11, 12]) 500 | 501 | assert type(pe.vocabulary) == OrderedVocabulary 502 | 503 | 504 | def test_inplace_transform_word_prefer_occurences_OrderedVocabulary(): 505 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 506 | 507 | cw = OrderedVocabulary(words=['pikatchu', 'dog', 'cat', 'pikatchu ', ' cat ']) 508 | 509 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 510 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 511 | 512 | assert pe is e and pe == e 513 | 514 | assert len(pe.vocabulary) == 3 515 | assert len(pe.vectors) == 3 516 | 517 | # 'dog' 518 | assert [0, 1, 11] in pe.vectors.tolist() 519 | # 'cat' 520 | assert [0, 11, 12] in pe.vectors.tolist() 521 | # pikatchu 522 | assert 
[0, 0, 1] in pe.vectors.tolist() 523 | 524 | assert 'cat' in pe.vocabulary.words 525 | assert 'dog' in pe.vocabulary.words 526 | assert 'pikatchu' in pe.vocabulary.words 527 | 528 | # pikatchu 529 | assert pe.vocabulary.words[0] == 'pikatchu' 530 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 531 | # dog 532 | assert pe.vocabulary.words[1] == 'dog' 533 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 534 | 535 | # cat 536 | assert pe.vocabulary.words[2] == 'cat' 537 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 538 | 539 | assert type(pe.vocabulary) == OrderedVocabulary 540 | 541 | 542 | def test_inplace_transform_word_prefer_shortestword_OrderedVocabulary(): 543 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 544 | 545 | cw = OrderedVocabulary(words=['dog', 'cat', ' pikatchu ', 'pikatchu', ' cat ']) 546 | 547 | e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])) 548 | pe = e.transform_words(lambda x: x.strip(), inplace=True) 549 | 550 | assert pe is e and pe == e 551 | 552 | assert len(pe.vocabulary) == 3 553 | assert len(pe.vectors) == 3 554 | 555 | # 'dog' 556 | assert [0, 0, 1] in pe.vectors.tolist() 557 | # 'cat' 558 | assert [0, 1, 11] in pe.vectors.tolist() 559 | # pikatchu 560 | assert [0, 11, 12] in pe.vectors.tolist() 561 | 562 | assert 'cat' in pe.vocabulary.words 563 | assert 'dog' in pe.vocabulary.words 564 | assert 'pikatchu' in pe.vocabulary.words 565 | 566 | # pikatchu 567 | assert pe.vocabulary.words[2] == 'pikatchu' 568 | assert np.array_equal(pe.vectors[2], [0, 11, 12]) 569 | 570 | # dog 571 | assert pe.vocabulary.words[0] == 'dog' 572 | assert np.array_equal(pe.vectors[0], [0, 0, 1]) 573 | 574 | # cat 575 | assert pe.vocabulary.words[1] == 'cat' 576 | assert np.array_equal(pe.vectors[1], [0, 1, 11]) 577 | 578 | assert type(pe.vocabulary) == OrderedVocabulary 579 | 580 | 581 | -------------------------------------------------------------------------------- /web/tests/test_vocabulary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Tests for vocabulary 5 | """ 6 | -------------------------------------------------------------------------------- /web/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Utilities for package""" 5 | 6 | import bz2 7 | import gzip 8 | from os import path 9 | import tarfile 10 | import io 11 | from itertools import islice, chain 12 | from six import string_types, text_type 13 | 14 | 15 | def any2utf8(text, errors='strict', encoding='utf8'): 16 | """Convert a string (unicode or a bytestring in `encoding`) to a utf8-encoded bytestring.""" 17 | if isinstance(text, text_type): 18 | return text.encode('utf8') 19 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 20 | return text_type(text, encoding, errors=errors).encode('utf8') 21 | 22 | 23 | to_utf8 = any2utf8 24 | 25 | # Works just as well with unicode characters 26 | _delchars = [chr(c) for c in range(256)] 27 | _delchars = [x for x in _delchars if not x.isalnum()] 28 | _delchars.remove('\t') 29 | _delchars.remove(' ') 30 | _delchars.remove('-') 31 | _delchars.remove('_') # word2vec, for instance, joins phrases with this character 32 | _delchars = ''.join(_delchars) 33 | _delchars_table = dict((ord(char), None) for char in _delchars) 34 | 35 | 36 | def standardize_string(s, clean_words=True, lower=True, 
language="english"): 37 | """ 38 | Ensures a common string convention across the code base. Converts to unicode and optionally lowercases the string and removes non-alphanumeric characters 39 | 40 | Parameters 41 | ---------- 42 | language: only "english" is currently supported. For "english", non-alphanumeric characters are removed 43 | 44 | lower: if True, the string is lowercased 45 | 46 | clean_words: if True, non-alphanumeric characters (for instance '$', '#' or 'ł') are removed 47 | 48 | Returns 49 | ------- 50 | string: processed string 51 | """ 52 | 53 | assert isinstance(s, string_types) 54 | 55 | if not isinstance(s, text_type): 56 | s = text_type(s, "utf-8") 57 | 58 | if language == "english": 59 | s = (s.lower() if lower else s) 60 | s = (s.translate(_delchars_table) if clean_words else s) 61 | return s 62 | else: 63 | raise NotImplementedError("Standardization is not implemented for languages other than English") 64 | 65 | # Yield `iterable` in chunks of at most `size` items; each chunk is itself an iterator. 66 | def batched(iterable, size): 67 | sourceiter = iter(iterable) 68 | while True: 69 | batchiter = islice(sourceiter, size) 70 | try: 71 | yield chain([next(batchiter)], batchiter) 72 | except StopIteration: 73 | return 74 | 75 | 76 | def _open(file_, mode='r'): 77 | """Open a file object given a filename, an open file or even an archive.""" 78 | if isinstance(file_, string_types): 79 | _, ext = path.splitext(file_) 80 | if ext in {'.gz'}: 81 | if mode == "r" or mode == "rb": 82 | # gzip is extremely slow 83 | return io.BufferedReader(gzip.GzipFile(file_, mode=mode)) 84 | else: 85 | return gzip.GzipFile(file_, mode=mode) 86 | if ext in {'.bz2'}: 87 | return bz2.BZ2File(file_, mode=mode) 88 | else: 89 | return io.open(file_, mode, **({"encoding": "utf-8"} if "b" not in mode else {})) 90 | return file_ 91 | -------------------------------------------------------------------------------- /web/version.py: -------------------------------------------------------------------------------- 1 | # License: MIT 2 | """ 3 | Single place for version information 4 | """ 5 | 6 | __version__ = "0.0.1" 7 | VERSION = tuple(int(x) for x in __version__.split(".")) 8 | -------------------------------------------------------------------------------- /web/vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ Vocabulary classes supporting word embeddings. 5 | 6 | NOTE: This file was adapted from the polyglot package 7 | """ 8 | 9 | from io import open, StringIO 10 | from collections import Counter, OrderedDict 11 | import os 12 | from concurrent.futures import ProcessPoolExecutor 13 | 14 | import six 15 | from six.moves import zip 16 | from six import iteritems 17 | from six import text_type as unicode 18 | from six import string_types 19 | from .utils import _open 20 | 21 | 22 | def count(lines): 23 | """ Counts the word frequencies in a list of sentences. 24 | 25 | Note: 26 | This is a helper function for parallel execution of `Vocabulary.from_text` 27 | method. 28 | """ 29 | words = [w for l in lines for w in l.strip().split()] 30 | return Counter(words) 31 | 32 | 33 | class Vocabulary(object): 34 | """ A set of words/tokens that have consistent IDs. 35 | 36 | Attributes: 37 | word_id (dictionary): Mapping from words to IDs. 38 | id_word (dictionary): A reverse map of `word_id`. 39 | """ 40 | 41 | def __init__(self, words=None): 42 | """ Build attributes word_id and id_word from input. 43 | 44 | Args: 45 | words (list/set): list or set of words. 
46 | """ 47 | words = self.sanitize_words(words) 48 | self.word_id = {w: i for i, w in enumerate(words)} 49 | self.id_word = {i: w for w, i in iteritems(self.word_id)} 50 | 51 | def __iter__(self): 52 | """Iterate over the words in a vocabulary.""" 53 | for w, i in sorted(iteritems(self.word_id), key=lambda wc: wc[1]): 54 | yield w 55 | 56 | @property 57 | def words(self): 58 | """ Ordered list of words according to their IDs.""" 59 | return list(self) 60 | 61 | def __unicode__(self): 62 | return u"\n".join(self.words) 63 | 64 | def __str__(self): 65 | if six.PY3: 66 | return self.__unicode__() 67 | return self.__unicode__().encode("utf-8") 68 | 69 | def __getitem__(self, key): 70 | if isinstance(key, string_types) and not isinstance(key, unicode): 71 | key = unicode(key, encoding="utf-8") 72 | return self.word_id[key] 73 | 74 | def add(self, word): 75 | if isinstance(word, string_types) and not isinstance(word, unicode): 76 | word = unicode(word, encoding="utf-8") 77 | 78 | if word in self.word_id: 79 | raise RuntimeError("Word already exists in the vocabulary") 80 | 81 | idx = len(self.word_id) 82 | self.word_id[word] = idx 83 | self.id_word[idx] = word 84 | 85 | def __contains__(self, key): 86 | return key in self.word_id 87 | 88 | def __delitem__(self, key): 89 | """Delete a word from vocabulary. 90 | 91 | Note: 92 | To maintain consecutive IDs, this operation is implemented 93 | with a complexity of \\theta(n). 94 | """ 95 | del self.word_id[key] 96 | self.id_word = dict(enumerate(self.words)) 97 | self.word_id = {w: i for i, w in iteritems(self.id_word)} 98 | 99 | def __len__(self): 100 | return len(self.word_id) 101 | 102 | def sanitize_words(self, words): 103 | """Guarantees that all textual symbols are unicode. 104 | Note: 105 | We do not convert numbers, only strings, to unicode. 106 | We assume that the strings are encoded in utf-8. 107 | """ 108 | _words = [] 109 | for w in words: 110 | if isinstance(w, string_types) and not isinstance(w, unicode): 111 | _words.append(unicode(w, encoding="utf-8")) 112 | else: 113 | _words.append(w) 114 | return _words 115 | 116 | def get(self, k, default=None): 117 | try: 118 | return self[k] 119 | except KeyError: 120 | return default 121 | 122 | def getstate(self): 123 | return list(self.words) 124 | 125 | @classmethod 126 | def from_vocabfile(cls, filename): 127 | """ Construct a Vocabulary out of a vocabulary file. 128 | 129 | Note: 130 | File has the following format (one word per line): word1 131 | word2 132 | """ 133 | words = [x.strip() for x in _open(filename, 'r').read().splitlines()] 134 | return cls(words=words) 135 | 136 | 137 | class OrderedVocabulary(Vocabulary): 138 | """ An ordered list of words/tokens according to their frequency. 139 | 140 | Note: 141 | The word order is assumed to follow word frequency: 142 | the most frequent words appear first in the list. 143 | 144 | Attributes: 145 | word_id (dictionary): Mapping from words to IDs. 146 | id_word (dictionary): A reverse map of `word_id`. 147 | """ 148 | 149 | def __init__(self, words=None): 150 | """ Build attributes word_id and id_word from input. 151 | 152 | Args: 153 | words (list): list of words sorted by frequency (most frequent first). 154 | """ 155 | 156 | words = self.sanitize_words(words) 157 | self.word_id = {w: i for i, w in enumerate(words)} 158 | self.id_word = {i: w for w, i in iteritems(self.word_id)} 159 | 160 | def most_frequent(self, k): 161 | """ Returns a vocabulary with the most frequent `k` words. 
162 | 163 | Args: 164 | k (integer): specifies the top k most frequent words to be returned. 165 | """ 166 | return OrderedVocabulary(words=self.words[:k]) 167 | 168 | 169 | class CountedVocabulary(OrderedVocabulary): 170 | """ List of words and counts sorted according to word count. 171 | """ 172 | 173 | def __init__(self, word_count=None): 174 | """ Build attributes word_id and id_word from input. 175 | 176 | Args: 177 | word_count (dictionary): A dictionary of the type word:count or 178 | list of tuples of the type (word, count). 179 | """ 180 | 181 | if isinstance(word_count, dict): 182 | word_count = iteritems(word_count) 183 | sorted_counts = list(sorted(word_count, key=lambda wc: wc[1], reverse=True)) 184 | words = [w for w, c in sorted_counts] 185 | super(CountedVocabulary, self).__init__(words=words) 186 | self.word_count = OrderedDict(sorted_counts) 187 | 188 | def most_frequent(self, k): 189 | """ Returns a vocabulary with the most frequent `k` words. 190 | 191 | Args: 192 | k (integer): specifies the top k most frequent words to be returned. 193 | """ 194 | word_count = [(w, self.word_count[w]) for w in self.words[:k]] 195 | return CountedVocabulary(word_count=word_count) 196 | 197 | def min_count(self, n=1): 198 | """ Returns a vocabulary after eliminating the words that appear < `n`. 199 | 200 | Args: 201 | n (integer): specifies the minimum word frequency allowed. 202 | """ 203 | word_count = [(w, c) for w, c in iteritems(self.word_count) if c >= n] 204 | return CountedVocabulary(word_count=word_count) 205 | 206 | def __unicode__(self): 207 | return u"\n".join([u"{}\t{}".format(w, self.word_count[w]) for w in self.words]) 208 | 209 | def __delitem__(self, key): 210 | super(CountedVocabulary, self).__delitem__(key) 211 | self.word_count = OrderedDict([(w, self.word_count[w]) for w in self]) 212 | 213 | def getstate(self): 214 | words = list(self.words) 215 | counts = [self.word_count[w] for w in words] 216 | return (words, counts) 217 | 218 | @staticmethod 219 | def from_vocabfile(filename): 220 | """ Construct a CountedVocabulary out of a vocabulary file. 221 | 222 | Note: 223 | File has the following format word1 count1 224 | word2 count2 225 | """ 226 | word_count = [x.strip().split() for x in _open(filename, 'r').read().splitlines()] 227 | word_count = OrderedDict([(w, int(c)) for w, c in word_count]) 228 | return CountedVocabulary(word_count=word_count) 229 | 230 | 231 | --------------------------------------------------------------------------------
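The vocabulary classes above are consumed by the ``Embedding`` class exercised in ``web/tests/test_transform_words.py``. The following is a minimal usage sketch, mirroring the behaviour asserted in those tests; it assumes the package is importable as ``web`` (i.e. ``web.vocabulary`` and ``web.embedding``)::

    import numpy as np

    from web.embedding import Embedding
    from web.vocabulary import CountedVocabulary

    # Counts may contain near-duplicate surface forms, e.g. an untrimmed ' cat '.
    vocab = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)])

    # One vector per entry; CountedVocabulary orders words by decreasing count,
    # so the rows below correspond to 'dog', 'cat' and ' cat ' respectively.
    vectors = np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])
    emb = Embedding(vocabulary=vocab, vectors=vectors)

    # Normalize surface forms; when two words collide after stripping, the entry
    # with the higher count is kept (see the "prefer_occurences" tests above).
    clean = emb.transform_words(lambda w: w.strip(), inplace=False)

    print(clean.vocabulary.words)                   # ['dog', 'cat']
    print(clean.vocabulary.getstate())              # (['dog', 'cat'], [60, 50])
    print(clean.vocabulary.most_frequent(1).words)  # ['dog']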