├── .gitignore ├── README.md ├── images ├── english_sbs.png ├── etnlp_view_embs.png ├── etnlp_view_multi_embeddings.png └── visualisation_hanoi_2.png └── src ├── codes ├── 00.run_etnlp_preprocessing.sh ├── 01.run_etnlp_evaluator.sh ├── 02.run_etnlp_extractor.sh ├── 03.run_etnlp_visualizer_inter.sh ├── 04.run_etnlp_visualizer_sbs.sh ├── api │ ├── __init__.py │ ├── embedding_evaluator.py │ ├── embedding_extractor.py │ ├── embedding_preprocessing.py │ └── embedding_visualizer.py ├── embeddings │ ├── __init__.py │ ├── embedding_configs.py │ ├── embedding_models.py │ └── embedding_utils.py ├── etnlp_api.py ├── requirements.txt ├── setup.py ├── utils │ ├── __init__.py │ ├── emb_utils.py │ ├── embedding_io.py │ ├── eval_utils.py │ ├── file_utils.py │ ├── string_utils.py │ ├── vectors.py │ └── word.py └── visualizer │ ├── README.md │ ├── __init__.py │ ├── data │ └── vnex.model.bin │ ├── images │ ├── w2v_vn.png │ └── w2v_vn_2.png │ ├── outof_w2vec.dict │ ├── static │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ └── style.css │ ├── templates │ ├── app.html │ └── search.html │ └── visualizer_sbs.py ├── data ├── .DS_Store ├── embedding_analogies │ ├── english │ │ └── english-word-analogy.txt │ ├── portuguese │ │ ├── LX-4WAnalogies-ETNLP.txt │ │ ├── LX-4WAnalogies.txt │ │ ├── POST_TAG_vocabulary.txt │ │ ├── evaluator_results.txt │ │ └── vocab.txt │ └── vi │ │ ├── Multi_evaluator_results.txt │ │ ├── analogy_list_vi_ner.txt │ │ ├── elmo_results_out_dict.txt │ │ └── solveable_analogies_vi.txt ├── embedding_dicts │ ├── C2V.vec │ ├── ELMO_23.vec │ ├── FastText_23.vec │ ├── MULTI_23.vec │ ├── W2V_C2V_23.vec │ ├── baomoi_c2v_dims_300.vec │ └── vn_elmo_medium_c2v.vec ├── glove2vec_dicts │ ├── glove1.vec │ ├── glove1_w2v.vec │ ├── glove2.vec │ └── glove2_w2v.vec └── vocab.txt └── examples ├── test1_etnlp_preprocessing.py ├── test2_etnlp_extractor.py ├── test3_etnlp_evaluator.py └── test4_etnlp_visualizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | dist/ 12 | develop-eggs/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ETNLP: A Toolkit for Extraction, Evaluation and Visualization of Pre-trained Word Embeddings 2 | ===== 3 | 4 | # Table of contents 5 | 1. [Overview](#introduction) 6 | 2. [More about ETNLP](#moreaboutETNLP) 7 | 3. [Installation and How to Use](#installation_and_howtouse) 8 | 4. [Available Lexical Resources](#Download_Resources) 9 | 10 | 11 | # I. Overview 12 | ## A glimpse of ETNLP: 13 | - Github: https://github.com/vietnlp/etnlp 14 | - Video: https://vimeo.com/317599106 15 | - Paper: https://arxiv.org/abs/1903.04433 16 | 17 | 18 | # II. How do I cite ETNLP? 19 | Please cite the following Arxiv paper whenever ETNLP (or its pre-trained embeddings) is used to produce published results or is incorporated into other software: 20 | 21 | ``` 22 | @inproceedings{vu:2019n, 23 | title={ETNLP: A Visual-Aided Systematic Approach to Select Pre-Trained Embeddings for a Downstream Task}, 24 | author={Vu, Xuan-Son and Vu, Thanh and Tran, Son N and Jiang, Lili}, 25 | booktitle={Proceedings of the International Conference Recent Advances in Natural Language Processing (RANLP)}, 26 | year={2019} 27 | } 28 | ``` 29 | 30 | 31 | # III. More about ETNLP: 32 | ## 1. Embedding Evaluator: 33 | To compare the quality of embedding models on the word analogy task. 34 | - Input: a pre-trained embedding vector file (word2vec format) and a word analogy file. 35 | - Output: (1) an evaluation of the embedding model's quality based on the MAP/P@10 score, and (2) paired t-tests showing the significance level between different word embeddings. 36 | 37 | ### 1.1. Note: The word analogy list is created by: 38 | - Adapting the English list: selecting suitable categories and translating them into the target language (i.e., Vietnamese). 39 | - Removing categories that are inappropriate in the target language (i.e., categories 6, 10, 11, 14 for Vietnamese). 40 | - Adding custom categories that suit the target language (e.g., cities and their zones in Vietnam for Vietnamese). 41 | Since most of this process is automated, it can be applied to other languages as well. 42 | 43 | ### 1.2. Selected categories for Vietnamese: 44 | > 1. capital-common-countries 45 | > 2. capital-world 46 | > 3. currency: E.g., Algeria | dinar | Angola | kwanza 47 | > 4. city-in-zone (Vietnam's cities and their zones) 48 | > 5. family (boy | girl | brother | sister) 49 | > 6. gram1-adjective-to-adverb (NOT USED) 50 | > 7. gram2-opposite (e.g., acceptable | unacceptable | aware | unaware) 51 | > 8. gram3-comparative (e.g., bad | worse | big | bigger) 52 | > 9. gram4-superlative (e.g., bad | worst | big | biggest) 53 | > 10. gram5-present-participle (NOT USED) 54 | > 11. gram6-nationality-adjective-nguoi-tieng (e.g., Albania | Albanian | Argentina | Argentinean) 55 | > 12. gram7-past-tense (NOT USED) 56 | > 13. gram8-plural-cac-nhung (e.g., banana | bananas | bird | birds) (NOT USED) 57 | > 14. gram9-plural-verbs (NOT USED) 58 | 59 | ### 1.3 Evaluation results (in detail) 60 | 61 | * Analogy: Word Analogy Task 62 | 63 | * NER (w): NER task with hyper-parameters selected from the best F1 on the validation set. 64 | 65 | * NER (w.o): NER task without selecting hyper-parameters from the validation set. 66 | 67 | |  Model | NER.w | NER.w.o | Analogy | 68 | |------------------------------ |------------- | ------------------ |------------------ | 69 | | BiLC3 + w2v | 89.01 | 89.41 | 0.4796 | 70 | | BiLC3 + Bert_Base | 88.26 | 89.91 | 0.4609 | 71 | | BiLC3 + w2v_c2v | 89.46 | 89.46 | 0.4796 | 72 | | BiLC3 + fastText | 89.65 | 89.84 | 0.4970 | 73 | | BiLC3 + Elmo | 89.67 | 90.84 | **0.4999** | 74 | | BiLC3 + MULTI_WC_F_E_B | **91.09** | **91.75** | 0.4906 | 75 | 76 |
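A minimal sketch of running the evaluator from Python rather than via `01.run_etnlp_evaluator.sh` (it mirrors the `evaluator_api` function in `src/codes/api/embedding_evaluator.py` and assumes it is run from `src/codes/` so the sample data paths resolve; swap in your own files as needed):

```python
from api.embedding_evaluator import evaluator_api

# Multiple embeddings are separated by ";" (same convention as the shell scripts).
input_files = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec"
analogy_file = "../data/embedding_analogies/vi/solveable_analogies_vi.txt"
out_file = "../data/embedding_analogies/vi/Multi_evaluator_results.txt"

# Writes the word-analogy results for every embedding to out_file and prints a summary.
evaluator_api(input_files, analogy_file, out_file)
```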
77 | ## 2. Embedding Extractor: To extract embedding vectors for other tasks. 78 | - Input: (1) a list of input embeddings, (2) a vocabulary file. 79 | - Output: embedding vectors of the given vocab file in `.txt`, i.e., each line contains the embedding of one word. The file is then compressed into .gz format. This format is widely used in existing NLP toolkits (e.g., Reimers et al. [1]). 80 | 81 | ### Extra options: 82 | - `-input-c2v`: character embedding file. 83 | - `solveoov:1`: to solve OOV words of the 1st embedding. Similarly for more than one embedding, e.g., `solveoov:1:2`. 84 | 85 | [1] Nils Reimers and Iryna Gurevych, Reporting Score Distributions Makes a Difference: Performance Study of LSTM-networks for Sequence Tagging, 2017, http://arxiv.org/abs/1707.09861, arXiv. 86 |
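A minimal sketch of the extractor API (based on the `extract_embedding_for_vocab_file` signature in `src/codes/api/embedding_extractor.py`, run from `src/codes/`; the paths below point at the bundled sample data and are placeholders):

```python
from api.embedding_extractor import extract_embedding_for_vocab_file

embeddings = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec"
c2v_file = "../data/embedding_dicts/C2V.vec"  # character embedding, used to build vectors for OOV words
vocab_file = "../data/vocab.txt"
out_file = "../data/embedding_dicts/MULTI_W_F_B_E.vec"

# Writes one "<word> <v1> <v2> ..." line per vocabulary word; the last argument
# selects which output copies (.txt / .gz / .npz) to produce.
extract_embedding_for_vocab_file(embeddings, vocab_file, c2v_file, out_file, ".txt;.gz")
```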
87 | ## 3. Visualizer: To explore the embedding space and compare different embeddings. 88 | 89 | ### Screenshot of viewing multiple embeddings side-by-side (Vietnamese): 90 | ![Alt text](images/etnlp_view_multi_embeddings.png "Screenshot multiple-embeddings side-by-side") 91 | 92 | ### Screenshot of viewing each embedding interactively (Vietnamese): 93 | ![Alt text](images/etnlp_view_embs.png "Screenshot example of viewing each embedding interactively") 94 | 95 | ### Screenshot of viewing each embedding side-by-side (English): 96 | ![Alt text](images/english_sbs.png "Screenshot example of viewing each embedding side-by-side") 97 | 98 | 99 | # IV. Installation and How to use ETNLP 100 | ## 1. Installation: 101 | 102 | From source code (Python 3.6.x): 103 | > 1. cd src/codes/ 104 | > 2. pip install -r requirements.txt 105 | > 3. python setup.py install 106 | 107 | From pip (Python 3.6.x): 108 | > 1. sudo apt-get install python3-dev 109 | > 2. pip install cython 110 | > 3. pip install git+git://github.com/vietnlp/etnlp.git 111 | 112 | OR: 113 | > 1. pip install etnlp 114 | 115 | ## 2. Examples 116 | > 1. cd src/examples 117 | > 2. python test1_etnlp_preprocessing.py 118 | > 3. python test2_etnlp_extractor.py 119 | > 4. python test3_etnlp_evaluator.py 120 | > 5. python test4_etnlp_visualizer.py 121 | ### Example of using Fasttext-Sent2Vec: 122 | - 01. Install: https://github.com/epfml/sent2vec 123 | ``` 124 | 01. git clone https://github.com/epfml/sent2vec 125 | 02. cd sent2vec; pip install . 126 | ``` 127 | 128 | - 02. Extract embeddings for sentences (sentences do not need to be tokenized before extraction): 129 | ``` 130 | import sent2vec 131 | model = sent2vec.Sent2vecModel() 132 | model.load_model('opendata_wiki_lowercase_words.bin') 133 | emb = model.embed_sentence("tôi là sinh viên đh công nghệ, đại học quôc gia hà nội") 134 | embs = model.embed_sentences(["tôi là sinh viên", "tôi là nhà thơ", "tôi là bác sĩ"]) 135 | 136 | ``` 137 | 138 | 139 | ## 3. Visualization 140 | Side-by-side visualization: 141 | > 1. sh src/codes/04.run_etnlp_visualizer_sbs.sh 142 | 143 | Interactive visualization: 144 | > 1. sh src/codes/03.run_etnlp_visualizer_inter.sh 145 | 146 | 147 | # V. Available Lexical Resources 148 | ## 1. Word Analogy List for Vietnamese 149 | 150 | |  Word Analogy List | Download Link (NER Task)| Download Link (General)| 151 | |------------------------------|---------------|---------------| 152 | | Vietnamese (This work) | [Link1](https://drive.google.com/file/d/1eA5yvla4BhAIfWsmZherT1GEW6gzDC-1/view?usp=sharing)| [Link1](https://drive.google.com/file/d/1YJ9d5rVKMMKF1xWWZi26_sNpgULTvxwg/view?usp=sharing)| 153 | | English (Mikolov et al. [2]) | [Link2]| [Link2](https://drive.google.com/file/d/10rWxGu8-nbQmYC8wrIussSZjY0lDh6RP/view?usp=sharing)| 154 | | Portuguese (Hartmann et al. [3]) | [Link3]| [Link3](https://github.com/nathanshartmann/portuguese_word_embeddings/blob/master/analogies/testset/LX-4WAnalogies.txt)| 155 | 156 | 157 | 158 | ## 2. Multiple pre-trained embedding models for Vietnamese 159 | 160 | - Training data: Vietnamese Wikipedia: 161 | 162 | |  # of sentences | # of tokenized words| 163 | |------------------------------|---------------| 164 | |  6,685,621 | 114,997,587 | 165 | 166 | 167 | - Download Pre-trained Embeddings:
168 | (Note: The MULTI_WC_F_E_B embedding is the concatenation of four embeddings: W2V_C2V, fastText, ELMO, and Bert_Base.) 169 | 170 | |  Embedding Model | Download Link (NER Task) | Download Link (AIVIVN SentiTask) | Download Link (General) | 171 | |------------------------------|---------------|---------------|---------------| 172 | | w2v | [Link1](https://drive.google.com/file/d/1LHaZ8LXxteHzod42naqJZYCwwq5mI9aL/view?usp=sharing) (dim=300)| [Link1] | [Link1] | 173 | | w2v_c2v | [Link2](https://drive.google.com/file/d/1-M9Tb9l8mNmP3RKxZiZNK1Vpbng2yw4l/view?usp=sharing) (dim=300)| [Link2] | [Link2] | 174 | | fastText | [Link3](https://drive.google.com/file/d/1dHCPhKFjtDjbrUeeymheDnlhjtaljPGE/view?usp=sharing) (dim=300)| [Link3] | [Link3] | 175 | | fastText-[Sent2Vec](https://github.com/epfml/sent2vec) | [Link3]| [Link3] | [Link3](https://drive.google.com/file/d/1BzL1mpdfqCCJioCdAlTVshbrz0lGfP2D/view?usp=sharing) (dim=300, 6GB, trained on 20GB of [news data](https://github.com/binhvq/news-corpus) and the Wiki data of ETNLP.) | 176 | | Elmo | [Link4](https://drive.google.com/file/d/1zDaSD8NsZNXGyd9iVOxTcb7CP61Ixo-r/view?usp=sharing) (dim=1024)| [Link4](https://drive.google.com/file/d/1jVJtF0f6SbtUd-t3bnywP6mFnz0QXPIx/view?usp=sharing) (dim=1024)| [Link4](https://drive.google.com/file/d/1XPsTzg1Gex-Hh2nl9344YlZc1orOVBDp/view?usp=sharing) (dim=1024, 731MB; 1.9GB after extraction)| 177 | | Bert_base | [Link5](https://drive.google.com/file/d/16fRkmIHiB16OlM8WdFmoApGtLMf6YJJ8/view?usp=sharing) (dim=768)| [Link5] | [Link5] | 178 | | MULTI_WC_F_E_B | [Link6](https://drive.google.com/file/d/1gq7b8hs31VzoeO3n3C__ftlDnE_iBZW2/view?usp=sharing) (dim=2392)| [Link6] | [Link6] | 179 | 180 | 181 | # VI. Versioning 182 | For transparency into our release cycle, and to maintain backward compatibility, ETNLP will follow the Semantic Versioning guidelines as much as possible. 183 | 184 | Releases will be numbered with the following format: 185 | 186 | `<major>.<minor>.<patch>` 187 | 188 | And constructed with the following guidelines: 189 | 190 | * Breaking backward compatibility bumps the major (and resets the minor and patch) 191 | * New additions without breaking backward compatibility bump the minor (and reset the patch) 192 | * Bug fixes and misc changes bump the patch 193 | 194 | For more information on SemVer, please visit http://semver.org/.
195 | -------------------------------------------------------------------------------- /images/english_sbs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/english_sbs.png -------------------------------------------------------------------------------- /images/etnlp_view_embs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/etnlp_view_embs.png -------------------------------------------------------------------------------- /images/etnlp_view_multi_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/etnlp_view_multi_embeddings.png -------------------------------------------------------------------------------- /images/visualisation_hanoi_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/visualisation_hanoi_2.png -------------------------------------------------------------------------------- /src/codes/00.run_etnlp_preprocessing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/glove2vec_dicts/glove1.vec;../data/glove2vec_dicts/glove2.vec" 4 | OUTPUT_FILES="../data/glove2vec_dicts/glove1_w2v.vec;../data/glove2vec_dicts/glove2_w2v.vec" 5 | # do_normalize: use this flag to normalize in case of multiple embeddings. 
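# The "glove2w2v" mode converts GloVe-format .vec files into word2vec text format.
# The practical difference between the two formats is the header: GloVe files start
# directly with the vectors, while word2vec text files begin with a "<vocab_size> <dim>"
# line, e.g. (illustrative values):
#   GloVe:    the 0.418 0.24968 ...
#   word2vec: 400000 300
#             the 0.418 0.24968 ...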
6 | python ./etnlp_api.py -input $INPUT_FILES -output $OUTPUT_FILES -args "glove2w2v" 7 | -------------------------------------------------------------------------------- /src/codes/01.run_etnlp_evaluator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | ANALOGY_FILE="../data/embedding_analogies/vi/solveable_analogies_vi.txt" 5 | OUT_FILE="../data/embedding_analogies/vi/Multi_evaluator_results.txt" 6 | python ./etnlp_api.py -input $INPUT_FILES -output $OUT_FILE -analoglist $ANALOGY_FILE -args eval -------------------------------------------------------------------------------- /src/codes/02.run_etnlp_extractor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | C2V="../data/embedding_dicts/C2V.vec" 5 | OUTPUT="../data/embedding_dicts/MULTI_W_F_B_E.vec" 6 | VOCAB_FILE="../data/vocab.txt" 7 | python ./etnlp_api.py -input $INPUT_FILES -vocab $VOCAB_FILE -input_c2v $C2V -args "extract" -output $OUTPUT 8 | -------------------------------------------------------------------------------- /src/codes/03.run_etnlp_visualizer_inter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | python3 ./etnlp_api.py -input $INPUT_FILES -args visualizer -port 8889 5 | -------------------------------------------------------------------------------- /src/codes/04.run_etnlp_visualizer_sbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | # python ./visualizer/visualizer_sbs.py -input $INPUT_FILES -args visualizer 5 | python3 ./visualizer/visualizer_sbs.py $INPUT_FILES 6 | -------------------------------------------------------------------------------- /src/codes/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/api/__init__.py -------------------------------------------------------------------------------- /src/codes/api/embedding_evaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import gensim 3 | import argparse 4 | from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors, Word2VecKeyedVectors 5 | from gensim import utils, matutils 6 | from six import string_types 7 | from numpy import dot, float32 as REAL, array, ndarray, argmax 8 | from utils import embedding_io, emb_utils 9 | from embeddings.embedding_configs import EmbeddingConfigs 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class new_Word2VecKeyedVectors(Word2VecKeyedVectors): 15 | def 
__init__(self, vector_size): 16 | super(Word2VecKeyedVectors, self).__init__(vector_size=vector_size) 17 | 18 | def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): 19 | """ 20 | Find the top-N most similar words. Positive words contribute positively towards the 21 | similarity, negative words negatively. 22 | 23 | This method computes cosine similarity between a simple mean of the projection 24 | weight vectors of the given words and the vectors for each word in the model. 25 | The method corresponds to the `word-analogy` and `distance` scripts in the original 26 | word2vec implementation. 27 | 28 | If topn is False, most_similar returns the vector of similarity scores. 29 | 30 | `restrict_vocab` is an optional integer which limits the range of vectors which 31 | are searched for most-similar values. For example, restrict_vocab=10000 would 32 | only check the first 10000 word vectors in the vocabulary order. (This may be 33 | meaningful if you've sorted the vocabulary by descending frequency.) 34 | 35 | Example:: 36 | 37 | >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) 38 | [('queen', 0.50882536), ...] 39 | 40 | """ 41 | if positive is None: 42 | positive = [] 43 | if negative is None: 44 | negative = [] 45 | 46 | self.init_sims() 47 | 48 | if isinstance(positive, string_types) and not negative: 49 | # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) 50 | positive = [positive] 51 | 52 | # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words 53 | positive = [ 54 | (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word 55 | for word in positive 56 | ] 57 | negative = [ 58 | (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word 59 | for word in negative 60 | ] 61 | 62 | # compute the weighted average of all words 63 | all_words, mean = set(), [] 64 | for word, weight in positive + negative: 65 | if isinstance(word, ndarray): 66 | mean.append(weight * word) 67 | else: 68 | mean.append(weight * self.word_vec(word, use_norm=True)) 69 | if word in self.vocab: 70 | all_words.add(self.vocab[word].index) 71 | if not mean: 72 | raise ValueError("cannot compute similarity with no input") 73 | mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) 74 | 75 | if indexer is not None: 76 | return indexer.most_similar(mean, topn) 77 | 78 | limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] 79 | dists = dot(limited, mean) 80 | if not topn: 81 | return dists 82 | best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) 83 | # ignore (don't return) words from the input 84 | result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] 85 | return result[:topn] 86 | 87 | def new_accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): 88 | """ 89 | Compute accuracy of the model. `questions` is a filename where lines are 90 | 4-tuples of words, split into sections by ": SECTION NAME" lines. 91 | See questions-words.txt in 92 | https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip 93 | for an example. 94 | 95 | The accuracy is reported (=printed to log and returned as a list) for each 96 | section separately, plus there's one aggregate summary at the end. 
97 | 98 | Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab` 99 | words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency. 100 | In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then 101 | case normalization is performed. 102 | 103 | Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before 104 | evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens 105 | and question words. In case of multiple case variants of a single word, the vector for the first 106 | occurrence (also the most frequent if vocabulary is sorted) is taken. 107 | 108 | This method corresponds to the `compute-accuracy` script of the original C word2vec. 109 | 110 | """ 111 | print("INFO: Using new accuracy") 112 | ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] 113 | ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) 114 | 115 | oov_counter, idx_cnt, is_vn_counter = 0, 0, 0 116 | sections, section = [], None 117 | for line_no, line in enumerate(utils.smart_open(questions)): 118 | # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed 119 | line = utils.to_unicode(line) 120 | 121 | if line.startswith(': '): 122 | # a new section starts => store the old section 123 | if section: 124 | sections.append(section) 125 | self.log_accuracy(section) 126 | section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} 127 | else: 128 | # Count number of analogy to check 129 | idx_cnt += 1 130 | if not section: 131 | raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) 132 | try: 133 | if case_insensitive: 134 | a, b, c, expected = [word.upper() for word in line.split(" | ")] 135 | else: 136 | a, b, c, expected = [word for word in line.split(" | ")] 137 | # print("Line : ", line) 138 | # print("a, b, c, expected: %s, %s, %s, %s"%(a, b, c, expected)) 139 | # input(">>> Wait ...") 140 | except ValueError: 141 | logger.info("SVX: ERROR skipping invalid line #%i in %s", line_no, questions) 142 | print("Line : ", line) 143 | print("a, b, c, expected: %s, %s, %s, %s" % (a, b, c, expected)) 144 | input(">>> Wait ...") 145 | continue 146 | 147 | # In case of Vietnamese, word analogy can be a phrase 148 | if " " in a or " " in b or " " in c or " " in expected: 149 | is_vn_counter += 1 150 | pass 151 | else: 152 | if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: 153 | logger.debug("SVX: skipping line #%i with OOV words: %s", line_no, line.strip()) 154 | oov_counter += 1 155 | continue 156 | 157 | original_vocab = self.vocab 158 | self.vocab = ok_vocab 159 | ignore = {a, b, c} # input words to be ignored 160 | predicted = None 161 | # find the most likely prediction, ignoring OOV words and input words 162 | sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) 163 | self.vocab = original_vocab 164 | for index in matutils.argsort(sims, reverse=True): 165 | predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] 166 | if predicted in ok_vocab and predicted not in ignore: 167 | if predicted != expected: 168 | logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) 169 | break 170 | if predicted == expected: 171 | section['correct'].append((a, b, c, 
expected)) 172 | else: 173 | section['incorrect'].append((a, b, c, expected)) 174 | 175 | if section: 176 | # store the last section, too 177 | sections.append(section) 178 | self.log_accuracy(section) 179 | 180 | total = { 181 | 'OOV/Total/VNCompound_Words': [oov_counter, (idx_cnt), is_vn_counter], 182 | 'section': 'total', 183 | 'correct': sum((s['correct'] for s in sections), []), 184 | 'incorrect': sum((s['incorrect'] for s in sections), []), 185 | } 186 | self.log_accuracy(total) 187 | sections.append(total) 188 | return sections 189 | 190 | 191 | def convert_conll_format_to_normal(connl_file, out_file): 192 | """ 193 | read file conll format 194 | return format : One sentence per line 195 | sentences_arr: [EU rejects German call .., ...] 196 | tags_arr: [B-ORG O B-MIST O ..., ...] 197 | """ 198 | f = open(connl_file) 199 | sentences = [] 200 | sentence = "" 201 | for line in f: 202 | # print("line: ", line) 203 | if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": 204 | sentences.append(sentence.rstrip()) 205 | sentence = "" 206 | continue 207 | else: 208 | splits = line.split('\t') 209 | sentence += splits[1].rstrip() + " " 210 | 211 | # To handle the last sentence. 212 | if len(sentence) > 0: 213 | sentences.append(sentence) 214 | del sentence 215 | 216 | # Write to output 217 | if out_file is None: 218 | out_file = connl_file + ".std.txt" 219 | writer = open(out_file, "w") 220 | for sen in sentences: 221 | writer.write(sen + "\n") 222 | writer.flush() 223 | writer.close() 224 | 225 | return sentences 226 | 227 | 228 | def verify_word_analogies(file): 229 | """ 230 | Verify the word analogy file. 231 | :param file: 232 | :return: 233 | """ 234 | f_reader = open(file, "r") 235 | 236 | valid_cnt, invalid_cnt = 0, 0 237 | 238 | for line in f_reader: 239 | # print("line: ", line) 240 | if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": 241 | continue 242 | else: 243 | splits = line.split('\t') 244 | if len(splits) != 4: 245 | invalid_cnt += 1 246 | else: 247 | valid_cnt += 1 248 | 249 | print("Valid analogy: %s, invalid analogy: %s" % (valid_cnt, invalid_cnt)) 250 | 251 | 252 | def check_oov_of_word_analogies(w2v_format_emb_file, analogy_file, is_vn=True, case_sensitive=True): 253 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_format_emb_file, 254 | binary=False, 255 | unicode_errors='ignore') 256 | 257 | f_reader = open(analogy_file, "r") 258 | vocab_arr = [] 259 | for line in f_reader: 260 | if not case_sensitive: 261 | line = line.lower() 262 | 263 | if line.startswith(': '): 264 | continue 265 | else: 266 | for word in line.split(" | "): 267 | # In Vietnamese, we have compound and single word. 268 | # if is_vn: 269 | # if " " in word: 270 | # print("I should not going here") 271 | # single_words = word.split(" ") 272 | # for single_word in single_words: 273 | # vocab_arr.append(single_word) 274 | # For other languages. 
275 | # else: 276 | vocab_arr.append(word) 277 | 278 | print("Before unique set: len = ", len(vocab_arr)) 279 | unique_vocab_arr = set(vocab_arr) 280 | print("After unique set: len = ", len(unique_vocab_arr)) 281 | valid_word_cnt = 0 282 | for word in unique_vocab_arr: 283 | if word in emb_model: 284 | valid_word_cnt += 1 285 | 286 | print("With Is_VN = %s, case_sensitive = %s, Valid word = %s/%s" % (is_vn, 287 | case_sensitive, 288 | valid_word_cnt, 289 | len(unique_vocab_arr))) 290 | 291 | 292 | def evaluator_api(input_files, analoglist, output, embed_config=None): 293 | """ 294 | 295 | :param input_files: 296 | :param analoglist: 297 | :param output: 298 | :param embed_config: 299 | :return: 300 | """ 301 | if embed_config is None: 302 | embed_config = EmbeddingConfigs() # Initialize default config for embedding. 303 | local_embedding_names, local_word_embeddings = embedding_io.load_word_embeddings(input_files, embed_config) 304 | # emb_utils.print_analogy('man', 'him', 'woman', emb_words) 305 | local_output_str = emb_utils.eval_word_analogy_4_all_embeddings(analoglist, 306 | local_embedding_names, 307 | local_word_embeddings, 308 | output_file=output) 309 | print("OUTPUT: ", local_output_str) 310 | 311 | 312 | if __name__ == "__main__": 313 | """ 314 | Evaluates a given word embedding model. 315 | To use: 316 | evaluate.py path_to_model [-restrict] 317 | optional restrict argument performs an evaluation using the original 318 | Mikolov restriction of vocabulary 319 | """ 320 | 321 | desc = "Evaluates a word embedding model" 322 | parser = argparse.ArgumentParser(description=desc) 323 | parser.add_argument("-input", 324 | required=True, 325 | default="../data/embedding_dicts/ELMO_23.vec", 326 | help="Input multiple word embeddings, each model separated by a `;`.") 327 | parser.add_argument("-analoglist", 328 | nargs="?", 329 | # default="../data/embedding_analogies/vi/analogy_vn_seg.txt.std.txt", 330 | default="../data/embedding_analogies/vi/solveable_analogies_vi.txt", 331 | help="Input analogy file to run the word analogy evaluation.") 332 | 333 | parser.add_argument("-r", 334 | nargs="?", 335 | default=False, 336 | help="Vocabulary restriction") 337 | 338 | parser.add_argument("-checkoov", 339 | nargs="?", 340 | default=False, 341 | help="Check OOV percentage") 342 | 343 | parser.add_argument("-lang", 344 | nargs="?", 345 | default="VI", 346 | help="Specify language, by default, it's Vietnamese.") 347 | 348 | parser.add_argument("-lowercase", 349 | nargs="?", 350 | default=True, 351 | help="Lowercase all word analogies? (depends on how the emb was trained).") 352 | 353 | parser.add_argument("-output", 354 | nargs="?", 355 | default="../data/embedding_analogies/vi/results_out.txt", 356 | help="Output file of word analogy task") 357 | 358 | parser.add_argument("-remove_redundancy", 359 | nargs="?", 360 | default=True, 361 | help="Remove redundancy in predicted words") 362 | 363 | print("Params: ", parser) 364 | 365 | args = parser.parse_args() 366 | 367 | embedding_config = EmbeddingConfigs() 368 | 369 | paths_of_models = args.input 370 | testset = args.analoglist 371 | is_vietnamese = args.lang 372 | output_file = args.output 373 | 374 | # use restriction? 
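# Mikolov et al.'s evaluation protocol restricts the vocabulary to the 30,000 most
# frequent words (see restrict_vocab in new_accuracy above); pass -r to apply the
# same restriction here.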
375 | restriction = None 376 | if args.r: 377 | restriction = 30000 378 | 379 | # set logging definitions 380 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 381 | level=logging.INFO) 382 | 383 | if args.checkoov: 384 | print("Checking OOV ...") 385 | check_oov_of_word_analogies(paths_of_models, testset, is_vn=is_vietnamese) 386 | 387 | if not args.checkoov: 388 | print("Evaluating embeddings on the word analogy task ...") 389 | if is_vietnamese == "VI":  # -lang defaults to "VI"; other values fall back to Mikolov et al.'s protocol 390 | print(" ... for ETNLP's evaluation approach.") 391 | embedding_names, word_embeddings = embedding_io.load_word_embeddings(paths_of_models, embedding_config) 392 | # emb_utils.print_analogy('man', 'him', 'woman', emb_words) 393 | output_str = emb_utils.eval_word_analogy_4_all_embeddings(testset, embedding_names, word_embeddings, 394 | output_file=output_file)  # was args.output_file, which does not exist 395 | print("#" * 20) 396 | print(output_str) 397 | print("#" * 20) 398 | 399 | else: 400 | print(" ... for Mikolov et al.'s evaluation approach.") 401 | word_analogy_obj = new_Word2VecKeyedVectors(1024) 402 | 403 | # load and evaluate 404 | model = word_analogy_obj.load_word2vec_format( 405 | paths_of_models, 406 | binary=False, 407 | unicode_errors='ignore') 408 | 409 | model.accuracy = word_analogy_obj.new_accuracy 410 | 411 | acc = model.accuracy(testset, restrict_vocab=restriction, case_insensitive=False) 412 | print("Acc = ", acc) 413 | 414 | print("DONE") 415 | -------------------------------------------------------------------------------- /src/codes/api/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | from embeddings import embedding_utils 2 | from pathlib import Path 3 | import numpy as np 4 | import os 5 | import logging 6 | import gzip 7 | from embeddings.embedding_configs import EmbeddingConfigs 8 | 9 | 10 | def get_multi_embedding_models(config: EmbeddingConfigs): 11 | """ 12 | Build the collection of embedding models described by `config`, optionally with a char2vec model for OOV words. 13 | :param config: 14 | :return: 15 | """ 16 | model_paths_list = config.model_paths_list 17 | model_names_list = config.model_names_list 18 | model_dims_list = config.model_dims_list 19 | char_model_path = config.char_model_path 20 | char_model_dims = config.char_model_dims 21 | 22 | if char_model_path: 23 | char_model = embedding_utils.reload_char2vec_model(char_model_path, char_model_dims) 24 | else: 25 | char_model = None 26 | 27 | embedding_models = embedding_utils.reload_embedding_models(model_paths_list, 28 | model_names_list, 29 | model_dims_list, 30 | char_model) 31 | # doc_vector = embedding_models.get_vector_of_document(tokenized_text) 32 | return embedding_models 33 | 34 | 35 | def get_emb_dim(emb_file): 36 | # Read the embedding dimension from the word2vec header line: "<vocab_size> <dim>". 37 | with open(emb_file, "r") as reader: 38 | line = reader.readline().rstrip() 39 | dim = int(line.split(" ")[1]) 40 | return dim 41 | 42 | 43 | 44 | 45 | def extract_embedding_for_vocab_file(paths_of_emb_models, vocab_words_file, c2v_emb_file, output_file, output_format): 46 | """ 47 | Extract vectors for every word in `vocab_words_file` from one or more embedding files. 48 | :param paths_of_emb_models: 49 | :param vocab_words_file: 50 | :param c2v_emb_file: 51 | :param output_file: 52 | :param output_format: 53 | :return: 54 | """ 55 | config = EmbeddingConfigs() 56 | config.output_format = output_format 57 | config.model_paths_list = paths_of_emb_models.split(";") 58 | embedding_file_names = [] 59 | embedding_dims = [] 60 | 61 | if c2v_emb_file: 62 | config.char_model_path = c2v_emb_file 63 | config.char_model_dims = get_emb_dim(c2v_emb_file) 64 | 65 | print("02. 
Extracting word embeddings ...") 66 | if paths_of_emb_models and paths_of_emb_models.__contains__(";"): 67 | files = paths_of_emb_models.split(";") 68 | for emb_file in files: 69 | embedding_name = os.path.basename(os.path.normpath(emb_file)) 70 | embedding_file_names.append(embedding_name) 71 | embedding_dim = get_emb_dim(emb_file) 72 | embedding_dims.append(embedding_dim) 73 | elif paths_of_emb_models: # In case there is only one embedding 74 | embedding_name = os.path.basename(os.path.normpath(paths_of_emb_models)) 75 | embedding_file_names.append(embedding_name) 76 | embedding_dim = get_emb_dim(paths_of_emb_models) 77 | embedding_dims.append(embedding_dim) 78 | else: 79 | raise Exception("List of embeddings cannot be None.") 80 | 81 | # Data type: 82 | embedding_names = ["word2vec"]*len(embedding_dims) # embedding type, only support w2v and c2v type now 83 | config.model_names_list = embedding_names 84 | config.model_dims_list = embedding_dims 85 | 86 | # Do extracting embeddings 87 | extract_embedding_vectors(vocab_words_file, output_file, config) 88 | print("Done") 89 | 90 | 91 | def extract_embedding_vectors(vocab_words_file, output_file, config: EmbeddingConfigs): 92 | """ 93 | 94 | :param vocab_words_file: 95 | :param output_file: 96 | :param config: 97 | :return: 98 | """ 99 | # Load vocab 100 | with Path(vocab_words_file).open() as f: 101 | word_to_idx = {line.strip(): idx for idx, line in enumerate(f)} 102 | size_vocab = len(word_to_idx) 103 | 104 | # Output writer 105 | fwriter = open(output_file, "w") 106 | 107 | # Array of zeros 108 | dim_size = sum(config.model_dims_list) 109 | found = 0 110 | print('Reading embedding file (may take a while)') 111 | 112 | embedding_models = get_multi_embedding_models(config) 113 | 114 | embeddings = np.zeros((size_vocab, dim_size)) 115 | 116 | line_idx = 0 117 | for word in word_to_idx.keys(): 118 | word_idx = word_to_idx[word] 119 | 120 | word = word.rstrip() 121 | try: 122 | if line_idx % 100000 == 0: 123 | print('- At line {}'.format(line_idx)) 124 | 125 | w2v_vector = embedding_models.get_word_vector_of_multi_embeddings(word) 126 | 127 | if w2v_vector is not None and len(w2v_vector) > 0: 128 | embeddings[word_idx] = w2v_vector 129 | line = "%s %s" % (word, " ".join(str(scalar) for scalar in w2v_vector)) 130 | fwriter.write(line + "\n") 131 | fwriter.flush() 132 | found += 1 133 | 134 | logging.debug("Embedding: ", w2v_vector) 135 | except Exception as e: 136 | logging.debug("Unexpected error: word = %s, error = %s" % (word, e)) 137 | pass 138 | line_idx += 1 139 | 140 | print('- done. Found {} vectors for {} words'.format(found, size_vocab)) 141 | fwriter.close() 142 | 143 | # Open file again to add meta data: 144 | src = open(output_file, "r") 145 | meta_line = "%s %s\n"%(found, dim_size) 146 | oline = src.readlines() 147 | # Here, we prepend the string we want to on first line 148 | oline.insert(0, meta_line) 149 | src.close() 150 | 151 | # We again open the file in WRITE mode 152 | src = open(output_file, "w") 153 | src.writelines(oline) 154 | src.close() 155 | # Done with writing. 
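# Depending on config.output_format (default ".txt;.npz;.gz" in EmbeddingConfigs),
# the blocks below also emit compressed copies of the embeddings: a gzipped text
# file and/or a NumPy .npz archive of the full embedding matrix.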
156 | 157 | if config.output_format.__contains__(".gz"): 158 | content = open(output_file, "rb").read() 159 | gzip_out_file = output_file + '.gz' 160 | with gzip.open(gzip_out_file, 'wb') as f: 161 | f.write(content) 162 | print("Saved embedding to %s" % (gzip_out_file)) 163 | 164 | if config.output_format.__contains__(".npz"): 165 | npz_out_file = output_file + '.npz' 166 | np.savez_compressed(npz_out_file, embeddings=embeddings) 167 | print("Saved embedding to %s"%(npz_out_file)) 168 | return 169 | 170 | -------------------------------------------------------------------------------- /src/codes/api/embedding_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Convert to a standard word2vec format 2 | 3 | import gensim 4 | from utils import embedding_io 5 | import sys 6 | from threading import Thread 7 | from embeddings.embedding_configs import EmbeddingConfigs 8 | 9 | 10 | def convert_to_w2v(vocab_file, embedding_file, out_file): 11 | """ 12 | Export from a word2vec file by filtering out vocabs based on the input vocab file. 13 | :param vocab_file: 14 | :param embedding_file: 15 | :param out_file: 16 | :return: word2vec file 17 | """ 18 | std_vocab = [] 19 | with open(vocab_file) as f: 20 | for word in f: 21 | std_vocab.append(word) 22 | 23 | print ("Loaded NER vocab_size = %s" % (len(std_vocab))) 24 | is_binary = False 25 | if embedding_file.endswith(".bin"): 26 | is_binary = True 27 | 28 | print("Loading w2v model ...") 29 | 30 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(embedding_file, 31 | binary=is_binary, 32 | unicode_errors='ignore') 33 | 34 | print("LOADED model: vocab_size = %s" % (len(emb_model.wv.vocab))) 35 | f_writer = open(out_file, "w") 36 | for word in std_vocab: 37 | word = word.rstrip() 38 | line = None 39 | if word in emb_model: 40 | vector = " ".join(str(item) for item in emb_model[word]) 41 | # word = word.lower() 42 | line = "%s %s" % (word, vector) 43 | else: 44 | word = word.lower() 45 | if word in emb_model: 46 | vector = " ".join(str(item) for item in emb_model[word]) 47 | line = "%s %s" % (word, vector) 48 | # print("LINE: ", line) 49 | if line: 50 | f_writer.write(line + "\n") 51 | f_writer.close() 52 | 53 | 54 | def test(): 55 | vocab_file = "../data/vnner_BiLSTM_CRF/vocab.words.txt" 56 | embedding_file = "../data/embedding_dicts/elmo_embeddings_large.txt" 57 | out_file = "../data/embedding_dicts/elmo_1024dims_wiki_normalcase2lowercase_NER.vec" 58 | convert_to_w2v(vocab_file, embedding_file, out_file) 59 | print("Out file: ", out_file) 60 | print("DONE") 61 | 62 | 63 | def load_and_save_2_word2vec_model(input_model_path, output_model_path, embedding_config): 64 | """ 65 | Process one embedding model 66 | :param input_model_path: 67 | :param output_model_path: 68 | :return: 69 | """ 70 | model_in = embedding_io.load_word_embedding(input_model_path, embedding_config) 71 | embedding_io.save_model_to_file(model_in, output_model_path) 72 | print("Write model back to ", output_model_path) 73 | 74 | 75 | def load_and_save_2_word2vec_models(input_embedding_files_str, output_embedding_files_str, embedding_config): 76 | """ 77 | Multi-threaded processing to export to word2vec format 78 | :param input_embedding_files_str: 79 | :param output_embedding_files_str: 80 | :return: 81 | """ 82 | if input_embedding_files_str.__contains__(";"): 83 | input_model_files = input_embedding_files_str.split(";") 84 | else: 85 | input_model_files = [input_embedding_files_str] 86 | 87 | if 
output_embedding_files_str.__contains__(";"): 88 | output_model_files = output_embedding_files_str.split(";") 89 | else: 90 | output_model_files = [output_embedding_files_str] 91 | 92 | # Double check input files and output files. 93 | assert (len(output_model_files) == len(input_model_files)), \ 94 | "Number of input files and output files must be equal. Exiting ..." 95 | 96 | # create a list of threads 97 | threads = [] 98 | 99 | for model_in, model_out in zip(input_model_files, output_model_files): 100 | # We start one thread per file. 101 | process = Thread(target=load_and_save_2_word2vec_model, args=[model_in, model_out, embedding_config]) 102 | process.start() 103 | threads.append(process) 104 | # load_and_save_2_word2vec_model(model_in, model_out) 105 | 106 | # This ensures each thread has finished processing its input file. 107 | for process in threads: 108 | process.join() 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | if len(sys.argv) != 2: 114 | print("Missing input arguments. Input format: ./*.py <emb_file1;emb_file2;...>. Exiting ...") 115 | exit(0) 116 | 117 | embedding_config = EmbeddingConfigs() 118 | # The input doesn't have to be in word2vec format for pre-processing, but a warning 119 | # is still shown if input files aren't in w2v format. 120 | embedding_config.is_word2vec_format = True 121 | embedding_config.do_normalize_emb = False # If you don't want to normalize the embedding vectors. 122 | 123 | if sys.argv[1].__contains__(";"): 124 | in_model_files = sys.argv[1].split(";") 125 | else: 126 | in_model_files = [sys.argv[1]] 127 | 128 | out_model_files = [input_model_path + ".extracted.vec" for input_model_path in in_model_files] 129 | 130 | load_and_save_2_word2vec_models(";".join(in_model_files), ";".join(out_model_files), embedding_config)  # the function expects ";"-joined strings plus the config 131 | -------------------------------------------------------------------------------- /src/codes/api/embedding_visualizer.py: -------------------------------------------------------------------------------- 1 | # 1. Read embedding file 2 | # 2. Convert to tensorboard 3 | # 3. 
Visualize 4 | 5 | # encoding: utf-8 6 | import sys, os 7 | import gensim 8 | import tensorflow as tf 9 | import numpy as np 10 | from tensorflow.contrib.tensorboard.plugins import projector 11 | import logging 12 | from tensorboard import default 13 | from tensorboard import program 14 | 15 | 16 | class TensorBoardTool: 17 | 18 | def __init__(self, dir_path): 19 | self.dir_path = dir_path 20 | 21 | def run(self, emb_name, port): 22 | # Remove http messages 23 | # log = logging.getLogger('sonvx').setLevel(logging.INFO) 24 | logging.basicConfig(level=logging.INFO) 25 | logging.propagate = False 26 | # Start tensorboard server 27 | tb = program.TensorBoard(default.get_plugins(), default.get_assets_zip_provider()) 28 | tb.configure(argv=[None, '--logdir', self.dir_path, '--port', str(port)]) 29 | url = tb.launch() 30 | sys.stdout.write('TensorBoard of %s at %s \n' % (emb_name, url)) 31 | 32 | 33 | def convert_multiple_emb_models_2_tf(emb_name_arr, w2v_model_arr, output_path, port): 34 | """ 35 | Convert multiple word2vec models into one TensorFlow checkpoint for the TensorBoard projector. 36 | :param emb_name_arr: 37 | :param w2v_model_arr: 38 | :param output_path: 39 | :param port: 40 | :return: 41 | """ 42 | idx = 0 43 | # define the model without training 44 | sess = tf.InteractiveSession() 45 | config = projector.ProjectorConfig() 46 | 47 | for w2v_model in w2v_model_arr: 48 | emb_name = emb_name_arr[idx] 49 | 50 | meta_file = "%s.tsv" % emb_name 51 | placeholder = np.zeros((len(w2v_model.wv.index2word), w2v_model.vector_size)) 52 | 53 | with open(os.path.join(output_path, meta_file), 'wb') as file_metadata: 54 | for i, word in enumerate(w2v_model.wv.index2word): 55 | placeholder[i] = w2v_model[word] 56 | # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094 57 | if word == '': 58 | print("Empty line: should be replaced by anything else, or it will cause a bug in TensorBoard") 59 | file_metadata.write(u"{0}".format('').encode('utf-8') + b'\n') 60 | else: 61 | file_metadata.write(u"{0}".format(word).encode('utf-8') + b'\n') 62 | 63 | word_embedding_var = tf.Variable(placeholder, trainable=False, name=emb_name) 64 | tf.global_variables_initializer().run() 65 | sess.run(word_embedding_var) 66 | 67 | # adding into projector 68 | embed = config.embeddings.add() 69 | embed.tensor_name = emb_name 70 | embed.metadata_path = meta_file 71 | idx += 1 72 | 73 | saver = tf.train.Saver() 74 | writer = tf.summary.FileWriter(output_path, sess.graph) 75 | 76 | # Specify the width and height of a single thumbnail. 
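# visualize_embeddings() writes a projector_config.pbtxt into output_path that links
# each embedding tensor to its .tsv metadata file; TensorBoard's Projector tab reads
# it from the checkpoint directory saved below.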
77 | projector.visualize_embeddings(writer, config) 78 | all_emb_name = "_".join(emb_name for emb_name in emb_name_arr) 79 | saver.save(sess, os.path.join(output_path, '%s.ckpt' % all_emb_name)) 80 | # tf.flags.FLAGS.logdir = output_path 81 | # print('Running `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path)) 82 | # tb.run_main() 83 | tb_tool = TensorBoardTool(output_path) 84 | tb_tool.run(all_emb_name, port) 85 | return 86 | 87 | 88 | def convert_one_emb_model_2_tf(emb_name, model, output_path, port): 89 | """ 90 | Convert a single word2vec model into a TensorFlow checkpoint for the TensorBoard projector. 91 | :param model: Word2Vec model 92 | :param output_path: 93 | :return: 94 | """ 95 | # emb_name = "word_embedding" 96 | meta_file = "%s.tsv" % emb_name 97 | placeholder = np.zeros((len(model.wv.index2word), model.vector_size)) 98 | 99 | with open(os.path.join(output_path, meta_file), 'wb') as file_metadata: 100 | for i, word in enumerate(model.wv.index2word): 101 | placeholder[i] = model[word] 102 | # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094 103 | if word == '': 104 | print("Empty line: should be replaced by anything else, or it will cause a bug in TensorBoard") 105 | file_metadata.write(u"{0}".format('').encode('utf-8') + b'\n') 106 | else: 107 | file_metadata.write(u"{0}".format(word).encode('utf-8') + b'\n') 108 | 109 | # define the model without training 110 | sess = tf.InteractiveSession() 111 | 112 | word_embedding_var = tf.Variable(placeholder, trainable=False, name=emb_name) 113 | tf.global_variables_initializer().run()  # variables must be initialized before being evaluated 114 | sess.run(word_embedding_var) 115 | 116 | saver = tf.train.Saver() 117 | writer = tf.summary.FileWriter(output_path, sess.graph) 118 | 119 | # adding into projector 120 | config = projector.ProjectorConfig() 121 | embed = config.embeddings.add() 122 | embed.tensor_name = emb_name 123 | embed.metadata_path = meta_file 124 | 125 | # Specify the width and height of a single thumbnail. 
126 | projector.visualize_embeddings(writer, config) 127 | saver.save(sess, os.path.join(output_path, '%s.ckpt' % emb_name)) 128 | # tf.flags.FLAGS.logdir = output_path 129 | # print('Running `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path)) 130 | # tb.run_main() 131 | tb_tool = TensorBoardTool(output_path) 132 | tb_tool.run(emb_name, port) 133 | return 134 | 135 | 136 | def visualize_multiple_embeddings_individually(paths_of_emb_models): 137 | output_root_dir = "../data/embedding_tf_data/" 138 | starting_port = 6006 139 | embedding_names = [] 140 | print("Loaded all word embeddings, going to visualize ...") 141 | 142 | if paths_of_emb_models and paths_of_emb_models.__contains__(";"): 143 | files = paths_of_emb_models.split(";") 144 | for emb_file in files: 145 | 146 | embedding_name = os.path.basename(os.path.normpath(emb_file)) 147 | 148 | tf_data_folder = output_root_dir + embedding_name 149 | 150 | if not os.path.exists(tf_data_folder): 151 | os.makedirs(tf_data_folder) 152 | 153 | is_binary = False 154 | 155 | if emb_file.endswith(".bin"): 156 | is_binary = True 157 | 158 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(emb_file, binary=is_binary) 159 | 160 | convert_one_emb_model_2_tf(embedding_name, emb_model, tf_data_folder, starting_port) 161 | 162 | embedding_names.append(embedding_name) 163 | 164 | starting_port += 1 165 | 166 | while True: 167 | print("Type exit to quit the visualizer: ") 168 | user_input = input() 169 | if user_input == "exit": 170 | break 171 | return 172 | 173 | 174 | def visualize_multiple_embeddings_all_in_one(paths_of_emb_models, port): 175 | output_root_dir = "../data/embedding_tf_data/" 176 | starting_port = port 177 | embedding_names = [] 178 | print("Loaded all word embeddings, going to visualize ...") 179 | 180 | embedding_name_arr = [] 181 | w2v_embedding_model_arr = [] 182 | 183 | if paths_of_emb_models and paths_of_emb_models.__contains__(";"): 184 | files = paths_of_emb_models.split(";") 185 | for emb_file in files: 186 | 187 | embedding_name = os.path.basename(os.path.normpath(emb_file)) 188 | embedding_name_arr.append(embedding_name) 189 | 190 | is_binary = False 191 | 192 | if emb_file.endswith(".bin"): 193 | is_binary = True 194 | 195 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(emb_file, binary=is_binary) 196 | w2v_embedding_model_arr.append(emb_model) 197 | embedding_names.append(embedding_name) 198 | 199 | # print("View side-by-side word similarity of multiple embeddings at: http://Sons-MBP.lan:8089") 200 | 201 | all_emb_name = "_".join(emb_name for emb_name in embedding_name_arr) 202 | tf_data_folder = output_root_dir + all_emb_name 203 | if not os.path.exists(tf_data_folder): 204 | os.makedirs(tf_data_folder) 205 | 206 | convert_multiple_emb_models_2_tf(embedding_name_arr, w2v_embedding_model_arr, tf_data_folder, starting_port) 207 | 208 | while True: 209 | print("Type exit to quit the visualizer: ") 210 | user_input = input() 211 | if user_input == "exit": 212 | break 213 | return 214 | 215 | 216 | def visualize_multiple_embeddings(paths_of_emb_models, port): 217 | """ 218 | API for other parts of ETNLP to call; don't modify this function. 
219 | :param paths_of_emb_models: 220 | :param port: 221 | :return: 222 | """ 223 | visualize_multiple_embeddings_all_in_one(paths_of_emb_models, port) 224 | 225 | 226 | if __name__ == "__main__": 227 | """ 228 | Just run `python w2v_visualizer.py word2vec.model visualize_result` 229 | """ 230 | try: 231 | model_path = sys.argv[1] 232 | output_path = sys.argv[2] 233 | except Exception as e: 234 | print("Please provide model path and output path %s " % e) 235 | 236 | # model = Word2Vec.load(model_path) 237 | model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True) 238 | convert_one_emb_model_2_tf("word_embedding", model, output_path, 6006)  # name/port args were missing; placeholder name and default TensorBoard port 239 | -------------------------------------------------------------------------------- /src/codes/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/embeddings/__init__.py -------------------------------------------------------------------------------- /src/codes/embeddings/embedding_configs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class EmbeddingConfigs(object): 4 | """ 5 | Configuration information 6 | """ 7 | is_word2vec_format = True 8 | do_normalize_emb = True 9 | model_paths_list = [] 10 | model_names_list = [] 11 | model_dims_list = [] 12 | char_model_path = None 13 | char_model_dims = -1 14 | output_format = ".txt;.npz;.gz" 15 | -------------------------------------------------------------------------------- /src/codes/embeddings/embedding_models.py: -------------------------------------------------------------------------------- 1 | from gensim.models import KeyedVectors as Word2Vec 2 | import numpy as np 3 | from embeddings import embedding_utils 4 | from utils import file_utils 5 | import os, re 6 | import logging 7 | 8 | 9 | DEBUG = False 10 | 11 | 12 | class Model_Constants(object): 13 | word2vec = "word2vec" 14 | char2vec = "char2vec" 15 | private_word2vec = "private_word2vec" 16 | elmo = "elmo" 17 | 18 | 19 | class Embedding_Model(object): 20 | def __init__(self, name, vector_dim): 21 | self.name = name 22 | self.model = None 23 | self.char_model = None 24 | self.vocabs_list = None 25 | self.vector_dim = vector_dim 26 | # TODO: update this changeable param later 27 | # unk, random, mean, replace_by_character_embedding 28 | self.unknown_word = "replace_by_character_embedding" 29 | # self.MAX_DIM = 400 # No longer use MAX_DIM, now it depends on input dims 30 | 31 | def load_model(self, model_path): 32 | if self.name == Model_Constants.word2vec or self.name == Model_Constants.elmo: 33 | if model_path.endswith(".bin"): 34 | self.model = Word2Vec.load_word2vec_format(model_path, binary=True) 35 | else: 36 | self.model = Word2Vec.load_word2vec_format(model_path, binary=False) 37 | elif self.name == Model_Constants.char2vec: 38 | self.model = dict() 39 | print("Loading model_path = ", model_path) 40 | file = open(model_path, "r") 41 | for line in file: 42 | elements = line.split() 43 | if len(elements) > 100: # because embedding dim is higher than 100. 
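# Heuristic: a valid char-vector line has one character followed by more than 100
# floats, so shorter lines (such as a "<vocab_size> <dim>" header) are skipped.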
44 | # char_model[elements[0]] = np.array(map(float, elements[1:])).tolist() 45 | self.model[elements[0]] = np.array([float(i) for i in elements[1:]]).tolist() 46 | return self.model 47 | elif self.name == Model_Constants.private_word2vec: 48 | self.model, _, self.vocabs_list = embedding_utils.reload_embeddings(model_path) 49 | else: 50 | raise Exception("Unknown embedding models!") 51 | 52 | def is_punct(self, word): 53 | arr_list = [ 54 | '!', 55 | '"', 56 | '%', 57 | '&', 58 | "'", 59 | "''", 60 | '(', 61 | '(.', 62 | ')', 63 | '*', 64 | '+', 65 | ',', 66 | '-', 67 | '---', 68 | '.', 69 | '..', 70 | '...', 71 | '....', 72 | '/', 73 | ] 74 | if word in arr_list: 75 | return True 76 | else: 77 | return False 78 | 79 | def is_number(self, word): 80 | regex = r"^[0-9]+" 81 | matches = re.finditer(regex, word, re.MULTILINE) 82 | matchNum = 0 83 | for matchNum, match in enumerate(matches): 84 | matchNum = matchNum + 1 85 | if matchNum > 0: 86 | return True 87 | else: 88 | return False 89 | 90 | def set_char_model(self, char_model): 91 | self.char_model = char_model 92 | 93 | def load_vocabs_list(self, vocab_file_path): 94 | """ 95 | Load vocabs list for private w2v model. Has to be pickle file. 96 | :param vocab_file_path: 97 | :return: 98 | """ 99 | if vocab_file_path: 100 | self.vocabs_list = file_utils.load_obj(vocab_file_path) 101 | 102 | def get_char_vector(self, char_model, word): 103 | """ 104 | char_model here is an instance of embedding_model 105 | :param char_model: an instance of embedding_model 106 | :param word: 107 | :return: 108 | """ 109 | if char_model is None: 110 | # Sonvx on March 20, 2019: we now allow the char_model is None, 111 | # cannot call this get_char_vector in such case. 112 | raise Exception("Char_model is None! Cannot use character-embedding.") 113 | 114 | out_char_2_vec = [] 115 | char_vecs = [] 116 | chars = list(word) 117 | vecs = [] 118 | for c in chars: 119 | if c in char_model.model: 120 | emb_vector = char_model.model[c] 121 | vecs.append(emb_vector) 122 | if DEBUG: 123 | input(">>>>>>") 124 | print("Char_emb_vector=", emb_vector) 125 | 126 | # char_vecs.extend(list(vecs)) 127 | 128 | if len(vecs) > 0: 129 | out_char_2_vec = np.mean(vecs, axis=0) 130 | 131 | if DEBUG: 132 | print(">>> Output of char2vec: %s"%(out_char_2_vec)) 133 | input(">>>> outc2v ...") 134 | 135 | return out_char_2_vec 136 | 137 | def is_unknown_word(self, word): 138 | """Check whether or not a word is unknown""" 139 | is_unknown_word = False 140 | if self.vocabs_list is not None: 141 | if word not in self.vocabs_list: 142 | is_unknown_word = True 143 | else: 144 | if word not in self.model: 145 | is_unknown_word = True 146 | return is_unknown_word 147 | 148 | def get_word_vector(self, word): 149 | """ 150 | Handle unknown word: In case of our private word2vec, we have a vocabs_list to check. With regular models, 151 | we can check inside the model. Note that by default, we use char-model to handle unknown words. 152 | :param word: 153 | :param char_model: 154 | :return: 155 | """ 156 | 157 | rtn_vector = [] 158 | 159 | # try first time with normal case 160 | is_unknown_word = self.is_unknown_word(word) 161 | 162 | # try 2nd times with lowercase. 163 | if is_unknown_word: 164 | word = word.lower() 165 | is_unknown_word = self.is_unknown_word(word) 166 | 167 | # unknown word 168 | if is_unknown_word and self.char_model: 169 | # Sonvx on March 20, 2019: solve unknown only when char_model is SET. 
170 | rtn_vector = self.get_vector_of_unknown(word) 171 | else: 172 | # normal case 173 | if self.name == Model_Constants.word2vec: 174 | rtn_vector = self.model[word] 175 | 176 | # For now we have self.vector_dim, max_dim, and len(rtn_vector) 177 | # Update: move to use self.vector_dim only 178 | if len(rtn_vector) > self.vector_dim: 179 | print("Warning: auto trim to %s/%s dimensions"%(self.vector_dim, len(rtn_vector))) 180 | rtn_vector = self.model[word][:self.vector_dim] 181 | 182 | elif self.name == Model_Constants.elmo: 183 | rtn_vector = self.model[word] 184 | 185 | if self.vector_dim == len(rtn_vector)/2: 186 | vector1 = rtn_vector[:self.vector_dim] 187 | vector2 = rtn_vector[self.vector_dim:] 188 | print("Notice: auto average to b[i] = (a[i] + a[i + %s])/2 /%s dimensions" % (self.vector_dim, 189 | len(rtn_vector))) 190 | rtn_vector = np.mean([vector1, vector2], 0) 191 | elif len(rtn_vector) > self.vector_dim: 192 | print("Warning: auto trim to %s/%s dimensions" % (self.vector_dim, len(rtn_vector))) 193 | rtn_vector = self.model[word][:self.vector_dim] 194 | 195 | elif self.name == Model_Constants.char2vec: 196 | rtn_vector = self.get_char_vector(self, word) 197 | 198 | elif self.name == Model_Constants.private_word2vec: 199 | # Handle unknown word - Not need for now since we handle unknown words first 200 | if word not in self.vocabs_list: 201 | word = "UNK" 202 | word_idx = self.vocabs_list.index(word) 203 | emb_vector = self.model[word_idx] 204 | rtn_vector = emb_vector 205 | 206 | # final check before returning vector 207 | if DEBUG: 208 | print(">>> DEBUG: len(rtn_vector) = %s" % (len(rtn_vector))) 209 | input(">>> before returning vector ...") 210 | if len(rtn_vector) < 1: 211 | return np.zeros(self.vector_dim) 212 | else: 213 | if len(rtn_vector) == self.vector_dim: 214 | return rtn_vector 215 | # TODO: find a better way to represent unknown word by character to have same-size with word-vector-size 216 | # For now, I add 0 to the [current-len, expected-len] 217 | else: 218 | logging.debug("Model name = %s, Current word = %s, Current size = %s, expected size = %s" 219 | %(self.name, word, len(rtn_vector), self.vector_dim)) 220 | return np.append(rtn_vector, np.zeros(self.vector_dim - len(rtn_vector))) 221 | 222 | def get_vector_of_unknown(self, word): 223 | """ 224 | If word is UNK, use char_vector model instead. 
:param word:
226 |         :return:
227 |         """
228 |         # Here we handle features based on the w2v model, where numbers and punctuation are
229 |         # encoded as placeholder tokens ("<number>"/"<punct>" are assumed names; the original tokens were lost in rendering).
230 |         if self.name == Model_Constants.word2vec:
231 |             if self.is_number(word):
232 |                 rtn_vector = self.model["<number>"]
233 |             elif self.is_punct(word):
234 |                 rtn_vector = self.model["<punct>"]
235 |             else:
236 |                 rtn_vector = self.get_char_vector(self.char_model, word)
237 |
238 |             if rtn_vector is not None:
239 |                 if len(rtn_vector) > self.vector_dim:
240 |                     print("Warning: auto trim to %s/%s dimensions" % (self.vector_dim, len(rtn_vector)))
241 |                     return rtn_vector[:self.vector_dim]
242 |                 else:
243 |                     return rtn_vector
244 |         # otherwise, use c2v to build up the embedding vector
245 |         else:
246 |             return self.get_char_vector(self.char_model, word)
247 |
248 |
249 | class Embedding_Models(object):
250 |     """
251 |     Using all available embedding models to generate vectors
252 |     """
253 |     def __init__(self, list_models):
254 |         self.list_models = list_models  # list of embedding_model_objs: ['word2vec', 'char2vec', 'private_word2vec']
255 |
256 |     def add_model(self, emb_model, char_model):
257 |         """
258 |         Add a new model to the collection of embedding models. Note that every model gets a char_model to handle
259 |         unknown words.
260 |         :param emb_model:
261 |         :param char_model:
262 |         :return:
263 |         """
264 |         if char_model is None:
265 |             print("Warning: char_model is None -> cannot solve OOV words. Keep going ...")
266 |             # Sonvx on March 20, 2019: change to allow None char_model
267 |             # raise Exception("char_model cannot be None.")
268 |         if isinstance(emb_model, Embedding_Model):
269 |             emb_model.set_char_model(char_model)
270 |             self.list_models.append(emb_model)
271 |         else:
272 |             raise Exception("Not an instance of embedding_model class.")
273 |
274 |     def get_vector_of_document(self, document):
275 |         """
276 |         Get all embedding vectors for one document
277 |         :param document:
278 |         :return:
279 |         """
280 |         doc_vector = []
281 |         # debug_dict = {}
282 |         # print("len_doc = ", len(document))
283 |         for word in document:
284 |
285 |             all_vectors_of_word = []
286 |
287 |             # get all embedding vectors of a word
288 |             for emb_model in self.list_models:
289 |                 emb_vector = emb_model.get_word_vector(word)
290 |                 # print("len_emb_vector = ", len(emb_vector))
291 |                 all_vectors_of_word.extend(emb_vector)
292 |
293 |             # if word in debug_dict.keys():
294 |             #     debug_dict[word].append(len(emb_vector))
295 |             # else:
296 |             #     debug_dict[word] = [len(emb_vector)]
297 |
298 |             # stack a combined vector of all words
299 |             doc_vector.append(all_vectors_of_word)
300 |
301 |         # print("list of words and emb size = ", debug_dict)
302 |         # get the mean of them to represent a document
303 |         doc_vector = np.mean(doc_vector, axis=0)
304 |
305 |         return doc_vector
306 |
307 |     def get_word_vector_of_multi_embeddings(self, word):
308 |         """
309 |         Concatenate the embedding vectors of all models for one word.
310 |         :param word:
311 |         :return:
312 |         """
313 |         word_vector = []
314 |         for emb_model in self.list_models:
315 |             emb_vector = emb_model.get_word_vector(word)
316 |             word_vector.extend(emb_vector)
317 |
318 |         return word_vector
319 |
320 |
321 |
322 |
323 |
324 |
-------------------------------------------------------------------------------- /src/codes/embeddings/embedding_utils.py: --------------------------------------------------------------------------------
1 | import os
2 | from utils import file_utils
3 | from embeddings.embedding_models import Embedding_Model, Embedding_Models
4 |
5 |
6 | def
reload_char2vec_model(model_path, model_dim): 7 | char_model = Embedding_Model("char2vec", model_dim) 8 | char_model.load_model(model_path) 9 | return char_model 10 | 11 | 12 | def reload_embedding_models(model_paths_list, model_names_list, model_dims_list, char_model): 13 | """ 14 | Reload collection of embedding models to serve feature extraction task. 15 | :param model_paths_list: 16 | :param model_names_list: 17 | :param model_dims_list: 18 | :param char_model: 19 | :return: 20 | """ 21 | # model path list and name list must be equal. 22 | print("model_paths_list = ", model_paths_list) 23 | print("model_formats_list = ", model_names_list) 24 | assert (len(model_names_list) == len(model_paths_list)), "Not equal length" 25 | assert (len(model_names_list) == len(model_dims_list)), "Not equal length" 26 | 27 | all_emb_models = Embedding_Models([]) 28 | 29 | for model_idx in range(len(model_paths_list)): 30 | # get model path based on index 31 | model_path = model_paths_list[model_idx] 32 | model_name = model_names_list[model_idx] 33 | model_dim = model_dims_list[model_idx] 34 | 35 | if model_path is not None: 36 | 37 | emb_model = Embedding_Model(model_name, model_dim) 38 | emb_model.load_model(model_path) 39 | 40 | # add to final list of emb_models 41 | all_emb_models.add_model(emb_model, char_model) 42 | 43 | return all_emb_models 44 | 45 | 46 | def save_embedding_models_tofolder(dir_path, final_embeddings, reverse_dictionary, vocabulary_size): 47 | """ 48 | Save all trained word-embedding model of the custom word2vec. 49 | :param final_embeddings: 50 | :param reverse_dictionary: 51 | :param vocabulary_size: 52 | :return: 53 | """ 54 | 55 | if not os.path.exists(dir_path): 56 | os.makedirs(dir_path) 57 | 58 | def save_to_word2vec_model(vocabs_list): 59 | # print("Saving word2vec format ...") 60 | filewriter = open(os.path.join(dir_path, "word2vec.txt"), "w", encoding="utf-8") 61 | 62 | filewriter.write("%s %s\n" % (len(vocabs_list), len(final_embeddings[0]))) 63 | for word in vocabs_list: 64 | word_idx = vocabs_list.index(word) 65 | emb_vector = final_embeddings[word_idx] 66 | line = ' '.join(["%s" % (x) for x in emb_vector]) 67 | filewriter.write(word + " " + line + "\n") 68 | 69 | filewriter.close() 70 | # print("Done!") 71 | 72 | file_utils.save_obj(final_embeddings, os.path.join(dir_path, "final_embeddings")) 73 | # We don't need to save reversed_dictionary 74 | # file_utils.save_obj(reverse_dictionary, os.path.join(FLAGS.trained_models, "reversed_dictionary")) 75 | vocab_list = [reverse_dictionary[i] for i in range(vocabulary_size)] 76 | save_to_word2vec_model(vocab_list) 77 | file_utils.save_obj(vocab_list, os.path.join(dir_path, "words_dictionary")) 78 | 79 | 80 | def save_embedding_models(FLAGS, final_embeddings, reverse_dictionary, vocabulary_size): 81 | """ 82 | Keep for old implementation. 83 | :param FLAGS: 84 | :param final_embeddings: 85 | :param reverse_dictionary: 86 | :param vocabulary_size: 87 | :return: 88 | """ 89 | save_embedding_models_tofolder(FLAGS.trained_models, final_embeddings, 90 | reverse_dictionary, vocabulary_size) 91 | 92 | 93 | def reload_embeddings(trained_models_dir): 94 | """ 95 | Reload trained word-embedding model of the custom word2vec. 
96 |     :param trained_models_dir:
97 |     :return:
98 |     """
99 |     final_embeddings = file_utils.load_obj(os.path.join(trained_models_dir, "final_embeddings"))
100 |     # reverse_dictionary = file_utils.load_obj(os.path.join(trained_models_dir, "reversed_dictionary"))
101 |     reverse_dictionary = None
102 |     labels = file_utils.load_obj(os.path.join(trained_models_dir, "words_dictionary"))
103 |     return final_embeddings, reverse_dictionary, labels
104 |
105 |
106 | def create_single_utf8_file(input_dir, output_file):
107 |     import glob
108 |     # path = './wiki_data/*.txt'
109 |     # out = './wiki_all.vi.utf8.txt'
110 |     files = glob.glob(input_dir)
111 |     for file in files:
112 |         with open(output_file, "a", encoding="utf-8") as myfile:
113 |             with open(file, "r", encoding="utf-8", errors="ignore") as fp:
114 |                 for line in fp:
115 |                     line = line.strip().lower()
116 |                     # Python 3 strings are already unicode; no decode()/encode() round-trip needed.
117 |                     myfile.write(line + "\n")
118 |     print("done")
119 |
-------------------------------------------------------------------------------- /src/codes/etnlp_api.py: --------------------------------------------------------------------------------
1 | import argparse
2 | from api import embedding_preprocessing, embedding_evaluator, embedding_extractor, embedding_visualizer
3 | from visualizer import visualizer_sbs
4 | import logging
5 | import os
6 | from embeddings.embedding_configs import EmbeddingConfigs
7 | __version__ = "0.1.3"
8 |
9 |
10 | embedding_config = EmbeddingConfigs()
11 |
12 | if __name__ == "__main__":
13 |     """
14 |     ETNLP: a toolkit to evaluate, extract, and visualize multiple word embeddings
15 |     """
16 |
17 |     _desc = "Evaluates a word embedding model"
18 |     _parser = argparse.ArgumentParser(description=_desc)
19 |     _parser.add_argument("-input",
20 |                          required=True,
21 |                          default="../data/embedding_dicts/elmo_embeddings.txt",
22 |                          #
23 |                          help="Input embedding file(s) in word2vec format; separate multiple files with ';'")
24 |     _parser.add_argument("-analoglist",
25 |                          nargs="?",
26 |                          # default="../data/embedding_analogies/vi/analogy_vn_seg.txt.std.txt",
27 |                          default="./data/embedding_analogy/solveable_analogies_vi.txt",
28 |                          help="Word analogy test file")
29 |
30 |     _parser.add_argument("-args",
31 |                          nargs="?",
32 |                          default="eval",
33 |                          help="Task to run: eval, visualizer, extract, or glove2w2v")
34 |
35 |     _parser.add_argument("-lang",
36 |                          nargs="?",
37 |                          default="VI",
38 |                          help="Specify the language; by default, it's Vietnamese.")
39 |
40 |     _parser.add_argument("-vocab",
41 |                          nargs="?",
42 |                          default="../data/vocab.txt",
43 |                          help="Vocab to be extracted")
44 |
45 |     _parser.add_argument("-port",
46 |                          nargs="?",
47 |                          default=8889,
48 |                          help="Port for visualization")
49 |
50 |     _parser.add_argument("-input_c2v",
51 |                          nargs="?",
52 |                          default=None,
53 |                          help="C2V embedding")
54 |
55 |     _parser.add_argument("-output",
56 |                          nargs="?",
57 |                          default="../data/embedding_analogies/vi/results_out.txt",
58 |                          help="Output file of word analogy task")
59 |
60 |     _parser.add_argument("-output_format",
61 |                          nargs="?",
62 |                          default=".txt",
63 |                          help="Format of output file of the extracted embedding.")
64 |
65 |     _args = _parser.parse_args()
66 |
67 |     # Set logging level
68 |     logging.basicConfig(level=logging.INFO)
69 |     logging.disable(logging.INFO)
70 |     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '5'
71 |
72 |     input_embedding_files_str = _args.input
73 |     analoglist = _args.analoglist
74 |     is_vietnamese = _args.lang
75 |     output_files_str = _args.output
76 |     options_str = _args.args
77 |     vocab_file = _args.vocab
78 |     output_format = _args.output_format
79 |     port = _args.port
80 |
81 |     # By default, we process all embeddings as word2vec format.
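    # Example invocations (illustrative only; the file names below are placeholders):
    #   python etnlp_api.py -input "emb1.vec;emb2.vec" -analoglist analogies.txt -args eval -output results.txt
    #   python etnlp_api.py -input "emb1.vec;emb2.vec" -vocab vocab.txt -args extract -output multi.vec -output_format ".txt;.npz"
    #   python etnlp_api.py -input emb1.vec -args visualizer -port 8889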
82 | embedding_preprocessing.is_word2vec_format = True 83 | 84 | if options_str == 'eval': 85 | print("Starting evaluator ...") 86 | embedding_evaluator.evaluator_api(input_files=input_embedding_files_str, analoglist=analoglist, 87 | output=output_files_str) 88 | print("Done evaluator !") 89 | elif options_str == 'visualizer': 90 | print("Starting visualizer ...") 91 | embedding_visualizer.visualize_multiple_embeddings(input_embedding_files_str, port) 92 | print("Done visualizer !") 93 | elif options_str.startswith("extract"): 94 | print("Starting extractor ...") 95 | embedding_extractor.extract_embedding_for_vocab_file(input_embedding_files_str, vocab_file, 96 | _args.input_c2v, output_files_str, output_format) 97 | print("Done extractor !") 98 | elif options_str.startswith("glove2w2v"): 99 | print("Starting pre-processing: convert to word2vec format ...") 100 | embedding_config.is_word2vec_format = False 101 | if options_str.__contains__("do_normalize"): 102 | embedding_config.do_normalize_emb = True 103 | else: 104 | embedding_config.do_normalize_emb = False 105 | embedding_preprocessing.load_and_save_2_word2vec_models(input_embedding_files_str, 106 | output_files_str, 107 | embedding_config) 108 | 109 | else: 110 | print("Invalid options") 111 | 112 | print("Done!") 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /src/codes/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim==3.4.0 2 | scipy==1.1.0 3 | six==1.12.0 4 | setuptools==40.6.2 5 | tensorflow==1.12.0 6 | Flask==1.0.2 7 | tensorboard==1.12.0 8 | numpy==1.15.4 9 | scikit_learn==0.20.3 10 | typing==3.6.6 11 | -------------------------------------------------------------------------------- /src/codes/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from etnlp_api import __version__ 3 | 4 | 5 | with open("../../README.md", "r") as fh: 6 | long_description = fh.read() 7 | 8 | setup( 9 | name='ETNLP', 10 | version=__version__, 11 | # packages=['api', 'utils', 'embeddings', 'visualizer'], 12 | packages=find_packages(), 13 | py_modules=['etnlp_api'], 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | url='https://github.com/vietnlp/etnlp', 17 | license='MIT', 18 | author='vietnlp', 19 | author_email='sonvx.coltech@gmail.com', 20 | description='ETNLP: Embedding Toolkit for NLP Tasks' 21 | ) 22 | # from setuptools import setup, find_packages 23 | # import sys 24 | # 25 | # with open('requirements.txt') as f: 26 | # reqs = f.read() 27 | # setup( 28 | # name='ETNLP', 29 | # version='0.1.0', 30 | # description='ETNLP: Embedding Toolkit for NLP Tasks', 31 | # python_requires='>=3.5', 32 | # packages=find_packages(exclude=('data')), 33 | # install_requires=reqs.strip().split('\n'), 34 | # ) 35 | -------------------------------------------------------------------------------- /src/codes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/utils/__init__.py -------------------------------------------------------------------------------- /src/codes/utils/emb_utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics.pairwise import cosine_similarity 2 | from typing import Any, Iterable, 
List, Optional, Set, Tuple 3 | 4 | from utils.vectors import Vector 5 | from utils import vectors 6 | from utils.word import Word 7 | from utils import eval_utils 8 | from gensim import utils as genutils 9 | import logging 10 | import numpy as np 11 | from scipy import stats 12 | 13 | # Timing info for most_similar (100k words): 14 | # Original version: 7.3s 15 | # Normalized vectors: 3.4s 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def most_similar(base_vector: Vector, words: List[Word]) -> List[Tuple[float, Word]]: 20 | """Finds n words with smallest cosine similarity to a given word""" 21 | words_with_distance = [(vectors.cosine_similarity_normalized(base_vector, w.vector), w) for w in words] 22 | # We want cosine similarity to be as large as possible (close to 1) 23 | sorted_by_distance = sorted(words_with_distance, key=lambda t: t[0], reverse=True) 24 | # Sonvx: remove duplications (not understand why yet, probably because the w2v?) 25 | # sorted_by_distance = list(set(sorted_by_distance)) 26 | return sorted_by_distance 27 | 28 | 29 | def print_most_similar(words: List[Word], text: str) -> None: 30 | base_word = find_word(text, words) 31 | if not base_word: 32 | print("Unknown word: %s"%(text)) 33 | return 34 | print("Words related to %s:" % (base_word.text)) 35 | sorted_by_distance = [ 36 | word.text for (dist, word) in 37 | most_similar(base_word.vector, words) 38 | if word.text.lower() != base_word.text.lower() 39 | ] 40 | print(', '.join(sorted_by_distance[:10])) 41 | 42 | 43 | def read_word() -> str: 44 | return input("Type a word: ") 45 | 46 | 47 | def find_word(text: str, words: List[Word]) -> Optional[Word]: 48 | try: 49 | return next(w for w in words if text == w.text) 50 | except StopIteration: 51 | return None 52 | 53 | 54 | def closest_analogies_OLD( 55 | left2: str, left1: str, right2: str, words: List[Word] 56 | ) -> List[Tuple[float, Word]]: 57 | word_left1 = find_word(left1, words) 58 | word_left2 = find_word(left2, words) 59 | word_right2 = find_word(right2, words) 60 | if (not word_left1) or (not word_left2) or (not word_right2): 61 | return [] 62 | vector = vectors.add( 63 | vectors.sub(word_left1.vector, word_left2.vector), 64 | word_right2.vector) 65 | closest = most_similar(vector, words)[:10] 66 | 67 | def is_redundant(word: str) -> bool: 68 | """ 69 | Sometimes the two left vectors are so close the answer is e.g. 70 | "shirt-clothing is like phone-phones". Skip 'phones' and get the next 71 | suggestion, which might be more interesting. 72 | """ 73 | word_lower = word.lower() 74 | return ( 75 | left1.lower() in word_lower or 76 | left2.lower() in word_lower or 77 | right2.lower() in word_lower) 78 | 79 | closest_filtered = [(dist, w) for (dist, w) in closest if not is_redundant(w.text)] 80 | return closest_filtered 81 | 82 | 83 | def closest_analogies_vectors( 84 | word_left2: Word, word_left1: Word, word_right2: Word, words: List[Word]) \ 85 | -> List[Tuple[float, Word]]: 86 | """ 87 | Sonvx: 88 | :param word_left2: 89 | :param word_left1: 90 | :param word_right2: 91 | :param words: 92 | :param remove_redundancy: remove suggestions if they contain the given words. 93 | :return: 94 | """ 95 | # print(">>>> Remove redundancy = ", remove_redundancy) 96 | # input(">>>>") 97 | vector = vectors.add( 98 | vectors.sub(word_left1.vector, word_left2.vector), 99 | word_right2.vector) 100 | closest = most_similar(vector, words)[:10] 101 | 102 | def is_redundant(word: str) -> bool: 103 | """ 104 | Sometimes the two left vectors are so close the answer is e.g. 
105 | "shirt-clothing is like phone-phones". Skip 'phones' and get the next 106 | suggestion, which might be more interesting. 107 | """ 108 | word_lower = word.lower() 109 | return ( 110 | word_left1.text.lower() in word_lower or 111 | word_left2.text.lower() in word_lower or 112 | word_right2.text.lower() in word_lower) 113 | # It doesn't work this way for Vietnamese, so we try both of this to test for now 114 | if False: 115 | closest_filtered = [(dist, w) for (dist, w) in closest if not is_redundant(w.text)] 116 | else: 117 | closest_filtered = closest 118 | return closest_filtered 119 | 120 | 121 | def get_avg_vector(word, embedding_words): 122 | 123 | if " " in word: 124 | single_words = word.split(" ") 125 | list_vector = [] 126 | 127 | for single_word in single_words: 128 | word_vec = find_word(single_word, embedding_words) 129 | if word_vec: 130 | list_vector.append(word_vec.vector) 131 | else: 132 | # Try again with lowercase 133 | single_word = single_word.lower() 134 | word_vec = find_word(single_word, embedding_words) 135 | if word_vec: 136 | list_vector.append(word_vec.vector) 137 | 138 | # print("list_vector: ", list_vector) 139 | # input(">>>>>>>>") 140 | 141 | returned_Word = Word(word, vectors.mean_list(list_vector), 1) 142 | else: 143 | returned_Word = find_word(word, embedding_words) 144 | 145 | # print("Avg returned vector = ", returned_vector) 146 | # input(">>>>") 147 | 148 | return returned_Word 149 | 150 | 151 | def run_paired_ttests(all_map_arr, embedding_names): 152 | """ 153 | Run Paired t-tests on MAP results 154 | :param all_map_arr: 155 | :param embedding_names: 156 | :return: 157 | """ 158 | str_out = "" 159 | num_embs = len(all_map_arr) 160 | 161 | # Verify to make sure they have the same length 162 | if all_map_arr and embedding_names: 163 | for i in range(0, num_embs - 1): 164 | for j in range(i + 1, num_embs): 165 | if len(all_map_arr[i]) != len(all_map_arr[j]): 166 | raise Exception("Two embedding (%s, %s) have different MAP list, sizes: %s vs. %s" 167 | % (embedding_names[i], embedding_names[j], len(all_map_arr[i]), len(all_map_arr[j]))) 168 | else: 169 | logging.error("Inputs are NULL") 170 | 171 | result_str_ttest_arr = [] 172 | for i in range(0, num_embs - 1): 173 | for j in range(i + 1, num_embs): 174 | stat_test_ret = stats.ttest_rel(all_map_arr[i], all_map_arr[j]) 175 | # if stat_test_ret.pvalue >= 0.05: 176 | result = "%s vs. 
%s: %s" % (embedding_names[i], embedding_names[j], stat_test_ret) 177 | str_out += result + "\n" 178 | 179 | return str_out 180 | 181 | 182 | def eval_word_analogy_4_all_embeddings(word_analogies_file, embedding_names: List[str], 183 | word_embeddings: List[List[Word]], output_file): 184 | """ 185 | Run word analogy for all embeddings 186 | :param word_analogies_file: 187 | :param embedding_names: 188 | :param word_embeddings: 189 | :param output_file: 190 | :return: 191 | """ 192 | fwriter = open(output_file, "w") 193 | idx = 0 194 | all_map_arr = [] 195 | console_output_str = "" 196 | category = ": | Word Analogy Task results\n" 197 | fwriter.write(category) 198 | console_output_str += category 199 | 200 | for word_embedding in word_embeddings: 201 | embedding_name = embedding_names[idx] 202 | map_at_10, map_arr, result_str = eval_word_analogies(word_analogies_file, word_embedding, embedding_name) 203 | all_map_arr.append(map_arr) 204 | meta_info = "\nEmbedding: %s"%(embedding_names[idx]) 205 | fwriter.write(meta_info + "\n") 206 | fwriter.write(result_str) 207 | fwriter.write("MAP_arr = %s"%(map_arr)) 208 | fwriter.write("MAP@10 = %s" % (map_at_10)) 209 | fwriter.flush() 210 | console_output_str += meta_info + "\n" + "MAP@10 = %s" % (map_at_10) + "\n" 211 | idx += 1 212 | 213 | # Getting significant Paired t-tests 214 | category = "\n: | Paired t-tests results\n" 215 | fwriter.write(category) 216 | console_output_str += category 217 | ttests_result = run_paired_ttests(all_map_arr, embedding_names) 218 | console_output_str += ttests_result 219 | fwriter.write(ttests_result) 220 | fwriter.flush() 221 | fwriter.close() 222 | 223 | return console_output_str 224 | 225 | 226 | def eval_word_analogies(word_analogies_file, words: List[Word], embedding_name): 227 | """ 228 | Sonvx: Evaluate word analogy for one embedding. 
229 | :param word_analogies_file: 230 | :param words: 231 | :return: 232 | """ 233 | # input("GO checking >>>>") 234 | oov_counter, idx_cnt, is_vn_counter, phrase_cnt = 0, -1, 0, 0 235 | sections, section = [], None 236 | # map_arr = [] 237 | out_str = "" 238 | map_ret_dict = {} 239 | 240 | for line_no, line in enumerate(genutils.smart_open(word_analogies_file)): 241 | # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed 242 | line = genutils.to_unicode(line) 243 | line = line.rstrip() 244 | if line.startswith(': |'): 245 | # a new section starts => store the old section 246 | if section: 247 | sections.append(section) 248 | section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} 249 | else: 250 | # Count number of analogy to check 251 | idx_cnt += 1 252 | 253 | # Set default map value 254 | map_ret_dict[idx_cnt] = 0.0 255 | 256 | if not section: 257 | raise ValueError("missing section header before line #%i in %s" % (line_no, word_analogies_file)) 258 | try: 259 | # a - b + c = expected 260 | # Input: Baghdad | Irac | Bangkok | Thai_Lan 261 | # Baghdad - Irac = Bangkok - Thai_Lan 262 | # -> Baghdad - Irac + Thai_Lan = Bangkok 263 | # => 264 | a, b, expected, c = [word for word in line.split(" | ")] 265 | except ValueError: 266 | logger.debug("SVX: ERROR skipping invalid line #%i in %s", line_no, word_analogies_file) 267 | print("Line : ", line) 268 | print("a, b, c, expected: %s, %s, %s, %s" % (a, b, c, expected)) 269 | # input(">>> Wait ...") 270 | continue 271 | 272 | # In case of Vietnamese, word analogy can be a phrase 273 | if " " in expected: 274 | print("INFO: we don't support to find word analogies for phrase for NOW.") 275 | phrase_cnt += 1 276 | continue 277 | elif " " in a or " " in b or " " in c: 278 | is_vn_counter += 1 279 | word_left1 = get_avg_vector(a, words) 280 | word_left2 = get_avg_vector(b, words) 281 | word_right2 = get_avg_vector(c, words) 282 | else: 283 | word_left1 = find_word(a, words) 284 | word_left2 = find_word(b, words) 285 | word_right2 = find_word(c, words) 286 | 287 | if (not word_left1) or (not word_left2) or (not word_right2): 288 | logger.debug("SVX: skipping line #%i with OOV words: %s", line_no, line.strip()) 289 | oov_counter += 1 290 | continue 291 | 292 | # Write solable analogy to a file 293 | # fsolveable_writer.write(line + "\n") 294 | 295 | logger.debug("word_left1 = %s", word_left1.text) 296 | logger.debug("word_left2 = %s", word_left2.text) 297 | logger.debug("word_right2 = %s", word_right2.text) 298 | 299 | # Start finding close word: 300 | # Note: we can only find 1 expected word in Vietnamese for NOW 301 | top10_candidate = closest_analogies_vectors(word_left2, word_left1, 302 | word_right2, words) 303 | list_candidate_arr = [] 304 | for tuple in top10_candidate: 305 | list_candidate_arr.append(tuple[1].text) 306 | 307 | logger.debug("Expected Word: %s, candidate = %s" % (expected, list_candidate_arr)) 308 | # input(">>>>>") 309 | # Calculate MAP@10 score 310 | this_map_result = eval_utils.mapk(expected, list_candidate_arr, word_level=True) 311 | if this_map_result >= 0: 312 | this_map_result = round(this_map_result, 6) 313 | # map_arr[idx_cnt] = this_map_result 314 | else: 315 | this_map_result = 0.0 316 | # map_arr.append(0.0) 317 | # map_arr[idx_cnt] = this_map_result 318 | 319 | map_ret_dict[idx_cnt] = this_map_result 320 | 321 | if expected in list_candidate_arr: 322 | section['correct'].append((a, b, c, expected)) 323 | out_line = "%s - %s + %s = ?; Expect: %s, candidate: %s" % \ 324 | 
(word_left1, word_left2, word_right2, expected, list_candidate_arr) 325 | out_str += out_line + "\n" 326 | 327 | # else: 328 | # section['incorrect'].append((a, b, c, expected)) 329 | 330 | # fsolveable_writer.close() 331 | if section: 332 | # store the last section, too 333 | sections.append(section) 334 | 335 | map_arr = list(map_ret_dict.values()) 336 | logger.debug("map_arr = ", map_arr) 337 | logger.debug("MAP_RET_DICT = ", map_ret_dict) 338 | # input("Check result dict: >>>>>") 339 | 340 | total = { 341 | "Emb_Name: " + embedding_name + '/OOV/Total/VN_Solveable_Cases/VN_Phrase_Target': 342 | [oov_counter, (idx_cnt + 1), is_vn_counter, phrase_cnt], 343 | 'MAP@10': np.mean(map_arr) 344 | # , 345 | # 'section': 'total' 346 | # , 347 | # 'correct': sum((s['correct'] for s in sections), []), 348 | # 'incorrect': sum((s['incorrect'] for s in sections), []), 349 | } 350 | # print (out_str) 351 | # print(total) 352 | # logger.info(total) 353 | 354 | sections.append(total) 355 | sections_str = "\n%s\n" % sections 356 | 357 | return np.mean(map_arr), map_arr, sections_str 358 | 359 | 360 | def print_analogy(left2: str, left1: str, right2: str, words: List[Word]) -> None: 361 | analogies = closest_analogies_OLD(left2, left1, right2, words) 362 | if (len(analogies) == 0): 363 | # print(f"{left2}-{left1} is like {right2}-?") 364 | print("%s-%s is like %s-?"%(left2, left1, right2)) 365 | # man-king is like woman-king 366 | # input: man is to king is like woman is to ___?(queen). 367 | else: 368 | (dist, w) = analogies[0] 369 | # alternatives = ', '.join([f"{w.text} ({dist})" for (dist, w) in analogies]) 370 | # print(f"{left2}-{left1} is like {right2}-{w.text}") 371 | print("%s-%s is like %s-%s"%(left2, left1, right2, w.text)) 372 | 373 | -------------------------------------------------------------------------------- /src/codes/utils/embedding_io.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Set 2 | 3 | from itertools import groupby 4 | import numpy as np 5 | import re 6 | import utils.vectors as v 7 | from utils.word import Word 8 | import logging 9 | import os 10 | from embeddings.embedding_configs import EmbeddingConfigs 11 | 12 | 13 | def save_model_to_file(embedding_model: List[Word], model_file_out: str): 14 | """ 15 | Save loaded model back to file (to remove duplicated items). 
16 |     :param embedding_model:
17 |     :param model_file_out:
18 |     :return:
19 |     """
20 |     fwriter = open(model_file_out, "w")
21 |
22 |     meta_data = "%s %s\n" % (len(embedding_model), len(embedding_model[0].vector))
23 |     fwriter.write(meta_data)
24 |     fwriter.flush()
25 |     for w_Word in embedding_model:
26 |         line = w_Word.text + " " + " ".join(str(scalar) for scalar in w_Word.vector.tolist())
27 |         fwriter.write(line + "\n")
28 |         fwriter.flush()
29 |     fwriter.close()
30 |
31 |
32 | def load_word_embeddings(file_paths: str, emb_config: EmbeddingConfigs) -> List[List[Word]]:
33 |     """
34 |     Sonvx: load multiple embeddings separated by ";", e.g., "emb1.vec;emb2.vec" (illustrative names).
35 |     :param file_paths:
36 |     :param emb_config:
37 |     :return:
38 |     """
39 |     embedding_models = []
40 |     embedding_names = []
41 |     if file_paths and file_paths.__contains__(";"):
42 |         files = file_paths.split(";")
43 |         for emb_file in files:
44 |             word_embedding = load_word_embedding(emb_file.replace("\"", ""), emb_config)
45 |             embedding_name = os.path.basename(os.path.normpath(emb_file))
46 |             embedding_models.append(word_embedding)
47 |             embedding_names.append(embedding_name)
48 |     else:  # single file: keep the same (names, models) return contract
49 |         return [os.path.basename(os.path.normpath(file_paths))], [load_word_embedding(file_paths, emb_config)]
50 |
51 |     return embedding_names, embedding_models
52 |
53 |
54 | def load_word_embedding(file_path: str, emb_config: EmbeddingConfigs) -> List[Word]:
55 |     """
56 |     Load and cleanup the data.
57 |     :param file_path:
58 |     :param emb_config:
59 |     :return:
60 |     """
61 |     # print(f"Loading {file_path}...")
62 |     print("Loading %s ..." % (file_path))
63 |     words = load_words_raw(file_path, emb_config)
64 |     # print(f"Loaded {len(words)} words.")
65 |     print("Loaded %s words." % (len(words)))
66 |
67 |     # Test
68 |     word1 = words[1]
69 |     print("Vec Len(word1) = ", len(word1.vector))
70 |
71 |     # num_dimensions = most_common_dimension(words)
72 |     # words = [w for w in words if len(w.vector) == dims]
73 |     # print(f"Using {num_dimensions}-dimensional vectors, {len(words)} remain.")
74 |
75 |     # words = remove_stop_words(words)
76 |     # print(f"Removed stop words, {len(words)} remain.")
77 |
78 |     # words = remove_duplicates(words)
79 |     # print(f"Removed duplicates, {len(words)} remain.")
80 |
81 |     logging.debug("Embedding words: %s", words[:10])
82 |     print("Emb_vocab_size = ", len(words))
83 |     # input("Done loading embedding: >>>>")
84 |
85 |     return words
86 |
87 |
88 | def load_words_raw(file_path: str, emb_config: EmbeddingConfigs) -> List[Word]:
89 |     """
90 |     Load the file as-is, without doing any validation or cleanup.
91 |     :param file_path:
92 |     :param emb_config:
93 |     :return:
94 |     """
95 |
96 |     def parse_line(line: str, frequency: int) -> Word:
97 |         # print("Line=", line)
98 |         tokens = line.split(" ")
99 |         word = tokens[0]
100 |         if emb_config.do_normalize_emb:
101 |             vector = v.normalize(np.array([float(x) for x in tokens[1:]]))
102 |         else:
103 |             vector = np.array([float(x) for x in tokens[1:]])
104 |         return Word(word, vector, frequency)
105 |
106 |     # Sonvx: do NOT load the same word twice.
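    # (Illustrative) input layout this loader expects -- the numbers are made up:
    #   400000 300            <- header line "vocab_size emb_dim" (word2vec text format)
    #   word_1 0.12 -0.03 ... <- then one word per line, followed by emb_dim floats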
107 |
108 |     unique_dict = {}
109 |
110 |     words = []
111 |     # Words are sorted from the most common to the least common ones
112 |     frequency = 1
113 |
114 |     duplicated_entry = 0
115 |
116 |     idx_counter, vocab_size, emb_dim = 0, 0, 0
117 |     with open(file_path, encoding="utf-8") as f:
118 |         for line in f:
119 |             line = line.rstrip()
120 |
121 |             # print("Processing line: ", line)
122 |
123 |             if idx_counter == 0 and emb_config.is_word2vec_format:
124 |                 try:
125 |                     meta_info = line.split(" ")
126 |                     vocab_size = int(meta_info[0])
127 |                     emb_dim = int(meta_info[1])
128 |                     idx_counter += 1
129 |                     continue
130 |                 except Exception as e:
131 |                     print("meta_info = %s" % (meta_info))
132 |                     logging.error("Input embedding has format issue: Error = %s" % (e))
133 |
134 |             # if len(line) < 20:  # Ignore the first line of w2v format.
135 |             #     continue
136 |
137 |             w = parse_line(line, frequency)
138 |
139 |             # Svx: only load the word if it does not already exist in the list.
140 |             if w.text not in unique_dict:
141 |                 unique_dict[w.text] = frequency
142 |                 words.append(w)
143 |                 frequency += 1
144 |             else:
145 |                 duplicated_entry += 1
146 |                 # print("Loading the same word again")
147 |
148 |             # Svx: check that the embedding dim matches the metadata (spot check on one line only).
149 |             if idx_counter == 10:
150 |                 if len(w.vector) != emb_dim:
151 |                     message = "Metadata and the real vector size do not match: meta:real = %s:%s" \
152 |                               % (emb_dim, len(w.vector))
153 |                     logging.error(message)
154 |                     raise ValueError(message)
155 |             idx_counter += 1
156 |
157 |     if duplicated_entry > 0:
158 |         logging.debug("Loading the same word again: %s" % (duplicated_entry))
159 |
160 |     # Final check:
161 |     if (frequency - 1) != vocab_size:
162 |         msg = "Loaded %s/%s unique vocab." % ((frequency - 1), vocab_size)
163 |         logging.info(msg)
164 |
165 |     return words
166 |
167 |
168 | def iter_len(iter: Iterable[complex]) -> int:
169 |     return sum(1 for _ in iter)
170 |
171 |
172 | def most_common_dimension(words: List[Word]) -> int:
173 |     """
174 |     There is a line in the input file which is missing a word
175 |     (search -0.0739, -0.135, 0.0584).
176 |     """
177 |     lengths = sorted([len(word.vector) for word in words])
178 |     dimensions = [(k, iter_len(v)) for k, v in groupby(lengths)]
179 |     print("Dimensions:")
180 |     for (dim, num_vectors) in dimensions:
181 |         # print(f"{num_vectors} {dim}-dimensional vectors")
182 |         print("%s %s-dimensional vectors" % (num_vectors, dim))
183 |     most_common = sorted(dimensions, key=lambda t: t[1], reverse=True)[0]
184 |     return most_common[0]
185 |
186 |
187 | # We want to ignore these characters,
188 | # so that e.g. "U.S.", "U.S", "US_" and "US" are the same word.
189 | ignore_char_regex = re.compile("[\W_]") 190 | 191 | # Has to start and end with an alphanumeric character 192 | is_valid_word = re.compile("^[^\W_].*[^\W_]$") 193 | 194 | 195 | def remove_duplicates(words: List[Word]) -> List[Word]: 196 | seen_words: Set[str] = set() 197 | unique_words: List[Word] = [] 198 | for w in words: 199 | canonical = ignore_char_regex.sub("", w.text) 200 | if not canonical in seen_words: 201 | seen_words.add(canonical) 202 | # Keep the original ordering 203 | unique_words.append(w) 204 | return unique_words 205 | 206 | 207 | def remove_stop_words(words: List[Word]) -> List[Word]: 208 | return [w for w in words if ( 209 | len(w.text) > 1 and is_valid_word.match(w.text))] 210 | 211 | 212 | # Run "smoke tests" on import 213 | assert [w.text for w in remove_stop_words([ 214 | Word('a', [], 1), 215 | Word('ab', [], 1), 216 | Word('-ab', [], 1), 217 | Word('ab_', [], 1), 218 | Word('a.', [], 1), 219 | Word('.a', [], 1), 220 | Word('ab', [], 1), 221 | ])] == ['ab', 'ab'] 222 | assert [w.text for w in remove_duplicates([ 223 | Word('a.b', [], 1), 224 | Word('-a-b', [], 1), 225 | Word('ab_+', [], 1), 226 | Word('.abc...', [], 1), 227 | ])] == ['a.b', '.abc...'] 228 | -------------------------------------------------------------------------------- /src/codes/utils/eval_utils.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | MAP@K word level and character level are explained in detail in this paper: 4 | 5 | dpUGC: Learn Differentially Private Representationfor User Generated Contents 6 | Xuan-Son Vu, Son N. Tran, Lili Jiang 7 | In: Proceedings of the 20th International Conference on Computational Linguistics and 8 | Intelligent Text Processing, April, 2019, (to appear) 9 | 10 | Please cite the above paper if you use codes in this file. 11 | 12 | """ 13 | 14 | 15 | def apk(actual, predicted, k=10): 16 | """ 17 | Computes the average precision at k. 18 | This function computes the average prescision at k between two lists of 19 | items. 20 | Parameters 21 | ---------- 22 | actual : list 23 | A list of elements that are to be predicted (order doesn't matter) 24 | predicted : list 25 | A list of predicted elements (order does matter) 26 | k : int, optional 27 | The maximum number of predicted elements 28 | Returns 29 | ------- 30 | score : double 31 | The average precision at k over the input lists 32 | """ 33 | if len(predicted) > k: 34 | predicted = predicted[:k] 35 | 36 | score = 0.0 37 | num_hits = 0.0 38 | 39 | for i, p in enumerate(predicted): 40 | if p in actual and p not in predicted[:i]: 41 | num_hits += 1.0 42 | score += num_hits / (i + 1.0) 43 | 44 | if not actual: 45 | return 0.0 46 | 47 | return score / min(len(actual), k) 48 | 49 | 50 | def mapk(actual, predicted, k=10, word_level=True): 51 | """ 52 | Computes the mean average precision at k. 53 | This function computes the mean average prescision at k between two lists 54 | of lists of items. 
55 |     Parameters
56 |     ----------
57 |     actual : list
58 |         A list of lists of elements that are to be predicted
59 |         (order doesn't matter in the lists)
60 |     predicted : list
61 |         A list of lists of predicted elements
62 |         (order matters in the lists)
63 |     k : int, optional
64 |         The maximum number of predicted elements
65 |     Returns
66 |     -------
67 |     score : double
68 |         The mean average precision at k over the input lists
69 |     """
70 |     # print("Sending arr = ", arr)
71 |     if word_level:
72 |         return calc_map(actual, predicted, topK=k)
73 |     else:
74 |         # arr = [apk(a, p, k) for a, p in zip(actual, predicted)]
75 |         # return np.mean(arr)
76 |         return calc_map_character_level(actual, predicted, topK=k)
77 |
78 |
79 | def calc_map(actual, predicted, topK=10):
80 |     """
81 |     Word-level MAP@K: mean of the precision values at each hit among the top-K predictions.
82 |     :param actual:
83 |     :param predicted:
84 |     :param topK:
85 |     :return:
86 |     """
87 |     # print("Input: actual %s, predicted %s" % (actual, predicted))
88 |     if len(predicted) > topK:
89 |         predicted = predicted[:topK]
90 |     idx = 1
91 |     hit = 0
92 |     map_arr = []
93 |     for answer in predicted:
94 |         if answer in actual[:topK]:
95 |             hit += 1
96 |             val = (hit * 1.0) / (idx * 1.0)
97 |             # print("hit = %s, idx = %s" % (hit, idx))
98 |             map_arr.append(val)
99 |             # print("hit: %s, map_arr = %s" % (answer, map_arr))
100 |         idx += 1
101 |     # print("map_arr = %s done", map_arr)
102 |     if len(map_arr) > 0:
103 |         return np.mean(map_arr)
104 |     else:
105 |         return 0.0
106 |
107 |
108 | def calc_map_character_level(actual, predicted, topK=10):
109 |     """
110 |     Character-level MAP@K over the concatenated character sequences.
111 |     :param actual:
112 |     :param predicted:
113 |     :param topK:
114 |     :return:
115 |     """
116 |     # print("Input: actual %s, predicted %s" % (actual, predicted))
117 |     if len(predicted) > topK:
118 |         predicted = predicted[:topK]
119 |
120 |     if len(actual) > topK:
121 |         actual = actual[:topK]
122 |
123 |     rank = 1
124 |     hit = 0
125 |     actual_seq = ''.join([word for word in actual])
126 |     predicted_seq = ''.join([word for word in predicted])
127 |     map_arr = []
128 |     for char in predicted_seq:
129 |         if char in actual_seq[:rank]:
130 |             hit += 1
131 |             val = (hit * 1.0) / (rank * 1.0)
132 |             # print("hit = %s, idx = %s" % (hit, rank))
133 |             map_arr.append(val)
134 |             # print("hit: %s, map_arr = %s" % (char, map_arr))
135 |         rank += 1
136 |     # print("map_arr = %s done", map_arr)
137 |     return np.mean(map_arr)
138 |
139 |
140 | import unittest
141 | import numpy as np
142 |
143 |
144 | class TestMapK(unittest.TestCase):
145 |     # Legacy tests written for the classic list-based apk/mapk (see the commented-out
146 |     # branch in mapk above); kept for reference and not wired up by default.
147 |     def test_apk(self):
148 |         self.assertAlmostEqual(apk(list(range(1, 6)), [6, 4, 7, 1, 2], 2), 0.25)
149 |         self.assertAlmostEqual(apk(list(range(1, 6)), [1, 1, 1, 1, 1], 5), 0.2)
150 |         predicted = list(range(1, 21))
151 |         predicted.extend(range(200, 600))
152 |         self.assertAlmostEqual(apk(list(range(1, 100)), predicted, 20), 1.0)
153 |
154 |     def test_mapk(self):
155 |         self.assertAlmostEqual(mapk([list(range(1, 5))], [list(range(1, 5))], 3), 1.0)
156 |         self.assertAlmostEqual(mapk([[1, 3, 4], [1, 2, 4], [1, 3]],
157 |                                     [list(range(1, 6)), list(range(1, 6)), list(range(1, 6))], 3), 0.685185185185185)
158 |         self.assertAlmostEqual(mapk([list(range(1, 6)), list(range(1, 6))],
159 |                                     [[6, 4, 7, 1, 2], [1, 1, 1, 1, 1]], 5), 0.26)
160 |         self.assertAlmostEqual(mapk([[1, 3], [1, 2, 3], [1, 2, 3]],
161 |                                     [list(range(1, 6)), [1, 1, 1], [1, 2, 1]], 3), 11.0 / 18)
162 |
163 |
164 | if __name__ == '__main__':
165 |     a1 = ["1", '2', '3', '4']
166 |     b1 = ['1', '5', '2', '8']
167 |     print(mapk(a1, b1, 4))
168 |
169 |     a1 = ["15"]
170 |     b1 = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
171 |
172 |     print("MapK:", mapk(a1, b1, 4))
173 |
174 |     # unittest.main()
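As a quick sanity check of the word-level scoring above, here is a minimal usage sketch (assuming it is run from `src/codes` so the `utils` package is importable; the words are made-up examples):

```python
from utils.eval_utils import mapk

# The expected answer "queen" sits at rank 2 of the 10 candidates, so the single
# hit contributes a precision of 1/2 and the word-level MAP@10 is 0.5.
expected = ["queen"]
candidates = ["king", "queen", "prince", "duke", "empress",
              "monarch", "throne", "royal", "kingdom", "crown"]
print(mapk(expected, candidates, k=10))  # -> 0.5
```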
-------------------------------------------------------------------------------- /src/codes/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | 4 | def save_obj(obj, file_path): 5 | with open(file_path + '.pkl', 'wb') as f: 6 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 7 | 8 | 9 | def load_obj(file_path): 10 | with open(file_path + '.pkl', 'rb') as f: 11 | return pickle.load(f) 12 | 13 | 14 | def get_unique_vocab(analogy_file_path, write_out_file): 15 | """ 16 | 17 | :param analogy_file_path: 18 | :param write_out_file: 19 | :return: 20 | """ 21 | vocab_dict = {} 22 | with open(analogy_file_path, "r") as freader: 23 | for line in freader: 24 | if line.__contains__(" | "): 25 | word_parts = line.split(" | ") 26 | for word in word_parts: 27 | word = word.rstrip() 28 | vocab_dict[word] = 0 29 | 30 | fwriter = open(write_out_file, "w") 31 | for word in vocab_dict.keys(): 32 | fwriter.write(word + "\n") 33 | fwriter.close() 34 | 35 | print("Write dictionary file to %s"%(write_out_file)) 36 | 37 | return vocab_dict 38 | 39 | 40 | if __name__ == '__main__': 41 | get_unique_vocab("../data/embedding_analogies/portuguese/LX-4WAnalogies-ETNLP.txt", 42 | "../data/embedding_analogies/portuguese/vocab.txt") 43 | -------------------------------------------------------------------------------- /src/codes/utils/string_utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def convert_to_unicode(text): 5 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 6 | if six.PY3: 7 | if isinstance(text, str): 8 | return text 9 | elif isinstance(text, bytes): 10 | return text.decode("utf-8", "ignore") 11 | else: 12 | raise ValueError("Unsupported string type: %s" % (type(text))) 13 | elif six.PY2: 14 | if isinstance(text, str): 15 | return text.decode("utf-8", "ignore") 16 | elif isinstance(text, unicode): 17 | return text 18 | else: 19 | raise ValueError("Unsupported string type: %s" % (type(text))) 20 | else: 21 | raise ValueError("Not running on Python2 or Python 3?") 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/codes/utils/vectors.py: -------------------------------------------------------------------------------- 1 | from typing import List, Any, Optional 2 | 3 | import math 4 | import numpy as np 5 | 6 | # Adopt from https://github.com/mkonicek/nlp/vecters.py 7 | 8 | # Vector = np.ndarray[float] 9 | Vector = 'np.ndarray[float]' 10 | vector_type = 'np.ndarray[float]' 11 | 12 | # Vector = np.ndarray(dtype=float) 13 | 14 | 15 | def l2_len(v: vector_type) -> float: 16 | return math.sqrt(np.dot(v, v)) 17 | 18 | 19 | def dot(v1: vector_type, v2: vector_type) -> float: 20 | assert v1.shape == v2.shape 21 | return np.dot(v1, v2) 22 | 23 | 24 | def mean(v1: vector_type, v2: vector_type) -> Vector: 25 | """ 26 | Added by Sonvx: get mean of 2 vectors. 27 | :param v1: 28 | :param v2: 29 | :return: 30 | """ 31 | assert v1.shape == v2.shape 32 | return np.mean([v1, v2], axis=0) 33 | 34 | 35 | def mean_list(v1: List[Vector]) -> Vector: 36 | """ 37 | Added by Sonvx: get mean of 2 vectors. 
38 |     :param v1:
39 |     :return: the mean vector, or None for an empty list.
40 |     """
41 |     if len(v1) > 0:
42 |         return np.mean(v1, axis=0)
43 |     else:
44 |         return None
45 |
46 |
47 | def add(v1: vector_type, v2: vector_type) -> Vector:
48 |     assert v1.shape == v2.shape
49 |     return np.add(v1, v2)
50 |
51 |
52 | def sub(v1: vector_type, v2: vector_type) -> Vector:
53 |     assert v1.shape == v2.shape
54 |     return np.subtract(v1, v2)
55 |
56 |
57 | def normalize(v: vector_type) -> Vector:
58 |     return v / l2_len(v)
59 |
60 |
61 | def cosine_similarity_normalized(v1: vector_type, v2: vector_type) -> float:
62 |     """
63 |     Returns the cosine of the angle between the two vectors.
64 |     Each of the vectors must have length (L2-norm) equal to 1.
65 |     Results range from -1 (very different) to 1 (very similar).
66 |     """
67 |     return dot(v1, v2)
68 |
-------------------------------------------------------------------------------- /src/codes/utils/word.py: --------------------------------------------------------------------------------
1 | from typing import List
2 | from utils.vectors import Vector
3 |
4 | # Adopted from https://github.com/mkonicek/nlp/Word.py
5 |
6 |
7 | class Word:
8 |     """A single word (one line of the input vector embedding file)"""
9 |
10 |     def __init__(self, text: str, vector: Vector, frequency: int) -> None:
11 |         self.text = text
12 |         self.vector = vector
13 |         self.frequency = frequency
14 |
15 |     def __repr__(self) -> str:
16 |         vector_preview = ', '.join(map(str, self.vector[:2]))
17 |         # return f"{self.text} [{vector_preview}, ...]"
18 |         return "%s [%s, ...]" % (self.text, vector_preview)
19 |
-------------------------------------------------------------------------------- /src/codes/visualizer/README.md: --------------------------------------------------------------------------------
1 | # Requirements:
2 | - ```pip install gensim flask```
3 | - Download any pre-trained embedding and set its path in ../03.run_etnlp_visualizer_inter.sh
4 |
5 | # How to run
6 | > 1. sh ../03.run_etnlp_visualizer_inter.sh
7 | > 2.
Visit http://localhost:8089 8 | 9 | # Screenshot 10 | 11 | ![Alt text](https://github.com/vietnlp/etnlp/blob/master/images/etnlp_view_multi_embeddings.png "Screenshot example of one given input") 12 | 13 | -------------------------------------------------------------------------------- /src/codes/visualizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/__init__.py -------------------------------------------------------------------------------- /src/codes/visualizer/data/vnex.model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/data/vnex.model.bin -------------------------------------------------------------------------------- /src/codes/visualizer/images/w2v_vn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/images/w2v_vn.png -------------------------------------------------------------------------------- /src/codes/visualizer/images/w2v_vn_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/images/w2v_vn_2.png -------------------------------------------------------------------------------- /src/codes/visualizer/outof_w2vec.dict: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 'news' 12 | 13 | 14 | 15 | 16 | 17 | 'news' 18 | 19 | 20 | 21 | 22 | 23 | 'news' 24 | 25 | 26 | 27 | 28 | 29 | 'news' 30 | 31 | 32 | 33 | 34 | 35 | 'news' 36 | 37 | 38 | 39 | 40 | 41 | 42 | 'news' 43 | 44 | 45 | 46 | 47 | 48 | 'news' 49 | 50 | 'back' 51 | 'back' 52 | 'back' 53 | 'back' 54 | 'news' 55 | 'news' 56 | 'back' 57 | 'back' 58 | 'back' 59 | 'back' 60 | 'news' 61 | 'news' 62 | 'back' 63 | 'back' 64 | 'back' 65 | 'back' 66 | 'news' 67 | 'news' 68 | 'lovely' 69 | 'lovely' 70 | 'lovely' 71 | 'lovely' 72 | 'love' 73 | 'love' 74 | -------------------------------------------------------------------------------- /src/codes/visualizer/static/bootstrap-theme.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap v3.3.5 (http://getbootstrap.com) 3 | * Copyright 2011-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | /*! 8 | * Generated using the Bootstrap Customizer (http://getbootstrap.com/customize/?id=6d5e1954144aa5c7842c) 9 | * Config saved to config.json and https://gist.github.com/6d5e1954144aa5c7842c 10 | *//*! 11 | * Bootstrap v3.3.5 (http://getbootstrap.com) 12 | * Copyright 2011-2015 Twitter, Inc. 
13 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 14 | */.btn-default,.btn-primary,.btn-success,.btn-info,.btn-warning,.btn-danger{text-shadow:0 -1px 0 rgba(0,0,0,0.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 1px rgba(0,0,0,0.075)}.btn-default:active,.btn-primary:active,.btn-success:active,.btn-info:active,.btn-warning:active,.btn-danger:active,.btn-default.active,.btn-primary.active,.btn-success.active,.btn-info.active,.btn-warning.active,.btn-danger.active{-webkit-box-shadow:inset 0 3px 5px rgba(0,0,0,0.125);box-shadow:inset 0 3px 5px rgba(0,0,0,0.125)}.btn-default.disabled,.btn-primary.disabled,.btn-success.disabled,.btn-info.disabled,.btn-warning.disabled,.btn-danger.disabled,.btn-default[disabled],.btn-primary[disabled],.btn-success[disabled],.btn-info[disabled],.btn-warning[disabled],.btn-danger[disabled],fieldset[disabled] .btn-default,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-success,fieldset[disabled] .btn-info,fieldset[disabled] .btn-warning,fieldset[disabled] .btn-danger{-webkit-box-shadow:none;box-shadow:none}.btn-default .badge,.btn-primary .badge,.btn-success .badge,.btn-info .badge,.btn-warning .badge,.btn-danger .badge{text-shadow:none}.btn:active,.btn.active{background-image:none}.btn-default{background-image:-webkit-linear-gradient(top, #fff 0, #e0e0e0 100%);background-image:-o-linear-gradient(top, #fff 0, #e0e0e0 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fff), to(#e0e0e0));background-image:linear-gradient(to bottom, #fff 0, #e0e0e0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe0e0e0', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#dbdbdb;text-shadow:0 1px 0 #fff;border-color:#ccc}.btn-default:hover,.btn-default:focus{background-color:#e0e0e0;background-position:0 -15px}.btn-default:active,.btn-default.active{background-color:#e0e0e0;border-color:#dbdbdb}.btn-default.disabled,.btn-default[disabled],fieldset[disabled] .btn-default,.btn-default.disabled:hover,.btn-default[disabled]:hover,fieldset[disabled] .btn-default:hover,.btn-default.disabled:focus,.btn-default[disabled]:focus,fieldset[disabled] .btn-default:focus,.btn-default.disabled.focus,.btn-default[disabled].focus,fieldset[disabled] .btn-default.focus,.btn-default.disabled:active,.btn-default[disabled]:active,fieldset[disabled] .btn-default:active,.btn-default.disabled.active,.btn-default[disabled].active,fieldset[disabled] .btn-default.active{background-color:#e0e0e0;background-image:none}.btn-primary{background-image:-webkit-linear-gradient(top, #337ab7 0, #265a88 100%);background-image:-o-linear-gradient(top, #337ab7 0, #265a88 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#265a88));background-image:linear-gradient(to bottom, #337ab7 0, #265a88 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff265a88', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#245580}.btn-primary:hover,.btn-primary:focus{background-color:#265a88;background-position:0 -15px}.btn-primary:active,.btn-primary.active{background-color:#265a88;border-color:#245580}.btn-primary.disabled,.btn-primary[disabled],fieldset[disabled] 
.btn-primary,.btn-primary.disabled:hover,.btn-primary[disabled]:hover,fieldset[disabled] .btn-primary:hover,.btn-primary.disabled:focus,.btn-primary[disabled]:focus,fieldset[disabled] .btn-primary:focus,.btn-primary.disabled.focus,.btn-primary[disabled].focus,fieldset[disabled] .btn-primary.focus,.btn-primary.disabled:active,.btn-primary[disabled]:active,fieldset[disabled] .btn-primary:active,.btn-primary.disabled.active,.btn-primary[disabled].active,fieldset[disabled] .btn-primary.active{background-color:#265a88;background-image:none}.btn-success{background-image:-webkit-linear-gradient(top, #5cb85c 0, #419641 100%);background-image:-o-linear-gradient(top, #5cb85c 0, #419641 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5cb85c), to(#419641));background-image:linear-gradient(to bottom, #5cb85c 0, #419641 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff419641', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#3e8f3e}.btn-success:hover,.btn-success:focus{background-color:#419641;background-position:0 -15px}.btn-success:active,.btn-success.active{background-color:#419641;border-color:#3e8f3e}.btn-success.disabled,.btn-success[disabled],fieldset[disabled] .btn-success,.btn-success.disabled:hover,.btn-success[disabled]:hover,fieldset[disabled] .btn-success:hover,.btn-success.disabled:focus,.btn-success[disabled]:focus,fieldset[disabled] .btn-success:focus,.btn-success.disabled.focus,.btn-success[disabled].focus,fieldset[disabled] .btn-success.focus,.btn-success.disabled:active,.btn-success[disabled]:active,fieldset[disabled] .btn-success:active,.btn-success.disabled.active,.btn-success[disabled].active,fieldset[disabled] .btn-success.active{background-color:#419641;background-image:none}.btn-info{background-image:-webkit-linear-gradient(top, #5bc0de 0, #2aabd2 100%);background-image:-o-linear-gradient(top, #5bc0de 0, #2aabd2 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5bc0de), to(#2aabd2));background-image:linear-gradient(to bottom, #5bc0de 0, #2aabd2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2aabd2', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#28a4c9}.btn-info:hover,.btn-info:focus{background-color:#2aabd2;background-position:0 -15px}.btn-info:active,.btn-info.active{background-color:#2aabd2;border-color:#28a4c9}.btn-info.disabled,.btn-info[disabled],fieldset[disabled] .btn-info,.btn-info.disabled:hover,.btn-info[disabled]:hover,fieldset[disabled] .btn-info:hover,.btn-info.disabled:focus,.btn-info[disabled]:focus,fieldset[disabled] .btn-info:focus,.btn-info.disabled.focus,.btn-info[disabled].focus,fieldset[disabled] .btn-info.focus,.btn-info.disabled:active,.btn-info[disabled]:active,fieldset[disabled] .btn-info:active,.btn-info.disabled.active,.btn-info[disabled].active,fieldset[disabled] .btn-info.active{background-color:#2aabd2;background-image:none}.btn-warning{background-image:-webkit-linear-gradient(top, #f0ad4e 0, #eb9316 100%);background-image:-o-linear-gradient(top, #f0ad4e 0, #eb9316 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f0ad4e), to(#eb9316));background-image:linear-gradient(to bottom, #f0ad4e 0, #eb9316 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', 
endColorstr='#ffeb9316', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#e38d13}.btn-warning:hover,.btn-warning:focus{background-color:#eb9316;background-position:0 -15px}.btn-warning:active,.btn-warning.active{background-color:#eb9316;border-color:#e38d13}.btn-warning.disabled,.btn-warning[disabled],fieldset[disabled] .btn-warning,.btn-warning.disabled:hover,.btn-warning[disabled]:hover,fieldset[disabled] .btn-warning:hover,.btn-warning.disabled:focus,.btn-warning[disabled]:focus,fieldset[disabled] .btn-warning:focus,.btn-warning.disabled.focus,.btn-warning[disabled].focus,fieldset[disabled] .btn-warning.focus,.btn-warning.disabled:active,.btn-warning[disabled]:active,fieldset[disabled] .btn-warning:active,.btn-warning.disabled.active,.btn-warning[disabled].active,fieldset[disabled] .btn-warning.active{background-color:#eb9316;background-image:none}.btn-danger{background-image:-webkit-linear-gradient(top, #d9534f 0, #c12e2a 100%);background-image:-o-linear-gradient(top, #d9534f 0, #c12e2a 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9534f), to(#c12e2a));background-image:linear-gradient(to bottom, #d9534f 0, #c12e2a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc12e2a', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#b92c28}.btn-danger:hover,.btn-danger:focus{background-color:#c12e2a;background-position:0 -15px}.btn-danger:active,.btn-danger.active{background-color:#c12e2a;border-color:#b92c28}.btn-danger.disabled,.btn-danger[disabled],fieldset[disabled] .btn-danger,.btn-danger.disabled:hover,.btn-danger[disabled]:hover,fieldset[disabled] .btn-danger:hover,.btn-danger.disabled:focus,.btn-danger[disabled]:focus,fieldset[disabled] .btn-danger:focus,.btn-danger.disabled.focus,.btn-danger[disabled].focus,fieldset[disabled] .btn-danger.focus,.btn-danger.disabled:active,.btn-danger[disabled]:active,fieldset[disabled] .btn-danger:active,.btn-danger.disabled.active,.btn-danger[disabled].active,fieldset[disabled] .btn-danger.active{background-color:#c12e2a;background-image:none}.thumbnail,.img-thumbnail{-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.075);box-shadow:0 1px 2px rgba(0,0,0,0.075)}.dropdown-menu>li>a:hover,.dropdown-menu>li>a:focus{background-image:-webkit-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-o-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f5f5f5), to(#e8e8e8));background-image:linear-gradient(to bottom, #f5f5f5 0, #e8e8e8 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-color:#e8e8e8}.dropdown-menu>.active>a,.dropdown-menu>.active>a:hover,.dropdown-menu>.active>a:focus{background-image:-webkit-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#2e6da4));background-image:linear-gradient(to bottom, #337ab7 0, #2e6da4 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-color:#2e6da4}.navbar-default{background-image:-webkit-linear-gradient(top, #fff 0, #f8f8f8 
100%);background-image:-o-linear-gradient(top, #fff 0, #f8f8f8 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fff), to(#f8f8f8));background-image:linear-gradient(to bottom, #fff 0, #f8f8f8 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff8f8f8', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);border-radius:4px;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 5px rgba(0,0,0,0.075);box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 5px rgba(0,0,0,0.075)}.navbar-default .navbar-nav>.open>a,.navbar-default .navbar-nav>.active>a{background-image:-webkit-linear-gradient(top, #dbdbdb 0, #e2e2e2 100%);background-image:-o-linear-gradient(top, #dbdbdb 0, #e2e2e2 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #dbdbdb), to(#e2e2e2));background-image:linear-gradient(to bottom, #dbdbdb 0, #e2e2e2 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdbdbdb', endColorstr='#ffe2e2e2', GradientType=0);-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,0.075);box-shadow:inset 0 3px 9px rgba(0,0,0,0.075)}.navbar-brand,.navbar-nav>li>a{text-shadow:0 1px 0 rgba(255,255,255,0.25)}.navbar-inverse{background-image:-webkit-linear-gradient(top, #3c3c3c 0, #222 100%);background-image:-o-linear-gradient(top, #3c3c3c 0, #222 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #3c3c3c), to(#222));background-image:linear-gradient(to bottom, #3c3c3c 0, #222 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff3c3c3c', endColorstr='#ff222222', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);border-radius:4px}.navbar-inverse .navbar-nav>.open>a,.navbar-inverse .navbar-nav>.active>a{background-image:-webkit-linear-gradient(top, #080808 0, #0f0f0f 100%);background-image:-o-linear-gradient(top, #080808 0, #0f0f0f 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #080808), to(#0f0f0f));background-image:linear-gradient(to bottom, #080808 0, #0f0f0f 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff080808', endColorstr='#ff0f0f0f', GradientType=0);-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,0.25);box-shadow:inset 0 3px 9px rgba(0,0,0,0.25)}.navbar-inverse .navbar-brand,.navbar-inverse .navbar-nav>li>a{text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.navbar-static-top,.navbar-fixed-top,.navbar-fixed-bottom{border-radius:0}@media (max-width:767px){.navbar .navbar-nav .open .dropdown-menu>.active>a,.navbar .navbar-nav .open .dropdown-menu>.active>a:hover,.navbar .navbar-nav .open .dropdown-menu>.active>a:focus{color:#fff;background-image:-webkit-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#2e6da4));background-image:linear-gradient(to bottom, #337ab7 0, #2e6da4 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0)}}.alert{text-shadow:0 1px 0 rgba(255,255,255,0.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.25),0 1px 2px rgba(0,0,0,0.05);box-shadow:inset 0 1px 0 rgba(255,255,255,0.25),0 1px 2px 
rgba(0,0,0,0.05)}.alert-success{background-image:-webkit-linear-gradient(top, #dff0d8 0, #c8e5bc 100%);background-image:-o-linear-gradient(top, #dff0d8 0, #c8e5bc 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #dff0d8), to(#c8e5bc));background-image:linear-gradient(to bottom, #dff0d8 0, #c8e5bc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffc8e5bc', GradientType=0);border-color:#b2dba1}.alert-info{background-image:-webkit-linear-gradient(top, #d9edf7 0, #b9def0 100%);background-image:-o-linear-gradient(top, #d9edf7 0, #b9def0 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9edf7), to(#b9def0));background-image:linear-gradient(to bottom, #d9edf7 0, #b9def0 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffb9def0', GradientType=0);border-color:#9acfea}.alert-warning{background-image:-webkit-linear-gradient(top, #fcf8e3 0, #f8efc0 100%);background-image:-o-linear-gradient(top, #fcf8e3 0, #f8efc0 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fcf8e3), to(#f8efc0));background-image:linear-gradient(to bottom, #fcf8e3 0, #f8efc0 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fff8efc0', GradientType=0);border-color:#f5e79e}.alert-danger{background-image:-webkit-linear-gradient(top, #f2dede 0, #e7c3c3 100%);background-image:-o-linear-gradient(top, #f2dede 0, #e7c3c3 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f2dede), to(#e7c3c3));background-image:linear-gradient(to bottom, #f2dede 0, #e7c3c3 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffe7c3c3', GradientType=0);border-color:#dca7a7}.progress{background-image:-webkit-linear-gradient(top, #ebebeb 0, #f5f5f5 100%);background-image:-o-linear-gradient(top, #ebebeb 0, #f5f5f5 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #ebebeb), to(#f5f5f5));background-image:linear-gradient(to bottom, #ebebeb 0, #f5f5f5 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffebebeb', endColorstr='#fff5f5f5', GradientType=0)}.progress-bar{background-image:-webkit-linear-gradient(top, #337ab7 0, #286090 100%);background-image:-o-linear-gradient(top, #337ab7 0, #286090 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#286090));background-image:linear-gradient(to bottom, #337ab7 0, #286090 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff286090', GradientType=0)}.progress-bar-success{background-image:-webkit-linear-gradient(top, #5cb85c 0, #449d44 100%);background-image:-o-linear-gradient(top, #5cb85c 0, #449d44 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5cb85c), to(#449d44));background-image:linear-gradient(to bottom, #5cb85c 0, #449d44 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff449d44', GradientType=0)}.progress-bar-info{background-image:-webkit-linear-gradient(top, #5bc0de 0, #31b0d5 100%);background-image:-o-linear-gradient(top, #5bc0de 0, #31b0d5 
100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5bc0de), to(#31b0d5));background-image:linear-gradient(to bottom, #5bc0de 0, #31b0d5 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff31b0d5', GradientType=0)}.progress-bar-warning{background-image:-webkit-linear-gradient(top, #f0ad4e 0, #ec971f 100%);background-image:-o-linear-gradient(top, #f0ad4e 0, #ec971f 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f0ad4e), to(#ec971f));background-image:linear-gradient(to bottom, #f0ad4e 0, #ec971f 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffec971f', GradientType=0)}.progress-bar-danger{background-image:-webkit-linear-gradient(top, #d9534f 0, #c9302c 100%);background-image:-o-linear-gradient(top, #d9534f 0, #c9302c 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9534f), to(#c9302c));background-image:linear-gradient(to bottom, #d9534f 0, #c9302c 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc9302c', GradientType=0)}.progress-bar-striped{background-image:-webkit-linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent);background-image:-o-linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent);background-image:linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent)}.list-group{border-radius:4px;-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.075);box-shadow:0 1px 2px rgba(0,0,0,0.075)}.list-group-item.active,.list-group-item.active:hover,.list-group-item.active:focus{text-shadow:0 -1px 0 #286090;background-image:-webkit-linear-gradient(top, #337ab7 0, #2b669a 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2b669a 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#2b669a));background-image:linear-gradient(to bottom, #337ab7 0, #2b669a 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2b669a', GradientType=0);border-color:#2b669a}.list-group-item.active .badge,.list-group-item.active:hover .badge,.list-group-item.active:focus .badge{text-shadow:none}.panel{-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.05);box-shadow:0 1px 2px rgba(0,0,0,0.05)}.panel-default>.panel-heading{background-image:-webkit-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-o-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f5f5f5), to(#e8e8e8));background-image:linear-gradient(to bottom, #f5f5f5 0, #e8e8e8 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0)}.panel-primary>.panel-heading{background-image:-webkit-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), 
to(#2e6da4));background-image:linear-gradient(to bottom, #337ab7 0, #2e6da4 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0)}.panel-success>.panel-heading{background-image:-webkit-linear-gradient(top, #dff0d8 0, #d0e9c6 100%);background-image:-o-linear-gradient(top, #dff0d8 0, #d0e9c6 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #dff0d8), to(#d0e9c6));background-image:linear-gradient(to bottom, #dff0d8 0, #d0e9c6 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffd0e9c6', GradientType=0)}.panel-info>.panel-heading{background-image:-webkit-linear-gradient(top, #d9edf7 0, #c4e3f3 100%);background-image:-o-linear-gradient(top, #d9edf7 0, #c4e3f3 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9edf7), to(#c4e3f3));background-image:linear-gradient(to bottom, #d9edf7 0, #c4e3f3 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffc4e3f3', GradientType=0)}.panel-warning>.panel-heading{background-image:-webkit-linear-gradient(top, #fcf8e3 0, #faf2cc 100%);background-image:-o-linear-gradient(top, #fcf8e3 0, #faf2cc 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fcf8e3), to(#faf2cc));background-image:linear-gradient(to bottom, #fcf8e3 0, #faf2cc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fffaf2cc', GradientType=0)}.panel-danger>.panel-heading{background-image:-webkit-linear-gradient(top, #f2dede 0, #ebcccc 100%);background-image:-o-linear-gradient(top, #f2dede 0, #ebcccc 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f2dede), to(#ebcccc));background-image:linear-gradient(to bottom, #f2dede 0, #ebcccc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffebcccc', GradientType=0)}.well{background-image:-webkit-linear-gradient(top, #e8e8e8 0, #f5f5f5 100%);background-image:-o-linear-gradient(top, #e8e8e8 0, #f5f5f5 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #e8e8e8), to(#f5f5f5));background-image:linear-gradient(to bottom, #e8e8e8 0, #f5f5f5 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffe8e8e8', endColorstr='#fff5f5f5', GradientType=0);border-color:#dcdcdc;-webkit-box-shadow:inset 0 1px 3px rgba(0,0,0,0.05),0 1px 0 rgba(255,255,255,0.1);box-shadow:inset 0 1px 3px rgba(0,0,0,0.05),0 1px 0 rgba(255,255,255,0.1)} -------------------------------------------------------------------------------- /src/codes/visualizer/static/style.css: -------------------------------------------------------------------------------- 1 | .container-4{ 2 | overflow: hidden; 3 | width: 300px; 4 | vertical-align: middle; 5 | white-space: nowrap; 6 | } 7 | 8 | .container-4 input#search{ 9 | width: 300px; 10 | height: 50px; 11 | background: #2b303b; 12 | border: none; 13 | font-size: 10pt; 14 | float: left; 15 | color: #fff; 16 | padding-left: 15px; 17 | -webkit-border-radius: 5px; 18 | -moz-border-radius: 5px; 19 | border-radius: 5px; 20 | } 21 | 22 | .container-4 input#search::-webkit-input-placeholder { 23 | color: #65737e; 24 | } 25 | 26 | .container-4 input#search:-moz-placeholder { 
/* Firefox 18- */ 27 | color: #65737e; 28 | } 29 | 30 | .container-4 input#search::-moz-placeholder { /* Firefox 19+ */ 31 | color: #65737e; 32 | } 33 | 34 | .container-4 input#search:-ms-input-placeholder { 35 | color: #65737e; 36 | } 37 | 38 | .container-4 button.icon{ 39 | -webkit-border-top-right-radius: 5px; 40 | -webkit-border-bottom-right-radius: 5px; 41 | -moz-border-radius-topright: 5px; 42 | -moz-border-radius-bottomright: 5px; 43 | border-top-right-radius: 5px; 44 | border-bottom-right-radius: 5px; 45 | 46 | border: none; 47 | background: #232833; 48 | height: 50px; 49 | width: 50px; 50 | color: #4f5b66; 51 | opacity: 0; 52 | font-size: 10pt; 53 | 54 | -webkit-transition: all .55s ease; 55 | -moz-transition: all .55s ease; 56 | -ms-transition: all .55s ease; 57 | -o-transition: all .55s ease; 58 | transition: all .55s ease; 59 | } 60 | 61 | .container-4:hover button.icon, .container-4:active button.icon, .container-4:focus button.icon{ 62 | outline: none; 63 | opacity: 1; 64 | margin-left: -50px; 65 | } 66 | 67 | .container-4:hover button.icon:hover{ 68 | background: white; 69 | } 70 | 71 | div#answers { 72 | background-color: #f2f2f2; 73 | padding-top: 2px; 74 | padding-bottom: 2px; 75 | padding-left: 100px; 76 | } 77 | -------------------------------------------------------------------------------- /src/codes/visualizer/templates/app.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Title 7 | 8 | 9 | 10 |
11 |
12 | 13 | 14 |
15 |
16 | 17 | -------------------------------------------------------------------------------- /src/codes/visualizer/templates/search.html: -------------------------------------------------------------------------------- 1 | {% block content %} 2 | 17 | 18 | 19 | 20 | 21 | 22 | ETNLP's Side-by-Side Visualizer 23 | 24 | 25 | 26 | 41 | 42 | 43 | 44 |
45 | 46 | 47 | {% for emb_name in embedding_names_arr %} 48 | 49 | 50 | 51 | 52 | 53 | {% for page in output_arr[loop.index0] %} 54 | 55 | {% endfor %} 56 |
 
 
{{ emb_name }}
{{ page }}
57 | 58 | {% endfor %} 59 | 60 | 61 | 62 | 63 |
64 | 65 | {% for message in get_flashed_messages() %} 66 |
67 | {{ message }} 68 |
69 | {% endfor %} 70 | {% endblock %} 71 | 72 | -------------------------------------------------------------------------------- /src/codes/visualizer/visualizer_sbs.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template 2 | from flask import request 3 | import gensim 4 | from distutils.version import LooseVersion 5 | from utils import string_utils 6 | import sys 7 | 8 | 9 | app = Flask(__name__) 10 | app.config.from_object(__name__) 11 | app.config['SECRET_KEY'] = '7d441f27d441f27567d441f2b6176a' 12 | 13 | global embedding_models 14 | 15 | 16 | @app.route('/search', methods=['GET', 'POST']) 17 | def search(): 18 | """ 19 | Get the input query and return the list of top similar words in all embeddings. 20 | Uses the globally loaded embedding_models and embedding_names_arr. 21 | :return: rendered search.html with one result list per embedding. 22 | """ 23 | if request.method == "POST": 24 | query = request.values['search'] or '' 25 | # query = unicode(query, "utf-8") 26 | # query = query.decode().encode("utf-8") 27 | # Python 2.7 28 | try: 29 | # Old 30 | # query = unicode(query).lower() 31 | query = string_utils.convert_to_unicode(query) 32 | except Exception as e: 33 | raise Exception("Something went wrong: msg = %s, query = %s." % (e, query)) 34 | 35 | print('query = ' + query) 36 | output_arr = [] 37 | 38 | for embedding_model in embedding_models: 39 | try: 40 | output = [] 41 | sim_list = embedding_model.most_similar(query, topn=50) 42 | for wordsimilar in sim_list: 43 | output.append(wordsimilar[0] + ' - ' + str(round(wordsimilar[1], 6))) 44 | 45 | output_arr.append(output) 46 | except Exception as e: 47 | output = 'Err: %s, query not found: %s' % (e, query) 48 | output_arr.append(output) 49 | 50 | return render_template('search.html', 51 | embedding_names_arr=embedding_names_arr, 52 | output_arr=output_arr 53 | ) 54 | 55 | 56 | @app.route("/") 57 | def get_index(): 58 | return render_template('search.html') 59 | 60 | 61 | @app.route("/multi_search") 62 | def multi_search(): 63 | return render_template('multi_search.html') 64 | 65 | 66 | if __name__ == "__main__": 67 | import os 68 | 69 | dir_path = os.path.dirname(os.path.realpath(__file__)) 70 | # download pre-trained models at https://github.com/vietnlp/etnlp 71 | if len(sys.argv) < 2: 72 | print("Missing input arguments. Input format: ./*.py <model_path1;model_path2;...>. Exiting ...") 73 | exit(1) 74 | 75 | if ";" in sys.argv[1]: 76 | model_files = sys.argv[1].split(";") 77 | else: 78 | model_files = [sys.argv[1]] 79 | 80 | embedding_names_arr = [os.path.basename(file_path) for file_path in model_files] 81 | 82 | embedding_models = [] 83 | idx = 0 84 | for model in model_files: 85 | # model = root_dir + model 86 | if os.path.isfile(model): 87 | print('Loading embedding model ... %s' % (idx)) 88 | 89 | isBinary = False  # treat *.bin files as binary word2vec models, *.vec as plain text 90 | if model.endswith(".bin"): 91 | isBinary = True 92 | 93 | if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"): 94 | from gensim.models import KeyedVectors 95 | 96 | embedding_models.append(KeyedVectors.load_word2vec_format(model, binary=isBinary)) 97 | else: 98 | from gensim.models import Word2Vec 99 | 100 | embedding_models.append(Word2Vec.load_word2vec_format(model, binary=isBinary)) 101 | idx += 1 102 | else: 103 | print( 104 | "Download word2vec model and put into ../data/. 
File: https://github.com/vietnlp/etnlp") 105 | 106 | app.run(debug=False, port=8089, host='0.0.0.0') 107 | -------------------------------------------------------------------------------- /src/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/data/.DS_Store -------------------------------------------------------------------------------- /src/data/embedding_analogies/portuguese/vocab.txt: -------------------------------------------------------------------------------- 1 | Atenas 2 | Grécia 3 | Bagdade 4 | Iraque 5 | Banguecoque 6 | Tailândia 7 | Pequim 8 | China 9 | Berlim 10 | Alemanha 11 | Berna 12 | Suíça 13 | Cairo 14 | Egito 15 | Camberra 16 | Austrália 17 | Hanói 18 | Vietname 19 | Havana 20 | Cuba 21 | Helsínquia 22 | Finlândia 23 | Islamabade 24 | Paquistão 25 | Cábul 26 | Afeganistão 27 | Londres 28 | Inglaterra 29 | Madrid 30 | Espanha 31 | Moscovo 32 | Rússia 33 | Oslo 34 | Noruega 35 | Ottawa 36 | Canadá 37 | Paris 38 | França 39 | Roma 40 | Itália 41 | Estocolmo 42 | Suécia 43 | Teerão 44 | Irão 45 | Tóquio 46 | Japão 47 | capital-world 48 | Abuja 49 | Nigéria 50 | Acra 51 | Gana 52 | Argel 53 | Argélia 54 | Amã 55 | Jordânia 56 | Ancara 57 | Turquia 58 | Antananarivo 59 | Madagáscar 60 | Apia 61 | Samoa 62 | Asgabate 63 | Turquemenistão 64 | Asmara 65 | Eritreia 66 | Astana 67 | Cazaquistão 68 | Baku 69 | Azerbaijão 70 | Bamako 71 | Mali 72 | Banjul 73 | Gâmbia 74 | Beirute 75 | Líbano 76 | Belgrado 77 | Sérvia 78 | Belmopã 79 | Belize 80 | Bisqueque 81 | Quirguistão 82 | Bratislava 83 | Eslováquia 84 | Bruxelas 85 | Bélgica 86 | Bucareste 87 | Roménia 88 | Budapeste 89 | Hungria 90 | Bujumbura 91 | Burundi 92 | Caracas 93 | Venezuela 94 | Chisinau 95 | Moldávia 96 | Conacri 97 | Guiné 98 | Copenhaga 99 | Dinamarca 100 | Dacar 101 | Senegal 102 | Damasco 103 | Síria 104 | Daca 105 | Bangladeche 106 | Doa 107 | Catar 108 | Dublim 109 | Irlanda 110 | Duchambé 111 | Tajiquistão 112 | Funafuti 113 | Tuvalu 114 | Gaborone 115 | Botsuana 116 | Georgetown 117 | Guiana 118 | Harare 119 | Zimbabué 120 | Jacarta 121 | Indonésia 122 | Campala 123 | Uganda 124 | Catmandu 125 | Nepal 126 | Cartum 127 | Sudão 128 | Kiev 129 | Ucrânia 130 | Quigali 131 | Ruanda 132 | Kingston 133 | Jamaica 134 | Libreville 135 | Gabão 136 | Lilongwe 137 | Malaui 138 | Lima 139 | Peru 140 | Lisboa 141 | Portugal 142 | Liubliana 143 | Eslovénia 144 | Luanda 145 | Angola 146 | Lusaca 147 | Zâmbia 148 | Manágua 149 | Nicarágua 150 | Manama 151 | Bareine 152 | Manila 153 | Filipinas 154 | Maputo 155 | Moçambique 156 | Minsk 157 | Bielorrússia 158 | Mogadíscio 159 | Somália 160 | Monróvia 161 | Libéria 162 | Montevideu 163 | Uruguai 164 | Mascate 165 | Omã 166 | Nairóbi 167 | Quénia 168 | Nassau 169 | Baamas 170 | Niamei 171 | Níger 172 | Nicósia 173 | Chipre 174 | Nuaquechote 175 | Mauritânia 176 | Nuque 177 | Gronelândia 178 | Paramaribo 179 | Suriname 180 | Podgorica 181 | Montenegro 182 | Quito 183 | Equador 184 | Rabat 185 | Marrocos 186 | Riga 187 | Letónia 188 | Roseau 189 | Dominica 190 | Santiago 191 | Chile 192 | Escópia 193 | Macedónia 194 | Sófia 195 | Bulgária 196 | Suva 197 | Fiji 198 | Taipé 199 | Taiwan 200 | Talim 201 | Estónia 202 | Tashkent 203 | Uzbequistão 204 | Tbilisi 205 | Geórgia 206 | Tegucigalpa 207 | Honduras 208 | Timbu 209 | Butão 210 | Tirana 211 | Albânia 212 | Trípoli 213 | Líbia 214 | Tunes 215 | Tunísia 216 | Vaduz 217 | Liechtenstein 218 | 
Valletta 219 | Malta 220 | Viena 221 | Áustria 222 | Vienciana 223 | Laos 224 | Vílnius 225 | Lituânia 226 | Varsóvia 227 | Polónia 228 | Windhoek 229 | Namíbia 230 | Erevan 231 | Arménia 232 | Zagreb 233 | Croácia 234 | currency 235 | dinar 236 | kwanza 237 | Argentina 238 | peso 239 | dram 240 | Brasil 241 | real 242 | lev 243 | Cambodja 244 | riel 245 | dólar 246 | kuna 247 | coroa 248 | Europa 249 | euro 250 | florim 251 | Índia 252 | rupia 253 | rial 254 | Iene 255 | Coreia 256 | won 257 | lats 258 | litas 259 | Malásia 260 | ringgit 261 | México 262 | naira 263 | zlóti 264 | leu 265 | rublo 266 | baht 267 | grívnia 268 | EUA 269 | dongue 270 | city-in-state 271 | Chicago 272 | Ilinóis 273 | Houston 274 | Texas 275 | Filadélfia 276 | Pensilvânia 277 | Fênix 278 | Arizona 279 | Dallas 280 | Jacksonville 281 | Flórida 282 | Indianápolis 283 | Indiana 284 | Austin 285 | Detroit 286 | Michigan 287 | Mênfis 288 | Tennessee 289 | Boston 290 | Massachusetts 291 | Seattle 292 | Washington 293 | Denver 294 | Colorado 295 | Baltimore 296 | Marilândia 297 | Nashville 298 | Louisville 299 | Kentucky 300 | Milwaukee 301 | Wisconsin 302 | Portland 303 | Oregão 304 | Tucson 305 | Fresno 306 | Califórnia 307 | Sacramento 308 | Mesa 309 | Atlanta 310 | Omaha 311 | Nebraska 312 | Miami 313 | Tulsa 314 | Oklahoma 315 | Oakland 316 | Cleveland 317 | Ohio 318 | Minneapolis 319 | Minesota 320 | Wichita 321 | Kansas 322 | Arlington 323 | Bakersfield 324 | Tampa 325 | Anaheim 326 | Honolulu 327 | Havai 328 | Pitsburgo 329 | Lexington 330 | Stockton 331 | Cincinnati 332 | Anchorage 333 | Alasca 334 | Toledo 335 | Plano 336 | Henderson 337 | Nevada 338 | Orlando 339 | Laredo 340 | Chandler 341 | Madison 342 | Lubbock 343 | Garland 344 | Glendale 345 | Hialeah 346 | Reno 347 | Scottsdale 348 | Irving 349 | Fremont 350 | Irvine 351 | Spokane 352 | Modesto 353 | Shreveport 354 | Luisiana 355 | Tacoma 356 | Oxnard 357 | Fontana 358 | Akron 359 | Amarillo 360 | Tallahassee 361 | Huntsville 362 | Alabama 363 | Worcester 364 | family 365 | rapaz 366 | rapariga 367 | irmão 368 | irmã 369 | irmãos 370 | irmãs 371 | pai 372 | mãe 373 | avô 374 | avó 375 | neto 376 | neta 377 | noivo 378 | noiva 379 | ele 380 | ela 381 | dele 382 | dela 383 | marido 384 | mulher 385 | rei 386 | rainha 387 | homem 388 | sobrinho 389 | sobrinha 390 | príncipe 391 | princesa 392 | filho 393 | filha 394 | fihos 395 | filhas 396 | meio-irmão 397 | meia-irmã 398 | padrasto 399 | madrasta 400 | enteado 401 | enteada 402 | tio 403 | tia 404 | gram1-adjective-to-adverb 405 | fantástico 406 | fantasticamente 407 | aparente 408 | aparentemente 409 | calmo 410 | calmamente 411 | alegre 412 | alegremente 413 | completo 414 | completamente 415 | eficiente 416 | eficientemente 417 | afortunado 418 | afortunadamente 419 | livre 420 | livremente 421 | furioso 422 | furiosamente 423 | imediato 424 | imediatamente 425 | infrequente 426 | infrequentemente 427 | sortudo 428 | felizmente 429 | óbvio 430 | obviamente 431 | ocasional 432 | ocasionalmente 433 | possível 434 | possivelmente 435 | preciso 436 | precisamente 437 | profissional 438 | profissionalmente 439 | rápido 440 | rapidamente 441 | silencioso 442 | silenciosamente 443 | raro 444 | raramente 445 | relutante 446 | relutantemente 447 | seguro 448 | seguramente 449 | sério 450 | seriamente 451 | lento 452 | lentamente 453 | repentino 454 | repentinamente 455 | veloz/rápido 456 | típico 457 | tipicamente 458 | infeliz 459 | infelizmente 460 | usual 461 | usualmente 462 | gram2-opposite 463 | 
aceitável 464 | inaceitável 465 | consciente 466 | inconsciente 467 | certo 468 | incerto 469 | claro 470 | obscuro 471 | confortável 472 | desconfortável 473 | competitivo 474 | descompetitivo 475 | consistente 476 | inconsistente 477 | convincente 478 | inconvincente 479 | conveniente 480 | inconveniente 481 | decidido 482 | indeciso 483 | ineficiente 484 | ético 485 | antiético 486 | honesto 487 | desonesto 488 | impressivo 489 | inexpressivo 490 | informativo 491 | desinformativo 492 | informado 493 | desinformado 494 | conhecido 495 | desconhecido 496 | provável 497 | improvável 498 | lógico 499 | ilógico 500 | agradável 501 | desagradável 502 | impossível 503 | impossivelmente 504 | produtivo 505 | improdutivo 506 | racional 507 | irracional 508 | responsável 509 | irresponsável 510 | gram3-comparative 511 | mau 512 | pior 513 | grande 514 | maior 515 | bom 516 | melhor 517 | pequeno 518 | menor 519 | gram4-superlative 520 | brilhante 521 | brilhantíssimo 522 | escuro 523 | escuríssimo 524 | fácil 525 | facílimo 526 | rapidíssimo 527 | grandíssimo 528 | alto 529 | altíssimo 530 | larguíssimo 531 | longo 532 | longuíssimo 533 | baixo 534 | baixíssimo 535 | velho 536 | velhíssimo 537 | aguçado 538 | aguçadíssimo 539 | simples 540 | simplicíssimo 541 | curto 542 | curtíssimo 543 | estranho 544 | estranhíssimo 545 | forte 546 | fortíssimo 547 | doce 548 | dulcíssimo 549 | fraco 550 | fraquíssimo 551 | largo 552 | jovem 553 | novíssimo 554 | gram5-present-participle 555 | programar 556 | programando 557 | dançar 558 | dançando 559 | depurar 560 | depurando 561 | diminuir 562 | diminuindo 563 | descrever 564 | descrevendo 565 | descobrir 566 | descobrindo 567 | melhorar 568 | melhorando 569 | voar 570 | voando 571 | gerar 572 | gerando 573 | ir 574 | indo 575 | implementar 576 | implementando 577 | aumentar 578 | aumentando 579 | inventar 580 | inventando 581 | saltar 582 | saltando 583 | ouvir 584 | ouvindo 585 | ver 586 | vendo 587 | mover 588 | movendo 589 | jogar 590 | jogando 591 | prever 592 | prevendo 593 | ler 594 | lendo 595 | correr 596 | correndo 597 | dizer 598 | dizendo 599 | gritar 600 | gritando 601 | baralhar 602 | baralhando 603 | cantar 604 | cantando 605 | sentar 606 | sentando 607 | abrandando 608 | nadar 609 | nadando 610 | pensar 611 | pensando 612 | desaparecer 613 | desaparecendo 614 | andar 615 | andando 616 | escrever 617 | escrevendo 618 | gram6-nationality-adjective 619 | Albanês 620 | Argentino 621 | Australiano 622 | Austríaco 623 | Bielorusso 624 | Brasileiro 625 | Búlgaro 626 | Cambojano 627 | Chileno 628 | Chinês 629 | Colombia 630 | Colombiano 631 | Croata 632 | Dinamarquês 633 | Egípcio 634 | Inglês 635 | Frânces 636 | Alemão 637 | Grego 638 | Islândia 639 | Islandês 640 | Indiano 641 | Irlândes 642 | Israel 643 | Israelita 644 | Italiano 645 | Japonês 646 | Coreano 647 | Macedónio 648 | Maltês 649 | Mexicano 650 | Moldávio 651 | Holanda 652 | Holandês 653 | Norueguês 654 | Peruano 655 | Polaco 656 | Português 657 | Russo 658 | Eslovaco 659 | Espanhol 660 | Sueco 661 | Suiço 662 | Tailandês 663 | Ucraniano 664 | gram7-past-tense 665 | dançou 666 | diminuiu 667 | descreveu 668 | melhorou 669 | caíndo 670 | caiu 671 | alimentando 672 | alimentou 673 | voou 674 | gerou 675 | foi 676 | escondendo 677 | escondeu 678 | acertando 679 | acertou 680 | implementou 681 | aumentou 682 | saltou 683 | conhecendo 684 | conheceu 685 | ouviu 686 | olhou 687 | moveu 688 | pagou 689 | jogou 690 | previu 691 | correu 692 | disse 693 | gritou 694 | viu 695 | vendendo 696 | 
vendeu 697 | encolhendo 698 | encolheu 699 | cantou 700 | sentou 701 | dormindo 702 | dormiu 703 | lentificou 704 | gastando 705 | gastou 706 | golpeando 707 | golpeou 708 | nadou 709 | tirando 710 | tirou 711 | pensou 712 | desapareceu 713 | andou 714 | escreveu 715 | gram8-plural 716 | banana 717 | bananas 718 | pássaro 719 | pássaros 720 | garrafa 721 | garrafas 722 | edifício 723 | edifícios 724 | carro 725 | carros 726 | gato 727 | gatos 728 | criança 729 | crianças 730 | nuvem 731 | nuvens 732 | cor 733 | cores 734 | computador 735 | computadores 736 | vaca 737 | vacas 738 | cão 739 | cães 740 | dólares 741 | burro 742 | burros 743 | sonho 744 | sonhos 745 | águia 746 | águias 747 | elefante 748 | elefantes 749 | olho 750 | olhos 751 | dedo 752 | dedos 753 | cabra 754 | cabras 755 | mão 756 | mãos 757 | cavalo 758 | cavalos 759 | leão 760 | leões 761 | máquina 762 | máquinas 763 | manga 764 | mangas 765 | homens 766 | melão 767 | melões 768 | macaco 769 | macacos 770 | rato 771 | ratos 772 | cebola 773 | cebolas 774 | pêra 775 | pêras 776 | porco 777 | porcos 778 | ananás 779 | ananases 780 | ratazana 781 | ratazanas 782 | estrada 783 | estradas 784 | cobra 785 | cobras 786 | mulheres 787 | gram9-plural-verbs 788 | diminuem 789 | descrevem 790 | comer 791 | comem 792 | melhoram 793 | estima 794 | estimam 795 | encontra 796 | encontram 797 | geram 798 | vão 799 | implementam 800 | aumentam 801 | ouvem 802 | jogam 803 | prevêem 804 | fornece 805 | fornecem 806 | dizem 807 | gritam 808 | procura 809 | procuram 810 | vêem 811 | baralham 812 | cantam 813 | sentam 814 | lentificam 815 | diz 816 | nadam 817 | fala 818 | falam 819 | pensam 820 | desaparecem 821 | andam 822 | trabalhou 823 | trabalham 824 | escrevem 825 | -------------------------------------------------------------------------------- /src/data/embedding_analogies/vi/Multi_evaluator_results.txt: -------------------------------------------------------------------------------- 1 | : | Word Analogy Task results 2 | -------------------------------------------------------------------------------- /src/data/embedding_analogies/vi/solveable_analogies_vi.txt: -------------------------------------------------------------------------------- 1 | analogy_list_vi_ner.txt -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove1.vec: -------------------------------------------------------------------------------- 1 | word10 0.123 0.134 0.532 0.152 2 | word20 0.934 0.412 0.532 0.159 3 | word30 0.334 0.241 0.324 0.188 4 | word90 0.334 0.241 0.324 0.188 5 | word31 0.334 0.341 0.324 0.288 -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove1_w2v.vec: -------------------------------------------------------------------------------- 1 | 5 4 2 | word10 0.123 0.134 0.532 0.152 3 | word20 0.934 0.412 0.532 0.159 4 | word30 0.334 0.241 0.324 0.188 5 | word90 0.334 0.241 0.324 0.188 6 | word31 0.334 0.341 0.324 0.288 7 | -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove2.vec: -------------------------------------------------------------------------------- 1 | word1 0.123 0.134 0.532 0.152 2 | word2 0.934 0.412 0.532 0.159 3 | word3 0.334 0.241 0.324 0.188 4 | word9 0.334 0.241 0.324 0.188 -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove2_w2v.vec: 
-------------------------------------------------------------------------------- 1 | 4 4 2 | word1 0.123 0.134 0.532 0.152 3 | word2 0.934 0.412 0.532 0.159 4 | word3 0.334 0.241 0.324 0.188 5 | word9 0.334 0.241 0.324 0.188 6 | -------------------------------------------------------------------------------- /src/data/vocab.txt: -------------------------------------------------------------------------------- 1 | tôi 2 | yêu 3 | hà_nội 4 | ghét 5 | em 6 | iphone 7 | thích 8 | hận 9 | đắm_say 10 | đẹp 11 | giận 12 | đà_nẵng 13 | cậu 14 | bé 15 | cô 16 | gái 17 | anh_trai 18 | em_gái 19 | người 20 | đàn_ông 21 | phụ_nữ 22 | hoàng_tử 23 | công_chúa 24 | -------------------------------------------------------------------------------- /src/examples/test1_etnlp_preprocessing.py: -------------------------------------------------------------------------------- 1 | from etnlp_api import embedding_preprocessing as emb_prep 2 | from etnlp_api import embedding_config 3 | 4 | INPUT_FILES="../data/glove2vec_dicts/glove1.vec;../data/glove2vec_dicts/glove2.vec" 5 | OUTPUT_FILES="../data/glove2vec_dicts/glove1_w2v.vec;../data/glove2vec_dicts/glove2_w2v.vec" 6 | # do_normalize: use this flag to normalize in case of multiple embeddings. 7 | embedding_config.do_normalize_emb = False 8 | # to mark input embeddings are not in word2vec format. 9 | embedding_config.is_word2vec_format = False 10 | emb_prep.load_and_save_2_word2vec_models(INPUT_FILES, OUTPUT_FILES, embedding_config) 11 | 12 | print("Done with exporting") -------------------------------------------------------------------------------- /src/examples/test2_etnlp_extractor.py: -------------------------------------------------------------------------------- 1 | from etnlp_api import embedding_config 2 | from etnlp_api import embedding_extractor 3 | 4 | 5 | emb1 = "/W2V_C2V.vec" 6 | emb2 = "/ELMO.vec" 7 | emb3 = "/MULTI.vec" 8 | emb4 = "/FastText.vec" 9 | C2V = "../data/embedding_dicts/C2V.vec" 10 | out1 = "../data/embedding_dicts/W2V_C2V_23.vec" 11 | out2 = "../data/embedding_dicts/ELMO_23.vec" 12 | out3 = "../data/embedding_dicts/MULTI_23.vec" 13 | out4 = "../data/embedding_dicts/FastText_23.vec" 14 | 15 | VOCAB_FILE = "../data/vocab.txt" 16 | # OUTPUT_FORMAT=".txt;.npz;.gz" 17 | OUTPUT_FORMAT = ".txt" 18 | # embedding_config 19 | embedding_config.do_normalize_emb = True 20 | 21 | emb_files = [emb1, emb2, emb3, emb4] 22 | out_files = [out1, out2, out3, out4] 23 | 24 | for emb_file, out_file in zip(emb_files, out_files): 25 | embedding_extractor.extract_embedding_for_vocab_file(emb_file, VOCAB_FILE, 26 | C2V, out_file, OUTPUT_FORMAT) 27 | print("DONE") 28 | 29 | -------------------------------------------------------------------------------- /src/examples/test3_etnlp_evaluator.py: -------------------------------------------------------------------------------- 1 | from etnlp_api import embedding_evaluator 2 | import os 3 | import tensorflow as tf 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 5 | tf.logging.set_verbosity(tf.logging.ERROR) 6 | 7 | INPUT_FILES = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;" \ 8 | "../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 9 | ANALOGY_FILE = "../data/embedding_analogies/vi/solveable_analogies_vi.txt" 10 | OUT_FILE = "../data/embedding_analogies/vi/Multi_evaluator_results.txt" 11 | embedding_evaluator.evaluator_api(INPUT_FILES, ANALOGY_FILE, OUT_FILE) 12 | print("DONE") -------------------------------------------------------------------------------- 
/src/examples/test4_etnlp_visualizer.py: -------------------------------------------------------------------------------- 1 | # from etnlp_api import embedding_config 2 | from etnlp_api import embedding_visualizer 3 | # Semicolon-separated list of the extracted embeddings produced by test2_etnlp_extractor.py. 4 | INPUT_FILES = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;" \ 5 | "../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 6 | embedding_visualizer.visualize_multiple_embeddings(INPUT_FILES) 7 | 8 | print("DONE") --------------------------------------------------------------------------------
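Taken together, the four example scripts form a single pipeline: preprocess, extract, evaluate, visualize. Below is a minimal end-to-end sketch, assuming the repository's sample-data layout and that it is run from `src/examples/`; the pre-trained embedding path `/ELMO.vec` is a placeholder, as in test2, and all function names and paths are taken from the scripts above.

```
# End-to-end sketch of the ETNLP pipeline; paths follow the sample data under src/data/.
from etnlp_api import embedding_preprocessing as emb_prep
from etnlp_api import embedding_config
from etnlp_api import embedding_extractor
from etnlp_api import embedding_evaluator
from etnlp_api import embedding_visualizer

# 1. Preprocessing: convert GloVe-format files into word2vec format (as in test1).
embedding_config.do_normalize_emb = False      # as in test1: no normalization at this stage
embedding_config.is_word2vec_format = False    # inputs are not yet in word2vec format
emb_prep.load_and_save_2_word2vec_models(
    "../data/glove2vec_dicts/glove1.vec;../data/glove2vec_dicts/glove2.vec",
    "../data/glove2vec_dicts/glove1_w2v.vec;../data/glove2vec_dicts/glove2_w2v.vec",
    embedding_config)

# 2. Extraction: keep only vectors for the words in vocab.txt (as in test2).
# "/ELMO.vec" stands in for a downloaded pre-trained embedding.
embedding_config.do_normalize_emb = True
embedding_extractor.extract_embedding_for_vocab_file(
    "/ELMO.vec", "../data/vocab.txt",
    "../data/embedding_dicts/C2V.vec",         # character-level embedding passed to the extractor
    "../data/embedding_dicts/ELMO_23.vec", ".txt")

# 3. Evaluation: word analogy task over the extracted embeddings (as in test3).
embedding_evaluator.evaluator_api(
    "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/MULTI_23.vec",
    "../data/embedding_analogies/vi/solveable_analogies_vi.txt",
    "../data/embedding_analogies/vi/Multi_evaluator_results.txt")

# 4. Visualization: browse the extracted embeddings interactively (as in test4).
embedding_visualizer.visualize_multiple_embeddings(
    "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/MULTI_23.vec")
```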