├── .gitignore ├── README.md ├── images ├── english_sbs.png ├── etnlp_view_embs.png ├── etnlp_view_multi_embeddings.png └── visualisation_hanoi_2.png └── src ├── codes ├── 00.run_etnlp_preprocessing.sh ├── 01.run_etnlp_evaluator.sh ├── 02.run_etnlp_extractor.sh ├── 03.run_etnlp_visualizer_inter.sh ├── 04.run_etnlp_visualizer_sbs.sh ├── api │ ├── __init__.py │ ├── embedding_evaluator.py │ ├── embedding_extractor.py │ ├── embedding_preprocessing.py │ └── embedding_visualizer.py ├── embeddings │ ├── __init__.py │ ├── embedding_configs.py │ ├── embedding_models.py │ └── embedding_utils.py ├── etnlp_api.py ├── requirements.txt ├── setup.py ├── utils │ ├── __init__.py │ ├── emb_utils.py │ ├── embedding_io.py │ ├── eval_utils.py │ ├── file_utils.py │ ├── string_utils.py │ ├── vectors.py │ └── word.py └── visualizer │ ├── README.md │ ├── __init__.py │ ├── data │ └── vnex.model.bin │ ├── images │ ├── w2v_vn.png │ └── w2v_vn_2.png │ ├── outof_w2vec.dict │ ├── static │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ └── style.css │ ├── templates │ ├── app.html │ └── search.html │ └── visualizer_sbs.py ├── data ├── .DS_Store ├── embedding_analogies │ ├── english │ │ └── english-word-analogy.txt │ ├── portuguese │ │ ├── LX-4WAnalogies-ETNLP.txt │ │ ├── LX-4WAnalogies.txt │ │ ├── POST_TAG_vocabulary.txt │ │ ├── evaluator_results.txt │ │ └── vocab.txt │ └── vi │ │ ├── Multi_evaluator_results.txt │ │ ├── analogy_list_vi_ner.txt │ │ ├── elmo_results_out_dict.txt │ │ └── solveable_analogies_vi.txt ├── embedding_dicts │ ├── C2V.vec │ ├── ELMO_23.vec │ ├── FastText_23.vec │ ├── MULTI_23.vec │ ├── W2V_C2V_23.vec │ ├── baomoi_c2v_dims_300.vec │ └── vn_elmo_medium_c2v.vec ├── glove2vec_dicts │ ├── glove1.vec │ ├── glove1_w2v.vec │ ├── glove2.vec │ └── glove2_w2v.vec └── vocab.txt └── examples ├── test1_etnlp_preprocessing.py ├── test2_etnlp_extractor.py ├── test3_etnlp_evaluator.py └── test4_etnlp_visualizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | dist/ 12 | develop-eggs/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ETNLP: A Toolkit for Extraction, Evaluation and Visualization of Pre-trained Word Embeddings 2 | ===== 3 | 4 | # Table of contents 5 | 1. [Overview](#introduction) 6 | 2. [More about ETNLP](#moreaboutETNLP) 7 | 3. [Installation and How to Use](#installation_and_howtouse) 8 | 4. [Available Lexical Resources](#Download_Resources) 9 | 10 | 11 | # I. Overview 12 | ## A glimpse of ETNLP: 13 | - Github: https://github.com/vietnlp/etnlp 14 | - Video: https://vimeo.com/317599106 15 | - Paper: https://arxiv.org/abs/1903.04433 16 | 17 | 18 | # II. How do I cite ETNLP? 19 | Please cite the following Arxiv paper whenever ETNLP (or its pre-trained embeddings) is used to produce published results or is incorporated into other software: 20 | 21 | ``` 22 | @inproceedings{vu:2019n, 23 | title={ETNLP: A Visual-Aided Systematic Approach to Select Pre-Trained Embeddings for a Downstream Task}, 24 | author={Vu, Xuan-Son and Vu, Thanh and Tran, Son N and Jiang, Lili}, 25 | booktitle={Proceedings of the International Conference Recent Advances in Natural Language Processing (RANLP)}, 26 | year={2019} 27 | } 28 | ``` 29 | 30 | 31 | # III. More about ETNLP: 32 | ## 1. Embedding Evaluator: 33 | To compare the quality of embedding models on the word analogy task. 34 | - Input: a pre-trained embedding vector file (word2vec format) and a word analogy file. 35 | - Output: (1) an evaluation of the embedding model's quality based on the MAP/P@10 score, and (2) paired t-tests showing the significance level between different word embeddings. 36 | 37 | ### 1.1. Note: The word analogy list is created by: 38 | - Adapting the English list: selecting suitable categories and translating them into the target language (i.e., Vietnamese). 39 | - Removing categories that are inappropriate in the target language (i.e., categories 6, 10, 11, 14 for Vietnamese). 40 | - Adding custom categories that suit the target language (e.g., cities and their zones in Vietnam for Vietnamese). 41 | Since most of this process is automated, it can be applied to other languages as well. 42 | 43 | ### 1.2. Selected categories for Vietnamese: 44 | > 1. capital-common-countries 45 | > 2. capital-world 46 | > 3. currency: E.g., Algeria | dinar | Angola | kwanza 47 | > 4. city-in-zone (Vietnam's cities and their zones) 48 | > 5. family (boy | girl | brother | sister) 49 | > 6. gram1-adjective-to-adverb (NOT USED) 50 | > 7. gram2-opposite (e.g., acceptable | unacceptable | aware | unaware) 51 | > 8. gram3-comparative (e.g., bad | worse | big | bigger) 52 | > 9. gram4-superlative (e.g., bad | worst | big | biggest) 53 | > 10. gram5-present-participle (NOT USED) 54 | > 11. gram6-nationality-adjective-nguoi-tieng (e.g., Albania | Albanian | Argentina | Argentinean) 55 | > 12. gram7-past-tense (NOT USED) 56 | > 13. gram8-plural-cac-nhung (e.g., banana | bananas | bird | birds) (NOT USED) 57 | > 14. gram9-plural-verbs (NOT USED) 58 | 59 | ### 1.3 Evaluation results (in detail) 60 | 61 | * Analogy: Word Analogy Task 62 | 63 | * NER (w): NER task with hyper-parameters selected from the best F1 on the validation set. 64 | 65 | * NER (w.o): NER task without selecting hyper-parameters from the validation set. 66 | 67 | |  Model | NER.w | NER.w.o | Analogy | 68 | |------------------------------ |------------- | ------------------ |------------------ | 69 | | BiLC3 + w2v | 89.01 | 89.41 | 0.4796 | 70 | | BiLC3 + Bert_Base | 88.26 | 89.91 | 0.4609 | 71 | | BiLC3 + w2v_c2v | 89.46 | 89.46 | 0.4796 | 72 | | BiLC3 + fastText | 89.65 | 89.84 | 0.4970 | 73 | | BiLC3 + Elmo | 89.67 | 90.84 | **0.4999** | 74 | | BiLC3 + MULTI_WC_F_E_B | **91.09** | **91.75** | 0.4906 | 75 | 76 |
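A minimal sketch of running the evaluator from Python rather than via `01.run_etnlp_evaluator.sh` (it mirrors the `evaluator_api` function in `src/codes/api/embedding_evaluator.py` and assumes it is run from `src/codes/` so the sample data paths resolve; swap in your own files as needed):

```python
from api.embedding_evaluator import evaluator_api

# Multiple embeddings are separated by ";" (same convention as the shell scripts).
input_files = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec"
analogy_file = "../data/embedding_analogies/vi/solveable_analogies_vi.txt"
out_file = "../data/embedding_analogies/vi/Multi_evaluator_results.txt"

# Writes the word-analogy results for every embedding to out_file and prints a summary.
evaluator_api(input_files, analogy_file, out_file)
```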
77 | ## 2. Embedding Extractor: To extract embedding vectors for other tasks. 78 | - Input: (1) a list of input embeddings, (2) a vocabulary file. 79 | - Output: embedding vectors of the given vocab file in `.txt`, i.e., each line contains the embedding of one word. The file is then compressed into .gz format. This format is widely used in existing NLP toolkits (e.g., Reimers et al. [1]). 80 | 81 | ### Extra options: 82 | - `-input-c2v`: character embedding file. 83 | - `solveoov:1`: to solve OOV words of the 1st embedding. Similarly for more than one embedding, e.g., `solveoov:1:2`. 84 | 85 | [1] Nils Reimers and Iryna Gurevych, Reporting Score Distributions Makes a Difference: Performance Study of LSTM-networks for Sequence Tagging, 2017, http://arxiv.org/abs/1707.09861, arXiv. 86 |
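A minimal sketch of the extractor API (based on the `extract_embedding_for_vocab_file` signature in `src/codes/api/embedding_extractor.py`, run from `src/codes/`; the paths below point at the bundled sample data and are placeholders):

```python
from api.embedding_extractor import extract_embedding_for_vocab_file

embeddings = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec"
c2v_file = "../data/embedding_dicts/C2V.vec"  # character embedding, used to build vectors for OOV words
vocab_file = "../data/vocab.txt"
out_file = "../data/embedding_dicts/MULTI_W_F_B_E.vec"

# Writes one "<word> <v1> <v2> ..." line per vocabulary word; the last argument
# selects which output copies (.txt / .gz / .npz) to produce.
extract_embedding_for_vocab_file(embeddings, vocab_file, c2v_file, out_file, ".txt;.gz")
```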
87 | ## 3. Visualizer: To explore the embedding space and compare different embeddings. 88 | 89 | ### Screenshot of viewing multiple embeddings side-by-side (Vietnamese): 90 | ![Alt text](images/etnlp_view_multi_embeddings.png "Screenshot multiple-embeddings side-by-side") 91 | 92 | ### Screenshot of viewing each embedding interactively (Vietnamese): 93 | ![Alt text](images/etnlp_view_embs.png "Screenshot example of viewing each embedding interactively") 94 | 95 | ### Screenshot of viewing each embedding side-by-side (English): 96 | ![Alt text](images/english_sbs.png "Screenshot example of viewing each embedding side-by-side") 97 | 98 | 99 | # IV. Installation and How to use ETNLP 100 | ## 1. Installation: 101 | 102 | From source code (Python 3.6.x): 103 | > 1. cd src/codes/ 104 | > 2. pip install -r requirements.txt 105 | > 3. python setup.py install 106 | 107 | From pip (Python 3.6.x): 108 | > 1. sudo apt-get install python3-dev 109 | > 2. pip install cython 110 | > 3. pip install git+git://github.com/vietnlp/etnlp.git 111 | 112 | OR: 113 | > 1. pip install etnlp 114 | 115 | ## 2. Examples 116 | > 1. cd src/examples 117 | > 2. python test1_etnlp_preprocessing.py 118 | > 3. python test2_etnlp_extractor.py 119 | > 4. python test3_etnlp_evaluator.py 120 | > 5. python test4_etnlp_visualizer.py 121 | ### Example of using Fasttext-Sent2Vec: 122 | - 01. Install: https://github.com/epfml/sent2vec 123 | ``` 124 | 01. git clone https://github.com/epfml/sent2vec 125 | 02. cd sent2vec; pip install . 126 | ``` 127 | 128 | - 02. Extract embeddings for sentences (sentences do not need to be tokenized before extraction): 129 | ``` 130 | import sent2vec 131 | model = sent2vec.Sent2vecModel() 132 | model.load_model('opendata_wiki_lowercase_words.bin') 133 | emb = model.embed_sentence("tôi là sinh viên đh công nghệ, đại học quôc gia hà nội") 134 | embs = model.embed_sentences(["tôi là sinh viên", "tôi là nhà thơ", "tôi là bác sĩ"]) 135 | 136 | ``` 137 | 138 | 139 | ## 3. Visualization 140 | Side-by-side visualization: 141 | > 1. sh src/codes/04.run_etnlp_visualizer_sbs.sh 142 | 143 | Interactive visualization: 144 | > 1. sh src/codes/03.run_etnlp_visualizer_inter.sh 145 | 146 | 147 | # V. Available Lexical Resources 148 | ## 1. Word Analogy List for Vietnamese 149 | 150 | |  Word Analogy List | Download Link (NER Task)| Download Link (General)| 151 | |------------------------------|---------------|---------------| 152 | | Vietnamese (This work) | [Link1](https://drive.google.com/file/d/1eA5yvla4BhAIfWsmZherT1GEW6gzDC-1/view?usp=sharing)| [Link1](https://drive.google.com/file/d/1YJ9d5rVKMMKF1xWWZi26_sNpgULTvxwg/view?usp=sharing)| 153 | | English (Mikolov et al. [2]) | [Link2]| [Link2](https://drive.google.com/file/d/10rWxGu8-nbQmYC8wrIussSZjY0lDh6RP/view?usp=sharing)| 154 | | Portuguese (Hartmann et al. [3]) | [Link3]| [Link3](https://github.com/nathanshartmann/portuguese_word_embeddings/blob/master/analogies/testset/LX-4WAnalogies.txt)| 155 | 156 | 157 | 158 | ## 2. Multiple pre-trained embedding models for Vietnamese 159 | 160 | - Training data: Vietnamese Wikipedia: 161 | 162 | |  # of sentences | # of tokenized words| 163 | |------------------------------|---------------| 164 | |  6,685,621 | 114,997,587 | 165 | 166 | 167 | - Download Pre-trained Embeddings:
168 | (Note: The MULTI_WC_F_E_B embedding is the concatenation of four embeddings: W2V_C2V, fastText, ELMO, and Bert_Base.) 169 | 170 | |  Embedding Model | Download Link (NER Task) | Download Link (AIVIVN SentiTask) | Download Link (General) | 171 | |------------------------------|---------------|---------------|---------------| 172 | | w2v | [Link1](https://drive.google.com/file/d/1LHaZ8LXxteHzod42naqJZYCwwq5mI9aL/view?usp=sharing) (dim=300)| [Link1] | [Link1] | 173 | | w2v_c2v | [Link2](https://drive.google.com/file/d/1-M9Tb9l8mNmP3RKxZiZNK1Vpbng2yw4l/view?usp=sharing) (dim=300)| [Link2] | [Link2] | 174 | | fastText | [Link3](https://drive.google.com/file/d/1dHCPhKFjtDjbrUeeymheDnlhjtaljPGE/view?usp=sharing) (dim=300)| [Link3] | [Link3] | 175 | | fastText-[Sent2Vec](https://github.com/epfml/sent2vec) | [Link3]| [Link3] | [Link3](https://drive.google.com/file/d/1BzL1mpdfqCCJioCdAlTVshbrz0lGfP2D/view?usp=sharing) (dim=300, 6GB, trained on 20GB of [news data](https://github.com/binhvq/news-corpus) and the Wiki data of ETNLP.) | 176 | | Elmo | [Link4](https://drive.google.com/file/d/1zDaSD8NsZNXGyd9iVOxTcb7CP61Ixo-r/view?usp=sharing) (dim=1024)| [Link4](https://drive.google.com/file/d/1jVJtF0f6SbtUd-t3bnywP6mFnz0QXPIx/view?usp=sharing) (dim=1024)| [Link4](https://drive.google.com/file/d/1XPsTzg1Gex-Hh2nl9344YlZc1orOVBDp/view?usp=sharing) (dim=1024, 731MB; 1.9GB after extraction)| 177 | | Bert_base | [Link5](https://drive.google.com/file/d/16fRkmIHiB16OlM8WdFmoApGtLMf6YJJ8/view?usp=sharing) (dim=768)| [Link5] | [Link5] | 178 | | MULTI_WC_F_E_B | [Link6](https://drive.google.com/file/d/1gq7b8hs31VzoeO3n3C__ftlDnE_iBZW2/view?usp=sharing) (dim=2392)| [Link6] | [Link6] | 179 | 180 | 181 | # VI. Versioning 182 | For transparency into our release cycle, and to maintain backward compatibility, ETNLP will follow the Semantic Versioning guidelines as much as possible. 183 | 184 | Releases will be numbered with the following format: 185 | 186 | `<major>.<minor>.<patch>` 187 | 188 | And constructed with the following guidelines: 189 | 190 | * Breaking backward compatibility bumps the major (and resets the minor and patch) 191 | * New additions without breaking backward compatibility bump the minor (and reset the patch) 192 | * Bug fixes and misc changes bump the patch 193 | 194 | For more information on SemVer, please visit http://semver.org/.
195 | -------------------------------------------------------------------------------- /images/english_sbs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/english_sbs.png -------------------------------------------------------------------------------- /images/etnlp_view_embs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/etnlp_view_embs.png -------------------------------------------------------------------------------- /images/etnlp_view_multi_embeddings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/etnlp_view_multi_embeddings.png -------------------------------------------------------------------------------- /images/visualisation_hanoi_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/images/visualisation_hanoi_2.png -------------------------------------------------------------------------------- /src/codes/00.run_etnlp_preprocessing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/glove2vec_dicts/glove1.vec;../data/glove2vec_dicts/glove2.vec" 4 | OUTPUT_FILES="../data/glove2vec_dicts/glove1_w2v.vec;../data/glove2vec_dicts/glove2_w2v.vec" 5 | # do_normalize: use this flag to normalize in case of multiple embeddings. 
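# The "glove2w2v" mode converts GloVe-format .vec files into word2vec text format.
# The practical difference between the two formats is the header: GloVe files start
# directly with the vectors, while word2vec text files begin with a "<vocab_size> <dim>"
# line, e.g. (illustrative values):
#   GloVe:    the 0.418 0.24968 ...
#   word2vec: 400000 300
#             the 0.418 0.24968 ...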
6 | python ./etnlp_api.py -input $INPUT_FILES -output $OUTPUT_FILES -args "glove2w2v" 7 | -------------------------------------------------------------------------------- /src/codes/01.run_etnlp_evaluator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | ANALOGY_FILE="../data/embedding_analogies/vi/solveable_analogies_vi.txt" 5 | OUT_FILE="../data/embedding_analogies/vi/Multi_evaluator_results.txt" 6 | python ./etnlp_api.py -input $INPUT_FILES -output $OUT_FILE -analoglist $ANALOGY_FILE -args eval -------------------------------------------------------------------------------- /src/codes/02.run_etnlp_extractor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | C2V="../data/embedding_dicts/C2V.vec" 5 | OUTPUT="../data/embedding_dicts/MULTI_W_F_B_E.vec" 6 | VOCAB_FILE="../data/vocab.txt" 7 | python ./etnlp_api.py -input $INPUT_FILES -vocab $VOCAB_FILE -input_c2v $C2V -args "extract" -output $OUTPUT 8 | -------------------------------------------------------------------------------- /src/codes/03.run_etnlp_visualizer_inter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | python3 ./etnlp_api.py -input $INPUT_FILES -args visualizer -port 8889 5 | -------------------------------------------------------------------------------- /src/codes/04.run_etnlp_visualizer_sbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export PYTHONPATH="$PYTHONPATH:$PWD" 3 | INPUT_FILES="../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 4 | # python ./visualizer/visualizer_sbs.py -input $INPUT_FILES -args visualizer 5 | python3 ./visualizer/visualizer_sbs.py $INPUT_FILES 6 | -------------------------------------------------------------------------------- /src/codes/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/api/__init__.py -------------------------------------------------------------------------------- /src/codes/api/embedding_evaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import gensim 3 | import argparse 4 | from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors, Word2VecKeyedVectors 5 | from gensim import utils, matutils 6 | from six import string_types 7 | from numpy import dot, float32 as REAL, array, ndarray, argmax 8 | from utils import embedding_io, emb_utils 9 | from embeddings.embedding_configs import EmbeddingConfigs 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class new_Word2VecKeyedVectors(Word2VecKeyedVectors): 15 | def 
__init__(self, vector_size): 16 | super(Word2VecKeyedVectors, self).__init__(vector_size=vector_size) 17 | 18 | def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): 19 | """ 20 | Find the top-N most similar words. Positive words contribute positively towards the 21 | similarity, negative words negatively. 22 | 23 | This method computes cosine similarity between a simple mean of the projection 24 | weight vectors of the given words and the vectors for each word in the model. 25 | The method corresponds to the `word-analogy` and `distance` scripts in the original 26 | word2vec implementation. 27 | 28 | If topn is False, most_similar returns the vector of similarity scores. 29 | 30 | `restrict_vocab` is an optional integer which limits the range of vectors which 31 | are searched for most-similar values. For example, restrict_vocab=10000 would 32 | only check the first 10000 word vectors in the vocabulary order. (This may be 33 | meaningful if you've sorted the vocabulary by descending frequency.) 34 | 35 | Example:: 36 | 37 | >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) 38 | [('queen', 0.50882536), ...] 39 | 40 | """ 41 | if positive is None: 42 | positive = [] 43 | if negative is None: 44 | negative = [] 45 | 46 | self.init_sims() 47 | 48 | if isinstance(positive, string_types) and not negative: 49 | # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) 50 | positive = [positive] 51 | 52 | # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words 53 | positive = [ 54 | (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word 55 | for word in positive 56 | ] 57 | negative = [ 58 | (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word 59 | for word in negative 60 | ] 61 | 62 | # compute the weighted average of all words 63 | all_words, mean = set(), [] 64 | for word, weight in positive + negative: 65 | if isinstance(word, ndarray): 66 | mean.append(weight * word) 67 | else: 68 | mean.append(weight * self.word_vec(word, use_norm=True)) 69 | if word in self.vocab: 70 | all_words.add(self.vocab[word].index) 71 | if not mean: 72 | raise ValueError("cannot compute similarity with no input") 73 | mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) 74 | 75 | if indexer is not None: 76 | return indexer.most_similar(mean, topn) 77 | 78 | limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] 79 | dists = dot(limited, mean) 80 | if not topn: 81 | return dists 82 | best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) 83 | # ignore (don't return) words from the input 84 | result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] 85 | return result[:topn] 86 | 87 | def new_accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): 88 | """ 89 | Compute accuracy of the model. `questions` is a filename where lines are 90 | 4-tuples of words, split into sections by ": SECTION NAME" lines. 91 | See questions-words.txt in 92 | https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip 93 | for an example. 94 | 95 | The accuracy is reported (=printed to log and returned as a list) for each 96 | section separately, plus there's one aggregate summary at the end. 
97 | 98 | Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab` 99 | words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency. 100 | In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then 101 | case normalization is performed. 102 | 103 | Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before 104 | evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens 105 | and question words. In case of multiple case variants of a single word, the vector for the first 106 | occurrence (also the most frequent if vocabulary is sorted) is taken. 107 | 108 | This method corresponds to the `compute-accuracy` script of the original C word2vec. 109 | 110 | """ 111 | print("INFO: Using new accuracy") 112 | ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] 113 | ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) 114 | 115 | oov_counter, idx_cnt, is_vn_counter = 0, 0, 0 116 | sections, section = [], None 117 | for line_no, line in enumerate(utils.smart_open(questions)): 118 | # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed 119 | line = utils.to_unicode(line) 120 | 121 | if line.startswith(': '): 122 | # a new section starts => store the old section 123 | if section: 124 | sections.append(section) 125 | self.log_accuracy(section) 126 | section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} 127 | else: 128 | # Count number of analogy to check 129 | idx_cnt += 1 130 | if not section: 131 | raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) 132 | try: 133 | if case_insensitive: 134 | a, b, c, expected = [word.upper() for word in line.split(" | ")] 135 | else: 136 | a, b, c, expected = [word for word in line.split(" | ")] 137 | # print("Line : ", line) 138 | # print("a, b, c, expected: %s, %s, %s, %s"%(a, b, c, expected)) 139 | # input(">>> Wait ...") 140 | except ValueError: 141 | logger.info("SVX: ERROR skipping invalid line #%i in %s", line_no, questions) 142 | print("Line : ", line) 143 | print("a, b, c, expected: %s, %s, %s, %s" % (a, b, c, expected)) 144 | input(">>> Wait ...") 145 | continue 146 | 147 | # In case of Vietnamese, word analogy can be a phrase 148 | if " " in a or " " in b or " " in c or " " in expected: 149 | is_vn_counter += 1 150 | pass 151 | else: 152 | if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: 153 | logger.debug("SVX: skipping line #%i with OOV words: %s", line_no, line.strip()) 154 | oov_counter += 1 155 | continue 156 | 157 | original_vocab = self.vocab 158 | self.vocab = ok_vocab 159 | ignore = {a, b, c} # input words to be ignored 160 | predicted = None 161 | # find the most likely prediction, ignoring OOV words and input words 162 | sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) 163 | self.vocab = original_vocab 164 | for index in matutils.argsort(sims, reverse=True): 165 | predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] 166 | if predicted in ok_vocab and predicted not in ignore: 167 | if predicted != expected: 168 | logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) 169 | break 170 | if predicted == expected: 171 | section['correct'].append((a, b, c, 
expected)) 172 | else: 173 | section['incorrect'].append((a, b, c, expected)) 174 | 175 | if section: 176 | # store the last section, too 177 | sections.append(section) 178 | self.log_accuracy(section) 179 | 180 | total = { 181 | 'OOV/Total/VNCompound_Words': [oov_counter, (idx_cnt), is_vn_counter], 182 | 'section': 'total', 183 | 'correct': sum((s['correct'] for s in sections), []), 184 | 'incorrect': sum((s['incorrect'] for s in sections), []), 185 | } 186 | self.log_accuracy(total) 187 | sections.append(total) 188 | return sections 189 | 190 | 191 | def convert_conll_format_to_normal(connl_file, out_file): 192 | """ 193 | read file conll format 194 | return format : One sentence per line 195 | sentences_arr: [EU rejects German call .., ...] 196 | tags_arr: [B-ORG O B-MIST O ..., ...] 197 | """ 198 | f = open(connl_file) 199 | sentences = [] 200 | sentence = "" 201 | for line in f: 202 | # print("line: ", line) 203 | if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": 204 | sentences.append(sentence.rstrip()) 205 | sentence = "" 206 | continue 207 | else: 208 | splits = line.split('\t') 209 | sentence += splits[1].rstrip() + " " 210 | 211 | # To handle the last sentence. 212 | if len(sentence) > 0: 213 | sentences.append(sentence) 214 | del sentence 215 | 216 | # Write to output 217 | if out_file is None: 218 | out_file = connl_file + ".std.txt" 219 | writer = open(out_file, "w") 220 | for sen in sentences: 221 | writer.write(sen + "\n") 222 | writer.flush() 223 | writer.close() 224 | 225 | return sentences 226 | 227 | 228 | def verify_word_analogies(file): 229 | """ 230 | Verify the word analogy file. 231 | :param file: 232 | :return: 233 | """ 234 | f_reader = open(file, "r") 235 | 236 | valid_cnt, invalid_cnt = 0, 0 237 | 238 | for line in f_reader: 239 | # print("line: ", line) 240 | if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": 241 | continue 242 | else: 243 | splits = line.split('\t') 244 | if len(splits) != 4: 245 | invalid_cnt += 1 246 | else: 247 | valid_cnt += 1 248 | 249 | print("Valid analogy: %s, invalid analogy: %s" % (valid_cnt, invalid_cnt)) 250 | 251 | 252 | def check_oov_of_word_analogies(w2v_format_emb_file, analogy_file, is_vn=True, case_sensitive=True): 253 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_format_emb_file, 254 | binary=False, 255 | unicode_errors='ignore') 256 | 257 | f_reader = open(analogy_file, "r") 258 | vocab_arr = [] 259 | for line in f_reader: 260 | if not case_sensitive: 261 | line = line.lower() 262 | 263 | if line.startswith(': '): 264 | continue 265 | else: 266 | for word in line.split(" | "): 267 | # In Vietnamese, we have compound and single word. 268 | # if is_vn: 269 | # if " " in word: 270 | # print("I should not going here") 271 | # single_words = word.split(" ") 272 | # for single_word in single_words: 273 | # vocab_arr.append(single_word) 274 | # For other languages. 
275 | # else: 276 | vocab_arr.append(word) 277 | 278 | print("Before unique set: len = ", len(vocab_arr)) 279 | unique_vocab_arr = set(vocab_arr) 280 | print("After unique set: len = ", len(unique_vocab_arr)) 281 | valid_word_cnt = 0 282 | for word in unique_vocab_arr: 283 | if word in emb_model: 284 | valid_word_cnt += 1 285 | 286 | print("With Is_VN = %s, case_sensitive = %s, Valid word = %s/%s" % (is_vn, 287 | case_sensitive, 288 | valid_word_cnt, 289 | len(unique_vocab_arr))) 290 | 291 | 292 | def evaluator_api(input_files, analoglist, output, embed_config=None): 293 | """ 294 | 295 | :param input_files: 296 | :param analoglist: 297 | :param output: 298 | :param embed_config: 299 | :return: 300 | """ 301 | if embed_config is None: 302 | embed_config = EmbeddingConfigs() # Initialize default config for embedding. 303 | local_embedding_names, local_word_embeddings = embedding_io.load_word_embeddings(input_files, embed_config) 304 | # emb_utils.print_analogy('man', 'him', 'woman', emb_words) 305 | local_output_str = emb_utils.eval_word_analogy_4_all_embeddings(analoglist, 306 | local_embedding_names, 307 | local_word_embeddings, 308 | output_file=output) 309 | print("OUTPUT: ", local_output_str) 310 | 311 | 312 | if __name__ == "__main__": 313 | """ 314 | Evaluates a given word embedding model. 315 | To use: 316 | evaluate.py path_to_model [-restrict] 317 | optional restrict argument performs an evaluation using the original 318 | Mikolov restriction of vocabulary 319 | """ 320 | 321 | desc = "Evaluates a word embedding model" 322 | parser = argparse.ArgumentParser(description=desc) 323 | parser.add_argument("-input", 324 | required=True, 325 | default="../data/embedding_dicts/ELMO_23.vec", 326 | help="Input multiple word embeddings, each model separated by a `;`.") 327 | parser.add_argument("-analoglist", 328 | nargs="?", 329 | # default="../data/embedding_analogies/vi/analogy_vn_seg.txt.std.txt", 330 | default="../data/embedding_analogies/vi/solveable_analogies_vi.txt", 331 | help="Input analogy file to run the word analogy evaluation.") 332 | 333 | parser.add_argument("-r", 334 | nargs="?", 335 | default=False, 336 | help="Vocabulary restriction") 337 | 338 | parser.add_argument("-checkoov", 339 | nargs="?", 340 | default=False, 341 | help="Check OOV percentage") 342 | 343 | parser.add_argument("-lang", 344 | nargs="?", 345 | default="VI", 346 | help="Specify language, by default, it's Vietnamese.") 347 | 348 | parser.add_argument("-lowercase", 349 | nargs="?", 350 | default=True, 351 | help="Lowercase all word analogies? (depends on how the emb was trained).") 352 | 353 | parser.add_argument("-output", 354 | nargs="?", 355 | default="../data/embedding_analogies/vi/results_out.txt", 356 | help="Output file of word analogy task") 357 | 358 | parser.add_argument("-remove_redundancy", 359 | nargs="?", 360 | default=True, 361 | help="Remove redundancy in predicted words") 362 | 363 | print("Params: ", parser) 364 | 365 | args = parser.parse_args() 366 | 367 | embedding_config = EmbeddingConfigs() 368 | 369 | paths_of_models = args.input 370 | testset = args.analoglist 371 | is_vietnamese = args.lang 372 | output_file = args.output 373 | 374 | # use restriction? 
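# Mikolov et al.'s evaluation protocol restricts the vocabulary to the 30,000 most
# frequent words (see restrict_vocab in new_accuracy above); pass -r to apply the
# same restriction here.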
375 | restriction = None 376 | if args.r: 377 | restriction = 30000 378 | 379 | # set logging definitions 380 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 381 | level=logging.INFO) 382 | 383 | if args.checkoov: 384 | print("Checking OOV ...") 385 | check_oov_of_word_analogies(paths_of_models, testset, is_vn=is_vietnamese) 386 | 387 | if not args.checkoov: 388 | print("Evaluating embeddings on the word analogy task ...") 389 | if is_vietnamese == "VI":  # -lang defaults to "VI"; other values fall back to Mikolov et al.'s protocol 390 | print(" ... for ETNLP's evaluation approach.") 391 | embedding_names, word_embeddings = embedding_io.load_word_embeddings(paths_of_models, embedding_config) 392 | # emb_utils.print_analogy('man', 'him', 'woman', emb_words) 393 | output_str = emb_utils.eval_word_analogy_4_all_embeddings(testset, embedding_names, word_embeddings, 394 | output_file=output_file)  # was args.output_file, which does not exist 395 | print("#" * 20) 396 | print(output_str) 397 | print("#" * 20) 398 | 399 | else: 400 | print(" ... for Mikolov et al.'s evaluation approach.") 401 | word_analogy_obj = new_Word2VecKeyedVectors(1024) 402 | 403 | # load and evaluate 404 | model = word_analogy_obj.load_word2vec_format( 405 | paths_of_models, 406 | binary=False, 407 | unicode_errors='ignore') 408 | 409 | model.accuracy = word_analogy_obj.new_accuracy 410 | 411 | acc = model.accuracy(testset, restrict_vocab=restriction, case_insensitive=False) 412 | print("Acc = ", acc) 413 | 414 | print("DONE") 415 | -------------------------------------------------------------------------------- /src/codes/api/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | from embeddings import embedding_utils 2 | from pathlib import Path 3 | import numpy as np 4 | import os 5 | import logging 6 | import gzip 7 | from embeddings.embedding_configs import EmbeddingConfigs 8 | 9 | 10 | def get_multi_embedding_models(config: EmbeddingConfigs): 11 | """ 12 | Build the collection of embedding models described by `config`, optionally with a char2vec model for OOV words. 13 | :param config: 14 | :return: 15 | """ 16 | model_paths_list = config.model_paths_list 17 | model_names_list = config.model_names_list 18 | model_dims_list = config.model_dims_list 19 | char_model_path = config.char_model_path 20 | char_model_dims = config.char_model_dims 21 | 22 | if char_model_path: 23 | char_model = embedding_utils.reload_char2vec_model(char_model_path, char_model_dims) 24 | else: 25 | char_model = None 26 | 27 | embedding_models = embedding_utils.reload_embedding_models(model_paths_list, 28 | model_names_list, 29 | model_dims_list, 30 | char_model) 31 | # doc_vector = embedding_models.get_vector_of_document(tokenized_text) 32 | return embedding_models 33 | 34 | 35 | def get_emb_dim(emb_file): 36 | # Read the embedding dimension from the word2vec header line: "<vocab_size> <dim>". 37 | with open(emb_file, "r") as reader: 38 | line = reader.readline().rstrip() 39 | dim = int(line.split(" ")[1]) 40 | return dim 41 | 42 | 43 | 44 | 45 | def extract_embedding_for_vocab_file(paths_of_emb_models, vocab_words_file, c2v_emb_file, output_file, output_format): 46 | """ 47 | Extract vectors for every word in `vocab_words_file` from one or more embedding files. 48 | :param paths_of_emb_models: 49 | :param vocab_words_file: 50 | :param c2v_emb_file: 51 | :param output_file: 52 | :param output_format: 53 | :return: 54 | """ 55 | config = EmbeddingConfigs() 56 | config.output_format = output_format 57 | config.model_paths_list = paths_of_emb_models.split(";") 58 | embedding_file_names = [] 59 | embedding_dims = [] 60 | 61 | if c2v_emb_file: 62 | config.char_model_path = c2v_emb_file 63 | config.char_model_dims = get_emb_dim(c2v_emb_file) 64 | 65 | print("02. 
Extracting word embeddings ...") 66 | if paths_of_emb_models and paths_of_emb_models.__contains__(";"): 67 | files = paths_of_emb_models.split(";") 68 | for emb_file in files: 69 | embedding_name = os.path.basename(os.path.normpath(emb_file)) 70 | embedding_file_names.append(embedding_name) 71 | embedding_dim = get_emb_dim(emb_file) 72 | embedding_dims.append(embedding_dim) 73 | elif paths_of_emb_models: # In case there is only one embedding 74 | embedding_name = os.path.basename(os.path.normpath(paths_of_emb_models)) 75 | embedding_file_names.append(embedding_name) 76 | embedding_dim = get_emb_dim(paths_of_emb_models) 77 | embedding_dims.append(embedding_dim) 78 | else: 79 | raise Exception("List of embeddings cannot be None.") 80 | 81 | # Data type: 82 | embedding_names = ["word2vec"]*len(embedding_dims) # embedding type, only support w2v and c2v type now 83 | config.model_names_list = embedding_names 84 | config.model_dims_list = embedding_dims 85 | 86 | # Do extracting embeddings 87 | extract_embedding_vectors(vocab_words_file, output_file, config) 88 | print("Done") 89 | 90 | 91 | def extract_embedding_vectors(vocab_words_file, output_file, config: EmbeddingConfigs): 92 | """ 93 | 94 | :param vocab_words_file: 95 | :param output_file: 96 | :param config: 97 | :return: 98 | """ 99 | # Load vocab 100 | with Path(vocab_words_file).open() as f: 101 | word_to_idx = {line.strip(): idx for idx, line in enumerate(f)} 102 | size_vocab = len(word_to_idx) 103 | 104 | # Output writer 105 | fwriter = open(output_file, "w") 106 | 107 | # Array of zeros 108 | dim_size = sum(config.model_dims_list) 109 | found = 0 110 | print('Reading embedding file (may take a while)') 111 | 112 | embedding_models = get_multi_embedding_models(config) 113 | 114 | embeddings = np.zeros((size_vocab, dim_size)) 115 | 116 | line_idx = 0 117 | for word in word_to_idx.keys(): 118 | word_idx = word_to_idx[word] 119 | 120 | word = word.rstrip() 121 | try: 122 | if line_idx % 100000 == 0: 123 | print('- At line {}'.format(line_idx)) 124 | 125 | w2v_vector = embedding_models.get_word_vector_of_multi_embeddings(word) 126 | 127 | if w2v_vector is not None and len(w2v_vector) > 0: 128 | embeddings[word_idx] = w2v_vector 129 | line = "%s %s" % (word, " ".join(str(scalar) for scalar in w2v_vector)) 130 | fwriter.write(line + "\n") 131 | fwriter.flush() 132 | found += 1 133 | 134 | logging.debug("Embedding: ", w2v_vector) 135 | except Exception as e: 136 | logging.debug("Unexpected error: word = %s, error = %s" % (word, e)) 137 | pass 138 | line_idx += 1 139 | 140 | print('- done. Found {} vectors for {} words'.format(found, size_vocab)) 141 | fwriter.close() 142 | 143 | # Open file again to add meta data: 144 | src = open(output_file, "r") 145 | meta_line = "%s %s\n"%(found, dim_size) 146 | oline = src.readlines() 147 | # Here, we prepend the string we want to on first line 148 | oline.insert(0, meta_line) 149 | src.close() 150 | 151 | # We again open the file in WRITE mode 152 | src = open(output_file, "w") 153 | src.writelines(oline) 154 | src.close() 155 | # Done with writing. 
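# Depending on config.output_format (default ".txt;.npz;.gz" in EmbeddingConfigs),
# the blocks below also emit compressed copies of the embeddings: a gzipped text
# file and/or a NumPy .npz archive of the full embedding matrix.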
156 | 157 | if config.output_format.__contains__(".gz"): 158 | content = open(output_file, "rb").read() 159 | gzip_out_file = output_file + '.gz' 160 | with gzip.open(gzip_out_file, 'wb') as f: 161 | f.write(content) 162 | print("Saved embedding to %s" % (gzip_out_file)) 163 | 164 | if config.output_format.__contains__(".npz"): 165 | npz_out_file = output_file + '.npz' 166 | np.savez_compressed(npz_out_file, embeddings=embeddings) 167 | print("Saved embedding to %s"%(npz_out_file)) 168 | return 169 | 170 | -------------------------------------------------------------------------------- /src/codes/api/embedding_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Convert to a standard word2vec format 2 | 3 | import gensim 4 | from utils import embedding_io 5 | import sys 6 | from threading import Thread 7 | from embeddings.embedding_configs import EmbeddingConfigs 8 | 9 | 10 | def convert_to_w2v(vocab_file, embedding_file, out_file): 11 | """ 12 | Export from a word2vec file by filtering out vocabs based on the input vocab file. 13 | :param vocab_file: 14 | :param embedding_file: 15 | :param out_file: 16 | :return: word2vec file 17 | """ 18 | std_vocab = [] 19 | with open(vocab_file) as f: 20 | for word in f: 21 | std_vocab.append(word) 22 | 23 | print ("Loaded NER vocab_size = %s" % (len(std_vocab))) 24 | is_binary = False 25 | if embedding_file.endswith(".bin"): 26 | is_binary = True 27 | 28 | print("Loading w2v model ...") 29 | 30 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(embedding_file, 31 | binary=is_binary, 32 | unicode_errors='ignore') 33 | 34 | print("LOADED model: vocab_size = %s" % (len(emb_model.wv.vocab))) 35 | f_writer = open(out_file, "w") 36 | for word in std_vocab: 37 | word = word.rstrip() 38 | line = None 39 | if word in emb_model: 40 | vector = " ".join(str(item) for item in emb_model[word]) 41 | # word = word.lower() 42 | line = "%s %s" % (word, vector) 43 | else: 44 | word = word.lower() 45 | if word in emb_model: 46 | vector = " ".join(str(item) for item in emb_model[word]) 47 | line = "%s %s" % (word, vector) 48 | # print("LINE: ", line) 49 | if line: 50 | f_writer.write(line + "\n") 51 | f_writer.close() 52 | 53 | 54 | def test(): 55 | vocab_file = "../data/vnner_BiLSTM_CRF/vocab.words.txt" 56 | embedding_file = "../data/embedding_dicts/elmo_embeddings_large.txt" 57 | out_file = "../data/embedding_dicts/elmo_1024dims_wiki_normalcase2lowercase_NER.vec" 58 | convert_to_w2v(vocab_file, embedding_file, out_file) 59 | print("Out file: ", out_file) 60 | print("DONE") 61 | 62 | 63 | def load_and_save_2_word2vec_model(input_model_path, output_model_path, embedding_config): 64 | """ 65 | Process one embedding model 66 | :param input_model_path: 67 | :param output_model_path: 68 | :return: 69 | """ 70 | model_in = embedding_io.load_word_embedding(input_model_path, embedding_config) 71 | embedding_io.save_model_to_file(model_in, output_model_path) 72 | print("Write model back to ", output_model_path) 73 | 74 | 75 | def load_and_save_2_word2vec_models(input_embedding_files_str, output_embedding_files_str, embedding_config): 76 | """ 77 | Multi-threaded processing to export to word2vec format 78 | :param input_embedding_files_str: 79 | :param output_embedding_files_str: 80 | :return: 81 | """ 82 | if input_embedding_files_str.__contains__(";"): 83 | input_model_files = input_embedding_files_str.split(";") 84 | else: 85 | input_model_files = [input_embedding_files_str] 86 | 87 | if 
output_embedding_files_str.__contains__(";"): 88 | output_model_files = output_embedding_files_str.split(";") 89 | else: 90 | output_model_files = [output_embedding_files_str] 91 | 92 | # Double check input files and output files. 93 | assert (len(output_model_files) == len(input_model_files)), \ 94 | "Number of input files and output files must be equal. Exiting ..." 95 | 96 | # create a list of threads 97 | threads = [] 98 | 99 | for model_in, model_out in zip(input_model_files, output_model_files): 100 | # We start one thread per file. 101 | process = Thread(target=load_and_save_2_word2vec_model, args=[model_in, model_out, embedding_config]) 102 | process.start() 103 | threads.append(process) 104 | # load_and_save_2_word2vec_model(model_in, model_out) 105 | 106 | # This ensures each thread has finished processing its input file. 107 | for process in threads: 108 | process.join() 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | if len(sys.argv) != 2: 114 | print("Missing input arguments. Input format: ./*.py <emb_file1;emb_file2;...>. Exiting ...") 115 | exit(0) 116 | 117 | embedding_config = EmbeddingConfigs() 118 | # The input doesn't have to be in word2vec format for pre-processing, but a warning 119 | # is still shown if input files aren't in w2v format. 120 | embedding_config.is_word2vec_format = True 121 | embedding_config.do_normalize_emb = False # If you don't want to normalize the embedding vectors. 122 | 123 | if sys.argv[1].__contains__(";"): 124 | in_model_files = sys.argv[1].split(";") 125 | else: 126 | in_model_files = [sys.argv[1]] 127 | 128 | out_model_files = [input_model_path + ".extracted.vec" for input_model_path in in_model_files] 129 | 130 | load_and_save_2_word2vec_models(";".join(in_model_files), ";".join(out_model_files), embedding_config)  # the function expects ";"-joined strings plus the config 131 | -------------------------------------------------------------------------------- /src/codes/api/embedding_visualizer.py: -------------------------------------------------------------------------------- 1 | # 1. Read embedding file 2 | # 2. Convert to tensorboard 3 | # 3. 
Visualize 4 | 5 | # encoding: utf-8 6 | import sys, os 7 | import gensim 8 | import tensorflow as tf 9 | import numpy as np 10 | from tensorflow.contrib.tensorboard.plugins import projector 11 | import logging 12 | from tensorboard import default 13 | from tensorboard import program 14 | 15 | 16 | class TensorBoardTool: 17 | 18 | def __init__(self, dir_path): 19 | self.dir_path = dir_path 20 | 21 | def run(self, emb_name, port): 22 | # Remove http messages 23 | # log = logging.getLogger('sonvx').setLevel(logging.INFO) 24 | logging.basicConfig(level=logging.INFO) 25 | logging.propagate = False 26 | # Start tensorboard server 27 | tb = program.TensorBoard(default.get_plugins(), default.get_assets_zip_provider()) 28 | tb.configure(argv=[None, '--logdir', self.dir_path, '--port', str(port)]) 29 | url = tb.launch() 30 | sys.stdout.write('TensorBoard of %s at %s \n' % (emb_name, url)) 31 | 32 | 33 | def convert_multiple_emb_models_2_tf(emb_name_arr, w2v_model_arr, output_path, port): 34 | """ 35 | Convert multiple word2vec models into one TensorFlow checkpoint for the TensorBoard projector. 36 | :param emb_name_arr: 37 | :param w2v_model_arr: 38 | :param output_path: 39 | :param port: 40 | :return: 41 | """ 42 | idx = 0 43 | # define the model without training 44 | sess = tf.InteractiveSession() 45 | config = projector.ProjectorConfig() 46 | 47 | for w2v_model in w2v_model_arr: 48 | emb_name = emb_name_arr[idx] 49 | 50 | meta_file = "%s.tsv" % emb_name 51 | placeholder = np.zeros((len(w2v_model.wv.index2word), w2v_model.vector_size)) 52 | 53 | with open(os.path.join(output_path, meta_file), 'wb') as file_metadata: 54 | for i, word in enumerate(w2v_model.wv.index2word): 55 | placeholder[i] = w2v_model[word] 56 | # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094 57 | if word == '': 58 | print("Empty line: should be replaced by anything else, or it will cause a bug in TensorBoard") 59 | file_metadata.write(u"{0}".format('').encode('utf-8') + b'\n') 60 | else: 61 | file_metadata.write(u"{0}".format(word).encode('utf-8') + b'\n') 62 | 63 | word_embedding_var = tf.Variable(placeholder, trainable=False, name=emb_name) 64 | tf.global_variables_initializer().run() 65 | sess.run(word_embedding_var) 66 | 67 | # adding into projector 68 | embed = config.embeddings.add() 69 | embed.tensor_name = emb_name 70 | embed.metadata_path = meta_file 71 | idx += 1 72 | 73 | saver = tf.train.Saver() 74 | writer = tf.summary.FileWriter(output_path, sess.graph) 75 | 76 | # Specify the width and height of a single thumbnail. 
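# visualize_embeddings() writes a projector_config.pbtxt into output_path that links
# each embedding tensor to its .tsv metadata file; TensorBoard's Projector tab reads
# it from the checkpoint directory saved below.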
77 | projector.visualize_embeddings(writer, config) 78 | all_emb_name = "_".join(emb_name for emb_name in emb_name_arr) 79 | saver.save(sess, os.path.join(output_path, '%s.ckpt' % all_emb_name)) 80 | # tf.flags.FLAGS.logdir = output_path 81 | # print('Running `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path)) 82 | # tb.run_main() 83 | tb_tool = TensorBoardTool(output_path) 84 | tb_tool.run(all_emb_name, port) 85 | return 86 | 87 | 88 | def convert_one_emb_model_2_tf(emb_name, model, output_path, port): 89 | """ 90 | Convert a single word2vec model into a TensorFlow checkpoint for the TensorBoard projector. 91 | :param model: Word2Vec model 92 | :param output_path: 93 | :return: 94 | """ 95 | # emb_name = "word_embedding" 96 | meta_file = "%s.tsv" % emb_name 97 | placeholder = np.zeros((len(model.wv.index2word), model.vector_size)) 98 | 99 | with open(os.path.join(output_path, meta_file), 'wb') as file_metadata: 100 | for i, word in enumerate(model.wv.index2word): 101 | placeholder[i] = model[word] 102 | # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094 103 | if word == '': 104 | print("Empty line: should be replaced by anything else, or it will cause a bug in TensorBoard") 105 | file_metadata.write(u"{0}".format('').encode('utf-8') + b'\n') 106 | else: 107 | file_metadata.write(u"{0}".format(word).encode('utf-8') + b'\n') 108 | 109 | # define the model without training 110 | sess = tf.InteractiveSession() 111 | 112 | word_embedding_var = tf.Variable(placeholder, trainable=False, name=emb_name) 113 | tf.global_variables_initializer().run()  # variables must be initialized before being evaluated 114 | sess.run(word_embedding_var) 115 | 116 | saver = tf.train.Saver() 117 | writer = tf.summary.FileWriter(output_path, sess.graph) 118 | 119 | # adding into projector 120 | config = projector.ProjectorConfig() 121 | embed = config.embeddings.add() 122 | embed.tensor_name = emb_name 123 | embed.metadata_path = meta_file 124 | 125 | # Specify the width and height of a single thumbnail. 
126 | projector.visualize_embeddings(writer, config) 127 | saver.save(sess, os.path.join(output_path, '%s.ckpt' % emb_name)) 128 | # tf.flags.FLAGS.logdir = output_path 129 | # print('Running `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path)) 130 | # tb.run_main() 131 | tb_tool = TensorBoardTool(output_path) 132 | tb_tool.run(emb_name, port) 133 | return 134 | 135 | 136 | def visualize_multiple_embeddings_individually(paths_of_emb_models): 137 | output_root_dir = "../data/embedding_tf_data/" 138 | starting_port = 6006 139 | embedding_names = [] 140 | print("Loaded all word embeddings, going to visualize ...") 141 | 142 | if paths_of_emb_models and paths_of_emb_models.__contains__(";"): 143 | files = paths_of_emb_models.split(";") 144 | for emb_file in files: 145 | 146 | embedding_name = os.path.basename(os.path.normpath(emb_file)) 147 | 148 | tf_data_folder = output_root_dir + embedding_name 149 | 150 | if not os.path.exists(tf_data_folder): 151 | os.makedirs(tf_data_folder) 152 | 153 | is_binary = False 154 | 155 | if emb_file.endswith(".bin"): 156 | is_binary = True 157 | 158 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(emb_file, binary=is_binary) 159 | 160 | convert_one_emb_model_2_tf(embedding_name, emb_model, tf_data_folder, starting_port) 161 | 162 | embedding_names.append(embedding_name) 163 | 164 | starting_port += 1 165 | 166 | while True: 167 | print("Type exit to quit the visualizer: ") 168 | user_input = input() 169 | if user_input == "exit": 170 | break 171 | return 172 | 173 | 174 | def visualize_multiple_embeddings_all_in_one(paths_of_emb_models, port): 175 | output_root_dir = "../data/embedding_tf_data/" 176 | starting_port = port 177 | embedding_names = [] 178 | print("Loaded all word embeddings, going to visualize ...") 179 | 180 | embedding_name_arr = [] 181 | w2v_embedding_model_arr = [] 182 | 183 | if paths_of_emb_models and paths_of_emb_models.__contains__(";"): 184 | files = paths_of_emb_models.split(";") 185 | for emb_file in files: 186 | 187 | embedding_name = os.path.basename(os.path.normpath(emb_file)) 188 | embedding_name_arr.append(embedding_name) 189 | 190 | is_binary = False 191 | 192 | if emb_file.endswith(".bin"): 193 | is_binary = True 194 | 195 | emb_model = gensim.models.KeyedVectors.load_word2vec_format(emb_file, binary=is_binary) 196 | w2v_embedding_model_arr.append(emb_model) 197 | embedding_names.append(embedding_name) 198 | 199 | # print("View side-by-side word similarity of multiple embeddings at: http://Sons-MBP.lan:8089") 200 | 201 | all_emb_name = "_".join(emb_name for emb_name in embedding_name_arr) 202 | tf_data_folder = output_root_dir + all_emb_name 203 | if not os.path.exists(tf_data_folder): 204 | os.makedirs(tf_data_folder) 205 | 206 | convert_multiple_emb_models_2_tf(embedding_name_arr, w2v_embedding_model_arr, tf_data_folder, starting_port) 207 | 208 | while True: 209 | print("Type exit to quit the visualizer: ") 210 | user_input = input() 211 | if user_input == "exit": 212 | break 213 | return 214 | 215 | 216 | def visualize_multiple_embeddings(paths_of_emb_models, port): 217 | """ 218 | API for other parts of ETNLP to call; don't modify this function. 
219 | :param paths_of_emb_models: 220 | :param port: 221 | :return: 222 | """ 223 | visualize_multiple_embeddings_all_in_one(paths_of_emb_models, port) 224 | 225 | 226 | if __name__ == "__main__": 227 | """ 228 | Just run `python w2v_visualizer.py word2vec.model visualize_result` 229 | """ 230 | try: 231 | model_path = sys.argv[1] 232 | output_path = sys.argv[2] 233 | except Exception as e: 234 | print("Please provide model path and output path %s " % e) 235 | 236 | # model = Word2Vec.load(model_path) 237 | model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True) 238 | convert_one_emb_model_2_tf("word_embedding", model, output_path, 6006)  # name/port args were missing; placeholder name and default TensorBoard port 239 | -------------------------------------------------------------------------------- /src/codes/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/embeddings/__init__.py -------------------------------------------------------------------------------- /src/codes/embeddings/embedding_configs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class EmbeddingConfigs(object): 4 | """ 5 | Configuration information 6 | """ 7 | is_word2vec_format = True 8 | do_normalize_emb = True 9 | model_paths_list = [] 10 | model_names_list = [] 11 | model_dims_list = [] 12 | char_model_path = None 13 | char_model_dims = -1 14 | output_format = ".txt;.npz;.gz" 15 | -------------------------------------------------------------------------------- /src/codes/embeddings/embedding_models.py: -------------------------------------------------------------------------------- 1 | from gensim.models import KeyedVectors as Word2Vec 2 | import numpy as np 3 | from embeddings import embedding_utils 4 | from utils import file_utils 5 | import os, re 6 | import logging 7 | 8 | 9 | DEBUG = False 10 | 11 | 12 | class Model_Constants(object): 13 | word2vec = "word2vec" 14 | char2vec = "char2vec" 15 | private_word2vec = "private_word2vec" 16 | elmo = "elmo" 17 | 18 | 19 | class Embedding_Model(object): 20 | def __init__(self, name, vector_dim): 21 | self.name = name 22 | self.model = None 23 | self.char_model = None 24 | self.vocabs_list = None 25 | self.vector_dim = vector_dim 26 | # TODO: update this changeable param later 27 | # unk, random, mean, replace_by_character_embedding 28 | self.unknown_word = "replace_by_character_embedding" 29 | # self.MAX_DIM = 400 # No longer use MAX_DIM, now it depends on input dims 30 | 31 | def load_model(self, model_path): 32 | if self.name == Model_Constants.word2vec or self.name == Model_Constants.elmo: 33 | if model_path.endswith(".bin"): 34 | self.model = Word2Vec.load_word2vec_format(model_path, binary=True) 35 | else: 36 | self.model = Word2Vec.load_word2vec_format(model_path, binary=False) 37 | elif self.name == Model_Constants.char2vec: 38 | self.model = dict() 39 | print("Loading model_path = ", model_path) 40 | file = open(model_path, "r") 41 | for line in file: 42 | elements = line.split() 43 | if len(elements) > 100: # because embedding dim is higher than 100. 
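# Heuristic: a valid char-vector line has one character followed by more than 100
# floats, so shorter lines (such as a "<vocab_size> <dim>" header) are skipped.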
44 | # char_model[elements[0]] = np.array(map(float, elements[1:])).tolist() 45 | self.model[elements[0]] = np.array([float(i) for i in elements[1:]]).tolist() 46 | return self.model 47 | elif self.name == Model_Constants.private_word2vec: 48 | self.model, _, self.vocabs_list = embedding_utils.reload_embeddings(model_path) 49 | else: 50 | raise Exception("Unknown embedding models!") 51 | 52 | def is_punct(self, word): 53 | arr_list = [ 54 | '!', 55 | '"', 56 | '%', 57 | '&', 58 | "'", 59 | "''", 60 | '(', 61 | '(.', 62 | ')', 63 | '*', 64 | '+', 65 | ',', 66 | '-', 67 | '---', 68 | '.', 69 | '..', 70 | '...', 71 | '....', 72 | '/', 73 | ] 74 | if word in arr_list: 75 | return True 76 | else: 77 | return False 78 | 79 | def is_number(self, word): 80 | regex = r"^[0-9]+" 81 | matches = re.finditer(regex, word, re.MULTILINE) 82 | matchNum = 0 83 | for matchNum, match in enumerate(matches): 84 | matchNum = matchNum + 1 85 | if matchNum > 0: 86 | return True 87 | else: 88 | return False 89 | 90 | def set_char_model(self, char_model): 91 | self.char_model = char_model 92 | 93 | def load_vocabs_list(self, vocab_file_path): 94 | """ 95 | Load vocabs list for private w2v model. Has to be pickle file. 96 | :param vocab_file_path: 97 | :return: 98 | """ 99 | if vocab_file_path: 100 | self.vocabs_list = file_utils.load_obj(vocab_file_path) 101 | 102 | def get_char_vector(self, char_model, word): 103 | """ 104 | char_model here is an instance of embedding_model 105 | :param char_model: an instance of embedding_model 106 | :param word: 107 | :return: 108 | """ 109 | if char_model is None: 110 | # Sonvx on March 20, 2019: we now allow the char_model is None, 111 | # cannot call this get_char_vector in such case. 112 | raise Exception("Char_model is None! Cannot use character-embedding.") 113 | 114 | out_char_2_vec = [] 115 | char_vecs = [] 116 | chars = list(word) 117 | vecs = [] 118 | for c in chars: 119 | if c in char_model.model: 120 | emb_vector = char_model.model[c] 121 | vecs.append(emb_vector) 122 | if DEBUG: 123 | input(">>>>>>") 124 | print("Char_emb_vector=", emb_vector) 125 | 126 | # char_vecs.extend(list(vecs)) 127 | 128 | if len(vecs) > 0: 129 | out_char_2_vec = np.mean(vecs, axis=0) 130 | 131 | if DEBUG: 132 | print(">>> Output of char2vec: %s"%(out_char_2_vec)) 133 | input(">>>> outc2v ...") 134 | 135 | return out_char_2_vec 136 | 137 | def is_unknown_word(self, word): 138 | """Check whether or not a word is unknown""" 139 | is_unknown_word = False 140 | if self.vocabs_list is not None: 141 | if word not in self.vocabs_list: 142 | is_unknown_word = True 143 | else: 144 | if word not in self.model: 145 | is_unknown_word = True 146 | return is_unknown_word 147 | 148 | def get_word_vector(self, word): 149 | """ 150 | Handle unknown word: In case of our private word2vec, we have a vocabs_list to check. With regular models, 151 | we can check inside the model. Note that by default, we use char-model to handle unknown words. 152 | :param word: 153 | :param char_model: 154 | :return: 155 | """ 156 | 157 | rtn_vector = [] 158 | 159 | # try first time with normal case 160 | is_unknown_word = self.is_unknown_word(word) 161 | 162 | # try 2nd times with lowercase. 163 | if is_unknown_word: 164 | word = word.lower() 165 | is_unknown_word = self.is_unknown_word(word) 166 | 167 | # unknown word 168 | if is_unknown_word and self.char_model: 169 | # Sonvx on March 20, 2019: solve unknown only when char_model is SET. 
170 | rtn_vector = self.get_vector_of_unknown(word) 171 | else: 172 | # normal case 173 | if self.name == Model_Constants.word2vec: 174 | rtn_vector = self.model[word] 175 | 176 | # For now we have self.vector_dim, max_dim, and len(rtn_vector) 177 | # Update: move to use self.vector_dim only 178 | if len(rtn_vector) > self.vector_dim: 179 | print("Warning: auto trim to %s/%s dimensions"%(self.vector_dim, len(rtn_vector))) 180 | rtn_vector = self.model[word][:self.vector_dim] 181 | 182 | elif self.name == Model_Constants.elmo: 183 | rtn_vector = self.model[word] 184 | 185 | if self.vector_dim == len(rtn_vector)/2: 186 | vector1 = rtn_vector[:self.vector_dim] 187 | vector2 = rtn_vector[self.vector_dim:] 188 | print("Notice: auto average to b[i] = (a[i] + a[i + %s])/2 /%s dimensions" % (self.vector_dim, 189 | len(rtn_vector))) 190 | rtn_vector = np.mean([vector1, vector2], 0) 191 | elif len(rtn_vector) > self.vector_dim: 192 | print("Warning: auto trim to %s/%s dimensions" % (self.vector_dim, len(rtn_vector))) 193 | rtn_vector = self.model[word][:self.vector_dim] 194 | 195 | elif self.name == Model_Constants.char2vec: 196 | rtn_vector = self.get_char_vector(self, word) 197 | 198 | elif self.name == Model_Constants.private_word2vec: 199 | # Handle unknown word - Not need for now since we handle unknown words first 200 | if word not in self.vocabs_list: 201 | word = "UNK" 202 | word_idx = self.vocabs_list.index(word) 203 | emb_vector = self.model[word_idx] 204 | rtn_vector = emb_vector 205 | 206 | # final check before returning vector 207 | if DEBUG: 208 | print(">>> DEBUG: len(rtn_vector) = %s" % (len(rtn_vector))) 209 | input(">>> before returning vector ...") 210 | if len(rtn_vector) < 1: 211 | return np.zeros(self.vector_dim) 212 | else: 213 | if len(rtn_vector) == self.vector_dim: 214 | return rtn_vector 215 | # TODO: find a better way to represent unknown word by character to have same-size with word-vector-size 216 | # For now, I add 0 to the [current-len, expected-len] 217 | else: 218 | logging.debug("Model name = %s, Current word = %s, Current size = %s, expected size = %s" 219 | %(self.name, word, len(rtn_vector), self.vector_dim)) 220 | return np.append(rtn_vector, np.zeros(self.vector_dim - len(rtn_vector))) 221 | 222 | def get_vector_of_unknown(self, word): 223 | """ 224 | If word is UNK, use char_vector model instead. 
:param word:
226 |         :return:
227 |         """
228 |         # Here we handle features based on the w2v model, where numbers and punctuation are
229 |         # encoded as placeholder tokens ("<number>"/"<punct>" are assumed names; the original tokens were lost in rendering).
230 |         if self.name == Model_Constants.word2vec:
231 |             if self.is_number(word):
232 |                 rtn_vector = self.model["<number>"]
233 |             elif self.is_punct(word):
234 |                 rtn_vector = self.model["<punct>"]
235 |             else:
236 |                 rtn_vector = self.get_char_vector(self.char_model, word)
237 |
238 |             if rtn_vector is not None:
239 |                 if len(rtn_vector) > self.vector_dim:
240 |                     print("Warning: auto trim to %s/%s dimensions" % (self.vector_dim, len(rtn_vector)))
241 |                     return rtn_vector[:self.vector_dim]
242 |                 else:
243 |                     return rtn_vector
244 |         # otherwise, use c2v to build up the embedding vector
245 |         else:
246 |             return self.get_char_vector(self.char_model, word)
247 |
248 |
249 | class Embedding_Models(object):
250 |     """
251 |     Using all available embedding models to generate vectors
252 |     """
253 |     def __init__(self, list_models):
254 |         self.list_models = list_models  # list of embedding_model_objs: ['word2vec', 'char2vec', 'private_word2vec']
255 |
256 |     def add_model(self, emb_model, char_model):
257 |         """
258 |         Add a new model to the collection of embedding models. Note that every model gets a char_model to handle
259 |         unknown words.
260 |         :param emb_model:
261 |         :param char_model:
262 |         :return:
263 |         """
264 |         if char_model is None:
265 |             print("Warning: char_model is None -> cannot solve OOV words. Keep going ...")
266 |             # Sonvx on March 20, 2019: change to allow None char_model
267 |             # raise Exception("char_model cannot be None.")
268 |         if isinstance(emb_model, Embedding_Model):
269 |             emb_model.set_char_model(char_model)
270 |             self.list_models.append(emb_model)
271 |         else:
272 |             raise Exception("Not an instance of embedding_model class.")
273 |
274 |     def get_vector_of_document(self, document):
275 |         """
276 |         Get all embedding vectors for one document
277 |         :param document:
278 |         :return:
279 |         """
280 |         doc_vector = []
281 |         # debug_dict = {}
282 |         # print("len_doc = ", len(document))
283 |         for word in document:
284 |
285 |             all_vectors_of_word = []
286 |
287 |             # get all embedding vectors of a word
288 |             for emb_model in self.list_models:
289 |                 emb_vector = emb_model.get_word_vector(word)
290 |                 # print("len_emb_vector = ", len(emb_vector))
291 |                 all_vectors_of_word.extend(emb_vector)
292 |
293 |             # if word in debug_dict.keys():
294 |             #     debug_dict[word].append(len(emb_vector))
295 |             # else:
296 |             #     debug_dict[word] = [len(emb_vector)]
297 |
298 |             # stack a combined vector of all words
299 |             doc_vector.append(all_vectors_of_word)
300 |
301 |         # print("list of words and emb size = ", debug_dict)
302 |         # get the mean of them to represent a document
303 |         doc_vector = np.mean(doc_vector, axis=0)
304 |
305 |         return doc_vector
306 |
307 |     def get_word_vector_of_multi_embeddings(self, word):
308 |         """
309 |         Concatenate the embedding vectors of all models for one word.
310 |         :param word:
311 |         :return:
312 |         """
313 |         word_vector = []
314 |         for emb_model in self.list_models:
315 |             emb_vector = emb_model.get_word_vector(word)
316 |             word_vector.extend(emb_vector)
317 |
318 |         return word_vector
319 |
320 |
321 |
322 |
323 |
324 |
-------------------------------------------------------------------------------- /src/codes/embeddings/embedding_utils.py: --------------------------------------------------------------------------------
1 | import os
2 | from utils import file_utils
3 | from embeddings.embedding_models import Embedding_Model, Embedding_Models
4 |
5 |
6 | def
reload_char2vec_model(model_path, model_dim): 7 | char_model = Embedding_Model("char2vec", model_dim) 8 | char_model.load_model(model_path) 9 | return char_model 10 | 11 | 12 | def reload_embedding_models(model_paths_list, model_names_list, model_dims_list, char_model): 13 | """ 14 | Reload collection of embedding models to serve feature extraction task. 15 | :param model_paths_list: 16 | :param model_names_list: 17 | :param model_dims_list: 18 | :param char_model: 19 | :return: 20 | """ 21 | # model path list and name list must be equal. 22 | print("model_paths_list = ", model_paths_list) 23 | print("model_formats_list = ", model_names_list) 24 | assert (len(model_names_list) == len(model_paths_list)), "Not equal length" 25 | assert (len(model_names_list) == len(model_dims_list)), "Not equal length" 26 | 27 | all_emb_models = Embedding_Models([]) 28 | 29 | for model_idx in range(len(model_paths_list)): 30 | # get model path based on index 31 | model_path = model_paths_list[model_idx] 32 | model_name = model_names_list[model_idx] 33 | model_dim = model_dims_list[model_idx] 34 | 35 | if model_path is not None: 36 | 37 | emb_model = Embedding_Model(model_name, model_dim) 38 | emb_model.load_model(model_path) 39 | 40 | # add to final list of emb_models 41 | all_emb_models.add_model(emb_model, char_model) 42 | 43 | return all_emb_models 44 | 45 | 46 | def save_embedding_models_tofolder(dir_path, final_embeddings, reverse_dictionary, vocabulary_size): 47 | """ 48 | Save all trained word-embedding model of the custom word2vec. 49 | :param final_embeddings: 50 | :param reverse_dictionary: 51 | :param vocabulary_size: 52 | :return: 53 | """ 54 | 55 | if not os.path.exists(dir_path): 56 | os.makedirs(dir_path) 57 | 58 | def save_to_word2vec_model(vocabs_list): 59 | # print("Saving word2vec format ...") 60 | filewriter = open(os.path.join(dir_path, "word2vec.txt"), "w", encoding="utf-8") 61 | 62 | filewriter.write("%s %s\n" % (len(vocabs_list), len(final_embeddings[0]))) 63 | for word in vocabs_list: 64 | word_idx = vocabs_list.index(word) 65 | emb_vector = final_embeddings[word_idx] 66 | line = ' '.join(["%s" % (x) for x in emb_vector]) 67 | filewriter.write(word + " " + line + "\n") 68 | 69 | filewriter.close() 70 | # print("Done!") 71 | 72 | file_utils.save_obj(final_embeddings, os.path.join(dir_path, "final_embeddings")) 73 | # We don't need to save reversed_dictionary 74 | # file_utils.save_obj(reverse_dictionary, os.path.join(FLAGS.trained_models, "reversed_dictionary")) 75 | vocab_list = [reverse_dictionary[i] for i in range(vocabulary_size)] 76 | save_to_word2vec_model(vocab_list) 77 | file_utils.save_obj(vocab_list, os.path.join(dir_path, "words_dictionary")) 78 | 79 | 80 | def save_embedding_models(FLAGS, final_embeddings, reverse_dictionary, vocabulary_size): 81 | """ 82 | Keep for old implementation. 83 | :param FLAGS: 84 | :param final_embeddings: 85 | :param reverse_dictionary: 86 | :param vocabulary_size: 87 | :return: 88 | """ 89 | save_embedding_models_tofolder(FLAGS.trained_models, final_embeddings, 90 | reverse_dictionary, vocabulary_size) 91 | 92 | 93 | def reload_embeddings(trained_models_dir): 94 | """ 95 | Reload trained word-embedding model of the custom word2vec. 
96 |     :param trained_models_dir:
97 |     :return:
98 |     """
99 |     final_embeddings = file_utils.load_obj(os.path.join(trained_models_dir, "final_embeddings"))
100 |     # reverse_dictionary = file_utils.load_obj(os.path.join(trained_models_dir, "reversed_dictionary"))
101 |     reverse_dictionary = None
102 |     labels = file_utils.load_obj(os.path.join(trained_models_dir, "words_dictionary"))
103 |     return final_embeddings, reverse_dictionary, labels
104 |
105 |
106 | def create_single_utf8_file(input_dir, output_file):
107 |     import glob
108 |     # path = './wiki_data/*.txt'
109 |     # out = './wiki_all.vi.utf8.txt'
110 |     files = glob.glob(input_dir)
111 |     for file in files:
112 |         with open(output_file, "a", encoding="utf-8") as myfile:
113 |             with open(file, "r", encoding="utf-8", errors="ignore") as fp:
114 |                 for line in fp:
115 |                     line = line.strip().lower()
116 |                     # Python 3 strings are already unicode; no decode()/encode() round-trip needed.
117 |                     myfile.write(line + "\n")
118 |     print("done")
119 |
-------------------------------------------------------------------------------- /src/codes/etnlp_api.py: --------------------------------------------------------------------------------
1 | import argparse
2 | from api import embedding_preprocessing, embedding_evaluator, embedding_extractor, embedding_visualizer
3 | from visualizer import visualizer_sbs
4 | import logging
5 | import os
6 | from embeddings.embedding_configs import EmbeddingConfigs
7 | __version__ = "0.1.3"
8 |
9 |
10 | embedding_config = EmbeddingConfigs()
11 |
12 | if __name__ == "__main__":
13 |     """
14 |     ETNLP: a toolkit to evaluate, extract, and visualize multiple word embeddings
15 |     """
16 |
17 |     _desc = "Evaluates a word embedding model"
18 |     _parser = argparse.ArgumentParser(description=_desc)
19 |     _parser.add_argument("-input",
20 |                          required=True,
21 |                          default="../data/embedding_dicts/elmo_embeddings.txt",
22 |                          #
23 |                          help="Input embedding file(s) in word2vec format; separate multiple files with ';'")
24 |     _parser.add_argument("-analoglist",
25 |                          nargs="?",
26 |                          # default="../data/embedding_analogies/vi/analogy_vn_seg.txt.std.txt",
27 |                          default="./data/embedding_analogy/solveable_analogies_vi.txt",
28 |                          help="Word analogy test file")
29 |
30 |     _parser.add_argument("-args",
31 |                          nargs="?",
32 |                          default="eval",
33 |                          help="Task to run: eval, visualizer, extract, or glove2w2v")
34 |
35 |     _parser.add_argument("-lang",
36 |                          nargs="?",
37 |                          default="VI",
38 |                          help="Specify the language; by default, it's Vietnamese.")
39 |
40 |     _parser.add_argument("-vocab",
41 |                          nargs="?",
42 |                          default="../data/vocab.txt",
43 |                          help="Vocab to be extracted")
44 |
45 |     _parser.add_argument("-port",
46 |                          nargs="?",
47 |                          default=8889,
48 |                          help="Port for visualization")
49 |
50 |     _parser.add_argument("-input_c2v",
51 |                          nargs="?",
52 |                          default=None,
53 |                          help="C2V embedding")
54 |
55 |     _parser.add_argument("-output",
56 |                          nargs="?",
57 |                          default="../data/embedding_analogies/vi/results_out.txt",
58 |                          help="Output file of word analogy task")
59 |
60 |     _parser.add_argument("-output_format",
61 |                          nargs="?",
62 |                          default=".txt",
63 |                          help="Format of output file of the extracted embedding.")
64 |
65 |     _args = _parser.parse_args()
66 |
67 |     # Set logging level
68 |     logging.basicConfig(level=logging.INFO)
69 |     logging.disable(logging.INFO)
70 |     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '5'
71 |
72 |     input_embedding_files_str = _args.input
73 |     analoglist = _args.analoglist
74 |     is_vietnamese = _args.lang
75 |     output_files_str = _args.output
76 |     options_str = _args.args
77 |     vocab_file = _args.vocab
78 |     output_format = _args.output_format
79 |     port = _args.port
80 |
81 |     # By default, we process all embeddings as word2vec format.
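    # Example invocations (illustrative only; the file names below are placeholders):
    #   python etnlp_api.py -input "emb1.vec;emb2.vec" -analoglist analogies.txt -args eval -output results.txt
    #   python etnlp_api.py -input "emb1.vec;emb2.vec" -vocab vocab.txt -args extract -output multi.vec -output_format ".txt;.npz"
    #   python etnlp_api.py -input emb1.vec -args visualizer -port 8889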
82 | embedding_preprocessing.is_word2vec_format = True 83 | 84 | if options_str == 'eval': 85 | print("Starting evaluator ...") 86 | embedding_evaluator.evaluator_api(input_files=input_embedding_files_str, analoglist=analoglist, 87 | output=output_files_str) 88 | print("Done evaluator !") 89 | elif options_str == 'visualizer': 90 | print("Starting visualizer ...") 91 | embedding_visualizer.visualize_multiple_embeddings(input_embedding_files_str, port) 92 | print("Done visualizer !") 93 | elif options_str.startswith("extract"): 94 | print("Starting extractor ...") 95 | embedding_extractor.extract_embedding_for_vocab_file(input_embedding_files_str, vocab_file, 96 | _args.input_c2v, output_files_str, output_format) 97 | print("Done extractor !") 98 | elif options_str.startswith("glove2w2v"): 99 | print("Starting pre-processing: convert to word2vec format ...") 100 | embedding_config.is_word2vec_format = False 101 | if options_str.__contains__("do_normalize"): 102 | embedding_config.do_normalize_emb = True 103 | else: 104 | embedding_config.do_normalize_emb = False 105 | embedding_preprocessing.load_and_save_2_word2vec_models(input_embedding_files_str, 106 | output_files_str, 107 | embedding_config) 108 | 109 | else: 110 | print("Invalid options") 111 | 112 | print("Done!") 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /src/codes/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim==3.4.0 2 | scipy==1.1.0 3 | six==1.12.0 4 | setuptools==40.6.2 5 | tensorflow==1.12.0 6 | Flask==1.0.2 7 | tensorboard==1.12.0 8 | numpy==1.15.4 9 | scikit_learn==0.20.3 10 | typing==3.6.6 11 | -------------------------------------------------------------------------------- /src/codes/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from etnlp_api import __version__ 3 | 4 | 5 | with open("../../README.md", "r") as fh: 6 | long_description = fh.read() 7 | 8 | setup( 9 | name='ETNLP', 10 | version=__version__, 11 | # packages=['api', 'utils', 'embeddings', 'visualizer'], 12 | packages=find_packages(), 13 | py_modules=['etnlp_api'], 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | url='https://github.com/vietnlp/etnlp', 17 | license='MIT', 18 | author='vietnlp', 19 | author_email='sonvx.coltech@gmail.com', 20 | description='ETNLP: Embedding Toolkit for NLP Tasks' 21 | ) 22 | # from setuptools import setup, find_packages 23 | # import sys 24 | # 25 | # with open('requirements.txt') as f: 26 | # reqs = f.read() 27 | # setup( 28 | # name='ETNLP', 29 | # version='0.1.0', 30 | # description='ETNLP: Embedding Toolkit for NLP Tasks', 31 | # python_requires='>=3.5', 32 | # packages=find_packages(exclude=('data')), 33 | # install_requires=reqs.strip().split('\n'), 34 | # ) 35 | -------------------------------------------------------------------------------- /src/codes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/utils/__init__.py -------------------------------------------------------------------------------- /src/codes/utils/emb_utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics.pairwise import cosine_similarity 2 | from typing import Any, Iterable, 
List, Optional, Set, Tuple 3 | 4 | from utils.vectors import Vector 5 | from utils import vectors 6 | from utils.word import Word 7 | from utils import eval_utils 8 | from gensim import utils as genutils 9 | import logging 10 | import numpy as np 11 | from scipy import stats 12 | 13 | # Timing info for most_similar (100k words): 14 | # Original version: 7.3s 15 | # Normalized vectors: 3.4s 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def most_similar(base_vector: Vector, words: List[Word]) -> List[Tuple[float, Word]]: 20 | """Finds n words with smallest cosine similarity to a given word""" 21 | words_with_distance = [(vectors.cosine_similarity_normalized(base_vector, w.vector), w) for w in words] 22 | # We want cosine similarity to be as large as possible (close to 1) 23 | sorted_by_distance = sorted(words_with_distance, key=lambda t: t[0], reverse=True) 24 | # Sonvx: remove duplications (not understand why yet, probably because the w2v?) 25 | # sorted_by_distance = list(set(sorted_by_distance)) 26 | return sorted_by_distance 27 | 28 | 29 | def print_most_similar(words: List[Word], text: str) -> None: 30 | base_word = find_word(text, words) 31 | if not base_word: 32 | print("Unknown word: %s"%(text)) 33 | return 34 | print("Words related to %s:" % (base_word.text)) 35 | sorted_by_distance = [ 36 | word.text for (dist, word) in 37 | most_similar(base_word.vector, words) 38 | if word.text.lower() != base_word.text.lower() 39 | ] 40 | print(', '.join(sorted_by_distance[:10])) 41 | 42 | 43 | def read_word() -> str: 44 | return input("Type a word: ") 45 | 46 | 47 | def find_word(text: str, words: List[Word]) -> Optional[Word]: 48 | try: 49 | return next(w for w in words if text == w.text) 50 | except StopIteration: 51 | return None 52 | 53 | 54 | def closest_analogies_OLD( 55 | left2: str, left1: str, right2: str, words: List[Word] 56 | ) -> List[Tuple[float, Word]]: 57 | word_left1 = find_word(left1, words) 58 | word_left2 = find_word(left2, words) 59 | word_right2 = find_word(right2, words) 60 | if (not word_left1) or (not word_left2) or (not word_right2): 61 | return [] 62 | vector = vectors.add( 63 | vectors.sub(word_left1.vector, word_left2.vector), 64 | word_right2.vector) 65 | closest = most_similar(vector, words)[:10] 66 | 67 | def is_redundant(word: str) -> bool: 68 | """ 69 | Sometimes the two left vectors are so close the answer is e.g. 70 | "shirt-clothing is like phone-phones". Skip 'phones' and get the next 71 | suggestion, which might be more interesting. 72 | """ 73 | word_lower = word.lower() 74 | return ( 75 | left1.lower() in word_lower or 76 | left2.lower() in word_lower or 77 | right2.lower() in word_lower) 78 | 79 | closest_filtered = [(dist, w) for (dist, w) in closest if not is_redundant(w.text)] 80 | return closest_filtered 81 | 82 | 83 | def closest_analogies_vectors( 84 | word_left2: Word, word_left1: Word, word_right2: Word, words: List[Word]) \ 85 | -> List[Tuple[float, Word]]: 86 | """ 87 | Sonvx: 88 | :param word_left2: 89 | :param word_left1: 90 | :param word_right2: 91 | :param words: 92 | :param remove_redundancy: remove suggestions if they contain the given words. 93 | :return: 94 | """ 95 | # print(">>>> Remove redundancy = ", remove_redundancy) 96 | # input(">>>>") 97 | vector = vectors.add( 98 | vectors.sub(word_left1.vector, word_left2.vector), 99 | word_right2.vector) 100 | closest = most_similar(vector, words)[:10] 101 | 102 | def is_redundant(word: str) -> bool: 103 | """ 104 | Sometimes the two left vectors are so close the answer is e.g. 
105 | "shirt-clothing is like phone-phones". Skip 'phones' and get the next 106 | suggestion, which might be more interesting. 107 | """ 108 | word_lower = word.lower() 109 | return ( 110 | word_left1.text.lower() in word_lower or 111 | word_left2.text.lower() in word_lower or 112 | word_right2.text.lower() in word_lower) 113 | # It doesn't work this way for Vietnamese, so we try both of this to test for now 114 | if False: 115 | closest_filtered = [(dist, w) for (dist, w) in closest if not is_redundant(w.text)] 116 | else: 117 | closest_filtered = closest 118 | return closest_filtered 119 | 120 | 121 | def get_avg_vector(word, embedding_words): 122 | 123 | if " " in word: 124 | single_words = word.split(" ") 125 | list_vector = [] 126 | 127 | for single_word in single_words: 128 | word_vec = find_word(single_word, embedding_words) 129 | if word_vec: 130 | list_vector.append(word_vec.vector) 131 | else: 132 | # Try again with lowercase 133 | single_word = single_word.lower() 134 | word_vec = find_word(single_word, embedding_words) 135 | if word_vec: 136 | list_vector.append(word_vec.vector) 137 | 138 | # print("list_vector: ", list_vector) 139 | # input(">>>>>>>>") 140 | 141 | returned_Word = Word(word, vectors.mean_list(list_vector), 1) 142 | else: 143 | returned_Word = find_word(word, embedding_words) 144 | 145 | # print("Avg returned vector = ", returned_vector) 146 | # input(">>>>") 147 | 148 | return returned_Word 149 | 150 | 151 | def run_paired_ttests(all_map_arr, embedding_names): 152 | """ 153 | Run Paired t-tests on MAP results 154 | :param all_map_arr: 155 | :param embedding_names: 156 | :return: 157 | """ 158 | str_out = "" 159 | num_embs = len(all_map_arr) 160 | 161 | # Verify to make sure they have the same length 162 | if all_map_arr and embedding_names: 163 | for i in range(0, num_embs - 1): 164 | for j in range(i + 1, num_embs): 165 | if len(all_map_arr[i]) != len(all_map_arr[j]): 166 | raise Exception("Two embedding (%s, %s) have different MAP list, sizes: %s vs. %s" 167 | % (embedding_names[i], embedding_names[j], len(all_map_arr[i]), len(all_map_arr[j]))) 168 | else: 169 | logging.error("Inputs are NULL") 170 | 171 | result_str_ttest_arr = [] 172 | for i in range(0, num_embs - 1): 173 | for j in range(i + 1, num_embs): 174 | stat_test_ret = stats.ttest_rel(all_map_arr[i], all_map_arr[j]) 175 | # if stat_test_ret.pvalue >= 0.05: 176 | result = "%s vs. 
%s: %s" % (embedding_names[i], embedding_names[j], stat_test_ret) 177 | str_out += result + "\n" 178 | 179 | return str_out 180 | 181 | 182 | def eval_word_analogy_4_all_embeddings(word_analogies_file, embedding_names: List[str], 183 | word_embeddings: List[List[Word]], output_file): 184 | """ 185 | Run word analogy for all embeddings 186 | :param word_analogies_file: 187 | :param embedding_names: 188 | :param word_embeddings: 189 | :param output_file: 190 | :return: 191 | """ 192 | fwriter = open(output_file, "w") 193 | idx = 0 194 | all_map_arr = [] 195 | console_output_str = "" 196 | category = ": | Word Analogy Task results\n" 197 | fwriter.write(category) 198 | console_output_str += category 199 | 200 | for word_embedding in word_embeddings: 201 | embedding_name = embedding_names[idx] 202 | map_at_10, map_arr, result_str = eval_word_analogies(word_analogies_file, word_embedding, embedding_name) 203 | all_map_arr.append(map_arr) 204 | meta_info = "\nEmbedding: %s"%(embedding_names[idx]) 205 | fwriter.write(meta_info + "\n") 206 | fwriter.write(result_str) 207 | fwriter.write("MAP_arr = %s"%(map_arr)) 208 | fwriter.write("MAP@10 = %s" % (map_at_10)) 209 | fwriter.flush() 210 | console_output_str += meta_info + "\n" + "MAP@10 = %s" % (map_at_10) + "\n" 211 | idx += 1 212 | 213 | # Getting significant Paired t-tests 214 | category = "\n: | Paired t-tests results\n" 215 | fwriter.write(category) 216 | console_output_str += category 217 | ttests_result = run_paired_ttests(all_map_arr, embedding_names) 218 | console_output_str += ttests_result 219 | fwriter.write(ttests_result) 220 | fwriter.flush() 221 | fwriter.close() 222 | 223 | return console_output_str 224 | 225 | 226 | def eval_word_analogies(word_analogies_file, words: List[Word], embedding_name): 227 | """ 228 | Sonvx: Evaluate word analogy for one embedding. 
229 | :param word_analogies_file: 230 | :param words: 231 | :return: 232 | """ 233 | # input("GO checking >>>>") 234 | oov_counter, idx_cnt, is_vn_counter, phrase_cnt = 0, -1, 0, 0 235 | sections, section = [], None 236 | # map_arr = [] 237 | out_str = "" 238 | map_ret_dict = {} 239 | 240 | for line_no, line in enumerate(genutils.smart_open(word_analogies_file)): 241 | # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed 242 | line = genutils.to_unicode(line) 243 | line = line.rstrip() 244 | if line.startswith(': |'): 245 | # a new section starts => store the old section 246 | if section: 247 | sections.append(section) 248 | section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} 249 | else: 250 | # Count number of analogy to check 251 | idx_cnt += 1 252 | 253 | # Set default map value 254 | map_ret_dict[idx_cnt] = 0.0 255 | 256 | if not section: 257 | raise ValueError("missing section header before line #%i in %s" % (line_no, word_analogies_file)) 258 | try: 259 | # a - b + c = expected 260 | # Input: Baghdad | Irac | Bangkok | Thai_Lan 261 | # Baghdad - Irac = Bangkok - Thai_Lan 262 | # -> Baghdad - Irac + Thai_Lan = Bangkok 263 | # => 264 | a, b, expected, c = [word for word in line.split(" | ")] 265 | except ValueError: 266 | logger.debug("SVX: ERROR skipping invalid line #%i in %s", line_no, word_analogies_file) 267 | print("Line : ", line) 268 | print("a, b, c, expected: %s, %s, %s, %s" % (a, b, c, expected)) 269 | # input(">>> Wait ...") 270 | continue 271 | 272 | # In case of Vietnamese, word analogy can be a phrase 273 | if " " in expected: 274 | print("INFO: we don't support to find word analogies for phrase for NOW.") 275 | phrase_cnt += 1 276 | continue 277 | elif " " in a or " " in b or " " in c: 278 | is_vn_counter += 1 279 | word_left1 = get_avg_vector(a, words) 280 | word_left2 = get_avg_vector(b, words) 281 | word_right2 = get_avg_vector(c, words) 282 | else: 283 | word_left1 = find_word(a, words) 284 | word_left2 = find_word(b, words) 285 | word_right2 = find_word(c, words) 286 | 287 | if (not word_left1) or (not word_left2) or (not word_right2): 288 | logger.debug("SVX: skipping line #%i with OOV words: %s", line_no, line.strip()) 289 | oov_counter += 1 290 | continue 291 | 292 | # Write solable analogy to a file 293 | # fsolveable_writer.write(line + "\n") 294 | 295 | logger.debug("word_left1 = %s", word_left1.text) 296 | logger.debug("word_left2 = %s", word_left2.text) 297 | logger.debug("word_right2 = %s", word_right2.text) 298 | 299 | # Start finding close word: 300 | # Note: we can only find 1 expected word in Vietnamese for NOW 301 | top10_candidate = closest_analogies_vectors(word_left2, word_left1, 302 | word_right2, words) 303 | list_candidate_arr = [] 304 | for tuple in top10_candidate: 305 | list_candidate_arr.append(tuple[1].text) 306 | 307 | logger.debug("Expected Word: %s, candidate = %s" % (expected, list_candidate_arr)) 308 | # input(">>>>>") 309 | # Calculate MAP@10 score 310 | this_map_result = eval_utils.mapk(expected, list_candidate_arr, word_level=True) 311 | if this_map_result >= 0: 312 | this_map_result = round(this_map_result, 6) 313 | # map_arr[idx_cnt] = this_map_result 314 | else: 315 | this_map_result = 0.0 316 | # map_arr.append(0.0) 317 | # map_arr[idx_cnt] = this_map_result 318 | 319 | map_ret_dict[idx_cnt] = this_map_result 320 | 321 | if expected in list_candidate_arr: 322 | section['correct'].append((a, b, c, expected)) 323 | out_line = "%s - %s + %s = ?; Expect: %s, candidate: %s" % \ 324 | 
(word_left1, word_left2, word_right2, expected, list_candidate_arr) 325 | out_str += out_line + "\n" 326 | 327 | # else: 328 | # section['incorrect'].append((a, b, c, expected)) 329 | 330 | # fsolveable_writer.close() 331 | if section: 332 | # store the last section, too 333 | sections.append(section) 334 | 335 | map_arr = list(map_ret_dict.values()) 336 | logger.debug("map_arr = ", map_arr) 337 | logger.debug("MAP_RET_DICT = ", map_ret_dict) 338 | # input("Check result dict: >>>>>") 339 | 340 | total = { 341 | "Emb_Name: " + embedding_name + '/OOV/Total/VN_Solveable_Cases/VN_Phrase_Target': 342 | [oov_counter, (idx_cnt + 1), is_vn_counter, phrase_cnt], 343 | 'MAP@10': np.mean(map_arr) 344 | # , 345 | # 'section': 'total' 346 | # , 347 | # 'correct': sum((s['correct'] for s in sections), []), 348 | # 'incorrect': sum((s['incorrect'] for s in sections), []), 349 | } 350 | # print (out_str) 351 | # print(total) 352 | # logger.info(total) 353 | 354 | sections.append(total) 355 | sections_str = "\n%s\n" % sections 356 | 357 | return np.mean(map_arr), map_arr, sections_str 358 | 359 | 360 | def print_analogy(left2: str, left1: str, right2: str, words: List[Word]) -> None: 361 | analogies = closest_analogies_OLD(left2, left1, right2, words) 362 | if (len(analogies) == 0): 363 | # print(f"{left2}-{left1} is like {right2}-?") 364 | print("%s-%s is like %s-?"%(left2, left1, right2)) 365 | # man-king is like woman-king 366 | # input: man is to king is like woman is to ___?(queen). 367 | else: 368 | (dist, w) = analogies[0] 369 | # alternatives = ', '.join([f"{w.text} ({dist})" for (dist, w) in analogies]) 370 | # print(f"{left2}-{left1} is like {right2}-{w.text}") 371 | print("%s-%s is like %s-%s"%(left2, left1, right2, w.text)) 372 | 373 | -------------------------------------------------------------------------------- /src/codes/utils/embedding_io.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List, Set 2 | 3 | from itertools import groupby 4 | import numpy as np 5 | import re 6 | import utils.vectors as v 7 | from utils.word import Word 8 | import logging 9 | import os 10 | from embeddings.embedding_configs import EmbeddingConfigs 11 | 12 | 13 | def save_model_to_file(embedding_model: List[Word], model_file_out: str): 14 | """ 15 | Save loaded model back to file (to remove duplicated items). 
16 |     :param embedding_model:
17 |     :param model_file_out:
18 |     :return:
19 |     """
20 |     fwriter = open(model_file_out, "w")
21 |
22 |     meta_data = "%s %s\n" % (len(embedding_model), len(embedding_model[0].vector))
23 |     fwriter.write(meta_data)
24 |     fwriter.flush()
25 |     for w_Word in embedding_model:
26 |         line = w_Word.text + " " + " ".join(str(scalar) for scalar in w_Word.vector.tolist())
27 |         fwriter.write(line + "\n")
28 |         fwriter.flush()
29 |     fwriter.close()
30 |
31 |
32 | def load_word_embeddings(file_paths: str, emb_config: EmbeddingConfigs) -> List[List[Word]]:
33 |     """
34 |     Sonvx: load multiple embeddings separated by ";", e.g., "emb1.vec;emb2.vec" (illustrative names).
35 |     :param file_paths:
36 |     :param emb_config:
37 |     :return:
38 |     """
39 |     embedding_models = []
40 |     embedding_names = []
41 |     if file_paths and file_paths.__contains__(";"):
42 |         files = file_paths.split(";")
43 |         for emb_file in files:
44 |             word_embedding = load_word_embedding(emb_file.replace("\"", ""), emb_config)
45 |             embedding_name = os.path.basename(os.path.normpath(emb_file))
46 |             embedding_models.append(word_embedding)
47 |             embedding_names.append(embedding_name)
48 |     else:  # single file: keep the same (names, models) return contract
49 |         return [os.path.basename(os.path.normpath(file_paths))], [load_word_embedding(file_paths, emb_config)]
50 |
51 |     return embedding_names, embedding_models
52 |
53 |
54 | def load_word_embedding(file_path: str, emb_config: EmbeddingConfigs) -> List[Word]:
55 |     """
56 |     Load and cleanup the data.
57 |     :param file_path:
58 |     :param emb_config:
59 |     :return:
60 |     """
61 |     # print(f"Loading {file_path}...")
62 |     print("Loading %s ..." % (file_path))
63 |     words = load_words_raw(file_path, emb_config)
64 |     # print(f"Loaded {len(words)} words.")
65 |     print("Loaded %s words." % (len(words)))
66 |
67 |     # Test
68 |     word1 = words[1]
69 |     print("Vec Len(word1) = ", len(word1.vector))
70 |
71 |     # num_dimensions = most_common_dimension(words)
72 |     # words = [w for w in words if len(w.vector) == dims]
73 |     # print(f"Using {num_dimensions}-dimensional vectors, {len(words)} remain.")
74 |
75 |     # words = remove_stop_words(words)
76 |     # print(f"Removed stop words, {len(words)} remain.")
77 |
78 |     # words = remove_duplicates(words)
79 |     # print(f"Removed duplicates, {len(words)} remain.")
80 |
81 |     logging.debug("Embedding words: %s", words[:10])
82 |     print("Emb_vocab_size = ", len(words))
83 |     # input("Done loading embedding: >>>>")
84 |
85 |     return words
86 |
87 |
88 | def load_words_raw(file_path: str, emb_config: EmbeddingConfigs) -> List[Word]:
89 |     """
90 |     Load the file as-is, without doing any validation or cleanup.
91 |     :param file_path:
92 |     :param emb_config:
93 |     :return:
94 |     """
95 |
96 |     def parse_line(line: str, frequency: int) -> Word:
97 |         # print("Line=", line)
98 |         tokens = line.split(" ")
99 |         word = tokens[0]
100 |         if emb_config.do_normalize_emb:
101 |             vector = v.normalize(np.array([float(x) for x in tokens[1:]]))
102 |         else:
103 |             vector = np.array([float(x) for x in tokens[1:]])
104 |         return Word(word, vector, frequency)
105 |
106 |     # Sonvx: do NOT load the same word twice.
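    # (Illustrative) input layout this loader expects -- the numbers are made up:
    #   400000 300            <- header line "vocab_size emb_dim" (word2vec text format)
    #   word_1 0.12 -0.03 ... <- then one word per line, followed by emb_dim floats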
107 |
108 |     unique_dict = {}
109 |
110 |     words = []
111 |     # Words are sorted from the most common to the least common ones
112 |     frequency = 1
113 |
114 |     duplicated_entry = 0
115 |
116 |     idx_counter, vocab_size, emb_dim = 0, 0, 0
117 |     with open(file_path, encoding="utf-8") as f:
118 |         for line in f:
119 |             line = line.rstrip()
120 |
121 |             # print("Processing line: ", line)
122 |
123 |             if idx_counter == 0 and emb_config.is_word2vec_format:
124 |                 try:
125 |                     meta_info = line.split(" ")
126 |                     vocab_size = int(meta_info[0])
127 |                     emb_dim = int(meta_info[1])
128 |                     idx_counter += 1
129 |                     continue
130 |                 except Exception as e:
131 |                     print("meta_info = %s" % (meta_info))
132 |                     logging.error("Input embedding has format issue: Error = %s" % (e))
133 |
134 |             # if len(line) < 20:  # Ignore the first line of w2v format.
135 |             #     continue
136 |
137 |             w = parse_line(line, frequency)
138 |
139 |             # Svx: only load the word if it does not already exist in the list.
140 |             if w.text not in unique_dict:
141 |                 unique_dict[w.text] = frequency
142 |                 words.append(w)
143 |                 frequency += 1
144 |             else:
145 |                 duplicated_entry += 1
146 |                 # print("Loading the same word again")
147 |
148 |             # Svx: check that the embedding dim matches the metadata (spot check on one line only).
149 |             if idx_counter == 10:
150 |                 if len(w.vector) != emb_dim:
151 |                     message = "Metadata and the real vector size do not match: meta:real = %s:%s" \
152 |                               % (emb_dim, len(w.vector))
153 |                     logging.error(message)
154 |                     raise ValueError(message)
155 |             idx_counter += 1
156 |
157 |     if duplicated_entry > 0:
158 |         logging.debug("Loading the same word again: %s" % (duplicated_entry))
159 |
160 |     # Final check:
161 |     if (frequency - 1) != vocab_size:
162 |         msg = "Loaded %s/%s unique vocab." % ((frequency - 1), vocab_size)
163 |         logging.info(msg)
164 |
165 |     return words
166 |
167 |
168 | def iter_len(iter: Iterable[complex]) -> int:
169 |     return sum(1 for _ in iter)
170 |
171 |
172 | def most_common_dimension(words: List[Word]) -> int:
173 |     """
174 |     There is a line in the input file which is missing a word
175 |     (search -0.0739, -0.135, 0.0584).
176 |     """
177 |     lengths = sorted([len(word.vector) for word in words])
178 |     dimensions = [(k, iter_len(v)) for k, v in groupby(lengths)]
179 |     print("Dimensions:")
180 |     for (dim, num_vectors) in dimensions:
181 |         # print(f"{num_vectors} {dim}-dimensional vectors")
182 |         print("%s %s-dimensional vectors" % (num_vectors, dim))
183 |     most_common = sorted(dimensions, key=lambda t: t[1], reverse=True)[0]
184 |     return most_common[0]
185 |
186 |
187 | # We want to ignore these characters,
188 | # so that e.g. "U.S.", "U.S", "US_" and "US" are the same word.
189 | ignore_char_regex = re.compile("[\W_]") 190 | 191 | # Has to start and end with an alphanumeric character 192 | is_valid_word = re.compile("^[^\W_].*[^\W_]$") 193 | 194 | 195 | def remove_duplicates(words: List[Word]) -> List[Word]: 196 | seen_words: Set[str] = set() 197 | unique_words: List[Word] = [] 198 | for w in words: 199 | canonical = ignore_char_regex.sub("", w.text) 200 | if not canonical in seen_words: 201 | seen_words.add(canonical) 202 | # Keep the original ordering 203 | unique_words.append(w) 204 | return unique_words 205 | 206 | 207 | def remove_stop_words(words: List[Word]) -> List[Word]: 208 | return [w for w in words if ( 209 | len(w.text) > 1 and is_valid_word.match(w.text))] 210 | 211 | 212 | # Run "smoke tests" on import 213 | assert [w.text for w in remove_stop_words([ 214 | Word('a', [], 1), 215 | Word('ab', [], 1), 216 | Word('-ab', [], 1), 217 | Word('ab_', [], 1), 218 | Word('a.', [], 1), 219 | Word('.a', [], 1), 220 | Word('ab', [], 1), 221 | ])] == ['ab', 'ab'] 222 | assert [w.text for w in remove_duplicates([ 223 | Word('a.b', [], 1), 224 | Word('-a-b', [], 1), 225 | Word('ab_+', [], 1), 226 | Word('.abc...', [], 1), 227 | ])] == ['a.b', '.abc...'] 228 | -------------------------------------------------------------------------------- /src/codes/utils/eval_utils.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | MAP@K word level and character level are explained in detail in this paper: 4 | 5 | dpUGC: Learn Differentially Private Representationfor User Generated Contents 6 | Xuan-Son Vu, Son N. Tran, Lili Jiang 7 | In: Proceedings of the 20th International Conference on Computational Linguistics and 8 | Intelligent Text Processing, April, 2019, (to appear) 9 | 10 | Please cite the above paper if you use codes in this file. 11 | 12 | """ 13 | 14 | 15 | def apk(actual, predicted, k=10): 16 | """ 17 | Computes the average precision at k. 18 | This function computes the average prescision at k between two lists of 19 | items. 20 | Parameters 21 | ---------- 22 | actual : list 23 | A list of elements that are to be predicted (order doesn't matter) 24 | predicted : list 25 | A list of predicted elements (order does matter) 26 | k : int, optional 27 | The maximum number of predicted elements 28 | Returns 29 | ------- 30 | score : double 31 | The average precision at k over the input lists 32 | """ 33 | if len(predicted) > k: 34 | predicted = predicted[:k] 35 | 36 | score = 0.0 37 | num_hits = 0.0 38 | 39 | for i, p in enumerate(predicted): 40 | if p in actual and p not in predicted[:i]: 41 | num_hits += 1.0 42 | score += num_hits / (i + 1.0) 43 | 44 | if not actual: 45 | return 0.0 46 | 47 | return score / min(len(actual), k) 48 | 49 | 50 | def mapk(actual, predicted, k=10, word_level=True): 51 | """ 52 | Computes the mean average precision at k. 53 | This function computes the mean average prescision at k between two lists 54 | of lists of items. 
55 |     Parameters
56 |     ----------
57 |     actual : list
58 |         A list of lists of elements that are to be predicted
59 |         (order doesn't matter in the lists)
60 |     predicted : list
61 |         A list of lists of predicted elements
62 |         (order matters in the lists)
63 |     k : int, optional
64 |         The maximum number of predicted elements
65 |     Returns
66 |     -------
67 |     score : double
68 |         The mean average precision at k over the input lists
69 |     """
70 |     # print("Sending arr = ", arr)
71 |     if word_level:
72 |         return calc_map(actual, predicted, topK=k)
73 |     else:
74 |         # arr = [apk(a, p, k) for a, p in zip(actual, predicted)]
75 |         # return np.mean(arr)
76 |         return calc_map_character_level(actual, predicted, topK=k)
77 |
78 |
79 | def calc_map(actual, predicted, topK=10):
80 |     """
81 |     Word-level MAP@K: mean of the precision values at each hit among the top-K predictions.
82 |     :param actual:
83 |     :param predicted:
84 |     :param topK:
85 |     :return:
86 |     """
87 |     # print("Input: actual %s, predicted %s" % (actual, predicted))
88 |     if len(predicted) > topK:
89 |         predicted = predicted[:topK]
90 |     idx = 1
91 |     hit = 0
92 |     map_arr = []
93 |     for answer in predicted:
94 |         if answer in actual[:topK]:
95 |             hit += 1
96 |             val = (hit * 1.0) / (idx * 1.0)
97 |             # print("hit = %s, idx = %s" % (hit, idx))
98 |             map_arr.append(val)
99 |             # print("hit: %s, map_arr = %s" % (answer, map_arr))
100 |         idx += 1
101 |     # print("map_arr = %s done", map_arr)
102 |     if len(map_arr) > 0:
103 |         return np.mean(map_arr)
104 |     else:
105 |         return 0.0
106 |
107 |
108 | def calc_map_character_level(actual, predicted, topK=10):
109 |     """
110 |     Character-level MAP@K over the concatenated character sequences.
111 |     :param actual:
112 |     :param predicted:
113 |     :param topK:
114 |     :return:
115 |     """
116 |     # print("Input: actual %s, predicted %s" % (actual, predicted))
117 |     if len(predicted) > topK:
118 |         predicted = predicted[:topK]
119 |
120 |     if len(actual) > topK:
121 |         actual = actual[:topK]
122 |
123 |     rank = 1
124 |     hit = 0
125 |     actual_seq = ''.join([word for word in actual])
126 |     predicted_seq = ''.join([word for word in predicted])
127 |     map_arr = []
128 |     for char in predicted_seq:
129 |         if char in actual_seq[:rank]:
130 |             hit += 1
131 |             val = (hit * 1.0) / (rank * 1.0)
132 |             # print("hit = %s, idx = %s" % (hit, rank))
133 |             map_arr.append(val)
134 |             # print("hit: %s, map_arr = %s" % (char, map_arr))
135 |         rank += 1
136 |     # print("map_arr = %s done", map_arr)
137 |     return np.mean(map_arr)
138 |
139 |
140 | import unittest
141 | import numpy as np
142 |
143 |
144 | class TestMapK(unittest.TestCase):
145 |     # Legacy tests written for the classic list-based apk/mapk (see the commented-out
146 |     # branch in mapk above); kept for reference and not wired up by default.
147 |     def test_apk(self):
148 |         self.assertAlmostEqual(apk(list(range(1, 6)), [6, 4, 7, 1, 2], 2), 0.25)
149 |         self.assertAlmostEqual(apk(list(range(1, 6)), [1, 1, 1, 1, 1], 5), 0.2)
150 |         predicted = list(range(1, 21))
151 |         predicted.extend(range(200, 600))
152 |         self.assertAlmostEqual(apk(list(range(1, 100)), predicted, 20), 1.0)
153 |
154 |     def test_mapk(self):
155 |         self.assertAlmostEqual(mapk([list(range(1, 5))], [list(range(1, 5))], 3), 1.0)
156 |         self.assertAlmostEqual(mapk([[1, 3, 4], [1, 2, 4], [1, 3]],
157 |                                     [list(range(1, 6)), list(range(1, 6)), list(range(1, 6))], 3), 0.685185185185185)
158 |         self.assertAlmostEqual(mapk([list(range(1, 6)), list(range(1, 6))],
159 |                                     [[6, 4, 7, 1, 2], [1, 1, 1, 1, 1]], 5), 0.26)
160 |         self.assertAlmostEqual(mapk([[1, 3], [1, 2, 3], [1, 2, 3]],
161 |                                     [list(range(1, 6)), [1, 1, 1], [1, 2, 1]], 3), 11.0 / 18)
162 |
163 |
164 | if __name__ == '__main__':
165 |     a1 = ["1", '2', '3', '4']
166 |     b1 = ['1', '5', '2', '8']
167 |     print(mapk(a1, b1, 4))
168 |
169 |     a1 = ["15"]
170 |     b1 = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
171 |
172 |     print("MapK:", mapk(a1, b1, 4))
173 |
174 |     # unittest.main()
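As a quick sanity check of the word-level scoring above, here is a minimal usage sketch (assuming it is run from `src/codes` so the `utils` package is importable; the words are made-up examples):

```python
from utils.eval_utils import mapk

# The expected answer "queen" sits at rank 2 of the 10 candidates, so the single
# hit contributes a precision of 1/2 and the word-level MAP@10 is 0.5.
expected = ["queen"]
candidates = ["king", "queen", "prince", "duke", "empress",
              "monarch", "throne", "royal", "kingdom", "crown"]
print(mapk(expected, candidates, k=10))  # -> 0.5
```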
-------------------------------------------------------------------------------- /src/codes/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | 4 | def save_obj(obj, file_path): 5 | with open(file_path + '.pkl', 'wb') as f: 6 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 7 | 8 | 9 | def load_obj(file_path): 10 | with open(file_path + '.pkl', 'rb') as f: 11 | return pickle.load(f) 12 | 13 | 14 | def get_unique_vocab(analogy_file_path, write_out_file): 15 | """ 16 | 17 | :param analogy_file_path: 18 | :param write_out_file: 19 | :return: 20 | """ 21 | vocab_dict = {} 22 | with open(analogy_file_path, "r") as freader: 23 | for line in freader: 24 | if line.__contains__(" | "): 25 | word_parts = line.split(" | ") 26 | for word in word_parts: 27 | word = word.rstrip() 28 | vocab_dict[word] = 0 29 | 30 | fwriter = open(write_out_file, "w") 31 | for word in vocab_dict.keys(): 32 | fwriter.write(word + "\n") 33 | fwriter.close() 34 | 35 | print("Write dictionary file to %s"%(write_out_file)) 36 | 37 | return vocab_dict 38 | 39 | 40 | if __name__ == '__main__': 41 | get_unique_vocab("../data/embedding_analogies/portuguese/LX-4WAnalogies-ETNLP.txt", 42 | "../data/embedding_analogies/portuguese/vocab.txt") 43 | -------------------------------------------------------------------------------- /src/codes/utils/string_utils.py: -------------------------------------------------------------------------------- 1 | import six 2 | 3 | 4 | def convert_to_unicode(text): 5 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 6 | if six.PY3: 7 | if isinstance(text, str): 8 | return text 9 | elif isinstance(text, bytes): 10 | return text.decode("utf-8", "ignore") 11 | else: 12 | raise ValueError("Unsupported string type: %s" % (type(text))) 13 | elif six.PY2: 14 | if isinstance(text, str): 15 | return text.decode("utf-8", "ignore") 16 | elif isinstance(text, unicode): 17 | return text 18 | else: 19 | raise ValueError("Unsupported string type: %s" % (type(text))) 20 | else: 21 | raise ValueError("Not running on Python2 or Python 3?") 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/codes/utils/vectors.py: -------------------------------------------------------------------------------- 1 | from typing import List, Any, Optional 2 | 3 | import math 4 | import numpy as np 5 | 6 | # Adopt from https://github.com/mkonicek/nlp/vecters.py 7 | 8 | # Vector = np.ndarray[float] 9 | Vector = 'np.ndarray[float]' 10 | vector_type = 'np.ndarray[float]' 11 | 12 | # Vector = np.ndarray(dtype=float) 13 | 14 | 15 | def l2_len(v: vector_type) -> float: 16 | return math.sqrt(np.dot(v, v)) 17 | 18 | 19 | def dot(v1: vector_type, v2: vector_type) -> float: 20 | assert v1.shape == v2.shape 21 | return np.dot(v1, v2) 22 | 23 | 24 | def mean(v1: vector_type, v2: vector_type) -> Vector: 25 | """ 26 | Added by Sonvx: get mean of 2 vectors. 27 | :param v1: 28 | :param v2: 29 | :return: 30 | """ 31 | assert v1.shape == v2.shape 32 | return np.mean([v1, v2], axis=0) 33 | 34 | 35 | def mean_list(v1: List[Vector]) -> Vector: 36 | """ 37 | Added by Sonvx: get mean of 2 vectors. 
38 |     :param v1:
39 |     :return: the mean vector, or None for an empty list.
40 |     """
41 |     if len(v1) > 0:
42 |         return np.mean(v1, axis=0)
43 |     else:
44 |         return None
45 |
46 |
47 | def add(v1: vector_type, v2: vector_type) -> Vector:
48 |     assert v1.shape == v2.shape
49 |     return np.add(v1, v2)
50 |
51 |
52 | def sub(v1: vector_type, v2: vector_type) -> Vector:
53 |     assert v1.shape == v2.shape
54 |     return np.subtract(v1, v2)
55 |
56 |
57 | def normalize(v: vector_type) -> Vector:
58 |     return v / l2_len(v)
59 |
60 |
61 | def cosine_similarity_normalized(v1: vector_type, v2: vector_type) -> float:
62 |     """
63 |     Returns the cosine of the angle between the two vectors.
64 |     Each of the vectors must have length (L2-norm) equal to 1.
65 |     Results range from -1 (very different) to 1 (very similar).
66 |     """
67 |     return dot(v1, v2)
68 |
-------------------------------------------------------------------------------- /src/codes/utils/word.py: --------------------------------------------------------------------------------
1 | from typing import List
2 | from utils.vectors import Vector
3 |
4 | # Adopted from https://github.com/mkonicek/nlp/Word.py
5 |
6 |
7 | class Word:
8 |     """A single word (one line of the input vector embedding file)"""
9 |
10 |     def __init__(self, text: str, vector: Vector, frequency: int) -> None:
11 |         self.text = text
12 |         self.vector = vector
13 |         self.frequency = frequency
14 |
15 |     def __repr__(self) -> str:
16 |         vector_preview = ', '.join(map(str, self.vector[:2]))
17 |         # return f"{self.text} [{vector_preview}, ...]"
18 |         return "%s [%s, ...]" % (self.text, vector_preview)
19 |
-------------------------------------------------------------------------------- /src/codes/visualizer/README.md: --------------------------------------------------------------------------------
1 | # Requirements:
2 | - ```pip install gensim flask```
3 | - Download any pre-trained embedding and set its path in ../03.run_etnlp_visualizer_inter.sh
4 |
5 | # How to run
6 | > 1. sh ../03.run_etnlp_visualizer_inter.sh
7 | > 2.
Visit http://localhost:8089 8 | 9 | # Screenshot 10 | 11 | ![Alt text](https://github.com/vietnlp/etnlp/blob/master/images/etnlp_view_multi_embeddings.png "Screenshot example of one given input") 12 | 13 | -------------------------------------------------------------------------------- /src/codes/visualizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/__init__.py -------------------------------------------------------------------------------- /src/codes/visualizer/data/vnex.model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/data/vnex.model.bin -------------------------------------------------------------------------------- /src/codes/visualizer/images/w2v_vn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/images/w2v_vn.png -------------------------------------------------------------------------------- /src/codes/visualizer/images/w2v_vn_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/codes/visualizer/images/w2v_vn_2.png -------------------------------------------------------------------------------- /src/codes/visualizer/outof_w2vec.dict: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 'news' 12 | 13 | 14 | 15 | 16 | 17 | 'news' 18 | 19 | 20 | 21 | 22 | 23 | 'news' 24 | 25 | 26 | 27 | 28 | 29 | 'news' 30 | 31 | 32 | 33 | 34 | 35 | 'news' 36 | 37 | 38 | 39 | 40 | 41 | 42 | 'news' 43 | 44 | 45 | 46 | 47 | 48 | 'news' 49 | 50 | 'back' 51 | 'back' 52 | 'back' 53 | 'back' 54 | 'news' 55 | 'news' 56 | 'back' 57 | 'back' 58 | 'back' 59 | 'back' 60 | 'news' 61 | 'news' 62 | 'back' 63 | 'back' 64 | 'back' 65 | 'back' 66 | 'news' 67 | 'news' 68 | 'lovely' 69 | 'lovely' 70 | 'lovely' 71 | 'lovely' 72 | 'love' 73 | 'love' 74 | -------------------------------------------------------------------------------- /src/codes/visualizer/static/bootstrap-theme.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap v3.3.5 (http://getbootstrap.com) 3 | * Copyright 2011-2015 Twitter, Inc. 4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */ 6 | 7 | /*! 8 | * Generated using the Bootstrap Customizer (http://getbootstrap.com/customize/?id=6d5e1954144aa5c7842c) 9 | * Config saved to config.json and https://gist.github.com/6d5e1954144aa5c7842c 10 | *//*! 11 | * Bootstrap v3.3.5 (http://getbootstrap.com) 12 | * Copyright 2011-2015 Twitter, Inc. 
13 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 14 | */.btn-default,.btn-primary,.btn-success,.btn-info,.btn-warning,.btn-danger{text-shadow:0 -1px 0 rgba(0,0,0,0.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 1px rgba(0,0,0,0.075)}.btn-default:active,.btn-primary:active,.btn-success:active,.btn-info:active,.btn-warning:active,.btn-danger:active,.btn-default.active,.btn-primary.active,.btn-success.active,.btn-info.active,.btn-warning.active,.btn-danger.active{-webkit-box-shadow:inset 0 3px 5px rgba(0,0,0,0.125);box-shadow:inset 0 3px 5px rgba(0,0,0,0.125)}.btn-default.disabled,.btn-primary.disabled,.btn-success.disabled,.btn-info.disabled,.btn-warning.disabled,.btn-danger.disabled,.btn-default[disabled],.btn-primary[disabled],.btn-success[disabled],.btn-info[disabled],.btn-warning[disabled],.btn-danger[disabled],fieldset[disabled] .btn-default,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-success,fieldset[disabled] .btn-info,fieldset[disabled] .btn-warning,fieldset[disabled] .btn-danger{-webkit-box-shadow:none;box-shadow:none}.btn-default .badge,.btn-primary .badge,.btn-success .badge,.btn-info .badge,.btn-warning .badge,.btn-danger .badge{text-shadow:none}.btn:active,.btn.active{background-image:none}.btn-default{background-image:-webkit-linear-gradient(top, #fff 0, #e0e0e0 100%);background-image:-o-linear-gradient(top, #fff 0, #e0e0e0 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fff), to(#e0e0e0));background-image:linear-gradient(to bottom, #fff 0, #e0e0e0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe0e0e0', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#dbdbdb;text-shadow:0 1px 0 #fff;border-color:#ccc}.btn-default:hover,.btn-default:focus{background-color:#e0e0e0;background-position:0 -15px}.btn-default:active,.btn-default.active{background-color:#e0e0e0;border-color:#dbdbdb}.btn-default.disabled,.btn-default[disabled],fieldset[disabled] .btn-default,.btn-default.disabled:hover,.btn-default[disabled]:hover,fieldset[disabled] .btn-default:hover,.btn-default.disabled:focus,.btn-default[disabled]:focus,fieldset[disabled] .btn-default:focus,.btn-default.disabled.focus,.btn-default[disabled].focus,fieldset[disabled] .btn-default.focus,.btn-default.disabled:active,.btn-default[disabled]:active,fieldset[disabled] .btn-default:active,.btn-default.disabled.active,.btn-default[disabled].active,fieldset[disabled] .btn-default.active{background-color:#e0e0e0;background-image:none}.btn-primary{background-image:-webkit-linear-gradient(top, #337ab7 0, #265a88 100%);background-image:-o-linear-gradient(top, #337ab7 0, #265a88 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#265a88));background-image:linear-gradient(to bottom, #337ab7 0, #265a88 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff265a88', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#245580}.btn-primary:hover,.btn-primary:focus{background-color:#265a88;background-position:0 -15px}.btn-primary:active,.btn-primary.active{background-color:#265a88;border-color:#245580}.btn-primary.disabled,.btn-primary[disabled],fieldset[disabled] 
.btn-primary,.btn-primary.disabled:hover,.btn-primary[disabled]:hover,fieldset[disabled] .btn-primary:hover,.btn-primary.disabled:focus,.btn-primary[disabled]:focus,fieldset[disabled] .btn-primary:focus,.btn-primary.disabled.focus,.btn-primary[disabled].focus,fieldset[disabled] .btn-primary.focus,.btn-primary.disabled:active,.btn-primary[disabled]:active,fieldset[disabled] .btn-primary:active,.btn-primary.disabled.active,.btn-primary[disabled].active,fieldset[disabled] .btn-primary.active{background-color:#265a88;background-image:none}.btn-success{background-image:-webkit-linear-gradient(top, #5cb85c 0, #419641 100%);background-image:-o-linear-gradient(top, #5cb85c 0, #419641 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5cb85c), to(#419641));background-image:linear-gradient(to bottom, #5cb85c 0, #419641 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff419641', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#3e8f3e}.btn-success:hover,.btn-success:focus{background-color:#419641;background-position:0 -15px}.btn-success:active,.btn-success.active{background-color:#419641;border-color:#3e8f3e}.btn-success.disabled,.btn-success[disabled],fieldset[disabled] .btn-success,.btn-success.disabled:hover,.btn-success[disabled]:hover,fieldset[disabled] .btn-success:hover,.btn-success.disabled:focus,.btn-success[disabled]:focus,fieldset[disabled] .btn-success:focus,.btn-success.disabled.focus,.btn-success[disabled].focus,fieldset[disabled] .btn-success.focus,.btn-success.disabled:active,.btn-success[disabled]:active,fieldset[disabled] .btn-success:active,.btn-success.disabled.active,.btn-success[disabled].active,fieldset[disabled] .btn-success.active{background-color:#419641;background-image:none}.btn-info{background-image:-webkit-linear-gradient(top, #5bc0de 0, #2aabd2 100%);background-image:-o-linear-gradient(top, #5bc0de 0, #2aabd2 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5bc0de), to(#2aabd2));background-image:linear-gradient(to bottom, #5bc0de 0, #2aabd2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2aabd2', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#28a4c9}.btn-info:hover,.btn-info:focus{background-color:#2aabd2;background-position:0 -15px}.btn-info:active,.btn-info.active{background-color:#2aabd2;border-color:#28a4c9}.btn-info.disabled,.btn-info[disabled],fieldset[disabled] .btn-info,.btn-info.disabled:hover,.btn-info[disabled]:hover,fieldset[disabled] .btn-info:hover,.btn-info.disabled:focus,.btn-info[disabled]:focus,fieldset[disabled] .btn-info:focus,.btn-info.disabled.focus,.btn-info[disabled].focus,fieldset[disabled] .btn-info.focus,.btn-info.disabled:active,.btn-info[disabled]:active,fieldset[disabled] .btn-info:active,.btn-info.disabled.active,.btn-info[disabled].active,fieldset[disabled] .btn-info.active{background-color:#2aabd2;background-image:none}.btn-warning{background-image:-webkit-linear-gradient(top, #f0ad4e 0, #eb9316 100%);background-image:-o-linear-gradient(top, #f0ad4e 0, #eb9316 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f0ad4e), to(#eb9316));background-image:linear-gradient(to bottom, #f0ad4e 0, #eb9316 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', 
endColorstr='#ffeb9316', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#e38d13}.btn-warning:hover,.btn-warning:focus{background-color:#eb9316;background-position:0 -15px}.btn-warning:active,.btn-warning.active{background-color:#eb9316;border-color:#e38d13}.btn-warning.disabled,.btn-warning[disabled],fieldset[disabled] .btn-warning,.btn-warning.disabled:hover,.btn-warning[disabled]:hover,fieldset[disabled] .btn-warning:hover,.btn-warning.disabled:focus,.btn-warning[disabled]:focus,fieldset[disabled] .btn-warning:focus,.btn-warning.disabled.focus,.btn-warning[disabled].focus,fieldset[disabled] .btn-warning.focus,.btn-warning.disabled:active,.btn-warning[disabled]:active,fieldset[disabled] .btn-warning:active,.btn-warning.disabled.active,.btn-warning[disabled].active,fieldset[disabled] .btn-warning.active{background-color:#eb9316;background-image:none}.btn-danger{background-image:-webkit-linear-gradient(top, #d9534f 0, #c12e2a 100%);background-image:-o-linear-gradient(top, #d9534f 0, #c12e2a 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9534f), to(#c12e2a));background-image:linear-gradient(to bottom, #d9534f 0, #c12e2a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc12e2a', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);background-repeat:repeat-x;border-color:#b92c28}.btn-danger:hover,.btn-danger:focus{background-color:#c12e2a;background-position:0 -15px}.btn-danger:active,.btn-danger.active{background-color:#c12e2a;border-color:#b92c28}.btn-danger.disabled,.btn-danger[disabled],fieldset[disabled] .btn-danger,.btn-danger.disabled:hover,.btn-danger[disabled]:hover,fieldset[disabled] .btn-danger:hover,.btn-danger.disabled:focus,.btn-danger[disabled]:focus,fieldset[disabled] .btn-danger:focus,.btn-danger.disabled.focus,.btn-danger[disabled].focus,fieldset[disabled] .btn-danger.focus,.btn-danger.disabled:active,.btn-danger[disabled]:active,fieldset[disabled] .btn-danger:active,.btn-danger.disabled.active,.btn-danger[disabled].active,fieldset[disabled] .btn-danger.active{background-color:#c12e2a;background-image:none}.thumbnail,.img-thumbnail{-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.075);box-shadow:0 1px 2px rgba(0,0,0,0.075)}.dropdown-menu>li>a:hover,.dropdown-menu>li>a:focus{background-image:-webkit-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-o-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f5f5f5), to(#e8e8e8));background-image:linear-gradient(to bottom, #f5f5f5 0, #e8e8e8 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-color:#e8e8e8}.dropdown-menu>.active>a,.dropdown-menu>.active>a:hover,.dropdown-menu>.active>a:focus{background-image:-webkit-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#2e6da4));background-image:linear-gradient(to bottom, #337ab7 0, #2e6da4 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-color:#2e6da4}.navbar-default{background-image:-webkit-linear-gradient(top, #fff 0, #f8f8f8 
100%);background-image:-o-linear-gradient(top, #fff 0, #f8f8f8 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fff), to(#f8f8f8));background-image:linear-gradient(to bottom, #fff 0, #f8f8f8 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff8f8f8', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);border-radius:4px;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 5px rgba(0,0,0,0.075);box-shadow:inset 0 1px 0 rgba(255,255,255,0.15),0 1px 5px rgba(0,0,0,0.075)}.navbar-default .navbar-nav>.open>a,.navbar-default .navbar-nav>.active>a{background-image:-webkit-linear-gradient(top, #dbdbdb 0, #e2e2e2 100%);background-image:-o-linear-gradient(top, #dbdbdb 0, #e2e2e2 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #dbdbdb), to(#e2e2e2));background-image:linear-gradient(to bottom, #dbdbdb 0, #e2e2e2 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdbdbdb', endColorstr='#ffe2e2e2', GradientType=0);-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,0.075);box-shadow:inset 0 3px 9px rgba(0,0,0,0.075)}.navbar-brand,.navbar-nav>li>a{text-shadow:0 1px 0 rgba(255,255,255,0.25)}.navbar-inverse{background-image:-webkit-linear-gradient(top, #3c3c3c 0, #222 100%);background-image:-o-linear-gradient(top, #3c3c3c 0, #222 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #3c3c3c), to(#222));background-image:linear-gradient(to bottom, #3c3c3c 0, #222 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff3c3c3c', endColorstr='#ff222222', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);border-radius:4px}.navbar-inverse .navbar-nav>.open>a,.navbar-inverse .navbar-nav>.active>a{background-image:-webkit-linear-gradient(top, #080808 0, #0f0f0f 100%);background-image:-o-linear-gradient(top, #080808 0, #0f0f0f 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #080808), to(#0f0f0f));background-image:linear-gradient(to bottom, #080808 0, #0f0f0f 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff080808', endColorstr='#ff0f0f0f', GradientType=0);-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,0.25);box-shadow:inset 0 3px 9px rgba(0,0,0,0.25)}.navbar-inverse .navbar-brand,.navbar-inverse .navbar-nav>li>a{text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.navbar-static-top,.navbar-fixed-top,.navbar-fixed-bottom{border-radius:0}@media (max-width:767px){.navbar .navbar-nav .open .dropdown-menu>.active>a,.navbar .navbar-nav .open .dropdown-menu>.active>a:hover,.navbar .navbar-nav .open .dropdown-menu>.active>a:focus{color:#fff;background-image:-webkit-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#2e6da4));background-image:linear-gradient(to bottom, #337ab7 0, #2e6da4 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0)}}.alert{text-shadow:0 1px 0 rgba(255,255,255,0.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.25),0 1px 2px rgba(0,0,0,0.05);box-shadow:inset 0 1px 0 rgba(255,255,255,0.25),0 1px 2px 
rgba(0,0,0,0.05)}.alert-success{background-image:-webkit-linear-gradient(top, #dff0d8 0, #c8e5bc 100%);background-image:-o-linear-gradient(top, #dff0d8 0, #c8e5bc 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #dff0d8), to(#c8e5bc));background-image:linear-gradient(to bottom, #dff0d8 0, #c8e5bc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffc8e5bc', GradientType=0);border-color:#b2dba1}.alert-info{background-image:-webkit-linear-gradient(top, #d9edf7 0, #b9def0 100%);background-image:-o-linear-gradient(top, #d9edf7 0, #b9def0 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9edf7), to(#b9def0));background-image:linear-gradient(to bottom, #d9edf7 0, #b9def0 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffb9def0', GradientType=0);border-color:#9acfea}.alert-warning{background-image:-webkit-linear-gradient(top, #fcf8e3 0, #f8efc0 100%);background-image:-o-linear-gradient(top, #fcf8e3 0, #f8efc0 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fcf8e3), to(#f8efc0));background-image:linear-gradient(to bottom, #fcf8e3 0, #f8efc0 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fff8efc0', GradientType=0);border-color:#f5e79e}.alert-danger{background-image:-webkit-linear-gradient(top, #f2dede 0, #e7c3c3 100%);background-image:-o-linear-gradient(top, #f2dede 0, #e7c3c3 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f2dede), to(#e7c3c3));background-image:linear-gradient(to bottom, #f2dede 0, #e7c3c3 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffe7c3c3', GradientType=0);border-color:#dca7a7}.progress{background-image:-webkit-linear-gradient(top, #ebebeb 0, #f5f5f5 100%);background-image:-o-linear-gradient(top, #ebebeb 0, #f5f5f5 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #ebebeb), to(#f5f5f5));background-image:linear-gradient(to bottom, #ebebeb 0, #f5f5f5 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffebebeb', endColorstr='#fff5f5f5', GradientType=0)}.progress-bar{background-image:-webkit-linear-gradient(top, #337ab7 0, #286090 100%);background-image:-o-linear-gradient(top, #337ab7 0, #286090 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#286090));background-image:linear-gradient(to bottom, #337ab7 0, #286090 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff286090', GradientType=0)}.progress-bar-success{background-image:-webkit-linear-gradient(top, #5cb85c 0, #449d44 100%);background-image:-o-linear-gradient(top, #5cb85c 0, #449d44 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5cb85c), to(#449d44));background-image:linear-gradient(to bottom, #5cb85c 0, #449d44 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff449d44', GradientType=0)}.progress-bar-info{background-image:-webkit-linear-gradient(top, #5bc0de 0, #31b0d5 100%);background-image:-o-linear-gradient(top, #5bc0de 0, #31b0d5 
100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #5bc0de), to(#31b0d5));background-image:linear-gradient(to bottom, #5bc0de 0, #31b0d5 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff31b0d5', GradientType=0)}.progress-bar-warning{background-image:-webkit-linear-gradient(top, #f0ad4e 0, #ec971f 100%);background-image:-o-linear-gradient(top, #f0ad4e 0, #ec971f 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f0ad4e), to(#ec971f));background-image:linear-gradient(to bottom, #f0ad4e 0, #ec971f 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffec971f', GradientType=0)}.progress-bar-danger{background-image:-webkit-linear-gradient(top, #d9534f 0, #c9302c 100%);background-image:-o-linear-gradient(top, #d9534f 0, #c9302c 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9534f), to(#c9302c));background-image:linear-gradient(to bottom, #d9534f 0, #c9302c 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc9302c', GradientType=0)}.progress-bar-striped{background-image:-webkit-linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent);background-image:-o-linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent);background-image:linear-gradient(45deg, rgba(255,255,255,0.15) 25%, transparent 25%, transparent 50%, rgba(255,255,255,0.15) 50%, rgba(255,255,255,0.15) 75%, transparent 75%, transparent)}.list-group{border-radius:4px;-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.075);box-shadow:0 1px 2px rgba(0,0,0,0.075)}.list-group-item.active,.list-group-item.active:hover,.list-group-item.active:focus{text-shadow:0 -1px 0 #286090;background-image:-webkit-linear-gradient(top, #337ab7 0, #2b669a 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2b669a 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), to(#2b669a));background-image:linear-gradient(to bottom, #337ab7 0, #2b669a 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2b669a', GradientType=0);border-color:#2b669a}.list-group-item.active .badge,.list-group-item.active:hover .badge,.list-group-item.active:focus .badge{text-shadow:none}.panel{-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.05);box-shadow:0 1px 2px rgba(0,0,0,0.05)}.panel-default>.panel-heading{background-image:-webkit-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-o-linear-gradient(top, #f5f5f5 0, #e8e8e8 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f5f5f5), to(#e8e8e8));background-image:linear-gradient(to bottom, #f5f5f5 0, #e8e8e8 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0)}.panel-primary>.panel-heading{background-image:-webkit-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-o-linear-gradient(top, #337ab7 0, #2e6da4 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #337ab7), 
to(#2e6da4));background-image:linear-gradient(to bottom, #337ab7 0, #2e6da4 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0)}.panel-success>.panel-heading{background-image:-webkit-linear-gradient(top, #dff0d8 0, #d0e9c6 100%);background-image:-o-linear-gradient(top, #dff0d8 0, #d0e9c6 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #dff0d8), to(#d0e9c6));background-image:linear-gradient(to bottom, #dff0d8 0, #d0e9c6 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffd0e9c6', GradientType=0)}.panel-info>.panel-heading{background-image:-webkit-linear-gradient(top, #d9edf7 0, #c4e3f3 100%);background-image:-o-linear-gradient(top, #d9edf7 0, #c4e3f3 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #d9edf7), to(#c4e3f3));background-image:linear-gradient(to bottom, #d9edf7 0, #c4e3f3 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffc4e3f3', GradientType=0)}.panel-warning>.panel-heading{background-image:-webkit-linear-gradient(top, #fcf8e3 0, #faf2cc 100%);background-image:-o-linear-gradient(top, #fcf8e3 0, #faf2cc 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #fcf8e3), to(#faf2cc));background-image:linear-gradient(to bottom, #fcf8e3 0, #faf2cc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fffaf2cc', GradientType=0)}.panel-danger>.panel-heading{background-image:-webkit-linear-gradient(top, #f2dede 0, #ebcccc 100%);background-image:-o-linear-gradient(top, #f2dede 0, #ebcccc 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #f2dede), to(#ebcccc));background-image:linear-gradient(to bottom, #f2dede 0, #ebcccc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffebcccc', GradientType=0)}.well{background-image:-webkit-linear-gradient(top, #e8e8e8 0, #f5f5f5 100%);background-image:-o-linear-gradient(top, #e8e8e8 0, #f5f5f5 100%);background-image:-webkit-gradient(linear, left top, left bottom, color-stop(0, #e8e8e8), to(#f5f5f5));background-image:linear-gradient(to bottom, #e8e8e8 0, #f5f5f5 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffe8e8e8', endColorstr='#fff5f5f5', GradientType=0);border-color:#dcdcdc;-webkit-box-shadow:inset 0 1px 3px rgba(0,0,0,0.05),0 1px 0 rgba(255,255,255,0.1);box-shadow:inset 0 1px 3px rgba(0,0,0,0.05),0 1px 0 rgba(255,255,255,0.1)} -------------------------------------------------------------------------------- /src/codes/visualizer/static/style.css: -------------------------------------------------------------------------------- 1 | .container-4{ 2 | overflow: hidden; 3 | width: 300px; 4 | vertical-align: middle; 5 | white-space: nowrap; 6 | } 7 | 8 | .container-4 input#search{ 9 | width: 300px; 10 | height: 50px; 11 | background: #2b303b; 12 | border: none; 13 | font-size: 10pt; 14 | float: left; 15 | color: #fff; 16 | padding-left: 15px; 17 | -webkit-border-radius: 5px; 18 | -moz-border-radius: 5px; 19 | border-radius: 5px; 20 | } 21 | 22 | .container-4 input#search::-webkit-input-placeholder { 23 | color: #65737e; 24 | } 25 | 26 | .container-4 input#search:-moz-placeholder { 
/* Firefox 18- */ 27 | color: #65737e; 28 | } 29 | 30 | .container-4 input#search::-moz-placeholder { /* Firefox 19+ */ 31 | color: #65737e; 32 | } 33 | 34 | .container-4 input#search:-ms-input-placeholder { 35 | color: #65737e; 36 | } 37 | 38 | .container-4 button.icon{ 39 | -webkit-border-top-right-radius: 5px; 40 | -webkit-border-bottom-right-radius: 5px; 41 | -moz-border-radius-topright: 5px; 42 | -moz-border-radius-bottomright: 5px; 43 | border-top-right-radius: 5px; 44 | border-bottom-right-radius: 5px; 45 | 46 | border: none; 47 | background: #232833; 48 | height: 50px; 49 | width: 50px; 50 | color: #4f5b66; 51 | opacity: 0; 52 | font-size: 10pt; 53 | 54 | -webkit-transition: all .55s ease; 55 | -moz-transition: all .55s ease; 56 | -ms-transition: all .55s ease; 57 | -o-transition: all .55s ease; 58 | transition: all .55s ease; 59 | } 60 | 61 | .container-4:hover button.icon, .container-4:active button.icon, .container-4:focus button.icon{ 62 | outline: none; 63 | opacity: 1; 64 | margin-left: -50px; 65 | } 66 | 67 | .container-4:hover button.icon:hover{ 68 | background: white; 69 | } 70 | 71 | div#answers { 72 | background-color: #f2f2f2; 73 | padding-top: 2px; 74 | padding-bottom: 2px; 75 | padding-left: 100px; 76 | } 77 | -------------------------------------------------------------------------------- /src/codes/visualizer/templates/app.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Title 7 | 8 | 9 | 10 |
11 |
12 | 13 | 14 |
15 |
16 | 17 | -------------------------------------------------------------------------------- /src/codes/visualizer/templates/search.html: -------------------------------------------------------------------------------- 1 | {% block content %} 2 | 17 | 18 | 19 | 20 | 21 | 22 | ETNLP's Side-by-Side Visualizer 23 | 24 | 25 | 26 | 41 | 42 | 43 | 44 |
45 | 46 | 47 | {% for emb_name in embedding_names_arr %} 48 | 49 | 50 | 51 | 52 | 53 | {% for page in output_arr[loop.index0] %} 54 | 55 | {% endfor %} 56 |
 
 
{{ emb_name }}
{{ page }}
57 | 58 | {% endfor %} 59 | 60 | 61 | 62 | 63 |
64 | 65 | {% for message in get_flashed_messages() %} 66 |
67 | {{ message }} 68 |
69 | {% endfor %} 70 | {% endblock %} 71 | 72 | -------------------------------------------------------------------------------- /src/codes/visualizer/visualizer_sbs.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template 2 | from flask import request 3 | import gensim 4 | from distutils.version import LooseVersion 5 | from utils import string_utils 6 | import sys 7 | 8 | 9 | app = Flask(__name__) 10 | app.config.from_object(__name__) 11 | app.config['SECRET_KEY'] = '7d441f27d441f27567d441f2b6176a' 12 | 13 | global embedding_models 14 | 15 | 16 | @app.route('/search', methods=['GET', 'POST']) 17 | def search(): 18 | """ 19 | Get the input query and return the list of top similar words in all embeddings. 20 | Uses the globally loaded embedding_models and embedding_names_arr. 21 | :return: rendered search.html with one result list per embedding. 22 | """ 23 | if request.method == "POST": 24 | query = request.values['search'] or '' 25 | # query = unicode(query, "utf-8") 26 | # query = query.decode().encode("utf-8") 27 | # Python 2.7 28 | try: 29 | # Old 30 | # query = unicode(query).lower() 31 | query = string_utils.convert_to_unicode(query) 32 | except Exception as e: 33 | raise Exception("Something went wrong: msg = %s, query = %s." % (e, query)) 34 | 35 | print('query = ' + query) 36 | output_arr = [] 37 | 38 | for embedding_model in embedding_models: 39 | try: 40 | output = [] 41 | sim_list = embedding_model.most_similar(query, topn=50) 42 | for wordsimilar in sim_list: 43 | output.append(wordsimilar[0] + ' - ' + str(round(wordsimilar[1], 6))) 44 | 45 | output_arr.append(output) 46 | except Exception as e: 47 | output = 'Err: %s, query not found: %s' % (e, query) 48 | output_arr.append(output) 49 | 50 | return render_template('search.html', 51 | embedding_names_arr=embedding_names_arr, 52 | output_arr=output_arr 53 | ) 54 | 55 | 56 | @app.route("/") 57 | def get_index(): 58 | return render_template('search.html') 59 | 60 | 61 | @app.route("/multi_search") 62 | def multi_search(): 63 | return render_template('multi_search.html') 64 | 65 | 66 | if __name__ == "__main__": 67 | import os 68 | 69 | dir_path = os.path.dirname(os.path.realpath(__file__)) 70 | # download pre-trained models at https://github.com/vietnlp/etnlp 71 | if len(sys.argv) < 2: 72 | print("Missing input arguments. Input format: ./*.py <model_path1;model_path2;...>. Exiting ...") 73 | exit(1) 74 | 75 | if ";" in sys.argv[1]: 76 | model_files = sys.argv[1].split(";") 77 | else: 78 | model_files = [sys.argv[1]] 79 | 80 | embedding_names_arr = [os.path.basename(file_path) for file_path in model_files] 81 | 82 | embedding_models = [] 83 | idx = 0 84 | for model in model_files: 85 | # model = root_dir + model 86 | if os.path.isfile(model): 87 | print('Loading embedding model ... %s' % (idx)) 88 | 89 | isBinary = False  # treat *.bin files as binary word2vec models, *.vec as plain text 90 | if model.endswith(".bin"): 91 | isBinary = True 92 | 93 | if LooseVersion(gensim.__version__) >= LooseVersion("1.0.1"): 94 | from gensim.models import KeyedVectors 95 | 96 | embedding_models.append(KeyedVectors.load_word2vec_format(model, binary=isBinary)) 97 | else: 98 | from gensim.models import Word2Vec 99 | 100 | embedding_models.append(Word2Vec.load_word2vec_format(model, binary=isBinary)) 101 | idx += 1 102 | else: 103 | print( 104 | "Download word2vec model and put into ../data/. 
File: https://github.com/vietnlp/etnlp") 105 | 106 | app.run(debug=False, port=8089, host='0.0.0.0') 107 | -------------------------------------------------------------------------------- /src/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vietnlp/etnlp/88862f63d4a8c9d75b7897555b4cdbcb97889337/src/data/.DS_Store -------------------------------------------------------------------------------- /src/data/embedding_analogies/portuguese/vocab.txt: -------------------------------------------------------------------------------- 1 | Atenas 2 | Grécia 3 | Bagdade 4 | Iraque 5 | Banguecoque 6 | Tailândia 7 | Pequim 8 | China 9 | Berlim 10 | Alemanha 11 | Berna 12 | Suíça 13 | Cairo 14 | Egito 15 | Camberra 16 | Austrália 17 | Hanói 18 | Vietname 19 | Havana 20 | Cuba 21 | Helsínquia 22 | Finlândia 23 | Islamabade 24 | Paquistão 25 | Cábul 26 | Afeganistão 27 | Londres 28 | Inglaterra 29 | Madrid 30 | Espanha 31 | Moscovo 32 | Rússia 33 | Oslo 34 | Noruega 35 | Ottawa 36 | Canadá 37 | Paris 38 | França 39 | Roma 40 | Itália 41 | Estocolmo 42 | Suécia 43 | Teerão 44 | Irão 45 | Tóquio 46 | Japão 47 | capital-world 48 | Abuja 49 | Nigéria 50 | Acra 51 | Gana 52 | Argel 53 | Argélia 54 | Amã 55 | Jordânia 56 | Ancara 57 | Turquia 58 | Antananarivo 59 | Madagáscar 60 | Apia 61 | Samoa 62 | Asgabate 63 | Turquemenistão 64 | Asmara 65 | Eritreia 66 | Astana 67 | Cazaquistão 68 | Baku 69 | Azerbaijão 70 | Bamako 71 | Mali 72 | Banjul 73 | Gâmbia 74 | Beirute 75 | Líbano 76 | Belgrado 77 | Sérvia 78 | Belmopã 79 | Belize 80 | Bisqueque 81 | Quirguistão 82 | Bratislava 83 | Eslováquia 84 | Bruxelas 85 | Bélgica 86 | Bucareste 87 | Roménia 88 | Budapeste 89 | Hungria 90 | Bujumbura 91 | Burundi 92 | Caracas 93 | Venezuela 94 | Chisinau 95 | Moldávia 96 | Conacri 97 | Guiné 98 | Copenhaga 99 | Dinamarca 100 | Dacar 101 | Senegal 102 | Damasco 103 | Síria 104 | Daca 105 | Bangladeche 106 | Doa 107 | Catar 108 | Dublim 109 | Irlanda 110 | Duchambé 111 | Tajiquistão 112 | Funafuti 113 | Tuvalu 114 | Gaborone 115 | Botsuana 116 | Georgetown 117 | Guiana 118 | Harare 119 | Zimbabué 120 | Jacarta 121 | Indonésia 122 | Campala 123 | Uganda 124 | Catmandu 125 | Nepal 126 | Cartum 127 | Sudão 128 | Kiev 129 | Ucrânia 130 | Quigali 131 | Ruanda 132 | Kingston 133 | Jamaica 134 | Libreville 135 | Gabão 136 | Lilongwe 137 | Malaui 138 | Lima 139 | Peru 140 | Lisboa 141 | Portugal 142 | Liubliana 143 | Eslovénia 144 | Luanda 145 | Angola 146 | Lusaca 147 | Zâmbia 148 | Manágua 149 | Nicarágua 150 | Manama 151 | Bareine 152 | Manila 153 | Filipinas 154 | Maputo 155 | Moçambique 156 | Minsk 157 | Bielorrússia 158 | Mogadíscio 159 | Somália 160 | Monróvia 161 | Libéria 162 | Montevideu 163 | Uruguai 164 | Mascate 165 | Omã 166 | Nairóbi 167 | Quénia 168 | Nassau 169 | Baamas 170 | Niamei 171 | Níger 172 | Nicósia 173 | Chipre 174 | Nuaquechote 175 | Mauritânia 176 | Nuque 177 | Gronelândia 178 | Paramaribo 179 | Suriname 180 | Podgorica 181 | Montenegro 182 | Quito 183 | Equador 184 | Rabat 185 | Marrocos 186 | Riga 187 | Letónia 188 | Roseau 189 | Dominica 190 | Santiago 191 | Chile 192 | Escópia 193 | Macedónia 194 | Sófia 195 | Bulgária 196 | Suva 197 | Fiji 198 | Taipé 199 | Taiwan 200 | Talim 201 | Estónia 202 | Tashkent 203 | Uzbequistão 204 | Tbilisi 205 | Geórgia 206 | Tegucigalpa 207 | Honduras 208 | Timbu 209 | Butão 210 | Tirana 211 | Albânia 212 | Trípoli 213 | Líbia 214 | Tunes 215 | Tunísia 216 | Vaduz 217 | Liechtenstein 218 | 
Valletta 219 | Malta 220 | Viena 221 | Áustria 222 | Vienciana 223 | Laos 224 | Vílnius 225 | Lituânia 226 | Varsóvia 227 | Polónia 228 | Windhoek 229 | Namíbia 230 | Erevan 231 | Arménia 232 | Zagreb 233 | Croácia 234 | currency 235 | dinar 236 | kwanza 237 | Argentina 238 | peso 239 | dram 240 | Brasil 241 | real 242 | lev 243 | Cambodja 244 | riel 245 | dólar 246 | kuna 247 | coroa 248 | Europa 249 | euro 250 | florim 251 | Índia 252 | rupia 253 | rial 254 | Iene 255 | Coreia 256 | won 257 | lats 258 | litas 259 | Malásia 260 | ringgit 261 | México 262 | naira 263 | zlóti 264 | leu 265 | rublo 266 | baht 267 | grívnia 268 | EUA 269 | dongue 270 | city-in-state 271 | Chicago 272 | Ilinóis 273 | Houston 274 | Texas 275 | Filadélfia 276 | Pensilvânia 277 | Fênix 278 | Arizona 279 | Dallas 280 | Jacksonville 281 | Flórida 282 | Indianápolis 283 | Indiana 284 | Austin 285 | Detroit 286 | Michigan 287 | Mênfis 288 | Tennessee 289 | Boston 290 | Massachusetts 291 | Seattle 292 | Washington 293 | Denver 294 | Colorado 295 | Baltimore 296 | Marilândia 297 | Nashville 298 | Louisville 299 | Kentucky 300 | Milwaukee 301 | Wisconsin 302 | Portland 303 | Oregão 304 | Tucson 305 | Fresno 306 | Califórnia 307 | Sacramento 308 | Mesa 309 | Atlanta 310 | Omaha 311 | Nebraska 312 | Miami 313 | Tulsa 314 | Oklahoma 315 | Oakland 316 | Cleveland 317 | Ohio 318 | Minneapolis 319 | Minesota 320 | Wichita 321 | Kansas 322 | Arlington 323 | Bakersfield 324 | Tampa 325 | Anaheim 326 | Honolulu 327 | Havai 328 | Pitsburgo 329 | Lexington 330 | Stockton 331 | Cincinnati 332 | Anchorage 333 | Alasca 334 | Toledo 335 | Plano 336 | Henderson 337 | Nevada 338 | Orlando 339 | Laredo 340 | Chandler 341 | Madison 342 | Lubbock 343 | Garland 344 | Glendale 345 | Hialeah 346 | Reno 347 | Scottsdale 348 | Irving 349 | Fremont 350 | Irvine 351 | Spokane 352 | Modesto 353 | Shreveport 354 | Luisiana 355 | Tacoma 356 | Oxnard 357 | Fontana 358 | Akron 359 | Amarillo 360 | Tallahassee 361 | Huntsville 362 | Alabama 363 | Worcester 364 | family 365 | rapaz 366 | rapariga 367 | irmão 368 | irmã 369 | irmãos 370 | irmãs 371 | pai 372 | mãe 373 | avô 374 | avó 375 | neto 376 | neta 377 | noivo 378 | noiva 379 | ele 380 | ela 381 | dele 382 | dela 383 | marido 384 | mulher 385 | rei 386 | rainha 387 | homem 388 | sobrinho 389 | sobrinha 390 | príncipe 391 | princesa 392 | filho 393 | filha 394 | fihos 395 | filhas 396 | meio-irmão 397 | meia-irmã 398 | padrasto 399 | madrasta 400 | enteado 401 | enteada 402 | tio 403 | tia 404 | gram1-adjective-to-adverb 405 | fantástico 406 | fantasticamente 407 | aparente 408 | aparentemente 409 | calmo 410 | calmamente 411 | alegre 412 | alegremente 413 | completo 414 | completamente 415 | eficiente 416 | eficientemente 417 | afortunado 418 | afortunadamente 419 | livre 420 | livremente 421 | furioso 422 | furiosamente 423 | imediato 424 | imediatamente 425 | infrequente 426 | infrequentemente 427 | sortudo 428 | felizmente 429 | óbvio 430 | obviamente 431 | ocasional 432 | ocasionalmente 433 | possível 434 | possivelmente 435 | preciso 436 | precisamente 437 | profissional 438 | profissionalmente 439 | rápido 440 | rapidamente 441 | silencioso 442 | silenciosamente 443 | raro 444 | raramente 445 | relutante 446 | relutantemente 447 | seguro 448 | seguramente 449 | sério 450 | seriamente 451 | lento 452 | lentamente 453 | repentino 454 | repentinamente 455 | veloz/rápido 456 | típico 457 | tipicamente 458 | infeliz 459 | infelizmente 460 | usual 461 | usualmente 462 | gram2-opposite 463 | 
aceitável 464 | inaceitável 465 | consciente 466 | inconsciente 467 | certo 468 | incerto 469 | claro 470 | obscuro 471 | confortável 472 | desconfortável 473 | competitivo 474 | descompetitivo 475 | consistente 476 | inconsistente 477 | convincente 478 | inconvincente 479 | conveniente 480 | inconveniente 481 | decidido 482 | indeciso 483 | ineficiente 484 | ético 485 | antiético 486 | honesto 487 | desonesto 488 | impressivo 489 | inexpressivo 490 | informativo 491 | desinformativo 492 | informado 493 | desinformado 494 | conhecido 495 | desconhecido 496 | provável 497 | improvável 498 | lógico 499 | ilógico 500 | agradável 501 | desagradável 502 | impossível 503 | impossivelmente 504 | produtivo 505 | improdutivo 506 | racional 507 | irracional 508 | responsável 509 | irresponsável 510 | gram3-comparative 511 | mau 512 | pior 513 | grande 514 | maior 515 | bom 516 | melhor 517 | pequeno 518 | menor 519 | gram4-superlative 520 | brilhante 521 | brilhantíssimo 522 | escuro 523 | escuríssimo 524 | fácil 525 | facílimo 526 | rapidíssimo 527 | grandíssimo 528 | alto 529 | altíssimo 530 | larguíssimo 531 | longo 532 | longuíssimo 533 | baixo 534 | baixíssimo 535 | velho 536 | velhíssimo 537 | aguçado 538 | aguçadíssimo 539 | simples 540 | simplicíssimo 541 | curto 542 | curtíssimo 543 | estranho 544 | estranhíssimo 545 | forte 546 | fortíssimo 547 | doce 548 | dulcíssimo 549 | fraco 550 | fraquíssimo 551 | largo 552 | jovem 553 | novíssimo 554 | gram5-present-participle 555 | programar 556 | programando 557 | dançar 558 | dançando 559 | depurar 560 | depurando 561 | diminuir 562 | diminuindo 563 | descrever 564 | descrevendo 565 | descobrir 566 | descobrindo 567 | melhorar 568 | melhorando 569 | voar 570 | voando 571 | gerar 572 | gerando 573 | ir 574 | indo 575 | implementar 576 | implementando 577 | aumentar 578 | aumentando 579 | inventar 580 | inventando 581 | saltar 582 | saltando 583 | ouvir 584 | ouvindo 585 | ver 586 | vendo 587 | mover 588 | movendo 589 | jogar 590 | jogando 591 | prever 592 | prevendo 593 | ler 594 | lendo 595 | correr 596 | correndo 597 | dizer 598 | dizendo 599 | gritar 600 | gritando 601 | baralhar 602 | baralhando 603 | cantar 604 | cantando 605 | sentar 606 | sentando 607 | abrandando 608 | nadar 609 | nadando 610 | pensar 611 | pensando 612 | desaparecer 613 | desaparecendo 614 | andar 615 | andando 616 | escrever 617 | escrevendo 618 | gram6-nationality-adjective 619 | Albanês 620 | Argentino 621 | Australiano 622 | Austríaco 623 | Bielorusso 624 | Brasileiro 625 | Búlgaro 626 | Cambojano 627 | Chileno 628 | Chinês 629 | Colombia 630 | Colombiano 631 | Croata 632 | Dinamarquês 633 | Egípcio 634 | Inglês 635 | Frânces 636 | Alemão 637 | Grego 638 | Islândia 639 | Islandês 640 | Indiano 641 | Irlândes 642 | Israel 643 | Israelita 644 | Italiano 645 | Japonês 646 | Coreano 647 | Macedónio 648 | Maltês 649 | Mexicano 650 | Moldávio 651 | Holanda 652 | Holandês 653 | Norueguês 654 | Peruano 655 | Polaco 656 | Português 657 | Russo 658 | Eslovaco 659 | Espanhol 660 | Sueco 661 | Suiço 662 | Tailandês 663 | Ucraniano 664 | gram7-past-tense 665 | dançou 666 | diminuiu 667 | descreveu 668 | melhorou 669 | caíndo 670 | caiu 671 | alimentando 672 | alimentou 673 | voou 674 | gerou 675 | foi 676 | escondendo 677 | escondeu 678 | acertando 679 | acertou 680 | implementou 681 | aumentou 682 | saltou 683 | conhecendo 684 | conheceu 685 | ouviu 686 | olhou 687 | moveu 688 | pagou 689 | jogou 690 | previu 691 | correu 692 | disse 693 | gritou 694 | viu 695 | vendendo 696 | 
vendeu 697 | encolhendo 698 | encolheu 699 | cantou 700 | sentou 701 | dormindo 702 | dormiu 703 | lentificou 704 | gastando 705 | gastou 706 | golpeando 707 | golpeou 708 | nadou 709 | tirando 710 | tirou 711 | pensou 712 | desapareceu 713 | andou 714 | escreveu 715 | gram8-plural 716 | banana 717 | bananas 718 | pássaro 719 | pássaros 720 | garrafa 721 | garrafas 722 | edifício 723 | edifícios 724 | carro 725 | carros 726 | gato 727 | gatos 728 | criança 729 | crianças 730 | nuvem 731 | nuvens 732 | cor 733 | cores 734 | computador 735 | computadores 736 | vaca 737 | vacas 738 | cão 739 | cães 740 | dólares 741 | burro 742 | burros 743 | sonho 744 | sonhos 745 | águia 746 | águias 747 | elefante 748 | elefantes 749 | olho 750 | olhos 751 | dedo 752 | dedos 753 | cabra 754 | cabras 755 | mão 756 | mãos 757 | cavalo 758 | cavalos 759 | leão 760 | leões 761 | máquina 762 | máquinas 763 | manga 764 | mangas 765 | homens 766 | melão 767 | melões 768 | macaco 769 | macacos 770 | rato 771 | ratos 772 | cebola 773 | cebolas 774 | pêra 775 | pêras 776 | porco 777 | porcos 778 | ananás 779 | ananases 780 | ratazana 781 | ratazanas 782 | estrada 783 | estradas 784 | cobra 785 | cobras 786 | mulheres 787 | gram9-plural-verbs 788 | diminuem 789 | descrevem 790 | comer 791 | comem 792 | melhoram 793 | estima 794 | estimam 795 | encontra 796 | encontram 797 | geram 798 | vão 799 | implementam 800 | aumentam 801 | ouvem 802 | jogam 803 | prevêem 804 | fornece 805 | fornecem 806 | dizem 807 | gritam 808 | procura 809 | procuram 810 | vêem 811 | baralham 812 | cantam 813 | sentam 814 | lentificam 815 | diz 816 | nadam 817 | fala 818 | falam 819 | pensam 820 | desaparecem 821 | andam 822 | trabalhou 823 | trabalham 824 | escrevem 825 | -------------------------------------------------------------------------------- /src/data/embedding_analogies/vi/Multi_evaluator_results.txt: -------------------------------------------------------------------------------- 1 | : | Word Analogy Task results 2 | -------------------------------------------------------------------------------- /src/data/embedding_analogies/vi/solveable_analogies_vi.txt: -------------------------------------------------------------------------------- 1 | analogy_list_vi_ner.txt -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove1.vec: -------------------------------------------------------------------------------- 1 | word10 0.123 0.134 0.532 0.152 2 | word20 0.934 0.412 0.532 0.159 3 | word30 0.334 0.241 0.324 0.188 4 | word90 0.334 0.241 0.324 0.188 5 | word31 0.334 0.341 0.324 0.288 -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove1_w2v.vec: -------------------------------------------------------------------------------- 1 | 5 4 2 | word10 0.123 0.134 0.532 0.152 3 | word20 0.934 0.412 0.532 0.159 4 | word30 0.334 0.241 0.324 0.188 5 | word90 0.334 0.241 0.324 0.188 6 | word31 0.334 0.341 0.324 0.288 7 | -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove2.vec: -------------------------------------------------------------------------------- 1 | word1 0.123 0.134 0.532 0.152 2 | word2 0.934 0.412 0.532 0.159 3 | word3 0.334 0.241 0.324 0.188 4 | word9 0.334 0.241 0.324 0.188 -------------------------------------------------------------------------------- /src/data/glove2vec_dicts/glove2_w2v.vec: 
-------------------------------------------------------------------------------- 1 | 4 4 2 | word1 0.123 0.134 0.532 0.152 3 | word2 0.934 0.412 0.532 0.159 4 | word3 0.334 0.241 0.324 0.188 5 | word9 0.334 0.241 0.324 0.188 6 | -------------------------------------------------------------------------------- /src/data/vocab.txt: -------------------------------------------------------------------------------- 1 | tôi 2 | yêu 3 | hà_nội 4 | ghét 5 | em 6 | iphone 7 | thích 8 | hận 9 | đắm_say 10 | đẹp 11 | giận 12 | đà_nẵng 13 | cậu 14 | bé 15 | cô 16 | gái 17 | anh_trai 18 | em_gái 19 | người 20 | đàn_ông 21 | phụ_nữ 22 | hoàng_tử 23 | công_chúa 24 | -------------------------------------------------------------------------------- /src/examples/test1_etnlp_preprocessing.py: -------------------------------------------------------------------------------- 1 | from etnlp_api import embedding_preprocessing as emb_prep 2 | from etnlp_api import embedding_config 3 | 4 | INPUT_FILES="../data/glove2vec_dicts/glove1.vec;../data/glove2vec_dicts/glove2.vec" 5 | OUTPUT_FILES="../data/glove2vec_dicts/glove1_w2v.vec;../data/glove2vec_dicts/glove2_w2v.vec" 6 | # do_normalize: use this flag to normalize in case of multiple embeddings. 7 | embedding_config.do_normalize_emb = False 8 | # to mark input embeddings are not in word2vec format. 9 | embedding_config.is_word2vec_format = False 10 | emb_prep.load_and_save_2_word2vec_models(INPUT_FILES, OUTPUT_FILES, embedding_config) 11 | 12 | print("Done with exporting") -------------------------------------------------------------------------------- /src/examples/test2_etnlp_extractor.py: -------------------------------------------------------------------------------- 1 | from etnlp_api import embedding_config 2 | from etnlp_api import embedding_extractor 3 | 4 | 5 | emb1 = "/W2V_C2V.vec" 6 | emb2 = "/ELMO.vec" 7 | emb3 = "/MULTI.vec" 8 | emb4 = "/FastText.vec" 9 | C2V = "../data/embedding_dicts/C2V.vec" 10 | out1 = "../data/embedding_dicts/W2V_C2V_23.vec" 11 | out2 = "../data/embedding_dicts/ELMO_23.vec" 12 | out3 = "../data/embedding_dicts/MULTI_23.vec" 13 | out4 = "../data/embedding_dicts/FastText_23.vec" 14 | 15 | VOCAB_FILE = "../data/vocab.txt" 16 | # OUTPUT_FORMAT=".txt;.npz;.gz" 17 | OUTPUT_FORMAT = ".txt" 18 | # embedding_config 19 | embedding_config.do_normalize_emb = True 20 | 21 | emb_files = [emb1, emb2, emb3, emb4] 22 | out_files = [out1, out2, out3, out4] 23 | 24 | for emb_file, out_file in zip(emb_files, out_files): 25 | embedding_extractor.extract_embedding_for_vocab_file(emb_file, VOCAB_FILE, 26 | C2V, out_file, OUTPUT_FORMAT) 27 | print("DONE") 28 | 29 | -------------------------------------------------------------------------------- /src/examples/test3_etnlp_evaluator.py: -------------------------------------------------------------------------------- 1 | from etnlp_api import embedding_evaluator 2 | import os 3 | import tensorflow as tf 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 5 | tf.logging.set_verbosity(tf.logging.ERROR) 6 | 7 | INPUT_FILES = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;" \ 8 | "../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 9 | ANALOGY_FILE = "../data/embedding_analogies/vi/solveable_analogies_vi.txt" 10 | OUT_FILE = "../data/embedding_analogies/vi/Multi_evaluator_results.txt" 11 | embedding_evaluator.evaluator_api(INPUT_FILES, ANALOGY_FILE, OUT_FILE) 12 | print("DONE") -------------------------------------------------------------------------------- 
/src/examples/test4_etnlp_visualizer.py: -------------------------------------------------------------------------------- 1 | # from etnlp_api import embedding_config 2 | from etnlp_api import embedding_visualizer 3 | # Semicolon-separated list of the extracted embeddings produced by test2_etnlp_extractor.py. 4 | INPUT_FILES = "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/FastText_23.vec;" \ 5 | "../data/embedding_dicts/W2V_C2V_23.vec;../data/embedding_dicts/MULTI_23.vec" 6 | embedding_visualizer.visualize_multiple_embeddings(INPUT_FILES) 7 | 8 | print("DONE") --------------------------------------------------------------------------------
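Taken together, the four example scripts form a single pipeline: preprocess, extract, evaluate, visualize. Below is a minimal end-to-end sketch, assuming the repository's sample-data layout and that it is run from `src/examples/`; the pre-trained embedding path `/ELMO.vec` is a placeholder, as in test2, and all function names and paths are taken from the scripts above.

```
# End-to-end sketch of the ETNLP pipeline; paths follow the sample data under src/data/.
from etnlp_api import embedding_preprocessing as emb_prep
from etnlp_api import embedding_config
from etnlp_api import embedding_extractor
from etnlp_api import embedding_evaluator
from etnlp_api import embedding_visualizer

# 1. Preprocessing: convert GloVe-format files into word2vec format (as in test1).
embedding_config.do_normalize_emb = False      # as in test1: no normalization at this stage
embedding_config.is_word2vec_format = False    # inputs are not yet in word2vec format
emb_prep.load_and_save_2_word2vec_models(
    "../data/glove2vec_dicts/glove1.vec;../data/glove2vec_dicts/glove2.vec",
    "../data/glove2vec_dicts/glove1_w2v.vec;../data/glove2vec_dicts/glove2_w2v.vec",
    embedding_config)

# 2. Extraction: keep only vectors for the words in vocab.txt (as in test2).
# "/ELMO.vec" stands in for a downloaded pre-trained embedding.
embedding_config.do_normalize_emb = True
embedding_extractor.extract_embedding_for_vocab_file(
    "/ELMO.vec", "../data/vocab.txt",
    "../data/embedding_dicts/C2V.vec",         # character-level embedding passed to the extractor
    "../data/embedding_dicts/ELMO_23.vec", ".txt")

# 3. Evaluation: word analogy task over the extracted embeddings (as in test3).
embedding_evaluator.evaluator_api(
    "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/MULTI_23.vec",
    "../data/embedding_analogies/vi/solveable_analogies_vi.txt",
    "../data/embedding_analogies/vi/Multi_evaluator_results.txt")

# 4. Visualization: browse the extracted embeddings interactively (as in test4).
embedding_visualizer.visualize_multiple_embeddings(
    "../data/embedding_dicts/ELMO_23.vec;../data/embedding_dicts/MULTI_23.vec")
```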