├── .gitignore
├── LICENSE.md
├── README.md
├── bin
│   ├── divide.py
│   ├── lstm.sh
│   ├── plot_tsne
│   ├── preprocess.py
│   ├── preprocess.sh
│   ├── run_pipeline.sh
│   ├── run_short_pipeline.sh
│   ├── semantics_check
│   ├── submit_scripts_for_supercomputers
│   │   ├── submit_clean.sh
│   │   ├── submit_clstm.sh
│   │   ├── submit_divide.sh
│   │   ├── submit_loaded_topics.sh
│   │   ├── submit_lstm.sh
│   │   ├── submit_make_wiki.sh
│   │   ├── submit_pipeline.sh
│   │   ├── submit_split.sh
│   │   ├── submit_topics.sh
│   │   └── submit_words2ids.sh
│   ├── test_topics.sh
│   └── wiki_extractor_launch.sh
├── documentation
│   ├── ml-project.pdf
│   ├── training_perplexities.png
│   ├── word_embeddings_and_topic_detection.pdf
│   └── word_embeddings_and_topic_detection_II.pdf
├── execution.txt
├── results
│   ├── all_perplexities_lstm.png
│   ├── learning_rate.png
│   ├── train_perplexities.png
│   └── train_perplexities_detail.png
└── src
    ├── __init__.py
    ├── context
    │   ├── TIMES.txt
    │   ├── __init__.py
    │   ├── creator.py
    │   ├── custom.py
    │   └── topics_analysis.py
    ├── lstm
    │   ├── __init__.py
    │   ├── clstm.py
    │   ├── input_pipeline.py
    │   ├── lstm.py
    │   ├── lstm_wp.py
    │   ├── reader.py
    │   ├── reader_frag.py
    │   ├── reader_test.py
    │   └── reader_topics.py
    ├── postprocess
    │   ├── semantics_check.py
    │   ├── test_topics.py
    │   └── tsne.py
    ├── preprocess
    │   ├── __init__.py
    │   ├── cleaner.py
    │   ├── embeddings.py
    │   ├── filter.py
    │   ├── transform_from_gensim.py
    │   ├── words2ids.py
    │   └── words2ids_validator.py
    └── utils
        ├── __init__.py
        ├── flatten.py
        ├── memory.py
        ├── split_1k.py
        └── vector_manager.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | .idea
92 | .iml
93 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # contextualLSTM
2 | Contextual LSTM for NLP tasks like word prediction
3 | 
4 | This repo's goal is to implement the Contextual LSTM model for word prediction as described in [Ghosh, S., Vinyals, O., Strope, B., Roy, S., Dean, T., & Heck, L. (n.d.). Contextual LSTM (CLSTM) models for Large scale NLP tasks. https://doi.org/10.1145/12351]
5 | 
6 | **Notes**: there are scripts to run the pipelines. However, the project needs a bit of cleanup. If anyone is interested in using it, please write to me or open an issue and I'll fix/help with any error you have.
7 | 
8 | 
9 | ## Data preprocessing and embeddings
10 | 
11 | Further details about the Wikipedia data preprocessing can be found at
12 | 
13 | ./documentation/word_embeddings_and_topic_detection.pdf
14 | 
15 | 
16 | ## Context creation with topic detection
17 | 
18 | Further details on the different gensim topic detection methods, as well as on the embedding arithmetic used for context creation, can be found at
19 | 
20 | ./documentation/word_embeddings_and_topic_detection_II.pdf
21 | 
22 | ## Execution
23 | 
24 | Download a Wikipedia dump, for example:
25 | 
26 | https://dumps.wikimedia.org/enwiki/20180420/enwiki-20180420-pages-articles.xml.bz2
27 | 
28 | After that, use the WikiExtractor launcher to process it:
29 | 
30 | `./wiki_extractor_launch.sh path_to_wikipedia_dump`
31 | 
32 | where `path_to_wikipedia_dump` is the file you downloaded (e.g. enwiki-20180420-pages-articles.xml.bz2).
33 | 
34 | 
35 | To run the whole pipeline, use:
36 | 
37 | `./run_pipeline.sh ../data/enwiki 500`
38 | 
39 | To run only the preprocessing step, use `./preprocess.sh ../data/enwiki 500 2`
40 | where:
41 | * `../data/enwiki` is the default path where the preprocess script extracted and cleaned the Wikipedia dump.
42 | * 500 is the desired embedding size, and 2 is the minimum number of occurrences a word needs to be kept in the embeddings vocabulary.
43 | 
44 | 
45 | To run just the pipeline with pre-trained embeddings of size 1000, run:
46 | 
47 | `./run_short_pipeline.sh ../data/ 1000`
48 | 
49 | You can download the required trained embeddings from here:
50 | 
51 | https://www.dropbox.com/s/ws6d8l6h6jp3ldc/embeddings.tar.gz?dl=0
52 | 
53 | You should place them inside the `models/` folder.
54 | 
55 | 
56 | ## LSTM
57 | 
58 | Basic LSTM implementation with TF at ./src/lstm/lstm.py
59 | 
60 | ## CLSTM
61 | 
62 | Contextual LSTM implementation with TF at ./src/lstm/clstm.py
63 | 
64 | **Although functional, this version is still too slow to be practical for training. If you want to collaborate or have any questions about it, feel free to contact me; I plan to finish it shortly and upload a detailed description of it.**
65 | 
66 | 
67 | ## Running individual scripts
68 | 
69 | Most files have their own execution script under the `bin/` folder.
70 | All scripts named submit_XXX.sh are designed to be run on a supercomputer with a Slurm queue system. To run them locally, just issue the python commands with the correct paths.
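For example, the manual equivalent of `bin/lstm.sh` (run from inside `bin/`, assuming the preprocessed data list and the embedding/vocabulary pickles already exist under `data/` and `models/`) looks roughly like this:

```bash
export PYTHONPATH="$PYTHONPATH:../src/"

python ../src/lstm/lstm.py \
    --data_path ../data/full.list \
    --embeddings ../models/eos/idWordVec_ \
    --model large \
    --use_fp16 True \
    --word_to_id ../models/eos/word2id_1000.pklz
```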
71 | 
72 | **Note:** due to the use of many different packages, not all files run with the same Python version (some need 2.7, others 3.5.2, and the rest 3.6); I expect to unify them (or state the required version clearly) soon.
73 | 
--------------------------------------------------------------------------------
/bin/divide.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Split the tokenized Wikipedia text into train (80%), validation (10%) and test (~10%) files.
3 | data = open("wiki_data", "r").read()
4 | data = data.split()
5 | size = len(data)
6 | 
7 | training = int(size*0.8)
8 | validation = int(size*0.1)
9 | 
10 | with open("wiki.train.txt", "w") as f:
11 |     f.write(" ".join(data[0:training]))
12 | 
13 | with open("wiki.valid.txt", "w") as f:
14 |     f.write(" ".join(data[training:training+validation]))
15 | 
16 | with open("wiki.test.txt", "w") as f:
17 |     f.write(" ".join(data[training+validation:]))  # remaining ~10% of the tokens
18 | 
--------------------------------------------------------------------------------
/bin/lstm.sh:
--------------------------------------------------------------------------------
1 | 
2 | python ../src/lstm/lstm.py \
3 |     --data_path ../data/full.list \
4 |     --embeddings ../models/eos/idWordVec_ \
5 |     --model large \
6 |     --use_fp16 True \
7 |     --word_to_id ../models/eos/word2id_1000.pklz
8 | 
9 | 
--------------------------------------------------------------------------------
/bin/plot_tsne:
--------------------------------------------------------------------------------
1 | export PYTHONPATH="$PYTHONPATH:../src/"
2 | 
3 | python2 ../src/postprocess/tsne.py -i ../models/idWordVec.pklz -w ../models/word2vec_org_200
4 | 
--------------------------------------------------------------------------------
/bin/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, "../src/")
3 | 
4 | from preprocess.cleaner import clean_data
5 | from preprocess.embeddings import create_embeddings
6 | from preprocess.transform_from_gensim import transform_gensim
7 | from preprocess.words2ids import translate_files
8 | from preprocess.words2ids_validator import check_translated_files
9 | from preprocess.filter import filter_data
10 | from utils.vector_manager import VectorManager
11 | from time import time
12 | 
13 | import argparse
14 | 
15 | """
16 | Orchestrating file which handles the whole preprocessing pipeline:
17 |     * Clean data
18 |     * Create embeddings
19 |     * Transform model structures
20 |     * Translate word lists to ID lists.
21 | """
22 | 
23 | if __name__ == '__main__':
24 | 
25 |     parser = argparse.ArgumentParser()
26 |     parser.add_argument('-d', '--data', type=str, help="Path of the data extracted with WikiExtractor", required=True)
27 |     parser.add_argument('-s', '--size', type=int, help="Size of the word embeddings.", default=200, required=False)
28 |     parser.add_argument('-c', '--count', type=int, help="Min count for embeddings (if set to something bigger than 1 "
29 |                                                          "you should manually handle the non-processed words, i.e. create"
30 |                                                          " an 'unknown' key and add it to the embeddings)", default=1, required=False)
31 | 
32 |     args = parser.parse_args()
33 | 
34 |     # Arguments parsing
35 |     data_path = args.data
36 |     emb_size = args.size  # size of the embedding vectors to create
37 |     min_count = args.count  # minimum word occurrences to be in embeddings set
38 | 
39 |     print("Starting Preprocess pipeline\n\t * Data path: %s\n\t * Embedding size: %s\n\t * Min count: %s" %
40 |           (data_path, emb_size, min_count))
41 | 
42 |     # Clean Wikipedia data
43 |     t0 = time()
44 |     sentences = clean_data(data_path)
45 | 
46 |     t1 = time()
47 |     print("Time cleaning data: %s\nCreating embeddings from cleaned data..." % (t1-t0))
48 | 
49 |     # Create embeddings from the cleaned data
50 |     model = create_embeddings(data_path, emb_size, min_count)
51 |     t2 = time()
52 |     print("Time creating embeddings: %s" % (t2-t1))
53 | 
54 |     print("Saving embeddings model...")
55 |     model.save("../models/word2vec_gensim_%s" % emb_size)
56 |     model.wv.save_word2vec_format("../models/word2vec_org_%s" % emb_size,
57 |                                   "../models/vocabulary_%s" % emb_size,
58 |                                   binary=False)
59 | 
60 | 
61 |     # Keep only:
62 |     #  * word2id dict (to translate the data from words to numerical IDs)
63 |     #  * id_word_vec (list of (id, word, embedding) triples)
64 |     t3 = time()
65 |     word2id, id_word_vec = transform_gensim(model.wv)
66 |     t4 = time()
67 |     print("Time transforming gensim to word2ID and idWordVec vectors: %s" % (t4-t3))
68 | 
69 |     # Save model for checkpointing
70 |     VectorManager.write_pickled("../models/word2id_%s" % emb_size, word2id)
71 |     VectorManager.write_pickled("../models/idWordVec_%s" % emb_size, id_word_vec)
72 | 
73 |     t5 = time()
74 |     translate_files(data_path, word2id)
75 |     t6 = time()
76 |     print("Time translating words to numbers: %s" % (t6-t5))
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
--------------------------------------------------------------------------------
/bin/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export PYTHONPATH="$PYTHONPATH:../src/"
4 | 
5 | data_path=$1
6 | embeddings_size=$2
7 | min_word_count_threshold=$3
8 | 
9 | # Preprocess all wiki data and create embeddings
10 | python2 preprocess.py \
11 |     --data ${data_path} \
12 |     --size ${embeddings_size} \
13 |     --count ${min_word_count_threshold}
14 | 
15 | # Put all the files into a list to be fed to TF LSTM
16 | find ../data/ -name *num_eos > ../data/full.list
--------------------------------------------------------------------------------
/bin/run_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export PYTHONPATH="$PYTHONPATH:../src/"
4 | 
5 | data_path=$1
6 | embeddings_size=$2
7 | 
8 | # Preprocess all wiki data and create embeddings
9 | python2 preprocess.py \
10 |     --data ${data_path} \
11 |     --size ${embeddings_size}
12 | 
13 | 
14 | # Put all the files into a list to be fed to TF LSTM
15 | find ../data/ -name *num_eos > ../data/full.list
16 | 
17 | # Run the LSTM
18 | python ../src/lstm/lstm.py \
19 |     --data_path ../data/full.list \
20 |     --embeddings ../models/idWordVec_${embeddings_size}.pklz \
21 |     --model large \
22 |     --use_fp16 True \
23 |     --word_to_id ../models/word2id_${embeddings_size}.pklz
--------------------------------------------------------------------------------
/bin/run_short_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export PYTHONPATH="$PYTHONPATH:../src/"
4 | 
5 | data_path=$1
6 | 
embeddings_size=$2 7 | 8 | 9 | python2 ../src/preprocess/filter.py \ 10 | --data ${data_path} \ 11 | --word_vector ../models/word2id_1000.pklz 12 | 13 | # Translate all wiki data 14 | python2 ../src/preprocess/words2ids.py \ 15 | --data ${data_path} \ 16 | --word_vector ../models/eos/word2id_1000.pklz 17 | 18 | 19 | # Put all the files into a list to be fed to TF LSTM 20 | find ../data/ -name *num_eos > ../data/full.list 21 | 22 | # Run the LSTM 23 | python ../src/lstm/lstm.py \ 24 | --data_path ../data/full.list \ 25 | --embeddings ../models/idWordVec_${embeddings_size}.pklz \ 26 | --model large \ 27 | --use_fp16 True \ 28 | --word_to_id ../models/word2id_${embeddings_size}.pklz 29 | -------------------------------------------------------------------------------- /bin/semantics_check: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PYTHONPATH:../src/" 2 | 3 | python2 ../src/postprocess/semantics_check.py -w ../models/word2vec_org_200 4 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="cleanerPar" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 14 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/preprocess/cleaner.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -w /gpfs/home/bsc19/bsc19277/contextualLSTM/models/word2id_1000.pklz" > job 15 | 16 | sbatch < job 17 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_clstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="clstm-Verb" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n16 12 | #SBATCH --mem=100000 13 | 14 | module purge && module load K80 cuda/8.0 mkl/2017.1 CUDNN/5.1.10-cuda_8.0 intel-opencl/2016 python/3.6.0+_ML 15 | #module purge; module load K80 cuda/7.5 mkl/2017.0.098 CUDNN/5.1.3 python/3.5.2_ML 16 | #module purge && module load K80 mkl/2017.0.098 cuda/7.5 CUDNN/5.1.3 intel-opencl/2016 python/2.7.12_ML 17 | #export PYTHONPATH=$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src 18 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lstm/clstm.py \ 19 | --data_path /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/small.list \ 20 | --embeddings /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/idWordVec_ \ 21 | --model small \ 22 | --use_fp16 True \ 23 | --word_to_id_path /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/word2id_" > job 24 | 25 | sbatch < job 26 | rm job 27 | #SBATCH --dependency=afterany:753016 28 | #SBATCH --gres gpu:0 29 | #SBATCH --constraint=k80 30 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_divide.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="divider" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | #SBATCH --mem=100000 13 | 14 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 15 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/bin/divide.py" > job 16 | 17 | sbatch < job 18 | rm job 19 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_loaded_topics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="hdp_topicsLoadedAnalysis" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t32:00:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 14 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lda/lda.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -m /gpfs/home/bsc19/bsc19277/contextualLSTM/models/topics -c /gpfs/home/bsc19/bsc19277/contextualLSTM/models/gensim_tfidf.mm.bz2 -i /gpfs/home/bsc19/bsc19277/contextualLSTM/models/gensim_wordids.txt.bz2 " > job 15 | 16 | sbatch < job 17 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="lstm" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n16 12 | #SBATCH --gres gpu:4 13 | #SBATCH --constraint=k80 14 | #SBATCH --mem=100000 15 | 16 | module purge && module load K80 cuda/8.0 mkl/2017.1 CUDNN/5.1.10-cuda_8.0 intel-opencl/2016 python/3.6.0+_ML 17 | 18 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lstm/lstm.py \ 19 | --data_path /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full.list \ 20 | --embeddings /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/idWordVec_ \ 21 | --model medium \ 22 | --use_fp16 True \ 23 | --word_to_id /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/word2id_200.pklz" > job 24 | 25 | sbatch < job 26 | rm job 27 | 28 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_make_wiki.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=make_wiki 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o make_wiki-%J.out 9 | #SBATCH -e make_wiki-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | python -m gensim.scripts.make_wiki /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki-20170220-pages-articles.xml.bz2 /gpfs/home/bsc19/bsc19277/contextualLSTM/models/gensim" > job 14 | 15 | sbatch < job 16 | rm job 17 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=word2vec 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o word2vec-%J.out 9 | #SBATCH -e word2vec-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/bin/preprocess.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -s 500" > job 14 | 15 | sbatch < job 16 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="splitr" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | #SBATCH --mem=100000 13 | 14 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 15 | 16 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/utils/split_1k.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full_lists/train.list -o /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/train/ 17 | 18 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/utils/split_1k.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full_lists/test.list -o /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/test/ 19 | 20 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/utils/split_1k.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full_lists/valid.list -o /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/valid/ 21 | " > job 22 | 23 | sbatch < job 24 | rm job 25 | 26 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_topics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="topicsAnalysis" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 14 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lda/lda.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -m /gpfs/home/bsc19/bsc19277/contextualLSTM/models/topics -w /gpfs/home/bsc19/bsc19277/contextualLSTM/models/word2id_1000.pklz" > job 15 | 16 | sbatch < job 17 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_words2ids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=word2ids 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o word2ids-%J.out
9 | #SBATCH -e word2ids-%J.err
10 | #SBATCH -N1
11 | #SBATCH -n12
12 | 
13 | export PYTHONPATH=$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src
14 | 
15 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/preprocess/words2ids.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -w /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/word2id_1000.pklz" > job
16 | 
17 | sbatch < job
18 | rm job
--------------------------------------------------------------------------------
/bin/test_topics.sh:
--------------------------------------------------------------------------------
1 | # LDA online
2 | python ../src/postprocess/test_topics.py \
3 |     -m ../models/topics/lda_online \
4 |     -w ../models/eos/word2id_1000.pklz \
5 |     -i ../models/topics/gensim_wordids.txt.bz2 \
6 |     -e ../models/eos/i2WordVec_1000.pklz
7 | 
8 | # LDA Parallel
9 | python ../src/postprocess/test_topics.py \
10 |     -m ../models/topics/lda_parallel_bf64b098-c517-47c8-9267-1ce116e0033d \
11 |     -w ../models/eos/word2id_1000.pklz \
12 |     -i ../models/topics/gensim_wordids.txt.bz2 \
13 |     -e ../models/eos/i2WordVec_1000.pklz
14 | 
15 | # LSI
16 | python ../src/postprocess/test_topics.py \
17 |     -m ../models/topics/lsa_c59e4bd3-1553-4ff1-a448-8c5be75d3f33 \
18 |     -w ../models/eos/word2id_1000.pklz \
19 |     -i ../models/topics/gensim_wordids.txt.bz2 \
20 |     -e ../models/eos/i2WordVec_1000.pklz
21 | 
--------------------------------------------------------------------------------
/bin/wiki_extractor_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Clean Wikipedia data
3 | 
4 | 
5 | wikipedia_dump_path=$1
6 | ../src/preprocess/wikiextractor/build/scripts-3.5/WikiExtractor.py -o data/enwiki ${wikipedia_dump_path}
7 | 
--------------------------------------------------------------------------------
/documentation/ml-project.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/ml-project.pdf
--------------------------------------------------------------------------------
/documentation/training_perplexities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/training_perplexities.png
--------------------------------------------------------------------------------
/documentation/word_embeddings_and_topic_detection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/word_embeddings_and_topic_detection.pdf
--------------------------------------------------------------------------------
/documentation/word_embeddings_and_topic_detection_II.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/word_embeddings_and_topic_detection_II.pdf
--------------------------------------------------------------------------------
/execution.txt:
--------------------------------------------------------------------------------
1 | # Execution
2 | 
3 | Most files have their own execution script under the /bin folder.
4 | All scripts named submit_XXX.sh are designed to be run on a supercomputer with a Slurm queue system.
To run them locally, just issue the corresponding python commands with the correct paths. **Note:** due to the use of many different packages, not all files run with the same Python version (some need 2.7, others 3.5.2, and the rest 3.6); I expect to unify them (or state the required version clearly) soon.
5 | 
6 | FINAL NOTE: when running files manually, do so from inside 'src' or add 'src' to the PYTHONPATH so that all dependencies are resolved.
7 | 
--------------------------------------------------------------------------------
/results/all_perplexities_lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/all_perplexities_lstm.png
--------------------------------------------------------------------------------
/results/learning_rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/learning_rate.png
--------------------------------------------------------------------------------
/results/train_perplexities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/train_perplexities.png
--------------------------------------------------------------------------------
/results/train_perplexities_detail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/train_perplexities_detail.png
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/__init__.py
--------------------------------------------------------------------------------
/src/context/TIMES.txt:
--------------------------------------------------------------------------------
1 | hdp_topicsLoadedAnalysis-764553.out:[BLOCK] Training time for LSA: 14390.89
2 | lsa__topicsLoadedAnalysis-764083.out:[BLOCK] Training time for LSA: 12396.78
3 | lsa__topicsLoadedAnalysis-764552.out:[BLOCK] Training time for LSA: 14483.59
4 | topicsLoadedAnalysis-764012.out:[BLOCK] Training time for LSA: 9075.62
5 | 
6 | 
7 | model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=11, passes=3)
8 | ldam__topicsLoadedAnalysis-764082.out:[BLOCK] Training time for LDA multicore: 52544.35
9 | 
10 | 
--------------------------------------------------------------------------------
/src/context/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/context/__init__.py
--------------------------------------------------------------------------------
/src/context/creator.py:
--------------------------------------------------------------------------------
1 | from utils.vector_manager import VectorManager
2 | from utils.flatten import flatten
3 | from gensim.corpora import Dictionary, MmCorpus
4 | from gensim.models import LsiModel, LdaMulticore, LdaModel, HdpModel
5 | from time import time
6 | 
7 | import numpy as np
8 | import argparse
9 | import pickle
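# NOTE: the `embeddings` argument used throughout this module is assumed to be the
# idWordVec_* structure produced by the preprocessing step: a list of
# (id, word, vector) triples, so embeddings[i][1] is the word string and
# embeddings[i][2] its embedding vector.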
10 | import sys
11 | 
12 | 
13 | 
14 | class TopicCreator(object):
15 | 
16 |     def __init__(self, dictionary_path, word2id, embeddings, lda=None, lsi=None):
17 |         self.dictionary = self.load_dict(dictionary_path)
18 |         self.word2id = VectorManager.read_vector(word2id)
19 |         # self.word2id = self.word2id_to_id2word(word2id)
20 |         self.embeddings = embeddings
21 |         self.lda = lda
22 |         self.lsi = lsi
23 | 
24 |     def load_dict(self, dict_path):
25 |         print("[BLOCK] Loading dictionary files from %s" % (dict_path))
26 |         sys.stdout.flush()
27 |         return Dictionary.load_from_text(dict_path)
28 | 
29 |     def word2id_to_id2word(self, word2id_path):
30 | 
31 |         word2id = pickle.load(open(word2id_path))
32 |         id2word_c = [0] * len(word2id)
33 |         for w in word2id:
34 |             id2word_c[word2id[w]] = w
35 |         return id2word_c
36 | 
37 | 
38 |     def get_lsa_topic_embeding(self, document):
39 |         """
40 |         Construct a context vector by doing the weighted sum of the embeddings of the words of most relevant lsi topic
41 |         :param document: sequence of text to get the context for
42 |         :return: numpy array with the context
43 |         """
44 |         if not self.lsi:
45 |             print("LSI model not provided")
46 |             raise Exception("LSI model not available")
47 | 
48 |         document = [self.embeddings[int(elem)][1] for elem in document]
49 |         corpus = [self.dictionary.doc2bow(document)]
50 |         corpus_topics = self.lsi[corpus][0]
51 | 
52 |         values = [abs(val) for _, val in corpus_topics]
53 |         index = values.index(max(values))
54 | 
55 |         topics = self.lsi.show_topic(index)
56 | 
57 |         # Weighted sum of the embeddings of the topic's words
58 |         embedding = np.zeros_like(self.embeddings[0][2], dtype=np.float32)
59 |         for word, weight in topics:
60 |             embedding = np.add(embedding, np.multiply(weight, self.embeddings[self.word2id[word]][2]))
61 | 
62 |         return embedding
63 | 
64 | 
65 |     def average_embeddings(self, document):
66 |         """
67 |         Construct a context vector by averaging the embeddings of the words in the document
68 |         :return: numpy array with the context
69 |         """
70 |         # This method only uses the word embeddings themselves, so no topic model
71 |         # (LSI/LDA) needs to be loaded for it.
72 | 
73 | 
74 |         document_embeddings = [self.embeddings[int(elem)][2] for elem in document]
75 | 
76 |         embedding = np.mean(document_embeddings, axis=0)
77 | 
78 |         return embedding
79 | 
80 |     def get_lda_best_topic_words(self, document):
81 |         """
82 |         Construct a context vector by returning the embedding of the most relevant word of the topic
83 |         :param document: sequence of text to get the context for
84 |         :return: numpy array with the context or unknown embedding if topic is not found
85 |         """
86 |         if not self.lda:
87 |             print("LDA model not provided")
88 |             raise Exception("LDA model not available")
89 | 
90 |         document = [self.embeddings[int(elem)][1] for elem in document]
91 |         corpus = [self.dictionary.doc2bow(document)]
92 |         top_topics = self.lda.top_topics(corpus, num_words=100)[0][0]
93 | 
94 |         if not top_topics[0][0] > 0.1:
95 |             topic_word = ''
96 |         else:
97 |             topic_word = top_topics[0][1]
98 | 
99 |         try:
100 |             embedding = self.embeddings[self.word2id[topic_word]][2]
101 |         except KeyError as e:
102 |             embedding = self.embeddings[self.word2id['']][2]
103 |             print("Word %s not found in word2id dict, returning UNK topic (%s)" % (topic_word, e))
104 | 
105 |         return embedding
106 | 
107 |     def get_lda_topic_embedding(self, document):
108 |         """
109 |         Construct a context vector by doing the weighted sum of the embeddings of the 10 most relevant words of the topic
110 |         :param document: sequence of text to get the context for
111 |         :return: numpy array with the context
112 |         """
113 |         if not self.lda:
114 |             print("LDA model 
not provided") 115 | raise Exception("LDA model not available") 116 | 117 | document = [self.embeddings[int(elem)][1] for elem in document] 118 | corpus = [self.dictionary.doc2bow(document)] 119 | topics = self.lda.top_topics(corpus, num_words=100)[0][0] 120 | top_topic = topics[0] 121 | 122 | if not top_topic[0] > 0: 123 | topic_embedding = self.embeddings[self.word2id['']][2] 124 | else: 125 | topic_embedding = np.zeros_like(self.embeddings[self.word2id['']][2], dtype=np.float32) 126 | for i in range(10): 127 | weight = topics[i][0] 128 | embed = self.embeddings[self.word2id[topics[i][1]]][2] 129 | update = np.multiply(weight, embed) 130 | topic_embedding = np.add(topic_embedding, update) 131 | 132 | return topic_embedding -------------------------------------------------------------------------------- /src/context/custom.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import RegexpTokenizer 2 | from stop_words import get_stop_words 3 | from nltk.stem.porter import PorterStemmer 4 | from gensim import corpora, models 5 | import gensim 6 | 7 | tokenizer = RegexpTokenizer(r'\w+') 8 | 9 | # create English stop words list 10 | en_stop = get_stop_words('en') 11 | 12 | # Create p_stemmer of class PorterStemmer 13 | p_stemmer = PorterStemmer() 14 | 15 | # create sample documents 16 | paragraphs = ["Space Exploration Technologies Corporation, better known as SpaceX, is an American aerospace manufacturer and space transport services company headquartered in Hawthorne, California. It was founded in 2002 by entrepreneur Elon Musk with the goal of reducing space transportation costs and enabling the colonization of Mars. SpaceX has since developed the Falcon launch vehicle family and the Dragon spacecraft family, which both currently deliver payloads into Earth orbit.", "SpaceX's achievements include the first privately funded liquid-propellant rocket to reach orbit (Falcon 1 in 2008); the first privately funded company to successfully launch, orbit, and recover a spacecraft (Dragon in 2010); the first private company to send a spacecraft to the International Space Station (Dragon in 2012), and the first propulsive landing for an orbital rocket. As of March 2017, SpaceX has since flown ten missions to the International Space Station (ISS) under a cargo resupply contract. NASA also awarded SpaceX a further development contract in 2011 to develop and demonstrate a human-rated Dragon, which would be used to transport astronauts to the ISS and return them safely to Earth.", "SpaceX announced in 2011 they were beginning a privately funded reusable launch system technology development program. In December 2015, a first stage was flown back to a landing pad near the launch site, where it successfully accomplished a propulsive vertical landing. This was the first such achievement by a rocket for orbital spaceflight. In April 2016, with the launch of CRS-8, SpaceX successfully vertically landed a first stage on an ocean drone-ship landing platform. In May 2016, in another first, SpaceX again landed a first stage, but during a significantly more energetic geostationary transfer orbit mission. 
In March 2017, SpaceX became the first to successfully re-launch and land the first stage of an orbital rocket.", "In 2016, CEO Elon Musk unveiled the mission architecture of the Interplanetary Transport System program, an ambitious privately funded initiative to develop spaceflight technology for use in manned interplanetary spaceflight, and which, if demand emerges, could lead to sustainable human settlements on Mars over the long term. This is the main purpose this System was designed for. In 2017, Elon Musk announced that the company had been contracted by two private individuals to send them in a Dragon spacecraft on a free return trajectory around the Moon. Provisionally launching in 2018, this could become the first instance of lunar tourism."] 17 | 18 | space_split = [line.split(" ") for line in paragraphs] 19 | 20 | # compile sample documents into a list 21 | 22 | # list for tokenized documents in loop 23 | #texts = [] 24 | 25 | # loop through document list 26 | #for i in paragraphs: 27 | 28 | # clean and tokenize document string 29 | # raw = i.lower() 30 | # tokens = tokenizer.tokenize(raw) 31 | 32 | # remove stop words from tokens 33 | # stopped_tokens = [i for i in tokens if not i in en_stop] 34 | 35 | # stem tokens 36 | # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] 37 | 38 | # add tokens to list 39 | # texts.append(stemmed_tokens) 40 | 41 | # turn our tokenized documents into a id <-> term dictionary 42 | dictionary = corpora.Dictionary(space_split) 43 | 44 | # convert tokenized documents into a document-term matrix 45 | corpus = [dictionary.doc2bow(text) for text in space_split] 46 | 47 | 48 | # Use first paragraph 49 | y = [dictionary.token2id.get(word) for word in space_split[0]] 50 | X = [0] + y 51 | # generate LDA model 52 | ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20) 53 | -------------------------------------------------------------------------------- /src/context/topics_analysis.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from utils.flatten import flatten 3 | from gensim.corpora import Dictionary, MmCorpus 4 | from gensim.models import LsiModel, LdaMulticore, LdaModel, HdpModel 5 | from time import time 6 | 7 | import multiprocessing as mp 8 | 9 | import argparse 10 | import os 11 | import sys 12 | import bz2 13 | 14 | stop_words = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", 15 | "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", 16 | "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", 17 | "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", 18 | "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", 19 | "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", 20 | "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", 21 | "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", 22 | "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", 23 | "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", 24 | "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", 25 
| "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", 26 | "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", 27 | "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 28 | "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", 29 | "your", "yours", "yourself", "yourselves"] 30 | 31 | 32 | def get_file_as_list(filename): 33 | words_list = VectorManager.parse_into_list(VectorManager.read_vector(filename)) 34 | words_list = [w for w in words_list if w not in stop_words] 35 | return words_list 36 | 37 | 38 | def get_lists(data_path): 39 | filepaths = [] 40 | for root, dirs, files in os.walk(data_path): 41 | filepaths.extend(["%s/%s" % (root, file) for file in files if file.endswith("_clean")]) 42 | 43 | p = mp.Pool(mp.cpu_count() * 2) 44 | files_list = p.map(get_file_as_list, filepaths) 45 | 46 | return filepaths, files_list 47 | 48 | 49 | def get_corpus_and_dict(data_path): 50 | print("[BLOCK] Getting corpus and dictionary files from %s" % (data_path)) 51 | sys.stdout.flush() 52 | 53 | file_paths, files_list = get_lists(data_path) 54 | 55 | print("[BLOCK] Building dictionary with %s documents" % len(files_list)) 56 | sys.stdout.flush() 57 | 58 | dictionary = Dictionary(files_list) 59 | 60 | print("[BLOCK] Filtering out %s (0.1)" % (int(len(dictionary)*0.1))) 61 | sys.stdout.flush() 62 | 63 | dictionary.filter_n_most_frequent(int(len(dictionary)*0.1)) 64 | 65 | # convert tokenized documents into a document-term matrix 66 | corpus = [dictionary.doc2bow(doc) for doc in files_list] 67 | 68 | return corpus, dictionary 69 | 70 | 71 | def load_corpus_and_dict(corpus_path, id2word_path): 72 | print("[BLOCK] Loading corpus and dictionary files from %s and %s" % (data_path, id2word_path)) 73 | sys.stdout.flush() 74 | dictionary = Dictionary.load_from_text(id2word_path) 75 | 76 | print("[BLOCK] Loading corpus iterator") 77 | sys.stdout.flush() 78 | #mm = gensim.corpora.MmCorpus(corpus_path) 79 | corpus = MmCorpus(bz2.BZ2File(corpus_path)) # use this if you compressed the TFIDF output (recommended) 80 | 81 | return corpus, dictionary 82 | 83 | 84 | def topic_analysis(corpus, dictionary, models_path, technique): 85 | 86 | import uuid 87 | uuid = str(uuid.uuid4()) 88 | print("[BLOCK] Starting models for context") 89 | sys.stdout.flush() 90 | 91 | if technique == "all" or technique == "hdp": 92 | t1 = time() 93 | # HDP model 94 | model = HdpModel(corpus, id2word=dictionary) 95 | model.save("%s/hdp_%s" % (models_path, uuid)) 96 | del model 97 | t2 = time() 98 | print("[BLOCK] Training time for HDP model: %s" % (round(t2-t1, 2))) 99 | sys.stdout.flush() 100 | 101 | if technique == "all" or technique == "ldap": 102 | t1 = time() 103 | # Parallel LDA model 104 | model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20) 105 | model.save("%s/lda_parallel_%s" % (models_path, uuid)) 106 | del model 107 | t2 = time() 108 | print("[BLOCK] Training time for LDA multicore: %s" % (round(t2-t1, 2))) 109 | sys.stdout.flush() 110 | 111 | if technique == "all" or technique == "lsa": 112 | t1 = time() 113 | # LSA model 114 | model = LsiModel(corpus, id2word=dictionary, num_topics=400) 115 | model.save("%s/lsa_%s" % (models_path, uuid)) 116 | del model 117 | t2 = time() 118 | print("[BLOCK] Training time for LSA: %s" % (round(t2-t1, 2))) 119 | sys.stdout.flush() 120 | 121 | if technique == "all" or technique == "ldao": 
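        # Online (mini-batch) variational Bayes: with update_every > 0, gensim's LdaModel
        # updates the model every `chunksize` documents instead of only after a full pass
        # over the corpus, which the batch variant ("lda", update_every=0) below does.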
122 | t1 = time() 123 | # Online LDA model 124 | model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5) 125 | model.save("%s/lda_online_%s" % (models_path, uuid)) 126 | t2 = time() 127 | print("[BLOCK] Training time for LDA online: %s" % (round(t2-t1, 2))) 128 | sys.stdout.flush() 129 | 130 | if technique == "all" or technique == "lda": 131 | t1 = time() 132 | # Offline LDA model 133 | model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20) 134 | model.save("%s/lda_offline_%s" % (models_path, uuid)) 135 | del model 136 | t2 = time() 137 | print("[BLOCK] Training time for LDA offline: %s" % (round(t2-t1, 2))) 138 | sys.stdout.flush() 139 | 140 | 141 | if __name__ == '__main__': 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be translated with word2id vector." 144 | " and clean up.", required=True) 145 | parser.add_argument('-m', '--models', type=str, help="Directory were the models will be stored.", required=True) 146 | parser.add_argument('-w', '--word_vector', type=str, help="Word2ID vector to be used for doc translation.", 147 | required=False, default=None) 148 | parser.add_argument('-c', '--corpus_path', type=str, help="Corpus iterator path [wiki_en_tfidf.mm.bz2].", 149 | required=False, default=None) 150 | parser.add_argument('-i', '--id_word', type=str, help="Id2Word vector path ['wiki_en_wordids.txt'].", 151 | required=False, default=None) 152 | parser.add_argument('-t', '--technique', type=str, help="Technique used for topic modeling. Available options all," 153 | "hierarchical dirichlet process (hdp), latent dirichlet allocation (lda), lda multicore (ldap)" 154 | "latent semantic anaylisis (lsa), lda online (ldao)", required=False, default="all") 155 | 156 | args = parser.parse_args() 157 | data_path = args.data 158 | models_path = args.models 159 | word2id_file = args.word_vector 160 | corpus_path = args.corpus_path 161 | id2word_path = args.id_word 162 | technique = args.technique 163 | 164 | begin = time() 165 | 166 | if word2id_file: 167 | w2Id = VectorManager.read_vector(word2id_file) 168 | 169 | if corpus_path and id2word_path: 170 | corpus, dictionary = load_corpus_and_dict(corpus_path, id2word_path) 171 | else: 172 | corpus, dictionary = get_corpus_and_dict(data_path) 173 | 174 | topic_analysis(corpus, dictionary, models_path, technique) 175 | 176 | end = time() 177 | print("Total processing time: %d seconds" % (end - begin)) 178 | -------------------------------------------------------------------------------- /src/lstm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/lstm/__init__.py -------------------------------------------------------------------------------- /src/lstm/clstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | To run: 3 | 4 | $ python lstm_frag.py --data_path=path/to/train.list 5 | 6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | sys.path.insert(0, "../src/") 13 | 14 | import inspect 15 | import time 16 | from utils.vector_manager import VectorManager 17 | from context.creator import TopicCreator 18 | # from context.create import get_lda_best_topic_words, get_lda_topic_embedding, get_lsa_topic_embeding 19 | 
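# The generator defined further below builds each CLSTM input step as the concatenation
# of three vectors of the same size: the current word embedding, a sentence-level
# context embedding and a paragraph-level one (hence the input width of
# 3 * embedding_size). The sentence and paragraph segments are reset whenever the
# end-of-sentence / end-of-paragraph marks are encountered in the data.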
import subprocess 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | from gensim.models import LsiModel, LdaModel 24 | 25 | flags = tf.flags 26 | logging = tf.logging 27 | 28 | flags.DEFINE_string( 29 | "model", "small", 30 | "A type of model. Possible options are: small, medium, large.") 31 | 32 | flags.DEFINE_string( 33 | "tasks", "all", 34 | "Tasks to be performed. Possible options are: all, train, test, valid") 35 | 36 | flags.DEFINE_string( 37 | "word2id_path", "../models/eos/word2id_", 38 | "A type of model. Possible options are: small, medium, large.") 39 | 40 | flags.DEFINE_string( 41 | "embeddings", "../models/eos/idWordVec_", 42 | "Embeddings path") 43 | 44 | flags.DEFINE_string("topic_model_path", "../models/topics/lda_parallel_bf64b098-c517-47c8-9267-1ce116e0033d", 45 | "Where the lda model is stored.") 46 | 47 | flags.DEFINE_string("dictionary_path", "../models/topics/gensim_wordids.txt.bz2", 48 | "Where the dictionary is stored.") 49 | 50 | flags.DEFINE_string("data_path", None, 51 | "Where the training/test data is stored.") 52 | flags.DEFINE_string("save_path", None, 53 | "Model output directory.") 54 | flags.DEFINE_bool("use_fp16", False, 55 | "Train using 16-bit floats instead of 32bit floats") 56 | 57 | flags.DEFINE_string("context", "lda", 58 | "Type of context to be used. Possible values are, lda, lda_mean, lsi, arithmetic") 59 | 60 | FLAGS = flags.FLAGS 61 | 62 | 63 | def data_type(): 64 | return tf.float16 if FLAGS.use_fp16 else tf.float32 65 | 66 | 67 | def get_context(topic_creator, segment): 68 | if FLAGS.context == "lda": 69 | return topic_creator.get_lda_best_topic_words(segment) 70 | if FLAGS.context == "lda_mean": 71 | return topic_creator.get_lda_topic_embedding(segment) 72 | if FLAGS.context == "lsi": 73 | return topic_creator.get_lsa_topic_embeding(segment) 74 | if FLAGS.context == "arithmetic": 75 | return topic_creator.average_embeddings(segment) 76 | 77 | 78 | def generate_arrays_from_list(name, topic_creator, files, embeddings, num_steps=35, batch_size=20, embedding_size=200): 79 | eos_mark = [id for id, w, vec in embeddings if w == ""][0] 80 | eop_mark = [id for id, w, vec in embeddings if w == ""][0] 81 | unknown_embedding = [vec for id, w, vec in embeddings if w == ""][0] 82 | debug = False 83 | # print("EOS mark: %s, EOP mark: %s" % (eos_mark, eop_mark)) 84 | while 1: 85 | for file_name in files: 86 | raw_list = VectorManager.parse_into_list(open(file_name).read()) 87 | 88 | n_words = len(raw_list) 89 | batch_len = n_words // batch_size 90 | data = np.reshape(raw_list[0:batch_size*batch_len], [batch_size, batch_len]) 91 | sentSegments = [list() for _ in range(batch_size)] 92 | parSegments = [list() for _ in range(batch_size)] 93 | 94 | 95 | for i in range(0, n_words - num_steps, 1): 96 | 97 | x = data[0:batch_size, i * num_steps:(i + 1) * num_steps] 98 | y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1] 99 | 100 | if len(x[0]) < num_steps or len(y[0]) < num_steps: 101 | break 102 | 103 | 104 | emb_x = [[embeddings[int(elem)][2] for elem in l] for l in x] 105 | emb_x = np.reshape(emb_x, newshape=(batch_size, num_steps, embedding_size)) 106 | 107 | final_x = np.zeros(shape=(batch_size, num_steps, len(embeddings[0][2])*3)) 108 | for batch in range(0, batch_size): 109 | for step in range(0, num_steps): 110 | if debug: 111 | print("%s == %s ? %s [eos]\n%s == %s ? 
%s[eop]" % (int(x[batch][step]), eos_mark, 112 | int(x[batch][step]) == eos_mark, 113 | int(x[batch][step]), eop_mark, 114 | int(x[batch][step]) == eop_mark)) 115 | if int(x[batch][step]) == eos_mark: 116 | sentSegments[batch] = [] 117 | else: 118 | sentSegments[batch].append(x[batch][step]) 119 | if int(x[batch][step]) == eop_mark: 120 | parSegments[batch] = [] 121 | else: 122 | parSegments[batch].append(x[batch][step]) 123 | 124 | sentTopic = unknown_embedding 125 | parTopic = unknown_embedding 126 | if sentSegments: 127 | sentTopic = get_context(topic_creator, sentSegments[batch]) 128 | 129 | if parSegments: 130 | if sentSegments[batch] == parSegments[batch]: 131 | parTopic = sentTopic 132 | else: 133 | parTopic = get_context(topic_creator, parSegments[batch]) 134 | 135 | final_x[batch][step] = np.hstack((emb_x[batch][step], sentTopic, parTopic)) 136 | 137 | 138 | 139 | if debug: 140 | print("Batch size %s\nNum steps %s\nEmbedding size %s" % (batch_size, num_steps, embedding_size 141 | )) 142 | print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s" % (len(x), len(x[0]), len(x[0][0]))) 143 | print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0]))) 144 | 145 | 146 | 147 | y = np.reshape(y, newshape=(batch_size, num_steps)) 148 | 149 | yield final_x, y 150 | 151 | class WPModel(object): 152 | """Word Prediction model.""" 153 | 154 | def __init__(self, is_training, config): 155 | 156 | self.config = config 157 | batch_size = config.batch_size 158 | num_steps = config.num_steps 159 | size = config.hidden_size 160 | vocab_size = config.vocab_size 161 | embedding_size = config.embedding_size 162 | 163 | def lstm_cell(): 164 | # With the latest TensorFlow source code (as of Mar 27, 2017), 165 | # the BasicLSTMCell will need a reuse parameter which is unfortunately not 166 | # defined in TensorFlow 1.0. 
To maintain backwards compatibility, we add 167 | # an argument check here: 168 | # if 'reuse' in inspect.getargspec( 169 | # tf.contrib.rnn.BasicLSTMCell.__init__).args: 170 | # return tf.contrib.rnn.BasicLSTMCell( 171 | # size, forget_bias=0.0, state_is_tuple=True, 172 | # reuse=tf.get_variable_scope().reuse) 173 | # else: 174 | return tf.contrib.rnn.BasicLSTMCell( 175 | size, forget_bias=0.0, state_is_tuple=True) 176 | 177 | attn_cell = lstm_cell 178 | if is_training and config.keep_prob < 1: 179 | def attn_cell(): 180 | return tf.contrib.rnn.DropoutWrapper( 181 | lstm_cell(), output_keep_prob=config.keep_prob) 182 | 183 | cell = tf.contrib.rnn.MultiRNNCell( 184 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 185 | 186 | self._initial_state = cell.zero_state(batch_size, data_type()) 187 | 188 | with tf.device("/cpu:0"): 189 | 190 | self.inputs = tf.placeholder(dtype=data_type(), shape=(batch_size, num_steps, embedding_size*3)) 191 | self.targets = tf.placeholder(dtype=tf.int32, shape=(batch_size, num_steps)) 192 | 193 | if is_training and config.keep_prob < 1: 194 | # Dropout allows to use the net for train and testing 195 | # See: https://stackoverflow.com/questions/34597316/why-input-is-scaled-in-tf-nn-dropout-in-tensorflow 196 | # and: http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf 197 | inputs = tf.nn.dropout(self.inputs, config.keep_prob) 198 | else: 199 | inputs = self.inputs 200 | 201 | inputs = tf.unstack(inputs, num=num_steps, axis=1) 202 | 203 | outputs, state = tf.contrib.rnn.static_rnn( 204 | cell, inputs, initial_state=self._initial_state) 205 | 206 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 207 | softmax_w = tf.get_variable( 208 | "softmax_w", [size, vocab_size], dtype=data_type()) 209 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 210 | logits = tf.matmul(output, softmax_w) + softmax_b 211 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 212 | [logits], 213 | [tf.reshape(self.targets, [-1])], 214 | [tf.ones([batch_size * num_steps], dtype=data_type())]) 215 | self._cost = cost = tf.reduce_sum(loss) / batch_size 216 | self._final_state = state 217 | 218 | if not is_training: 219 | return 220 | 221 | self._lr = tf.Variable(0.0, trainable=False) 222 | tvars = tf.trainable_variables() 223 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 224 | config.max_grad_norm) 225 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 226 | self._train_op = optimizer.apply_gradients( 227 | zip(grads, tvars), 228 | global_step=tf.contrib.framework.get_or_create_global_step()) 229 | 230 | self._new_lr = tf.placeholder( 231 | tf.float32, shape=[], name="new_learning_rate") 232 | self._lr_update = tf.assign(self._lr, self._new_lr) 233 | 234 | def assign_lr(self, session, lr_value): 235 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 236 | 237 | @property 238 | def input(self): 239 | return self._input 240 | 241 | @property 242 | def initial_state(self): 243 | return self._initial_state 244 | 245 | @property 246 | def cost(self): 247 | return self._cost 248 | 249 | @property 250 | def final_state(self): 251 | return self._final_state 252 | 253 | @property 254 | def lr(self): 255 | return self._lr 256 | 257 | @property 258 | def train_op(self): 259 | return self._train_op 260 | 261 | 262 | class SmallConfig(object): 263 | """Small config.""" 264 | init_scale = 0.1 265 | learning_rate = 1.0 266 | max_grad_norm = 5 267 | num_layers = 1 268 | num_steps = 20 
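The generator above builds each LSTM input by concatenating the current word embedding with a sentence-level and a paragraph-level context vector, which is why WPModel's input placeholder is embedding_size*3 wide. The sketch below reproduces only that concatenation step; the toy sizes and the random vectors standing in for the embeddings and TopicCreator outputs are assumptions for illustration.

import numpy as np

embedding_size = 4            # toy size; the configs here use 200 or 1000
batch_size, num_steps = 2, 3

emb_x = np.random.rand(batch_size, num_steps, embedding_size)  # word vectors
sent_topic = np.random.rand(embedding_size)                    # sentence-level context
par_topic = np.random.rand(embedding_size)                     # paragraph-level context

final_x = np.zeros((batch_size, num_steps, embedding_size * 3))
for b in range(batch_size):
    for t in range(num_steps):
        # word vector + sentence topic + paragraph topic
        final_x[b][t] = np.hstack((emb_x[b][t], sent_topic, par_topic))

print(final_x.shape)  # (2, 3, 12), i.e. [batch_size, num_steps, embedding_size * 3]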
269 | hidden_size = 200 270 | max_epoch = 4 271 | max_max_epoch = 13 272 | keep_prob = 1.0 273 | lr_decay = 0.5 274 | batch_size = 20 275 | vocab_size = 126930 276 | embedding_size = 200 277 | epoch_size = 1 278 | 279 | class MediumConfig(object): 280 | """Medium config.""" 281 | init_scale = 0.05 282 | learning_rate = 1.0 283 | max_grad_norm = 5 284 | num_layers = 1 285 | num_steps = 35 286 | hidden_size = 512 287 | max_epoch = 6 288 | max_max_epoch = 39 289 | keep_prob = 0.5 290 | lr_decay = 0.8 291 | batch_size = 20 292 | vocab_size = 126930 293 | embedding_size = 200 294 | epoch_size = 1 295 | 296 | class LargeConfig(object): 297 | """Large config.""" 298 | init_scale = 0.04 299 | learning_rate = 1.0 300 | max_grad_norm = 10 301 | num_layers = 1 302 | num_steps = 35 303 | hidden_size = 1024 304 | max_epoch = 14 305 | max_max_epoch = 55 306 | keep_prob = 0.35 307 | lr_decay = 1 / 1.15 308 | batch_size = 20 309 | vocab_size = 126930 310 | embedding_size = 1000 311 | epoch_size = 1 312 | 313 | class TestConfig(object): 314 | """Tiny config, for testing.""" 315 | init_scale = 0.1 316 | learning_rate = 1.0 317 | max_grad_norm = 1 318 | num_layers = 1 319 | num_steps = 2 320 | hidden_size = 2 321 | max_epoch = 1 322 | max_max_epoch = 1 323 | keep_prob = 1.0 324 | lr_decay = 0.5 325 | batch_size = 10 326 | vocab_size = 126930 327 | embedding_size = 200 328 | epoch_size = 1 329 | 330 | 331 | def run_epoch(session, generator, model, eval_op=None, verbose=False): 332 | """Runs the model on the given data.""" 333 | start_time = time.time() 334 | costs = 0.0 335 | iters = 0 336 | config = model.config 337 | state = session.run(model.initial_state) 338 | 339 | fetches = { 340 | "cost": model.cost, 341 | "final_state": model.final_state, 342 | } 343 | if eval_op is not None: 344 | fetches["eval_op"] = eval_op 345 | 346 | print("Epoch size starting training %s" % config.epoch_size) 347 | sys.stdout.flush() 348 | for step in range(config.epoch_size): 349 | x, y = next(generator) 350 | feed_dict = {} 351 | for i, (c, h) in enumerate(model.initial_state): 352 | feed_dict[c] = state[i].c 353 | feed_dict[h] = state[i].h 354 | # feed_dict["embeddings"] = embeddings 355 | feed_dict[model.inputs] = x 356 | feed_dict[model.targets] = y 357 | 358 | vals = session.run(fetches, feed_dict) 359 | cost = vals["cost"] 360 | state = vals["final_state"] 361 | 362 | costs += cost 363 | iters += config.num_steps 364 | 365 | if verbose and step % 100 == 0: 366 | print("%.3f perplexity: %.3f speed: %.0f wps" % 367 | (step * 1.0 / config.epoch_size, np.exp(costs / iters), 368 | iters * config.batch_size / (time.time() - start_time))) 369 | sys.stdout.flush() 370 | 371 | return np.exp(costs / iters) 372 | 373 | 374 | def get_config(): 375 | if FLAGS.model == "small": 376 | return SmallConfig() 377 | elif FLAGS.model == "medium": 378 | return MediumConfig() 379 | elif FLAGS.model == "large": 380 | return LargeConfig() 381 | elif FLAGS.model == "test": 382 | return TestConfig() 383 | else: 384 | raise ValueError("Invalid model: %s", FLAGS.model) 385 | 386 | def get_epoch_size(files, config): 387 | total = 0 388 | for file in files: 389 | file_words = subprocess.check_output(['wc', '-w', file]) 390 | number = file_words.split()[0] 391 | words = int(number) 392 | total += words - (words % (config.batch_size * config.num_steps)) 393 | print("Total words: %s, Batch size: %s, Num steps: %s" % (total, config.batch_size, config.num_steps)) 394 | sys.stdout.flush() 395 | epoch_size = ((total // config.batch_size) - 1) // 
config.num_steps 396 | 397 | return epoch_size 398 | 399 | def main(_): 400 | if not FLAGS.data_path: 401 | raise ValueError("Must set --data_path to wiki data directory list") 402 | 403 | vocab_size = 126930 404 | 405 | config = get_config() 406 | config.vocab_size = vocab_size 407 | 408 | valid_config = get_config() 409 | config.vocab_size = vocab_size 410 | 411 | 412 | eval_config = get_config() 413 | eval_config.batch_size = 1 414 | eval_config.num_steps = 1 415 | eval_config.vocab_size = vocab_size 416 | 417 | embeddings = VectorManager.read_vector("%s%s.pklz" % (FLAGS.embeddings, config.embedding_size)) 418 | 419 | # Load LDA or LSI model for topic creator 420 | if "lda" in FLAGS.context: 421 | model = LdaModel.load(FLAGS.topic_model_path) 422 | elif "lsi" in FLAGS.context: 423 | model = LsiModel.load(FLAGS.topic_model_path) 424 | else: 425 | model = None 426 | 427 | topic_creator = TopicCreator(FLAGS.dictionary_path, "%s%s.pklz" % (FLAGS.word2id_path, config.embedding_size), 428 | embeddings, model) 429 | files = open(FLAGS.data_path).read().split() 430 | 431 | training_list = files[0:int(0.8 * len(files))] 432 | validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))] 433 | testing_list = files[int(0.9 * len(files)):len(files)] 434 | 435 | print("Lists sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" % 436 | (len(training_list), len(validation_list), len(testing_list))) 437 | 438 | config.epoch_size = get_epoch_size(training_list, config) 439 | valid_config.epoch_size = get_epoch_size(validation_list, valid_config) 440 | eval_config.epoch_size = get_epoch_size(testing_list, eval_config) 441 | 442 | gen_train = generate_arrays_from_list("Train", topic_creator, training_list, embeddings, batch_size=config.batch_size, 443 | embedding_size=config.embedding_size, num_steps=config.num_steps) 444 | 445 | gen_valid = generate_arrays_from_list("Validation", topic_creator, validation_list, embeddings, batch_size=valid_config.batch_size, 446 | embedding_size=valid_config.embedding_size, num_steps=valid_config.num_steps) 447 | 448 | gen_test = generate_arrays_from_list("Test", topic_creator, testing_list, embeddings, batch_size=eval_config.batch_size, 449 | embedding_size=eval_config.embedding_size, num_steps=eval_config.num_steps) 450 | 451 | print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" % 452 | (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size)) 453 | sys.stdout.flush() 454 | with tf.Graph().as_default(): 455 | # Args: [minval, maxval] 456 | initializer = tf.random_uniform_initializer(-config.init_scale, 457 | config.init_scale) 458 | 459 | with tf.name_scope("Train"): 460 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 461 | m = WPModel(is_training=True, config=config) 462 | tf.summary.scalar("Training Loss", m.cost) 463 | tf.summary.scalar("Learning Rate", m.lr) 464 | 465 | with tf.name_scope("Valid"): 466 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 467 | mvalid = WPModel(is_training=False, config=valid_config) 468 | tf.summary.scalar("Validation Loss", mvalid.cost) 469 | 470 | with tf.name_scope("Test"): 471 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 472 | mtest = WPModel(is_training=False, config=eval_config) 473 | 474 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 475 | with sv.managed_session() as session: 476 | for i in range(config.max_max_epoch): 477 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 478 | 
m.assign_lr(session, config.learning_rate * lr_decay) 479 | 480 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 481 | train_perplexity = run_epoch(session, generator=gen_train, model=m, eval_op=m.train_op, 482 | verbose=True) 483 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 484 | valid_perplexity = run_epoch(session, generator=gen_valid, model=mvalid) 485 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 486 | 487 | test_perplexity = run_epoch(session, generator=gen_test, model=mtest) 488 | print("Test Perplexity: %.3f" % test_perplexity) 489 | 490 | if FLAGS.save_path: 491 | print("Saving model to %s." % FLAGS.save_path) 492 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 493 | 494 | 495 | if __name__ == "__main__": 496 | tf.app.run() 497 | -------------------------------------------------------------------------------- /src/lstm/input_pipeline.py: -------------------------------------------------------------------------------- 1 | # TensorFlow Input Pipelines for Large Data Sets 2 | # ischlag.github.io 3 | # TensorFlow 0.11, 07.11.2016 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | import threading 8 | 9 | # Generating some simple data 10 | r = np.arange(0.0,100003.0) 11 | raw_data = np.dstack((r,r,r,r))[0] 12 | raw_target = np.array([[1,0,0]] * 100003) 13 | 14 | # are used to feed data into our queue 15 | queue_input_data = tf.placeholder(tf.float32, shape=[20, 4]) 16 | queue_input_target = tf.placeholder(tf.float32, shape=[20, 3]) 17 | 18 | queue = tf.FIFOQueue(capacity=50, dtypes=[tf.float32, tf.float32], shapes=[[4], [3]]) 19 | 20 | enqueue_op = queue.enqueue_many([queue_input_data, queue_input_target]) 21 | dequeue_op = queue.dequeue() 22 | 23 | # tensorflow recommendation: 24 | # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size 25 | data_batch, target_batch = tf.train.batch(dequeue_op, batch_size=15, capacity=40) 26 | # use this to shuffle batches: 27 | # data_batch, target_batch = tf.train.shuffle_batch(dequeue_op, batch_size=15, capacity=40, min_after_dequeue=5) 28 | 29 | def enqueue(sess): 30 | """ Iterates over our data puts small junks into our queue.""" 31 | under = 0 32 | max = len(raw_data) 33 | while True: 34 | print("starting to write into queue") 35 | upper = under + 20 36 | print("try to enqueue ", under, " to ", upper) 37 | if upper <= max: 38 | curr_data = raw_data[under:upper] 39 | curr_target = raw_target[under:upper] 40 | under = upper 41 | else: 42 | rest = upper - max 43 | curr_data = np.concatenate((raw_data[under:max], raw_data[0:rest])) 44 | curr_target = np.concatenate((raw_target[under:max], raw_target[0:rest])) 45 | under = rest 46 | 47 | sess.run(enqueue_op, feed_dict={queue_input_data: curr_data, 48 | queue_input_target: curr_target}) 49 | print("added to the queue") 50 | print("finished enqueueing") 51 | 52 | # start the threads for our FIFOQueue and batch 53 | sess = tf.Session() 54 | enqueue_thread = threading.Thread(target=enqueue, args=[sess]) 55 | enqueue_thread.isDaemon() 56 | enqueue_thread.start() 57 | 58 | coord = tf.train.Coordinator() 59 | threads = tf.train.start_queue_runners(coord=coord, sess=sess) 60 | 61 | # Fetch the data from the pipeline and put it where it belongs (into your model) 62 | for i in range(5): 63 | run_options = tf.RunOptions(timeout_in_ms=4000) 64 | curr_data_batch, curr_target_batch = sess.run([data_batch, target_batch], options=run_options) 65 | print(curr_data_batch) 66 | 67 | # 
shutdown everything to avoid zombies 68 | sess.run(queue.close(cancel_pending_enqueues=True)) 69 | coord.request_stop() 70 | coord.join(threads) 71 | sess.close() 72 | Blog -------------------------------------------------------------------------------- /src/lstm/lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | To run: 3 | 4 | $ python lstm_frag.py --data_path=path/to/train.list 5 | 6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | sys.path.insert(0, "../src/") 13 | 14 | from utils.vector_manager import VectorManager 15 | 16 | import numpy as np 17 | import tensorflow as tf 18 | 19 | import subprocess 20 | import inspect 21 | import time 22 | 23 | 24 | flags = tf.flags 25 | logging = tf.logging 26 | 27 | flags.DEFINE_string( 28 | "model", "small", 29 | "A type of model. Possible options are: small, medium, large.") 30 | 31 | flags.DEFINE_string( 32 | "tasks", "all", 33 | "Tasks to be performed. Possible options are: all, train, test, valid") 34 | 35 | flags.DEFINE_string( 36 | "word_to_id_path", "../models/eos/word2id_1000.pklz", 37 | "A type of model. Possible options are: small, medium, large.") 38 | 39 | flags.DEFINE_string( 40 | "embeddings", "../models/eos/idWordVec_", 41 | "Embeddings path") 42 | 43 | flags.DEFINE_string("data_path", None, 44 | "Where the training/test data is stored.") 45 | flags.DEFINE_string("save_path", None, 46 | "Model output directory.") 47 | flags.DEFINE_bool("use_fp16", False, 48 | "Train using 16-bit floats instead of 32bit floats") 49 | 50 | FLAGS = flags.FLAGS 51 | 52 | 53 | def data_type(): 54 | return tf.float16 if FLAGS.use_fp16 else tf.float32 55 | 56 | 57 | def get_vocab_size(): 58 | word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path) 59 | size = len(word_to_id) 60 | print("Vocabulary size: %s" % size) 61 | return size 62 | 63 | 64 | def generate_arrays_from_list(name, files, embeddings, num_steps=35, batch_size=20, embedding_size=200): 65 | 66 | debug = False 67 | while 1: 68 | for file_name in files: 69 | print("Generating from file %s for %s" % (file_name, name)) 70 | raw_list = VectorManager.parse_into_list(open(file_name).read()) 71 | 72 | n_words = len(raw_list) 73 | batch_len = n_words // batch_size 74 | data = np.reshape(raw_list[0:batch_size*batch_len], [batch_size, batch_len]) 75 | 76 | for i in range(0, n_words - num_steps, 1): 77 | 78 | x = data[0:batch_size, i * num_steps:(i + 1) * num_steps] 79 | x = [[embeddings[int(elem)][2] for elem in l] for l in x] 80 | y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1] 81 | 82 | 83 | if len(x[0]) < num_steps or len(y[0]) < num_steps: 84 | break 85 | if debug: 86 | print("Batch size %s\nNum steps %s\nEmbedding size %s" % (batch_size, num_steps, embedding_size 87 | )) 88 | print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s" % (len(x), len(x[0]), len(x[0][0]))) 89 | print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0]))) 90 | x = np.reshape(x, newshape=(batch_size, num_steps, embedding_size)) 91 | 92 | y = np.reshape(y, newshape=(batch_size, num_steps)) 93 | 94 | yield x, y 95 | 96 | class WPModel(object): 97 | """Word Prediction model.""" 98 | 99 | def __init__(self, is_training, config): 100 | 101 | self.config = config 102 | batch_size = config.batch_size 103 | num_steps = config.num_steps 104 | size = config.hidden_size 105 | vocab_size = config.vocab_size 106 | embedding_size = config.embedding_size 107 | 108 | def 
lstm_cell(): 109 | # With the latest TensorFlow source code (as of Mar 27, 2017), 110 | # the BasicLSTMCell will need a reuse parameter which is unfortunately not 111 | # defined in TensorFlow 1.0. To maintain backwards compatibility, we add 112 | # an argument check here: 113 | # if 'reuse' in inspect.getargspec( 114 | # tf.contrib.rnn.BasicLSTMCell.__init__).args: 115 | # return tf.contrib.rnn.BasicLSTMCell( 116 | # size, forget_bias=0.0, state_is_tuple=True, 117 | # reuse=tf.get_variable_scope().reuse) 118 | # else: 119 | return tf.contrib.rnn.BasicLSTMCell( 120 | size, forget_bias=0.0, state_is_tuple=True) 121 | 122 | attn_cell = lstm_cell 123 | if is_training and config.keep_prob < 1: 124 | def attn_cell(): 125 | return tf.contrib.rnn.DropoutWrapper( 126 | lstm_cell(), output_keep_prob=config.keep_prob) 127 | 128 | cell = tf.contrib.rnn.MultiRNNCell( 129 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 130 | 131 | self._initial_state = cell.zero_state(batch_size, data_type()) 132 | 133 | with tf.device("/cpu:0"): 134 | self.inputs = tf.placeholder(dtype=data_type(), shape=(batch_size, num_steps, embedding_size)) 135 | self.targets = tf.placeholder(dtype=tf.int32, shape=(batch_size, num_steps)) 136 | 137 | if is_training and config.keep_prob < 1: 138 | # Dropout allows to use the net for train and testing 139 | # See: https://stackoverflow.com/questions/34597316/why-input-is-scaled-in-tf-nn-dropout-in-tensorflow 140 | # and: http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf 141 | inputs = tf.nn.dropout(self.inputs, config.keep_prob) 142 | else: 143 | inputs = self.inputs 144 | 145 | inputs = tf.unstack(inputs, num=num_steps, axis=1) 146 | 147 | outputs, state = tf.contrib.rnn.static_rnn( 148 | cell, inputs, initial_state=self._initial_state) 149 | 150 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 151 | softmax_w = tf.get_variable( 152 | "softmax_w", [size, vocab_size], dtype=data_type()) 153 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 154 | logits = tf.matmul(output, softmax_w) + softmax_b 155 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 156 | [logits], 157 | [tf.reshape(self.targets, [-1])], 158 | [tf.ones([batch_size * num_steps], dtype=data_type())]) 159 | self._cost = cost = tf.reduce_sum(loss) / batch_size 160 | self._final_state = state 161 | 162 | if not is_training: 163 | return 164 | 165 | self._lr = tf.Variable(0.0, trainable=False) 166 | tvars = tf.trainable_variables() 167 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 168 | config.max_grad_norm) 169 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 170 | self._train_op = optimizer.apply_gradients( 171 | zip(grads, tvars), 172 | global_step=tf.contrib.framework.get_or_create_global_step()) 173 | 174 | self._new_lr = tf.placeholder( 175 | tf.float32, shape=[], name="new_learning_rate") 176 | self._lr_update = tf.assign(self._lr, self._new_lr) 177 | 178 | def assign_lr(self, session, lr_value): 179 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 180 | 181 | @property 182 | def input(self): 183 | return self._input 184 | 185 | @property 186 | def initial_state(self): 187 | return self._initial_state 188 | 189 | @property 190 | def cost(self): 191 | return self._cost 192 | 193 | @property 194 | def final_state(self): 195 | return self._final_state 196 | 197 | @property 198 | def lr(self): 199 | return self._lr 200 | 201 | @property 202 | def train_op(self): 203 | return 
self._train_op 204 | 205 | 206 | class SmallConfig(object): 207 | """Small config.""" 208 | init_scale = 0.1 209 | learning_rate = 1.0 210 | max_grad_norm = 5 211 | num_layers = 1 212 | num_steps = 20 213 | hidden_size = 200 214 | max_epoch = 2 215 | max_max_epoch = 13 216 | keep_prob = 1.0 217 | lr_decay = 0.5 218 | batch_size = 20 219 | vocab_size = 126930 220 | embedding_size = 200 221 | epoch_size = 1 222 | 223 | class MediumConfig(object): 224 | """Medium config.""" 225 | init_scale = 0.05 226 | learning_rate = 1.0 227 | max_grad_norm = 5 228 | num_layers = 1 229 | num_steps = 35 230 | hidden_size = 512 231 | max_epoch = 6 232 | max_max_epoch = 39 233 | keep_prob = 0.5 234 | lr_decay = 0.8 235 | batch_size = 20 236 | vocab_size = 126930 237 | embedding_size = 200 238 | epoch_size = 1 239 | 240 | class LargeConfig(object): 241 | """Large config.""" 242 | init_scale = 0.04 243 | learning_rate = 1.0 244 | max_grad_norm = 10 245 | num_layers = 1 246 | num_steps = 35 247 | hidden_size = 1024 248 | max_epoch = 14 249 | max_max_epoch = 55 250 | keep_prob = 0.35 251 | lr_decay = 1 / 1.15 252 | batch_size = 20 253 | vocab_size = 126930 254 | embedding_size = 1000 255 | epoch_size = 1 256 | 257 | class TestConfig(object): 258 | """Tiny config, for testing.""" 259 | init_scale = 0.1 260 | learning_rate = 1.0 261 | max_grad_norm = 1 262 | num_layers = 1 263 | num_steps = 2 264 | hidden_size = 2 265 | max_epoch = 1 266 | max_max_epoch = 1 267 | keep_prob = 1.0 268 | lr_decay = 0.5 269 | batch_size = 10 270 | vocab_size = 126930 271 | embedding_size = 200 272 | epoch_size = 1 273 | 274 | 275 | def run_epoch(session, generator, model, eval_op=None, verbose=False): 276 | """Runs the model on the given data.""" 277 | start_time = time.time() 278 | costs = 0.0 279 | iters = 0 280 | config = model.config 281 | state = session.run(model.initial_state) 282 | 283 | fetches = { 284 | "cost": model.cost, 285 | "final_state": model.final_state, 286 | } 287 | if eval_op is not None: 288 | fetches["eval_op"] = eval_op 289 | 290 | print("Epoch size starting training %s" % config.epoch_size) 291 | for step in range(config.epoch_size): 292 | x, y = next(generator) 293 | feed_dict = {} 294 | for i, (c, h) in enumerate(model.initial_state): 295 | feed_dict[c] = state[i].c 296 | feed_dict[h] = state[i].h 297 | feed_dict[model.inputs] = x 298 | feed_dict[model.targets] = y 299 | 300 | vals = session.run(fetches, feed_dict) 301 | cost = vals["cost"] 302 | state = vals["final_state"] 303 | 304 | costs += cost 305 | iters += config.num_steps 306 | 307 | # if verbose and step % 100 == 0: 308 | print("%.3f perplexity: %.3f speed: %.0f wps" % 309 | (step * 1.0 / config.epoch_size, np.exp(costs / iters), 310 | iters * config.batch_size / (time.time() - start_time))) 311 | sys.stdout.flush() 312 | 313 | return np.exp(costs / iters) 314 | 315 | 316 | def get_config(): 317 | if FLAGS.model == "small": 318 | return SmallConfig() 319 | elif FLAGS.model == "medium": 320 | return MediumConfig() 321 | elif FLAGS.model == "large": 322 | return LargeConfig() 323 | elif FLAGS.model == "test": 324 | return TestConfig() 325 | else: 326 | raise ValueError("Invalid model: %s", FLAGS.model) 327 | 328 | def get_epoch_size(files, config): 329 | total = 0 330 | for file in files: 331 | file_words = subprocess.check_output(['wc', '-w', file]) 332 | number = file_words.split()[0] 333 | words = int(number) 334 | total += words - (words % (config.batch_size * config.num_steps)) 335 | print("Total words: %s, Batch size: %s, Num steps: %s" % (total, 
config.batch_size, config.num_steps)) 336 | sys.stdout.flush() 337 | epoch_size = ((total // config.batch_size) - 1) // config.num_steps 338 | 339 | return epoch_size 340 | 341 | def main(_): 342 | if not FLAGS.data_path: 343 | raise ValueError("Must set --data_path to wiki data directory list") 344 | 345 | vocab_size = 126930 346 | 347 | config = get_config() 348 | config.vocab_size = vocab_size 349 | 350 | valid_config = get_config() 351 | config.vocab_size = vocab_size 352 | 353 | 354 | eval_config = get_config() 355 | eval_config.batch_size = 1 356 | eval_config.num_steps = 1 357 | eval_config.vocab_size = vocab_size 358 | 359 | print("Embeddings path: {}".format(FLAGS.embeddings)) 360 | embeddings = VectorManager.read_vector(FLAGS.embeddings) 361 | files = open(FLAGS.data_path).read().split() 362 | 363 | training_list = files[0:int(0.8 * len(files))] 364 | validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))] 365 | testing_list = files[int(0.9 * len(files)):len(files)] 366 | 367 | config.epoch_size = get_epoch_size(training_list, config) 368 | valid_config.epoch_size = get_epoch_size(validation_list, valid_config) 369 | eval_config.epoch_size = get_epoch_size(testing_list, eval_config) 370 | 371 | gen_train = generate_arrays_from_list("Train", training_list, embeddings, batch_size=config.batch_size, 372 | embedding_size=config.embedding_size, num_steps=config.num_steps) 373 | gen_valid = generate_arrays_from_list("Validation", validation_list, embeddings, batch_size=valid_config.batch_size, 374 | embedding_size=valid_config.embedding_size, num_steps=valid_config.num_steps) 375 | gen_test = generate_arrays_from_list("Test", testing_list, embeddings, batch_size=eval_config.batch_size, 376 | embedding_size=eval_config.embedding_size, num_steps=eval_config.num_steps) 377 | 378 | print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" % 379 | (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size)) 380 | sys.stdout.flush() 381 | with tf.Graph().as_default(): 382 | # Args: [minval, maxval] 383 | initializer = tf.random_uniform_initializer(-config.init_scale, 384 | config.init_scale) 385 | 386 | with tf.name_scope("Train"): 387 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 388 | m = WPModel(is_training=True, config=config) 389 | tf.summary.scalar("Training Loss", m.cost) 390 | tf.summary.scalar("Learning Rate", m.lr) 391 | 392 | with tf.name_scope("Valid"): 393 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 394 | mvalid = WPModel(is_training=False, config=valid_config) 395 | tf.summary.scalar("Validation Loss", mvalid.cost) 396 | 397 | with tf.name_scope("Test"): 398 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 399 | mtest = WPModel(is_training=False, config=eval_config) 400 | 401 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 402 | with sv.managed_session() as session: 403 | for i in range(config.max_max_epoch): 404 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 405 | m.assign_lr(session, config.learning_rate * lr_decay) 406 | 407 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 408 | train_perplexity = run_epoch(session, generator=gen_train, model=m, eval_op=m.train_op, 409 | verbose=True) 410 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 411 | valid_perplexity = run_epoch(session, generator=gen_valid, model=mvalid) 412 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 413 | 414 
| test_perplexity = run_epoch(session, generator=gen_test, model=mtest) 415 | print("Test Perplexity: %.3f" % test_perplexity) 416 | 417 | if FLAGS.save_path: 418 | print("Saving model to %s." % FLAGS.save_path) 419 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 420 | 421 | 422 | if __name__ == "__main__": 423 | tf.app.run() 424 | -------------------------------------------------------------------------------- /src/lstm/lstm_wp.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Example / benchmark for building a PTB LSTM model. 17 | 18 | Trains the model described in: 19 | (Zaremba, et. al.) Recurrent Neural Network Regularization 20 | http://arxiv.org/abs/1409.2329 21 | 22 | There are 3 supported model configurations: 23 | =========================================== 24 | | config | epochs | train | valid | test 25 | =========================================== 26 | | small | 13 | 37.99 | 121.39 | 115.91 27 | | medium | 39 | 48.45 | 86.16 | 82.07 28 | | large | 55 | 37.87 | 82.62 | 78.29 29 | The exact results may vary depending on the random initialization. 30 | 31 | The hyperparameters used in the model: 32 | - init_scale - the initial scale of the weights 33 | - learning_rate - the initial value of the learning rate 34 | - max_grad_norm - the maximum permissible norm of the gradient 35 | - num_layers - the number of LSTM layers 36 | - num_steps - the number of unrolled steps of LSTM 37 | - hidden_size - the number of LSTM units 38 | - max_epoch - the number of epochs trained with the initial learning rate 39 | - max_max_epoch - the total number of epochs for training 40 | - keep_prob - the probability of keeping weights in the dropout layer 41 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch" 42 | - batch_size - the batch size 43 | 44 | The data required for this example is in the data/ dir of the 45 | PTB dataset from Tomas Mikolov's webpage: 46 | 47 | $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 48 | $ tar xvf simple-examples.tgz 49 | 50 | To run: 51 | 52 | $ python ptb_word_lm.py --data_path=simple-examples/data/ 53 | 54 | """ 55 | from __future__ import absolute_import 56 | from __future__ import division 57 | from __future__ import print_function 58 | 59 | import sys 60 | sys.path.insert(0, "../src/") 61 | 62 | import inspect 63 | import time 64 | 65 | import numpy as np 66 | import tensorflow as tf 67 | 68 | import reader_wp as reader 69 | 70 | flags = tf.flags 71 | logging = tf.logging 72 | 73 | flags.DEFINE_string( 74 | "model", "small", 75 | "A type of model. Possible options are: small, medium, large.") 76 | 77 | flags.DEFINE_string( 78 | "tasks", "all", 79 | "Tasks to be performed. 
Possible options are: all, train, test, valid") 80 | 81 | flags.DEFINE_string( 82 | "word_to_id_path", "../models/eos/word2id_1000.pklz", 83 | "A type of model. Possible options are: small, medium, large.") 84 | 85 | flags.DEFINE_string("data_path", None, 86 | "Where the training/test data is stored.") 87 | flags.DEFINE_string("save_path", None, 88 | "Model output directory.") 89 | flags.DEFINE_bool("use_fp16", False, 90 | "Train using 16-bit floats instead of 32bit floats") 91 | 92 | FLAGS = flags.FLAGS 93 | 94 | 95 | def data_type(): 96 | return tf.float16 if FLAGS.use_fp16 else tf.float32 97 | 98 | 99 | def get_vocab_size(): 100 | word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path) 101 | size = len(word_to_id) 102 | print("Vocabulary size: %s" % size) 103 | return size 104 | 105 | class WPInput(object): 106 | """The input data.""" 107 | 108 | def __init__(self, config, data, name=None): 109 | self.batch_size = batch_size = config.batch_size 110 | self.num_steps = num_steps = config.num_steps 111 | self.epoch_size = ((len(data) // batch_size) - 1) // num_steps 112 | self.input_data, self.targets = reader.wiki_producer( 113 | data, batch_size, num_steps, name=name) 114 | 115 | 116 | class WPModel(object): 117 | """Word Prediction model.""" 118 | 119 | def __init__(self, is_training, config, input_): 120 | self._input = input_ 121 | 122 | batch_size = input_.batch_size 123 | num_steps = input_.num_steps 124 | size = config.hidden_size 125 | vocab_size = config.vocab_size 126 | 127 | # Slightly better results can be obtained with forget gate biases 128 | # initialized to 1 but the hyperparameters of the model would need to be 129 | # different than reported in the paper. 130 | def lstm_cell(): 131 | # With the latest TensorFlow source code (as of Mar 27, 2017), 132 | # the BasicLSTMCell will need a reuse parameter which is unfortunately not 133 | # defined in TensorFlow 1.0. To maintain backwards compatibility, we add 134 | # an argument check here: 135 | # if 'reuse' in inspect.getargspec( 136 | # tf.contrib.rnn.BasicLSTMCell.__init__).args: 137 | # return tf.contrib.rnn.BasicLSTMCell( 138 | # size, forget_bias=0.0, state_is_tuple=True, 139 | # reuse=tf.get_variable_scope().reuse) 140 | # else: 141 | return tf.contrib.rnn.BasicLSTMCell( 142 | size, forget_bias=0.0, state_is_tuple=True) 143 | 144 | attn_cell = lstm_cell 145 | if is_training and config.keep_prob < 1: 146 | def attn_cell(): 147 | return tf.contrib.rnn.DropoutWrapper( 148 | lstm_cell(), output_keep_prob=config.keep_prob) 149 | 150 | cell = tf.contrib.rnn.MultiRNNCell( 151 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 152 | 153 | # data_type() returns float32 or float16 154 | self._initial_state = cell.zero_state(batch_size, data_type()) 155 | 156 | with tf.device("/cpu:0"): 157 | # TODO: replace TF input with my embeddings 158 | # TODO: implement PTB reader or something similar 159 | embedding = tf.get_variable( 160 | "embedding", [vocab_size, size], dtype=data_type()) 161 | inputs = tf.nn.embedding_lookup(embedding, input_.input_data) 162 | 163 | 164 | if is_training and config.keep_prob < 1: 165 | # Dropout allows to use the net for train and testing 166 | # See: https://stackoverflow.com/questions/34597316/why-input-is-scaled-in-tf-nn-dropout-in-tensorflow 167 | # and: http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf 168 | inputs = tf.nn.dropout(inputs, config.keep_prob) 169 | 170 | # Simplified version of models/tutorials/rnn/rnn.py's rnn(). 
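The embedding lookup above turns a [batch_size, num_steps] matrix of word ids into a [batch_size, num_steps, hidden_size] tensor of vectors by indexing into the embedding matrix. A NumPy sketch of the same indexing, with toy sizes chosen only for illustration:

import numpy as np

vocab_size, hidden_size = 10, 4
embedding = np.random.rand(vocab_size, hidden_size)   # the "embedding" variable
input_data = np.array([[1, 5, 3],
                       [2, 2, 7]])                    # [batch_size, num_steps] word ids

inputs = embedding[input_data]                        # what tf.nn.embedding_lookup does here
print(inputs.shape)                                   # (2, 3, 4) = [batch, steps, hidden]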
171 | # This builds an unrolled LSTM for tutorial purposes only. 172 | # In general, use the rnn() or state_saving_rnn() from rnn.py. 173 | # 174 | # The alternative version of the code below is: 175 | # 176 | inputs = tf.unstack(inputs, num=num_steps, axis=1) 177 | outputs, state = tf.contrib.rnn.static_rnn( 178 | cell, inputs, initial_state=self._initial_state) 179 | # TODO: passing the sequence_length argument will enable to input variable-length tensors 180 | 181 | # outputs = [] 182 | # state = self._initial_state 183 | # with tf.variable_scope("RNN"): 184 | # for time_step in range(num_steps): 185 | # if time_step > 0: 186 | # tf.get_variable_scope().reuse_variables() 187 | # (cell_output, state) = cell(inputs[:, time_step, :], state) # Call (inputs, state) 188 | # outputs.append(cell_output) 189 | 190 | # TODO: check why outputs are stacked and resized 191 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 192 | softmax_w = tf.get_variable( 193 | "softmax_w", [size, vocab_size], dtype=data_type()) 194 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 195 | logits = tf.matmul(output, softmax_w) + softmax_b 196 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 197 | [logits], 198 | [tf.reshape(input_.targets, [-1])], 199 | [tf.ones([batch_size * num_steps], dtype=data_type())]) 200 | self._cost = cost = tf.reduce_sum(loss) / batch_size 201 | self._final_state = state 202 | 203 | if not is_training: 204 | return 205 | 206 | self._lr = tf.Variable(0.0, trainable=False) 207 | tvars = tf.trainable_variables() 208 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 209 | config.max_grad_norm) 210 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 211 | self._train_op = optimizer.apply_gradients( 212 | zip(grads, tvars), 213 | global_step=tf.contrib.framework.get_or_create_global_step()) 214 | 215 | self._new_lr = tf.placeholder( 216 | tf.float32, shape=[], name="new_learning_rate") 217 | self._lr_update = tf.assign(self._lr, self._new_lr) 218 | 219 | def assign_lr(self, session, lr_value): 220 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 221 | 222 | @property 223 | def input(self): 224 | return self._input 225 | 226 | @property 227 | def initial_state(self): 228 | return self._initial_state 229 | 230 | @property 231 | def cost(self): 232 | return self._cost 233 | 234 | @property 235 | def final_state(self): 236 | return self._final_state 237 | 238 | @property 239 | def lr(self): 240 | return self._lr 241 | 242 | @property 243 | def train_op(self): 244 | return self._train_op 245 | 246 | 247 | class SmallConfig(object): 248 | """Small config.""" 249 | init_scale = 0.1 250 | learning_rate = 1.0 251 | max_grad_norm = 5 252 | num_layers = 2 253 | num_steps = 20 254 | hidden_size = 200 255 | max_epoch = 4 256 | max_max_epoch = 13 257 | keep_prob = 1.0 258 | lr_decay = 0.5 259 | batch_size = 20 260 | vocab_size = 27942 261 | 262 | 263 | class MediumConfig(object): 264 | """Medium config.""" 265 | init_scale = 0.05 266 | learning_rate = 1.0 267 | max_grad_norm = 5 268 | num_layers = 2 269 | num_steps = 35 270 | hidden_size = 650 271 | max_epoch = 6 272 | max_max_epoch = 39 273 | keep_prob = 0.5 274 | lr_decay = 0.8 275 | batch_size = 20 276 | vocab_size = 10000 277 | 278 | 279 | class LargeConfig(object): 280 | """Large config.""" 281 | init_scale = 0.04 282 | learning_rate = 1.0 283 | max_grad_norm = 10 284 | num_layers = 2 285 | num_steps = 35 286 | hidden_size = 1024 287 | max_epoch = 14 288 | 
max_max_epoch = 55 289 | keep_prob = 0.35 290 | lr_decay = 1 / 1.15 291 | batch_size = 20 292 | vocab_size = 10000 293 | 294 | 295 | class TestConfig(object): 296 | """Tiny config, for testing.""" 297 | init_scale = 0.1 298 | learning_rate = 1.0 299 | max_grad_norm = 1 300 | num_layers = 1 301 | num_steps = 2 302 | hidden_size = 2 303 | max_epoch = 1 304 | max_max_epoch = 1 305 | keep_prob = 1.0 306 | lr_decay = 0.5 307 | batch_size = 20 308 | vocab_size = 10000 309 | 310 | 311 | def run_epoch(session, model, eval_op=None, verbose=False): 312 | """Runs the model on the given data.""" 313 | start_time = time.time() 314 | costs = 0.0 315 | iters = 0 316 | state = session.run(model.initial_state) 317 | 318 | fetches = { 319 | "cost": model.cost, 320 | "final_state": model.final_state, 321 | } 322 | if eval_op is not None: 323 | fetches["eval_op"] = eval_op 324 | 325 | for step in range(model.input.epoch_size): 326 | feed_dict = {} 327 | for i, (c, h) in enumerate(model.initial_state): 328 | feed_dict[c] = state[i].c 329 | feed_dict[h] = state[i].h 330 | 331 | vals = session.run(fetches, feed_dict) 332 | cost = vals["cost"] 333 | state = vals["final_state"] 334 | 335 | costs += cost 336 | iters += model.input.num_steps 337 | 338 | if verbose and step % (model.input.epoch_size // 10) == 10: 339 | print("%.3f perplexity: %.3f speed: %.0f wps" % 340 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), 341 | iters * model.input.batch_size / (time.time() - start_time))) 342 | 343 | return np.exp(costs / iters) 344 | 345 | 346 | def get_config(): 347 | if FLAGS.model == "small": 348 | return SmallConfig() 349 | elif FLAGS.model == "medium": 350 | return MediumConfig() 351 | elif FLAGS.model == "large": 352 | return LargeConfig() 353 | elif FLAGS.model == "test": 354 | return TestConfig() 355 | else: 356 | raise ValueError("Invalid model: %s", FLAGS.model) 357 | 358 | 359 | def main(_): 360 | if not FLAGS.data_path: 361 | raise ValueError("Must set --data_path to wiki data directory") 362 | 363 | raw_data = reader.wiki_raw_data(FLAGS.data_path, FLAGS.word_to_id_path) 364 | train_data, valid_data, test_data = raw_data 365 | 366 | #vocab_size = get_vocab_size() 367 | vocab_size = 126930 368 | 369 | config = get_config() 370 | config.vocab_size = vocab_size 371 | 372 | eval_config = get_config() 373 | eval_config.batch_size = 1 374 | eval_config.num_steps = 1 375 | eval_config.vocab_size = vocab_size 376 | 377 | with tf.Graph().as_default(): 378 | # Args: [minval, maxval] 379 | initializer = tf.random_uniform_initializer(-config.init_scale, 380 | config.init_scale) 381 | 382 | with tf.name_scope("Train"): 383 | train_input = WPInput(config=config, data=train_data, name="TrainInput") 384 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 385 | m = WPModel(is_training=True, config=config, input_=train_input) 386 | tf.summary.scalar("Training Loss", m.cost) 387 | tf.summary.scalar("Learning Rate", m.lr) 388 | 389 | with tf.name_scope("Valid"): 390 | valid_input = WPInput(config=config, data=valid_data, name="ValidInput") 391 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 392 | mvalid = WPModel(is_training=False, config=config, input_=valid_input) 393 | tf.summary.scalar("Validation Loss", mvalid.cost) 394 | 395 | with tf.name_scope("Test"): 396 | test_input = WPInput(config=eval_config, data=test_data, name="TestInput") 397 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 398 | mtest = WPModel(is_training=False, 
config=eval_config, 399 | input_=test_input) 400 | 401 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 402 | with sv.managed_session() as session: 403 | for i in range(config.max_max_epoch): 404 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 405 | m.assign_lr(session, config.learning_rate * lr_decay) 406 | 407 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 408 | train_perplexity = run_epoch(session, m, eval_op=m.train_op, 409 | verbose=True) 410 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 411 | valid_perplexity = run_epoch(session, mvalid) 412 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 413 | 414 | test_perplexity = run_epoch(session, mtest) 415 | print("Test Perplexity: %.3f" % test_perplexity) 416 | 417 | if FLAGS.save_path: 418 | print("Saving model to %s." % FLAGS.save_path) 419 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 420 | 421 | 422 | if __name__ == "__main__": 423 | tf.app.run() 424 | -------------------------------------------------------------------------------- /src/lstm/reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | """Utilities for parsing PTB text files.""" 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import collections 23 | import os 24 | 25 | import tensorflow as tf 26 | 27 | 28 | def _read_words(filename): 29 | with tf.gfile.GFile(filename, "r") as f: 30 | return f.read().decode("utf-8").replace("\n", "").split() 31 | 32 | 33 | def _build_vocab(filename): 34 | data = _read_words(filename) 35 | 36 | counter = collections.Counter(data) 37 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 38 | 39 | words, _ = list(zip(*count_pairs)) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | 42 | return word_to_id 43 | 44 | 45 | def _file_to_word_ids(filename, word_to_id): 46 | data = _read_words(filename) 47 | return [word_to_id[word] for word in data if word in word_to_id] 48 | 49 | 50 | def ptb_raw_data(data_path=None): 51 | """Load PTB raw data from data directory "data_path". 52 | 53 | Reads PTB text files, converts strings to integer ids, 54 | and performs mini-batching of the inputs. 55 | 56 | The PTB dataset comes from Tomas Mikolov's webpage: 57 | 58 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 59 | 60 | Args: 61 | data_path: string path to the directory where simple-examples.tgz has 62 | been extracted. 63 | 64 | Returns: 65 | tuple (train_data, valid_data, test_data, vocabulary) 66 | where each of the data objects can be passed to PTBIterator. 
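_build_vocab above assigns word ids by sorting the vocabulary by descending frequency and then alphabetically, and _file_to_word_ids maps a file onto those ids. A minimal sketch of that mapping on an invented sentence:

import collections

data = "the cat sat on the mat the cat".split()

counter = collections.Counter(data)
# sort by descending count, then alphabetically, exactly as _build_vocab does
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
word_to_id = dict(zip(words, range(len(words))))

print(word_to_id)                      # {'the': 0, 'cat': 1, 'mat': 2, 'on': 3, 'sat': 4}
print([word_to_id[w] for w in data])   # [0, 1, 4, 3, 0, 2, 0, 1]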
67 | """ 68 | 69 | train_path = os.path.join(data_path, "ptb.train.txt") 70 | valid_path = os.path.join(data_path, "ptb.valid.txt") 71 | test_path = os.path.join(data_path, "ptb.test.txt") 72 | 73 | word_to_id = _build_vocab(train_path) 74 | train_data = _file_to_word_ids(train_path, word_to_id) 75 | valid_data = _file_to_word_ids(valid_path, word_to_id) 76 | test_data = _file_to_word_ids(test_path, word_to_id) 77 | vocabulary = len(word_to_id) 78 | return train_data, valid_data, test_data, vocabulary 79 | 80 | 81 | def ptb_producer(raw_data, batch_size, num_steps, name=None): 82 | """Iterate on the raw PTB data. 83 | 84 | This chunks up raw_data into batches of examples and returns Tensors that 85 | are drawn from these batches. 86 | 87 | Args: 88 | raw_data: one of the raw data outputs from ptb_raw_data. 89 | batch_size: int, the batch size. 90 | num_steps: int, the number of unrolls. 91 | name: the name of this operation (optional). 92 | 93 | Returns: 94 | A pair of Tensors, each shaped [batch_size, num_steps]. The second element 95 | of the tuple is the same data time-shifted to the right by one. 96 | 97 | Raises: 98 | tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 99 | """ 100 | with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]): 101 | raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) 102 | 103 | data_len = tf.size(raw_data) 104 | batch_len = data_len // batch_size 105 | data = tf.reshape(raw_data[0 : batch_size * batch_len], 106 | [batch_size, batch_len]) 107 | 108 | epoch_size = (batch_len - 1) // num_steps 109 | assertion = tf.assert_positive( 110 | epoch_size, 111 | message="epoch_size == 0, decrease batch_size or num_steps") 112 | with tf.control_dependencies([assertion]): 113 | epoch_size = tf.identity(epoch_size, name="epoch_size") 114 | 115 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 116 | x = tf.slice(data, [0, i * num_steps], 117 | [batch_size, (i + 1) * num_steps]) 118 | #x = tf.strided_slice(data, [0, i * num_steps], 119 | # [batch_size, (i + 1) * num_steps]) 120 | x.set_shape([batch_size, num_steps]) 121 | y = tf.slice(data, [0, i * num_steps + 1], 122 | [batch_size, (i + 1) * num_steps + 1]) 123 | y.set_shape([batch_size, num_steps]) 124 | return x, y 125 | -------------------------------------------------------------------------------- /src/lstm/reader_frag.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | 17 | """Utilities for parsing PTB text files.""" 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import collections 23 | import os 24 | 25 | import tensorflow as tf 26 | import numpy as np 27 | 28 | def _read_words(filename): 29 | with tf.gfile.GFile(filename, "r") as f: 30 | return f.read().decode("latin-1").split() 31 | 32 | 33 | def _build_vocab(filename): 34 | data = _read_words(filename) 35 | 36 | counter = collections.Counter(data) 37 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 38 | 39 | words, _ = list(zip(*count_pairs)) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | 42 | return word_to_id 43 | 44 | 45 | def _file_to_word_ids_translating(filename, word_to_id): 46 | data = _read_words(filename) 47 | return [word_to_id[word] for word in data if word in word_to_id] 48 | 49 | 50 | def _file_to_word_ids(filename): 51 | data = [] 52 | files = open(filename).read().split() 53 | for f in files: 54 | with open(f) as fn: 55 | data.extend([int(w) for w in fn.read().split()]) 56 | 57 | return data 58 | 59 | 60 | def wiki_raw_data(data_path=None, word_to_id_path=None): 61 | """Load WP raw data from data directory "data_path". 62 | 63 | Reads WP text files, converts strings to integer ids, 64 | and performs mini-batching of the inputs. 65 | 66 | The WP dataset comes from Tomas Mikolov's webpage: 67 | e 68 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 69 | 70 | Args: 71 | data_path: string path to the directory where simple-examples.tgz has 72 | been extracted. 73 | 74 | Returns: 75 | tuple (train_data, valid_data, test_data, vocabulary) 76 | where each of the data objects can be passed to PTBIterator. 77 | """ 78 | import sys 79 | 80 | print("Loading data from %s" % data_path) 81 | train_path = os.path.join(data_path, "train.list") 82 | valid_path = os.path.join(data_path, "valid.list") 83 | test_path = os.path.join(data_path, "test.list") 84 | 85 | # word_to_id = VectorManager.read_vector(word_to_id_path) 86 | # print("Word 2 ID size: %s" % (sys.getsizeof(word_to_id))) 87 | # sys.stdout.flush() 88 | 89 | #word_to_id = _build_vocab(train_path) 90 | train_data = open(train_path).read().split() 91 | print("Train size: %s" % (len(train_data))) 92 | sys.stdout.flush() 93 | 94 | valid_data = _file_to_word_ids(valid_path) 95 | print("Validation size: %s" % (len(valid_data))) 96 | sys.stdout.flush() 97 | 98 | test_data = _file_to_word_ids(test_path) 99 | print("Test size: %s" % (len(test_data))) 100 | sys.stdout.flush() 101 | 102 | # vocabulary = len(word_to_id) 103 | 104 | return train_data, valid_data, test_data 105 | 106 | def wiki_producer(data_name, raw_data, batch_size, num_steps, name=None): 107 | """Iterate on the raw Wikipedia data. 108 | 109 | This chunks up raw_data into batches of examples and returns Tensors that 110 | are drawn from these batches. 111 | 112 | Args: 113 | raw_data: one of the raw data outputs from ptb_raw_data. 114 | batch_size: int, the batch size. 115 | num_steps: int, the number of unrolls. 116 | name: the name of this operation (optional). 117 | 118 | Returns: 119 | A pair of Tensors, each shaped [batch_size, num_steps]. The second element 120 | of the tuple is the same data time-shifted to the right by one. 121 | 122 | Raises: 123 | tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 
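Both wiki_producer (whose body follows) and get_epoch_size in lstm.py/clstm.py above derive the number of steps per epoch from the total word count, the batch size, and the unroll length. A small worked example with invented per-file word counts, standing in for the `wc -w` output used by get_epoch_size:

file_word_counts = [100003, 52017]   # assumed counts, for illustration only
batch_size, num_steps = 20, 35

total = 0
for words in file_word_counts:
    # keep only the part of each file that fills whole batch_size * num_steps blocks
    total += words - (words % (batch_size * num_steps))

epoch_size = ((total // batch_size) - 1) // num_steps
print(total, epoch_size)   # 151200 215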
124 | """ 125 | with tf.name_scope(name, "WPProducer", [raw_data, batch_size, num_steps]): 126 | 127 | if data_name == "TrainInput": 128 | data_len = 1516132009 # Validated 129 | elif data_name == "ValidInput": 130 | data_len = 182828964 # Validated 131 | elif data_name == "TestInput": 132 | data_len = 181755142 # Validated 133 | else: 134 | print("[ERROR] Data length not defined.") 135 | data_len = 0 136 | 137 | stride = 3500000 138 | 139 | 140 | #raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) 141 | #raw_data = np.array(raw_data, dtype=np.int32) 142 | 143 | batch_len = data_len // batch_size 144 | print("Indices %s %s %s" % (batch_size * batch_len, 145 | batch_size, batch_len)) 146 | data = np.reshape(raw_data[0: batch_size * batch_len], 147 | [batch_size, batch_len]) 148 | 149 | epoch_size = (batch_len - 1) // num_steps 150 | assertion = tf.assert_positive( 151 | epoch_size, 152 | message="epoch_size == 0, decrease batch_size or num_steps") 153 | with tf.control_dependencies([assertion]): 154 | epoch_size = tf.identity(epoch_size, name="epoch_size") 155 | 156 | 157 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 158 | 159 | index_0 = tf.multiply(i, num_steps) 160 | index_aux = tf.add(i, 1) 161 | index_1 = tf.multiply(index_aux, num_steps) 162 | # i2 = q.dequeue() 163 | # x = data[0:batch_size, i * num_steps:(i + 1) * num_steps] 164 | #print("Slices [0, %s], [%s, %s]" % (i * num_steps, batch_size, (i + 1) * num_steps)) 165 | x = tf.strided_slice(data, [0, i * num_steps], 166 | [batch_size, (i + 1) * num_steps]) 167 | x.set_shape([batch_size, num_steps]) 168 | 169 | y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1] 170 | y = tf.strided_slice(data, [0, i * num_steps + 1], 171 | [batch_size, (i + 1) * num_steps + 1]) 172 | y.set_shape([batch_size, num_steps]) 173 | 174 | return x, y, i*num_steps, (i + 1) * num_steps, data 175 | -------------------------------------------------------------------------------- /src/lstm/reader_test.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from utils.flatten import flatten 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | 7 | data = ['consumers', 'may', 'want', 'to', 'move', 'their', 8 | 'telephones', 'a', 'little', 'closer', 'to', 'the', 9 | 'tv', 'set', '', '', 'watching', 'abc', "'s", 10 | 'monday', 'night', 'football', 'can', 'now', 'vote', 'during', 11 | '', 'for', 'the', 'greatest', 'play', 'in', 'N', 'years', 12 | 'from', 'among', 'four', 'or', 'five', '', '', 13 | 'two', 'weeks', 'ago', 'viewers', 'of', 'several', 'nbc', 14 | '', 'consumer', 'segments', 'started', 'calling', 'a', 15 | 'N', 'number', 'for', 'advice', 'on', 'various', '', 16 | 'issues', 'and', 'the', 'new', 'syndicated', 'reality', 17 | 'show', 'hard', 'copy', 'records', 'viewers', "'", 'opinions', 18 | 'for', 'possible', 'airing', 'on', 'the', 'next', 'day', "'s", 19 | 'show', 'interactive', 'telephone', 'technology', 'has', 20 | 'taken', 'a', 'new', 'leap', 'in', '', 'and', 'television', 21 | 'programmers', 'are', 'racing', 'to', 'exploit'] 22 | 23 | 24 | 25 | from lstm.reader_wp import wiki_raw_data, wiki_producer 26 | 27 | train, valid, test = wiki_raw_data("../data/wikipedia/") 28 | #data = data.flatten() 29 | batch_size = 2 30 | num_steps = 3 31 | inputs, targets, s1, s2, x = wiki_producer(train, batch_size=batch_size, num_steps=num_steps) 32 | 33 | # print inputs 34 | # sv = tf.train.Supervisor() 35 | # with 
sv.managed_session() as sess: 36 | # print sess.run([inputs, s1, s2]) 37 | # print sess.run([inputs, s1, s2]) 38 | # print sess.run([inputs, s1, s2]) 39 | # print sess.run([inputs, s1, s2]) 40 | # print sess.run([inputs, s1, s2]) 41 | 42 | data_len = np.size(train) 43 | batch_len = data_len // batch_size 44 | ndata = np.reshape(train[0: batch_size * batch_len], 45 | [batch_size, batch_len]) 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/lstm/reader_topics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | """Utilities for parsing PTB text files.""" 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import collections 23 | import os 24 | 25 | import tensorflow as tf 26 | 27 | 28 | def _read_words(filename): 29 | with tf.gfile.GFile(filename, "r") as f: 30 | return f.read().decode("utf-8").replace("\n", "").split() 31 | 32 | 33 | def _build_vocab(filename): 34 | data = _read_words(filename) 35 | 36 | counter = collections.Counter(data) 37 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 38 | 39 | words, _ = list(zip(*count_pairs)) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | 42 | return word_to_id 43 | 44 | 45 | def _file_to_word_ids(filename, word_to_id): 46 | data = _read_words(filename) 47 | return [word_to_id[word] for word in data if word in word_to_id] 48 | 49 | 50 | def wiki_raw_data(data_path=None): 51 | """Load PTB raw data from data directory "data_path". 52 | 53 | Reads PTB text files, converts strings to integer ids, 54 | and performs mini-batching of the inputs. 55 | 56 | The PTB dataset comes from Tomas Mikolov's webpage: 57 | 58 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 59 | 60 | Args: 61 | data_path: string path to the directory where simple-examples.tgz has 62 | been extracted. 63 | 64 | Returns: 65 | tuple (train_data, valid_data, test_data, vocabulary) 66 | where each of the data objects can be passed to PTBIterator. 67 | """ 68 | 69 | train_path = os.path.join(data_path, "wiki.train.txt") 70 | valid_path = os.path.join(data_path, "wiki.valid.txt") 71 | test_path = os.path.join(data_path, "wiki.test.txt") 72 | 73 | word_to_id = _build_vocab(train_path) 74 | train_data = _file_to_word_ids(train_path, word_to_id) 75 | valid_data = _file_to_word_ids(valid_path, word_to_id) 76 | test_data = _file_to_word_ids(test_path, word_to_id) 77 | vocabulary = len(word_to_id) 78 | return train_data, valid_data, test_data, vocabulary 79 | 80 | 81 | def ptb_producer(raw_data, batch_size, num_steps, name=None): 82 | """Iterate on the raw Wikipedia data. 
83 | 84 | This chunks up raw_data into batches of examples and returns Tensors that 85 | are drawn from these batches. 86 | 87 | Args: 88 | raw_data: one of the raw data outputs from ptb_raw_data. 89 | batch_size: int, the batch size. 90 | num_steps: int, the number of unrolls. 91 | name: the name of this operation (optional). 92 | 93 | Returns: 94 | A pair of Tensors, each shaped [batch_size, num_steps]. The second element 95 | of the tuple is the same data time-shifted to the right by one. 96 | 97 | Raises: 98 | tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 99 | """ 100 | with tf.name_scope(name, "WPProducer", [raw_data, batch_size, num_steps]): 101 | 102 | 103 | 104 | 105 | # data_len, batch_len, data, epoch_size, i, x, y, y_2 = sess.run([data_len, batch_len, data, epoch_size, i, x, y, y_2]) 106 | # 107 | # 108 | # batch_size = 5 109 | # num_steps = 5 110 | # 111 | # tf.reset_default_graph() 112 | raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.string) 113 | 114 | data_len = tf.size(raw_data) 115 | batch_len = data_len // batch_size 116 | data = tf.reshape(raw_data[0 : batch_size * batch_len], 117 | [batch_size, batch_len]) 118 | 119 | epoch_size = (batch_len - 1) // num_steps 120 | assertion = tf.assert_positive( 121 | epoch_size, 122 | message="epoch_size == 0, decrease batch_size or num_steps") 123 | with tf.control_dependencies([assertion]): 124 | epoch_size = tf.identity(epoch_size, name="epoch_size") 125 | 126 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 127 | # i2 = q.dequeue() 128 | x = tf.strided_slice(data, [0, i * num_steps], 129 | [batch_size, (i + 1) * num_steps]) 130 | x.set_shape([batch_size, num_steps]) 131 | y = tf.strided_slice(data, [0, i * num_steps + 1], 132 | [batch_size, (i + 1) * num_steps + 1]) 133 | y.set_shape([batch_size, num_steps]) 134 | 135 | 136 | 137 | # 138 | # sv = tf.train.Supervisor() 139 | # 140 | # with sv.managed_session() as sess: 141 | # ii, xx, yy, yy2 = sess.run([i, x, y, y_2]) 142 | 143 | return x, y 144 | 145 | # tf.reset_default_graph() 146 | # init = tf.initialize_all_variables() 147 | # sess = tf.Session() 148 | # sess.run(init) 149 | # tf.train.start_queue_runners(sess=sess) 150 | # sess.run([x, y]) -------------------------------------------------------------------------------- /src/postprocess/semantics_check.py: -------------------------------------------------------------------------------- 1 | from gensim.models.keyedvectors import KeyedVectors 2 | from pprint import pprint 3 | import gensim 4 | import os 5 | import argparse 6 | 7 | module_path = "%s/test" % os.path.dirname(gensim.__file__) 8 | 9 | 10 | def semantics_checks(wv): 11 | """ 12 | Perform some semantics check to see that the generated word vectors are sensible 13 | :param wv: word vectors of the embeddings 14 | """ 15 | print("Operations using multiplicative combination objective:") 16 | w = wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) 17 | print(" * King + Woman - Man = %s [%s]" % (w[0][0], w[0][1])) 18 | w = wv.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) 19 | print(" * Baghdad + England - London = %s [%s]" % (w[0][0], w[0][1])) 20 | 21 | print("\n * Most similar words to Paris:") 22 | pprint(wv.most_similar_cosmul('paris')) 23 | 24 | print("\n * Most similar words to Jupiter:") 25 | pprint(wv.most_similar_cosmul('jupiter')) 26 | 27 | print("\n * Most similar words to Zeus:") 28 | pprint(wv.most_similar_cosmul('zeus')) 29 | 30 | 31 | def 
compute_accuracies(wv): 32 | """ 33 | Compute the accuracy of parameter word embeddings with 5 semantic and 9 grammatical relations 34 | :param wv: word vectors of the embeddings 35 | """ 36 | acc = wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) 37 | for sec in acc: 38 | correct = len(sec['correct']) 39 | incorrect = len(sec['incorrect']) 40 | total = correct + incorrect 41 | ac = correct / float(total) 42 | print("\n[%s]\n\tAccuracy [%s] %s/%s" % (sec['section'].title(), round(ac, 2), correct, total)) 43 | 44 | 45 | if __name__ == '__main__': 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('-w', '--word_vectors', type=str, help="Path of LM to perform the tests upon", required=True) 48 | 49 | args = parser.parse_args() 50 | 51 | # Arguments parsing 52 | wv_path = args.word_vectors 53 | 54 | print("Loading model...") 55 | wv = KeyedVectors.load_word2vec_format(wv_path, binary=False) 56 | 57 | # Some semantic examples 58 | semantics_checks(wv) 59 | 60 | # Compute and print questions accuracies 61 | compute_accuracies(wv) 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/postprocess/test_topics.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from gensim.corpora import Dictionary, MmCorpus 3 | from gensim.models import LsiModel, LdaMulticore, LdaModel, HdpModel 4 | from time import time 5 | 6 | import numpy as np 7 | import argparse 8 | import pickle 9 | import sys 10 | 11 | 12 | def load_dict(id2word_path): 13 | print("[BLOCK] Loading dictionary files from %s" % (id2word_path)) 14 | sys.stdout.flush() 15 | dictionary = Dictionary.load_from_text(id2word_path) 16 | 17 | 18 | return dictionary 19 | 20 | 21 | def word2id_to_id2word(word2id_path): 22 | 23 | word2id = pickle.load(open(word2id_path)) 24 | id2word_c = [0] * len(word2id) 25 | for w in word2id: 26 | id2word_c[word2id[w]] = w 27 | return id2word_c 28 | 29 | 30 | 31 | def print_lsa_topic(document, dictionary, lsi): 32 | corpus = [dictionary.doc2bow(document.split())] 33 | topics = lsi[corpus] 34 | topics = topics[0] # Only one document 35 | 36 | values = [abs(val) for _, val in topics] 37 | index = values.index(max(values)) 38 | # print(values) 39 | print(topics[index], lsi.print_topic(index)) 40 | 41 | 42 | def print_hdp(document, dictionary, hdp): 43 | corpus = [dictionary.doc2bow(document.split())] 44 | corpus_hdp = hdp[corpus] 45 | 46 | for doc in corpus_hdp: 47 | values = [abs(val) for _, val in doc] 48 | index = values.index(max(values)) 49 | # print(values) 50 | print(doc[index], hdp.print_topic(index)) 51 | 52 | 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('-m', '--model', type=str, help="Directory where the model is stored.", required=True) 57 | parser.add_argument('-e', '--embeddings', type=str, help="Embeddings path (id_word_vec.pklz)", required=True) 58 | parser.add_argument('-w', '--word2id_path', type=str, help="Word2ID vector to be used for doc translation.", 59 | required=True, default=None) 60 | parser.add_argument('-i', '--id_word', type=str, help="Id2Word vector path ['wiki_en_wordids.txt'].", 61 | required=True, default=None) 62 | 63 | args = parser.parse_args() 64 | 65 | model_path = args.model 66 | id2word_path = args.id_word 67 | word2id_path = args.word2id_path 68 | emb_path = args.embeddings 69 | 70 | begin = time() 71 | 72 | dictionary = load_dict(id2word_path) 73 | 
id2word = word2id_to_id2word(word2id_path) 74 | w2Id = VectorManager.read_vector(word2id_path) 75 | embeddings = VectorManager.read_vector(emb_path) 76 | 77 | demo1 = "the roman consul is normally a notable person from the senate elected " \ 78 | "by direct voting of the italic tribes" 79 | 80 | data = open("../data/small/AA/wiki_01_clean_simple").read().split("") 81 | s1 = data[0].split("")[0] 82 | data = open("../data/small/AA/wiki_00_clean_simple").read().split("") 83 | s2 = data[0].split("")[0] 84 | data = open("../data/small/AB/wiki_00_clean_simple").read().split("") 85 | s3 = data[0].split("")[0] 86 | data = open("../data/small/AB/wiki_01_clean_simple").read().split("") 87 | s4 = data[0].split("")[0] 88 | 89 | 90 | if "lda" in model_path: 91 | lda = LdaModel.load(model_path) 92 | print("Demo 1:\n%s" % demo1) 93 | print(get_lda_best_topic_words(demo1, dictionary, lda)) 94 | print("Demo 2:\n%s" % s1) 95 | print(get_lda_best_topic_words(s1, dictionary, lda)) 96 | print("Demo 3:\n%s" % s2) 97 | print(get_lda_best_topic_words(s2, dictionary, lda)) 98 | print("Demo 4:\n%s" % s3) 99 | print(get_lda_best_topic_words(s3, dictionary, lda)) 100 | print("Demo 5:\n%s" % s4) 101 | print(get_lda_best_topic_words(s4, dictionary, lda)) 102 | elif "lsa" in model_path: 103 | lsi = LsiModel.load(model_path) 104 | print("Demo 1:\n%s" % demo1) 105 | print(print_lsa_topic(demo1, dictionary, lsi)) 106 | print("Demo 2:\n%s" % s1) 107 | print(print_lsa_topic(s1, dictionary, lsi)) 108 | print("Demo 3:\n%s" % s2) 109 | print(print_lsa_topic(s2, dictionary, lsi)) 110 | print("Demo 4:\n%s" % s3) 111 | print(print_lsa_topic(s3, dictionary, lsi)) 112 | print("Demo 5:\n%s" % s4) 113 | print(print_lsa_topic(s4, dictionary, lsi)) 114 | print(get_lsa_topic_embeding(s4, dictionary, lsi, w2Id, embeddings)) 115 | elif "hdp" in model_path: 116 | hdp = HdpModel.load(model_path) 117 | print("Demo 1:\n%s" % demo1) 118 | print(print_hdp(demo1, dictionary, hdp)) 119 | 120 | 121 | end = time() 122 | print("Total processing time: %d seconds" % (end - begin)) 123 | -------------------------------------------------------------------------------- /src/postprocess/tsne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from gensim.models.keyedvectors import KeyedVectors 5 | from utils.vector_manager import VectorManager 6 | from sklearn.manifold import TSNE 7 | import matplotlib.pyplot as plt 8 | import argparse 9 | 10 | 11 | def plot_tsne(id_word_vec): 12 | 13 | """ 14 | Compute the t-SNE dimensionality reduction values of input parameter and plot them in 2D 15 | :param id_word_vec: vector containing the tuples (id, word, embedding) to be plotted 16 | """ 17 | tsne = TSNE(n_components=2) 18 | X_tsne = tsne.fit_transform([v for _, _, v in id_word_vec]) 19 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1]) 20 | 21 | for i, word in enumerate([word for _, word, _ in id_word_vec]): 22 | plt.annotate(word, (X_tsne[i, 0], X_tsne[i, 1])) 23 | 24 | plt.show() 25 | 26 | 27 | def subset(initial_word, id_word_vec, wv, max): 28 | """ 29 | Get a subset of max number of words using cosmul distance starting from initial_word 30 | :param initial_word: first word to be used to find nearest ones 31 | :param id_word_vec: vector containing the tuples (id, word, embedding) for each word 32 | :param wv: gensim word embeddings model 33 | :param max: number of words to return 34 | :return: list of tuples (id, word, embedding) 35 | """ 36 | words = [initial_word] 37 | subset = [] 38 | while 
len(words) > 0 and len(subset) < max: 39 | w = words.pop() 40 | sim = wv.similar_by_word(w) 41 | ws = [w for w, _ in sim] 42 | similars = [s for s in ws if s not in subset] 43 | subset.extend(similars) 44 | words.extend(similars) 45 | 46 | final_set = [(i, w, v) for i, w, v in id_word_vec if w in subset] 47 | return final_set 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('-i', '--id_word_vec', type=str, help="Path of id <-> word <-> embedding vector", required=True) 53 | parser.add_argument('-w', '--word_vectors', type=str, help="Path of LM to perform the tests upon", required=True) 54 | 55 | args = parser.parse_args() 56 | 57 | # Arguments parsing 58 | wv_path = args.word_vectors 59 | path = args.id_word_vec 60 | 61 | print("Loading model...") 62 | wv = KeyedVectors.load_word2vec_format(wv_path, binary=False) 63 | 64 | print("Loading id-word-vec...") 65 | id_word_vec = VectorManager.read_vector(path) 66 | 67 | print("Finding subset to plot") 68 | initial_word = 'jupiter' 69 | max_elements = 500 70 | sb = subset(initial_word, id_word_vec, wv, max_elements) 71 | 72 | print("Plotting subset of words...") 73 | # Plot t-SNE 74 | plot_tsne(sb) 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/preprocess/__init__.py -------------------------------------------------------------------------------- /src/preprocess/cleaner.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from pattern.en import tokenize 3 | from time import time 4 | 5 | import multiprocessing as mp 6 | import os 7 | import re 8 | import sys 9 | import argparse 10 | 11 | 12 | def cleanhtml(raw_html): 13 | """ 14 | Removes the tags remaining from wikiExtracted data 15 | :param raw_html: html/text content of a file with many docs 16 | :return: only text from raw_html 17 | """ 18 | cleanr = re.compile('<.*?>') 19 | cleantext = re.sub(cleanr, ' ', raw_html) 20 | return cleantext 21 | 22 | 23 | def remove_title(text): 24 | """ 25 | Removes the title of a document 26 | :param text: text containing an article output from cleanhtml() 27 | :return: text of the article without title 28 | """ 29 | index = text.find("\n\n") 30 | if index != -1: 31 | return text[index+2:] 32 | else: 33 | return text 34 | 35 | 36 | def is_number(s): 37 | """ 38 | Checks if the parameter s is a number 39 | :param s: anything 40 | :return: true if s is a number, false otherwise 41 | """ 42 | try: 43 | float(s) 44 | return True 45 | except ValueError: 46 | return False 47 | 48 | 49 | def _transform_file(file_path, debug=False): 50 | """ 51 | Transforms a file containing articles into a 4D list of words divided into sentences, 52 | paragraphs and docs. 
Write the result to disk with the name filename_wl (words list) 53 | :param file_path: file to transform 54 | """ 55 | if debug: 56 | print("Cleaning %s" % file_path) 57 | with open(file_path) as f: 58 | raw = f.read().decode("latin-1") 59 | data = cleanhtml(raw) 60 | docs = data.split("") 61 | del data 62 | file_out = "%s_wl" % file_path 63 | file_string = "" 64 | for doc in [d.strip() for d in docs if d.strip()]: 65 | paragraphs = [tokenize(par) for par in remove_title(cleanhtml(doc)).strip().split("\n\n") if par] 66 | doc_a = False 67 | for p in paragraphs: 68 | par_a = False 69 | for sent in p: 70 | line = " ".join([word for word in sent.lower().split() 71 | if word.isalpha() or is_number(word)]) 72 | if line: 73 | file_string += line + "\n" 74 | par_a = True 75 | doc_a = True 76 | 77 | if par_a: 78 | file_string += "\n" 79 | if doc_a: 80 | file_string += "\n" 81 | 82 | VectorManager.write_string(file_out, file_string.encode("latin-1")) 83 | del file_string 84 | if debug: 85 | print("Done with %s" % file_path) 86 | 87 | 88 | def transform(dirname, debug=False): 89 | """ 90 | Handles the parallel transformation of all the dataset into 4D lists 91 | """ 92 | for root, dirs, files in os.walk(dirname): 93 | filtered_files = ["%s/%s" % (root, file) for file in files if 94 | is_number(file.split("_")[1]) and len(file.split("_")) == 2] 95 | 96 | threads = min(mp.cpu_count() * 4, filtered_files) 97 | print("Starting %s processes to clean %s files" % (threads, len(filtered_files))) 98 | i = 0 99 | while i < len(filtered_files): 100 | ps = [] 101 | j = 0 102 | while j < threads and (i + j) < len(filtered_files): 103 | if debug: 104 | print("[%s] Creating %s of %s for file %s" % ( 105 | i, i + j, len(filtered_files), filtered_files[i + j])) 106 | p = (mp.Process(target=_transform_file, args=(filtered_files[i + j],))) 107 | p.start() 108 | ps.append(p) 109 | j += 1 110 | 111 | if debug: 112 | print("%s process in the list to join" % len(ps)) 113 | j = 0 114 | while j < threads and (i + j) < len(filtered_files): 115 | if debug: 116 | print("[%s] Joining %s of %s for file %s" % ( 117 | i, j, len(filtered_files), filtered_files[i + j])) 118 | ps[j].join() 119 | j += 1 120 | 121 | i += j 122 | 123 | sys.stdout.flush() 124 | 125 | 126 | def clean_data(files_path): 127 | 128 | """ 129 | Wrapper function to cleans the data and transforms it into 4D. 
Used to be called from either main or as block of 130 | the pipeline 131 | :param data_path: of the files to convert 132 | :return: MySentences class ready to be fed to Word2Vec model 133 | """ 134 | print("[BLOCK] Transforming sentences to 4-dimensional lists") 135 | transform(files_path) 136 | print("[BLOCK] Done transforming data") 137 | sys.stdout.flush() 138 | 139 | 140 | if __name__ == '__main__': 141 | 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be used for the word embeddings" 144 | " and clean up.", required=True) 145 | 146 | args = parser.parse_args() 147 | data_path = args.data 148 | 149 | print("Cleaning data from %s" % (data_path)) 150 | 151 | begin = time() 152 | 153 | clean_data(data_path) 154 | 155 | 156 | end = time() 157 | print("Total processing time: %d seconds" % (end - begin)) 158 | -------------------------------------------------------------------------------- /src/preprocess/embeddings.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from time import time 3 | 4 | import multiprocessing as mp 5 | import gensim 6 | import os 7 | import sys 8 | import argparse 9 | from contextlib import closing 10 | 11 | 12 | 13 | def read_file(filename): 14 | return VectorManager.read_vector(filename) 15 | 16 | 17 | class MySentences(object): 18 | def __init__(self, dirname): 19 | self.dirname = dirname 20 | self.files = [] 21 | self.file_paths = [] 22 | for root, dirs, files in os.walk(self.dirname): 23 | # for filename in [file for file in files if file.endswith("_simple")]: 24 | for filename in [file for file in files if file.endswith("_clean")]: 25 | file_path = root + '/' + filename 26 | self.file_paths.append(file_path) 27 | print("Got %s files to turn into sentences" % len(self.file_paths)) 28 | 29 | 30 | def __iter__(self): 31 | """ 32 | Defines how to iterate the MySentences class in order to feed it directly into Word2Vec method. Yields a 33 | sentence (as a list of words) for every iteration. 34 | """ 35 | # for root, dirs, files in os.walk(self.dirname): 36 | for file_path in self.file_paths: 37 | file_data = VectorManager.read_vector(file_path) 38 | file_sentences = VectorManager.parse_into_sentences(file_data) 39 | 40 | for sentence in file_sentences: 41 | yield sentence 42 | 43 | 44 | def create_embeddings(files_path, embedding_size, minimum_count): 45 | 46 | """ 47 | Creates embeddings with the sentences, embedding size, min_count of occurrences, a max window length of 10, and 48 | cpu_count() number of workers. Used to be called from either main or as block of the pipeline 49 | :param files_path: used to generate the word embeddings 50 | :param embedding_size: size of the embeddings to generate 51 | :param minimum_count: min. 
occurrences per word to be included 52 | :return: word2vec model with all the embeddings and extra info 53 | """ 54 | print("[BLOCK] Initializing MySentences from {}".format(files_path)) 55 | sentences = MySentences(files_path) 56 | print("[BLOCK] Creating embeddings model") 57 | sys.stdout.flush() 58 | model_w2v = gensim.models.Word2Vec(sentences, 59 | size=embedding_size, 60 | window=10, 61 | min_count=minimum_count, 62 | workers=mp.cpu_count()) 63 | print("[BLOCK] Created embeddings of size %s" % embedding_size) 64 | sys.stdout.flush() 65 | 66 | return model_w2v 67 | 68 | 69 | if __name__ == '__main__': 70 | 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be used for the word embeddings" 73 | " and clean up.", required=True) 74 | parser.add_argument('-s', '--size', type=int, help="Size of the word embeddings.", default=200, required=True) 75 | parser.add_argument('-c', '--min_count', type=int, help="Size of the word embeddings.", default=1, required=False) 76 | 77 | args = parser.parse_args() 78 | data_path = args.data 79 | emb_size = args.size 80 | min_count = args.min_count 81 | 82 | print("Creating embeddings of size %s for data in %s" % (emb_size, data_path)) 83 | 84 | begin = time() 85 | 86 | model = create_embeddings(data_path, emb_size, min_count) 87 | 88 | print("Saving embeddings model...") 89 | model.save("../models/word2vec_gensim_%s" % emb_size) 90 | model.wv.save_word2vec_format("../models/word2vec_org_%s" % emb_size, 91 | "../models/vocabulary_%s" % emb_size, 92 | binary=False) 93 | 94 | end = time() 95 | print("Total processing time: %d seconds" % (end - begin)) 96 | -------------------------------------------------------------------------------- /src/preprocess/filter.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from pattern.en import tokenize 3 | from time import time 4 | 5 | import multiprocessing as mp 6 | import os 7 | import re 8 | import sys 9 | import argparse 10 | 11 | 12 | def cleanhtml(raw_html): 13 | """ 14 | Removes the tags remaining from wikiExtracted data 15 | :param raw_html: html/text content of a file with many docs 16 | :return: only text from raw_html 17 | """ 18 | cleanr = re.compile('<.*?>') 19 | cleantext = re.sub(cleanr, ' ', raw_html) 20 | return cleantext 21 | 22 | 23 | def remove_title(text): 24 | """ 25 | Removes the title of a document 26 | :param text: text containing an article output from cleanhtml() 27 | :return: text of the article without title 28 | """ 29 | index = text.find("\n\n") 30 | if index != -1: 31 | return text[index+2:] 32 | else: 33 | return text 34 | 35 | 36 | def is_number(s): 37 | """ 38 | Checks if the parameter s is a number 39 | :param s: anything 40 | :return: true if s is a number, false otherwise 41 | """ 42 | try: 43 | float(s) 44 | return True 45 | except ValueError: 46 | return False 47 | 48 | def known(word, w2id): 49 | """ 50 | Return ID of the word (or 0 if word is not in word2Id dict) 51 | :param word: to translated 52 | :return: Id of the word 53 | """ 54 | 55 | try: 56 | word_r = w2id[word] 57 | return word 58 | except KeyError: 59 | return '' 60 | 61 | 62 | def _transform_file(file_path, w2id, split_par=False, debug=False): 63 | """ 64 | Transforms a file containing articles into a 4D list of words divided into sentences, 65 | paragraphs and docs. 
Write the result to disk with the name filename_clean.pklz 66 | :param file_path: file to transform 67 | """ 68 | if debug: 69 | print("Cleaning %s" % file_path) 70 | with open(file_path) as f: 71 | data = f.read().decode("latin-1") 72 | docs = data.split("") 73 | del data 74 | if not split_par: 75 | file_out = "%s_clean_simple" % file_path 76 | else: 77 | file_out = "%s_clean_paragraph" % file_path 78 | file_string = "" 79 | for doc in [d.strip() for d in docs if d.strip()]: 80 | paragraphs = [tokenize(par) for par in remove_title(cleanhtml(doc)).strip().split("\n\n") if par] 81 | doc_a = False 82 | for p in paragraphs: 83 | par_a = False 84 | for sent in p: 85 | line = [word for word in sent.lower().split() 86 | if word.isalpha() or is_number(word)] 87 | 88 | line = " ".join([known(word, w2id) for word in line]) 89 | if line: 90 | file_string += line + " " 91 | par_a = True 92 | 93 | if par_a and split_par: 94 | file_string += " " 95 | 96 | VectorManager.write_string(file_out, file_string.encode("latin-1")) 97 | del file_string 98 | if debug: 99 | print("Done with %s" % file_path) 100 | 101 | 102 | def transform(dirname, w2Id, paragraph_mark, debug=False): 103 | """ 104 | Handles the parallel transformation of all the dataset into 4D lists 105 | """ 106 | for root, dirs, files in os.walk(dirname): 107 | filtered_files = [] 108 | for file in files: 109 | print("File: {}".format(file)) 110 | try: 111 | if is_number(file.split("_")[1]) and len(file.split("_")) == 2: 112 | filtered_files.append("%s/%s" % (root, file)) 113 | except IndexError: 114 | pass 115 | 116 | 117 | threads = min(mp.cpu_count() * 4, filtered_files) 118 | print("Starting %s processes to clean %s files" % (threads, len(filtered_files))) 119 | i = 0 120 | while i < len(filtered_files): 121 | ps = [] 122 | j = 0 123 | while j < threads and (i + j) < len(filtered_files): 124 | if debug: 125 | print("[%s] Creating %s of %s for file %s" % ( 126 | i, i + j, len(filtered_files), filtered_files[i + j])) 127 | p = (mp.Process(target=_transform_file, args=(filtered_files[i + j], w2Id, paragraph_mark))) 128 | p.start() 129 | ps.append(p) 130 | j += 1 131 | 132 | if debug: 133 | print("%s process in the list to join" % len(ps)) 134 | j = 0 135 | while j < threads and (i + j) < len(filtered_files): 136 | if debug: 137 | print("[%s] Joining %s of %s for file %s" % ( 138 | i, j, len(filtered_files), filtered_files[i + j])) 139 | ps[j].join() 140 | j += 1 141 | 142 | i += j 143 | 144 | sys.stdout.flush() 145 | 146 | 147 | def filter_data(files_path, w2Id, paragraph_mark): 148 | 149 | """ 150 | Wrapper function to filters occurrences not present in w2Id. Used to be called from either main or as block of 151 | the pipeline 152 | :param data_path: of the files to convert 153 | :return: MySentences class ready to be fed to Word2Vec model 154 | """ 155 | print("[BLOCK] Filtering sentences to files divided by (splitting also paragraphs? 
%s)" % paragraph_mark) 156 | transform(files_path, w2Id, paragraph_mark) 157 | print("[BLOCK] Done transforming data") 158 | sys.stdout.flush() 159 | 160 | 161 | if __name__ == '__main__': 162 | 163 | parser = argparse.ArgumentParser() 164 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be used for the word embeddings" 165 | " and clean up.", required=True) 166 | parser.add_argument('-w', '--word_vector', type=str, help="Word2ID vector to be used for doc translation.", 167 | required=True) 168 | parser.add_argument('-', '--paragraph_marks', type=str, help="Add marking the end of paragraphs", 169 | required=False, default=False) 170 | 171 | args = parser.parse_args() 172 | data_path = args.data 173 | word2id_file = args.word_vector 174 | paragraph_mark = args.paragraph_marks 175 | 176 | begin = time() 177 | 178 | w2Id = VectorManager.read_vector(word2id_file) 179 | 180 | print("Filtering data from %s" % (data_path)) 181 | 182 | begin = time() 183 | 184 | filter_data(data_path, w2Id, paragraph_mark) 185 | 186 | 187 | end = time() 188 | print("Total processing time: %d seconds" % (end - begin)) 189 | -------------------------------------------------------------------------------- /src/preprocess/transform_from_gensim.py: -------------------------------------------------------------------------------- 1 | from gensim.models.keyedvectors import KeyedVectors 2 | from utils.vector_manager import VectorManager 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | def transform_gensim(wv): 8 | """ 9 | Transforms word2Vec model class to two structures: word2id dictionary (used to translate word into IDs) and 10 | id_word_vec which contains the tuple (id, word, embedding) for each word in the model. Used to be called from 11 | either main or as block of the pipeline. 
12 | :param wv: word2vec model with the word embeddings 13 | :return: word2id and id_word_vec 14 | """ 15 | print("Transforming from gensim a total of %s" % len(wv.vocab.items())) 16 | complete_vec = [(v.index, w, wv.word_vec(w)) for w, v in wv.vocab.items()] 17 | sorted_vec = sorted(complete_vec) 18 | id_word_vec = sorted_vec 19 | word2id = dict([(w, id) for id, w, _ in id_word_vec]) 20 | 21 | return word2id, id_word_vec 22 | 23 | if __name__ == '__main__': 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('-k', '--kv', type=str, help="Path of the keyed vectors to translate [word2vec_org_XXX]", 27 | required=True) 28 | 29 | args = parser.parse_args() 30 | data_path = args.kv 31 | 32 | print("Loading keyed vectors") 33 | wv = KeyedVectors.load_word2vec_format(data_path, binary=False) 34 | 35 | emb_size = len(wv.syn0[0]) 36 | word2id, id_word_vec = transform_gensim(wv) 37 | 38 | w2id_filepath = "../models/word2id_%s" % emb_size 39 | idWordVec_filepath = "../models/idWordVec_%s" % emb_size 40 | 41 | print("Writing files:\n\t * word2id: %s\n\t * idWordVec: %s" % (w2id_filepath, idWordVec_filepath)) 42 | VectorManager.write_pickled(w2id_filepath, word2id) 43 | VectorManager.write_pickled(idWordVec_filepath, id_word_vec) 44 | -------------------------------------------------------------------------------- /src/preprocess/words2ids.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from time import time 3 | import multiprocessing as mp 4 | import argparse 5 | import numpy as np 6 | import os 7 | import sys 8 | 9 | 10 | def word2Id(filename, w2id, debug=False): 11 | if debug: 12 | print("Translating %s" % filename) 13 | unk_id = 0 14 | file_out = "%s_num" % filename.split("_clean")[0] 15 | 16 | def transform_numpy(): 17 | """ 18 | Transforms a 4D list of words into a 4D numpy array of integers and writes it into file_out 19 | """ 20 | docs = VectorManager.parse_into_4D(VectorManager.read_vector(filename)) 21 | file_list = [] 22 | for doc in docs: 23 | doc_list = [] 24 | for paragraph in doc: 25 | par_list = [] 26 | for sentence in paragraph: 27 | s_id = [toId(word) for word in sentence if word] 28 | if s_id: 29 | par_list.append(s_id) 30 | doc_list.append(par_list) 31 | file_list.append(doc_list) 32 | np.save(file_out, np.array(file_list)) 33 | 34 | 35 | def transform(): 36 | """ 37 | Transforms a 4D list of words into a 4D numpy array of integers and writes it into file_out 38 | """ 39 | with open(filename) as f: 40 | data = f.read().decode("latin-1").split() 41 | 42 | ids = " ".join([str(w2id[w]) for w in data]) 43 | 44 | with open("%s_num_eos" % filename, "wb") as f: 45 | f.write(ids) 46 | 47 | 48 | def toId(word): 49 | """ 50 | Return ID of the word (or 0 if word is not in word2Id dict) 51 | :param word: to translated 52 | :return: Id of the word 53 | """ 54 | word_id = unk_id 55 | try: 56 | word_id = w2id[word] 57 | except KeyError: 58 | pass 59 | finally: 60 | return word_id 61 | 62 | transform() 63 | # return transform() 64 | 65 | 66 | class FileW2ID(object): 67 | """ 68 | Auxiliar class which holds the filepaths and w2id structure and yields them one at a time in order to avoid 69 | replicating the w2id structure (which can be quite big) 70 | """ 71 | 72 | def __init__(self, filepaths, w2id): 73 | self.filepaths = filepaths 74 | self.w2id = w2id 75 | 76 | def __iter__(self): 77 | for file in self.filepaths: 78 | yield (file, self.w2id) 79 | 80 | 81 | def translate_files(data_path, 
w2id, suffix="_clean", debug=False): 82 | """ 83 | Handles the parallel translation from word to id of the files in data_path with the mapping w2id 84 | :param data_path: path of the files to transform. Used to be called from either main or as block of 85 | the pipeline 86 | :param w2id: mappings to be used 87 | """ 88 | print("[BLOCK] Translating files from %s" % (data_path)) 89 | 90 | filepaths = [] 91 | for root, dirs, files in os.walk(data_path): 92 | filepaths.extend(["%s/%s" % (root, file) for file in files if file.endswith(suffix)]) 93 | 94 | threads = min(mp.cpu_count() * 4, filepaths) 95 | 96 | print("[BLOCK] Starting %s processes to translate to IDs %s files" % (threads, len(filepaths))) 97 | i = 0 98 | while i < len(filepaths): 99 | ps = [] 100 | j = 0 101 | while j < threads and (i + j) < len(filepaths): 102 | if debug: 103 | print("[%s] Creating %s of %s for file %s" % ( 104 | i, i + j, len(filepaths), filepaths[i + j])) 105 | p = (mp.Process(target=word2Id, args=(filepaths[i + j], w2id,))) 106 | p.start() 107 | ps.append(p) 108 | j += 1 109 | 110 | if debug: 111 | print("%s process in the list to join" % len(ps)) 112 | j = 0 113 | while j < threads and (i + j) < len(filepaths): 114 | if debug: 115 | print("[%s] Joining %s of %s for file %s" % ( 116 | i, j, len(filepaths), filepaths[i + j])) 117 | ps[j].join() 118 | j += 1 119 | 120 | i += j 121 | # for p in iter_file_w2id: 122 | # word2Id(p) 123 | # p = mp.Pool(threads, maxtasksperchild=1) 124 | # p.map(word2Id, iter_file_w2id) 125 | 126 | print("[BLOCK] Files translated to IDs") 127 | sys.stdout.flush() 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be translated with word2id vector." 
132 | " and clean up.", required=True) 133 | parser.add_argument('-w', '--word_vector', type=str, help="Word2ID vector to be used for doc translation.", 134 | required=False, default="../models/eos/word2id_1000.pklz") 135 | 136 | args = parser.parse_args() 137 | data_path = args.data 138 | word2id_file = args.word_vector 139 | 140 | begin = time() 141 | 142 | w2Id = VectorManager.read_vector(word2id_file) 143 | translate_files(data_path, w2Id) 144 | 145 | end = time() 146 | print("Total processing time: %d seconds" % (end - begin)) 147 | -------------------------------------------------------------------------------- /src/preprocess/words2ids_validator.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from time import time 3 | import multiprocessing as mp 4 | import argparse 5 | import numpy as np 6 | import os 7 | import sys 8 | 9 | confidence = 0.8 10 | 11 | 12 | def id2Word(param): 13 | filename, id2w = param 14 | file_words = "%s_clean" % filename.split("_num")[0] 15 | print("Comparing original %s with %s" % (file_words, filename)) 16 | 17 | 18 | def is_valid_numpy(): 19 | """ 20 | """ 21 | docs_ids = VectorManager.read_vector(filename) 22 | original = VectorManager.parse_into_4D(VectorManager.read_vector(file_words)) 23 | file_list = [] 24 | comparison = [] 25 | unknowns = 0 26 | for d in range(0, len(docs_ids)): 27 | doc_list = [] 28 | for p in range(0, len(docs_ids[d])): 29 | par_list = [] 30 | for s in range(0, len(docs_ids[d][p])): 31 | sent_list = [] 32 | for w in range(0, len(docs_ids[d][p][s])): 33 | try: 34 | translated = to_word(docs_ids[d][p][s][w]) 35 | if translated == '': 36 | unknowns += 1 37 | comparison.append(translated == original[d][p][s][w]) 38 | sent_list.append(translated) 39 | except Exception as e: 40 | print("[%s] Indices %s %s %s %s: %s" % (filename, d,p,s,w, e)) 41 | par_list.append(sent_list) 42 | doc_list.append(par_list) 43 | file_list.append(doc_list) 44 | 45 | valid = False 46 | try: 47 | ratio = float(comparison.count(True)) / len(comparison) 48 | u_ratio = round(float(unknowns) / len(comparison), 2) 49 | if ratio < confidence: 50 | print("[WARN] File %s equality ratio is %s with %s unknown ratio" % (filename, round(ratio, 2), u_ratio)) 51 | else: 52 | print("[OK] File %s equality ratio is %s with %s unknown ratio" % (filename, round(ratio, 2), u_ratio)) 53 | valid = True 54 | except KeyError as e: 55 | print("[ERROR] File %s is completely different (%s) with %s unknown ratio" % (filename, e, u_ratio)) 56 | 57 | 58 | return valid 59 | 60 | def is_valid(): 61 | """ 62 | """ 63 | with open(file_words) as f: 64 | original = f.read().decode("latin-1").split() 65 | 66 | with open(file_words) as f: 67 | docs_ids = f.read().split() 68 | 69 | doc_words = [id2w(id) for id in docs_ids] 70 | 71 | comparison = [original[i] == doc_words[i] for i in range(original)] 72 | valid = False 73 | try: 74 | ratio = float(comparison.count(True)) / len(comparison) 75 | if ratio < confidence: 76 | print("[WARN] File %s equality ratio is %s." % (filename, round(ratio, 2))) 77 | else: 78 | print("[OK] File %s equality ratio is %s." 
% (filename, round(ratio, 2))) 79 | valid = True 80 | except KeyError as e: 81 | print("[ERROR] File %s is completely different (%s) with %s unknown ratio" % (filename, e)) 82 | 83 | 84 | return valid 85 | 86 | 87 | def to_word(id): 88 | """ 89 | Return Word associated with id 90 | :param id: of the word to translate 91 | :return: word associated with the ID 92 | """ 93 | try: 94 | word = id2w[id] 95 | except IndexError as e: 96 | print("ID %s not found\n%s" % (id, e)) 97 | word = '' 98 | return word 99 | 100 | return is_valid() 101 | 102 | 103 | class FileID2Word(object): 104 | """ 105 | Auxiliar class which holds the filepaths and w2id structure and yields them one at a time in order to avoid 106 | replicating the w2id structure (which can be quite big) 107 | """ 108 | 109 | def __init__(self, filepaths, id2w): 110 | self.filepaths = filepaths 111 | self.id2w = id2w 112 | 113 | def __iter__(self): 114 | for file in self.filepaths: 115 | yield (file, self.id2w) 116 | 117 | 118 | def check_translated_files(data_path, w2Id): 119 | """ 120 | Handles the parallel translation from word to id of the files in data_path with the mapping w2id 121 | :param data_path: path of the files to transform. Used to be called from either main or as block of 122 | the pipeline 123 | :param w2id: mappings to be used 124 | """ 125 | print("[BLOCK] Validating translated files from %s" % (data_path)) 126 | 127 | sorted_list = sorted(w2Id.items(), key= lambda(x): x[1]) 128 | id2words = [w for w,_ in sorted_list] 129 | del w2Id, sorted_list 130 | filepaths = [] 131 | for root, dirs, files in os.walk(data_path): 132 | filepaths.extend(["%s/%s" % (root, file) for file in files if file.endswith("_num.npy")]) 133 | threads = mp.cpu_count() * 2 134 | iter_file_w2id = FileID2Word(filepaths, id2words) 135 | print("[BLOCK] Starting validation with %s processes and %s files" % (threads, len(filepaths))) 136 | 137 | p = mp.Pool(threads, maxtasksperchild=1) 138 | valids = p.map(id2Word, iter_file_w2id) 139 | print("[BLOCK] Validation done. Correct files %s/%s. Confidence [%s]" % (valids.count(True), len(valids), confidence)) 140 | sys.stdout.flush() 141 | 142 | 143 | if __name__ == '__main__': 144 | parser = argparse.ArgumentParser() 145 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be translated with word2id vector." 
146 | " and clean up.", required=True) 147 | parser.add_argument('-w ', '--word_vector', type=str, help="Word2ID vector to be used for doc reverse translation.", 148 | required=True) 149 | 150 | args = parser.parse_args() 151 | data_path = args.data 152 | word2id_file = args.word_vector 153 | 154 | begin = time() 155 | 156 | w2Id = VectorManager.read_vector(word2id_file) 157 | check_translated_files(data_path, w2Id) 158 | 159 | end = time() 160 | print("Total processing time: %d seconds" % (end - begin)) 161 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/flatten.py: -------------------------------------------------------------------------------- 1 | def flatten(items, seqtypes=(list, tuple)): 2 | for i, x in enumerate(items): 3 | while i < len(items) and isinstance(items[i], seqtypes): 4 | items[i:i+1] = items[i] 5 | return items -------------------------------------------------------------------------------- /src/utils/memory.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from sys import getsizeof, stderr 3 | from itertools import chain 4 | from collections import deque 5 | try: 6 | from reprlib import repr 7 | except ImportError: 8 | pass 9 | 10 | 11 | def total_size(o, handlers={}, verbose=False): 12 | """ Returns the approximate memory footprint an object and all of its contents. 13 | 14 | Automatically finds the contents of the following builtin containers and 15 | their subclasses: tuple, list, deque, dict, set and frozenset. 
16 | To search other containers, add handlers to iterate over their contents: 17 | 18 | handlers = {SomeContainerClass: iter, 19 | OtherContainerClass: OtherContainerClass.get_elements} 20 | 21 | """ 22 | dict_handler = lambda d: chain.from_iterable(d.items()) 23 | all_handlers = {tuple: iter, 24 | list: iter, 25 | deque: iter, 26 | dict: dict_handler, 27 | set: iter, 28 | frozenset: iter, 29 | } 30 | all_handlers.update(handlers) # user handlers take precedence 31 | seen = set() # track which object id's have already been seen 32 | default_size = getsizeof(0) # estimate sizeof object without __sizeof__ 33 | 34 | def sizeof(o): 35 | if id(o) in seen: # do not double count the same object 36 | return 0 37 | seen.add(id(o)) 38 | s = getsizeof(o, default_size) 39 | 40 | if verbose: 41 | print(s, type(o), repr(o), file=stderr) 42 | 43 | for typ, handler in all_handlers.items(): 44 | if isinstance(o, typ): 45 | s += sum(map(sizeof, handler(o))) 46 | break 47 | return s 48 | 49 | return sizeof(o) 50 | -------------------------------------------------------------------------------- /src/utils/split_1k.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import errno 5 | 6 | 7 | def open_file(output, counter): 8 | filename = "%s/%s/wiki_%s_sym" % (output, counter/10000, counter % 10000) 9 | if not os.path.exists(os.path.dirname(filename)): 10 | try: 11 | os.makedirs(os.path.dirname(filename)) 12 | except OSError as exc: # Guard against race condition 13 | if exc.errno != errno.EEXIST: 14 | raise 15 | 16 | return open(filename, "w") 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-d', '--data_list', type=str, help="Input data list." 22 | " and clean up.", required=True) 23 | parser.add_argument('-o', '--output', type=str, help="Output files." 24 | " and clean up.", required=True) 25 | 26 | args = parser.parse_args() 27 | data_path = args.data_list 28 | output = args.output 29 | 30 | files = open(data_path).read().split() 31 | 32 | i = 1 33 | counter = 0 34 | max = 35 35 | current_file = open_file(output, counter) 36 | for f in files: 37 | current_data = open(f).read().split() 38 | for w in current_data: 39 | if i > max: 40 | current_file.close() 41 | counter += 1 42 | i = 1 43 | current_file = open("%s/wiki_%s_sym" % (output, counter), "w") 44 | if i != 1: 45 | current_file.write(" ") 46 | current_file.write(w) 47 | i += 1 48 | 49 | current_file.close() -------------------------------------------------------------------------------- /src/utils/vector_manager.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | 4 | 5 | # Vectors represent: word <=> id/index <=> embedding 6 | # Auxiliar class handling all the read/write operations for data structures other than numpy arrays. 
7 | class VectorManager(object): 8 | 9 | # Methods used to parse the cleaned text files into word structures 10 | @staticmethod 11 | def parse_into_4D(file_string): 12 | return [[[[w for w in s.split() if w] 13 | for s in p.split("\n") if s] 14 | for p in doc.split("\n\n") if p] 15 | for doc in file_string.split("\n\n\n") 16 | if doc] 17 | 18 | @staticmethod 19 | def parse_into_list(file_string): 20 | file_list = [] 21 | for doc in file_string.split("\n\n\n"): 22 | for p in doc.split("\n\n"): 23 | for s in p.split("\n"): 24 | for w in s.split(): 25 | if w: 26 | file_list.append(w) 27 | 28 | return file_list 29 | 30 | # Parse into a list of sentences, each sentence being a list of words 31 | @staticmethod 32 | def parse_into_sentences(file_string): 33 | sentences = [] 34 | for doc in file_string.split("\n\n\n"): 35 | for p in doc.split("\n\n"): 36 | for s in p.split("\n"): 37 | ws = s.split() 38 | if ws: 39 | sentences.append(ws) 40 | return sentences 41 | 42 | @staticmethod 43 | def parse_into_paragraphs(file_string): 44 | paragraphs = [] 45 | for doc in file_string.split("\n\n\n"): 46 | # a paragraph is kept as a flat list of all the words it contains 47 | for p in doc.split("\n\n"): 48 | ws = p.split() 49 | if ws: 50 | paragraphs.append(ws) 51 | return paragraphs 52 | 53 | # Methods used to save the vectors 54 | @staticmethod 55 | def write_pickled(filename, data): 56 | with open('%s.pklz' % filename, 'wb') as f: 57 | pickle.dump(data, f) 58 | 59 | # Methods used to save the vectors 60 | @staticmethod 61 | def write_string(filename, data): 62 | with open('%s' % filename, 'wb') as f: 63 | f.write(data) 64 | 65 | # Methods to read vectors 66 | @staticmethod 67 | def read_vector(filename): 68 | ext = filename.split(".")[-1] 69 | 70 | if ext == "npy": 71 | with open(filename, "rb") as f: 72 | return np.load(f) 73 | if ext == "pklz": 74 | with open(filename, 'rb') as f: 75 | try: 76 | return pickle.load(f, encoding="latin1") 77 | except: 78 | return pickle.load(f) 79 | else: 80 | with open(filename, 'rb') as f: 81 | data = f.read() 82 | return data.decode("latin-1") 83 | 84 | @staticmethod 85 | def read_id_word_vec(): 86 | return VectorManager.read_vector("idWordVec.pklz") 87 | 88 | @staticmethod 89 | def read_word2id(): 90 | return VectorManager.read_vector("word2id.pklz")
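
The queue-based producers (ptb_producer in src/lstm/reader.py and the wiki_* variants) only build graph nodes; the batches are obtained by running those nodes in a session whose queue runners have been started. Below is a minimal sketch of that usage, assuming TensorFlow 1.x, that src/ is on the PYTHONPATH, and that the PTB files sit under ../data/ptb (both paths are illustrative, not taken from the repository):

import tensorflow as tf

from lstm.reader import ptb_raw_data, ptb_producer  # assumes src/ is importable

# Load the three PTB splits and the vocabulary size (data path is an example).
train_data, valid_data, test_data, vocabulary = ptb_raw_data("../data/ptb")

batch_size, num_steps = 20, 35
inputs, targets = ptb_producer(train_data, batch_size, num_steps)

# tf.train.Supervisor starts the queue runners that drive the
# range_input_producer used inside ptb_producer.
sv = tf.train.Supervisor()
with sv.managed_session() as sess:
    x, y = sess.run([inputs, targets])
    # Both are [batch_size, num_steps]; y is x shifted one word to the right.
    print("x: %s, y: %s" % (x.shape, y.shape))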
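
The slicing arithmetic that ptb_producer and wiki_producer perform symbolically can be checked with plain NumPy; this sketch mirrors the same batch_len / epoch_size computation and the one-word shift between inputs and targets, on a made-up id stream:

import numpy as np

# Toy stream of word ids standing in for the real corpus.
raw_data = np.arange(50, dtype=np.int32)
batch_size, num_steps = 4, 3

data_len = raw_data.size
batch_len = data_len // batch_size                     # words per batch row
data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

epoch_size = (batch_len - 1) // num_steps              # (x, y) windows per epoch
for i in range(epoch_size):
    x = data[:, i * num_steps:(i + 1) * num_steps]
    y = data[:, i * num_steps + 1:(i + 1) * num_steps + 1]
    assert x.shape == y.shape == (batch_size, num_steps)
    # Within every row, y is x shifted by one position: the next-word targets.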
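
The relation between the gensim model trained in preprocess/embeddings.py and the word2id / idWordVec structures written by preprocess/transform_from_gensim.py can be seen on a toy corpus; this uses the same pre-4.0 gensim API the project code relies on, and the sentences are invented for illustration only:

import gensim

sentences = [["the", "roman", "consul", "is", "elected"],
             ["the", "senate", "elects", "the", "consul"]]

# Tiny Word2Vec model, same call style as preprocess/embeddings.py.
model = gensim.models.Word2Vec(sentences, size=50, window=5, min_count=1, workers=1)
wv = model.wv

# Same shape of output as transform_from_gensim.transform_gensim(): a list of
# (id, word, embedding) triples sorted by id, and a word -> id dict derived from it.
id_word_vec = sorted(((v.index, w, wv.word_vec(w)) for w, v in wv.vocab.items()),
                     key=lambda t: t[0])
word2id = dict((w, i) for i, w, _ in id_word_vec)

print("%s -> id %s" % ("consul", word2id["consul"]))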
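
Finally, the word-to-id translation performed by preprocess/words2ids.py (and spot-checked by words2ids_validator.py) can be exercised on a single string as sketched below; the word2id path is illustrative (any word2id_*.pklz written by transform_from_gensim.py would do), and id 0 stands for unknown words, as in words2ids.py:

from utils.vector_manager import VectorManager

# Mapping produced by transform_from_gensim.py (path is an example).
w2id = VectorManager.read_vector("../models/word2id_200.pklz")

text = "the roman consul is normally a notable person from the senate"
ids = [w2id.get(w, 0) for w in text.lower().split()]
print(" ".join(str(i) for i in ids))

# Reverse mapping, built the same way words2ids_validator.py builds it,
# to spot-check that the translation round-trips.
id2word = [w for w, _ in sorted(w2id.items(), key=lambda kv: kv[1])]
print(" ".join(id2word[i] for i in ids))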