├── .gitignore
├── LICENSE.md
├── README.md
├── bin
│   ├── divide.py
│   ├── lstm.sh
│   ├── plot_tsne
│   ├── preprocess.py
│   ├── preprocess.sh
│   ├── run_pipeline.sh
│   ├── run_short_pipeline.sh
│   ├── semantics_check
│   ├── submit_scripts_for_supercomputers
│   │   ├── submit_clean.sh
│   │   ├── submit_clstm.sh
│   │   ├── submit_divide.sh
│   │   ├── submit_loaded_topics.sh
│   │   ├── submit_lstm.sh
│   │   ├── submit_make_wiki.sh
│   │   ├── submit_pipeline.sh
│   │   ├── submit_split.sh
│   │   ├── submit_topics.sh
│   │   └── submit_words2ids.sh
│   ├── test_topics.sh
│   └── wiki_extractor_launch.sh
├── documentation
│   ├── ml-project.pdf
│   ├── training_perplexities.png
│   ├── word_embeddings_and_topic_detection.pdf
│   └── word_embeddings_and_topic_detection_II.pdf
├── execution.txt
├── results
│   ├── all_perplexities_lstm.png
│   ├── learning_rate.png
│   ├── train_perplexities.png
│   └── train_perplexities_detail.png
└── src
    ├── __init__.py
    ├── context
    │   ├── TIMES.txt
    │   ├── __init__.py
    │   ├── creator.py
    │   ├── custom.py
    │   └── topics_analysis.py
    ├── lstm
    │   ├── __init__.py
    │   ├── clstm.py
    │   ├── input_pipeline.py
    │   ├── lstm.py
    │   ├── lstm_wp.py
    │   ├── reader.py
    │   ├── reader_frag.py
    │   ├── reader_test.py
    │   └── reader_topics.py
    ├── postprocess
    │   ├── semantics_check.py
    │   ├── test_topics.py
    │   └── tsne.py
    ├── preprocess
    │   ├── __init__.py
    │   ├── cleaner.py
    │   ├── embeddings.py
    │   ├── filter.py
    │   ├── transform_from_gensim.py
    │   ├── words2ids.py
    │   └── words2ids_validator.py
    └── utils
        ├── __init__.py
        ├── flatten.py
        ├── memory.py
        ├── split_1k.py
        └── vector_manager.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | .idea
92 | .iml
93 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # contextualLSTM
2 | Contextual LSTM for NLP tasks like word prediction
3 | 
4 | This repo's goal is to implement the Contextual LSTM model for word prediction as described in [Ghosh, S., Vinyals, O., Strope, B., Roy, S., Dean, T., & Heck, L. (n.d.). Contextual LSTM (CLSTM) models for Large scale NLP tasks. https://doi.org/10.1145/12351]
5 | 
6 | **Notes**: there are scripts to run the pipelines. However, the project needs a bit of cleanup. If anyone is interested in using it, please write to me or open an issue and I'll fix/help with any error you have.
7 | 
8 | 
9 | ## Data preprocessing and embeddings
10 | 
11 | Further details about the Wikipedia data preprocessing can be found at
12 | 
13 | ./documentation/word_embeddings_and_topic_detection.pdf
14 | 
15 | 
16 | ## Context creation with topic detection
17 | 
18 | Further details on the different gensim topic detection methods, as well as on the embedding arithmetic used for context creation, can be found at
19 | 
20 | ./documentation/word_embeddings_and_topic_detection_II.pdf
21 | 
22 | ## Execution
23 | 
24 | Download a Wikipedia dump, for example:
25 | 
26 | https://dumps.wikimedia.org/enwiki/20180420/enwiki-20180420-pages-articles.xml.bz2
27 | 
28 | After that, use the WikiExtractor launcher to process it:
29 | 
30 | `./wiki_extractor_launch.sh path_to_wikipedia_dump`
31 | 
32 | where `path_to_wikipedia_dump` is the file you downloaded (e.g. enwiki-20180420-pages-articles.xml.bz2).
33 | 
34 | 
35 | To run the whole pipeline, use:
36 | 
37 | `./run_pipeline.sh ../data/enwiki 500`
38 | 
39 | To run only the preprocessing step, use `./preprocess.sh ../data/enwiki 500 2`
40 | where:
41 | * `../data/enwiki` is the default path where the preprocess script extracted and cleaned the Wikipedia dump.
42 | * 500 is the desired embedding size, and 2 is the minimum number of occurrences a word needs to be kept in the embeddings vocabulary.
43 | 
44 | 
45 | To run just the pipeline with pre-trained embeddings of size 1000, run:
46 | 
47 | `./run_short_pipeline.sh ../data/ 1000`
48 | 
49 | You can download the required trained embeddings from here:
50 | 
51 | https://www.dropbox.com/s/ws6d8l6h6jp3ldc/embeddings.tar.gz?dl=0
52 | 
53 | You should place them inside the `models/` folder.
54 | 
55 | 
56 | ## LSTM
57 | 
58 | Basic LSTM implementation with TF at ./src/lstm/lstm.py
59 | 
60 | ## CLSTM
61 | 
62 | Contextual LSTM implementation with TF at ./src/lstm/clstm.py
63 | 
64 | **Although functional, this version is still too slow to be practical for training. If you want to collaborate or have any questions about it, feel free to contact me; I plan to finish it shortly and upload a detailed description of it.**
65 | 
66 | 
67 | ## Running individual scripts
68 | 
69 | Most files have their own execution script under the `bin/` folder.
70 | All scripts named submit_XXX.sh are designed to be run on a supercomputer with a Slurm queue system. To run them locally, just issue the python commands with the correct paths.
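For example, the manual equivalent of `bin/lstm.sh` (run from inside `bin/`, assuming the preprocessed data list and the embedding/vocabulary pickles already exist under `data/` and `models/`) looks roughly like this:

```bash
export PYTHONPATH="$PYTHONPATH:../src/"

python ../src/lstm/lstm.py \
    --data_path ../data/full.list \
    --embeddings ../models/eos/idWordVec_ \
    --model large \
    --use_fp16 True \
    --word_to_id ../models/eos/word2id_1000.pklz
```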
71 | 
72 | **Note:** due to the use of many different packages, not all files run with the same Python version (some need 2.7, others 3.5.2, and the rest 3.6); I expect to unify them (or state the required version clearly) soon.
73 | 
--------------------------------------------------------------------------------
/bin/divide.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Split the tokenized Wikipedia text into train (80%), validation (10%) and test (~10%) files.
3 | data = open("wiki_data", "r").read()
4 | data = data.split()
5 | size = len(data)
6 | 
7 | training = int(size*0.8)
8 | validation = int(size*0.1)
9 | 
10 | with open("wiki.train.txt", "w") as f:
11 |     f.write(" ".join(data[0:training]))
12 | 
13 | with open("wiki.valid.txt", "w") as f:
14 |     f.write(" ".join(data[training:training+validation]))
15 | 
16 | with open("wiki.test.txt", "w") as f:
17 |     f.write(" ".join(data[training+validation:]))  # remaining ~10% of the tokens
18 | 
--------------------------------------------------------------------------------
/bin/lstm.sh:
--------------------------------------------------------------------------------
1 | 
2 | python ../src/lstm/lstm.py \
3 |     --data_path ../data/full.list \
4 |     --embeddings ../models/eos/idWordVec_ \
5 |     --model large \
6 |     --use_fp16 True \
7 |     --word_to_id ../models/eos/word2id_1000.pklz
8 | 
9 | 
--------------------------------------------------------------------------------
/bin/plot_tsne:
--------------------------------------------------------------------------------
1 | export PYTHONPATH="$PYTHONPATH:../src/"
2 | 
3 | python2 ../src/postprocess/tsne.py -i ../models/idWordVec.pklz -w ../models/word2vec_org_200
4 | 
--------------------------------------------------------------------------------
/bin/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, "../src/")
3 | 
4 | from preprocess.cleaner import clean_data
5 | from preprocess.embeddings import create_embeddings
6 | from preprocess.transform_from_gensim import transform_gensim
7 | from preprocess.words2ids import translate_files
8 | from preprocess.words2ids_validator import check_translated_files
9 | from preprocess.filter import filter_data
10 | from utils.vector_manager import VectorManager
11 | from time import time
12 | 
13 | import argparse
14 | 
15 | """
16 | Orchestrating file which handles the whole preprocessing pipeline:
17 |     * Clean data
18 |     * Create embeddings
19 |     * Transform model structures
20 |     * Translate word lists to ID lists.
21 | """
22 | 
23 | if __name__ == '__main__':
24 | 
25 |     parser = argparse.ArgumentParser()
26 |     parser.add_argument('-d', '--data', type=str, help="Path of the data extracted with WikiExtractor", required=True)
27 |     parser.add_argument('-s', '--size', type=int, help="Size of the word embeddings.", default=200, required=False)
28 |     parser.add_argument('-c', '--count', type=int, help="Min count for embeddings (if set to something bigger than 1 "
29 |                                                          "you should manually handle the non-processed words, i.e. create"
30 |                                                          " an 'unknown' key and add it to the embeddings)", default=1, required=False)
31 | 
32 |     args = parser.parse_args()
33 | 
34 |     # Arguments parsing
35 |     data_path = args.data
36 |     emb_size = args.size  # size of the embedding vectors to create
37 |     min_count = args.count  # minimum word occurrences to be in embeddings set
38 | 
39 |     print("Starting Preprocess pipeline\n\t * Data path: %s\n\t * Embedding size: %s\n\t * Min count: %s" %
40 |           (data_path, emb_size, min_count))
41 | 
42 |     # Clean Wikipedia data
43 |     t0 = time()
44 |     sentences = clean_data(data_path)
45 | 
46 |     t1 = time()
47 |     print("Time cleaning data: %s\nCreating embeddings from cleaned data..." % (t1-t0))
48 | 
49 |     # Create embeddings from the cleaned data
50 |     model = create_embeddings(data_path, emb_size, min_count)
51 |     t2 = time()
52 |     print("Time creating embeddings: %s" % (t2-t1))
53 | 
54 |     print("Saving embeddings model...")
55 |     model.save("../models/word2vec_gensim_%s" % emb_size)
56 |     model.wv.save_word2vec_format("../models/word2vec_org_%s" % emb_size,
57 |                                   "../models/vocabulary_%s" % emb_size,
58 |                                   binary=False)
59 | 
60 | 
61 |     # Keep only:
62 |     #  * word2id dict (to translate the data from words to numerical IDs)
63 |     #  * id_word_vec (list of (id, word, embedding) triples)
64 |     t3 = time()
65 |     word2id, id_word_vec = transform_gensim(model.wv)
66 |     t4 = time()
67 |     print("Time transforming gensim to word2ID and idWordVec vectors: %s" % (t4-t3))
68 | 
69 |     # Save model for checkpointing
70 |     VectorManager.write_pickled("../models/word2id_%s" % emb_size, word2id)
71 |     VectorManager.write_pickled("../models/idWordVec_%s" % emb_size, id_word_vec)
72 | 
73 |     t5 = time()
74 |     translate_files(data_path, word2id)
75 |     t6 = time()
76 |     print("Time translating words to numbers: %s" % (t6-t5))
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
--------------------------------------------------------------------------------
/bin/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export PYTHONPATH="$PYTHONPATH:../src/"
4 | 
5 | data_path=$1
6 | embeddings_size=$2
7 | min_word_count_threshold=$3
8 | 
9 | # Preprocess all wiki data and create embeddings
10 | python2 preprocess.py \
11 |     --data ${data_path} \
12 |     --size ${embeddings_size} \
13 |     --count ${min_word_count_threshold}
14 | 
15 | # Put all the files into a list to be fed to TF LSTM
16 | find ../data/ -name *num_eos > ../data/full.list
--------------------------------------------------------------------------------
/bin/run_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export PYTHONPATH="$PYTHONPATH:../src/"
4 | 
5 | data_path=$1
6 | embeddings_size=$2
7 | 
8 | # Preprocess all wiki data and create embeddings
9 | python2 preprocess.py \
10 |     --data ${data_path} \
11 |     --size ${embeddings_size}
12 | 
13 | 
14 | # Put all the files into a list to be fed to TF LSTM
15 | find ../data/ -name *num_eos > ../data/full.list
16 | 
17 | # Run the LSTM
18 | python ../src/lstm/lstm.py \
19 |     --data_path ../data/full.list \
20 |     --embeddings ../models/idWordVec_${embeddings_size}.pklz \
21 |     --model large \
22 |     --use_fp16 True \
23 |     --word_to_id ../models/word2id_${embeddings_size}.pklz
--------------------------------------------------------------------------------
/bin/run_short_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export PYTHONPATH="$PYTHONPATH:../src/"
4 | 
5 | data_path=$1
6 | 
embeddings_size=$2 7 | 8 | 9 | python2 ../src/preprocess/filter.py \ 10 | --data ${data_path} \ 11 | --word_vector ../models/word2id_1000.pklz 12 | 13 | # Translate all wiki data 14 | python2 ../src/preprocess/words2ids.py \ 15 | --data ${data_path} \ 16 | --word_vector ../models/eos/word2id_1000.pklz 17 | 18 | 19 | # Put all the files into a list to be fed to TF LSTM 20 | find ../data/ -name *num_eos > ../data/full.list 21 | 22 | # Run the LSTM 23 | python ../src/lstm/lstm.py \ 24 | --data_path ../data/full.list \ 25 | --embeddings ../models/idWordVec_${embeddings_size}.pklz \ 26 | --model large \ 27 | --use_fp16 True \ 28 | --word_to_id ../models/word2id_${embeddings_size}.pklz 29 | -------------------------------------------------------------------------------- /bin/semantics_check: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PYTHONPATH:../src/" 2 | 3 | python2 ../src/postprocess/semantics_check.py -w ../models/word2vec_org_200 4 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="cleanerPar" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 14 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/preprocess/cleaner.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -w /gpfs/home/bsc19/bsc19277/contextualLSTM/models/word2id_1000.pklz" > job 15 | 16 | sbatch < job 17 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_clstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="clstm-Verb" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n16 12 | #SBATCH --mem=100000 13 | 14 | module purge && module load K80 cuda/8.0 mkl/2017.1 CUDNN/5.1.10-cuda_8.0 intel-opencl/2016 python/3.6.0+_ML 15 | #module purge; module load K80 cuda/7.5 mkl/2017.0.098 CUDNN/5.1.3 python/3.5.2_ML 16 | #module purge && module load K80 mkl/2017.0.098 cuda/7.5 CUDNN/5.1.3 intel-opencl/2016 python/2.7.12_ML 17 | #export PYTHONPATH=$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src 18 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lstm/clstm.py \ 19 | --data_path /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/small.list \ 20 | --embeddings /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/idWordVec_ \ 21 | --model small \ 22 | --use_fp16 True \ 23 | --word_to_id_path /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/word2id_" > job 24 | 25 | sbatch < job 26 | rm job 27 | #SBATCH --dependency=afterany:753016 28 | #SBATCH --gres gpu:0 29 | #SBATCH --constraint=k80 30 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_divide.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="divider" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | #SBATCH --mem=100000 13 | 14 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 15 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/bin/divide.py" > job 16 | 17 | sbatch < job 18 | rm job 19 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_loaded_topics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="hdp_topicsLoadedAnalysis" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t32:00:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 14 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lda/lda.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -m /gpfs/home/bsc19/bsc19277/contextualLSTM/models/topics -c /gpfs/home/bsc19/bsc19277/contextualLSTM/models/gensim_tfidf.mm.bz2 -i /gpfs/home/bsc19/bsc19277/contextualLSTM/models/gensim_wordids.txt.bz2 " > job 15 | 16 | sbatch < job 17 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="lstm" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n16 12 | #SBATCH --gres gpu:4 13 | #SBATCH --constraint=k80 14 | #SBATCH --mem=100000 15 | 16 | module purge && module load K80 cuda/8.0 mkl/2017.1 CUDNN/5.1.10-cuda_8.0 intel-opencl/2016 python/3.6.0+_ML 17 | 18 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lstm/lstm.py \ 19 | --data_path /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full.list \ 20 | --embeddings /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/idWordVec_ \ 21 | --model medium \ 22 | --use_fp16 True \ 23 | --word_to_id /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/word2id_200.pklz" > job 24 | 25 | sbatch < job 26 | rm job 27 | 28 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_make_wiki.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=make_wiki 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o make_wiki-%J.out 9 | #SBATCH -e make_wiki-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | python -m gensim.scripts.make_wiki /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki-20170220-pages-articles.xml.bz2 /gpfs/home/bsc19/bsc19277/contextualLSTM/models/gensim" > job 14 | 15 | sbatch < job 16 | rm job 17 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=word2vec 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o word2vec-%J.out 9 | #SBATCH -e word2vec-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/bin/preprocess.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -s 500" > job 14 | 15 | sbatch < job 16 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="splitr" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | #SBATCH --mem=100000 13 | 14 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 15 | 16 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/utils/split_1k.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full_lists/train.list -o /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/train/ 17 | 18 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/utils/split_1k.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full_lists/test.list -o /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/test/ 19 | 20 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/utils/split_1k.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/full_lists/valid.list -o /gpfs/home/bsc19/bsc19277/contextualLSTM/data/wikipedia/valid/ 21 | " > job 22 | 23 | sbatch < job 24 | rm job 25 | 26 | -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_topics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | appName="topicsAnalysis" 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=$appName 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 8 | #SBATCH -o $appName-%J.out 9 | #SBATCH -e $appName-%J.err 10 | #SBATCH -N1 11 | #SBATCH -n12 12 | 13 | export PYTHONPATH="$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src/" 14 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/lda/lda.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -m /gpfs/home/bsc19/bsc19277/contextualLSTM/models/topics -w /gpfs/home/bsc19/bsc19277/contextualLSTM/models/word2id_1000.pklz" > job 15 | 16 | sbatch < job 17 | rm job -------------------------------------------------------------------------------- /bin/submit_scripts_for_supercomputers/submit_words2ids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "#!/bin/sh 4 | #SBATCH --job-name=word2ids 5 | #SBATCH --exclusive 6 | #SBATCH -t30:59:00 7 | #SBATCH --workdir=. 
8 | #SBATCH -o word2ids-%J.out
9 | #SBATCH -e word2ids-%J.err
10 | #SBATCH -N1
11 | #SBATCH -n12
12 | 
13 | export PYTHONPATH=$PYTHONPATH:/gpfs/home/bsc19/bsc19277/contextualLSTM/src
14 | 
15 | python /gpfs/home/bsc19/bsc19277/contextualLSTM/src/preprocess/words2ids.py -d /gpfs/home/bsc19/bsc19277/contextualLSTM/data/enwiki -w /gpfs/home/bsc19/bsc19277/contextualLSTM/models/eos/word2id_1000.pklz" > job
16 | 
17 | sbatch < job
18 | rm job
--------------------------------------------------------------------------------
/bin/test_topics.sh:
--------------------------------------------------------------------------------
1 | # LDA online
2 | python ../src/postprocess/test_topics.py \
3 |     -m ../models/topics/lda_online \
4 |     -w ../models/eos/word2id_1000.pklz \
5 |     -i ../models/topics/gensim_wordids.txt.bz2 \
6 |     -e ../models/eos/i2WordVec_1000.pklz
7 | 
8 | # LDA Parallel
9 | python ../src/postprocess/test_topics.py \
10 |     -m ../models/topics/lda_parallel_bf64b098-c517-47c8-9267-1ce116e0033d \
11 |     -w ../models/eos/word2id_1000.pklz \
12 |     -i ../models/topics/gensim_wordids.txt.bz2 \
13 |     -e ../models/eos/i2WordVec_1000.pklz
14 | 
15 | # LSI
16 | python ../src/postprocess/test_topics.py \
17 |     -m ../models/topics/lsa_c59e4bd3-1553-4ff1-a448-8c5be75d3f33 \
18 |     -w ../models/eos/word2id_1000.pklz \
19 |     -i ../models/topics/gensim_wordids.txt.bz2 \
20 |     -e ../models/eos/i2WordVec_1000.pklz
21 | 
--------------------------------------------------------------------------------
/bin/wiki_extractor_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Clean Wikipedia data
3 | 
4 | 
5 | wikipedia_dump_path=$1
6 | ../src/preprocess/wikiextractor/build/scripts-3.5/WikiExtractor.py -o data/enwiki ${wikipedia_dump_path}
7 | 
--------------------------------------------------------------------------------
/documentation/ml-project.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/ml-project.pdf
--------------------------------------------------------------------------------
/documentation/training_perplexities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/training_perplexities.png
--------------------------------------------------------------------------------
/documentation/word_embeddings_and_topic_detection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/word_embeddings_and_topic_detection.pdf
--------------------------------------------------------------------------------
/documentation/word_embeddings_and_topic_detection_II.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/documentation/word_embeddings_and_topic_detection_II.pdf
--------------------------------------------------------------------------------
/execution.txt:
--------------------------------------------------------------------------------
1 | # Execution
2 | 
3 | Most files have their own execution script under the /bin folder.
4 | All scripts named submit_XXX.sh are designed to be run on a supercomputer with a Slurm queue system.
To run them locally, just issue the corresponding python commands with the correct paths. **Note:** due to the use of many different packages, not all files run with the same Python version (some need 2.7, others 3.5.2, and the rest 3.6); I expect to unify them (or state the required version clearly) soon.
5 | 
6 | FINAL NOTE: when running files manually, do so from inside 'src' or add 'src' to the PYTHONPATH so that all dependencies are resolved.
7 | 
--------------------------------------------------------------------------------
/results/all_perplexities_lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/all_perplexities_lstm.png
--------------------------------------------------------------------------------
/results/learning_rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/learning_rate.png
--------------------------------------------------------------------------------
/results/train_perplexities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/train_perplexities.png
--------------------------------------------------------------------------------
/results/train_perplexities_detail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/results/train_perplexities_detail.png
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/__init__.py
--------------------------------------------------------------------------------
/src/context/TIMES.txt:
--------------------------------------------------------------------------------
1 | hdp_topicsLoadedAnalysis-764553.out:[BLOCK] Training time for LSA: 14390.89
2 | lsa__topicsLoadedAnalysis-764083.out:[BLOCK] Training time for LSA: 12396.78
3 | lsa__topicsLoadedAnalysis-764552.out:[BLOCK] Training time for LSA: 14483.59
4 | topicsLoadedAnalysis-764012.out:[BLOCK] Training time for LSA: 9075.62
5 | 
6 | 
7 | model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=11, passes=3)
8 | ldam__topicsLoadedAnalysis-764082.out:[BLOCK] Training time for LDA multicore: 52544.35
9 | 
10 | 
--------------------------------------------------------------------------------
/src/context/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/context/__init__.py
--------------------------------------------------------------------------------
/src/context/creator.py:
--------------------------------------------------------------------------------
1 | from utils.vector_manager import VectorManager
2 | from utils.flatten import flatten
3 | from gensim.corpora import Dictionary, MmCorpus
4 | from gensim.models import LsiModel, LdaMulticore, LdaModel, HdpModel
5 | from time import time
6 | 
7 | import numpy as np
8 | import argparse
9 | import pickle
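# NOTE: the `embeddings` argument used throughout this module is assumed to be the
# idWordVec_* structure produced by the preprocessing step: a list of
# (id, word, vector) triples, so embeddings[i][1] is the word string and
# embeddings[i][2] its embedding vector.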
10 | import sys
11 | 
12 | 
13 | 
14 | class TopicCreator(object):
15 | 
16 |     def __init__(self, dictionary_path, word2id, embeddings, lda=None, lsi=None):
17 |         self.dictionary = self.load_dict(dictionary_path)
18 |         self.word2id = VectorManager.read_vector(word2id)
19 |         # self.word2id = self.word2id_to_id2word(word2id)
20 |         self.embeddings = embeddings
21 |         self.lda = lda
22 |         self.lsi = lsi
23 | 
24 |     def load_dict(self, dict_path):
25 |         print("[BLOCK] Loading dictionary files from %s" % (dict_path))
26 |         sys.stdout.flush()
27 |         return Dictionary.load_from_text(dict_path)
28 | 
29 |     def word2id_to_id2word(self, word2id_path):
30 | 
31 |         word2id = pickle.load(open(word2id_path))
32 |         id2word_c = [0] * len(word2id)
33 |         for w in word2id:
34 |             id2word_c[word2id[w]] = w
35 |         return id2word_c
36 | 
37 | 
38 |     def get_lsa_topic_embeding(self, document):
39 |         """
40 |         Construct a context vector by doing the weighted sum of the embeddings of the words of most relevant lsi topic
41 |         :param document: sequence of text to get the context for
42 |         :return: numpy array with the context
43 |         """
44 |         if not self.lsi:
45 |             print("LSI model not provided")
46 |             raise Exception("LSI model not available")
47 | 
48 |         document = [self.embeddings[int(elem)][1] for elem in document]
49 |         corpus = [self.dictionary.doc2bow(document)]
50 |         corpus_topics = self.lsi[corpus][0]
51 | 
52 |         values = [abs(val) for _, val in corpus_topics]
53 |         index = values.index(max(values))
54 | 
55 |         topics = self.lsi.show_topic(index)
56 | 
57 |         # Weighted sum of the embeddings of the topic's words
58 |         embedding = np.zeros_like(self.embeddings[0][2], dtype=np.float32)
59 |         for word, weight in topics:
60 |             embedding = np.add(embedding, np.multiply(weight, self.embeddings[self.word2id[word]][2]))
61 | 
62 |         return embedding
63 | 
64 | 
65 |     def average_embeddings(self, document):
66 |         """
67 |         Construct a context vector by averaging the embeddings of the words in the document
68 |         :return: numpy array with the context
69 |         """
70 |         # This method only uses the word embeddings themselves, so no topic model
71 |         # (LSI/LDA) needs to be loaded for it.
72 | 
73 | 
74 |         document_embeddings = [self.embeddings[int(elem)][2] for elem in document]
75 | 
76 |         embedding = np.mean(document_embeddings, axis=0)
77 | 
78 |         return embedding
79 | 
80 |     def get_lda_best_topic_words(self, document):
81 |         """
82 |         Construct a context vector by returning the embedding of the most relevant word of the topic
83 |         :param document: sequence of text to get the context for
84 |         :return: numpy array with the context or unknown embedding if topic is not found
85 |         """
86 |         if not self.lda:
87 |             print("LDA model not provided")
88 |             raise Exception("LDA model not available")
89 | 
90 |         document = [self.embeddings[int(elem)][1] for elem in document]
91 |         corpus = [self.dictionary.doc2bow(document)]
92 |         top_topics = self.lda.top_topics(corpus, num_words=100)[0][0]
93 | 
94 |         if not top_topics[0][0] > 0.1:
95 |             topic_word = ''
96 |         else:
97 |             topic_word = top_topics[0][1]
98 | 
99 |         try:
100 |             embedding = self.embeddings[self.word2id[topic_word]][2]
101 |         except KeyError as e:
102 |             embedding = self.embeddings[self.word2id['']][2]
103 |             print("Word %s not found in word2id dict, returning UNK topic (%s)" % (topic_word, e))
104 | 
105 |         return embedding
106 | 
107 |     def get_lda_topic_embedding(self, document):
108 |         """
109 |         Construct a context vector by doing the weighted sum of the embeddings of the 10 most relevant words of the topic
110 |         :param document: sequence of text to get the context for
111 |         :return: numpy array with the context
112 |         """
113 |         if not self.lda:
114 |             print("LDA model 
not provided") 115 | raise Exception("LDA model not available") 116 | 117 | document = [self.embeddings[int(elem)][1] for elem in document] 118 | corpus = [self.dictionary.doc2bow(document)] 119 | topics = self.lda.top_topics(corpus, num_words=100)[0][0] 120 | top_topic = topics[0] 121 | 122 | if not top_topic[0] > 0: 123 | topic_embedding = self.embeddings[self.word2id['']][2] 124 | else: 125 | topic_embedding = np.zeros_like(self.embeddings[self.word2id['']][2], dtype=np.float32) 126 | for i in range(10): 127 | weight = topics[i][0] 128 | embed = self.embeddings[self.word2id[topics[i][1]]][2] 129 | update = np.multiply(weight, embed) 130 | topic_embedding = np.add(topic_embedding, update) 131 | 132 | return topic_embedding -------------------------------------------------------------------------------- /src/context/custom.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import RegexpTokenizer 2 | from stop_words import get_stop_words 3 | from nltk.stem.porter import PorterStemmer 4 | from gensim import corpora, models 5 | import gensim 6 | 7 | tokenizer = RegexpTokenizer(r'\w+') 8 | 9 | # create English stop words list 10 | en_stop = get_stop_words('en') 11 | 12 | # Create p_stemmer of class PorterStemmer 13 | p_stemmer = PorterStemmer() 14 | 15 | # create sample documents 16 | paragraphs = ["Space Exploration Technologies Corporation, better known as SpaceX, is an American aerospace manufacturer and space transport services company headquartered in Hawthorne, California. It was founded in 2002 by entrepreneur Elon Musk with the goal of reducing space transportation costs and enabling the colonization of Mars. SpaceX has since developed the Falcon launch vehicle family and the Dragon spacecraft family, which both currently deliver payloads into Earth orbit.", "SpaceX's achievements include the first privately funded liquid-propellant rocket to reach orbit (Falcon 1 in 2008); the first privately funded company to successfully launch, orbit, and recover a spacecraft (Dragon in 2010); the first private company to send a spacecraft to the International Space Station (Dragon in 2012), and the first propulsive landing for an orbital rocket. As of March 2017, SpaceX has since flown ten missions to the International Space Station (ISS) under a cargo resupply contract. NASA also awarded SpaceX a further development contract in 2011 to develop and demonstrate a human-rated Dragon, which would be used to transport astronauts to the ISS and return them safely to Earth.", "SpaceX announced in 2011 they were beginning a privately funded reusable launch system technology development program. In December 2015, a first stage was flown back to a landing pad near the launch site, where it successfully accomplished a propulsive vertical landing. This was the first such achievement by a rocket for orbital spaceflight. In April 2016, with the launch of CRS-8, SpaceX successfully vertically landed a first stage on an ocean drone-ship landing platform. In May 2016, in another first, SpaceX again landed a first stage, but during a significantly more energetic geostationary transfer orbit mission. 
In March 2017, SpaceX became the first to successfully re-launch and land the first stage of an orbital rocket.", "In 2016, CEO Elon Musk unveiled the mission architecture of the Interplanetary Transport System program, an ambitious privately funded initiative to develop spaceflight technology for use in manned interplanetary spaceflight, and which, if demand emerges, could lead to sustainable human settlements on Mars over the long term. This is the main purpose this System was designed for. In 2017, Elon Musk announced that the company had been contracted by two private individuals to send them in a Dragon spacecraft on a free return trajectory around the Moon. Provisionally launching in 2018, this could become the first instance of lunar tourism."] 17 | 18 | space_split = [line.split(" ") for line in paragraphs] 19 | 20 | # compile sample documents into a list 21 | 22 | # list for tokenized documents in loop 23 | #texts = [] 24 | 25 | # loop through document list 26 | #for i in paragraphs: 27 | 28 | # clean and tokenize document string 29 | # raw = i.lower() 30 | # tokens = tokenizer.tokenize(raw) 31 | 32 | # remove stop words from tokens 33 | # stopped_tokens = [i for i in tokens if not i in en_stop] 34 | 35 | # stem tokens 36 | # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] 37 | 38 | # add tokens to list 39 | # texts.append(stemmed_tokens) 40 | 41 | # turn our tokenized documents into a id <-> term dictionary 42 | dictionary = corpora.Dictionary(space_split) 43 | 44 | # convert tokenized documents into a document-term matrix 45 | corpus = [dictionary.doc2bow(text) for text in space_split] 46 | 47 | 48 | # Use first paragraph 49 | y = [dictionary.token2id.get(word) for word in space_split[0]] 50 | X = [0] + y 51 | # generate LDA model 52 | ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20) 53 | -------------------------------------------------------------------------------- /src/context/topics_analysis.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from utils.flatten import flatten 3 | from gensim.corpora import Dictionary, MmCorpus 4 | from gensim.models import LsiModel, LdaMulticore, LdaModel, HdpModel 5 | from time import time 6 | 7 | import multiprocessing as mp 8 | 9 | import argparse 10 | import os 11 | import sys 12 | import bz2 13 | 14 | stop_words = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", 15 | "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", 16 | "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", 17 | "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", 18 | "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", 19 | "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", 20 | "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", 21 | "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", 22 | "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", 23 | "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", 24 | "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", 25 
| "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", 26 | "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", 27 | "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 28 | "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", 29 | "your", "yours", "yourself", "yourselves"] 30 | 31 | 32 | def get_file_as_list(filename): 33 | words_list = VectorManager.parse_into_list(VectorManager.read_vector(filename)) 34 | words_list = [w for w in words_list if w not in stop_words] 35 | return words_list 36 | 37 | 38 | def get_lists(data_path): 39 | filepaths = [] 40 | for root, dirs, files in os.walk(data_path): 41 | filepaths.extend(["%s/%s" % (root, file) for file in files if file.endswith("_clean")]) 42 | 43 | p = mp.Pool(mp.cpu_count() * 2) 44 | files_list = p.map(get_file_as_list, filepaths) 45 | 46 | return filepaths, files_list 47 | 48 | 49 | def get_corpus_and_dict(data_path): 50 | print("[BLOCK] Getting corpus and dictionary files from %s" % (data_path)) 51 | sys.stdout.flush() 52 | 53 | file_paths, files_list = get_lists(data_path) 54 | 55 | print("[BLOCK] Building dictionary with %s documents" % len(files_list)) 56 | sys.stdout.flush() 57 | 58 | dictionary = Dictionary(files_list) 59 | 60 | print("[BLOCK] Filtering out %s (0.1)" % (int(len(dictionary)*0.1))) 61 | sys.stdout.flush() 62 | 63 | dictionary.filter_n_most_frequent(int(len(dictionary)*0.1)) 64 | 65 | # convert tokenized documents into a document-term matrix 66 | corpus = [dictionary.doc2bow(doc) for doc in files_list] 67 | 68 | return corpus, dictionary 69 | 70 | 71 | def load_corpus_and_dict(corpus_path, id2word_path): 72 | print("[BLOCK] Loading corpus and dictionary files from %s and %s" % (data_path, id2word_path)) 73 | sys.stdout.flush() 74 | dictionary = Dictionary.load_from_text(id2word_path) 75 | 76 | print("[BLOCK] Loading corpus iterator") 77 | sys.stdout.flush() 78 | #mm = gensim.corpora.MmCorpus(corpus_path) 79 | corpus = MmCorpus(bz2.BZ2File(corpus_path)) # use this if you compressed the TFIDF output (recommended) 80 | 81 | return corpus, dictionary 82 | 83 | 84 | def topic_analysis(corpus, dictionary, models_path, technique): 85 | 86 | import uuid 87 | uuid = str(uuid.uuid4()) 88 | print("[BLOCK] Starting models for context") 89 | sys.stdout.flush() 90 | 91 | if technique == "all" or technique == "hdp": 92 | t1 = time() 93 | # HDP model 94 | model = HdpModel(corpus, id2word=dictionary) 95 | model.save("%s/hdp_%s" % (models_path, uuid)) 96 | del model 97 | t2 = time() 98 | print("[BLOCK] Training time for HDP model: %s" % (round(t2-t1, 2))) 99 | sys.stdout.flush() 100 | 101 | if technique == "all" or technique == "ldap": 102 | t1 = time() 103 | # Parallel LDA model 104 | model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=23, passes=20) 105 | model.save("%s/lda_parallel_%s" % (models_path, uuid)) 106 | del model 107 | t2 = time() 108 | print("[BLOCK] Training time for LDA multicore: %s" % (round(t2-t1, 2))) 109 | sys.stdout.flush() 110 | 111 | if technique == "all" or technique == "lsa": 112 | t1 = time() 113 | # LSA model 114 | model = LsiModel(corpus, id2word=dictionary, num_topics=400) 115 | model.save("%s/lsa_%s" % (models_path, uuid)) 116 | del model 117 | t2 = time() 118 | print("[BLOCK] Training time for LSA: %s" % (round(t2-t1, 2))) 119 | sys.stdout.flush() 120 | 121 | if technique == "all" or technique == "ldao": 
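        # Online (mini-batch) variational Bayes: with update_every > 0, gensim's LdaModel
        # updates the model every `chunksize` documents instead of only after a full pass
        # over the corpus, which the batch variant ("lda", update_every=0) below does.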
122 | t1 = time() 123 | # Online LDA model 124 | model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5) 125 | model.save("%s/lda_online_%s" % (models_path, uuid)) 126 | t2 = time() 127 | print("[BLOCK] Training time for LDA online: %s" % (round(t2-t1, 2))) 128 | sys.stdout.flush() 129 | 130 | if technique == "all" or technique == "lda": 131 | t1 = time() 132 | # Offline LDA model 133 | model = LdaModel(corpus, id2word=dictionary, num_topics=100, update_every=0, passes=20) 134 | model.save("%s/lda_offline_%s" % (models_path, uuid)) 135 | del model 136 | t2 = time() 137 | print("[BLOCK] Training time for LDA offline: %s" % (round(t2-t1, 2))) 138 | sys.stdout.flush() 139 | 140 | 141 | if __name__ == '__main__': 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be translated with word2id vector." 144 | " and clean up.", required=True) 145 | parser.add_argument('-m', '--models', type=str, help="Directory were the models will be stored.", required=True) 146 | parser.add_argument('-w', '--word_vector', type=str, help="Word2ID vector to be used for doc translation.", 147 | required=False, default=None) 148 | parser.add_argument('-c', '--corpus_path', type=str, help="Corpus iterator path [wiki_en_tfidf.mm.bz2].", 149 | required=False, default=None) 150 | parser.add_argument('-i', '--id_word', type=str, help="Id2Word vector path ['wiki_en_wordids.txt'].", 151 | required=False, default=None) 152 | parser.add_argument('-t', '--technique', type=str, help="Technique used for topic modeling. Available options all," 153 | "hierarchical dirichlet process (hdp), latent dirichlet allocation (lda), lda multicore (ldap)" 154 | "latent semantic anaylisis (lsa), lda online (ldao)", required=False, default="all") 155 | 156 | args = parser.parse_args() 157 | data_path = args.data 158 | models_path = args.models 159 | word2id_file = args.word_vector 160 | corpus_path = args.corpus_path 161 | id2word_path = args.id_word 162 | technique = args.technique 163 | 164 | begin = time() 165 | 166 | if word2id_file: 167 | w2Id = VectorManager.read_vector(word2id_file) 168 | 169 | if corpus_path and id2word_path: 170 | corpus, dictionary = load_corpus_and_dict(corpus_path, id2word_path) 171 | else: 172 | corpus, dictionary = get_corpus_and_dict(data_path) 173 | 174 | topic_analysis(corpus, dictionary, models_path, technique) 175 | 176 | end = time() 177 | print("Total processing time: %d seconds" % (end - begin)) 178 | -------------------------------------------------------------------------------- /src/lstm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/lstm/__init__.py -------------------------------------------------------------------------------- /src/lstm/clstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | To run: 3 | 4 | $ python lstm_frag.py --data_path=path/to/train.list 5 | 6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | sys.path.insert(0, "../src/") 13 | 14 | import inspect 15 | import time 16 | from utils.vector_manager import VectorManager 17 | from context.creator import TopicCreator 18 | # from context.create import get_lda_best_topic_words, get_lda_topic_embedding, get_lsa_topic_embeding 19 | 
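# The generator defined further below builds each CLSTM input step as the concatenation
# of three vectors of the same size: the current word embedding, a sentence-level
# context embedding and a paragraph-level one (hence the input width of
# 3 * embedding_size). The sentence and paragraph segments are reset whenever the
# end-of-sentence / end-of-paragraph marks are encountered in the data.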
import subprocess 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | from gensim.models import LsiModel, LdaModel 24 | 25 | flags = tf.flags 26 | logging = tf.logging 27 | 28 | flags.DEFINE_string( 29 | "model", "small", 30 | "A type of model. Possible options are: small, medium, large.") 31 | 32 | flags.DEFINE_string( 33 | "tasks", "all", 34 | "Tasks to be performed. Possible options are: all, train, test, valid") 35 | 36 | flags.DEFINE_string( 37 | "word2id_path", "../models/eos/word2id_", 38 | "A type of model. Possible options are: small, medium, large.") 39 | 40 | flags.DEFINE_string( 41 | "embeddings", "../models/eos/idWordVec_", 42 | "Embeddings path") 43 | 44 | flags.DEFINE_string("topic_model_path", "../models/topics/lda_parallel_bf64b098-c517-47c8-9267-1ce116e0033d", 45 | "Where the lda model is stored.") 46 | 47 | flags.DEFINE_string("dictionary_path", "../models/topics/gensim_wordids.txt.bz2", 48 | "Where the dictionary is stored.") 49 | 50 | flags.DEFINE_string("data_path", None, 51 | "Where the training/test data is stored.") 52 | flags.DEFINE_string("save_path", None, 53 | "Model output directory.") 54 | flags.DEFINE_bool("use_fp16", False, 55 | "Train using 16-bit floats instead of 32bit floats") 56 | 57 | flags.DEFINE_string("context", "lda", 58 | "Type of context to be used. Possible values are, lda, lda_mean, lsi, arithmetic") 59 | 60 | FLAGS = flags.FLAGS 61 | 62 | 63 | def data_type(): 64 | return tf.float16 if FLAGS.use_fp16 else tf.float32 65 | 66 | 67 | def get_context(topic_creator, segment): 68 | if FLAGS.context == "lda": 69 | return topic_creator.get_lda_best_topic_words(segment) 70 | if FLAGS.context == "lda_mean": 71 | return topic_creator.get_lda_topic_embedding(segment) 72 | if FLAGS.context == "lsi": 73 | return topic_creator.get_lsa_topic_embeding(segment) 74 | if FLAGS.context == "arithmetic": 75 | return topic_creator.average_embeddings(segment) 76 | 77 | 78 | def generate_arrays_from_list(name, topic_creator, files, embeddings, num_steps=35, batch_size=20, embedding_size=200): 79 | eos_mark = [id for id, w, vec in embeddings if w == ""][0] 80 | eop_mark = [id for id, w, vec in embeddings if w == ""][0] 81 | unknown_embedding = [vec for id, w, vec in embeddings if w == ""][0] 82 | debug = False 83 | # print("EOS mark: %s, EOP mark: %s" % (eos_mark, eop_mark)) 84 | while 1: 85 | for file_name in files: 86 | raw_list = VectorManager.parse_into_list(open(file_name).read()) 87 | 88 | n_words = len(raw_list) 89 | batch_len = n_words // batch_size 90 | data = np.reshape(raw_list[0:batch_size*batch_len], [batch_size, batch_len]) 91 | sentSegments = [list() for _ in range(batch_size)] 92 | parSegments = [list() for _ in range(batch_size)] 93 | 94 | 95 | for i in range(0, n_words - num_steps, 1): 96 | 97 | x = data[0:batch_size, i * num_steps:(i + 1) * num_steps] 98 | y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1] 99 | 100 | if len(x[0]) < num_steps or len(y[0]) < num_steps: 101 | break 102 | 103 | 104 | emb_x = [[embeddings[int(elem)][2] for elem in l] for l in x] 105 | emb_x = np.reshape(emb_x, newshape=(batch_size, num_steps, embedding_size)) 106 | 107 | final_x = np.zeros(shape=(batch_size, num_steps, len(embeddings[0][2])*3)) 108 | for batch in range(0, batch_size): 109 | for step in range(0, num_steps): 110 | if debug: 111 | print("%s == %s ? %s [eos]\n%s == %s ? 
%s[eop]" % (int(x[batch][step]), eos_mark, 112 | int(x[batch][step]) == eos_mark, 113 | int(x[batch][step]), eop_mark, 114 | int(x[batch][step]) == eop_mark)) 115 | if int(x[batch][step]) == eos_mark: 116 | sentSegments[batch] = [] 117 | else: 118 | sentSegments[batch].append(x[batch][step]) 119 | if int(x[batch][step]) == eop_mark: 120 | parSegments[batch] = [] 121 | else: 122 | parSegments[batch].append(x[batch][step]) 123 | 124 | sentTopic = unknown_embedding 125 | parTopic = unknown_embedding 126 | if sentSegments: 127 | sentTopic = get_context(topic_creator, sentSegments[batch]) 128 | 129 | if parSegments: 130 | if sentSegments[batch] == parSegments[batch]: 131 | parTopic = sentTopic 132 | else: 133 | parTopic = get_context(topic_creator, parSegments[batch]) 134 | 135 | final_x[batch][step] = np.hstack((emb_x[batch][step], sentTopic, parTopic)) 136 | 137 | 138 | 139 | if debug: 140 | print("Batch size %s\nNum steps %s\nEmbedding size %s" % (batch_size, num_steps, embedding_size 141 | )) 142 | print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s" % (len(x), len(x[0]), len(x[0][0]))) 143 | print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0]))) 144 | 145 | 146 | 147 | y = np.reshape(y, newshape=(batch_size, num_steps)) 148 | 149 | yield final_x, y 150 | 151 | class WPModel(object): 152 | """Word Prediction model.""" 153 | 154 | def __init__(self, is_training, config): 155 | 156 | self.config = config 157 | batch_size = config.batch_size 158 | num_steps = config.num_steps 159 | size = config.hidden_size 160 | vocab_size = config.vocab_size 161 | embedding_size = config.embedding_size 162 | 163 | def lstm_cell(): 164 | # With the latest TensorFlow source code (as of Mar 27, 2017), 165 | # the BasicLSTMCell will need a reuse parameter which is unfortunately not 166 | # defined in TensorFlow 1.0. 
To maintain backwards compatibility, we add 167 | # an argument check here: 168 | # if 'reuse' in inspect.getargspec( 169 | # tf.contrib.rnn.BasicLSTMCell.__init__).args: 170 | # return tf.contrib.rnn.BasicLSTMCell( 171 | # size, forget_bias=0.0, state_is_tuple=True, 172 | # reuse=tf.get_variable_scope().reuse) 173 | # else: 174 | return tf.contrib.rnn.BasicLSTMCell( 175 | size, forget_bias=0.0, state_is_tuple=True) 176 | 177 | attn_cell = lstm_cell 178 | if is_training and config.keep_prob < 1: 179 | def attn_cell(): 180 | return tf.contrib.rnn.DropoutWrapper( 181 | lstm_cell(), output_keep_prob=config.keep_prob) 182 | 183 | cell = tf.contrib.rnn.MultiRNNCell( 184 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 185 | 186 | self._initial_state = cell.zero_state(batch_size, data_type()) 187 | 188 | with tf.device("/cpu:0"): 189 | 190 | self.inputs = tf.placeholder(dtype=data_type(), shape=(batch_size, num_steps, embedding_size*3)) 191 | self.targets = tf.placeholder(dtype=tf.int32, shape=(batch_size, num_steps)) 192 | 193 | if is_training and config.keep_prob < 1: 194 | # Dropout allows to use the net for train and testing 195 | # See: https://stackoverflow.com/questions/34597316/why-input-is-scaled-in-tf-nn-dropout-in-tensorflow 196 | # and: http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf 197 | inputs = tf.nn.dropout(self.inputs, config.keep_prob) 198 | else: 199 | inputs = self.inputs 200 | 201 | inputs = tf.unstack(inputs, num=num_steps, axis=1) 202 | 203 | outputs, state = tf.contrib.rnn.static_rnn( 204 | cell, inputs, initial_state=self._initial_state) 205 | 206 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 207 | softmax_w = tf.get_variable( 208 | "softmax_w", [size, vocab_size], dtype=data_type()) 209 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 210 | logits = tf.matmul(output, softmax_w) + softmax_b 211 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 212 | [logits], 213 | [tf.reshape(self.targets, [-1])], 214 | [tf.ones([batch_size * num_steps], dtype=data_type())]) 215 | self._cost = cost = tf.reduce_sum(loss) / batch_size 216 | self._final_state = state 217 | 218 | if not is_training: 219 | return 220 | 221 | self._lr = tf.Variable(0.0, trainable=False) 222 | tvars = tf.trainable_variables() 223 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 224 | config.max_grad_norm) 225 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 226 | self._train_op = optimizer.apply_gradients( 227 | zip(grads, tvars), 228 | global_step=tf.contrib.framework.get_or_create_global_step()) 229 | 230 | self._new_lr = tf.placeholder( 231 | tf.float32, shape=[], name="new_learning_rate") 232 | self._lr_update = tf.assign(self._lr, self._new_lr) 233 | 234 | def assign_lr(self, session, lr_value): 235 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 236 | 237 | @property 238 | def input(self): 239 | return self._input 240 | 241 | @property 242 | def initial_state(self): 243 | return self._initial_state 244 | 245 | @property 246 | def cost(self): 247 | return self._cost 248 | 249 | @property 250 | def final_state(self): 251 | return self._final_state 252 | 253 | @property 254 | def lr(self): 255 | return self._lr 256 | 257 | @property 258 | def train_op(self): 259 | return self._train_op 260 | 261 | 262 | class SmallConfig(object): 263 | """Small config.""" 264 | init_scale = 0.1 265 | learning_rate = 1.0 266 | max_grad_norm = 5 267 | num_layers = 1 268 | num_steps = 20 
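The generator above builds each LSTM input by concatenating the current word embedding with a sentence-level and a paragraph-level context vector, which is why WPModel's input placeholder is embedding_size*3 wide. The sketch below reproduces only that concatenation step; the toy sizes and the random vectors standing in for the embeddings and TopicCreator outputs are assumptions for illustration.

import numpy as np

embedding_size = 4            # toy size; the configs here use 200 or 1000
batch_size, num_steps = 2, 3

emb_x = np.random.rand(batch_size, num_steps, embedding_size)  # word vectors
sent_topic = np.random.rand(embedding_size)                    # sentence-level context
par_topic = np.random.rand(embedding_size)                     # paragraph-level context

final_x = np.zeros((batch_size, num_steps, embedding_size * 3))
for b in range(batch_size):
    for t in range(num_steps):
        # word vector + sentence topic + paragraph topic
        final_x[b][t] = np.hstack((emb_x[b][t], sent_topic, par_topic))

print(final_x.shape)  # (2, 3, 12), i.e. [batch_size, num_steps, embedding_size * 3]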
269 | hidden_size = 200 270 | max_epoch = 4 271 | max_max_epoch = 13 272 | keep_prob = 1.0 273 | lr_decay = 0.5 274 | batch_size = 20 275 | vocab_size = 126930 276 | embedding_size = 200 277 | epoch_size = 1 278 | 279 | class MediumConfig(object): 280 | """Medium config.""" 281 | init_scale = 0.05 282 | learning_rate = 1.0 283 | max_grad_norm = 5 284 | num_layers = 1 285 | num_steps = 35 286 | hidden_size = 512 287 | max_epoch = 6 288 | max_max_epoch = 39 289 | keep_prob = 0.5 290 | lr_decay = 0.8 291 | batch_size = 20 292 | vocab_size = 126930 293 | embedding_size = 200 294 | epoch_size = 1 295 | 296 | class LargeConfig(object): 297 | """Large config.""" 298 | init_scale = 0.04 299 | learning_rate = 1.0 300 | max_grad_norm = 10 301 | num_layers = 1 302 | num_steps = 35 303 | hidden_size = 1024 304 | max_epoch = 14 305 | max_max_epoch = 55 306 | keep_prob = 0.35 307 | lr_decay = 1 / 1.15 308 | batch_size = 20 309 | vocab_size = 126930 310 | embedding_size = 1000 311 | epoch_size = 1 312 | 313 | class TestConfig(object): 314 | """Tiny config, for testing.""" 315 | init_scale = 0.1 316 | learning_rate = 1.0 317 | max_grad_norm = 1 318 | num_layers = 1 319 | num_steps = 2 320 | hidden_size = 2 321 | max_epoch = 1 322 | max_max_epoch = 1 323 | keep_prob = 1.0 324 | lr_decay = 0.5 325 | batch_size = 10 326 | vocab_size = 126930 327 | embedding_size = 200 328 | epoch_size = 1 329 | 330 | 331 | def run_epoch(session, generator, model, eval_op=None, verbose=False): 332 | """Runs the model on the given data.""" 333 | start_time = time.time() 334 | costs = 0.0 335 | iters = 0 336 | config = model.config 337 | state = session.run(model.initial_state) 338 | 339 | fetches = { 340 | "cost": model.cost, 341 | "final_state": model.final_state, 342 | } 343 | if eval_op is not None: 344 | fetches["eval_op"] = eval_op 345 | 346 | print("Epoch size starting training %s" % config.epoch_size) 347 | sys.stdout.flush() 348 | for step in range(config.epoch_size): 349 | x, y = next(generator) 350 | feed_dict = {} 351 | for i, (c, h) in enumerate(model.initial_state): 352 | feed_dict[c] = state[i].c 353 | feed_dict[h] = state[i].h 354 | # feed_dict["embeddings"] = embeddings 355 | feed_dict[model.inputs] = x 356 | feed_dict[model.targets] = y 357 | 358 | vals = session.run(fetches, feed_dict) 359 | cost = vals["cost"] 360 | state = vals["final_state"] 361 | 362 | costs += cost 363 | iters += config.num_steps 364 | 365 | if verbose and step % 100 == 0: 366 | print("%.3f perplexity: %.3f speed: %.0f wps" % 367 | (step * 1.0 / config.epoch_size, np.exp(costs / iters), 368 | iters * config.batch_size / (time.time() - start_time))) 369 | sys.stdout.flush() 370 | 371 | return np.exp(costs / iters) 372 | 373 | 374 | def get_config(): 375 | if FLAGS.model == "small": 376 | return SmallConfig() 377 | elif FLAGS.model == "medium": 378 | return MediumConfig() 379 | elif FLAGS.model == "large": 380 | return LargeConfig() 381 | elif FLAGS.model == "test": 382 | return TestConfig() 383 | else: 384 | raise ValueError("Invalid model: %s", FLAGS.model) 385 | 386 | def get_epoch_size(files, config): 387 | total = 0 388 | for file in files: 389 | file_words = subprocess.check_output(['wc', '-w', file]) 390 | number = file_words.split()[0] 391 | words = int(number) 392 | total += words - (words % (config.batch_size * config.num_steps)) 393 | print("Total words: %s, Batch size: %s, Num steps: %s" % (total, config.batch_size, config.num_steps)) 394 | sys.stdout.flush() 395 | epoch_size = ((total // config.batch_size) - 1) // 
config.num_steps 396 | 397 | return epoch_size 398 | 399 | def main(_): 400 | if not FLAGS.data_path: 401 | raise ValueError("Must set --data_path to wiki data directory list") 402 | 403 | vocab_size = 126930 404 | 405 | config = get_config() 406 | config.vocab_size = vocab_size 407 | 408 | valid_config = get_config() 409 | config.vocab_size = vocab_size 410 | 411 | 412 | eval_config = get_config() 413 | eval_config.batch_size = 1 414 | eval_config.num_steps = 1 415 | eval_config.vocab_size = vocab_size 416 | 417 | embeddings = VectorManager.read_vector("%s%s.pklz" % (FLAGS.embeddings, config.embedding_size)) 418 | 419 | # Load LDA or LSI model for topic creator 420 | if "lda" in FLAGS.context: 421 | model = LdaModel.load(FLAGS.topic_model_path) 422 | elif "lsi" in FLAGS.context: 423 | model = LsiModel.load(FLAGS.topic_model_path) 424 | else: 425 | model = None 426 | 427 | topic_creator = TopicCreator(FLAGS.dictionary_path, "%s%s.pklz" % (FLAGS.word2id_path, config.embedding_size), 428 | embeddings, model) 429 | files = open(FLAGS.data_path).read().split() 430 | 431 | training_list = files[0:int(0.8 * len(files))] 432 | validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))] 433 | testing_list = files[int(0.9 * len(files)):len(files)] 434 | 435 | print("Lists sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" % 436 | (len(training_list), len(validation_list), len(testing_list))) 437 | 438 | config.epoch_size = get_epoch_size(training_list, config) 439 | valid_config.epoch_size = get_epoch_size(validation_list, valid_config) 440 | eval_config.epoch_size = get_epoch_size(testing_list, eval_config) 441 | 442 | gen_train = generate_arrays_from_list("Train", topic_creator, training_list, embeddings, batch_size=config.batch_size, 443 | embedding_size=config.embedding_size, num_steps=config.num_steps) 444 | 445 | gen_valid = generate_arrays_from_list("Validation", topic_creator, validation_list, embeddings, batch_size=valid_config.batch_size, 446 | embedding_size=valid_config.embedding_size, num_steps=valid_config.num_steps) 447 | 448 | gen_test = generate_arrays_from_list("Test", topic_creator, testing_list, embeddings, batch_size=eval_config.batch_size, 449 | embedding_size=eval_config.embedding_size, num_steps=eval_config.num_steps) 450 | 451 | print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" % 452 | (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size)) 453 | sys.stdout.flush() 454 | with tf.Graph().as_default(): 455 | # Args: [minval, maxval] 456 | initializer = tf.random_uniform_initializer(-config.init_scale, 457 | config.init_scale) 458 | 459 | with tf.name_scope("Train"): 460 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 461 | m = WPModel(is_training=True, config=config) 462 | tf.summary.scalar("Training Loss", m.cost) 463 | tf.summary.scalar("Learning Rate", m.lr) 464 | 465 | with tf.name_scope("Valid"): 466 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 467 | mvalid = WPModel(is_training=False, config=valid_config) 468 | tf.summary.scalar("Validation Loss", mvalid.cost) 469 | 470 | with tf.name_scope("Test"): 471 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 472 | mtest = WPModel(is_training=False, config=eval_config) 473 | 474 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 475 | with sv.managed_session() as session: 476 | for i in range(config.max_max_epoch): 477 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 478 | 
m.assign_lr(session, config.learning_rate * lr_decay) 479 | 480 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 481 | train_perplexity = run_epoch(session, generator=gen_train, model=m, eval_op=m.train_op, 482 | verbose=True) 483 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 484 | valid_perplexity = run_epoch(session, generator=gen_valid, model=mvalid) 485 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 486 | 487 | test_perplexity = run_epoch(session, generator=gen_test, model=mtest) 488 | print("Test Perplexity: %.3f" % test_perplexity) 489 | 490 | if FLAGS.save_path: 491 | print("Saving model to %s." % FLAGS.save_path) 492 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 493 | 494 | 495 | if __name__ == "__main__": 496 | tf.app.run() 497 | -------------------------------------------------------------------------------- /src/lstm/input_pipeline.py: -------------------------------------------------------------------------------- 1 | # TensorFlow Input Pipelines for Large Data Sets 2 | # ischlag.github.io 3 | # TensorFlow 0.11, 07.11.2016 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | import threading 8 | 9 | # Generating some simple data 10 | r = np.arange(0.0,100003.0) 11 | raw_data = np.dstack((r,r,r,r))[0] 12 | raw_target = np.array([[1,0,0]] * 100003) 13 | 14 | # are used to feed data into our queue 15 | queue_input_data = tf.placeholder(tf.float32, shape=[20, 4]) 16 | queue_input_target = tf.placeholder(tf.float32, shape=[20, 3]) 17 | 18 | queue = tf.FIFOQueue(capacity=50, dtypes=[tf.float32, tf.float32], shapes=[[4], [3]]) 19 | 20 | enqueue_op = queue.enqueue_many([queue_input_data, queue_input_target]) 21 | dequeue_op = queue.dequeue() 22 | 23 | # tensorflow recommendation: 24 | # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size 25 | data_batch, target_batch = tf.train.batch(dequeue_op, batch_size=15, capacity=40) 26 | # use this to shuffle batches: 27 | # data_batch, target_batch = tf.train.shuffle_batch(dequeue_op, batch_size=15, capacity=40, min_after_dequeue=5) 28 | 29 | def enqueue(sess): 30 | """ Iterates over our data puts small junks into our queue.""" 31 | under = 0 32 | max = len(raw_data) 33 | while True: 34 | print("starting to write into queue") 35 | upper = under + 20 36 | print("try to enqueue ", under, " to ", upper) 37 | if upper <= max: 38 | curr_data = raw_data[under:upper] 39 | curr_target = raw_target[under:upper] 40 | under = upper 41 | else: 42 | rest = upper - max 43 | curr_data = np.concatenate((raw_data[under:max], raw_data[0:rest])) 44 | curr_target = np.concatenate((raw_target[under:max], raw_target[0:rest])) 45 | under = rest 46 | 47 | sess.run(enqueue_op, feed_dict={queue_input_data: curr_data, 48 | queue_input_target: curr_target}) 49 | print("added to the queue") 50 | print("finished enqueueing") 51 | 52 | # start the threads for our FIFOQueue and batch 53 | sess = tf.Session() 54 | enqueue_thread = threading.Thread(target=enqueue, args=[sess]) 55 | enqueue_thread.isDaemon() 56 | enqueue_thread.start() 57 | 58 | coord = tf.train.Coordinator() 59 | threads = tf.train.start_queue_runners(coord=coord, sess=sess) 60 | 61 | # Fetch the data from the pipeline and put it where it belongs (into your model) 62 | for i in range(5): 63 | run_options = tf.RunOptions(timeout_in_ms=4000) 64 | curr_data_batch, curr_target_batch = sess.run([data_batch, target_batch], options=run_options) 65 | print(curr_data_batch) 66 | 67 | # 
shutdown everything to avoid zombies 68 | sess.run(queue.close(cancel_pending_enqueues=True)) 69 | coord.request_stop() 70 | coord.join(threads) 71 | sess.close() 72 | Blog -------------------------------------------------------------------------------- /src/lstm/lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | To run: 3 | 4 | $ python lstm_frag.py --data_path=path/to/train.list 5 | 6 | """ 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import sys 12 | sys.path.insert(0, "../src/") 13 | 14 | from utils.vector_manager import VectorManager 15 | 16 | import numpy as np 17 | import tensorflow as tf 18 | 19 | import subprocess 20 | import inspect 21 | import time 22 | 23 | 24 | flags = tf.flags 25 | logging = tf.logging 26 | 27 | flags.DEFINE_string( 28 | "model", "small", 29 | "A type of model. Possible options are: small, medium, large.") 30 | 31 | flags.DEFINE_string( 32 | "tasks", "all", 33 | "Tasks to be performed. Possible options are: all, train, test, valid") 34 | 35 | flags.DEFINE_string( 36 | "word_to_id_path", "../models/eos/word2id_1000.pklz", 37 | "A type of model. Possible options are: small, medium, large.") 38 | 39 | flags.DEFINE_string( 40 | "embeddings", "../models/eos/idWordVec_", 41 | "Embeddings path") 42 | 43 | flags.DEFINE_string("data_path", None, 44 | "Where the training/test data is stored.") 45 | flags.DEFINE_string("save_path", None, 46 | "Model output directory.") 47 | flags.DEFINE_bool("use_fp16", False, 48 | "Train using 16-bit floats instead of 32bit floats") 49 | 50 | FLAGS = flags.FLAGS 51 | 52 | 53 | def data_type(): 54 | return tf.float16 if FLAGS.use_fp16 else tf.float32 55 | 56 | 57 | def get_vocab_size(): 58 | word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path) 59 | size = len(word_to_id) 60 | print("Vocabulary size: %s" % size) 61 | return size 62 | 63 | 64 | def generate_arrays_from_list(name, files, embeddings, num_steps=35, batch_size=20, embedding_size=200): 65 | 66 | debug = False 67 | while 1: 68 | for file_name in files: 69 | print("Generating from file %s for %s" % (file_name, name)) 70 | raw_list = VectorManager.parse_into_list(open(file_name).read()) 71 | 72 | n_words = len(raw_list) 73 | batch_len = n_words // batch_size 74 | data = np.reshape(raw_list[0:batch_size*batch_len], [batch_size, batch_len]) 75 | 76 | for i in range(0, n_words - num_steps, 1): 77 | 78 | x = data[0:batch_size, i * num_steps:(i + 1) * num_steps] 79 | x = [[embeddings[int(elem)][2] for elem in l] for l in x] 80 | y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1] 81 | 82 | 83 | if len(x[0]) < num_steps or len(y[0]) < num_steps: 84 | break 85 | if debug: 86 | print("Batch size %s\nNum steps %s\nEmbedding size %s" % (batch_size, num_steps, embedding_size 87 | )) 88 | print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s" % (len(x), len(x[0]), len(x[0][0]))) 89 | print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0]))) 90 | x = np.reshape(x, newshape=(batch_size, num_steps, embedding_size)) 91 | 92 | y = np.reshape(y, newshape=(batch_size, num_steps)) 93 | 94 | yield x, y 95 | 96 | class WPModel(object): 97 | """Word Prediction model.""" 98 | 99 | def __init__(self, is_training, config): 100 | 101 | self.config = config 102 | batch_size = config.batch_size 103 | num_steps = config.num_steps 104 | size = config.hidden_size 105 | vocab_size = config.vocab_size 106 | embedding_size = config.embedding_size 107 | 108 | def 
lstm_cell(): 109 | # With the latest TensorFlow source code (as of Mar 27, 2017), 110 | # the BasicLSTMCell will need a reuse parameter which is unfortunately not 111 | # defined in TensorFlow 1.0. To maintain backwards compatibility, we add 112 | # an argument check here: 113 | # if 'reuse' in inspect.getargspec( 114 | # tf.contrib.rnn.BasicLSTMCell.__init__).args: 115 | # return tf.contrib.rnn.BasicLSTMCell( 116 | # size, forget_bias=0.0, state_is_tuple=True, 117 | # reuse=tf.get_variable_scope().reuse) 118 | # else: 119 | return tf.contrib.rnn.BasicLSTMCell( 120 | size, forget_bias=0.0, state_is_tuple=True) 121 | 122 | attn_cell = lstm_cell 123 | if is_training and config.keep_prob < 1: 124 | def attn_cell(): 125 | return tf.contrib.rnn.DropoutWrapper( 126 | lstm_cell(), output_keep_prob=config.keep_prob) 127 | 128 | cell = tf.contrib.rnn.MultiRNNCell( 129 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 130 | 131 | self._initial_state = cell.zero_state(batch_size, data_type()) 132 | 133 | with tf.device("/cpu:0"): 134 | self.inputs = tf.placeholder(dtype=data_type(), shape=(batch_size, num_steps, embedding_size)) 135 | self.targets = tf.placeholder(dtype=tf.int32, shape=(batch_size, num_steps)) 136 | 137 | if is_training and config.keep_prob < 1: 138 | # Dropout allows to use the net for train and testing 139 | # See: https://stackoverflow.com/questions/34597316/why-input-is-scaled-in-tf-nn-dropout-in-tensorflow 140 | # and: http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf 141 | inputs = tf.nn.dropout(self.inputs, config.keep_prob) 142 | else: 143 | inputs = self.inputs 144 | 145 | inputs = tf.unstack(inputs, num=num_steps, axis=1) 146 | 147 | outputs, state = tf.contrib.rnn.static_rnn( 148 | cell, inputs, initial_state=self._initial_state) 149 | 150 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 151 | softmax_w = tf.get_variable( 152 | "softmax_w", [size, vocab_size], dtype=data_type()) 153 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 154 | logits = tf.matmul(output, softmax_w) + softmax_b 155 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 156 | [logits], 157 | [tf.reshape(self.targets, [-1])], 158 | [tf.ones([batch_size * num_steps], dtype=data_type())]) 159 | self._cost = cost = tf.reduce_sum(loss) / batch_size 160 | self._final_state = state 161 | 162 | if not is_training: 163 | return 164 | 165 | self._lr = tf.Variable(0.0, trainable=False) 166 | tvars = tf.trainable_variables() 167 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 168 | config.max_grad_norm) 169 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 170 | self._train_op = optimizer.apply_gradients( 171 | zip(grads, tvars), 172 | global_step=tf.contrib.framework.get_or_create_global_step()) 173 | 174 | self._new_lr = tf.placeholder( 175 | tf.float32, shape=[], name="new_learning_rate") 176 | self._lr_update = tf.assign(self._lr, self._new_lr) 177 | 178 | def assign_lr(self, session, lr_value): 179 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 180 | 181 | @property 182 | def input(self): 183 | return self._input 184 | 185 | @property 186 | def initial_state(self): 187 | return self._initial_state 188 | 189 | @property 190 | def cost(self): 191 | return self._cost 192 | 193 | @property 194 | def final_state(self): 195 | return self._final_state 196 | 197 | @property 198 | def lr(self): 199 | return self._lr 200 | 201 | @property 202 | def train_op(self): 203 | return 
self._train_op 204 | 205 | 206 | class SmallConfig(object): 207 | """Small config.""" 208 | init_scale = 0.1 209 | learning_rate = 1.0 210 | max_grad_norm = 5 211 | num_layers = 1 212 | num_steps = 20 213 | hidden_size = 200 214 | max_epoch = 2 215 | max_max_epoch = 13 216 | keep_prob = 1.0 217 | lr_decay = 0.5 218 | batch_size = 20 219 | vocab_size = 126930 220 | embedding_size = 200 221 | epoch_size = 1 222 | 223 | class MediumConfig(object): 224 | """Medium config.""" 225 | init_scale = 0.05 226 | learning_rate = 1.0 227 | max_grad_norm = 5 228 | num_layers = 1 229 | num_steps = 35 230 | hidden_size = 512 231 | max_epoch = 6 232 | max_max_epoch = 39 233 | keep_prob = 0.5 234 | lr_decay = 0.8 235 | batch_size = 20 236 | vocab_size = 126930 237 | embedding_size = 200 238 | epoch_size = 1 239 | 240 | class LargeConfig(object): 241 | """Large config.""" 242 | init_scale = 0.04 243 | learning_rate = 1.0 244 | max_grad_norm = 10 245 | num_layers = 1 246 | num_steps = 35 247 | hidden_size = 1024 248 | max_epoch = 14 249 | max_max_epoch = 55 250 | keep_prob = 0.35 251 | lr_decay = 1 / 1.15 252 | batch_size = 20 253 | vocab_size = 126930 254 | embedding_size = 1000 255 | epoch_size = 1 256 | 257 | class TestConfig(object): 258 | """Tiny config, for testing.""" 259 | init_scale = 0.1 260 | learning_rate = 1.0 261 | max_grad_norm = 1 262 | num_layers = 1 263 | num_steps = 2 264 | hidden_size = 2 265 | max_epoch = 1 266 | max_max_epoch = 1 267 | keep_prob = 1.0 268 | lr_decay = 0.5 269 | batch_size = 10 270 | vocab_size = 126930 271 | embedding_size = 200 272 | epoch_size = 1 273 | 274 | 275 | def run_epoch(session, generator, model, eval_op=None, verbose=False): 276 | """Runs the model on the given data.""" 277 | start_time = time.time() 278 | costs = 0.0 279 | iters = 0 280 | config = model.config 281 | state = session.run(model.initial_state) 282 | 283 | fetches = { 284 | "cost": model.cost, 285 | "final_state": model.final_state, 286 | } 287 | if eval_op is not None: 288 | fetches["eval_op"] = eval_op 289 | 290 | print("Epoch size starting training %s" % config.epoch_size) 291 | for step in range(config.epoch_size): 292 | x, y = next(generator) 293 | feed_dict = {} 294 | for i, (c, h) in enumerate(model.initial_state): 295 | feed_dict[c] = state[i].c 296 | feed_dict[h] = state[i].h 297 | feed_dict[model.inputs] = x 298 | feed_dict[model.targets] = y 299 | 300 | vals = session.run(fetches, feed_dict) 301 | cost = vals["cost"] 302 | state = vals["final_state"] 303 | 304 | costs += cost 305 | iters += config.num_steps 306 | 307 | # if verbose and step % 100 == 0: 308 | print("%.3f perplexity: %.3f speed: %.0f wps" % 309 | (step * 1.0 / config.epoch_size, np.exp(costs / iters), 310 | iters * config.batch_size / (time.time() - start_time))) 311 | sys.stdout.flush() 312 | 313 | return np.exp(costs / iters) 314 | 315 | 316 | def get_config(): 317 | if FLAGS.model == "small": 318 | return SmallConfig() 319 | elif FLAGS.model == "medium": 320 | return MediumConfig() 321 | elif FLAGS.model == "large": 322 | return LargeConfig() 323 | elif FLAGS.model == "test": 324 | return TestConfig() 325 | else: 326 | raise ValueError("Invalid model: %s", FLAGS.model) 327 | 328 | def get_epoch_size(files, config): 329 | total = 0 330 | for file in files: 331 | file_words = subprocess.check_output(['wc', '-w', file]) 332 | number = file_words.split()[0] 333 | words = int(number) 334 | total += words - (words % (config.batch_size * config.num_steps)) 335 | print("Total words: %s, Batch size: %s, Num steps: %s" % (total, 
config.batch_size, config.num_steps)) 336 | sys.stdout.flush() 337 | epoch_size = ((total // config.batch_size) - 1) // config.num_steps 338 | 339 | return epoch_size 340 | 341 | def main(_): 342 | if not FLAGS.data_path: 343 | raise ValueError("Must set --data_path to wiki data directory list") 344 | 345 | vocab_size = 126930 346 | 347 | config = get_config() 348 | config.vocab_size = vocab_size 349 | 350 | valid_config = get_config() 351 | config.vocab_size = vocab_size 352 | 353 | 354 | eval_config = get_config() 355 | eval_config.batch_size = 1 356 | eval_config.num_steps = 1 357 | eval_config.vocab_size = vocab_size 358 | 359 | print("Embeddings path: {}".format(FLAGS.embeddings)) 360 | embeddings = VectorManager.read_vector(FLAGS.embeddings) 361 | files = open(FLAGS.data_path).read().split() 362 | 363 | training_list = files[0:int(0.8 * len(files))] 364 | validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))] 365 | testing_list = files[int(0.9 * len(files)):len(files)] 366 | 367 | config.epoch_size = get_epoch_size(training_list, config) 368 | valid_config.epoch_size = get_epoch_size(validation_list, valid_config) 369 | eval_config.epoch_size = get_epoch_size(testing_list, eval_config) 370 | 371 | gen_train = generate_arrays_from_list("Train", training_list, embeddings, batch_size=config.batch_size, 372 | embedding_size=config.embedding_size, num_steps=config.num_steps) 373 | gen_valid = generate_arrays_from_list("Validation", validation_list, embeddings, batch_size=valid_config.batch_size, 374 | embedding_size=valid_config.embedding_size, num_steps=valid_config.num_steps) 375 | gen_test = generate_arrays_from_list("Test", testing_list, embeddings, batch_size=eval_config.batch_size, 376 | embedding_size=eval_config.embedding_size, num_steps=eval_config.num_steps) 377 | 378 | print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s" % 379 | (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size)) 380 | sys.stdout.flush() 381 | with tf.Graph().as_default(): 382 | # Args: [minval, maxval] 383 | initializer = tf.random_uniform_initializer(-config.init_scale, 384 | config.init_scale) 385 | 386 | with tf.name_scope("Train"): 387 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 388 | m = WPModel(is_training=True, config=config) 389 | tf.summary.scalar("Training Loss", m.cost) 390 | tf.summary.scalar("Learning Rate", m.lr) 391 | 392 | with tf.name_scope("Valid"): 393 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 394 | mvalid = WPModel(is_training=False, config=valid_config) 395 | tf.summary.scalar("Validation Loss", mvalid.cost) 396 | 397 | with tf.name_scope("Test"): 398 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 399 | mtest = WPModel(is_training=False, config=eval_config) 400 | 401 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 402 | with sv.managed_session() as session: 403 | for i in range(config.max_max_epoch): 404 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 405 | m.assign_lr(session, config.learning_rate * lr_decay) 406 | 407 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 408 | train_perplexity = run_epoch(session, generator=gen_train, model=m, eval_op=m.train_op, 409 | verbose=True) 410 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 411 | valid_perplexity = run_epoch(session, generator=gen_valid, model=mvalid) 412 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 413 | 414 
| test_perplexity = run_epoch(session, generator=gen_test, model=mtest) 415 | print("Test Perplexity: %.3f" % test_perplexity) 416 | 417 | if FLAGS.save_path: 418 | print("Saving model to %s." % FLAGS.save_path) 419 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 420 | 421 | 422 | if __name__ == "__main__": 423 | tf.app.run() 424 | -------------------------------------------------------------------------------- /src/lstm/lstm_wp.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Example / benchmark for building a PTB LSTM model. 17 | 18 | Trains the model described in: 19 | (Zaremba, et. al.) Recurrent Neural Network Regularization 20 | http://arxiv.org/abs/1409.2329 21 | 22 | There are 3 supported model configurations: 23 | =========================================== 24 | | config | epochs | train | valid | test 25 | =========================================== 26 | | small | 13 | 37.99 | 121.39 | 115.91 27 | | medium | 39 | 48.45 | 86.16 | 82.07 28 | | large | 55 | 37.87 | 82.62 | 78.29 29 | The exact results may vary depending on the random initialization. 30 | 31 | The hyperparameters used in the model: 32 | - init_scale - the initial scale of the weights 33 | - learning_rate - the initial value of the learning rate 34 | - max_grad_norm - the maximum permissible norm of the gradient 35 | - num_layers - the number of LSTM layers 36 | - num_steps - the number of unrolled steps of LSTM 37 | - hidden_size - the number of LSTM units 38 | - max_epoch - the number of epochs trained with the initial learning rate 39 | - max_max_epoch - the total number of epochs for training 40 | - keep_prob - the probability of keeping weights in the dropout layer 41 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch" 42 | - batch_size - the batch size 43 | 44 | The data required for this example is in the data/ dir of the 45 | PTB dataset from Tomas Mikolov's webpage: 46 | 47 | $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 48 | $ tar xvf simple-examples.tgz 49 | 50 | To run: 51 | 52 | $ python ptb_word_lm.py --data_path=simple-examples/data/ 53 | 54 | """ 55 | from __future__ import absolute_import 56 | from __future__ import division 57 | from __future__ import print_function 58 | 59 | import sys 60 | sys.path.insert(0, "../src/") 61 | 62 | import inspect 63 | import time 64 | 65 | import numpy as np 66 | import tensorflow as tf 67 | 68 | import reader_wp as reader 69 | 70 | flags = tf.flags 71 | logging = tf.logging 72 | 73 | flags.DEFINE_string( 74 | "model", "small", 75 | "A type of model. Possible options are: small, medium, large.") 76 | 77 | flags.DEFINE_string( 78 | "tasks", "all", 79 | "Tasks to be performed. 
Possible options are: all, train, test, valid") 80 | 81 | flags.DEFINE_string( 82 | "word_to_id_path", "../models/eos/word2id_1000.pklz", 83 | "A type of model. Possible options are: small, medium, large.") 84 | 85 | flags.DEFINE_string("data_path", None, 86 | "Where the training/test data is stored.") 87 | flags.DEFINE_string("save_path", None, 88 | "Model output directory.") 89 | flags.DEFINE_bool("use_fp16", False, 90 | "Train using 16-bit floats instead of 32bit floats") 91 | 92 | FLAGS = flags.FLAGS 93 | 94 | 95 | def data_type(): 96 | return tf.float16 if FLAGS.use_fp16 else tf.float32 97 | 98 | 99 | def get_vocab_size(): 100 | word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path) 101 | size = len(word_to_id) 102 | print("Vocabulary size: %s" % size) 103 | return size 104 | 105 | class WPInput(object): 106 | """The input data.""" 107 | 108 | def __init__(self, config, data, name=None): 109 | self.batch_size = batch_size = config.batch_size 110 | self.num_steps = num_steps = config.num_steps 111 | self.epoch_size = ((len(data) // batch_size) - 1) // num_steps 112 | self.input_data, self.targets = reader.wiki_producer( 113 | data, batch_size, num_steps, name=name) 114 | 115 | 116 | class WPModel(object): 117 | """Word Prediction model.""" 118 | 119 | def __init__(self, is_training, config, input_): 120 | self._input = input_ 121 | 122 | batch_size = input_.batch_size 123 | num_steps = input_.num_steps 124 | size = config.hidden_size 125 | vocab_size = config.vocab_size 126 | 127 | # Slightly better results can be obtained with forget gate biases 128 | # initialized to 1 but the hyperparameters of the model would need to be 129 | # different than reported in the paper. 130 | def lstm_cell(): 131 | # With the latest TensorFlow source code (as of Mar 27, 2017), 132 | # the BasicLSTMCell will need a reuse parameter which is unfortunately not 133 | # defined in TensorFlow 1.0. To maintain backwards compatibility, we add 134 | # an argument check here: 135 | # if 'reuse' in inspect.getargspec( 136 | # tf.contrib.rnn.BasicLSTMCell.__init__).args: 137 | # return tf.contrib.rnn.BasicLSTMCell( 138 | # size, forget_bias=0.0, state_is_tuple=True, 139 | # reuse=tf.get_variable_scope().reuse) 140 | # else: 141 | return tf.contrib.rnn.BasicLSTMCell( 142 | size, forget_bias=0.0, state_is_tuple=True) 143 | 144 | attn_cell = lstm_cell 145 | if is_training and config.keep_prob < 1: 146 | def attn_cell(): 147 | return tf.contrib.rnn.DropoutWrapper( 148 | lstm_cell(), output_keep_prob=config.keep_prob) 149 | 150 | cell = tf.contrib.rnn.MultiRNNCell( 151 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 152 | 153 | # data_type() returns float32 or float16 154 | self._initial_state = cell.zero_state(batch_size, data_type()) 155 | 156 | with tf.device("/cpu:0"): 157 | # TODO: replace TF input with my embeddings 158 | # TODO: implement PTB reader or something similar 159 | embedding = tf.get_variable( 160 | "embedding", [vocab_size, size], dtype=data_type()) 161 | inputs = tf.nn.embedding_lookup(embedding, input_.input_data) 162 | 163 | 164 | if is_training and config.keep_prob < 1: 165 | # Dropout allows to use the net for train and testing 166 | # See: https://stackoverflow.com/questions/34597316/why-input-is-scaled-in-tf-nn-dropout-in-tensorflow 167 | # and: http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf 168 | inputs = tf.nn.dropout(inputs, config.keep_prob) 169 | 170 | # Simplified version of models/tutorials/rnn/rnn.py's rnn(). 
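The embedding lookup above turns a [batch_size, num_steps] matrix of word ids into a [batch_size, num_steps, hidden_size] tensor of vectors by indexing into the embedding matrix. A NumPy sketch of the same indexing, with toy sizes chosen only for illustration:

import numpy as np

vocab_size, hidden_size = 10, 4
embedding = np.random.rand(vocab_size, hidden_size)   # the "embedding" variable
input_data = np.array([[1, 5, 3],
                       [2, 2, 7]])                    # [batch_size, num_steps] word ids

inputs = embedding[input_data]                        # what tf.nn.embedding_lookup does here
print(inputs.shape)                                   # (2, 3, 4) = [batch, steps, hidden]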
171 | # This builds an unrolled LSTM for tutorial purposes only. 172 | # In general, use the rnn() or state_saving_rnn() from rnn.py. 173 | # 174 | # The alternative version of the code below is: 175 | # 176 | inputs = tf.unstack(inputs, num=num_steps, axis=1) 177 | outputs, state = tf.contrib.rnn.static_rnn( 178 | cell, inputs, initial_state=self._initial_state) 179 | # TODO: passing the sequence_length argument will enable to input variable-length tensors 180 | 181 | # outputs = [] 182 | # state = self._initial_state 183 | # with tf.variable_scope("RNN"): 184 | # for time_step in range(num_steps): 185 | # if time_step > 0: 186 | # tf.get_variable_scope().reuse_variables() 187 | # (cell_output, state) = cell(inputs[:, time_step, :], state) # Call (inputs, state) 188 | # outputs.append(cell_output) 189 | 190 | # TODO: check why outputs are stacked and resized 191 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 192 | softmax_w = tf.get_variable( 193 | "softmax_w", [size, vocab_size], dtype=data_type()) 194 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 195 | logits = tf.matmul(output, softmax_w) + softmax_b 196 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( 197 | [logits], 198 | [tf.reshape(input_.targets, [-1])], 199 | [tf.ones([batch_size * num_steps], dtype=data_type())]) 200 | self._cost = cost = tf.reduce_sum(loss) / batch_size 201 | self._final_state = state 202 | 203 | if not is_training: 204 | return 205 | 206 | self._lr = tf.Variable(0.0, trainable=False) 207 | tvars = tf.trainable_variables() 208 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 209 | config.max_grad_norm) 210 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 211 | self._train_op = optimizer.apply_gradients( 212 | zip(grads, tvars), 213 | global_step=tf.contrib.framework.get_or_create_global_step()) 214 | 215 | self._new_lr = tf.placeholder( 216 | tf.float32, shape=[], name="new_learning_rate") 217 | self._lr_update = tf.assign(self._lr, self._new_lr) 218 | 219 | def assign_lr(self, session, lr_value): 220 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 221 | 222 | @property 223 | def input(self): 224 | return self._input 225 | 226 | @property 227 | def initial_state(self): 228 | return self._initial_state 229 | 230 | @property 231 | def cost(self): 232 | return self._cost 233 | 234 | @property 235 | def final_state(self): 236 | return self._final_state 237 | 238 | @property 239 | def lr(self): 240 | return self._lr 241 | 242 | @property 243 | def train_op(self): 244 | return self._train_op 245 | 246 | 247 | class SmallConfig(object): 248 | """Small config.""" 249 | init_scale = 0.1 250 | learning_rate = 1.0 251 | max_grad_norm = 5 252 | num_layers = 2 253 | num_steps = 20 254 | hidden_size = 200 255 | max_epoch = 4 256 | max_max_epoch = 13 257 | keep_prob = 1.0 258 | lr_decay = 0.5 259 | batch_size = 20 260 | vocab_size = 27942 261 | 262 | 263 | class MediumConfig(object): 264 | """Medium config.""" 265 | init_scale = 0.05 266 | learning_rate = 1.0 267 | max_grad_norm = 5 268 | num_layers = 2 269 | num_steps = 35 270 | hidden_size = 650 271 | max_epoch = 6 272 | max_max_epoch = 39 273 | keep_prob = 0.5 274 | lr_decay = 0.8 275 | batch_size = 20 276 | vocab_size = 10000 277 | 278 | 279 | class LargeConfig(object): 280 | """Large config.""" 281 | init_scale = 0.04 282 | learning_rate = 1.0 283 | max_grad_norm = 10 284 | num_layers = 2 285 | num_steps = 35 286 | hidden_size = 1024 287 | max_epoch = 14 288 | 
max_max_epoch = 55 289 | keep_prob = 0.35 290 | lr_decay = 1 / 1.15 291 | batch_size = 20 292 | vocab_size = 10000 293 | 294 | 295 | class TestConfig(object): 296 | """Tiny config, for testing.""" 297 | init_scale = 0.1 298 | learning_rate = 1.0 299 | max_grad_norm = 1 300 | num_layers = 1 301 | num_steps = 2 302 | hidden_size = 2 303 | max_epoch = 1 304 | max_max_epoch = 1 305 | keep_prob = 1.0 306 | lr_decay = 0.5 307 | batch_size = 20 308 | vocab_size = 10000 309 | 310 | 311 | def run_epoch(session, model, eval_op=None, verbose=False): 312 | """Runs the model on the given data.""" 313 | start_time = time.time() 314 | costs = 0.0 315 | iters = 0 316 | state = session.run(model.initial_state) 317 | 318 | fetches = { 319 | "cost": model.cost, 320 | "final_state": model.final_state, 321 | } 322 | if eval_op is not None: 323 | fetches["eval_op"] = eval_op 324 | 325 | for step in range(model.input.epoch_size): 326 | feed_dict = {} 327 | for i, (c, h) in enumerate(model.initial_state): 328 | feed_dict[c] = state[i].c 329 | feed_dict[h] = state[i].h 330 | 331 | vals = session.run(fetches, feed_dict) 332 | cost = vals["cost"] 333 | state = vals["final_state"] 334 | 335 | costs += cost 336 | iters += model.input.num_steps 337 | 338 | if verbose and step % (model.input.epoch_size // 10) == 10: 339 | print("%.3f perplexity: %.3f speed: %.0f wps" % 340 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), 341 | iters * model.input.batch_size / (time.time() - start_time))) 342 | 343 | return np.exp(costs / iters) 344 | 345 | 346 | def get_config(): 347 | if FLAGS.model == "small": 348 | return SmallConfig() 349 | elif FLAGS.model == "medium": 350 | return MediumConfig() 351 | elif FLAGS.model == "large": 352 | return LargeConfig() 353 | elif FLAGS.model == "test": 354 | return TestConfig() 355 | else: 356 | raise ValueError("Invalid model: %s", FLAGS.model) 357 | 358 | 359 | def main(_): 360 | if not FLAGS.data_path: 361 | raise ValueError("Must set --data_path to wiki data directory") 362 | 363 | raw_data = reader.wiki_raw_data(FLAGS.data_path, FLAGS.word_to_id_path) 364 | train_data, valid_data, test_data = raw_data 365 | 366 | #vocab_size = get_vocab_size() 367 | vocab_size = 126930 368 | 369 | config = get_config() 370 | config.vocab_size = vocab_size 371 | 372 | eval_config = get_config() 373 | eval_config.batch_size = 1 374 | eval_config.num_steps = 1 375 | eval_config.vocab_size = vocab_size 376 | 377 | with tf.Graph().as_default(): 378 | # Args: [minval, maxval] 379 | initializer = tf.random_uniform_initializer(-config.init_scale, 380 | config.init_scale) 381 | 382 | with tf.name_scope("Train"): 383 | train_input = WPInput(config=config, data=train_data, name="TrainInput") 384 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 385 | m = WPModel(is_training=True, config=config, input_=train_input) 386 | tf.summary.scalar("Training Loss", m.cost) 387 | tf.summary.scalar("Learning Rate", m.lr) 388 | 389 | with tf.name_scope("Valid"): 390 | valid_input = WPInput(config=config, data=valid_data, name="ValidInput") 391 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 392 | mvalid = WPModel(is_training=False, config=config, input_=valid_input) 393 | tf.summary.scalar("Validation Loss", mvalid.cost) 394 | 395 | with tf.name_scope("Test"): 396 | test_input = WPInput(config=eval_config, data=test_data, name="TestInput") 397 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 398 | mtest = WPModel(is_training=False, 
config=eval_config, 399 | input_=test_input) 400 | 401 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 402 | with sv.managed_session() as session: 403 | for i in range(config.max_max_epoch): 404 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 405 | m.assign_lr(session, config.learning_rate * lr_decay) 406 | 407 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 408 | train_perplexity = run_epoch(session, m, eval_op=m.train_op, 409 | verbose=True) 410 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 411 | valid_perplexity = run_epoch(session, mvalid) 412 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 413 | 414 | test_perplexity = run_epoch(session, mtest) 415 | print("Test Perplexity: %.3f" % test_perplexity) 416 | 417 | if FLAGS.save_path: 418 | print("Saving model to %s." % FLAGS.save_path) 419 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 420 | 421 | 422 | if __name__ == "__main__": 423 | tf.app.run() 424 | -------------------------------------------------------------------------------- /src/lstm/reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | """Utilities for parsing PTB text files.""" 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import collections 23 | import os 24 | 25 | import tensorflow as tf 26 | 27 | 28 | def _read_words(filename): 29 | with tf.gfile.GFile(filename, "r") as f: 30 | return f.read().decode("utf-8").replace("\n", "").split() 31 | 32 | 33 | def _build_vocab(filename): 34 | data = _read_words(filename) 35 | 36 | counter = collections.Counter(data) 37 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 38 | 39 | words, _ = list(zip(*count_pairs)) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | 42 | return word_to_id 43 | 44 | 45 | def _file_to_word_ids(filename, word_to_id): 46 | data = _read_words(filename) 47 | return [word_to_id[word] for word in data if word in word_to_id] 48 | 49 | 50 | def ptb_raw_data(data_path=None): 51 | """Load PTB raw data from data directory "data_path". 52 | 53 | Reads PTB text files, converts strings to integer ids, 54 | and performs mini-batching of the inputs. 55 | 56 | The PTB dataset comes from Tomas Mikolov's webpage: 57 | 58 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 59 | 60 | Args: 61 | data_path: string path to the directory where simple-examples.tgz has 62 | been extracted. 63 | 64 | Returns: 65 | tuple (train_data, valid_data, test_data, vocabulary) 66 | where each of the data objects can be passed to PTBIterator. 
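_build_vocab above assigns word ids by sorting the vocabulary by descending frequency and then alphabetically, and _file_to_word_ids maps a file onto those ids. A minimal sketch of that mapping on an invented sentence:

import collections

data = "the cat sat on the mat the cat".split()

counter = collections.Counter(data)
# sort by descending count, then alphabetically, exactly as _build_vocab does
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
word_to_id = dict(zip(words, range(len(words))))

print(word_to_id)                      # {'the': 0, 'cat': 1, 'mat': 2, 'on': 3, 'sat': 4}
print([word_to_id[w] for w in data])   # [0, 1, 4, 3, 0, 2, 0, 1]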
67 | """ 68 | 69 | train_path = os.path.join(data_path, "ptb.train.txt") 70 | valid_path = os.path.join(data_path, "ptb.valid.txt") 71 | test_path = os.path.join(data_path, "ptb.test.txt") 72 | 73 | word_to_id = _build_vocab(train_path) 74 | train_data = _file_to_word_ids(train_path, word_to_id) 75 | valid_data = _file_to_word_ids(valid_path, word_to_id) 76 | test_data = _file_to_word_ids(test_path, word_to_id) 77 | vocabulary = len(word_to_id) 78 | return train_data, valid_data, test_data, vocabulary 79 | 80 | 81 | def ptb_producer(raw_data, batch_size, num_steps, name=None): 82 | """Iterate on the raw PTB data. 83 | 84 | This chunks up raw_data into batches of examples and returns Tensors that 85 | are drawn from these batches. 86 | 87 | Args: 88 | raw_data: one of the raw data outputs from ptb_raw_data. 89 | batch_size: int, the batch size. 90 | num_steps: int, the number of unrolls. 91 | name: the name of this operation (optional). 92 | 93 | Returns: 94 | A pair of Tensors, each shaped [batch_size, num_steps]. The second element 95 | of the tuple is the same data time-shifted to the right by one. 96 | 97 | Raises: 98 | tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 99 | """ 100 | with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]): 101 | raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) 102 | 103 | data_len = tf.size(raw_data) 104 | batch_len = data_len // batch_size 105 | data = tf.reshape(raw_data[0 : batch_size * batch_len], 106 | [batch_size, batch_len]) 107 | 108 | epoch_size = (batch_len - 1) // num_steps 109 | assertion = tf.assert_positive( 110 | epoch_size, 111 | message="epoch_size == 0, decrease batch_size or num_steps") 112 | with tf.control_dependencies([assertion]): 113 | epoch_size = tf.identity(epoch_size, name="epoch_size") 114 | 115 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 116 | x = tf.slice(data, [0, i * num_steps], 117 | [batch_size, (i + 1) * num_steps]) 118 | #x = tf.strided_slice(data, [0, i * num_steps], 119 | # [batch_size, (i + 1) * num_steps]) 120 | x.set_shape([batch_size, num_steps]) 121 | y = tf.slice(data, [0, i * num_steps + 1], 122 | [batch_size, (i + 1) * num_steps + 1]) 123 | y.set_shape([batch_size, num_steps]) 124 | return x, y 125 | -------------------------------------------------------------------------------- /src/lstm/reader_frag.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | 17 | """Utilities for parsing PTB text files.""" 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import collections 23 | import os 24 | 25 | import tensorflow as tf 26 | import numpy as np 27 | 28 | def _read_words(filename): 29 | with tf.gfile.GFile(filename, "r") as f: 30 | return f.read().decode("latin-1").split() 31 | 32 | 33 | def _build_vocab(filename): 34 | data = _read_words(filename) 35 | 36 | counter = collections.Counter(data) 37 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 38 | 39 | words, _ = list(zip(*count_pairs)) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | 42 | return word_to_id 43 | 44 | 45 | def _file_to_word_ids_translating(filename, word_to_id): 46 | data = _read_words(filename) 47 | return [word_to_id[word] for word in data if word in word_to_id] 48 | 49 | 50 | def _file_to_word_ids(filename): 51 | data = [] 52 | files = open(filename).read().split() 53 | for f in files: 54 | with open(f) as fn: 55 | data.extend([int(w) for w in fn.read().split()]) 56 | 57 | return data 58 | 59 | 60 | def wiki_raw_data(data_path=None, word_to_id_path=None): 61 | """Load WP raw data from data directory "data_path". 62 | 63 | Reads WP text files, converts strings to integer ids, 64 | and performs mini-batching of the inputs. 65 | 66 | The WP dataset comes from Tomas Mikolov's webpage: 67 | e 68 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 69 | 70 | Args: 71 | data_path: string path to the directory where simple-examples.tgz has 72 | been extracted. 73 | 74 | Returns: 75 | tuple (train_data, valid_data, test_data, vocabulary) 76 | where each of the data objects can be passed to PTBIterator. 77 | """ 78 | import sys 79 | 80 | print("Loading data from %s" % data_path) 81 | train_path = os.path.join(data_path, "train.list") 82 | valid_path = os.path.join(data_path, "valid.list") 83 | test_path = os.path.join(data_path, "test.list") 84 | 85 | # word_to_id = VectorManager.read_vector(word_to_id_path) 86 | # print("Word 2 ID size: %s" % (sys.getsizeof(word_to_id))) 87 | # sys.stdout.flush() 88 | 89 | #word_to_id = _build_vocab(train_path) 90 | train_data = open(train_path).read().split() 91 | print("Train size: %s" % (len(train_data))) 92 | sys.stdout.flush() 93 | 94 | valid_data = _file_to_word_ids(valid_path) 95 | print("Validation size: %s" % (len(valid_data))) 96 | sys.stdout.flush() 97 | 98 | test_data = _file_to_word_ids(test_path) 99 | print("Test size: %s" % (len(test_data))) 100 | sys.stdout.flush() 101 | 102 | # vocabulary = len(word_to_id) 103 | 104 | return train_data, valid_data, test_data 105 | 106 | def wiki_producer(data_name, raw_data, batch_size, num_steps, name=None): 107 | """Iterate on the raw Wikipedia data. 108 | 109 | This chunks up raw_data into batches of examples and returns Tensors that 110 | are drawn from these batches. 111 | 112 | Args: 113 | raw_data: one of the raw data outputs from ptb_raw_data. 114 | batch_size: int, the batch size. 115 | num_steps: int, the number of unrolls. 116 | name: the name of this operation (optional). 117 | 118 | Returns: 119 | A pair of Tensors, each shaped [batch_size, num_steps]. The second element 120 | of the tuple is the same data time-shifted to the right by one. 121 | 122 | Raises: 123 | tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 
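Both wiki_producer (whose body follows) and get_epoch_size in lstm.py/clstm.py above derive the number of steps per epoch from the total word count, the batch size, and the unroll length. A small worked example with invented per-file word counts, standing in for the `wc -w` output used by get_epoch_size:

file_word_counts = [100003, 52017]   # assumed counts, for illustration only
batch_size, num_steps = 20, 35

total = 0
for words in file_word_counts:
    # keep only the part of each file that fills whole batch_size * num_steps blocks
    total += words - (words % (batch_size * num_steps))

epoch_size = ((total // batch_size) - 1) // num_steps
print(total, epoch_size)   # 151200 215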
124 | """ 125 | with tf.name_scope(name, "WPProducer", [raw_data, batch_size, num_steps]): 126 | 127 | if data_name == "TrainInput": 128 | data_len = 1516132009 # Validated 129 | elif data_name == "ValidInput": 130 | data_len = 182828964 # Validated 131 | elif data_name == "TestInput": 132 | data_len = 181755142 # Validated 133 | else: 134 | print("[ERROR] Data length not defined.") 135 | data_len = 0 136 | 137 | stride = 3500000 138 | 139 | 140 | #raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32) 141 | #raw_data = np.array(raw_data, dtype=np.int32) 142 | 143 | batch_len = data_len // batch_size 144 | print("Indices %s %s %s" % (batch_size * batch_len, 145 | batch_size, batch_len)) 146 | data = np.reshape(raw_data[0: batch_size * batch_len], 147 | [batch_size, batch_len]) 148 | 149 | epoch_size = (batch_len - 1) // num_steps 150 | assertion = tf.assert_positive( 151 | epoch_size, 152 | message="epoch_size == 0, decrease batch_size or num_steps") 153 | with tf.control_dependencies([assertion]): 154 | epoch_size = tf.identity(epoch_size, name="epoch_size") 155 | 156 | 157 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 158 | 159 | index_0 = tf.multiply(i, num_steps) 160 | index_aux = tf.add(i, 1) 161 | index_1 = tf.multiply(index_aux, num_steps) 162 | # i2 = q.dequeue() 163 | # x = data[0:batch_size, i * num_steps:(i + 1) * num_steps] 164 | #print("Slices [0, %s], [%s, %s]" % (i * num_steps, batch_size, (i + 1) * num_steps)) 165 | x = tf.strided_slice(data, [0, i * num_steps], 166 | [batch_size, (i + 1) * num_steps]) 167 | x.set_shape([batch_size, num_steps]) 168 | 169 | y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1] 170 | y = tf.strided_slice(data, [0, i * num_steps + 1], 171 | [batch_size, (i + 1) * num_steps + 1]) 172 | y.set_shape([batch_size, num_steps]) 173 | 174 | return x, y, i*num_steps, (i + 1) * num_steps, data 175 | -------------------------------------------------------------------------------- /src/lstm/reader_test.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from utils.flatten import flatten 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | 7 | data = ['consumers', 'may', 'want', 'to', 'move', 'their', 8 | 'telephones', 'a', 'little', 'closer', 'to', 'the', 9 | 'tv', 'set', '', '', 'watching', 'abc', "'s", 10 | 'monday', 'night', 'football', 'can', 'now', 'vote', 'during', 11 | '', 'for', 'the', 'greatest', 'play', 'in', 'N', 'years', 12 | 'from', 'among', 'four', 'or', 'five', '', '', 13 | 'two', 'weeks', 'ago', 'viewers', 'of', 'several', 'nbc', 14 | '', 'consumer', 'segments', 'started', 'calling', 'a', 15 | 'N', 'number', 'for', 'advice', 'on', 'various', '', 16 | 'issues', 'and', 'the', 'new', 'syndicated', 'reality', 17 | 'show', 'hard', 'copy', 'records', 'viewers', "'", 'opinions', 18 | 'for', 'possible', 'airing', 'on', 'the', 'next', 'day', "'s", 19 | 'show', 'interactive', 'telephone', 'technology', 'has', 20 | 'taken', 'a', 'new', 'leap', 'in', '', 'and', 'television', 21 | 'programmers', 'are', 'racing', 'to', 'exploit'] 22 | 23 | 24 | 25 | from lstm.reader_wp import wiki_raw_data, wiki_producer 26 | 27 | train, valid, test = wiki_raw_data("../data/wikipedia/") 28 | #data = data.flatten() 29 | batch_size = 2 30 | num_steps = 3 31 | inputs, targets, s1, s2, x = wiki_producer(train, batch_size=batch_size, num_steps=num_steps) 32 | 33 | # print inputs 34 | # sv = tf.train.Supervisor() 35 | # with 
sv.managed_session() as sess: 36 | # print sess.run([inputs, s1, s2]) 37 | # print sess.run([inputs, s1, s2]) 38 | # print sess.run([inputs, s1, s2]) 39 | # print sess.run([inputs, s1, s2]) 40 | # print sess.run([inputs, s1, s2]) 41 | 42 | data_len = np.size(train) 43 | batch_len = data_len // batch_size 44 | ndata = np.reshape(train[0: batch_size * batch_len], 45 | [batch_size, batch_len]) 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/lstm/reader_topics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | """Utilities for parsing PTB text files.""" 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import collections 23 | import os 24 | 25 | import tensorflow as tf 26 | 27 | 28 | def _read_words(filename): 29 | with tf.gfile.GFile(filename, "r") as f: 30 | return f.read().decode("utf-8").replace("\n", "").split() 31 | 32 | 33 | def _build_vocab(filename): 34 | data = _read_words(filename) 35 | 36 | counter = collections.Counter(data) 37 | count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) 38 | 39 | words, _ = list(zip(*count_pairs)) 40 | word_to_id = dict(zip(words, range(len(words)))) 41 | 42 | return word_to_id 43 | 44 | 45 | def _file_to_word_ids(filename, word_to_id): 46 | data = _read_words(filename) 47 | return [word_to_id[word] for word in data if word in word_to_id] 48 | 49 | 50 | def wiki_raw_data(data_path=None): 51 | """Load PTB raw data from data directory "data_path". 52 | 53 | Reads PTB text files, converts strings to integer ids, 54 | and performs mini-batching of the inputs. 55 | 56 | The PTB dataset comes from Tomas Mikolov's webpage: 57 | 58 | http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 59 | 60 | Args: 61 | data_path: string path to the directory where simple-examples.tgz has 62 | been extracted. 63 | 64 | Returns: 65 | tuple (train_data, valid_data, test_data, vocabulary) 66 | where each of the data objects can be passed to PTBIterator. 67 | """ 68 | 69 | train_path = os.path.join(data_path, "wiki.train.txt") 70 | valid_path = os.path.join(data_path, "wiki.valid.txt") 71 | test_path = os.path.join(data_path, "wiki.test.txt") 72 | 73 | word_to_id = _build_vocab(train_path) 74 | train_data = _file_to_word_ids(train_path, word_to_id) 75 | valid_data = _file_to_word_ids(valid_path, word_to_id) 76 | test_data = _file_to_word_ids(test_path, word_to_id) 77 | vocabulary = len(word_to_id) 78 | return train_data, valid_data, test_data, vocabulary 79 | 80 | 81 | def ptb_producer(raw_data, batch_size, num_steps, name=None): 82 | """Iterate on the raw Wikipedia data. 
83 | 84 | This chunks up raw_data into batches of examples and returns Tensors that 85 | are drawn from these batches. 86 | 87 | Args: 88 | raw_data: one of the raw data outputs from ptb_raw_data. 89 | batch_size: int, the batch size. 90 | num_steps: int, the number of unrolls. 91 | name: the name of this operation (optional). 92 | 93 | Returns: 94 | A pair of Tensors, each shaped [batch_size, num_steps]. The second element 95 | of the tuple is the same data time-shifted to the right by one. 96 | 97 | Raises: 98 | tf.errors.InvalidArgumentError: if batch_size or num_steps are too high. 99 | """ 100 | with tf.name_scope(name, "WPProducer", [raw_data, batch_size, num_steps]): 101 | 102 | 103 | 104 | 105 | # data_len, batch_len, data, epoch_size, i, x, y, y_2 = sess.run([data_len, batch_len, data, epoch_size, i, x, y, y_2]) 106 | # 107 | # 108 | # batch_size = 5 109 | # num_steps = 5 110 | # 111 | # tf.reset_default_graph() 112 | raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.string) 113 | 114 | data_len = tf.size(raw_data) 115 | batch_len = data_len // batch_size 116 | data = tf.reshape(raw_data[0 : batch_size * batch_len], 117 | [batch_size, batch_len]) 118 | 119 | epoch_size = (batch_len - 1) // num_steps 120 | assertion = tf.assert_positive( 121 | epoch_size, 122 | message="epoch_size == 0, decrease batch_size or num_steps") 123 | with tf.control_dependencies([assertion]): 124 | epoch_size = tf.identity(epoch_size, name="epoch_size") 125 | 126 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 127 | # i2 = q.dequeue() 128 | x = tf.strided_slice(data, [0, i * num_steps], 129 | [batch_size, (i + 1) * num_steps]) 130 | x.set_shape([batch_size, num_steps]) 131 | y = tf.strided_slice(data, [0, i * num_steps + 1], 132 | [batch_size, (i + 1) * num_steps + 1]) 133 | y.set_shape([batch_size, num_steps]) 134 | 135 | 136 | 137 | # 138 | # sv = tf.train.Supervisor() 139 | # 140 | # with sv.managed_session() as sess: 141 | # ii, xx, yy, yy2 = sess.run([i, x, y, y_2]) 142 | 143 | return x, y 144 | 145 | # tf.reset_default_graph() 146 | # init = tf.initialize_all_variables() 147 | # sess = tf.Session() 148 | # sess.run(init) 149 | # tf.train.start_queue_runners(sess=sess) 150 | # sess.run([x, y]) -------------------------------------------------------------------------------- /src/postprocess/semantics_check.py: -------------------------------------------------------------------------------- 1 | from gensim.models.keyedvectors import KeyedVectors 2 | from pprint import pprint 3 | import gensim 4 | import os 5 | import argparse 6 | 7 | module_path = "%s/test" % os.path.dirname(gensim.__file__) 8 | 9 | 10 | def semantics_checks(wv): 11 | """ 12 | Perform some semantics check to see that the generated word vectors are sensible 13 | :param wv: word vectors of the embeddings 14 | """ 15 | print("Operations using multiplicative combination objective:") 16 | w = wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) 17 | print(" * King + Woman - Man = %s [%s]" % (w[0][0], w[0][1])) 18 | w = wv.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) 19 | print(" * Baghdad + England - London = %s [%s]" % (w[0][0], w[0][1])) 20 | 21 | print("\n * Most similar words to Paris:") 22 | pprint(wv.most_similar_cosmul('paris')) 23 | 24 | print("\n * Most similar words to Jupiter:") 25 | pprint(wv.most_similar_cosmul('jupiter')) 26 | 27 | print("\n * Most similar words to Zeus:") 28 | pprint(wv.most_similar_cosmul('zeus')) 29 | 30 | 31 | def 
compute_accuracies(wv): 32 | """ 33 | Compute the accuracy of parameter word embeddings with 5 semantic and 9 grammatical relations 34 | :param wv: word vectors of the embeddings 35 | """ 36 | acc = wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) 37 | for sec in acc: 38 | correct = len(sec['correct']) 39 | incorrect = len(sec['incorrect']) 40 | total = correct + incorrect 41 | ac = correct / float(total) 42 | print("\n[%s]\n\tAccuracy [%s] %s/%s" % (sec['section'].title(), round(ac, 2), correct, total)) 43 | 44 | 45 | if __name__ == '__main__': 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('-w', '--word_vectors', type=str, help="Path of LM to perform the tests upon", required=True) 48 | 49 | args = parser.parse_args() 50 | 51 | # Arguments parsing 52 | wv_path = args.word_vectors 53 | 54 | print("Loading model...") 55 | wv = KeyedVectors.load_word2vec_format(wv_path, binary=False) 56 | 57 | # Some semantic examples 58 | semantics_checks(wv) 59 | 60 | # Compute and print questions accuracies 61 | compute_accuracies(wv) 62 | 63 | 64 | -------------------------------------------------------------------------------- /src/postprocess/test_topics.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from gensim.corpora import Dictionary, MmCorpus 3 | from gensim.models import LsiModel, LdaMulticore, LdaModel, HdpModel 4 | from time import time 5 | 6 | import numpy as np 7 | import argparse 8 | import pickle 9 | import sys 10 | 11 | 12 | def load_dict(id2word_path): 13 | print("[BLOCK] Loading dictionary files from %s" % (id2word_path)) 14 | sys.stdout.flush() 15 | dictionary = Dictionary.load_from_text(id2word_path) 16 | 17 | 18 | return dictionary 19 | 20 | 21 | def word2id_to_id2word(word2id_path): 22 | 23 | word2id = pickle.load(open(word2id_path)) 24 | id2word_c = [0] * len(word2id) 25 | for w in word2id: 26 | id2word_c[word2id[w]] = w 27 | return id2word_c 28 | 29 | 30 | 31 | def print_lsa_topic(document, dictionary, lsi): 32 | corpus = [dictionary.doc2bow(document.split())] 33 | topics = lsi[corpus] 34 | topics = topics[0] # Only one document 35 | 36 | values = [abs(val) for _, val in topics] 37 | index = values.index(max(values)) 38 | # print(values) 39 | print(topics[index], lsi.print_topic(index)) 40 | 41 | 42 | def print_hdp(document, dictionary, hdp): 43 | corpus = [dictionary.doc2bow(document.split())] 44 | corpus_hdp = hdp[corpus] 45 | 46 | for doc in corpus_hdp: 47 | values = [abs(val) for _, val in doc] 48 | index = values.index(max(values)) 49 | # print(values) 50 | print(doc[index], hdp.print_topic(index)) 51 | 52 | 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('-m', '--model', type=str, help="Directory where the model is stored.", required=True) 57 | parser.add_argument('-e', '--embeddings', type=str, help="Embeddings path (id_word_vec.pklz)", required=True) 58 | parser.add_argument('-w', '--word2id_path', type=str, help="Word2ID vector to be used for doc translation.", 59 | required=True, default=None) 60 | parser.add_argument('-i', '--id_word', type=str, help="Id2Word vector path ['wiki_en_wordids.txt'].", 61 | required=True, default=None) 62 | 63 | args = parser.parse_args() 64 | 65 | model_path = args.model 66 | id2word_path = args.id_word 67 | word2id_path = args.word2id_path 68 | emb_path = args.embeddings 69 | 70 | begin = time() 71 | 72 | dictionary = load_dict(id2word_path) 73 | 
id2word = word2id_to_id2word(word2id_path) 74 | w2Id = VectorManager.read_vector(word2id_path) 75 | embeddings = VectorManager.read_vector(emb_path) 76 | 77 | demo1 = "the roman consul is normally a notable person from the senate elected " \ 78 | "by direct voting of the italic tribes" 79 | 80 | data = open("../data/small/AA/wiki_01_clean_simple").read().split("") 81 | s1 = data[0].split("")[0] 82 | data = open("../data/small/AA/wiki_00_clean_simple").read().split("") 83 | s2 = data[0].split("")[0] 84 | data = open("../data/small/AB/wiki_00_clean_simple").read().split("") 85 | s3 = data[0].split("")[0] 86 | data = open("../data/small/AB/wiki_01_clean_simple").read().split("") 87 | s4 = data[0].split("")[0] 88 | 89 | 90 | if "lda" in model_path: 91 | lda = LdaModel.load(model_path) 92 | print("Demo 1:\n%s" % demo1) 93 | print(get_lda_best_topic_words(demo1, dictionary, lda)) 94 | print("Demo 2:\n%s" % s1) 95 | print(get_lda_best_topic_words(s1, dictionary, lda)) 96 | print("Demo 3:\n%s" % s2) 97 | print(get_lda_best_topic_words(s2, dictionary, lda)) 98 | print("Demo 4:\n%s" % s3) 99 | print(get_lda_best_topic_words(s3, dictionary, lda)) 100 | print("Demo 5:\n%s" % s4) 101 | print(get_lda_best_topic_words(s4, dictionary, lda)) 102 | elif "lsa" in model_path: 103 | lsi = LsiModel.load(model_path) 104 | print("Demo 1:\n%s" % demo1) 105 | print(print_lsa_topic(demo1, dictionary, lsi)) 106 | print("Demo 2:\n%s" % s1) 107 | print(print_lsa_topic(s1, dictionary, lsi)) 108 | print("Demo 3:\n%s" % s2) 109 | print(print_lsa_topic(s2, dictionary, lsi)) 110 | print("Demo 4:\n%s" % s3) 111 | print(print_lsa_topic(s3, dictionary, lsi)) 112 | print("Demo 5:\n%s" % s4) 113 | print(print_lsa_topic(s4, dictionary, lsi)) 114 | print(get_lsa_topic_embeding(s4, dictionary, lsi, w2Id, embeddings)) 115 | elif "hdp" in model_path: 116 | hdp = HdpModel.load(model_path) 117 | print("Demo 1:\n%s" % demo1) 118 | print(print_hdp(demo1, dictionary, hdp)) 119 | 120 | 121 | end = time() 122 | print("Total processing time: %d seconds" % (end - begin)) 123 | -------------------------------------------------------------------------------- /src/postprocess/tsne.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from gensim.models.keyedvectors import KeyedVectors 5 | from utils.vector_manager import VectorManager 6 | from sklearn.manifold import TSNE 7 | import matplotlib.pyplot as plt 8 | import argparse 9 | 10 | 11 | def plot_tsne(id_word_vec): 12 | 13 | """ 14 | Compute the t-SNE dimensionality reduction values of input parameter and plot them in 2D 15 | :param id_word_vec: vector containing the tuples (id, word, embedding) to be plotted 16 | """ 17 | tsne = TSNE(n_components=2) 18 | X_tsne = tsne.fit_transform([v for _, _, v in id_word_vec]) 19 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1]) 20 | 21 | for i, word in enumerate([word for _, word, _ in id_word_vec]): 22 | plt.annotate(word, (X_tsne[i, 0], X_tsne[i, 1])) 23 | 24 | plt.show() 25 | 26 | 27 | def subset(initial_word, id_word_vec, wv, max): 28 | """ 29 | Get a subset of max number of words using cosmul distance starting from initial_word 30 | :param initial_word: first word to be used to find nearest ones 31 | :param id_word_vec: vector containing the tuples (id, word, embedding) for each word 32 | :param wv: gensim word embeddings model 33 | :param max: number of words to return 34 | :return: list of tuples (id, word, embedding) 35 | """ 36 | words = [initial_word] 37 | subset = [] 38 | while 
len(words) > 0 and len(subset) < max: 39 | w = words.pop() 40 | sim = wv.similar_by_word(w) 41 | ws = [w for w, _ in sim] 42 | similars = [s for s in ws if s not in subset] 43 | subset.extend(similars) 44 | words.extend(similars) 45 | 46 | final_set = [(i, w, v) for i, w, v in id_word_vec if w in subset] 47 | return final_set 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('-i', '--id_word_vec', type=str, help="Path of id <-> word <-> embedding vector", required=True) 53 | parser.add_argument('-w', '--word_vectors', type=str, help="Path of LM to perform the tests upon", required=True) 54 | 55 | args = parser.parse_args() 56 | 57 | # Arguments parsing 58 | wv_path = args.word_vectors 59 | path = args.id_word_vec 60 | 61 | print("Loading model...") 62 | wv = KeyedVectors.load_word2vec_format(wv_path, binary=False) 63 | 64 | print("Loading id-word-vec...") 65 | id_word_vec = VectorManager.read_vector(path) 66 | 67 | print("Finding subset to plot") 68 | initial_word = 'jupiter' 69 | max_elements = 500 70 | sb = subset(initial_word, id_word_vec, wv, max_elements) 71 | 72 | print("Plotting subset of words...") 73 | # Plot t-SNE 74 | plot_tsne(sb) 75 | 76 | 77 | -------------------------------------------------------------------------------- /src/preprocess/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/preprocess/__init__.py -------------------------------------------------------------------------------- /src/preprocess/cleaner.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from pattern.en import tokenize 3 | from time import time 4 | 5 | import multiprocessing as mp 6 | import os 7 | import re 8 | import sys 9 | import argparse 10 | 11 | 12 | def cleanhtml(raw_html): 13 | """ 14 | Removes the tags remaining from wikiExtracted data 15 | :param raw_html: html/text content of a file with many docs 16 | :return: only text from raw_html 17 | """ 18 | cleanr = re.compile('<.*?>') 19 | cleantext = re.sub(cleanr, ' ', raw_html) 20 | return cleantext 21 | 22 | 23 | def remove_title(text): 24 | """ 25 | Removes the title of a document 26 | :param text: text containing an article output from cleanhtml() 27 | :return: text of the article without title 28 | """ 29 | index = text.find("\n\n") 30 | if index != -1: 31 | return text[index+2:] 32 | else: 33 | return text 34 | 35 | 36 | def is_number(s): 37 | """ 38 | Checks if the parameter s is a number 39 | :param s: anything 40 | :return: true if s is a number, false otherwise 41 | """ 42 | try: 43 | float(s) 44 | return True 45 | except ValueError: 46 | return False 47 | 48 | 49 | def _transform_file(file_path, debug=False): 50 | """ 51 | Transforms a file containing articles into a 4D list of words divided into sentences, 52 | paragraphs and docs. 
Write the result to disk with the name filename_wl (words list) 53 | :param file_path: file to transform 54 | """ 55 | if debug: 56 | print("Cleaning %s" % file_path) 57 | with open(file_path) as f: 58 | raw = f.read().decode("latin-1") 59 | data = cleanhtml(raw) 60 | docs = data.split("") 61 | del data 62 | file_out = "%s_wl" % file_path 63 | file_string = "" 64 | for doc in [d.strip() for d in docs if d.strip()]: 65 | paragraphs = [tokenize(par) for par in remove_title(cleanhtml(doc)).strip().split("\n\n") if par] 66 | doc_a = False 67 | for p in paragraphs: 68 | par_a = False 69 | for sent in p: 70 | line = " ".join([word for word in sent.lower().split() 71 | if word.isalpha() or is_number(word)]) 72 | if line: 73 | file_string += line + "\n" 74 | par_a = True 75 | doc_a = True 76 | 77 | if par_a: 78 | file_string += "\n" 79 | if doc_a: 80 | file_string += "\n" 81 | 82 | VectorManager.write_string(file_out, file_string.encode("latin-1")) 83 | del file_string 84 | if debug: 85 | print("Done with %s" % file_path) 86 | 87 | 88 | def transform(dirname, debug=False): 89 | """ 90 | Handles the parallel transformation of all the dataset into 4D lists 91 | """ 92 | for root, dirs, files in os.walk(dirname): 93 | filtered_files = ["%s/%s" % (root, file) for file in files if 94 | is_number(file.split("_")[1]) and len(file.split("_")) == 2] 95 | 96 | threads = min(mp.cpu_count() * 4, filtered_files) 97 | print("Starting %s processes to clean %s files" % (threads, len(filtered_files))) 98 | i = 0 99 | while i < len(filtered_files): 100 | ps = [] 101 | j = 0 102 | while j < threads and (i + j) < len(filtered_files): 103 | if debug: 104 | print("[%s] Creating %s of %s for file %s" % ( 105 | i, i + j, len(filtered_files), filtered_files[i + j])) 106 | p = (mp.Process(target=_transform_file, args=(filtered_files[i + j],))) 107 | p.start() 108 | ps.append(p) 109 | j += 1 110 | 111 | if debug: 112 | print("%s process in the list to join" % len(ps)) 113 | j = 0 114 | while j < threads and (i + j) < len(filtered_files): 115 | if debug: 116 | print("[%s] Joining %s of %s for file %s" % ( 117 | i, j, len(filtered_files), filtered_files[i + j])) 118 | ps[j].join() 119 | j += 1 120 | 121 | i += j 122 | 123 | sys.stdout.flush() 124 | 125 | 126 | def clean_data(files_path): 127 | 128 | """ 129 | Wrapper function to cleans the data and transforms it into 4D. 
Used to be called from either main or as block of 130 | the pipeline 131 | :param data_path: of the files to convert 132 | :return: MySentences class ready to be fed to Word2Vec model 133 | """ 134 | print("[BLOCK] Transforming sentences to 4-dimensional lists") 135 | transform(files_path) 136 | print("[BLOCK] Done transforming data") 137 | sys.stdout.flush() 138 | 139 | 140 | if __name__ == '__main__': 141 | 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be used for the word embeddings" 144 | " and clean up.", required=True) 145 | 146 | args = parser.parse_args() 147 | data_path = args.data 148 | 149 | print("Cleaning data from %s" % (data_path)) 150 | 151 | begin = time() 152 | 153 | clean_data(data_path) 154 | 155 | 156 | end = time() 157 | print("Total processing time: %d seconds" % (end - begin)) 158 | -------------------------------------------------------------------------------- /src/preprocess/embeddings.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from time import time 3 | 4 | import multiprocessing as mp 5 | import gensim 6 | import os 7 | import sys 8 | import argparse 9 | from contextlib import closing 10 | 11 | 12 | 13 | def read_file(filename): 14 | return VectorManager.read_vector(filename) 15 | 16 | 17 | class MySentences(object): 18 | def __init__(self, dirname): 19 | self.dirname = dirname 20 | self.files = [] 21 | self.file_paths = [] 22 | for root, dirs, files in os.walk(self.dirname): 23 | # for filename in [file for file in files if file.endswith("_simple")]: 24 | for filename in [file for file in files if file.endswith("_clean")]: 25 | file_path = root + '/' + filename 26 | self.file_paths.append(file_path) 27 | print("Got %s files to turn into sentences" % len(self.file_paths)) 28 | 29 | 30 | def __iter__(self): 31 | """ 32 | Defines how to iterate the MySentences class in order to feed it directly into Word2Vec method. Yields a 33 | sentence (as a list of words) for every iteration. 34 | """ 35 | # for root, dirs, files in os.walk(self.dirname): 36 | for file_path in self.file_paths: 37 | file_data = VectorManager.read_vector(file_path) 38 | file_sentences = VectorManager.parse_into_sentences(file_data) 39 | 40 | for sentence in file_sentences: 41 | yield sentence 42 | 43 | 44 | def create_embeddings(files_path, embedding_size, minimum_count): 45 | 46 | """ 47 | Creates embeddings with the sentences, embedding size, min_count of occurrences, a max window length of 10, and 48 | cpu_count() number of workers. Used to be called from either main or as block of the pipeline 49 | :param files_path: used to generate the word embeddings 50 | :param embedding_size: size of the embeddings to generate 51 | :param minimum_count: min. 
occurrences per word to be included 52 | :return: word2vec model with all the embeddings and extra info 53 | """ 54 | print("[BLOCK] Initializing MySentences from {}".format(files_path)) 55 | sentences = MySentences(files_path) 56 | print("[BLOCK] Creating embeddings model") 57 | sys.stdout.flush() 58 | model_w2v = gensim.models.Word2Vec(sentences, 59 | size=embedding_size, 60 | window=10, 61 | min_count=minimum_count, 62 | workers=mp.cpu_count()) 63 | print("[BLOCK] Created embeddings of size %s" % embedding_size) 64 | sys.stdout.flush() 65 | 66 | return model_w2v 67 | 68 | 69 | if __name__ == '__main__': 70 | 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be used for the word embeddings" 73 | " and clean up.", required=True) 74 | parser.add_argument('-s', '--size', type=int, help="Size of the word embeddings.", default=200, required=True) 75 | parser.add_argument('-c', '--min_count', type=int, help="Size of the word embeddings.", default=1, required=False) 76 | 77 | args = parser.parse_args() 78 | data_path = args.data 79 | emb_size = args.size 80 | min_count = args.min_count 81 | 82 | print("Creating embeddings of size %s for data in %s" % (emb_size, data_path)) 83 | 84 | begin = time() 85 | 86 | model = create_embeddings(data_path, emb_size, min_count) 87 | 88 | print("Saving embeddings model...") 89 | model.save("../models/word2vec_gensim_%s" % emb_size) 90 | model.wv.save_word2vec_format("../models/word2vec_org_%s" % emb_size, 91 | "../models/vocabulary_%s" % emb_size, 92 | binary=False) 93 | 94 | end = time() 95 | print("Total processing time: %d seconds" % (end - begin)) 96 | -------------------------------------------------------------------------------- /src/preprocess/filter.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from pattern.en import tokenize 3 | from time import time 4 | 5 | import multiprocessing as mp 6 | import os 7 | import re 8 | import sys 9 | import argparse 10 | 11 | 12 | def cleanhtml(raw_html): 13 | """ 14 | Removes the tags remaining from wikiExtracted data 15 | :param raw_html: html/text content of a file with many docs 16 | :return: only text from raw_html 17 | """ 18 | cleanr = re.compile('<.*?>') 19 | cleantext = re.sub(cleanr, ' ', raw_html) 20 | return cleantext 21 | 22 | 23 | def remove_title(text): 24 | """ 25 | Removes the title of a document 26 | :param text: text containing an article output from cleanhtml() 27 | :return: text of the article without title 28 | """ 29 | index = text.find("\n\n") 30 | if index != -1: 31 | return text[index+2:] 32 | else: 33 | return text 34 | 35 | 36 | def is_number(s): 37 | """ 38 | Checks if the parameter s is a number 39 | :param s: anything 40 | :return: true if s is a number, false otherwise 41 | """ 42 | try: 43 | float(s) 44 | return True 45 | except ValueError: 46 | return False 47 | 48 | def known(word, w2id): 49 | """ 50 | Return ID of the word (or 0 if word is not in word2Id dict) 51 | :param word: to translated 52 | :return: Id of the word 53 | """ 54 | 55 | try: 56 | word_r = w2id[word] 57 | return word 58 | except KeyError: 59 | return '' 60 | 61 | 62 | def _transform_file(file_path, w2id, split_par=False, debug=False): 63 | """ 64 | Transforms a file containing articles into a 4D list of words divided into sentences, 65 | paragraphs and docs. 
Write the result to disk with the name filename_clean.pklz 66 | :param file_path: file to transform 67 | """ 68 | if debug: 69 | print("Cleaning %s" % file_path) 70 | with open(file_path) as f: 71 | data = f.read().decode("latin-1") 72 | docs = data.split("") 73 | del data 74 | if not split_par: 75 | file_out = "%s_clean_simple" % file_path 76 | else: 77 | file_out = "%s_clean_paragraph" % file_path 78 | file_string = "" 79 | for doc in [d.strip() for d in docs if d.strip()]: 80 | paragraphs = [tokenize(par) for par in remove_title(cleanhtml(doc)).strip().split("\n\n") if par] 81 | doc_a = False 82 | for p in paragraphs: 83 | par_a = False 84 | for sent in p: 85 | line = [word for word in sent.lower().split() 86 | if word.isalpha() or is_number(word)] 87 | 88 | line = " ".join([known(word, w2id) for word in line]) 89 | if line: 90 | file_string += line + " " 91 | par_a = True 92 | 93 | if par_a and split_par: 94 | file_string += " " 95 | 96 | VectorManager.write_string(file_out, file_string.encode("latin-1")) 97 | del file_string 98 | if debug: 99 | print("Done with %s" % file_path) 100 | 101 | 102 | def transform(dirname, w2Id, paragraph_mark, debug=False): 103 | """ 104 | Handles the parallel transformation of all the dataset into 4D lists 105 | """ 106 | for root, dirs, files in os.walk(dirname): 107 | filtered_files = [] 108 | for file in files: 109 | print("File: {}".format(file)) 110 | try: 111 | if is_number(file.split("_")[1]) and len(file.split("_")) == 2: 112 | filtered_files.append("%s/%s" % (root, file)) 113 | except IndexError: 114 | pass 115 | 116 | 117 | threads = min(mp.cpu_count() * 4, filtered_files) 118 | print("Starting %s processes to clean %s files" % (threads, len(filtered_files))) 119 | i = 0 120 | while i < len(filtered_files): 121 | ps = [] 122 | j = 0 123 | while j < threads and (i + j) < len(filtered_files): 124 | if debug: 125 | print("[%s] Creating %s of %s for file %s" % ( 126 | i, i + j, len(filtered_files), filtered_files[i + j])) 127 | p = (mp.Process(target=_transform_file, args=(filtered_files[i + j], w2Id, paragraph_mark))) 128 | p.start() 129 | ps.append(p) 130 | j += 1 131 | 132 | if debug: 133 | print("%s process in the list to join" % len(ps)) 134 | j = 0 135 | while j < threads and (i + j) < len(filtered_files): 136 | if debug: 137 | print("[%s] Joining %s of %s for file %s" % ( 138 | i, j, len(filtered_files), filtered_files[i + j])) 139 | ps[j].join() 140 | j += 1 141 | 142 | i += j 143 | 144 | sys.stdout.flush() 145 | 146 | 147 | def filter_data(files_path, w2Id, paragraph_mark): 148 | 149 | """ 150 | Wrapper function to filters occurrences not present in w2Id. Used to be called from either main or as block of 151 | the pipeline 152 | :param data_path: of the files to convert 153 | :return: MySentences class ready to be fed to Word2Vec model 154 | """ 155 | print("[BLOCK] Filtering sentences to files divided by (splitting also paragraphs? 
%s)" % paragraph_mark) 156 | transform(files_path, w2Id, paragraph_mark) 157 | print("[BLOCK] Done transforming data") 158 | sys.stdout.flush() 159 | 160 | 161 | if __name__ == '__main__': 162 | 163 | parser = argparse.ArgumentParser() 164 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be used for the word embeddings" 165 | " and clean up.", required=True) 166 | parser.add_argument('-w', '--word_vector', type=str, help="Word2ID vector to be used for doc translation.", 167 | required=True) 168 | parser.add_argument('-', '--paragraph_marks', type=str, help="Add marking the end of paragraphs", 169 | required=False, default=False) 170 | 171 | args = parser.parse_args() 172 | data_path = args.data 173 | word2id_file = args.word_vector 174 | paragraph_mark = args.paragraph_marks 175 | 176 | begin = time() 177 | 178 | w2Id = VectorManager.read_vector(word2id_file) 179 | 180 | print("Filtering data from %s" % (data_path)) 181 | 182 | begin = time() 183 | 184 | filter_data(data_path, w2Id, paragraph_mark) 185 | 186 | 187 | end = time() 188 | print("Total processing time: %d seconds" % (end - begin)) 189 | -------------------------------------------------------------------------------- /src/preprocess/transform_from_gensim.py: -------------------------------------------------------------------------------- 1 | from gensim.models.keyedvectors import KeyedVectors 2 | from utils.vector_manager import VectorManager 3 | import numpy as np 4 | import argparse 5 | 6 | 7 | def transform_gensim(wv): 8 | """ 9 | Transforms word2Vec model class to two structures: word2id dictionary (used to translate word into IDs) and 10 | id_word_vec which contains the tuple (id, word, embedding) for each word in the model. Used to be called from 11 | either main or as block of the pipeline. 
12 | :param wv: word2vec model with the word embeddings 13 | :return: word2id and id_word_vec 14 | """ 15 | print("Transforming from gensim a total of %s" % len(wv.vocab.items())) 16 | complete_vec = [(v.index, w, wv.word_vec(w)) for w, v in wv.vocab.items()] 17 | sorted_vec = sorted(complete_vec) 18 | id_word_vec = sorted_vec 19 | word2id = dict([(w, id) for id, w, _ in id_word_vec]) 20 | 21 | return word2id, id_word_vec 22 | 23 | if __name__ == '__main__': 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('-k', '--kv', type=str, help="Path of the keyed vectors to translate [word2vec_org_XXX]", 27 | required=True) 28 | 29 | args = parser.parse_args() 30 | data_path = args.kv 31 | 32 | print("Loading keyed vectors") 33 | wv = KeyedVectors.load_word2vec_format(data_path, binary=False) 34 | 35 | emb_size = len(wv.syn0[0]) 36 | word2id, id_word_vec = transform_gensim(wv) 37 | 38 | w2id_filepath = "../models/word2id_%s" % emb_size 39 | idWordVec_filepath = "../models/idWordVec_%s" % emb_size 40 | 41 | print("Writing files:\n\t * word2id: %s\n\t * idWordVec: %s" % (w2id_filepath, idWordVec_filepath)) 42 | VectorManager.write_pickled(w2id_filepath, word2id) 43 | VectorManager.write_pickled(idWordVec_filepath, id_word_vec) 44 | -------------------------------------------------------------------------------- /src/preprocess/words2ids.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from time import time 3 | import multiprocessing as mp 4 | import argparse 5 | import numpy as np 6 | import os 7 | import sys 8 | 9 | 10 | def word2Id(filename, w2id, debug=False): 11 | if debug: 12 | print("Translating %s" % filename) 13 | unk_id = 0 14 | file_out = "%s_num" % filename.split("_clean")[0] 15 | 16 | def transform_numpy(): 17 | """ 18 | Transforms a 4D list of words into a 4D numpy array of integers and writes it into file_out 19 | """ 20 | docs = VectorManager.parse_into_4D(VectorManager.read_vector(filename)) 21 | file_list = [] 22 | for doc in docs: 23 | doc_list = [] 24 | for paragraph in doc: 25 | par_list = [] 26 | for sentence in paragraph: 27 | s_id = [toId(word) for word in sentence if word] 28 | if s_id: 29 | par_list.append(s_id) 30 | doc_list.append(par_list) 31 | file_list.append(doc_list) 32 | np.save(file_out, np.array(file_list)) 33 | 34 | 35 | def transform(): 36 | """ 37 | Transforms a 4D list of words into a 4D numpy array of integers and writes it into file_out 38 | """ 39 | with open(filename) as f: 40 | data = f.read().decode("latin-1").split() 41 | 42 | ids = " ".join([str(w2id[w]) for w in data]) 43 | 44 | with open("%s_num_eos" % filename, "wb") as f: 45 | f.write(ids) 46 | 47 | 48 | def toId(word): 49 | """ 50 | Return ID of the word (or 0 if word is not in word2Id dict) 51 | :param word: to translated 52 | :return: Id of the word 53 | """ 54 | word_id = unk_id 55 | try: 56 | word_id = w2id[word] 57 | except KeyError: 58 | pass 59 | finally: 60 | return word_id 61 | 62 | transform() 63 | # return transform() 64 | 65 | 66 | class FileW2ID(object): 67 | """ 68 | Auxiliar class which holds the filepaths and w2id structure and yields them one at a time in order to avoid 69 | replicating the w2id structure (which can be quite big) 70 | """ 71 | 72 | def __init__(self, filepaths, w2id): 73 | self.filepaths = filepaths 74 | self.w2id = w2id 75 | 76 | def __iter__(self): 77 | for file in self.filepaths: 78 | yield (file, self.w2id) 79 | 80 | 81 | def translate_files(data_path, 
w2id, suffix="_clean", debug=False): 82 | """ 83 | Handles the parallel translation from word to id of the files in data_path with the mapping w2id 84 | :param data_path: path of the files to transform. Used to be called from either main or as block of 85 | the pipeline 86 | :param w2id: mappings to be used 87 | """ 88 | print("[BLOCK] Translating files from %s" % (data_path)) 89 | 90 | filepaths = [] 91 | for root, dirs, files in os.walk(data_path): 92 | filepaths.extend(["%s/%s" % (root, file) for file in files if file.endswith(suffix)]) 93 | 94 | threads = min(mp.cpu_count() * 4, filepaths) 95 | 96 | print("[BLOCK] Starting %s processes to translate to IDs %s files" % (threads, len(filepaths))) 97 | i = 0 98 | while i < len(filepaths): 99 | ps = [] 100 | j = 0 101 | while j < threads and (i + j) < len(filepaths): 102 | if debug: 103 | print("[%s] Creating %s of %s for file %s" % ( 104 | i, i + j, len(filepaths), filepaths[i + j])) 105 | p = (mp.Process(target=word2Id, args=(filepaths[i + j], w2id,))) 106 | p.start() 107 | ps.append(p) 108 | j += 1 109 | 110 | if debug: 111 | print("%s process in the list to join" % len(ps)) 112 | j = 0 113 | while j < threads and (i + j) < len(filepaths): 114 | if debug: 115 | print("[%s] Joining %s of %s for file %s" % ( 116 | i, j, len(filepaths), filepaths[i + j])) 117 | ps[j].join() 118 | j += 1 119 | 120 | i += j 121 | # for p in iter_file_w2id: 122 | # word2Id(p) 123 | # p = mp.Pool(threads, maxtasksperchild=1) 124 | # p.map(word2Id, iter_file_w2id) 125 | 126 | print("[BLOCK] Files translated to IDs") 127 | sys.stdout.flush() 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be translated with word2id vector." 
132 | " and clean up.", required=True) 133 | parser.add_argument('-w', '--word_vector', type=str, help="Word2ID vector to be used for doc translation.", 134 | required=False, default="../models/eos/word2id_1000.pklz") 135 | 136 | args = parser.parse_args() 137 | data_path = args.data 138 | word2id_file = args.word_vector 139 | 140 | begin = time() 141 | 142 | w2Id = VectorManager.read_vector(word2id_file) 143 | translate_files(data_path, w2Id) 144 | 145 | end = time() 146 | print("Total processing time: %d seconds" % (end - begin)) 147 | -------------------------------------------------------------------------------- /src/preprocess/words2ids_validator.py: -------------------------------------------------------------------------------- 1 | from utils.vector_manager import VectorManager 2 | from time import time 3 | import multiprocessing as mp 4 | import argparse 5 | import numpy as np 6 | import os 7 | import sys 8 | 9 | confidence = 0.8 10 | 11 | 12 | def id2Word(param): 13 | filename, id2w = param 14 | file_words = "%s_clean" % filename.split("_num")[0] 15 | print("Comparing original %s with %s" % (file_words, filename)) 16 | 17 | 18 | def is_valid_numpy(): 19 | """ 20 | """ 21 | docs_ids = VectorManager.read_vector(filename) 22 | original = VectorManager.parse_into_4D(VectorManager.read_vector(file_words)) 23 | file_list = [] 24 | comparison = [] 25 | unknowns = 0 26 | for d in range(0, len(docs_ids)): 27 | doc_list = [] 28 | for p in range(0, len(docs_ids[d])): 29 | par_list = [] 30 | for s in range(0, len(docs_ids[d][p])): 31 | sent_list = [] 32 | for w in range(0, len(docs_ids[d][p][s])): 33 | try: 34 | translated = to_word(docs_ids[d][p][s][w]) 35 | if translated == '': 36 | unknowns += 1 37 | comparison.append(translated == original[d][p][s][w]) 38 | sent_list.append(translated) 39 | except Exception as e: 40 | print("[%s] Indices %s %s %s %s: %s" % (filename, d,p,s,w, e)) 41 | par_list.append(sent_list) 42 | doc_list.append(par_list) 43 | file_list.append(doc_list) 44 | 45 | valid = False 46 | try: 47 | ratio = float(comparison.count(True)) / len(comparison) 48 | u_ratio = round(float(unknowns) / len(comparison), 2) 49 | if ratio < confidence: 50 | print("[WARN] File %s equality ratio is %s with %s unknown ratio" % (filename, round(ratio, 2), u_ratio)) 51 | else: 52 | print("[OK] File %s equality ratio is %s with %s unknown ratio" % (filename, round(ratio, 2), u_ratio)) 53 | valid = True 54 | except KeyError as e: 55 | print("[ERROR] File %s is completely different (%s) with %s unknown ratio" % (filename, e, u_ratio)) 56 | 57 | 58 | return valid 59 | 60 | def is_valid(): 61 | """ 62 | """ 63 | with open(file_words) as f: 64 | original = f.read().decode("latin-1").split() 65 | 66 | with open(file_words) as f: 67 | docs_ids = f.read().split() 68 | 69 | doc_words = [id2w(id) for id in docs_ids] 70 | 71 | comparison = [original[i] == doc_words[i] for i in range(original)] 72 | valid = False 73 | try: 74 | ratio = float(comparison.count(True)) / len(comparison) 75 | if ratio < confidence: 76 | print("[WARN] File %s equality ratio is %s." % (filename, round(ratio, 2))) 77 | else: 78 | print("[OK] File %s equality ratio is %s." 
% (filename, round(ratio, 2))) 79 | valid = True 80 | except KeyError as e: 81 | print("[ERROR] File %s is completely different (%s) with %s unknown ratio" % (filename, e)) 82 | 83 | 84 | return valid 85 | 86 | 87 | def to_word(id): 88 | """ 89 | Return Word associated with id 90 | :param id: of the word to translate 91 | :return: word associated with the ID 92 | """ 93 | try: 94 | word = id2w[id] 95 | except IndexError as e: 96 | print("ID %s not found\n%s" % (id, e)) 97 | word = '' 98 | return word 99 | 100 | return is_valid() 101 | 102 | 103 | class FileID2Word(object): 104 | """ 105 | Auxiliar class which holds the filepaths and w2id structure and yields them one at a time in order to avoid 106 | replicating the w2id structure (which can be quite big) 107 | """ 108 | 109 | def __init__(self, filepaths, id2w): 110 | self.filepaths = filepaths 111 | self.id2w = id2w 112 | 113 | def __iter__(self): 114 | for file in self.filepaths: 115 | yield (file, self.id2w) 116 | 117 | 118 | def check_translated_files(data_path, w2Id): 119 | """ 120 | Handles the parallel translation from word to id of the files in data_path with the mapping w2id 121 | :param data_path: path of the files to transform. Used to be called from either main or as block of 122 | the pipeline 123 | :param w2id: mappings to be used 124 | """ 125 | print("[BLOCK] Validating translated files from %s" % (data_path)) 126 | 127 | sorted_list = sorted(w2Id.items(), key= lambda(x): x[1]) 128 | id2words = [w for w,_ in sorted_list] 129 | del w2Id, sorted_list 130 | filepaths = [] 131 | for root, dirs, files in os.walk(data_path): 132 | filepaths.extend(["%s/%s" % (root, file) for file in files if file.endswith("_num.npy")]) 133 | threads = mp.cpu_count() * 2 134 | iter_file_w2id = FileID2Word(filepaths, id2words) 135 | print("[BLOCK] Starting validation with %s processes and %s files" % (threads, len(filepaths))) 136 | 137 | p = mp.Pool(threads, maxtasksperchild=1) 138 | valids = p.map(id2Word, iter_file_w2id) 139 | print("[BLOCK] Validation done. Correct files %s/%s. Confidence [%s]" % (valids.count(True), len(valids), confidence)) 140 | sys.stdout.flush() 141 | 142 | 143 | if __name__ == '__main__': 144 | parser = argparse.ArgumentParser() 145 | parser.add_argument('-d', '--data', type=str, help="Path of the data to be translated with word2id vector." 
146 | " and clean up.", required=True) 147 | parser.add_argument('-w ', '--word_vector', type=str, help="Word2ID vector to be used for doc reverse translation.", 148 | required=True) 149 | 150 | args = parser.parse_args() 151 | data_path = args.data 152 | word2id_file = args.word_vector 153 | 154 | begin = time() 155 | 156 | w2Id = VectorManager.read_vector(word2id_file) 157 | check_translated_files(data_path, w2Id) 158 | 159 | end = time() 160 | print("Total processing time: %d seconds" % (end - begin)) 161 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kafkasl/contextualLSTM/a4421d592c3960c79842b0f23de162e61fcab3dd/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/flatten.py: -------------------------------------------------------------------------------- 1 | def flatten(items, seqtypes=(list, tuple)): 2 | for i, x in enumerate(items): 3 | while i < len(items) and isinstance(items[i], seqtypes): 4 | items[i:i+1] = items[i] 5 | return items -------------------------------------------------------------------------------- /src/utils/memory.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from sys import getsizeof, stderr 3 | from itertools import chain 4 | from collections import deque 5 | try: 6 | from reprlib import repr 7 | except ImportError: 8 | pass 9 | 10 | 11 | def total_size(o, handlers={}, verbose=False): 12 | """ Returns the approximate memory footprint an object and all of its contents. 13 | 14 | Automatically finds the contents of the following builtin containers and 15 | their subclasses: tuple, list, deque, dict, set and frozenset. 
16 | To search other containers, add handlers to iterate over their contents: 17 | 18 | handlers = {SomeContainerClass: iter, 19 | OtherContainerClass: OtherContainerClass.get_elements} 20 | 21 | """ 22 | dict_handler = lambda d: chain.from_iterable(d.items()) 23 | all_handlers = {tuple: iter, 24 | list: iter, 25 | deque: iter, 26 | dict: dict_handler, 27 | set: iter, 28 | frozenset: iter, 29 | } 30 | all_handlers.update(handlers) # user handlers take precedence 31 | seen = set() # track which object id's have already been seen 32 | default_size = getsizeof(0) # estimate sizeof object without __sizeof__ 33 | 34 | def sizeof(o): 35 | if id(o) in seen: # do not double count the same object 36 | return 0 37 | seen.add(id(o)) 38 | s = getsizeof(o, default_size) 39 | 40 | if verbose: 41 | print(s, type(o), repr(o), file=stderr) 42 | 43 | for typ, handler in all_handlers.items(): 44 | if isinstance(o, typ): 45 | s += sum(map(sizeof, handler(o))) 46 | break 47 | return s 48 | 49 | return sizeof(o) 50 | -------------------------------------------------------------------------------- /src/utils/split_1k.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import errno 5 | 6 | 7 | def open_file(output, counter): 8 | filename = "%s/%s/wiki_%s_sym" % (output, counter/10000, counter % 10000) 9 | if not os.path.exists(os.path.dirname(filename)): 10 | try: 11 | os.makedirs(os.path.dirname(filename)) 12 | except OSError as exc: # Guard against race condition 13 | if exc.errno != errno.EEXIST: 14 | raise 15 | 16 | return open(filename, "w") 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-d', '--data_list', type=str, help="Input data list." 22 | " and clean up.", required=True) 23 | parser.add_argument('-o', '--output', type=str, help="Output files." 24 | " and clean up.", required=True) 25 | 26 | args = parser.parse_args() 27 | data_path = args.data_list 28 | output = args.output 29 | 30 | files = open(data_path).read().split() 31 | 32 | i = 1 33 | counter = 0 34 | max = 35 35 | current_file = open_file(output, counter) 36 | for f in files: 37 | current_data = open(f).read().split() 38 | for w in current_data: 39 | if i > max: 40 | current_file.close() 41 | counter += 1 42 | i = 1 43 | current_file = open("%s/wiki_%s_sym" % (output, counter), "w") 44 | if i != 1: 45 | current_file.write(" ") 46 | current_file.write(w) 47 | i += 1 48 | 49 | current_file.close() -------------------------------------------------------------------------------- /src/utils/vector_manager.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | 4 | 5 | # Vectors represent: word <=> id/index <=> embedding 6 | # Auxiliar class handling all the read/write operations for data structures other than numpy arrays. 
7 | class VectorManager(object): 8 | 9 | # Methods used to parse the cleaned text files into word structures 10 | @staticmethod 11 | def parse_into_4D(file_string): 12 | return [[[[w for w in s.split() if w] 13 | for s in p.split("\n") if s] 14 | for p in doc.split("\n\n") if p] 15 | for doc in file_string.split("\n\n\n") 16 | if doc] 17 | 18 | @staticmethod 19 | def parse_into_list(file_string): 20 | file_list = [] 21 | for doc in file_string.split("\n\n\n"): 22 | for p in doc.split("\n\n"): 23 | for s in p.split("\n"): 24 | for w in s.split(): 25 | if w: 26 | file_list.append(w) 27 | 28 | return file_list 29 | 30 | # Parse into a list of sentences, each sentence being a list of words 31 | @staticmethod 32 | def parse_into_sentences(file_string): 33 | sentences = [] 34 | for doc in file_string.split("\n\n\n"): 35 | for p in doc.split("\n\n"): 36 | for s in p.split("\n"): 37 | ws = s.split() 38 | if ws: 39 | sentences.append(ws) 40 | return sentences 41 | 42 | @staticmethod 43 | def parse_into_paragraphs(file_string): 44 | paragraphs = [] 45 | for doc in file_string.split("\n\n\n"): 46 | # a paragraph is kept as a flat list of all the words it contains 47 | for p in doc.split("\n\n"): 48 | ws = p.split() 49 | if ws: 50 | paragraphs.append(ws) 51 | return paragraphs 52 | 53 | # Methods used to save the vectors 54 | @staticmethod 55 | def write_pickled(filename, data): 56 | with open('%s.pklz' % filename, 'wb') as f: 57 | pickle.dump(data, f) 58 | 59 | # Methods used to save the vectors 60 | @staticmethod 61 | def write_string(filename, data): 62 | with open('%s' % filename, 'wb') as f: 63 | f.write(data) 64 | 65 | # Methods to read vectors 66 | @staticmethod 67 | def read_vector(filename): 68 | ext = filename.split(".")[-1] 69 | 70 | if ext == "npy": 71 | with open(filename, "rb") as f: 72 | return np.load(f) 73 | if ext == "pklz": 74 | with open(filename, 'rb') as f: 75 | try: 76 | return pickle.load(f, encoding="latin1") 77 | except: 78 | return pickle.load(f) 79 | else: 80 | with open(filename, 'rb') as f: 81 | data = f.read() 82 | return data.decode("latin-1") 83 | 84 | @staticmethod 85 | def read_id_word_vec(): 86 | return VectorManager.read_vector("idWordVec.pklz") 87 | 88 | @staticmethod 89 | def read_word2id(): 90 | return VectorManager.read_vector("word2id.pklz")
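
The queue-based producers (ptb_producer in src/lstm/reader.py and the wiki_* variants) only build graph nodes; the batches are obtained by running those nodes in a session whose queue runners have been started. Below is a minimal sketch of that usage, assuming TensorFlow 1.x, that src/ is on the PYTHONPATH, and that the PTB files sit under ../data/ptb (both paths are illustrative, not taken from the repository):

import tensorflow as tf

from lstm.reader import ptb_raw_data, ptb_producer  # assumes src/ is importable

# Load the three PTB splits and the vocabulary size (data path is an example).
train_data, valid_data, test_data, vocabulary = ptb_raw_data("../data/ptb")

batch_size, num_steps = 20, 35
inputs, targets = ptb_producer(train_data, batch_size, num_steps)

# tf.train.Supervisor starts the queue runners that drive the
# range_input_producer used inside ptb_producer.
sv = tf.train.Supervisor()
with sv.managed_session() as sess:
    x, y = sess.run([inputs, targets])
    # Both are [batch_size, num_steps]; y is x shifted one word to the right.
    print("x: %s, y: %s" % (x.shape, y.shape))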
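
The slicing arithmetic that ptb_producer and wiki_producer perform symbolically can be checked with plain NumPy; this sketch mirrors the same batch_len / epoch_size computation and the one-word shift between inputs and targets, on a made-up id stream:

import numpy as np

# Toy stream of word ids standing in for the real corpus.
raw_data = np.arange(50, dtype=np.int32)
batch_size, num_steps = 4, 3

data_len = raw_data.size
batch_len = data_len // batch_size                     # words per batch row
data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

epoch_size = (batch_len - 1) // num_steps              # (x, y) windows per epoch
for i in range(epoch_size):
    x = data[:, i * num_steps:(i + 1) * num_steps]
    y = data[:, i * num_steps + 1:(i + 1) * num_steps + 1]
    assert x.shape == y.shape == (batch_size, num_steps)
    # Within every row, y is x shifted by one position: the next-word targets.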
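
The relation between the gensim model trained in preprocess/embeddings.py and the word2id / idWordVec structures written by preprocess/transform_from_gensim.py can be seen on a toy corpus; this uses the same pre-4.0 gensim API the project code relies on, and the sentences are invented for illustration only:

import gensim

sentences = [["the", "roman", "consul", "is", "elected"],
             ["the", "senate", "elects", "the", "consul"]]

# Tiny Word2Vec model, same call style as preprocess/embeddings.py.
model = gensim.models.Word2Vec(sentences, size=50, window=5, min_count=1, workers=1)
wv = model.wv

# Same shape of output as transform_from_gensim.transform_gensim(): a list of
# (id, word, embedding) triples sorted by id, and a word -> id dict derived from it.
id_word_vec = sorted(((v.index, w, wv.word_vec(w)) for w, v in wv.vocab.items()),
                     key=lambda t: t[0])
word2id = dict((w, i) for i, w, _ in id_word_vec)

print("%s -> id %s" % ("consul", word2id["consul"]))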
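
Finally, the word-to-id translation performed by preprocess/words2ids.py (and spot-checked by words2ids_validator.py) can be exercised on a single string as sketched below; the word2id path is illustrative (any word2id_*.pklz written by transform_from_gensim.py would do), and id 0 stands for unknown words, as in words2ids.py:

from utils.vector_manager import VectorManager

# Mapping produced by transform_from_gensim.py (path is an example).
w2id = VectorManager.read_vector("../models/word2id_200.pklz")

text = "the roman consul is normally a notable person from the senate"
ids = [w2id.get(w, 0) for w in text.lower().split()]
print(" ".join(str(i) for i in ids))

# Reverse mapping, built the same way words2ids_validator.py builds it,
# to spot-check that the translation round-trips.
id2word = [w for w, _ in sorted(w2id.items(), key=lambda kv: kv[1])]
print(" ".join(id2word[i] for i in ids))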