├── .gitignore
├── LICENSE
├── README.md
├── arch.png
├── construct_sg.py
├── data_load.py
├── demo.gif
├── download.sh
├── encode.py
├── hparams.py
├── make_phr2sg_id.py
├── model.py
├── prepro.py
├── test.py
└── train.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Smart Message Reply
 2 | 
 3 | Have you ever seen or used [Google Smart Reply](https://firebase.google.com/docs/ml-kit/generate-smart-replies)? It's a service that provides automatic reply suggestions for user messages. See below.
 4 | 
 5 | <img src="https://www.androidpolice.com/wp-content/uploads/2018/06/Screenshot_20180605-110446-1.png" height=600>
 6 | 
 7 | This is a useful application of the retrieval based chatbot. Think about it. How many times do we text a message like <i>thx</i>, <i>hey</i>, or <i>see you later</i>?
 8 | In this project, we build a simple message reply suggestion system.
 9 | 
10 | Kyubyong Park <br>
11 | Code-review by [Yj Choe](https://github.com/yjchoe)
12 | 
13 | ## Synonym group
14 | * We need to set the list of suggestions to show. Naturally, frequency is considered first. But what about those phrases that are similar in meaning? For example, should <i>thank you so much</i> and <i>thx</i> be treated independently? We don't think so. We want to group them and save our slots. How? We make use of a parallel corpus. Both <i>thank you so much</i> and <i>thx</i> are likely to be translated into the same text. Based on this assumption, we construct English synonym groups that share the same translation.
15 | 
16 | ## Model
17 | We fine-tune [huggingface's](https://github.com/huggingface/pytorch-pretrained-BERT) [Bert](https://arxiv.org/abs/1810.04805) pretrained model for sequence classification. In it, a special starting token [CLS] stores the entire information of a sentence. Extra layers are attached to project the condensed information to classification units (here 100).
18 | 
19 | <img src="arch.png" width=400>
20 | 
21 | ## Data
22 | * We use [OpenSubtitles 2018](http://opus.nlpl.eu/OpenSubtitles-v2018.php) Spanish-English parallel corpus to construct synonym groups. OpenSubtitles is a large collection of translated movie subtitles. The en-es data consists of more than 61M aligned lines.
23 | * Ideally, a (very) large dialog corpus is needed for training, which we failed to find. We use the Cornell Movie Dialogue Corpus, instead. It's composed of 83,097 dialogues or 304,713 lines.
24 | 
25 | ## Requirements
26 | * python>=3.6
27 | * tqdm>=4.30.0
28 | * pytorch>=1.0
29 | * pytorch_pretrained_bert>=0.6.1
30 | * nltk>=3.4
31 | 
32 | ## Training
33 | * STEP 0. Download OpenSubtitles 2018 Spanish-English Parallel data.
34 | ```
35 | bash download.sh
36 | ```
37 | 
38 | * STEP 1. Construct synonym groups from the corpus.
39 | ```
40 | python construct_sg.py
41 | ```
42 | * STEP 2. Make phr2sg_id and sg_id2phr dictionaries.
43 | ```
44 | python make_phr2sg_id.py
45 | ```
46 | * STEP 3. Convert a monolingual English text to ids.
47 | ```
48 | python encode.py
49 | ```
50 | * STEP 4. Create training data and save them as pickle.
51 | ```
52 | python prepro.py
53 | ```
54 | * STEP 5. Train.
55 | ```
56 | python train.py
57 | ```
58 | 
59 | ## Test (Demo)
60 | 
61 | <img src="demo.gif">
62 | 
63 | * Download and extract the [pre-trained model](https://www.dropbox.com/s/fqomn5flbwlvndc/log.tar.gz?dl=0) and run the following command.
64 | ```
65 | python test.py --ckpt log/9500_ACC0.1.pt
66 | ```
67 | 
68 | ## Notes
69 | * Training loss slowly but steadily decreases.
70 | * Accuracy@5 on the evaluation data is from 10 to 20 percent.
71 | * For real application, a much much larger corpus is needed.
72 | * It is unclear how similar movie scripts are to message dialogues.
73 | * A better strategy for constructing synonym groups is necessary.
74 | * A retrieval-based chatbot is a realistic application as it is safer and easier than a generation-based one.
75 | 
76 | 


--------------------------------------------------------------------------------
/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/msg_reply/046f6308785d8e65d7ae429964df40a001a9675d/arch.png


--------------------------------------------------------------------------------
/construct_sg.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Construct synonym groups looking like this:
 3 | 
 4 | [
 5 | "0": {
 6 |         "_translation": "Te pondré más.",
 7 |         "phrases": [
 8 |             [
 9 |                 "I'll give you more.",
10 |                 1
11 |             ],
12 |             [
13 |                 "You'll have to have some more.",
14 |                 1
15 |             ],
16 | ...
17 | ]
18 | '''
19 | 
20 | import json
21 | from collections import Counter
22 | from operator import itemgetter
23 | from hparams import hp
24 | import os
25 | from tqdm import tqdm
26 | 
def normalize(text):
    '''Trims spaces, hyphens and newlines from both ends of `text`.

    Characters inside the string are left untouched.
    '''
    return text.strip(" -\n")
30 | 
if __name__ == "__main__":
    # Group phrases
    # The two sides are streamed in lockstep instead of being read whole:
    # the corpus is large, and file iteration only treats \n, \r and \r\n as
    # line ends, whereas str.splitlines() also breaks on characters such as
    # U+2028 and could shift the two sides out of alignment.
    es2ens = dict()
    with open(hp.opus_en, 'r', encoding="utf-8") as fen, \
            open(hp.opus_es, 'r', encoding="utf-8") as fes:
        for en, es in tqdm(zip(fen, fes)):
            en = normalize(en)
            es = normalize(es)
            if len(es) <= 1: continue  # skip empty / one-character translations
            es2ens.setdefault(es, []).append(en)
    print(f"Grouped all synonymous phrases: {len(es2ens)}")

    # Sort
    # Only translations with at least two distinct English phrases form a
    # synonym group; groups are numbered consecutively from 0.
    data = dict()
    for es, ens in es2ens.items():
        phrases = Counter(ens).most_common()  # [(phrase, cnt), ...], most frequent first
        if len(phrases) > 1:
            data[len(data)] = {"_translation": es, "phrases": phrases}
    print(f"Sorted all synonymous groups by frequency: {len(data)}")

    # Write
    out_dir = os.path.dirname(hp.sg)
    if out_dir:  # os.makedirs("") raises when the path has no directory part
        os.makedirs(out_dir, exist_ok=True)
    with open(hp.sg, 'w') as fout:
        json.dump(data, fout, ensure_ascii=False, indent=4, separators=(',', ': '), sort_keys=True)
62 | 


--------------------------------------------------------------------------------
/data_load.py:
--------------------------------------------------------------------------------
 1 | from hparams import hp
 2 | import random
 3 | import pickle
 4 | from itertools import chain
 5 | import torch
 6 | from glob import glob
 7 | 
 8 | print("Loading training files")
 9 | 
10 | train_data = pickle.load(open(hp.pkl_train, 'rb'))
11 | dev_data = pickle.load(open(hp.pkl_dev, 'rb'))
12 | 
def pad(batch, maxlen):
    '''Right-pads every sample in `batch` with zeros up to `maxlen` items.

    A sample that already holds `maxlen` or more items is copied as is.
    Returns a new list of new lists.
    '''
    padded = []
    for sample in batch:
        n_missing = maxlen - len(sample)
        padded.append(sample + [0] * n_missing)
    return padded
16 | 
17 | 
def get_batch(max_span, batch_size, n_classes, train=True):
    '''Draws a random batch of (context, label) pairs.

    max_span: maximum number of token ids per sample, [CLS] and [SEP] included
    batch_size: number of draws. A draw whose label has no contexts is
        skipped, so fewer than `batch_size` samples may be returned.
    n_classes: labels are drawn uniformly from [0, n_classes-1]
    train: draw from `train_data` if True, otherwise from `dev_data`

    Returns
    x: (N, T) LongTensor of token ids, zero-padded to the longest sample
    y: (N,) LongTensor of labels
    '''
    contexts_li = train_data if train else dev_data
    n_keep = max(max_span - 2, 0)  # room left once [CLS] and [SEP] are added

    x, y, maxlen = [], [], 0
    for _ in range(batch_size):
        label = random.randint(0, n_classes-1) # randint: [a, b]
        try:
            contexts = contexts_li[label]  # list of lists of lists
        except IndexError:
            continue
        if len(contexts) == 0: continue
        ctx = random.choice(contexts)  # list of lists
        # The upper bound exceeds len(ctx) by one; slicing clamps, so that
        # draw simply keeps the whole context.
        history_span = random.randint(1, len(ctx) + 1)
        history = ctx[-history_span:]  # lists

        history = list(chain.from_iterable(history))  # list
        # Keep the most recent ids. `history[-0:]` would keep everything,
        # hence the explicit empty case when no room is left.
        history = history[-n_keep:] if n_keep else []
        history = [101] + history + [102]  # 101: [CLS], 102: [SEP]
        x.append(history)
        y.append(label)
        maxlen = max(maxlen, len(history))

    x = pad(x, maxlen)
    x = torch.LongTensor(x)
    y = torch.LongTensor(y)
    return x, y
50 | 
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/msg_reply/046f6308785d8e65d7ae429964df40a001a9675d/demo.gif


--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Download and extract OpenSubtitles 2018 en-es parallel data"
 4 | echo "to opensubtitles2018"
 5 | wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/en-es.txt.zip -O temp.zip;
 6 | unzip temp.zip -d opensubtitles2018/;
 7 | rm temp.zip
 8 | 
 9 | echo "Download Cornell Movie Dialogue Corpus"
10 | wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip -O temp.zip;
11 | unzip temp.zip;
12 | rm temp.zip
13 | 
14 | 


--------------------------------------------------------------------------------
/encode.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Generate data/cornell.txt that encodes cornell corpus.
 3 | It looks like this:
 4 | [sg_id] [text] [encoding]
 5 | 0	You makin' any headway?	2017 5003 4939 1005 2151 2132 4576 1029 1064
 6 | 0	She kissed me.	2016 4782 2033 1012 1064
 7 | 200020	Where?	2073 1029 1064
 8 | 
 9 | '''
10 | 
11 | import re, os
12 | import pickle
13 | from hparams import hp
14 | from pytorch_pretrained_bert import BertTokenizer
15 | from nltk.tokenize import sent_tokenize
16 | from tqdm import tqdm
17 | import codecs
18 | 
19 | 
def refine(text):
    '''Lowercases `text` and removes every character other than ASCII
    letters, spaces and "|".
    '''
    text = text.lower()
    # Raw string: "\|" in a plain literal is an invalid escape sequence.
    # The compiled pattern is unchanged.
    text = re.sub(r"[^ A-Za-z\|]", "", text)
    return text
24 | 
def get_utterances(line):
    '''Extracts the ids listed between the first pair of square brackets.

    e.g. "... +++$+++ ['L194', 'L195']" -> ["L194", "L195"]

    Quotes and commas are dropped and the rest is split on whitespace.
    The line is echoed to stdout when it yields fewer than two ids.
    Raises AttributeError if `line` has no non-empty bracketed part.
    '''
    # Raw string: "\[" and "\]" are invalid escapes in a plain literal.
    bracketed = re.search(r"\[(.+?)\]", line).group(1)
    bracketed = re.sub("[',]", "", bracketed)
    utts = bracketed.split()
    if len(utts) < 2:
        print(line)
    return utts
32 | 
33 | 
if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # Load the phrase -> synonym group id dictionary.
    # NOTE: unpickling can execute arbitrary code; only load files you
    # produced yourself.
    with open(hp.phr2sg_id, 'rb') as fin:
        phr2sg_id = pickle.load(fin)

    # Load Cornell movie corpus
    convs = os.path.join(hp.corpus, "movie_conversations.txt")
    lines = os.path.join(hp.corpus, "movie_lines.txt")

    with codecs.open(convs, 'r', "utf-8") as fin:
        indices_li = [get_utterances(line) for line in fin.read().splitlines()]  # list of lists

    # utterance id (first column) -> utterance text (last column)
    idx2utt = dict()
    with codecs.open(lines, 'r', "utf-8", errors="ignore") as fin:
        for line in fin.read().splitlines():
            cols = line.split("+++$+++")
            idx, utt = cols[0].strip(), cols[-1].strip()
            idx2utt[idx] = utt

    out_dir = os.path.dirname(hp.text)
    if out_dir:  # os.makedirs("") raises when the path has no directory part
        os.makedirs(out_dir, exist_ok=True)
    with open(hp.text, 'w') as fout:
        for indices in tqdm(indices_li):
            if len(indices) < 2:
                print(indices)
            utts = [idx2utt[idx] for idx in indices]

            # Drop the whole conversation if any of its utterances is blank
            if any(len(utt.strip()) < 1 for utt in utts): continue

            for utt in utts:
                utt = utt.replace("\t", " ").replace("  ", " ")
                # Only the first sentence is looked up; unknown phrases get id 0
                utt0 = sent_tokenize(utt)[0]
                utt0 = refine(utt0)
                sg_id = phr2sg_id.get(utt0, 0)

                tokens = tokenizer.tokenize(utt)[:512-1] # 512: max length of bert
                if len(tokens) == 0: continue
                tokens += ["|"]  # utterance delimiter
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids = " ".join(str(idx) for idx in ids)

                # save
                fout.write(f"{sg_id}\t{utt}\t{ids}\n")
            fout.write("\n")  # blank line after every conversation
81 | 
82 | 


--------------------------------------------------------------------------------
/hparams.py:
--------------------------------------------------------------------------------
 1 | class Hparams:
 2 |     # construct_sg
 3 |     opus_en = "opensubtitles2018/OpenSubtitles.en-es.en"
 4 |     opus_es = "opensubtitles2018/OpenSubtitles.en-es.es"
 5 |     sg = "data/sg.en.es.json"
 6 | 
 7 |     # make_phr2sg_id
 8 |     min_cnt = 5 # a phrase whose count is 5 or more is included
 9 |     n_phrs = 10000 # number of phrases
10 |     phr2sg_id = "data/phr2sg_id.pkl"
11 |     sg_id2phr = "data/sg_id2phr.pkl"
12 | 
13 |     # encode
14 |     corpus = "cornell movie-dialogs corpus"
15 |     text = "data/cornell.txt"
16 | 
17 |     # prepro
18 |     pkl_train = 'data/train.pkl'
19 |     pkl_dev = 'data/dev.pkl'
20 |     n_classes = 100
21 |     phr2idx = "data/phr2idx.pkl"
22 |     idx2phr = "data/idx2phr.pkl"
23 | 
24 |     # train
25 |     batch_size = 32*8  # 8 GPUs
26 |     lr = 2e-5
27 |     logdir = 'log'
28 |     vocab_size = 28996
29 |     max_span = 128 # maximum token length for context
30 |     n_train_steps = 10000
31 | 
32 |     # also test
33 |     n_candidates = 5
34 | 
35 | hp = Hparams()


--------------------------------------------------------------------------------
/make_phr2sg_id.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Make two dictionaries: phr2sg_id and sg_id2phr
 3 | 
 4 | phr2sg_id["nice work"]==6152
 5 | phr2sg_id["nicely done"]==6152
 6 | phr2sg_id["nice going"]==6152
 7 | sg_id2phr[6152]=="Well done."
 8 | 
 9 | '''
10 | 
11 | 
12 | import json, os
13 | import operator
14 | import pickle
15 | from hparams import hp
16 | import re
17 | from tqdm import tqdm
18 | 
def refine(text):
    '''Returns `text` lowercased, with everything except ASCII letters
    and spaces removed.
    '''
    lowered = text.lower()
    return re.sub("[^ A-Za-z]", "", lowered)
23 | 
if __name__ == "__main__":
    print("Determine the most frequent Synonym Groups")
    with open(hp.sg) as fin:
        data = json.load(fin)

    # Score every group by the summed count of its frequent-enough phrases
    sg_id2cnt = dict()
    for sg_id, sg in tqdm(data.items()):
        phrs = sg["phrases"] # [['i am mormon', 1], ["i'm a mormon", 1]]
        sg_id2cnt[int(sg_id)] = sum(cnt for _, cnt in phrs if cnt >= hp.min_cnt)

    sg_id_cnt = sorted(sg_id2cnt.items(), key=operator.itemgetter(1), reverse=True)
    sg_ids = [sg_id for sg_id, _ in sg_id_cnt][:hp.n_phrs]

    print("Determine the group of phrases")
    sg_id2phr = dict()
    phr2sg_id, phr2cnt = dict(), dict()
    for sg_id in tqdm(sg_ids):
        sg = data[str(sg_id)]
        phrs = sg["phrases"]  # [['i am mormon', 1], ["i'm a mormon", 1]]

        sg_id2phr[sg_id] = phrs[0][0]  # first listed phrase represents the group
        for phr, cnt in phrs:
            if cnt < hp.min_cnt: continue
            phr = refine(phr)
            # Map a phrase to the group where it occurs most often: assign on
            # first sighting, overwrite only when this group's count is
            # strictly higher. (Previously both branches overwrote, so the
            # last group seen always won.)
            if phr not in phr2cnt or cnt > phr2cnt[phr]:
                phr2cnt[phr] = cnt
                phr2sg_id[phr] = sg_id

    print("save")
    for path in (hp.phr2sg_id, hp.sg_id2phr):
        out_dir = os.path.dirname(path)
        if out_dir:  # os.makedirs("") raises when the path has no directory part
            os.makedirs(out_dir, exist_ok=True)
    with open(hp.phr2sg_id, 'wb') as fout:
        pickle.dump(phr2sg_id, fout)
    with open(hp.sg_id2phr, 'wb') as fout:
        pickle.dump(sg_id2phr, fout)


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from pytorch_pretrained_bert import BertForSequenceClassification
 4 | from hparams import hp
 5 | 
 6 | class Net(nn.Module):
 7 |     def __init__(self, n_classes):
 8 |         super().__init__()
 9 |         self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased',
10 |                                                                   num_labels=n_classes)
11 |         self.softmax = nn.Softmax(-1)
12 | 
13 |     def forward(self, x):
14 |         '''
15 |         x: (N, T). int64
16 | 
17 |         Returns
18 |         logits: (N, n_classes)
19 |         y_hat: (N, n_candidates)
20 |         y_hat_prob: (N, n_candidates)
21 | 
22 |         '''
23 |         if self.training:
24 |             self.bert.train()
25 |             logits = self.bert(x)
26 |         else:
27 |             self.bert.eval()
28 |             with torch.no_grad():
29 |                 logits = self.bert(x)
30 | 
31 |         activated = self.softmax(logits)
32 |         y_hat_prob, y_hat = activated.sort(-1, descending=True)
33 |         y_hat_prob = y_hat_prob[:, :hp.n_candidates]
34 |         y_hat = y_hat[:, :hp.n_candidates]
35 | 
36 |         return logits, y_hat, y_hat_prob
37 | 
38 | 


--------------------------------------------------------------------------------
/prepro.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Make phr2idx, idx2phr, and {train|dev}.pkl
  3 | 
  4 | idx2phr:
  5 | {0: 'Yes!',
  6 |  1: 'Good answer.',
  7 |  2: 'What?',
  8 |  3: 'Good.',
  9 |  4: 'Of course I do.',
 10 |  5: "I don't know.",
 11 |  6: 'What for?',
 12 |  7: 'Oh!',
 13 |  8: 'Thank you.',
 14 |  9: 'Hello?',
 15 |  10: 'Right.',
 16 |  11: 'I know.',
 17 |  12: "What's wrong?",
 18 |  13: 'Really?',
 19 |  14: "Oh, I'm sorry.",
 20 |  15: 'Oh, yes!',
 21 |  16: 'Well...',
 22 |  17: 'Yes, sir?',
 23 |  18: 'Nothing.',
 24 |  19: 'Hi!',
 25 |  20: 'Huh!',
 26 |  21: 'Why not?',
 27 |  22: '10.',
 28 |  23: 'Who?',
 29 |  24: 'Stop it.',
 30 |  25: 'Shit!',
 31 |  26: 'What do you mean?',
 32 |  27: 'Aha.',
 33 |  28: 'Yes.',
 34 |  29: 'Come on!',
 35 |  30: 'Shut up!',
 36 |  31: 'What the hell are you talking about?',
 37 |  32: 'So.',
 38 |  33: 'Excuse me...',
 39 |  34: 'Which one?',
 40 |  35: 'What are you doing?',
 41 |  36: 'Where?',
 42 |  37: 'Oh, I see.',
 43 |  38: 'I beg you!',
 44 |  39: 'Me!',
 45 |  40: 'What happened?',
 46 |  41: 'Great!',
 47 |  42: 'Oh, no.',
 48 |  43: 'Jesus!',
 49 |  44: 'Maybe.',
 50 |  45: 'This is it.',
 51 |  46: 'Excuse me!',
 52 |  47: 'No.',
 53 |  48: 'I do.',
 54 |  49: 'Wait?',
 55 |  50: 'How?',
 56 |  51: 'No, thank you.',
 57 |  52: 'Forget it.',
 58 |  53: 'Just like me.',
 59 |  54: "I don't think so.",
 60 |  55: 'I...',
 61 |  56: 'We will.',
 62 |  57: 'Nonsense.',
 63 |  58: 'No, no',
 64 |  59: 'Oh, my God.',
 65 |  60: 'What is this?',
 66 |  61: 'Look!',
 67 |  62: "Can't I?",
 68 |  63: 'No, sir.',
 69 |  64: 'Here...',
 70 |  65: "I'm fine.",
 71 |  66: 'All right?',
 72 |  67: "I don't understand!",
 73 |  68: 'What do you want?',
 74 |  69: 'Wait a minute!',
 75 |  70: 'You!',
 76 |  71: 'How wonderful!',
 77 |  72: 'OK!',
 78 |  73: 'When was it?',
 79 |  74: 'All in order.',
 80 |  75: 'Did I?',
 81 |  76: 'I got it.',
 82 |  77: 'Nope.',
 83 |  78: 'Mmm?',
 84 |  79: 'Sir',
 85 |  80: 'Not a chance.',
 86 |  81: 'Who are you?',
 87 |  82: 'Good night...',
 88 |  83: 'Die!',
 89 |  84: 'What do you think?',
 90 |  85: 'Not exactly.',
 91 |  86: 'Where are you going?',
 92 |  87: 'Are you all right?',
 93 |  88: "I'm...",
 94 |  89: 'Like what?',
 95 |  90: 'I can imagine.',
 96 |  91: "Don't be afraid.",
 97 |  92: 'Huh?',
 98 |  93: 'Of course.',
 99 |  94: 'Bye!',
100 |  95: 'Yeah.',
101 |  96: 'Of course not!',
102 |  97: 'I got it.',
103 |  98: "No, it's not true.",
104 |  99: 'What does that mean?'}
105 | 
106 | '''
107 | 
108 | 
109 | from hparams import hp
110 | import pickle, os
111 | from tqdm import tqdm
112 | from collections import Counter
113 | 
def get_most_frequent_sgs(fin, n_classes):
    '''
    Select the `n_classes` most frequent sg ids found in `fin`.

    fin: path of a text file. The first tab-separated column of every
         non-blank line must be an integer sg id.
    n_classes: maximum number of sg ids to keep.

    Returns
    idx2sg_id: dict. class index (0 = most frequent) -> sg id
    sg_id2idx: dict. sg id -> class index

    Raises
    ValueError: if the first column of a non-blank line is not an integer.
    '''
    sg_id2cnt = Counter()
    # The context manager closes the file even if a line fails to parse.
    with open(fin, 'r') as f:
        for line in f:
            if len(line) <= 1:  # blank line between entries
                continue
            sg_id = int(line.split("\t", 1)[0])
            if sg_id != 0:  # 0: non-sg
                sg_id2cnt[sg_id] += 1

    # most_common() ranks by count; ties keep first-seen order.
    ranked = [sg_id for sg_id, _ in sg_id2cnt.most_common(n_classes)]
    idx2sg_id = dict(enumerate(ranked))
    sg_id2idx = {sg_id: idx for idx, sg_id in idx2sg_id.items()}
    return idx2sg_id, sg_id2idx
127 | 
def prepro(fin, pkl_train, pkl_dev, n_classes, sg_id2idx):
    '''
    Collect the dialogue contexts that precede each selected sg and pickle
    them as train/dev splits.

    fin: path of a text file. Entries are separated by a blank line and
         every line of an entry is `sg_id<TAB>sentence<TAB>ids`, where
         `ids` is a whitespace-separated list of integers.
    pkl_train: output path of the pickled training contexts.
    pkl_dev: output path of the pickled dev contexts.
    n_classes: number of classes, i.e. length of both pickled lists.
    sg_id2idx: dict. sg id -> class index in [0, n_classes).

    Both pickles hold a list with one item per class; each item is a list
    of contexts, and a context is the list of id lists of all lines that
    precede the sg line inside its entry,
    e.g. [ [3, 4, 5], [23, 9, 4, 5] ].

    Raises
    ValueError: if a non-first line of an entry does not have exactly
        three tab-separated columns, or a value is not an integer.
    '''
    contexts_li = [[] for _ in range(n_classes)]

    # Read everything up front and release the handle immediately.
    with open(fin, 'r') as f:
        entries = f.read().split("\n\n")

    for entry in tqdm(entries):
        lines = entry.splitlines()
        for i, line in enumerate(lines):
            if i == 0:
                continue  # the first line of an entry has no preceding context
            sg_id, _sent, _ids = line.strip().split("\t")
            sg_id = int(sg_id)
            if sg_id not in sg_id2idx:
                continue
            # Fresh lists per context so that contexts never share objects.
            ctx = [
                [int(tok) for tok in prev.strip().split("\t")[-1].split()]
                for prev in lines[:i]
            ]
            contexts_li[sg_id2idx[sg_id]].append(ctx)

    train, dev = [], []
    for contexts in contexts_li:
        if len(contexts) > 1:
            # Hold out the first context of the class for dev.
            train.append(contexts[1:])
            dev.append(contexts[:1])
        else:
            # Too few samples to split: everything goes to train.
            train.append(contexts)
            dev.append([])

    with open(pkl_train, 'wb') as f:
        pickle.dump(train, f)
    with open(pkl_dev, 'wb') as f:
        pickle.dump(dev, f)
    print("done")
159 | 
if __name__ == "__main__":
    # Create the output directories. `or "."` covers paths without a
    # directory part, for which os.path.dirname() returns "".
    for out_path in (hp.pkl_train, hp.pkl_dev):
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    idx2sg_id, sg_id2idx = get_most_frequent_sgs(hp.text, hp.n_classes)

    with open(hp.phr2sg_id, 'rb') as f:
        phr2sg_id = pickle.load(f)
    with open(hp.sg_id2phr, 'rb') as f:
        sg_id2phr = pickle.load(f)

    # Re-key both phrase dictionaries from sg ids to class indices, keeping
    # only the sgs that were selected as classes.
    phr2idx = {
        phr: sg_id2idx[sg_id]
        for phr, sg_id in phr2sg_id.items()
        if sg_id in sg_id2idx
    }
    idx2phr = {
        idx: sg_id2phr[sg_id]
        for idx, sg_id in idx2sg_id.items()
        if sg_id in sg_id2phr
    }

    with open(hp.phr2idx, 'wb') as f:
        pickle.dump(phr2idx, f)
    with open(hp.idx2phr, 'wb') as f:
        pickle.dump(idx2phr, f)

    prepro(hp.text, hp.pkl_train, hp.pkl_dev, hp.n_classes, sg_id2idx)
    print("DONE")


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | from hparams import hp
 2 | import torch
 3 | from model import Net
 4 | from pytorch_pretrained_bert import BertTokenizer
 5 | from collections import OrderedDict
 6 | from colorama import Fore, Style
 7 | import pickle, re
 8 | 
 9 | import argparse
10 | 
def prepare_inputs(context, tokenizer):
    '''
    Turn a conversation string into a batch of one id sequence.

    context: str. e.g. "I love you. [SEP] Sorry, I hate you."
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns
    LongTensor of shape (1, T), moved to "cuda".
    '''
    wordpieces = tokenizer.tokenize(context)
    ids = tokenizer.convert_tokens_to_ids(wordpieces)

    # Keep only the most recent ids so that, with the two ids added below,
    # the sequence length does not exceed hp.max_span.
    ids = ids[-hp.max_span+2:]

    # 101 / 102 are presumably the [CLS] / [SEP] ids of the
    # bert-base-uncased vocabulary -- TODO confirm.
    ids = [101, *ids, 102]

    batch = torch.LongTensor(ids).unsqueeze(0)  # (1, T)
    return batch.to("cuda")
23 | 
def suggest(context, tokenizer, model, idx2phr):
    '''
    Print the model's candidate phrases for `context`, followed by their
    probabilities rounded to two decimals.

    context: str. conversation so far.
    tokenizer: tokenizer forwarded to `prepare_inputs`.
    model: callable returning (logits, y_hat, y_hat_prob) for a (1, T) input.
    idx2phr: dict. class index -> phrase. Missing indices print as "None".
    '''
    x = prepare_inputs(context, tokenizer)
    model.eval()
    with torch.no_grad():
        _, top_idx, top_prob = model(x)
        top_idx = top_idx.cpu().numpy().flatten()
        top_prob = top_prob.cpu().numpy().flatten()

    rounded = [round(p, 2) for p in top_prob]
    phrases = " | ".join(idx2phr.get(i, "None") for i in top_idx)

    print(f"{Fore.RED}{phrases}{Style.RESET_ALL}")
    print(f"{Fore.GREEN}{rounded}{Style.RESET_ALL}")
36 | 
37 | 
38 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt", type=str, required=True,
                        help="checkpoint file path")
    args = parser.parse_args()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    print("Wait... loading model")
    model = Net(hp.n_classes)
    model = model.cuda()

    state_dict = torch.load(args.ckpt)
    # Drop the "module." that nn.DataParallel adds to parameter names, so the
    # keys match the bare Net built above.
    state_dict = OrderedDict(
        (key.replace("module.", ""), value) for key, value in state_dict.items()
    )
    model.load_state_dict(state_dict)
    print("Model loaded.")

    print("# loading dictionaries ..")
    with open(hp.idx2phr, 'rb') as f:
        idx2phr = pickle.load(f)

    context = ""
    print("Let's start a conversation. If you want to start a new one, please press Enter.")

    # Speakers A and B alternate. An empty line clears the context and hands
    # the turn back to A.
    speaker = "A"
    while True:
        line = input(f"{speaker}:")
        if line == "":
            context = ""
            print("NEW CONVERSATION---")
            speaker = "A"
            continue

        context += line + " | "
        suggest(context, tokenizer, model, idx2phr)
        speaker = "B" if speaker == "A" else "A"
86 | 
87 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.optim as optim
  4 | from data_load import get_batch
  5 | from hparams import  hp
  6 | from model import Net
  7 | from tqdm import tqdm
  8 | import os
  9 | import random
 10 | from pytorch_pretrained_bert import BertTokenizer
 11 | import pickle
 12 | 
 13 | def train_and_eval(model, optimizer, criterion, ids2tokens, idx2phr):
 14 |     model.train()
 15 |     for step in tqdm(range(hp.n_train_steps+1)):
 16 |         x, y = get_batch(hp.max_span, hp.batch_size, hp.n_classes, True)
 17 |         x = x.cuda()
 18 |         y = y.cuda()
 19 | 
 20 |         optimizer.zero_grad()
 21 | 
 22 |         logits, y_hat, _ = model(x) # logits: (N, classes), y_hat: (N,)
 23 | 
 24 |         loss = criterion(logits, y)
 25 |         loss.backward()
 26 | 
 27 |         optimizer.step()
 28 | 
 29 |         # evaluation
 30 |         if step and step%500==0: # monitoring
 31 |             eval(model, f'{hp.logdir}/{step}', ids2tokens, idx2phr)
 32 |             print(f"step: {step}, loss: {loss.item()}")
 33 |             model.train()
 34 | 
 35 | def eval(model, f, ids2tokens, idx2phr):
 36 |     model.eval()
 37 | 
 38 |     Y, Y_hat = [], []
 39 |     with torch.no_grad():
 40 |         x, y  = get_batch(hp.max_span, hp.batch_size, hp.n_classes, False)
 41 |         x = x.cuda()
 42 | 
 43 |         _, y_hat, _ = model(x)  # y_hat: (N, n_candidates)
 44 | 
 45 |         x = x.cpu().numpy().tolist()
 46 |         y = y.cpu().numpy().tolist()
 47 |         y_hat = y_hat.cpu().numpy().tolist()
 48 | 
 49 |         Y.extend(y)
 50 |         Y_hat.extend(y_hat)
 51 | 
 52 |         # monitoring
 53 |         pointer = random.randint(0, len(x)-1)
 54 |         xx, yy, yy_hat = x[pointer], y[pointer], y_hat[pointer] # one sample
 55 | 
 56 |         tokens = ids2tokens(xx) # this is a function.
 57 |         ctx = " ".join(tokens).replace(" ##", "").split("[PAD]")[0] # bert detokenization
 58 |         gt = idx2phr[yy] # this is a dict.
 59 |         ht = " | ".join(idx2phr[each] for each in yy_hat)
 60 | 
 61 |         print(f"context: {ctx}")
 62 |         print(f"ground truth: {gt}")
 63 |         print(f"predictions: {ht}")
 64 | 
 65 |     # calc acc.
 66 |     n_samples = len(Y)
 67 |     n_correct = 0
 68 |     for y, y_hat in zip(Y, Y_hat):
 69 |         if y in y_hat:
 70 |             n_correct += 1
 71 |     acc = n_correct / n_samples
 72 |     print(f"acc@{hp.n_candidates}: %.2f"%acc)
 73 | 
 74 |     acc = str(round(acc, 2))
 75 | 
 76 |     torch.save(model.state_dict(), f"{f}_ACC{acc}.pt")
 77 | 
 78 | 
 79 | if __name__=="__main__":
 80 |     os.makedirs(hp.logdir, exist_ok=True)
 81 | 
 82 |     print("==== Load tokenizer")
 83 |     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
 84 |     ids2tokens = tokenizer.convert_ids_to_tokens
 85 | 
 86 |     print("==== Load dictionaries")
 87 |     idx2phr = pickle.load(open(hp.idx2phr, 'rb'))
 88 | 
 89 |     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 90 | 
 91 |     print("==== Building model")
 92 |     model = Net(hp.n_classes)
 93 |     model = model.to(device)
 94 |     model = nn.DataParallel(model)
 95 | 
 96 |     optimizer = optim.Adam(model.parameters(), lr=hp.lr)
 97 |     criterion = nn.CrossEntropyLoss()
 98 | 
 99 |     train_and_eval(model, optimizer, criterion, ids2tokens, idx2phr)
100 | 
101 | 
102 | 


--------------------------------------------------------------------------------