├── LICENSE ├── MANIFEST.in ├── README.md ├── chars2vec ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-39.pyc │ ├── model.cpython-310.pyc │ └── model.cpython-39.pyc ├── model.py └── trained_models │ ├── eng_100 │ ├── model.pkl │ └── weights.h5 │ ├── eng_150 │ ├── model.pkl │ └── weights.h5 │ ├── eng_200 │ ├── model.pkl │ └── weights.h5 │ ├── eng_300 │ ├── model.pkl │ └── weights.h5 │ └── eng_50 │ ├── model.pkl │ └── weights.h5 ├── example_training.py ├── example_usage.py ├── requirements.txt ├── setup.cfg └── setup.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include chars2vec/ *
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # chars2vec
2 | 
3 | #### Character-based word embeddings model based on RNN
4 | 
5 | 
6 | The chars2vec library can be very useful if you are dealing with texts
7 | containing abbreviations, slang, typos, or any other specific textual dataset.
8 | The chars2vec language model is based on the symbolic representation of words –
9 | the model maps each word to a vector of fixed length.
10 | These vector representations are produced by a custom neural network
11 | that is trained on pairs of similar and non-similar words.
12 | This network includes an LSTM that reads the sequence of characters in each word.
13 | The model maps similarly spelled words to nearby vectors.
14 | This approach makes it possible to create an embedding for any sequence of characters.
15 | Chars2vec models do not keep a dictionary of embeddings,
16 | but generate embedding vectors on the fly from the pretrained model.
17 | 
18 | There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language.
19 | The library provides a convenient API to train a model for an arbitrary set of characters.
20 | Read more about the architecture in [Chars2vec:
21 | Character-based language model for handling real world texts with spelling
22 | errors and human slang](https://hackernoon.com/chars2vec-character-based-language-model-for-handling-real-world-texts-with-spelling-errors-and-a3e4053a147d) on Hacker Noon.
23 | 
24 | #### The model is available for Python 2.7 and 3.0+.
25 | 
26 | ### Installation
27 | 
28 | 
1. Build and install from source
29 | Download the project source and run in your command line:
30 | 
31 | ~~~shell
32 | >> python setup.py install
33 | ~~~
34 | 
35 | 
2. Via pip
36 | Run in your command line:
37 | 
38 | ~~~shell
39 | >> pip install chars2vec
40 | ~~~
41 | 
42 | ### Usage
43 | 
44 | Function `chars2vec.load_model(str path)` initializes the model from a directory
45 | and returns a `chars2vec.Chars2Vec` object.
46 | There are 5 pretrained English models with dimensions 50, 100, 150, 200 and 300.
47 | To load one of these pretrained models:
48 | 
49 | ~~~python
50 | import chars2vec
51 | 
52 | # Load Intuition Engineering pretrained model
53 | # Model names: 'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
54 | c2v_model = chars2vec.load_model('eng_50')
55 | ~~~
56 | Method `chars2vec.Chars2Vec.vectorize_words(words)` returns a `numpy.ndarray` of shape `(n_words, dim)` with word embeddings.
57 | 
58 | ~~~python
59 | words = ['list', 'of', 'words']
60 | 
61 | # Create word embeddings
62 | word_embeddings = c2v_model.vectorize_words(words)
63 | ~~~
64 | 
65 | ### Training
66 | 
67 | Function `chars2vec.train_model(int emb_dim, X_train, y_train, model_chars)`
68 | creates and trains a new chars2vec model and returns a `chars2vec.Chars2Vec` object.
69 | 
70 | Parameter `emb_dim` is the dimension of the embedding vectors.
71 | 
72 | Parameter `X_train` is a list or numpy.ndarray of word pairs.
73 | Parameter `y_train` is a list or numpy.ndarray of target values that describe the proximity of the words in each pair.
74 | 
75 | The training set (`X_train`, `y_train`) consists of pairs of "similar" and "not similar" words;
76 | a pair of "similar" words is labeled with a target value of 0, and a pair of "not similar" words with 1.
77 | 
78 | Parameter `model_chars` is the list of characters used by the model.
79 | Characters that are not in the `model_chars`
80 | list will be ignored by the model.
81 | 
82 | Read more about chars2vec training and the generation of a training dataset in the
83 | [article about chars2vec](https://hackernoon.com/chars2vec-character-based-language-model-for-handling-real-world-texts-with-spelling-errors-and-a3e4053a147d).
84 | 
85 | Function `chars2vec.save_model(c2v_model, str path_to_model)` saves the trained model
86 | to a directory.
87 | 
88 | 
89 | ~~~python
90 | import chars2vec
91 | 
92 | dim = 50
93 | path_to_model = 'path/to/model/directory'
94 | 
95 | X_train = [('mecbanizing', 'mechanizing'),        # similar words, target equals 0
96 |            ('dicovery', 'dis7overy'),             # similar words, target equals 0
97 |            ('prot$oplasmatic', 'prtoplasmatic'),  # similar words, target equals 0
98 |            ('copulateng', 'lzateful'),            # not similar words, target equals 1
99 |            ('estry', 'evadin6'),                  # not similar words, target equals 1
100 |            ('cirrfosis', 'afear')                 # not similar words, target equals 1
101 |            ]
102 | 
103 | y_train = [0, 0, 0, 1, 1, 1]
104 | 
105 | model_chars = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
106 |                '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<',
107 |                '=', '>', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
108 |                'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
109 |                'x', 'y', 'z']
110 | 
111 | # Create and train a chars2vec model on the given training data
112 | my_c2v_model = chars2vec.train_model(dim, X_train, y_train, model_chars)
113 | 
114 | # Save the trained model
115 | chars2vec.save_model(my_c2v_model, path_to_model)
116 | 
117 | # Load the trained model
118 | c2v_model = chars2vec.load_model(path_to_model)
119 | ~~~
120 | 
121 | See `example_usage.py` and `example_training.py` for full usage and
122 | training examples.
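Beyond those examples, a quick way to sanity-check the "similarly spelled words map to nearby vectors" property is to compare embedding distances directly with `numpy`. The sketch below is illustrative: the chosen words and the `cosine_similarity` helper are not part of the library, and the exact values depend on the pretrained model, but the misspelling is expected to score closer to the original word than an unrelated word.

~~~python
import chars2vec
import numpy as np

c2v_model = chars2vec.load_model('eng_50')

# Embeddings for a word, a misspelling of it, and an unrelated word
emb = c2v_model.vectorize_words(['language', 'languagge', 'mechanizing'])

def cosine_similarity(a, b):
    # Cosine similarity between two embedding vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity(emb[0], emb[1]))  # 'language' vs its misspelling
print(cosine_similarity(emb[0], emb[2]))  # 'language' vs an unrelated word
~~~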
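The embeddings can also serve as features for downstream tools, for example fuzzy matching of misspelled queries against a reference vocabulary. The sketch below uses scikit-learn's `NearestNeighbors`; the vocabulary, the query words and this matching setup are illustrative assumptions rather than part of the chars2vec API (scikit-learn is also used by `example_usage.py` but is not listed in `requirements.txt`).

~~~python
import chars2vec
from sklearn.neighbors import NearestNeighbors

c2v_model = chars2vec.load_model('eng_50')

# Hypothetical reference vocabulary and noisy queries
vocabulary = ['natural', 'language', 'understanding']
queries = ['naturael', 'longuge', 'updderctundjing']

vocab_vectors = c2v_model.vectorize_words(vocabulary)
query_vectors = c2v_model.vectorize_words(queries)

# Index the vocabulary embeddings and find the closest entry for each query
nn = NearestNeighbors(n_neighbors=1).fit(vocab_vectors)
_, indices = nn.kneighbors(query_vectors)

for query, idx in zip(queries, indices[:, 0]):
    print(query, '->', vocabulary[idx])
~~~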
123 | -------------------------------------------------------------------------------- /chars2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import * -------------------------------------------------------------------------------- /chars2vec/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /chars2vec/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /chars2vec/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /chars2vec/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /chars2vec/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import tensorflow as tf 4 | import os 5 | 6 | class Chars2Vec: 7 | 8 | def __init__(self, emb_dim, char_to_ix): 9 | ''' 10 | Creates chars2vec model. 11 | 12 | :param emb_dim: int, dimension of embeddings. 13 | :param char_to_ix: dict, keys are characters, values are sequence numbers of characters. 
14 | ''' 15 | 16 | if not isinstance(emb_dim, int) or emb_dim < 1: 17 | raise TypeError("parameter 'emb_dim' must be a positive integer") 18 | 19 | if not isinstance(char_to_ix, dict): 20 | raise TypeError("parameter 'char_to_ix' must be a dictionary") 21 | 22 | self.char_to_ix = char_to_ix 23 | self.ix_to_char = {char_to_ix[ch]: ch for ch in char_to_ix} 24 | self.vocab_size = len(self.char_to_ix) 25 | self.dim = emb_dim 26 | self.cache = {} 27 | 28 | lstm_input = tf.keras.layers.Input(shape=(None, self.vocab_size)) 29 | 30 | x = tf.keras.layers.LSTM(emb_dim, return_sequences=True)(lstm_input) 31 | x = tf.keras.layers.LSTM(emb_dim)(x) 32 | 33 | self.embedding_model = tf.keras.models.Model(inputs=[lstm_input], outputs=x) 34 | 35 | model_input_1 = tf.keras.layers.Input(shape=(None, self.vocab_size)) 36 | model_input_2 = tf.keras.layers.Input(shape=(None, self.vocab_size)) 37 | 38 | embedding_1 = self.embedding_model(model_input_1) 39 | embedding_2 = self.embedding_model(model_input_2) 40 | x = tf.keras.layers.Subtract()([embedding_1, embedding_2]) 41 | x = tf.keras.layers.Dot(1)([x, x]) 42 | model_output = tf.keras.layers.Dense(1, activation='sigmoid')(x) 43 | 44 | self.model = tf.keras.models.Model(inputs=[model_input_1, model_input_2], outputs=model_output) 45 | self.model.compile(optimizer='adam', loss='mae') 46 | 47 | def fit(self, word_pairs, targets, 48 | max_epochs, patience, validation_split, batch_size): 49 | ''' 50 | Fits model. 51 | 52 | :param word_pairs: list or numpy.ndarray of word pairs. 53 | :param targets: list or numpy.ndarray of targets. 54 | :param max_epochs: parameter 'epochs' of tensorflow model. 55 | :param patience: parameter 'patience' of callback in tensorflow model. 56 | :param validation_split: parameter 'validation_split' of tensorflow model. 
57 |         '''
58 | 
59 |         # Validate the inputs before converting them to numpy arrays
60 |         if not isinstance(word_pairs, list) and not isinstance(word_pairs, np.ndarray):
61 |             raise TypeError("parameter 'word_pairs' must be a list or numpy.ndarray")
62 | 
63 |         if not isinstance(targets, list) and not isinstance(targets, np.ndarray):
64 |             raise TypeError("parameter 'targets' must be a list or numpy.ndarray")
65 | 
66 |         word_pairs = np.array(word_pairs)
67 |         targets = np.array(targets)
68 | 
69 |         x_1, x_2 = [], []
70 | 
71 |         for pair_words in word_pairs:
72 |             emb_list_1 = []
73 |             emb_list_2 = []
74 | 
75 |             if not isinstance(pair_words[0], str) or not isinstance(pair_words[1], str):
76 |                 raise TypeError("word must be a string")
77 | 
78 |             first_word = pair_words[0].lower()
79 |             second_word = pair_words[1].lower()
80 | 
81 |             for t in range(len(first_word)):  # one-hot encode characters of the first word
82 | 
83 |                 if first_word[t] in self.char_to_ix:
84 |                     x = np.zeros(self.vocab_size)
85 |                     x[self.char_to_ix[first_word[t]]] = 1
86 |                     emb_list_1.append(x)
87 | 
88 |                 else:
89 |                     emb_list_1.append(np.zeros(self.vocab_size))
90 | 
91 |             x_1.append(np.array(emb_list_1))
92 | 
93 |             for t in range(len(second_word)):  # one-hot encode characters of the second word
94 | 
95 |                 if second_word[t] in self.char_to_ix:
96 |                     x = np.zeros(self.vocab_size)
97 |                     x[self.char_to_ix[second_word[t]]] = 1
98 |                     emb_list_2.append(x)
99 | 
100 |                 else:
101 |                     emb_list_2.append(np.zeros(self.vocab_size))
102 | 
103 |             x_2.append(np.array(emb_list_2))
104 | 
105 |         x_1_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(x_1)
106 |         x_2_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(x_2)
107 | 
108 |         self.model.fit([x_1_pad_seq, x_2_pad_seq], targets,
109 |                        batch_size=batch_size, epochs=max_epochs,
110 |                        validation_split=validation_split,
111 |                        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)])
112 | 
113 |     def vectorize_words(self, words, maxlen_padseq=None):
114 |         '''
115 |         Returns embeddings for a list of words. Uses a cache of word embeddings to speed up vectorization.
116 | 
117 |         :param words: list or numpy.ndarray of strings.
118 |         :param maxlen_padseq: parameter 'maxlen' for tensorflow pad_sequences transform.
119 | 
120 |         :return word_vectors: numpy.ndarray, word embeddings.
120 | ''' 121 | 122 | if not isinstance(words, list) and not isinstance(words, np.ndarray): 123 | raise TypeError("parameter 'words' must be a list or numpy.ndarray") 124 | 125 | words = [w.lower() for w in words] 126 | unique_words = np.unique(words) 127 | new_words = [w for w in unique_words if w not in self.cache] 128 | 129 | if len(new_words) > 0: 130 | 131 | list_of_embeddings = [] 132 | 133 | for current_word in new_words: 134 | 135 | if not isinstance(current_word, str): 136 | raise TypeError("word must be a string") 137 | 138 | current_embedding = [] 139 | 140 | for t in range(len(current_word)): 141 | 142 | if current_word[t] in self.char_to_ix: 143 | x = np.zeros(self.vocab_size) 144 | x[self.char_to_ix[current_word[t]]] = 1 145 | current_embedding.append(x) 146 | 147 | else: 148 | current_embedding.append(np.zeros(self.vocab_size)) 149 | 150 | list_of_embeddings.append(np.array(current_embedding)) 151 | 152 | embeddings_pad_seq = tf.keras.preprocessing.sequence.pad_sequences(list_of_embeddings, maxlen=maxlen_padseq) 153 | new_words_vectors = self.embedding_model(embeddings_pad_seq) 154 | 155 | for i in range(len(new_words)): 156 | self.cache[new_words[i]] = new_words_vectors[i] 157 | 158 | word_vectors = [self.cache[current_word] for current_word in words] 159 | 160 | return np.array(word_vectors) 161 | 162 | def save_model(c2v_model, path_to_model): 163 | ''' 164 | Saves trained model to directory. 165 | 166 | :param c2v_model: Chars2Vec object, trained model. 167 | :param path_to_model: str, path to save model. 168 | ''' 169 | 170 | if not os.path.exists(path_to_model): 171 | os.makedirs(path_to_model) 172 | 173 | c2v_model.embedding_model.save_weights(path_to_model + '/weights.h5') 174 | 175 | with open(path_to_model + '/model.pkl', 'wb') as f: 176 | pickle.dump([c2v_model.dim, c2v_model.char_to_ix], f, protocol=2) 177 | 178 | 179 | def load_model(path): 180 | ''' 181 | Loads trained model. 182 | 183 | :param path: str, if it is 'eng_50', 'eng_100', 'eng_150', 'eng_200' or 'eng_300' then loads one of default models, 184 | else loads model from `path`. 185 | 186 | :return c2v_model: Chars2Vec object, trained model. 187 | ''' 188 | 189 | if path in ['eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300']: 190 | path_to_model = os.path.dirname(os.path.abspath(__file__)) + '/trained_models/' + path 191 | 192 | else: 193 | path_to_model = path 194 | 195 | with open(path_to_model + '/model.pkl', 'rb') as f: 196 | structure = pickle.load(f) 197 | emb_dim, char_to_ix = structure[0], structure[1] 198 | 199 | c2v_model = Chars2Vec(emb_dim, char_to_ix) 200 | c2v_model.embedding_model.load_weights(path_to_model + '/weights.h5') 201 | c2v_model.embedding_model.compile(optimizer='adam', loss='mae') 202 | 203 | return c2v_model 204 | 205 | 206 | def train_model(emb_dim, X_train, y_train, model_chars, 207 | max_epochs=200, patience=10, validation_split=0.05, batch_size=64): 208 | ''' 209 | Creates and trains chars2vec model using given training data. 210 | 211 | :param emb_dim: int, dimension of embeddings. 212 | :param X_train: list or numpy.ndarray of word pairs. 213 | :param y_train: list or numpy.ndarray of target values that describe the proximity of words. 214 | :param model_chars: list or numpy.ndarray of basic chars in model. 215 | :param max_epochs: parameter 'epochs' of keras model. 216 | :param patience: parameter 'patience' of callback in keras model. 217 | :param validation_split: parameter 'validation_split' of keras model. 
218 | :param batch_size: parameter 'batch_size' of keras model. 219 | 220 | :return c2v_model: Chars2Vec object, trained model. 221 | ''' 222 | 223 | if not isinstance(X_train, list) and not isinstance(X_train, np.ndarray): 224 | raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")\ 225 | 226 | if not isinstance(y_train, list) and not isinstance(y_train, np.ndarray): 227 | raise TypeError("parameter 'y_train' must be a list or numpy.ndarray") 228 | 229 | if not isinstance(model_chars, list) and not isinstance(model_chars, np.ndarray): 230 | raise TypeError("parameter 'model_chars' must be a list or numpy.ndarray") 231 | 232 | char_to_ix = {ch: i for i, ch in enumerate(model_chars)} 233 | c2v_model = Chars2Vec(emb_dim, char_to_ix) 234 | 235 | targets = [float(el) for el in y_train] 236 | c2v_model.fit(X_train, targets, max_epochs, patience, validation_split, batch_size) 237 | 238 | return c2v_model 239 | -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_100/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_100/model.pkl -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_100/weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_100/weights.h5 -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_150/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_150/model.pkl -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_150/weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_150/weights.h5 -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_200/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_200/model.pkl -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_200/weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_200/weights.h5 -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_300/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_300/model.pkl -------------------------------------------------------------------------------- 
/chars2vec/trained_models/eng_300/weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_300/weights.h5 -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_50/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_50/model.pkl -------------------------------------------------------------------------------- /chars2vec/trained_models/eng_50/weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitionEngineeringTeam/chars2vec/fe56df29d57314a824c38ad19e82ae3c34df0862/chars2vec/trained_models/eng_50/weights.h5 -------------------------------------------------------------------------------- /example_training.py: -------------------------------------------------------------------------------- 1 | import chars2vec 2 | 3 | 4 | dim = 50 5 | 6 | path_to_model = 'path/to/model/directory' 7 | 8 | X_train = [('mecbanizing', 'mechanizing'), # similar words, target is equal 0 9 | ('dicovery', 'dis7overy'), # similar words, target is equal 0 10 | ('prot$oplasmatic', 'prtoplasmatic'), # similar words, target is equal 0 11 | ('copulateng', 'lzateful'), # not similar words, target is equal 1 12 | ('estry', 'evadin6'), # not similar words, target is equal 1 13 | ('cirrfosis', 'afear') # not similar words, target is equal 1 14 | ] 15 | 16 | y_train = [0, 0, 0, 1, 1, 1] 17 | 18 | model_chars = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', 19 | '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', 20 | '=', '>', '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 21 | 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 22 | 'x', 'y', 'z'] 23 | 24 | # Create and train chars2vec model using given training data 25 | my_c2v_model = chars2vec.train_model(dim, X_train, y_train, model_chars) 26 | 27 | # Save pretrained model 28 | chars2vec.save_model(my_c2v_model, path_to_model) 29 | 30 | words = ['list', 'of', 'words'] 31 | 32 | # Load pretrained model, create word embeddings 33 | c2v_model = chars2vec.load_model(path_to_model) 34 | word_embeddings = c2v_model.vectorize_words(words) 35 | -------------------------------------------------------------------------------- /example_usage.py: -------------------------------------------------------------------------------- 1 | import chars2vec 2 | import sklearn.decomposition 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | # Load Inutition Engineering pretrained model 7 | # Models names: 'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300' 8 | c2v_model = chars2vec.load_model('eng_50') 9 | 10 | words = ['Natural', 'Language', 'Understanding', 11 | 'Naturael', 'Longuge', 'Updderctundjing', 12 | 'Motural', 'Lamnguoge', 'Understaating', 13 | 'Naturrow', 'Laguage', 'Unddertandink', 14 | 'Nattural', 'Languagge', 'Umderstoneding'] 15 | 16 | # Create word embeddings 17 | word_embeddings = c2v_model.vectorize_words(words) 18 | 19 | # Project embeddings on plane using the PCA 20 | projection_2d = sklearn.decomposition.PCA(n_components=2).fit_transform(word_embeddings) 21 | 22 | # Draw words on plane 23 | f = plt.figure(figsize=(8, 6)) 24 | 
25 | for j in range(len(projection_2d)):
26 |     plt.scatter(projection_2d[j, 0], projection_2d[j, 1],
27 |                 marker=('$' + words[j] + '$'),
28 |                 s=500 * len(words[j]), label=j,
29 |                 facecolors='green' if words[j]
30 |                 in ['Natural', 'Language', 'Understanding'] else 'black')
31 | 
32 | plt.show()
33 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | tensorflow
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import subprocess
3 | PY_VER = sys.version[0]
4 | # Install requirements with the pip that matches the running Python major version
5 | subprocess.call(["pip{:} install -r requirements.txt".format(PY_VER)], shell=True)
6 | 
7 | from setuptools import setup
8 | 
9 | setup(
10 |     name='chars2vec',
11 |     version='0.1.7',
12 |     author='Vladimir Chikin',
13 |     author_email='v4@intuition.engineering',
14 |     packages=['chars2vec'],
15 |     include_package_data=True,
16 |     package_data={'chars2vec': ['trained_models/*']},
17 |     description='Character-based word embeddings model based on RNN',
18 |     maintainer='Intuition',
19 |     maintainer_email='dev@intuition.engineering',
20 |     url='https://github.com/IntuitionEngineeringTeam/chars2vec',
21 |     download_url='https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip',
22 |     license='Apache License 2.0',
23 |     long_description='The chars2vec library can be very useful if you are dealing with texts \
24 | containing abbreviations, slang, typos, or any other specific textual dataset. \
25 | The chars2vec language model is based on the symbolic representation of words – \
26 | the model maps each word to a vector of fixed length. \
27 | These vector representations are produced by a custom neural network \
28 | that is trained on pairs of similar and non-similar words. \
29 | This network includes an LSTM that reads the sequence of characters in each word. \
30 | The model maps similarly spelled words to nearby vectors. \
31 | This approach makes it possible to create an embedding for any sequence of characters. \
32 | Chars2vec models do not keep a dictionary of embeddings, \
33 | but generate embedding vectors on the fly from the pretrained model. \
34 | There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language. \
35 | The library provides a convenient API to train a model for an arbitrary set of characters.',
36 |     classifiers=['Programming Language :: Python :: 2.7',
37 |                  'Programming Language :: Python :: 3']
38 | )
--------------------------------------------------------------------------------