├── .github
    ├── FUNDING.yml
    └── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── custom.md
    │   └── feature_request.md
├── CODE_OF_CONDUCT.md
├── LICENSE.txt
├── README.md
├── datasets
    └── stack.csv
├── examples
    ├── .ipynb_checkpoints
    │   └── FewShot-checkpoint.ipynb
    ├── CosineClassifierDemo.ipynb
    ├── DLApproach.ipynb
    ├── DLFewShot.ipynb
    ├── FewShot.ipynb
    ├── KNNClassifierDemo.ipynb
    ├── datasets
    │   └── stack.csv
    └── sub.csv
├── fsText
    ├── CosineClassifier.py
    ├── KNNClassifier.py
    ├── RFClassifier.py
    ├── __init__.py
    └── __pycache__
    │   └── CosineClassifier.cpython-36.pyc
├── requirements.txt
├── resources
    ├── images
    │   ├── nlp_fs_4.png
    │   ├── nlp_fs_6.png
    │   ├── perf_1.png
    │   └── perf_2.png
    └── papers
    │   ├── DataAugmentation
    │       ├── 1804.08166.pdf
    │       └── 1901.11196.pdf
    │   └── FewShot
    │       ├── 1710.10280.pdf
    │       ├── 1804.02063.pdf
    │       └── 1908.08788.pdf
├── setup.cfg
└── setup.py


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Custom issue template
 3 | about: Describe this issue template's purpose here.
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at mael.fabien@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # fsText : Few-Shot Text Classification
  2 | 
  3 | <img alt="GitHub contributors" src="https://img.shields.io/github/contributors-anon/maelfabien/FewShotTextClassification.svg"> <img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/3.svg">
  4 | [![PyPI Version](https://img.shields.io/pypi/v/fsText.svg)](https://pypi.org/project/fsText/)
  5 | [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-v1.4%20adopted-ff69b4.svg)](.github/CODE_OF_CONDUCT.md)
  6 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](http://makeapullrequest.com)
  7 | 
  8 | 🚧 ! This library is currently a work in progress ! 🚧
  9 | 
 10 | *Use Case* : A user has a column of short texts (e.g user reviews) but the comments are not labeled. We ask the user to hand-label just a few texts of each class (i.e. few-shot), and provide a method that leverages pre-trained embeddings to generalize the classification to the whole dataset.
 11 | 
 12 | This library will gather several state-of-the-art techniques. We will present the concepts behind each algorithm and its implementation in the section below.
 13 | 
 14 | ## Table of Contents <!-- omit in toc -->
 15 | 
 16 | - [Installation](#Installation)
 17 |   - [With pip](#With-pip)
 18 |   - [From source](#From-source)
 19 | - [Implemented Models](#Models)
 20 | - [Getting started](#Getting-started)
 21 |   - [Preparing your data](#Preparing-your-data)
 22 |   - [Training models](#Training-models)
 23 |   - [Making predictions](#Making-predictions)
 24 | - [Notebook Examples](#Notebook-Examples)
 25 | - [Contributing](#Contributing)
 26 | - [References](#References)
 27 | - [LICENSE](#LICENSE)
 28 | - [Contacts and Contributors](#LICENSE)
 29 | 
 30 | ## Installation
 31 | 
 32 | ### With pip
 33 | 
 34 | ```shell
 35 | pip install fsText
 36 | ```
 37 | 
 38 | ### From source
 39 | 
 40 | ```shell
 41 | git clone https://github.com/maelfabien/fsText.git
 42 | cd fsText
 43 | pip install -e .
 44 | ```
 45 | 
 46 | ## Implemented Models
 47 | 
 48 | 
 49 | | Model              | Status               | Details | Reference Paper |
 50 | | ----------------- | --------------------| -------------------- | -------------------- |
 51 | | Word2Vec + Cosine Similarity  | ✅ | [Article](https://maelfabien.github.io/machinelearning/NLP_5/) | [Few-Shot Text Classification with Pre-Trained Word Embeddings and a Human in the Loop](https://arxiv.org/abs/1804.02063) |
 52 | | Word2Vec + Advanced Classifiers  | 🚧 | [Article](https://maelfabien.github.io/machinelearning/NLP_6/) | [Few-Shot Text Classification with Pre-Trained Word Embeddings and a Human in the Loop](https://arxiv.org/abs/1804.02063) |
 53 | | DistilBert + Advanced Classifier  | 🚧 | [Article](https://maelfabien.github.io/machinelearning/NLP_7/) | --- |
 54 | | Siamese Network | ❌ | [Article](https://data4thought.com/fewshot_learning_nlp.html) | --- |
 55 | | Fine-Tuning Pre-trained Bert | ❌ | --- | [Improving Few-shot Text Classification via Pretrained Language Representations](https://arxiv.org/abs/1908.08788) |
 56 | 
 57 | ## Getting started
 58 | 
 59 | ### Preparing your data
 60 | 
 61 | We offer a text pre-processing pipeline as well as data augmentation techniques. To use `fsText` you need to create a Pandas DataFrame with the following columns:
 62 | 
 63 | | Text              | Label               |
 64 | | ----------------- | --------------------|
 65 | | First short text  | Label of first text |
 66 | 
 67 | ### Training models
 68 | 
 69 | Fit the cosine classifier on your annotated texts:
 70 | 
 71 | ```python
 72 | from fsText.Classifier import CosineClassifier
 73 | 
 74 | clf = CosineClassifier()
 75 | clf.fit(X_train, y_train)
 76 | ```
 77 | 
 78 | We include an automated label encoding of `y_train` which can therefore take any form.
 79 | 
 80 | ### Making predictions
 81 | 
 82 | To get the prediction on the rest of your un-labeled texts:
 83 | 
 84 | ```python
 85 | clf.predict(X_test)
 86 | ```
 87 | 
 88 | To assess the accuracy of the prediction in case you have a labeled dataset:
 89 | 
 90 | ```python
 91 | from sklearn.metrics import accuracy_score
 92 | accuracy_score(clf.predict(X_test), y_test)
 93 | ```
 94 | 
 95 | ## Notebook Examples
 96 | 
 97 | We prepared some notebook examples under the [examples](examples) directory.
 98 | 
 99 | | Notebook | Description |
100 | | --- | --- |
101 | | [1] CosineClassifierDemo | A simple demonstration of fsText Cosine Classifier + Word2Vec |
102 | 
103 | ## Contributing
104 | 
105 | Read our [Contributing Guidelines](.github/CONTRIBUTING.md).
106 | 
107 | ## References
108 | 
109 | | Type                 | Title                                                                                                                                        | Author                                                                                 | Year |
110 | | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------- | ---- |
111 | | :newspaper: Paper    | [One-shot and few-shot learning of word embeddings](https://arxiv.org/abs/1710.10280)                 | Andrew K. Lampinen & James L. McClelland                                   | 2018 |
112 | | :newspaper: Paper    | [Few-Shot Text Classification with Pre-Trained Word Embeddings and a Human in the Loop](https://arxiv.org/abs/1804.02063)                | Katherine Bailey, Sunny Chopra                                  | 2018 |
113 | | :newspaper: Paper    | [Improving Few-shot Text Classification via Pretrained Language Representations](https://arxiv.org/abs/1908.08788)                | Ningyu Zhang, Zhanlin Sun, Shumin Deng, Jiaoyan Chen, Huajun Chen                                   | 2019 |
114 | 
115 | ## LICENSE
116 | 
117 | [Apache-2.0](LICENSE)
118 | 
119 | ## Contacts and contributors
120 | 
121 | <table><tr><td align="center">
122 | <td align="center">
123 | <a href="https://github.com/andrelmfarias"><img src="https://avatars3.githubusercontent.com/u/43521764?s=400&v=4" width="100px;" alt="mfix22"/>
124 | <br /><sub><b>andrelmfarias</b></sub>
125 | </a><br /><a href="https://github.com/maelfabien/fsTC/commits?author=andrelmfarias" title="Code">💻      </a></td>
126 | </td>
127 | <td align="center">
128 | <a href="https://github.com/mamrouch"><img src="https://avatars3.githubusercontent.com/u/29277719?s=400&v=4" width="100px;" alt="mfix22"/>
129 | <br /><sub><b>mamrouch</b></sub>
130 | </a><br /><a href="https://github.com/maelfabien/fsTC/commits?author=mamrouch" title="Code">💻      </a></td>
131 | </td>
132 | <td align="center">
133 | <a href="https://github.com/maelfabien"><img src="https://avatars0.githubusercontent.com/u/24256555?v=4" width="100px;" alt="mfix22"/>
134 | <br /><sub><b>maelfabien</b></sub>
135 | </a><br /><a href="https://github.com/maelfabien/fsTC/commits?author=maelfabien" title="Code">💻      </a></td>
136 | </td>
137 | </tr></table>
138 | 


--------------------------------------------------------------------------------
/examples/CosineClassifierDemo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Cosine Classifier - Example"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "ExecuteTime": {
 15 |      "end_time": "2019-09-13T09:01:58.669608Z",
 16 |      "start_time": "2019-09-13T09:01:55.888724Z"
 17 |     }
 18 |    },
 19 |    "outputs": [
 20 |     {
 21 |      "name": "stderr",
 22 |      "output_type": "stream",
 23 |      "text": [
 24 |       "/anaconda3/lib/python3.6/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.21.1) or chardet (2.3.0) doesn't match a supported version!\n",
 25 |       "  RequestsDependencyWarning)\n"
 26 |      ]
 27 |     }
 28 |    ],
 29 |    "source": [
 30 |     "from fsText.Classifier import CosineClassifier\n",
 31 |     "import pandas as pd"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {
 38 |     "ExecuteTime": {
 39 |      "end_time": "2019-09-13T09:04:28.679027Z",
 40 |      "start_time": "2019-09-13T09:02:24.448870Z"
 41 |     }
 42 |    },
 43 |    "outputs": [
 44 |     {
 45 |      "name": "stdout",
 46 |      "output_type": "stream",
 47 |      "text": [
 48 |       "Loading pre-trained Word2Vec model...\n"
 49 |      ]
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "clf = CosineClassifier()"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 5,
 59 |    "metadata": {
 60 |     "ExecuteTime": {
 61 |      "end_time": "2019-09-13T09:05:28.828852Z",
 62 |      "start_time": "2019-09-13T09:05:28.776887Z"
 63 |     }
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "import numpy as np\n",
 68 |     "\n",
 69 |     "df = pd.read_csv('datasets/stack.csv')\n",
 70 |     "\n",
 71 |     "def gen_sample(sample_size, num_classes):\n",
 72 |     "    \n",
 73 |     "    df_1 = df[(df[\"Label\"]<num_classes + 1)].reset_index().drop([\"index\"], axis=1).reset_index().drop([\"index\"], axis=1)\n",
 74 |     "    train = df_1[df_1[\"Label\"] == np.unique(df_1['Label'])[0]].sample(sample_size)\n",
 75 |     "    \n",
 76 |     "    train_index = train.index.tolist()\n",
 77 |     "    \n",
 78 |     "    for i in range(1,num_classes):\n",
 79 |     "        train_2 = df_1[df_1[\"Label\"] == np.unique(df_1['Label'])[i]].sample(sample_size)\n",
 80 |     "        train = pd.concat([train, train_2], axis=0)\n",
 81 |     "        train_index.extend(train_2.index.tolist())\n",
 82 |     "        \n",
 83 |     "    test = df_1[~df_1.index.isin(train_index)]\n",
 84 |     "\n",
 85 |     "    return train, test"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 6,
 91 |    "metadata": {
 92 |     "ExecuteTime": {
 93 |      "end_time": "2019-09-13T09:05:29.967061Z",
 94 |      "start_time": "2019-09-13T09:05:29.915885Z"
 95 |     }
 96 |    },
 97 |    "outputs": [
 98 |     {
 99 |      "data": {
100 |       "text/html": [
101 |        "<div>\n",
102 |        "<style scoped>\n",
103 |        "    .dataframe tbody tr th:only-of-type {\n",
104 |        "        vertical-align: middle;\n",
105 |        "    }\n",
106 |        "\n",
107 |        "    .dataframe tbody tr th {\n",
108 |        "        vertical-align: top;\n",
109 |        "    }\n",
110 |        "\n",
111 |        "    .dataframe thead th {\n",
112 |        "        text-align: right;\n",
113 |        "    }\n",
114 |        "</style>\n",
115 |        "<table border=\"1\" class=\"dataframe\">\n",
116 |        "  <thead>\n",
117 |        "    <tr style=\"text-align: right;\">\n",
118 |        "      <th></th>\n",
119 |        "      <th>Text</th>\n",
120 |        "      <th>Label</th>\n",
121 |        "    </tr>\n",
122 |        "  </thead>\n",
123 |        "  <tbody>\n",
124 |        "    <tr>\n",
125 |        "      <th>1717</th>\n",
126 |        "      <td>Wordpress SEO Features</td>\n",
127 |        "      <td>1</td>\n",
128 |        "    </tr>\n",
129 |        "    <tr>\n",
130 |        "      <th>820</th>\n",
131 |        "      <td>How to use the jQuery Cycle Plugin with WordPr...</td>\n",
132 |        "      <td>1</td>\n",
133 |        "    </tr>\n",
134 |        "    <tr>\n",
135 |        "      <th>1905</th>\n",
136 |        "      <td>PHP/SQL/Wordpress: Group a user list by alphabet</td>\n",
137 |        "      <td>1</td>\n",
138 |        "    </tr>\n",
139 |        "    <tr>\n",
140 |        "      <th>1361</th>\n",
141 |        "      <td>How do I use underscore in a wordpress permalink</td>\n",
142 |        "      <td>1</td>\n",
143 |        "    </tr>\n",
144 |        "    <tr>\n",
145 |        "      <th>1123</th>\n",
146 |        "      <td>Wordpress register_activation_hook() + global ...</td>\n",
147 |        "      <td>1</td>\n",
148 |        "    </tr>\n",
149 |        "    <tr>\n",
150 |        "      <th>901</th>\n",
151 |        "      <td>How can I optimize a dynamic search query in O...</td>\n",
152 |        "      <td>2</td>\n",
153 |        "    </tr>\n",
154 |        "    <tr>\n",
155 |        "      <th>953</th>\n",
156 |        "      <td>How to handle line breaks in data for importin...</td>\n",
157 |        "      <td>2</td>\n",
158 |        "    </tr>\n",
159 |        "    <tr>\n",
160 |        "      <th>173</th>\n",
161 |        "      <td>Search All Fields In All Tables For A Specific...</td>\n",
162 |        "      <td>2</td>\n",
163 |        "    </tr>\n",
164 |        "    <tr>\n",
165 |        "      <th>592</th>\n",
166 |        "      <td>How do I close an OracleConnection in .NET</td>\n",
167 |        "      <td>2</td>\n",
168 |        "    </tr>\n",
169 |        "    <tr>\n",
170 |        "      <th>1155</th>\n",
171 |        "      <td>Oracle: How do I convert hex to decimal in Ora...</td>\n",
172 |        "      <td>2</td>\n",
173 |        "    </tr>\n",
174 |        "  </tbody>\n",
175 |        "</table>\n",
176 |        "</div>"
177 |       ],
178 |       "text/plain": [
179 |        "                                                   Text  Label\n",
180 |        "1717                             Wordpress SEO Features      1\n",
181 |        "820   How to use the jQuery Cycle Plugin with WordPr...      1\n",
182 |        "1905   PHP/SQL/Wordpress: Group a user list by alphabet      1\n",
183 |        "1361   How do I use underscore in a wordpress permalink      1\n",
184 |        "1123  Wordpress register_activation_hook() + global ...      1\n",
185 |        "901   How can I optimize a dynamic search query in O...      2\n",
186 |        "953   How to handle line breaks in data for importin...      2\n",
187 |        "173   Search All Fields In All Tables For A Specific...      2\n",
188 |        "592          How do I close an OracleConnection in .NET      2\n",
189 |        "1155  Oracle: How do I convert hex to decimal in Ora...      2"
190 |       ]
191 |      },
192 |      "execution_count": 6,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "train, test = gen_sample(5,2)\n",
199 |     "train"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 8,
205 |    "metadata": {
206 |     "ExecuteTime": {
207 |      "end_time": "2019-09-13T09:05:34.796066Z",
208 |      "start_time": "2019-09-13T09:05:34.784703Z"
209 |     }
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "clf.fit(train['Text'], train['Label'])"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 9,
219 |    "metadata": {
220 |     "ExecuteTime": {
221 |      "end_time": "2019-09-13T09:05:37.011190Z",
222 |      "start_time": "2019-09-13T09:05:36.508046Z"
223 |     }
224 |    },
225 |    "outputs": [
226 |     {
227 |      "data": {
228 |       "text/plain": [
229 |        "0.9035175879396985"
230 |       ]
231 |      },
232 |      "execution_count": 9,
233 |      "metadata": {},
234 |      "output_type": "execute_result"
235 |     }
236 |    ],
237 |    "source": [
238 |     "from sklearn.metrics import accuracy_score\n",
239 |     "accuracy_score(clf.predict(test['Text']), test['Label'])"
240 |    ]
241 |   }
242 |  ],
243 |  "metadata": {
244 |   "kernelspec": {
245 |    "display_name": "Python 3",
246 |    "language": "python",
247 |    "name": "python3"
248 |   },
249 |   "language_info": {
250 |    "codemirror_mode": {
251 |     "name": "ipython",
252 |     "version": 3
253 |    },
254 |    "file_extension": ".py",
255 |    "mimetype": "text/x-python",
256 |    "name": "python",
257 |    "nbconvert_exporter": "python",
258 |    "pygments_lexer": "ipython3",
259 |    "version": "3.6.5"
260 |   },
261 |   "latex_envs": {
262 |    "LaTeX_envs_menu_present": true,
263 |    "autoclose": false,
264 |    "autocomplete": true,
265 |    "bibliofile": "biblio.bib",
266 |    "cite_by": "apalike",
267 |    "current_citInitial": 1,
268 |    "eqLabelWithNumbers": true,
269 |    "eqNumInitial": 1,
270 |    "hotkeys": {
271 |     "equation": "Ctrl-E",
272 |     "itemize": "Ctrl-I"
273 |    },
274 |    "labels_anchors": false,
275 |    "latex_user_defs": false,
276 |    "report_style_numbering": false,
277 |    "user_envs_cfg": false
278 |   }
279 |  },
280 |  "nbformat": 4,
281 |  "nbformat_minor": 2
282 | }
283 | 


--------------------------------------------------------------------------------
/examples/DLFewShot.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "<center><h1 style=\"font-size:3em\"> Improving Few-shot Text Classification </h1></center>\n",
   8 |     "<br>\n",
   9 |     "<center><i style=\"font-size:1.3em\">via Pretrained Language Representations</i></center>"
  10 |    ]
  11 |   },
  12 |   {
  13 |    "cell_type": "markdown",
  14 |    "metadata": {},
  15 |    "source": [
  16 |     "Resources :\n",
  17 |     "- Includes Easy Data Augmentation (EDA) package : https://github.com/jasonwei20/eda_nlp/blob/master/code/eda.py introduced after this paper : https://arxiv.org/pdf/1901.11196.pdf\n",
  18 |     "- Neural Structure Learning for sentiment classification (learn graph from embedding) : https://www.tensorflow.org/neural_structured_learning/tutorials/graph_keras_lstm_imdb\n",
  19 |     "- RNN text classification with Tensorflow 2.0 : https://www.tensorflow.org/beta/tutorials/text/text_classification_rnn\n",
  20 |     "- Great paper from August 2019 : https://arxiv.org/pdf/1908.08788.pdf\n"
  21 |    ]
  22 |   },
  23 |   {
  24 |    "cell_type": "code",
  25 |    "execution_count": 1,
  26 |    "metadata": {
  27 |     "ExecuteTime": {
  28 |      "end_time": "2019-09-10T08:28:53.160794Z",
  29 |      "start_time": "2019-09-10T08:28:42.136092Z"
  30 |     }
  31 |    },
  32 |    "outputs": [
  33 |     {
  34 |      "name": "stderr",
  35 |      "output_type": "stream",
  36 |      "text": [
  37 |       "/anaconda3/lib/python3.6/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.21.1) or chardet (2.3.0) doesn't match a supported version!\n",
  38 |       "  RequestsDependencyWarning)\n",
  39 |       "/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
  40 |       "  from ._conv import register_converters as _register_converters\n"
  41 |      ]
  42 |     }
  43 |    ],
  44 |    "source": [
  45 |     "import pandas as pd\n",
  46 |     "import numpy as np\n",
  47 |     "from random import seed\n",
  48 |     "from random import sample\n",
  49 |     "import random\n",
  50 |     "from random import shuffle\n",
  51 |     "import re\n",
  52 |     "\n",
  53 |     "seed(42)\n",
  54 |     "np.random.seed(42)\n",
  55 |     "\n",
  56 |     "from sklearn.model_selection import train_test_split\n",
  57 |     "import matplotlib.pyplot as plt\n",
  58 |     "\n",
  59 |     "import gensim.downloader as api\n",
  60 |     "from gensim.models.keyedvectors import Word2VecKeyedVectors\n",
  61 |     "\n",
  62 |     "from sklearn.decomposition import PCA\n",
  63 |     "from sklearn.metrics import accuracy_score\n",
  64 |     "from scipy import spatial\n",
  65 |     "\n",
  66 |     "from sklearn.neighbors import KNeighborsClassifier\n",
  67 |     "from sklearn.ensemble import RandomForestClassifier\n",
  68 |     "from xgboost import XGBClassifier\n",
  69 |     "\n",
  70 |     "import nltk\n",
  71 |     "from nltk.corpus import stopwords\n",
  72 |     "from nltk.corpus import wordnet \n",
  73 |     "\n",
  74 |     "import tensorflow as tf\n",
  75 |     "import tensorflow_datasets as tfds\n",
  76 |     "\n",
  77 |     "import tensorflow.keras\n",
  78 |     "import tensorflow.keras.preprocessing.text as kpt\n",
  79 |     "from tensorflow.keras.preprocessing.text import Tokenizer\n",
  80 |     "from tensorflow.keras.preprocessing import sequence\n",
  81 |     "\n",
  82 |     "import neural_structured_learning as nsl\n",
  83 |     "import tensorflow_hub as hub\n",
  84 |     "\n",
  85 |     "import torch\n",
  86 |     "import torch.nn as nn\n",
  87 |     "#import torch.legacy.nn as luann\n",
  88 |     "import sys\n",
  89 |     "\n",
  90 |     "from torchtext.data import Field\n",
  91 |     "from torchtext.data import Dataset\n",
  92 |     "import torch\n",
  93 |     "import torch.utils.data\n",
  94 |     "from torch.autograd import Variable\n",
  95 |     "from torchtext.vocab import Vocab\n",
  96 |     "\n",
  97 |     "from collections import Counter\n",
  98 |     "from collections import OrderedDict\n",
  99 |     "\n",
 100 |     "from pytorch_pretrained_bert.tokenization import BertTokenizer\n",
 101 |     "from pytorch_pretrained_bert.modeling import (\n",
 102 |     "        BertModel,\n",
 103 |     "        BertForNextSentencePrediction,\n",
 104 |     "        BertForMaskedLM,\n",
 105 |     "        BertForMultipleChoice,\n",
 106 |     "        BertForPreTraining,\n",
 107 |     "        BertForQuestionAnswering,\n",
 108 |     "        BertForSequenceClassification,\n",
 109 |     "        BertForTokenClassification,\n",
 110 |     "        )"
 111 |    ]
 112 |   },
 113 |   {
 114 |    "cell_type": "code",
 115 |    "execution_count": 2,
 116 |    "metadata": {
 117 |     "ExecuteTime": {
 118 |      "end_time": "2019-09-10T08:28:53.181519Z",
 119 |      "start_time": "2019-09-10T08:28:53.164058Z"
 120 |     }
 121 |    },
 122 |    "outputs": [],
 123 |    "source": [
 124 |     "stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', \n",
 125 |     "'ours', 'ourselves', 'you', 'your', 'yours', \n",
 126 |     "'yourself', 'yourselves', 'he', 'him', 'his', \n",
 127 |     "'himself', 'she', 'her', 'hers', 'herself', \n",
 128 |     "'it', 'its', 'itself', 'they', 'them', 'their', \n",
 129 |     "'theirs', 'themselves', 'what', 'which', 'who', \n",
 130 |     "'whom', 'this', 'that', 'these', 'those', 'am', \n",
 131 |     "'is', 'are', 'was', 'were', 'be', 'been', 'being', \n",
 132 |     "'have', 'has', 'had', 'having', 'do', 'does', 'did',\n",
 133 |     "'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',\n",
 134 |     "'because', 'as', 'until', 'while', 'of', 'at', \n",
 135 |     "'by', 'for', 'with', 'about', 'against', 'between',\n",
 136 |     "'into', 'through', 'during', 'before', 'after', \n",
 137 |     "'above', 'below', 'to', 'from', 'up', 'down', 'in',\n",
 138 |     "'out', 'on', 'off', 'over', 'under', 'again', \n",
 139 |     "'further', 'then', 'once', 'here', 'there', 'when', \n",
 140 |     "'where', 'why', 'how', 'all', 'any', 'both', 'each', \n",
 141 |     "'few', 'more', 'most', 'other', 'some', 'such', 'no', \n",
 142 |     "'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', \n",
 143 |     "'very', 's', 't', 'can', 'will', 'just', 'don', \n",
 144 |     "'should', 'now', '']"
 145 |    ]
 146 |   },
 147 |   {
 148 |    "cell_type": "code",
 149 |    "execution_count": 3,
 150 |    "metadata": {
 151 |     "ExecuteTime": {
 152 |      "end_time": "2019-09-10T08:31:13.664314Z",
 153 |      "start_time": "2019-09-10T08:28:53.184262Z"
 154 |     }
 155 |    },
 156 |    "outputs": [],
 157 |    "source": [
 158 |     "#model2 = api.load('glove-twitter-25')\n",
 159 |     "model2 = api.load('word2vec-google-news-300')"
 160 |    ]
 161 |   },
 162 |   {
 163 |    "cell_type": "markdown",
 164 |    "metadata": {},
 165 |    "source": [
 166 |     "# Load the data"
 167 |    ]
 168 |   },
 169 |   {
 170 |    "cell_type": "markdown",
 171 |    "metadata": {},
 172 |    "source": [
 173 |     "The dataset comes from Stackoverflow Short Text Classification : https://github.com/jacoxu/StackOverflow"
 174 |    ]
 175 |   },
 176 |   {
 177 |    "cell_type": "code",
 178 |    "execution_count": 4,
 179 |    "metadata": {
 180 |     "ExecuteTime": {
 181 |      "end_time": "2019-09-10T08:31:13.736223Z",
 182 |      "start_time": "2019-09-10T08:31:13.667243Z"
 183 |     },
 184 |     "scrolled": true
 185 |    },
 186 |    "outputs": [],
 187 |    "source": [
 188 |     "df = pd.read_csv(\"stack.csv\")"
 189 |    ]
 190 |   },
 191 |   {
 192 |    "cell_type": "code",
 193 |    "execution_count": 5,
 194 |    "metadata": {
 195 |     "ExecuteTime": {
 196 |      "end_time": "2019-09-10T08:31:13.750565Z",
 197 |      "start_time": "2019-09-10T08:31:13.738681Z"
 198 |     }
 199 |    },
 200 |    "outputs": [],
 201 |    "source": [
 202 |     "def get_only_chars(line):\n",
 203 |     "\n",
 204 |     "    clean_line = \"\"\n",
 205 |     "\n",
 206 |     "    line = line.replace(\"’\", \"\")\n",
 207 |     "    line = line.replace(\"'\", \"\")\n",
 208 |     "    line = line.replace(\"-\", \" \") #replace hyphens with spaces\n",
 209 |     "    line = line.replace(\"\\t\", \" \")\n",
 210 |     "    line = line.replace(\"\\n\", \" \")\n",
 211 |     "    line = line.lower()\n",
 212 |     "\n",
 213 |     "    for char in line:\n",
 214 |     "        if char in 'qwertyuiopasdfghjklzxcvbnm ':\n",
 215 |     "            clean_line += char\n",
 216 |     "        else:\n",
 217 |     "            clean_line += ' '\n",
 218 |     "\n",
 219 |     "    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces\n",
 220 |     "    if clean_line[0] == ' ':\n",
 221 |     "        clean_line = clean_line[1:]\n",
 222 |     "    return clean_line"
 223 |    ]
 224 |   },
 225 |   {
 226 |    "cell_type": "code",
 227 |    "execution_count": 6,
 228 |    "metadata": {
 229 |     "ExecuteTime": {
 230 |      "end_time": "2019-09-10T08:31:14.012169Z",
 231 |      "start_time": "2019-09-10T08:31:13.753805Z"
 232 |     }
 233 |    },
 234 |    "outputs": [],
 235 |    "source": [
 236 |     "df['Text'] = df['Text'].apply(lambda x: get_only_chars(x))"
 237 |    ]
 238 |   },
 239 |   {
 240 |    "cell_type": "code",
 241 |    "execution_count": 7,
 242 |    "metadata": {
 243 |     "ExecuteTime": {
 244 |      "end_time": "2019-09-10T08:31:14.039451Z",
 245 |      "start_time": "2019-09-10T08:31:14.015366Z"
 246 |     }
 247 |    },
 248 |    "outputs": [
 249 |     {
 250 |      "data": {
 251 |       "text/html": [
 252 |        "<div>\n",
 253 |        "<style scoped>\n",
 254 |        "    .dataframe tbody tr th:only-of-type {\n",
 255 |        "        vertical-align: middle;\n",
 256 |        "    }\n",
 257 |        "\n",
 258 |        "    .dataframe tbody tr th {\n",
 259 |        "        vertical-align: top;\n",
 260 |        "    }\n",
 261 |        "\n",
 262 |        "    .dataframe thead th {\n",
 263 |        "        text-align: right;\n",
 264 |        "    }\n",
 265 |        "</style>\n",
 266 |        "<table border=\"1\" class=\"dataframe\">\n",
 267 |        "  <thead>\n",
 268 |        "    <tr style=\"text-align: right;\">\n",
 269 |        "      <th></th>\n",
 270 |        "      <th>Text</th>\n",
 271 |        "      <th>Label</th>\n",
 272 |        "    </tr>\n",
 273 |        "  </thead>\n",
 274 |        "  <tbody>\n",
 275 |        "    <tr>\n",
 276 |        "      <th>0</th>\n",
 277 |        "      <td>how do i fill a dataset or a datatable from a ...</td>\n",
 278 |        "      <td>18</td>\n",
 279 |        "    </tr>\n",
 280 |        "    <tr>\n",
 281 |        "      <th>1</th>\n",
 282 |        "      <td>how do you page a collection with linq</td>\n",
 283 |        "      <td>18</td>\n",
 284 |        "    </tr>\n",
 285 |        "    <tr>\n",
 286 |        "      <th>2</th>\n",
 287 |        "      <td>best subversion clients for windows vista bit</td>\n",
 288 |        "      <td>3</td>\n",
 289 |        "    </tr>\n",
 290 |        "    <tr>\n",
 291 |        "      <th>3</th>\n",
 292 |        "      <td>best practice collaborative environment bin di...</td>\n",
 293 |        "      <td>3</td>\n",
 294 |        "    </tr>\n",
 295 |        "    <tr>\n",
 296 |        "      <th>4</th>\n",
 297 |        "      <td>visual studio setup project per user registry ...</td>\n",
 298 |        "      <td>7</td>\n",
 299 |        "    </tr>\n",
 300 |        "  </tbody>\n",
 301 |        "</table>\n",
 302 |        "</div>"
 303 |       ],
 304 |       "text/plain": [
 305 |        "                                                Text  Label\n",
 306 |        "0  how do i fill a dataset or a datatable from a ...     18\n",
 307 |        "1            how do you page a collection with linq      18\n",
 308 |        "2     best subversion clients for windows vista bit       3\n",
 309 |        "3  best practice collaborative environment bin di...      3\n",
 310 |        "4  visual studio setup project per user registry ...      7"
 311 |       ]
 312 |      },
 313 |      "execution_count": 7,
 314 |      "metadata": {},
 315 |      "output_type": "execute_result"
 316 |     }
 317 |    ],
 318 |    "source": [
 319 |     "df.head()"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "markdown",
 324 |    "metadata": {},
 325 |    "source": [
 326 |     "# Prepare the data"
 327 |    ]
 328 |   },
 329 |   {
 330 |    "cell_type": "code",
 331 |    "execution_count": 8,
 332 |    "metadata": {
 333 |     "ExecuteTime": {
 334 |      "end_time": "2019-09-10T08:31:14.046942Z",
 335 |      "start_time": "2019-09-10T08:31:14.042778Z"
 336 |     }
 337 |    },
 338 |    "outputs": [],
 339 |    "source": [
 340 |     "num_classes = 2\n",
 341 |     "sample_size = 50"
 342 |    ]
 343 |   },
 344 |   {
 345 |    "cell_type": "code",
 346 |    "execution_count": 9,
 347 |    "metadata": {
 348 |     "ExecuteTime": {
 349 |      "end_time": "2019-09-10T08:31:14.068203Z",
 350 |      "start_time": "2019-09-10T08:31:14.050592Z"
 351 |     }
 352 |    },
 353 |    "outputs": [],
 354 |    "source": [
 355 |     "# Generate samples that contains K samples of each class\n",
 356 |     "\n",
 357 |     "def gen_sample(sample_size, num_classes):\n",
 358 |     "    \n",
 359 |     "    df_1 = df[(df[\"Label\"]<num_classes + 1)].reset_index().drop([\"index\"], axis=1).reset_index().drop([\"index\"], axis=1)\n",
 360 |     "    train = df_1[df_1[\"Label\"] == np.unique(df_1['Label'])[0]].sample(sample_size)\n",
 361 |     "    \n",
 362 |     "    train_index = train.index.tolist()\n",
 363 |     "    \n",
 364 |     "    for i in range(1,num_classes):\n",
 365 |     "        train_2 = df_1[df_1[\"Label\"] == np.unique(df_1['Label'])[i]].sample(sample_size)\n",
 366 |     "        train = pd.concat([train, train_2], axis=0)\n",
 367 |     "        train_index.extend(train_2.index.tolist())\n",
 368 |     "        \n",
 369 |     "    test = df_1[~df_1.index.isin(train_index)]\n",
 370 |     "\n",
 371 |     "    return train, test"
 372 |    ]
 373 |   },
 374 |   {
 375 |    "cell_type": "code",
 376 |    "execution_count": 10,
 377 |    "metadata": {
 378 |     "ExecuteTime": {
 379 |      "end_time": "2019-09-10T08:31:14.082050Z",
 380 |      "start_time": "2019-09-10T08:31:14.072133Z"
 381 |     }
 382 |    },
 383 |    "outputs": [],
 384 |    "source": [
 385 |     "# Text processing (split, find token id, get embedidng)\n",
 386 |     "\n",
 387 |     "def transform_sentence(text, model):\n",
 388 |     "    \n",
 389 |     "    \"\"\"\n",
 390 |     "    Mean embedding vector\n",
 391 |     "    \"\"\"\n",
 392 |     "    \n",
 393 |     "    def preprocess_text(raw_text, model=model):\n",
 394 |     "        \n",
 395 |     "        \"\"\" \n",
 396 |     "        Eexcluding unknown words and get corresponding token\n",
 397 |     "        \"\"\"\n",
 398 |     "        \n",
 399 |     "        raw_text = raw_text.split()\n",
 400 |     "        \n",
 401 |     "        return list(filter(lambda x: x in model.vocab, raw_text))\n",
 402 |     "    \n",
 403 |     "    tokens = preprocess_text(text)\n",
 404 |     "    \n",
 405 |     "    if not tokens:\n",
 406 |     "        return np.zeros(model.vector_size)\n",
 407 |     "\n",
 408 |     "    text_vector = np.mean(model[tokens], axis=0)\n",
 409 |     "    \n",
 410 |     "    return np.array(text_vector)"
 411 |    ]
 412 |   },
 413 |   {
 414 |    "cell_type": "code",
 415 |    "execution_count": 11,
 416 |    "metadata": {
 417 |     "ExecuteTime": {
 418 |      "end_time": "2019-09-10T08:31:14.125152Z",
 419 |      "start_time": "2019-09-10T08:31:14.085562Z"
 420 |     }
 421 |    },
 422 |    "outputs": [],
 423 |    "source": [
 424 |     "train, test = gen_sample(sample_size, num_classes)"
 425 |    ]
 426 |   },
 427 |   {
 428 |    "cell_type": "code",
 429 |    "execution_count": 12,
 430 |    "metadata": {
 431 |     "ExecuteTime": {
 432 |      "end_time": "2019-09-10T08:31:14.145952Z",
 433 |      "start_time": "2019-09-10T08:31:14.130224Z"
 434 |     },
 435 |     "scrolled": true
 436 |    },
 437 |    "outputs": [
 438 |     {
 439 |      "data": {
 440 |       "text/html": [
 441 |        "<div>\n",
 442 |        "<style scoped>\n",
 443 |        "    .dataframe tbody tr th:only-of-type {\n",
 444 |        "        vertical-align: middle;\n",
 445 |        "    }\n",
 446 |        "\n",
 447 |        "    .dataframe tbody tr th {\n",
 448 |        "        vertical-align: top;\n",
 449 |        "    }\n",
 450 |        "\n",
 451 |        "    .dataframe thead th {\n",
 452 |        "        text-align: right;\n",
 453 |        "    }\n",
 454 |        "</style>\n",
 455 |        "<table border=\"1\" class=\"dataframe\">\n",
 456 |        "  <thead>\n",
 457 |        "    <tr style=\"text-align: right;\">\n",
 458 |        "      <th></th>\n",
 459 |        "      <th>Text</th>\n",
 460 |        "      <th>Label</th>\n",
 461 |        "    </tr>\n",
 462 |        "  </thead>\n",
 463 |        "  <tbody>\n",
 464 |        "    <tr>\n",
 465 |        "      <th>1521</th>\n",
 466 |        "      <td>how to get wordpress page id after looping posts</td>\n",
 467 |        "      <td>1</td>\n",
 468 |        "    </tr>\n",
 469 |        "    <tr>\n",
 470 |        "      <th>1737</th>\n",
 471 |        "      <td>using wp query to pull content from a specific...</td>\n",
 472 |        "      <td>1</td>\n",
 473 |        "    </tr>\n",
 474 |        "    <tr>\n",
 475 |        "      <th>1740</th>\n",
 476 |        "      <td>wordpress how to show just posts on main index</td>\n",
 477 |        "      <td>1</td>\n",
 478 |        "    </tr>\n",
 479 |        "    <tr>\n",
 480 |        "      <th>1660</th>\n",
 481 |        "      <td>wordpress is it possible to make one particula...</td>\n",
 482 |        "      <td>1</td>\n",
 483 |        "    </tr>\n",
 484 |        "    <tr>\n",
 485 |        "      <th>1411</th>\n",
 486 |        "      <td>exclude templates in wordpress page</td>\n",
 487 |        "      <td>1</td>\n",
 488 |        "    </tr>\n",
 489 |        "    <tr>\n",
 490 |        "      <th>1678</th>\n",
 491 |        "      <td>wierd date and time formating in wordpress y m d</td>\n",
 492 |        "      <td>1</td>\n",
 493 |        "    </tr>\n",
 494 |        "    <tr>\n",
 495 |        "      <th>1626</th>\n",
 496 |        "      <td>wordpress custom post type templates</td>\n",
 497 |        "      <td>1</td>\n",
 498 |        "    </tr>\n",
 499 |        "    <tr>\n",
 500 |        "      <th>1513</th>\n",
 501 |        "      <td>inserting wordpress plugin content to posts</td>\n",
 502 |        "      <td>1</td>\n",
 503 |        "    </tr>\n",
 504 |        "    <tr>\n",
 505 |        "      <th>1859</th>\n",
 506 |        "      <td>how can i delay one feed in wordpress but not ...</td>\n",
 507 |        "      <td>1</td>\n",
 508 |        "    </tr>\n",
 509 |        "    <tr>\n",
 510 |        "      <th>1072</th>\n",
 511 |        "      <td>how can i remove jquery from the frontside of ...</td>\n",
 512 |        "      <td>1</td>\n",
 513 |        "    </tr>\n",
 514 |        "    <tr>\n",
 515 |        "      <th>1811</th>\n",
 516 |        "      <td>wordpress calling recent posts widget via scri...</td>\n",
 517 |        "      <td>1</td>\n",
 518 |        "    </tr>\n",
 519 |        "    <tr>\n",
 520 |        "      <th>721</th>\n",
 521 |        "      <td>debuging register activation hook in wordpress</td>\n",
 522 |        "      <td>1</td>\n",
 523 |        "    </tr>\n",
 524 |        "    <tr>\n",
 525 |        "      <th>1636</th>\n",
 526 |        "      <td>testing pluggable function calls clashes for w...</td>\n",
 527 |        "      <td>1</td>\n",
 528 |        "    </tr>\n",
 529 |        "    <tr>\n",
 530 |        "      <th>1973</th>\n",
 531 |        "      <td>would it be quicker to make wordpress theme di...</td>\n",
 532 |        "      <td>1</td>\n",
 533 |        "    </tr>\n",
 534 |        "    <tr>\n",
 535 |        "      <th>1938</th>\n",
 536 |        "      <td>how to find and clean wordpress from script s ...</td>\n",
 537 |        "      <td>1</td>\n",
 538 |        "    </tr>\n",
 539 |        "    <tr>\n",
 540 |        "      <th>1899</th>\n",
 541 |        "      <td>wordpress set post date</td>\n",
 542 |        "      <td>1</td>\n",
 543 |        "    </tr>\n",
 544 |        "    <tr>\n",
 545 |        "      <th>1280</th>\n",
 546 |        "      <td>wordpress add comment like stackoverflow</td>\n",
 547 |        "      <td>1</td>\n",
 548 |        "    </tr>\n",
 549 |        "    <tr>\n",
 550 |        "      <th>1883</th>\n",
 551 |        "      <td>wordpress nav not visible in pages like articl...</td>\n",
 552 |        "      <td>1</td>\n",
 553 |        "    </tr>\n",
 554 |        "    <tr>\n",
 555 |        "      <th>1761</th>\n",
 556 |        "      <td>wordpress development</td>\n",
 557 |        "      <td>1</td>\n",
 558 |        "    </tr>\n",
 559 |        "    <tr>\n",
 560 |        "      <th>1319</th>\n",
 561 |        "      <td>wordpress static pages how to embed content in...</td>\n",
 562 |        "      <td>1</td>\n",
 563 |        "    </tr>\n",
 564 |        "    <tr>\n",
 565 |        "      <th>1549</th>\n",
 566 |        "      <td>filtering search results with wordpress</td>\n",
 567 |        "      <td>1</td>\n",
 568 |        "    </tr>\n",
 569 |        "    <tr>\n",
 570 |        "      <th>1174</th>\n",
 571 |        "      <td>drop down js menu blinking in ie</td>\n",
 572 |        "      <td>1</td>\n",
 573 |        "    </tr>\n",
 574 |        "    <tr>\n",
 575 |        "      <th>1371</th>\n",
 576 |        "      <td>wrap stray text in p tags</td>\n",
 577 |        "      <td>1</td>\n",
 578 |        "    </tr>\n",
 579 |        "    <tr>\n",
 580 |        "      <th>1527</th>\n",
 581 |        "      <td>wordpress menu of categories</td>\n",
 582 |        "      <td>1</td>\n",
 583 |        "    </tr>\n",
 584 |        "    <tr>\n",
 585 |        "      <th>1210</th>\n",
 586 |        "      <td>wordpress menu with superslide show</td>\n",
 587 |        "      <td>1</td>\n",
 588 |        "    </tr>\n",
 589 |        "    <tr>\n",
 590 |        "      <th>1235</th>\n",
 591 |        "      <td>getting post information outside the wordpress...</td>\n",
 592 |        "      <td>1</td>\n",
 593 |        "    </tr>\n",
 594 |        "    <tr>\n",
 595 |        "      <th>872</th>\n",
 596 |        "      <td>why cant i include a blog</td>\n",
 597 |        "      <td>1</td>\n",
 598 |        "    </tr>\n",
 599 |        "    <tr>\n",
 600 |        "      <th>1986</th>\n",
 601 |        "      <td>d slideshow for wordpress and url file access ...</td>\n",
 602 |        "      <td>1</td>\n",
 603 |        "    </tr>\n",
 604 |        "    <tr>\n",
 605 |        "      <th>1902</th>\n",
 606 |        "      <td>submit wordpress form programmatically</td>\n",
 607 |        "      <td>1</td>\n",
 608 |        "    </tr>\n",
 609 |        "    <tr>\n",
 610 |        "      <th>1947</th>\n",
 611 |        "      <td>move wordpress from home web server to web ser...</td>\n",
 612 |        "      <td>1</td>\n",
 613 |        "    </tr>\n",
 614 |        "    <tr>\n",
 615 |        "      <th>...</th>\n",
 616 |        "      <td>...</td>\n",
 617 |        "      <td>...</td>\n",
 618 |        "    </tr>\n",
 619 |        "    <tr>\n",
 620 |        "      <th>99</th>\n",
 621 |        "      <td>how to use sqlab xpert tuning to tune sql for ...</td>\n",
 622 |        "      <td>2</td>\n",
 623 |        "    </tr>\n",
 624 |        "    <tr>\n",
 625 |        "      <th>54</th>\n",
 626 |        "      <td>oracle connection problem on mac osx status fa...</td>\n",
 627 |        "      <td>2</td>\n",
 628 |        "    </tr>\n",
 629 |        "    <tr>\n",
 630 |        "      <th>923</th>\n",
 631 |        "      <td>xaconnection performance in oracle g</td>\n",
 632 |        "      <td>2</td>\n",
 633 |        "    </tr>\n",
 634 |        "    <tr>\n",
 635 |        "      <th>978</th>\n",
 636 |        "      <td>what is the pl sql api difference between orac...</td>\n",
 637 |        "      <td>2</td>\n",
 638 |        "    </tr>\n",
 639 |        "    <tr>\n",
 640 |        "      <th>504</th>\n",
 641 |        "      <td>how do i call an oracle function from oci</td>\n",
 642 |        "      <td>2</td>\n",
 643 |        "    </tr>\n",
 644 |        "    <tr>\n",
 645 |        "      <th>63</th>\n",
 646 |        "      <td>explain plan cost vs execution time</td>\n",
 647 |        "      <td>2</td>\n",
 648 |        "    </tr>\n",
 649 |        "    <tr>\n",
 650 |        "      <th>74</th>\n",
 651 |        "      <td>user interface for creating oracle sql loader ...</td>\n",
 652 |        "      <td>2</td>\n",
 653 |        "    </tr>\n",
 654 |        "    <tr>\n",
 655 |        "      <th>518</th>\n",
 656 |        "      <td>is it possible to refer to column names via bi...</td>\n",
 657 |        "      <td>2</td>\n",
 658 |        "    </tr>\n",
 659 |        "    <tr>\n",
 660 |        "      <th>1117</th>\n",
 661 |        "      <td>can i run an arbitrary oracle sql script throu...</td>\n",
 662 |        "      <td>2</td>\n",
 663 |        "    </tr>\n",
 664 |        "    <tr>\n",
 665 |        "      <th>463</th>\n",
 666 |        "      <td>oracle logon protocol o logon in g</td>\n",
 667 |        "      <td>2</td>\n",
 668 |        "    </tr>\n",
 669 |        "    <tr>\n",
 670 |        "      <th>255</th>\n",
 671 |        "      <td>databases oracle</td>\n",
 672 |        "      <td>2</td>\n",
 673 |        "    </tr>\n",
 674 |        "    <tr>\n",
 675 |        "      <th>968</th>\n",
 676 |        "      <td>resultset logic when selecting tables without ...</td>\n",
 677 |        "      <td>2</td>\n",
 678 |        "    </tr>\n",
 679 |        "    <tr>\n",
 680 |        "      <th>537</th>\n",
 681 |        "      <td>oracle stored procedures sys refcursor and nhi...</td>\n",
 682 |        "      <td>2</td>\n",
 683 |        "    </tr>\n",
 684 |        "    <tr>\n",
 685 |        "      <th>235</th>\n",
 686 |        "      <td>oracle express edition can not connect remotel...</td>\n",
 687 |        "      <td>2</td>\n",
 688 |        "    </tr>\n",
 689 |        "    <tr>\n",
 690 |        "      <th>1051</th>\n",
 691 |        "      <td>performance of remote materialized views in or...</td>\n",
 692 |        "      <td>2</td>\n",
 693 |        "    </tr>\n",
 694 |        "    <tr>\n",
 695 |        "      <th>1065</th>\n",
 696 |        "      <td>merge output cursor of sp into table</td>\n",
 697 |        "      <td>2</td>\n",
 698 |        "    </tr>\n",
 699 |        "    <tr>\n",
 700 |        "      <th>277</th>\n",
 701 |        "      <td>oracle logon trigger not being fired</td>\n",
 702 |        "      <td>2</td>\n",
 703 |        "    </tr>\n",
 704 |        "    <tr>\n",
 705 |        "      <th>453</th>\n",
 706 |        "      <td>understanding lob segments sys lob in oracle</td>\n",
 707 |        "      <td>2</td>\n",
 708 |        "    </tr>\n",
 709 |        "    <tr>\n",
 710 |        "      <th>838</th>\n",
 711 |        "      <td>oracle database character set issue with the a...</td>\n",
 712 |        "      <td>2</td>\n",
 713 |        "    </tr>\n",
 714 |        "    <tr>\n",
 715 |        "      <th>709</th>\n",
 716 |        "      <td>finding the days of the week within a date ran...</td>\n",
 717 |        "      <td>2</td>\n",
 718 |        "    </tr>\n",
 719 |        "    <tr>\n",
 720 |        "      <th>591</th>\n",
 721 |        "      <td>how to convert sql server to oracle</td>\n",
 722 |        "      <td>2</td>\n",
 723 |        "    </tr>\n",
 724 |        "    <tr>\n",
 725 |        "      <th>48</th>\n",
 726 |        "      <td>how can i avoid ta warning fom an unused param...</td>\n",
 727 |        "      <td>2</td>\n",
 728 |        "    </tr>\n",
 729 |        "    <tr>\n",
 730 |        "      <th>786</th>\n",
 731 |        "      <td>oracle hierarchical query how to include top l...</td>\n",
 732 |        "      <td>2</td>\n",
 733 |        "    </tr>\n",
 734 |        "    <tr>\n",
 735 |        "      <th>318</th>\n",
 736 |        "      <td>oracle record history using as of timestamp wi...</td>\n",
 737 |        "      <td>2</td>\n",
 738 |        "    </tr>\n",
 739 |        "    <tr>\n",
 740 |        "      <th>515</th>\n",
 741 |        "      <td>how to call a function with rowtype parameter ...</td>\n",
 742 |        "      <td>2</td>\n",
 743 |        "    </tr>\n",
 744 |        "    <tr>\n",
 745 |        "      <th>33</th>\n",
 746 |        "      <td>ssis oracle parameter mapping</td>\n",
 747 |        "      <td>2</td>\n",
 748 |        "    </tr>\n",
 749 |        "    <tr>\n",
 750 |        "      <th>434</th>\n",
 751 |        "      <td>multi line pl sql command with net oraclecommand</td>\n",
 752 |        "      <td>2</td>\n",
 753 |        "    </tr>\n",
 754 |        "    <tr>\n",
 755 |        "      <th>577</th>\n",
 756 |        "      <td>read write data from to a file in pl sql witho...</td>\n",
 757 |        "      <td>2</td>\n",
 758 |        "    </tr>\n",
 759 |        "    <tr>\n",
 760 |        "      <th>327</th>\n",
 761 |        "      <td>oracle sql parsing a name string and convertin...</td>\n",
 762 |        "      <td>2</td>\n",
 763 |        "    </tr>\n",
 764 |        "    <tr>\n",
 765 |        "      <th>830</th>\n",
 766 |        "      <td>oracle optimizing query involving date calcula...</td>\n",
 767 |        "      <td>2</td>\n",
 768 |        "    </tr>\n",
 769 |        "  </tbody>\n",
 770 |        "</table>\n",
 771 |        "<p>100 rows × 2 columns</p>\n",
 772 |        "</div>"
 773 |       ],
 774 |       "text/plain": [
 775 |        "                                                   Text  Label\n",
 776 |        "1521  how to get wordpress page id after looping posts       1\n",
 777 |        "1737  using wp query to pull content from a specific...      1\n",
 778 |        "1740     wordpress how to show just posts on main index      1\n",
 779 |        "1660  wordpress is it possible to make one particula...      1\n",
 780 |        "1411                exclude templates in wordpress page      1\n",
 781 |        "1678   wierd date and time formating in wordpress y m d      1\n",
 782 |        "1626               wordpress custom post type templates      1\n",
 783 |        "1513        inserting wordpress plugin content to posts      1\n",
 784 |        "1859  how can i delay one feed in wordpress but not ...      1\n",
 785 |        "1072  how can i remove jquery from the frontside of ...      1\n",
 786 |        "1811  wordpress calling recent posts widget via scri...      1\n",
 787 |        "721      debuging register activation hook in wordpress      1\n",
 788 |        "1636  testing pluggable function calls clashes for w...      1\n",
 789 |        "1973  would it be quicker to make wordpress theme di...      1\n",
 790 |        "1938  how to find and clean wordpress from script s ...      1\n",
 791 |        "1899                            wordpress set post date      1\n",
 792 |        "1280           wordpress add comment like stackoverflow      1\n",
 793 |        "1883  wordpress nav not visible in pages like articl...      1\n",
 794 |        "1761                             wordpress development       1\n",
 795 |        "1319  wordpress static pages how to embed content in...      1\n",
 796 |        "1549            filtering search results with wordpress      1\n",
 797 |        "1174                  drop down js menu blinking in ie       1\n",
 798 |        "1371                          wrap stray text in p tags      1\n",
 799 |        "1527                       wordpress menu of categories      1\n",
 800 |        "1210                wordpress menu with superslide show      1\n",
 801 |        "1235  getting post information outside the wordpress...      1\n",
 802 |        "872                          why cant i include a blog       1\n",
 803 |        "1986  d slideshow for wordpress and url file access ...      1\n",
 804 |        "1902             submit wordpress form programmatically      1\n",
 805 |        "1947  move wordpress from home web server to web ser...      1\n",
 806 |        "...                                                 ...    ...\n",
 807 |        "99    how to use sqlab xpert tuning to tune sql for ...      2\n",
 808 |        "54    oracle connection problem on mac osx status fa...      2\n",
 809 |        "923               xaconnection performance in oracle g       2\n",
 810 |        "978   what is the pl sql api difference between orac...      2\n",
 811 |        "504          how do i call an oracle function from oci       2\n",
 812 |        "63                  explain plan cost vs execution time      2\n",
 813 |        "74    user interface for creating oracle sql loader ...      2\n",
 814 |        "518   is it possible to refer to column names via bi...      2\n",
 815 |        "1117  can i run an arbitrary oracle sql script throu...      2\n",
 816 |        "463                  oracle logon protocol o logon in g      2\n",
 817 |        "255                                    databases oracle      2\n",
 818 |        "968   resultset logic when selecting tables without ...      2\n",
 819 |        "537   oracle stored procedures sys refcursor and nhi...      2\n",
 820 |        "235   oracle express edition can not connect remotel...      2\n",
 821 |        "1051  performance of remote materialized views in or...      2\n",
 822 |        "1065              merge output cursor of sp into table       2\n",
 823 |        "277               oracle logon trigger not being fired       2\n",
 824 |        "453       understanding lob segments sys lob in oracle       2\n",
 825 |        "838   oracle database character set issue with the a...      2\n",
 826 |        "709   finding the days of the week within a date ran...      2\n",
 827 |        "591                how to convert sql server to oracle       2\n",
 828 |        "48    how can i avoid ta warning fom an unused param...      2\n",
 829 |        "786   oracle hierarchical query how to include top l...      2\n",
 830 |        "318   oracle record history using as of timestamp wi...      2\n",
 831 |        "515   how to call a function with rowtype parameter ...      2\n",
 832 |        "33                        ssis oracle parameter mapping      2\n",
 833 |        "434    multi line pl sql command with net oraclecommand      2\n",
 834 |        "577   read write data from to a file in pl sql witho...      2\n",
 835 |        "327   oracle sql parsing a name string and convertin...      2\n",
 836 |        "830   oracle optimizing query involving date calcula...      2\n",
 837 |        "\n",
 838 |        "[100 rows x 2 columns]"
 839 |       ]
 840 |      },
 841 |      "execution_count": 12,
 842 |      "metadata": {},
 843 |      "output_type": "execute_result"
 844 |     }
 845 |    ],
 846 |    "source": [
 847 |     "train"
 848 |    ]
 849 |   },
 850 |   {
 851 |    "cell_type": "code",
 852 |    "execution_count": 13,
 853 |    "metadata": {
 854 |     "ExecuteTime": {
 855 |      "end_time": "2019-09-10T08:31:14.152530Z",
 856 |      "start_time": "2019-09-10T08:31:14.147953Z"
 857 |     }
 858 |    },
 859 |    "outputs": [
 860 |     {
 861 |      "data": {
 862 |       "text/plain": [
 863 |        "(100, 2)"
 864 |       ]
 865 |      },
 866 |      "execution_count": 13,
 867 |      "metadata": {},
 868 |      "output_type": "execute_result"
 869 |     }
 870 |    ],
 871 |    "source": [
 872 |     "train.shape"
 873 |    ]
 874 |   },
 875 |   {
 876 |    "cell_type": "code",
 877 |    "execution_count": 14,
 878 |    "metadata": {
 879 |     "ExecuteTime": {
 880 |      "end_time": "2019-09-10T08:31:14.621358Z",
 881 |      "start_time": "2019-09-10T08:31:14.155615Z"
 882 |     }
 883 |    },
 884 |    "outputs": [],
 885 |    "source": [
 886 |     "X_train = train['Text']\n",
 887 |     "y_train = train['Label'].values\n",
 888 |     "X_test = test['Text']\n",
 889 |     "y_test = test['Label'].values\n",
 890 |     "\n",
 891 |     "X_train_mean = X_train.apply(lambda x : transform_sentence(x, model2))\n",
 892 |     "X_test_mean = X_test.apply(lambda x : transform_sentence(x, model2))\n",
 893 |     "\n",
 894 |     "X_train_mean = pd.DataFrame(X_train_mean)['Text'].apply(pd.Series)\n",
 895 |     "X_test_mean = pd.DataFrame(X_test_mean)['Text'].apply(pd.Series)"
 896 |    ]
 897 |   },
 898 |   {
 899 |    "cell_type": "markdown",
 900 |    "metadata": {},
 901 |    "source": [
 902 |     "# Data Augmentation"
 903 |    ]
 904 |   },
 905 |   {
 906 |    "cell_type": "markdown",
 907 |    "metadata": {},
 908 |    "source": [
 909 |     "## Replace words"
 910 |    ]
 911 |   },
 912 |   {
 913 |    "cell_type": "code",
 914 |    "execution_count": 90,
 915 |    "metadata": {
 916 |     "ExecuteTime": {
 917 |      "end_time": "2019-09-09T14:58:59.333032Z",
 918 |      "start_time": "2019-09-09T14:58:59.298243Z"
 919 |     }
 920 |    },
 921 |    "outputs": [],
 922 |    "source": [
 923 |     "def get_synonyms(word):\n",
 924 |     "    \n",
 925 |     "    synonyms = set()\n",
 926 |     "    \n",
 927 |     "    for syn in wordnet.synsets(word): \n",
 928 |     "        for l in syn.lemmas(): \n",
 929 |     "            synonym = l.name().replace(\"_\", \" \").replace(\"-\", \" \").lower()\n",
 930 |     "            synonym = \"\".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])\n",
 931 |     "            synonyms.add(synonym) \n",
 932 |     "    \n",
 933 |     "    if word in synonyms:\n",
 934 |     "        synonyms.remove(word)\n",
 935 |     "    \n",
 936 |     "    return list(synonyms)\n",
 937 |     "\n",
 938 |     "def synonym_replacement(words, n):\n",
 939 |     "    \n",
 940 |     "    words = words.split()\n",
 941 |     "    \n",
 942 |     "    new_words = words.copy()\n",
 943 |     "    random_word_list = list(set([word for word in words if word not in stop_words]))\n",
 944 |     "    random.shuffle(random_word_list)\n",
 945 |     "    num_replaced = 0\n",
 946 |     "    \n",
 947 |     "    for random_word in random_word_list:\n",
 948 |     "        synonyms = get_synonyms(random_word)\n",
 949 |     "        \n",
 950 |     "        if len(synonyms) >= 1:\n",
 951 |     "            synonym = random.choice(list(synonyms))\n",
 952 |     "            new_words = [synonym if word == random_word else word for word in new_words]\n",
 953 |     "            #print(\"replaced\", random_word, \"with\", synonym)\n",
 954 |     "            num_replaced += 1\n",
 955 |     "        \n",
 956 |     "        if num_replaced >= n: #only replace up to n words\n",
 957 |     "            break\n",
 958 |     "\n",
 959 |     "    sentence = ' '.join(new_words)\n",
 960 |     "\n",
 961 |     "    return sentence\n",
 962 |     "\n",
 963 |     "def iterative_replace(df):\n",
 964 |     "    \n",
 965 |     "    df = df.reset_index().drop(['index'], axis=1)\n",
 966 |     "    index_row = df.index\n",
 967 |     "    df_2 = pd.DataFrame()\n",
 968 |     "    \n",
 969 |     "    for row in index_row:\n",
 970 |     "        for k in range(1,6):\n",
 971 |     "            df_2 = df_2.append({'Text':synonym_replacement(df.loc[row]['Text'], k), 'Label':df.loc[row]['Label']}, ignore_index=True)\n",
 972 |     "    return df_2"
 973 |    ]
 974 |   },
 975 |   {
 976 |    "cell_type": "markdown",
 977 |    "metadata": {},
 978 |    "source": [
 979 |     "## Delete words"
 980 |    ]
 981 |   },
 982 |   {
 983 |    "cell_type": "code",
 984 |    "execution_count": 91,
 985 |    "metadata": {
 986 |     "ExecuteTime": {
 987 |      "end_time": "2019-09-09T14:58:59.695868Z",
 988 |      "start_time": "2019-09-09T14:58:59.675230Z"
 989 |     }
 990 |    },
 991 |    "outputs": [],
 992 |    "source": [
 993 |     "def random_deletion(words, p):\n",
 994 |     "\n",
 995 |     "    words = words.split()\n",
 996 |     "    \n",
 997 |     "    #obviously, if there's only one word, don't delete it\n",
 998 |     "    if len(words) == 1:\n",
 999 |     "        return words\n",
1000 |     "\n",
1001 |     "    #randomly delete words with probability p\n",
1002 |     "    new_words = []\n",
1003 |     "    for word in words:\n",
1004 |     "        r = random.uniform(0, 1)\n",
1005 |     "        if r > p:\n",
1006 |     "            new_words.append(word)\n",
1007 |     "\n",
1008 |     "    #if you end up deleting all words, just return a random word\n",
1009 |     "    if len(new_words) == 0:\n",
1010 |     "        rand_int = random.randint(0, len(words)-1)\n",
1011 |     "        return [words[rand_int]]\n",
1012 |     "\n",
1013 |     "    sentence = ' '.join(new_words)\n",
1014 |     "    \n",
1015 |     "    return sentence\n",
1016 |     "\n",
1017 |     "def iterative_delete(df):\n",
1018 |     "    \n",
1019 |     "    df = df.reset_index().drop(['index'], axis=1)\n",
1020 |     "    index_row = df.index\n",
1021 |     "    df_2 = pd.DataFrame()\n",
1022 |     "    \n",
1023 |     "    for row in index_row:\n",
1024 |     "        df_2 = df_2.append({'Text':random_deletion(df.loc[row]['Text'], 0.25), 'Label':df.loc[row]['Label']}, ignore_index=True)\n",
1025 |     "    return df_2"
1026 |    ]
1027 |   },
1028 |   {
1029 |    "cell_type": "markdown",
1030 |    "metadata": {},
1031 |    "source": [
1032 |     "## Random Swap"
1033 |    ]
1034 |   },
1035 |   {
1036 |    "cell_type": "code",
1037 |    "execution_count": 92,
1038 |    "metadata": {
1039 |     "ExecuteTime": {
1040 |      "end_time": "2019-09-09T14:59:00.168295Z",
1041 |      "start_time": "2019-09-09T14:59:00.144499Z"
1042 |     }
1043 |    },
1044 |    "outputs": [],
1045 |    "source": [
1046 |     "def random_swap(words, n):\n",
1047 |     "    \n",
1048 |     "    words = words.split()\n",
1049 |     "    new_words = words.copy()\n",
1050 |     "    \n",
1051 |     "    for _ in range(n):\n",
1052 |     "        new_words = swap_word(new_words)\n",
1053 |     "        \n",
1054 |     "    sentence = ' '.join(new_words)\n",
1055 |     "    \n",
1056 |     "    return sentence\n",
1057 |     "\n",
1058 |     "def swap_word(new_words):\n",
1059 |     "    \n",
1060 |     "    random_idx_1 = random.randint(0, len(new_words)-1)\n",
1061 |     "    random_idx_2 = random_idx_1\n",
1062 |     "    counter = 0\n",
1063 |     "    \n",
1064 |     "    while random_idx_2 == random_idx_1:\n",
1065 |     "        random_idx_2 = random.randint(0, len(new_words)-1)\n",
1066 |     "        counter += 1\n",
1067 |     "        \n",
1068 |     "        if counter > 3:\n",
1069 |     "            return new_words\n",
1070 |     "    \n",
1071 |     "    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] \n",
1072 |     "    return new_words\n",
1073 |     "\n",
1074 |     "def iterative_swap(df):\n",
1075 |     "    \n",
1076 |     "    df = df.reset_index().drop(['index'], axis=1)\n",
1077 |     "    index_row = df.index\n",
1078 |     "    df_2 = pd.DataFrame()\n",
1079 |     "    for row in index_row:\n",
1080 |     "        df_2 = df_2.append({'Text':random_swap(df.loc[row]['Text'], 2), 'Label':df.loc[row]['Label']}, ignore_index=True)\n",
1081 |     "    return df_2"
1082 |    ]
1083 |   },
1084 |   {
1085 |    "cell_type": "markdown",
1086 |    "metadata": {},
1087 |    "source": [
1088 |     "## Random Insertion"
1089 |    ]
1090 |   },
1091 |   {
1092 |    "cell_type": "code",
1093 |    "execution_count": 93,
1094 |    "metadata": {
1095 |     "ExecuteTime": {
1096 |      "end_time": "2019-09-09T14:59:01.181955Z",
1097 |      "start_time": "2019-09-09T14:59:01.152899Z"
1098 |     }
1099 |    },
1100 |    "outputs": [],
1101 |    "source": [
1102 |     "def random_insertion(words, n):\n",
1103 |     "    \n",
1104 |     "    words = words.split()\n",
1105 |     "    new_words = words.copy()\n",
1106 |     "    \n",
1107 |     "    for _ in range(n):\n",
1108 |     "        add_word(new_words)\n",
1109 |     "        \n",
1110 |     "    sentence = ' '.join(new_words)\n",
1111 |     "    return sentence\n",
1112 |     "\n",
1113 |     "def add_word(new_words):\n",
1114 |     "    \n",
1115 |     "    synonyms = []\n",
1116 |     "    counter = 0\n",
1117 |     "    \n",
1118 |     "    while len(synonyms) < 1:\n",
1119 |     "        random_word = new_words[random.randint(0, len(new_words)-1)]\n",
1120 |     "        synonyms = get_synonyms(random_word)\n",
1121 |     "        counter += 1\n",
1122 |     "        if counter >= 10:\n",
1123 |     "            return\n",
1124 |     "        \n",
1125 |     "    random_synonym = synonyms[0]\n",
1126 |     "    random_idx = random.randint(0, len(new_words)-1)\n",
1127 |     "    new_words.insert(random_idx, random_synonym)\n",
1128 |     "    \n",
1129 |     "def iterative_insert(df):\n",
1130 |     "    \n",
1131 |     "    df = df.reset_index().drop(['index'], axis=1)\n",
1132 |     "    index_row = df.index\n",
1133 |     "    df_2 = pd.DataFrame()\n",
1134 |     "    \n",
1135 |     "    for row in index_row:\n",
1136 |     "        df_2 = df_2.append({'Text':random_insertion(df.loc[row]['Text'], 2), 'Label':df.loc[row]['Label']}, ignore_index=True)\n",
1137 |     "        \n",
1138 |     "    return df_2"
1139 |    ]
1140 |   },
1141 |   {
1142 |    "cell_type": "markdown",
1143 |    "metadata": {},
1144 |    "source": [
1145 |     "## Data Augmentation"
1146 |    ]
1147 |   },
1148 |   {
1149 |    "cell_type": "code",
1150 |    "execution_count": 94,
1151 |    "metadata": {
1152 |     "ExecuteTime": {
1153 |      "end_time": "2019-09-09T14:59:04.924959Z",
1154 |      "start_time": "2019-09-09T14:59:03.141470Z"
1155 |     }
1156 |    },
1157 |    "outputs": [
1158 |     {
1159 |      "name": "stderr",
1160 |      "output_type": "stream",
1161 |      "text": [
1162 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
1163 |       "of pandas will change to not sort by default.\n",
1164 |       "\n",
1165 |       "To accept the future behavior, pass 'sort=False'.\n",
1166 |       "\n",
1167 |       "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
1168 |       "\n",
1169 |       "  \n"
1170 |      ]
1171 |     }
1172 |    ],
1173 |    "source": [
1174 |     "df_replace = iterative_replace(train)\n",
1175 |     "df_delete = iterative_delete(train)\n",
1176 |     "df_swap = iterative_swap(train)\n",
1177 |     "df_insert = iterative_insert(train)\n",
1178 |     "\n",
1179 |     "train = pd.concat([train, df_replace, df_delete, df_swap, df_insert], axis=0).reset_index().drop(['index'], axis=1)"
1180 |    ]
1181 |   },
1182 |   {
1183 |    "cell_type": "code",
1184 |    "execution_count": 95,
1185 |    "metadata": {
1186 |     "ExecuteTime": {
1187 |      "end_time": "2019-09-09T14:59:06.350551Z",
1188 |      "start_time": "2019-09-09T14:59:05.829293Z"
1189 |     }
1190 |    },
1191 |    "outputs": [],
1192 |    "source": [
1193 |     "X_train = train['Text']\n",
1194 |     "y_train = train['Label'].values\n",
1195 |     "X_test = test['Text']\n",
1196 |     "y_test = test['Label'].values\n",
1197 |     "\n",
1198 |     "X_train_mean = X_train.apply(lambda x : transform_sentence(x, model2))\n",
1199 |     "X_test_mean = X_test.apply(lambda x : transform_sentence(x, model2))\n",
1200 |     "\n",
1201 |     "X_train_mean = pd.DataFrame(X_train_mean)['Text'].apply(pd.Series)\n",
1202 |     "X_test_mean = pd.DataFrame(X_test_mean)['Text'].apply(pd.Series)"
1203 |    ]
1204 |   },
1205 |   {
1206 |    "cell_type": "markdown",
1207 |    "metadata": {},
1208 |    "source": [
1209 |     "# Utilities in PyTorch "
1210 |    ]
1211 |   },
1212 |   {
1213 |    "cell_type": "markdown",
1214 |    "metadata": {
1215 |     "ExecuteTime": {
1216 |      "end_time": "2019-09-10T08:38:21.419881Z",
1217 |      "start_time": "2019-09-10T08:38:21.415733Z"
1218 |     }
1219 |    },
1220 |    "source": [
1221 |     "## CNN Model"
1222 |    ]
1223 |   },
1224 |   {
1225 |    "cell_type": "code",
1226 |    "execution_count": 16,
1227 |    "metadata": {
1228 |     "ExecuteTime": {
1229 |      "end_time": "2019-09-10T08:34:17.541159Z",
1230 |      "start_time": "2019-09-10T08:34:17.426574Z"
1231 |     }
1232 |    },
1233 |    "outputs": [],
1234 |    "source": [
1235 |     "class MaxPool(nn.Module):\n",
1236 |     "    def __init__(self, dim=1):\n",
1237 |     "        super(MaxPool, self).__init__()\n",
1238 |     "        self.dim = dim\n",
1239 |     "    \n",
1240 |     "    def forward(self, input):\n",
1241 |     "        return torch.max(input, self.dim)[0]\n",
1242 |     "\n",
1243 |     "    def __repr__(self):\n",
1244 |     "        return self.__class__.__name__ +'('+ 'dim=' + str(self.dim) + ')'\n",
1245 |     "\n",
1246 |     "class View(nn.Module):\n",
1247 |     "    def __init__(self, *sizes):\n",
1248 |     "        super(View, self).__init__()\n",
1249 |     "        self.sizes_list = sizes\n",
1250 |     "\n",
1251 |     "    def forward(self, input):\n",
1252 |     "        return input.view(*self.sizes_list)\n",
1253 |     "\n",
1254 |     "    def __repr__(self):\n",
1255 |     "        return self.__class__.__name__ + ' (' \\\n",
1256 |     "               + 'sizes=' + str(self.sizes_list) + ')'\n",
1257 |     "\n",
1258 |     "class Transpose(nn.Module):\n",
1259 |     "    def __init__(self, dim1=0, dim2=1):\n",
1260 |     "        super(Transpose, self).__init__()\n",
1261 |     "        self.dim1 = dim1\n",
1262 |     "        self.dim2 = dim2\n",
1263 |     "\n",
1264 |     "    def forward(self, input):\n",
1265 |     "        return input.transpose(self.dim1, self.dim2).contiguous()\n",
1266 |     "\n",
1267 |     "    def __repr__(self):\n",
1268 |     "        return self.__class__.__name__ + ' (' \\\n",
1269 |     "               + 'between=' + str(self.dim1) + ',' + str(self.dim2) + ')'\n",
1270 |     "\n",
1271 |     "class CNNModel(nn.Module):\n",
1272 |     "    def __init__(self, vocab_size, num_labels, emb_size, w_hid_size, h_hid_size, win, batch_size,with_proj=False):\n",
1273 |     "        super(CNNModel, self).__init__()\n",
1274 |     "\n",
1275 |     "        self.model = nn.Sequential()\n",
1276 |     "        self.model.add_module('transpose', Transpose())\n",
1277 |     "        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)\n",
1278 |     "        self.model.add_module('emb', self.embed)\n",
1279 |     "        if with_proj:\n",
1280 |     "            self.model.add_module('view1', View(-1, emb_size))\n",
1281 |     "            self.model.add_module('linear1', nn.Linear(emb_size, w_hid_size))\n",
1282 |     "            self.model.add_module('relu1', nn.ReLU())\n",
1283 |     "        else:\n",
1284 |     "            w_hid_size = emb_size\n",
1285 |     "\n",
1286 |     "        self.model.add_module('trans2', Transpose(1, 2))\n",
1287 |     "\n",
1288 |     "        conv_nn = nn.Conv1d(w_hid_size, h_hid_size, win, padding=1)\n",
1289 |     "        self.model.add_module('conv', conv_nn)\n",
1290 |     "        self.model.add_module('relu2', nn.ReLU())\n",
1291 |     "\n",
1292 |     "        self.model.add_module('max', MaxPool(2))\n",
1293 |     "\n",
1294 |     "        self.model.add_module('view4', View(-1, h_hid_size))\n",
1295 |     "        self.model.add_module('linear2', nn.Linear(h_hid_size, num_labels))\n",
1296 |     "        self.model.add_module('softmax', nn.LogSoftmax())\n",
1297 |     "\n",
1298 |     "\n",
1299 |     "    def forward(self, x):\n",
1300 |     "\n",
1301 |     "        output = self.model.forward(x)\n",
1302 |     "\n",
1303 |     "        return output"
1304 |    ]
1305 |   },
1306 |   {
1307 |    "cell_type": "markdown",
1308 |    "metadata": {
1309 |     "ExecuteTime": {
1310 |      "end_time": "2019-09-10T08:38:21.419881Z",
1311 |      "start_time": "2019-09-10T08:38:21.415733Z"
1312 |     }
1313 |    },
1314 |    "source": [
1315 |     "## Load BERT"
1316 |    ]
1317 |   },
1318 |   {
1319 |    "cell_type": "code",
1320 |    "execution_count": 17,
1321 |    "metadata": {
1322 |     "ExecuteTime": {
1323 |      "end_time": "2019-09-10T08:36:59.732744Z",
1324 |      "start_time": "2019-09-10T08:36:59.624085Z"
1325 |     }
1326 |    },
1327 |    "outputs": [
1328 |     {
1329 |      "name": "stdout",
1330 |      "output_type": "stream",
1331 |      "text": [
1332 |       "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n"
1333 |      ]
1334 |     }
1335 |    ],
1336 |    "source": [
1337 |     "dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']\n",
1338 |     "\n",
1339 |     "def bertTokenizer(*args, **kwargs):\n",
1340 |     "    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)\n",
1341 |     "    return tokenizer\n",
1342 |     "\n",
1343 |     "def bertModel(*args, **kwargs):\n",
1344 |     "    model = BertModel.from_pretrained(*args, **kwargs)\n",
1345 |     "    return model\n",
1346 |     "\n",
1347 |     "def bertForNextSentencePrediction(*args, **kwargs):\n",
1348 |     "    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)\n",
1349 |     "    return model\n",
1350 |     "\n",
1351 |     "def bertForPreTraining(*args, **kwargs):\n",
1352 |     "    model = BertForPreTraining.from_pretrained(*args, **kwargs)\n",
1353 |     "    return model\n",
1354 |     "\n",
1355 |     "def bertForMaskedLM(*args, **kwargs):\n",
1356 |     "    model = BertForMaskedLM.from_pretrained(*args, **kwargs)\n",
1357 |     "    return model\n",
1358 |     "\n",
1359 |     "def bertForSequenceClassification(*args, **kwargs):\n",
1360 |     "    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)\n",
1361 |     "    return model\n",
1362 |     "\n",
1363 |     "def bertForMultipleChoice(*args, **kwargs):\n",
1364 |     "    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)\n",
1365 |     "    return model\n",
1366 |     "\n",
1367 |     "def bertForQuestionAnswering(*args, **kwargs):\n",
1368 |     "    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)\n",
1369 |     "    return model\n",
1370 |     "\n",
1371 |     "def bertForTokenClassification(*args, **kwargs):\n",
1372 |     "    model = BertForTokenClassification.from_pretrained(*args, **kwargs)\n",
1373 |     "    return model"
1374 |    ]
1375 |   },
1376 |   {
1377 |    "cell_type": "markdown",
1378 |    "metadata": {},
1379 |    "source": [
1380 |     "## Bert Fine-Tuning"
1381 |    ]
1382 |   },
1383 |   {
1384 |    "cell_type": "code",
1385 |    "execution_count": null,
1386 |    "metadata": {},
1387 |    "outputs": [],
1388 |    "source": [
1389 |     "import argparse\n",
1390 |     "import logging\n",
1391 |     "import os\n",
1392 |     "import random\n",
1393 |     "from io import open\n",
1394 |     "\n",
1395 |     "import numpy as np\n",
1396 |     "import torch\n",
1397 |     "from torch.utils.data import DataLoader, Dataset, RandomSampler\n",
1398 |     "from torch.utils.data.distributed import DistributedSampler\n",
1399 |     "from tqdm import tqdm, trange\n",
1400 |     "\n",
1401 |     "from pytorch_pretrained_bert.modeling import BertForPreTraining\n",
1402 |     "from pytorch_pretrained_bert.tokenization import BertTokenizer\n",
1403 |     "from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule\n",
1404 |     "from torch.nn import CrossEntropyLoss\n",
1405 |     "\n",
1406 |     "\n",
1407 |     "class BERTDataset(Dataset):\n",
1408 |     "    def __init__(self, corpus_path, tokenizer, seq_len, encoding=\"utf-8\", corpus_lines=None, on_memory=True):\n",
1409 |     "        self.vocab = tokenizer.vocab\n",
1410 |     "        self.tokenizer = tokenizer\n",
1411 |     "        self.seq_len = seq_len\n",
1412 |     "        self.on_memory = on_memory\n",
1413 |     "        self.corpus_lines = corpus_lines  # number of non-empty lines in input corpus\n",
1414 |     "        self.corpus_path = corpus_path\n",
1415 |     "        self.encoding = encoding\n",
1416 |     "        self.current_doc = 0  # to avoid random sentence from same doc\n",
1417 |     "\n",
1418 |     "        # for loading samples directly from file\n",
1419 |     "        self.sample_counter = 0  # used to keep track of full epochs on file\n",
1420 |     "        self.line_buffer = None  # keep second sentence of a pair in memory and use as first sentence in next pair\n",
1421 |     "\n",
1422 |     "        # for loading samples in memory\n",
1423 |     "        self.current_random_doc = 0\n",
1424 |     "        self.num_docs = 0\n",
1425 |     "        self.sample_to_doc = [] # map sample index to doc and line\n",
1426 |     "\n",
1427 |     "        # load samples into memory\n",
1428 |     "        if on_memory:\n",
1429 |     "            self.all_docs = []\n",
1430 |     "            doc = []\n",
1431 |     "            self.corpus_lines = 0\n",
1432 |     "            with open(corpus_path, \"r\", encoding=encoding) as f:\n",
1433 |     "                for line in tqdm(f, desc=\"Loading Dataset\", total=corpus_lines):\n",
1434 |     "                    line = line.strip()\n",
1435 |     "                    if line == \"\":\n",
1436 |     "                        self.all_docs.append(doc)\n",
1437 |     "                        doc = []\n",
1438 |     "                        #remove last added sample because there won't be a subsequent line anymore in the doc\n",
1439 |     "                        self.sample_to_doc.pop()\n",
1440 |     "                    else:\n",
1441 |     "                        #store as one sample\n",
1442 |     "                        sample = {\"doc_id\": len(self.all_docs),\n",
1443 |     "                                  \"line\": len(doc)}\n",
1444 |     "                        self.sample_to_doc.append(sample)\n",
1445 |     "                        doc.append(line)\n",
1446 |     "                        self.corpus_lines = self.corpus_lines + 1\n",
1447 |     "\n",
1448 |     "            # if last row in file is not empty\n",
1449 |     "            if self.all_docs[-1] != doc:\n",
1450 |     "                self.all_docs.append(doc)\n",
1451 |     "                self.sample_to_doc.pop()\n",
1452 |     "\n",
1453 |     "            self.num_docs = len(self.all_docs)\n",
1454 |     "\n",
1455 |     "        # load samples later lazily from disk\n",
1456 |     "        else:\n",
1457 |     "            if self.corpus_lines is None:\n",
1458 |     "                with open(corpus_path, \"r\", encoding=encoding) as f:\n",
1459 |     "                    self.corpus_lines = 0\n",
1460 |     "                    for line in tqdm(f, desc=\"Loading Dataset\", total=corpus_lines):\n",
1461 |     "                        if line.strip() == \"\":\n",
1462 |     "                            self.num_docs += 1\n",
1463 |     "                        else:\n",
1464 |     "                            self.corpus_lines += 1\n",
1465 |     "\n",
1466 |     "                    # if doc does not end with empty line\n",
1467 |     "                    if line.strip() != \"\":\n",
1468 |     "                        self.num_docs += 1\n",
1469 |     "\n",
1470 |     "            self.file = open(corpus_path, \"r\", encoding=encoding)\n",
1471 |     "            self.random_file = open(corpus_path, \"r\", encoding=encoding)\n",
1472 |     "\n",
1473 |     "    def __len__(self):\n",
1474 |     "        # last line of doc won't be used, because there's no \"nextSentence\". Additionally, we start counting at 0.\n",
1475 |     "        return self.corpus_lines - self.num_docs - 1\n",
1476 |     "\n",
1477 |     "    def __getitem__(self, item):\n",
1478 |     "        cur_id = self.sample_counter\n",
1479 |     "        self.sample_counter += 1\n",
1480 |     "        if not self.on_memory:\n",
1481 |     "            # after one epoch we start again from beginning of file\n",
1482 |     "            if cur_id != 0 and (cur_id % len(self) == 0):\n",
1483 |     "                self.file.close()\n",
1484 |     "                self.file = open(self.corpus_path, \"r\", encoding=self.encoding)\n",
1485 |     "\n",
1486 |     "        t1, t2, is_next_label = self.random_sent(item)\n",
1487 |     "\n",
1488 |     "        # tokenize\n",
1489 |     "        tokens_a = self.tokenizer.tokenize(t1)\n",
1490 |     "        tokens_b = self.tokenizer.tokenize(t2)\n",
1491 |     "\n",
1492 |     "        # combine to one sample\n",
1493 |     "        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)\n",
1494 |     "\n",
1495 |     "        # transform sample to features\n",
1496 |     "        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)\n",
1497 |     "\n",
1498 |     "        cur_tensors = (torch.tensor(cur_features.input_ids),\n",
1499 |     "                       torch.tensor(cur_features.input_mask),\n",
1500 |     "                       torch.tensor(cur_features.segment_ids),\n",
1501 |     "                       torch.tensor(cur_features.lm_label_ids),\n",
1502 |     "                       torch.tensor(cur_features.is_next))\n",
1503 |     "\n",
1504 |     "        return cur_tensors\n",
1505 |     "\n",
1506 |     "    def random_sent(self, index):\n",
1507 |     "        \"\"\"\n",
1508 |     "        Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences\n",
1509 |     "        from one doc. With 50% the second sentence will be a random one from another doc.\n",
1510 |     "        :param index: int, index of sample.\n",
1511 |     "        :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label\n",
1512 |     "        \"\"\"\n",
1513 |     "        t1, t2 = self.get_corpus_line(index)\n",
1514 |     "        t = random.random()\n",
1515 |     "        if t > 0.5:\n",
1516 |     "            label = 0\n",
1517 |     "        else:\n",
1518 |     "            t2 = self.get_random_line()\n",
1519 |     "            label = 1\n",
1520 |     "\n",
1521 |     "        assert len(t1) > 0\n",
1522 |     "        assert len(t2) > 0\n",
1523 |     "        return t1, t2, label\n",
1524 |     "\n",
1525 |     "    def get_corpus_line(self, item):\n",
1526 |     "        \"\"\"\n",
1527 |     "        Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.\n",
1528 |     "        :param item: int, index of sample.\n",
1529 |     "        :return: (str, str), two subsequent sentences from corpus\n",
1530 |     "        \"\"\"\n",
1531 |     "        t1 = \"\"\n",
1532 |     "        t2 = \"\"\n",
1533 |     "        assert item < self.corpus_lines\n",
1534 |     "        if self.on_memory:\n",
1535 |     "            sample = self.sample_to_doc[item]\n",
1536 |     "            t1 = self.all_docs[sample[\"doc_id\"]][sample[\"line\"]]\n",
1537 |     "            t2 = self.all_docs[sample[\"doc_id\"]][sample[\"line\"]+1]\n",
1538 |     "            # used later to avoid random nextSentence from same doc\n",
1539 |     "            self.current_doc = sample[\"doc_id\"]\n",
1540 |     "            return t1, t2\n",
1541 |     "        else:\n",
1542 |     "            if self.line_buffer is None:\n",
1543 |     "                # read first non-empty line of file\n",
1544 |     "                while t1 == \"\" :\n",
1545 |     "                    t1 = next(self.file).strip()\n",
1546 |     "                    t2 = next(self.file).strip()\n",
1547 |     "            else:\n",
1548 |     "                # use t2 from previous iteration as new t1\n",
1549 |     "                t1 = self.line_buffer\n",
1550 |     "                t2 = next(self.file).strip()\n",
1551 |     "                # skip empty rows that are used for separating documents and keep track of current doc id\n",
1552 |     "                while t2 == \"\" or t1 == \"\":\n",
1553 |     "                    t1 = next(self.file).strip()\n",
1554 |     "                    t2 = next(self.file).strip()\n",
1555 |     "                    self.current_doc = self.current_doc+1\n",
1556 |     "            self.line_buffer = t2\n",
1557 |     "\n",
1558 |     "        assert t1 != \"\"\n",
1559 |     "        assert t2 != \"\"\n",
1560 |     "        return t1, t2\n",
1561 |     "\n",
1562 |     "    def get_random_line(self):\n",
1563 |     "        \"\"\"\n",
1564 |     "        Get random line from another document for nextSentence task.\n",
1565 |     "        :return: str, content of one line\n",
1566 |     "        \"\"\"\n",
1567 |     "        # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large\n",
1568 |     "        # corpora. However, just to be careful, we try to make sure that\n",
1569 |     "        # the random document is not the same as the document we're processing.\n",
1570 |     "        for _ in range(10):\n",
1571 |     "            if self.on_memory:\n",
1572 |     "                rand_doc_idx = random.randint(0, len(self.all_docs)-1)\n",
1573 |     "                rand_doc = self.all_docs[rand_doc_idx]\n",
1574 |     "                line = rand_doc[random.randrange(len(rand_doc))]\n",
1575 |     "            else:\n",
1576 |     "                rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)\n",
1577 |     "                # rand_index = 892\n",
1578 |     "                #pick random line\n",
1579 |     "                for _ in range(rand_index):\n",
1580 |     "                    line = self.get_next_line()\n",
1581 |     "                \n",
1582 |     "            #check if our picked random line is really from another doc like we want it to be\n",
1583 |     "            if self.current_random_doc != self.current_doc:\n",
1584 |     "                \n",
1585 |     "                break\n",
1586 |     "        # print(\"random Index:\", rand_index, line)\n",
1587 |     "        return line\n",
1588 |     "\n",
1589 |     "    def get_next_line(self):\n",
1590 |     "        \"\"\" Gets next line of random_file and starts over when reaching end of file\"\"\"\n",
1591 |     "        try:\n",
1592 |     "            line = next(self.random_file).strip()\n",
1593 |     "\n",
1594 |     "            #keep track of which document we are currently looking at to later avoid having the same doc as t1\n",
1595 |     "            while line == \"\":\n",
1596 |     "                self.current_random_doc = self.current_random_doc + 1\n",
1597 |     "                line = next(self.random_file).strip()\n",
1598 |     "        except StopIteration:\n",
1599 |     "            self.random_file.close()\n",
1600 |     "            self.random_file = open(self.corpus_path, \"r\", encoding=self.encoding)\n",
1601 |     "            line = next(self.random_file).strip()\n",
1602 |     "        \n",
1603 |     "        return line\n",
1604 |     "\n",
1605 |     "\n",
1606 |     "class InputExample(object):\n",
1607 |     "    \"\"\"A single training/test example for the language model.\"\"\"\n",
1608 |     "\n",
1609 |     "    def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):\n",
1610 |     "        \"\"\"Constructs a InputExample.\n",
1611 |     "        Args:\n",
1612 |     "            guid: Unique id for the example.\n",
1613 |     "            tokens_a: string. The untokenized text of the first sequence. For single\n",
1614 |     "            sequence tasks, only this sequence must be specified.\n",
1615 |     "            tokens_b: (Optional) string. The untokenized text of the second sequence.\n",
1616 |     "            Only must be specified for sequence pair tasks.\n",
1617 |     "            label: (Optional) string. The label of the example. This should be\n",
1618 |     "            specified for train and dev examples, but not for test examples.\n",
1619 |     "        \"\"\"\n",
1620 |     "        self.guid = guid\n",
1621 |     "        self.tokens_a = tokens_a\n",
1622 |     "        self.tokens_b = tokens_b\n",
1623 |     "        self.is_next = is_next  # nextSentence\n",
1624 |     "        self.lm_labels = lm_labels  # masked words for language model\n",
1625 |     "\n",
1626 |     "\n",
1627 |     "class InputFeatures(object):\n",
1628 |     "    \"\"\"A single set of features of data.\"\"\"\n",
1629 |     "\n",
1630 |     "    def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):\n",
1631 |     "        self.input_ids = input_ids\n",
1632 |     "        self.input_mask = input_mask\n",
1633 |     "        self.segment_ids = segment_ids\n",
1634 |     "        self.is_next = is_next\n",
1635 |     "        self.lm_label_ids = lm_label_ids\n",
1636 |     "\n",
1637 |     "\n",
1638 |     "def random_word(tokens, tokenizer):\n",
1639 |     "    \"\"\"\n",
1640 |     "    Masking some random tokens for Language Model task with probabilities as in the original BERT paper.\n",
1641 |     "    :param tokens: list of str, tokenized sentence.\n",
1642 |     "    :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)\n",
1643 |     "    :return: (list of str, list of int), masked tokens and related labels for LM prediction\n",
1644 |     "    \"\"\"\n",
1645 |     "    output_label = []\n",
1646 |     "\n",
1647 |     "    for i, token in enumerate(tokens):\n",
1648 |     "        prob = random.random()\n",
1649 |     "        # mask token with 15% probability\n",
1650 |     "        if prob < 0.15:\n",
1651 |     "            prob /= 0.15\n",
1652 |     "\n",
1653 |     "            # 80% randomly change token to mask token\n",
1654 |     "            if prob < 0.8:\n",
1655 |     "                tokens[i] = \"[MASK]\"\n",
1656 |     "\n",
1657 |     "            # 10% randomly change token to random token\n",
1658 |     "            elif prob < 0.9:\n",
1659 |     "                tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]\n",
1660 |     "\n",
1661 |     "            # -> rest 10% randomly keep current token\n",
1662 |     "\n",
1663 |     "            # append current token to output (we will predict these later)\n",
1664 |     "            try:\n",
1665 |     "                output_label.append(tokenizer.vocab[token])\n",
1666 |     "            except KeyError:\n",
1667 |     "                # For unknown words (should not occur with BPE vocab)\n",
1668 |     "                output_label.append(tokenizer.vocab[\"[UNK]\"])\n",
1669 |     "                logger.warning(\"Cannot find token '{}' in vocab. Using [UNK] insetad\".format(token))\n",
1670 |     "        else:\n",
1671 |     "            # no masking token (will be ignored by loss function later)\n",
1672 |     "            output_label.append(-1)\n",
1673 |     "\n",
1674 |     "    return tokens, output_label\n",
1675 |     "\n",
1676 |     "\n",
1677 |     "def convert_example_to_features(example, max_seq_length, tokenizer):\n",
1678 |     "    \"\"\"\n",
1679 |     "    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with\n",
1680 |     "    IDs, LM labels, input_mask, CLS and SEP tokens etc.\n",
1681 |     "    :param example: InputExample, containing sentence input as strings and is_next label\n",
1682 |     "    :param max_seq_length: int, maximum length of sequence.\n",
1683 |     "    :param tokenizer: Tokenizer\n",
1684 |     "    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)\n",
1685 |     "    \"\"\"\n",
1686 |     "    tokens_a = example.tokens_a\n",
1687 |     "    tokens_b = example.tokens_b\n",
1688 |     "    # Modifies `tokens_a` and `tokens_b` in place so that the total\n",
1689 |     "    # length is less than the specified length.\n",
1690 |     "    # Account for [CLS], [SEP], [SEP] with \"- 3\"\n",
1691 |     "    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n",
1692 |     "\n",
1693 |     "    tokens_a, t1_label = random_word(tokens_a, tokenizer)\n",
1694 |     "    tokens_b, t2_label = random_word(tokens_b, tokenizer)\n",
1695 |     "    # concatenate lm labels and account for CLS, SEP, SEP\n",
1696 |     "    lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])\n",
1697 |     "\n",
1698 |     "    # The convention in BERT is:\n",
1699 |     "    # (a) For sequence pairs:\n",
1700 |     "    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n",
1701 |     "    #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1\n",
1702 |     "    # (b) For single sequences:\n",
1703 |     "    #  tokens:   [CLS] the dog is hairy . [SEP]\n",
1704 |     "    #  type_ids: 0   0   0   0  0     0 0\n",
1705 |     "    #\n",
1706 |     "    # Where \"type_ids\" are used to indicate whether this is the first\n",
1707 |     "    # sequence or the second sequence. The embedding vectors for `type=0` and\n",
1708 |     "    # `type=1` were learned during pre-training and are added to the wordpiece\n",
1709 |     "    # embedding vector (and position vector). This is not *strictly* necessary\n",
1710 |     "    # since the [SEP] token unambigiously separates the sequences, but it makes\n",
1711 |     "    # it easier for the model to learn the concept of sequences.\n",
1712 |     "    #\n",
1713 |     "    # For classification tasks, the first vector (corresponding to [CLS]) is\n",
1714 |     "    # used as as the \"sentence vector\". Note that this only makes sense because\n",
1715 |     "    # the entire model is fine-tuned.\n",
1716 |     "    tokens = []\n",
1717 |     "    segment_ids = []\n",
1718 |     "    tokens.append(\"[CLS]\")\n",
1719 |     "    segment_ids.append(0)\n",
1720 |     "    for token in tokens_a:\n",
1721 |     "        tokens.append(token)\n",
1722 |     "        segment_ids.append(0)\n",
1723 |     "    tokens.append(\"[SEP]\")\n",
1724 |     "    segment_ids.append(0)\n",
1725 |     "\n",
1726 |     "    assert len(tokens_b) > 0\n",
1727 |     "    for token in tokens_b:\n",
1728 |     "        tokens.append(token)\n",
1729 |     "        segment_ids.append(1)\n",
1730 |     "    tokens.append(\"[SEP]\")\n",
1731 |     "    segment_ids.append(1)\n",
1732 |     "\n",
1733 |     "    input_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
1734 |     "\n",
1735 |     "    # The mask has 1 for real tokens and 0 for padding tokens. Only real\n",
1736 |     "    # tokens are attended to.\n",
1737 |     "    input_mask = [1] * len(input_ids)\n",
1738 |     "\n",
1739 |     "    # Zero-pad up to the sequence length.\n",
1740 |     "    while len(input_ids) < max_seq_length:\n",
1741 |     "        input_ids.append(0)\n",
1742 |     "        input_mask.append(0)\n",
1743 |     "        segment_ids.append(0)\n",
1744 |     "        lm_label_ids.append(-1)\n",
1745 |     "\n",
1746 |     "    assert len(input_ids) == max_seq_length\n",
1747 |     "    assert len(input_mask) == max_seq_length\n",
1748 |     "    assert len(segment_ids) == max_seq_length\n",
1749 |     "    assert len(lm_label_ids) == max_seq_length\n",
1750 |     "\n",
1751 |     "    if example.guid < 5:\n",
1752 |     "        logger.info(\"*** Example ***\")\n",
1753 |     "        logger.info(\"guid: %s\" % (example.guid))\n",
1754 |     "        logger.info(\"tokens: %s\" % \" \".join(\n",
1755 |     "                [str(x) for x in tokens]))\n",
1756 |     "        logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n",
1757 |     "        logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n",
1758 |     "        logger.info(\n",
1759 |     "                \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n",
1760 |     "        logger.info(\"LM label: %s \" % (lm_label_ids))\n",
1761 |     "        logger.info(\"Is next sentence label: %s \" % (example.is_next))\n",
1762 |     "\n",
1763 |     "    features = InputFeatures(input_ids=input_ids,\n",
1764 |     "                             input_mask=input_mask,\n",
1765 |     "                             segment_ids=segment_ids,\n",
1766 |     "                             lm_label_ids=lm_label_ids,\n",
1767 |     "                             is_next=example.is_next)\n",
1768 |     "    return features"
1769 |    ]
1770 |   },
1771 |   {
1772 |    "cell_type": "markdown",
1773 |    "metadata": {
1774 |     "ExecuteTime": {
1775 |      "end_time": "2019-09-10T08:38:21.419881Z",
1776 |      "start_time": "2019-09-10T08:38:21.415733Z"
1777 |     }
1778 |    },
1779 |    "source": [
1780 |     "## Build Vocabulary"
1781 |    ]
1782 |   },
1783 |   {
1784 |    "cell_type": "code",
1785 |    "execution_count": null,
1786 |    "metadata": {},
1787 |    "outputs": [],
1788 |    "source": [
1789 |     "class MTLField(Field):\n",
1790 |     "\n",
1791 |     "    def __init__(\n",
1792 |     "            self, **kwargs):\n",
1793 |     "        super(MTLField, self).__init__(**kwargs)\n",
1794 |     "\n",
1795 |     "    def build_vocab(self, dataset_list, **kwargs):\n",
1796 |     "        ## Load BERT\n",
1797 |     "        counter = Counter()\n",
1798 |     "        sources = []\n",
1799 |     "        for arg in dataset_list:\n",
1800 |     "            if isinstance(arg, Dataset):\n",
1801 |     "                sources += [getattr(arg, name) for name, field in\n",
1802 |     "                            arg.fields.items() if field is self]\n",
1803 |     "            else:\n",
1804 |     "                sources.append(arg)\n",
1805 |     "        for data in sources:\n",
1806 |     "            for x in data:\n",
1807 |     "                if not self.sequential:\n",
1808 |     "                    x = [x]\n",
1809 |     "                counter.update(x)\n",
1810 |     "        specials = list(OrderedDict.fromkeys(\n",
1811 |     "            tok for tok in [self.pad_token, self.init_token, self.eos_token]\n",
1812 |     "            if tok is not None))\n",
1813 |     "        self.vocab = Vocab(counter, specials=specials, **kwargs)"
1814 |    ]
1815 |   },
1816 |   {
1817 |    "cell_type": "markdown",
1818 |    "metadata": {},
1819 |    "source": [
1820 |     "## MAML CNN Classifier"
1821 |    ]
1822 |   },
1823 |   {
1824 |    "cell_type": "code",
1825 |    "execution_count": null,
1826 |    "metadata": {},
1827 |    "outputs": [],
1828 |    "source": [
1829 |     "import sys, os, glob, random\n",
1830 |     "import time\n",
1831 |     "import parser\n",
1832 |     "import torch\n",
1833 |     "import torch.nn as nn\n",
1834 |     "# from AdaAdam import AdaAdam\n",
1835 |     "import torch.optim as OPT\n",
1836 |     "import numpy as np\n",
1837 |     "from copy import deepcopy\n",
1838 |     "from tqdm import tqdm, trange\n",
1839 |     "import logging\n",
1840 |     "\n",
1841 |     "from torchtext import data\n",
1842 |     "import DataProcessing\n",
1843 |     "from DataProcessing.MLTField import MTLField\n",
1844 |     "from DataProcessing.NlcDatasetSingleFile import NlcDatasetSingleFile\n",
1845 |     "from CNNModel import CNNModel\n",
1846 |     "\n",
1847 |     "\n",
1848 |     "logger = logging.getLogger(__name__)\n",
1849 |     "\n",
1850 |     "logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n",
1851 |     "                    datefmt = '%m/%d/%Y %H:%M:%S',\n",
1852 |     "                    level = logging.INFO )\n",
1853 |     "batch_size = 10\n",
1854 |     "seed = 12345678\n",
1855 |     "torch.manual_seed(seed)\n",
1856 |     "Train = False\n",
1857 |     "\n",
1858 |     "\n",
1859 |     "device = torch.device(\"cuda\" if torch.cuda.is_available()  else \"cpu\")\n",
1860 |     "n_gpu = torch.cuda.device_count()\n",
1861 |     "random.seed(seed)\n",
1862 |     "np.random.seed(seed)\n",
1863 |     "torch.manual_seed(seed)\n",
1864 |     "if n_gpu > 0:\n",
1865 |     "    torch.cuda.manual_seed_all(seed)\n",
1866 |     "\n",
1867 |     "def load_train_test_files(listfilename, test_suffix='.test'):\n",
1868 |     "    filein = open(listfilename, 'r')\n",
1869 |     "    file_tuples = []\n",
1870 |     "    task_classes = ['.t2', '.t4', '.t5']\n",
1871 |     "    for line in filein:\n",
1872 |     "        array = line.strip().split('\\t')\n",
1873 |     "        line = array[0]\n",
1874 |     "        for t_class in task_classes:\n",
1875 |     "            trainfile = line + t_class + '.train'\n",
1876 |     "            devfile = line + t_class + '.dev'\n",
1877 |     "            testfile = line + t_class + test_suffix\n",
1878 |     "            file_tuples.append((trainfile, devfile, testfile))\n",
1879 |     "    filein.close()\n",
1880 |     "    return file_tuples\n",
1881 |     "\n",
1882 |     "filelist = 'data/Amazon_few_shot/workspace.filtered.list'\n",
1883 |     "targetlist = 'data/Amazon_few_shot/workspace.target.list'\n",
1884 |     "workingdir = 'data/Amazon_few_shot'\n",
1885 |     "emfilename = 'glove.6B.300d'\n",
1886 |     "emfiledir = '..'\n",
1887 |     "\n",
1888 |     "datasets = []\n",
1889 |     "list_datasets = []\n",
1890 |     "\n",
1891 |     "\n",
1892 |     "file_tuples = load_train_test_files(filelist)\n",
1893 |     "print(file_tuples)\n",
1894 |     "\n",
1895 |     "TEXT = MTLField(lower=True)\n",
1896 |     "for (trainfile, devfile, testfile) in file_tuples:\n",
1897 |     "    print(trainfile, devfile, testfile)\n",
1898 |     "    LABEL1 = data.Field(sequential=False)\n",
1899 |     "    train1, dev1, test1 = NlcDatasetSingleFile.splits(\n",
1900 |     "        TEXT, LABEL1, path=workingdir, train=trainfile,\n",
1901 |     "        validation=devfile, test=testfile)\n",
1902 |     "    datasets.append((TEXT, LABEL1, train1, dev1, test1))\n",
1903 |     "    list_datasets.append(train1)\n",
1904 |     "    list_datasets.append(dev1)\n",
1905 |     "    list_datasets.append(test1)\n",
1906 |     "\n",
1907 |     "target_datasets = []\n",
1908 |     "target_file = load_train_test_files(targetlist)\n",
1909 |     "print(target_file)\n",
1910 |     "\n",
1911 |     "for (trainfile, devfile, testfile) in target_file:\n",
1912 |     "    print(trainfile, devfile, testfile)\n",
1913 |     "    LABEL2 = data.Field(sequential=False)\n",
1914 |     "    train2, dev2, test2 = NlcDatasetSingleFile.splits(TEXT, LABEL2, path=workingdir, \n",
1915 |     "    train=trainfile,validation=devfile, test=testfile)\n",
1916 |     "    target_datasets.append((TEXT, LABEL2, train2, dev2, test2))\n",
1917 |     "\n",
1918 |     "    \n",
1919 |     "\n",
1920 |     "datasets_iters = []\n",
1921 |     "for (TEXT, LABEL, train, dev, test) in datasets:\n",
1922 |     "    train_iter, dev_iter, test_iter = data.BucketIterator.splits(\n",
1923 |     "        (train, dev, test), batch_size=batch_size, device=device,shuffle=True)\n",
1924 |     "    train_iter.repeat = False\n",
1925 |     "    datasets_iters.append((train_iter, dev_iter, test_iter))\n",
1926 |     "\n",
1927 |     "fsl_ds_iters = []\n",
1928 |     "for (TEXT, LABEL, train, dev, test) in target_datasets:\n",
1929 |     "    train_iter, dev_iter, test_iter = data.BucketIterator.splits(\n",
1930 |     "        (train,dev, test), batch_size=batch_size, device=device)\n",
1931 |     "    train_iter.repeat = False\n",
1932 |     "    fsl_ds_iters.append((train_iter, dev_iter, test_iter))\n",
1933 |     "\n",
1934 |     "num_batch_total = 0\n",
1935 |     "for i, (TEXT, LABEL, train, dev, test) in enumerate(datasets):\n",
1936 |     "    # print('DATASET%d'%(i+1))\n",
1937 |     "    # print('train.fields', train.fields)\n",
1938 |     "    # print('len(train)', len(train))\n",
1939 |     "    # print('len(dev)', len(dev))\n",
1940 |     "    # print('len(test)', len(test))\n",
1941 |     "    # print('vars(train[0])', vars(train[0]))\n",
1942 |     "    num_batch_total += len(train) / batch_size\n",
1943 |     "\n",
1944 |     "TEXT.build_vocab(list_datasets, vectors = emfilename, vectors_cache = emfiledir)\n",
1945 |     "# TEXT.build_vocab(list_dataset)\n",
1946 |     "\n",
1947 |     "# build the vocabulary\n",
1948 |     "for taskid, (TEXT, LABEL, train, dev, test) in enumerate(datasets):\n",
1949 |     "    LABEL.build_vocab(train, dev, test)\n",
1950 |     "    LABEL.vocab.itos = LABEL.vocab.itos[1:]\n",
1951 |     "\n",
1952 |     "    for k, v in LABEL.vocab.stoi.items():\n",
1953 |     "        LABEL.vocab.stoi[k] = v - 1\n",
1954 |     "\n",
1955 |     "    # print vocab information\n",
1956 |     "    # print('len(TEXT.vocab)', len(TEXT.vocab))\n",
1957 |     "    # print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())\n",
1958 |     "\n",
1959 |     "    # print(LABEL.vocab.itos)\n",
1960 |     "    # print(len(LABEL.vocab.itos))\n",
1961 |     "\n",
1962 |     "    # print(len(LABEL.vocab.stoi))\n",
1963 |     "fsl_num_tasks = 0\n",
1964 |     "for taskid, (TEXT, LABEL, train, dev, test) in enumerate(target_datasets):\n",
1965 |     "    fsl_num_tasks += 1\n",
1966 |     "    LABEL.build_vocab(train, dev, test)\n",
1967 |     "    LABEL.vocab.itos = LABEL.vocab.itos[1:]\n",
1968 |     "    for k, v in LABEL.vocab.stoi.items():\n",
1969 |     "        LABEL.vocab.stoi[k] = v - 1\n",
1970 |     "\n",
1971 |     "nums_embed = len(TEXT.vocab)\n",
1972 |     "dim_embed = 100\n",
1973 |     "dim_w_hid = 200\n",
1974 |     "dim_h_hid = 100\n",
1975 |     "Inner_lr = 2e-6\n",
1976 |     "Outer_lr = 1e-5\n",
1977 |     "\n",
1978 |     "n_labels = []\n",
1979 |     "for (TEXT, LABEL, train, dev, test) in datasets:\n",
1980 |     "   n_labels.append(len(LABEL.vocab))\n",
1981 |     "print(n_labels)\n",
1982 |     "num_tasks = len(n_labels)\n",
1983 |     "print(\"num_tasks\", num_tasks)\n",
1984 |     "winsize = 3\n",
1985 |     "num_labels = len(LABEL.vocab.itos)\n",
1986 |     "model = CNNModel(nums_embed, num_labels, dim_embed, dim_w_hid, dim_h_hid, winsize, batch_size)\n",
1987 |     "\n",
1988 |     "print(\"GPU Device: \", device)\n",
1989 |     "model.to(device)\n",
1990 |     "print(model)\n",
1991 |     "\n",
1992 |     "criterion = nn.CrossEntropyLoss()\n",
1993 |     "opt = OPT.Adam(model.parameters(), lr=Inner_lr)\n",
1994 |     "Inner_epochs = 4\n",
1995 |     "epochs = 2\n",
1996 |     "\n",
1997 |     "N_task = 5\n",
1998 |     "\n",
1999 |     "task_list = np.arange(num_tasks)\n",
2000 |     "print(\"Total Batch: \", num_batch_total)\n",
2001 |     "output_model_file = '/tmp/CNN_MAML_output'\n",
2002 |     "if Train:\n",
2003 |     "    for t in trange(int(num_batch_total*epochs/Inner_epochs), desc=\"Iterations\"):\n",
2004 |     "        selected_task = np.random.choice(task_list, N_task,replace=False)\n",
2005 |     "        weight_before = deepcopy(model.state_dict())\n",
2006 |     "        update_vars = []\n",
2007 |     "        fomaml_vars = []\n",
2008 |     "        for task_id in selected_task:\n",
2009 |     "            # print(task_id)\n",
2010 |     "            (train_iter, dev_iter, test_iter) = datasets_iters[task_id]\n",
2011 |     "            train_iter.init_epoch()\n",
2012 |     "            model.train()\n",
2013 |     "            n_correct = 0\n",
2014 |     "            n_step = 0\n",
2015 |     "            for inner_iter in range(Inner_epochs):\n",
2016 |     "                batch = next(iter(train_iter))\n",
2017 |     "\n",
2018 |     "                # print(batch.text)\n",
2019 |     "                # print(batch.label)\n",
2020 |     "                logits = model(batch.text)\n",
2021 |     "                loss = criterion(logits.view(-1, num_labels), batch.label.data.view(-1))\n",
2022 |     "                \n",
2023 |     "\n",
2024 |     "                n_correct = (torch.max(logits, 1)[1].view(batch.label.size()).data == batch.label.data).sum()\n",
2025 |     "                n_step = batch.batch_size\n",
2026 |     "                loss.backward()\n",
2027 |     "                opt.step()\n",
2028 |     "                opt.zero_grad()\n",
2029 |     "            task_acc = 100.*n_correct/n_step\n",
2030 |     "            if t%10 == 0:\n",
2031 |     "                logger.info(\"Iter: %d, task id: %d, train acc: %f\", t, task_id, task_acc)\n",
2032 |     "            weight_after = deepcopy(model.state_dict())\n",
2033 |     "            update_vars.append(weight_after)\n",
2034 |     "            model.load_state_dict(weight_before)\n",
2035 |     "\n",
2036 |     "        new_weight_dict = {}\n",
2037 |     "        for name in weight_before:\n",
2038 |     "            weight_list = [tmp_weight_dict[name] for tmp_weight_dict in update_vars]\n",
2039 |     "            weight_shape = list(weight_list[0].size())\n",
2040 |     "            stack_shape = [len(weight_list)] + weight_shape\n",
2041 |     "            stack_weight = torch.empty(stack_shape)\n",
2042 |     "            for i in range(len(weight_list)):\n",
2043 |     "                stack_weight[i,:] = weight_list[i] \n",
2044 |     "            new_weight_dict[name] = torch.mean(stack_weight, dim=0).cuda()\n",
2045 |     "            new_weight_dict[name] = weight_before[name]+(new_weight_dict[name]-weight_before[name])/Inner_lr*Outer_lr\n",
2046 |     "        model.load_state_dict(new_weight_dict)\n",
2047 |     "\n",
2048 |     "\n",
2049 |     "    torch.save(model.state_dict(), output_model_file)\n",
2050 |     "\n",
2051 |     "model.load_state_dict(torch.load(output_model_file))\n",
2052 |     "logger.info(\"***** Running evaluation *****\")\n",
2053 |     "fsl_task_list = np.arange(fsl_num_tasks)\n",
2054 |     "weight_before = deepcopy(model.state_dict())\n",
2055 |     "fsl_epochs = 3\n",
2056 |     "Total_acc = 0\n",
2057 |     "opt = OPT.Adam(model.parameters(), lr=3e-4)\n",
2058 |     "\n",
2059 |     "for task_id in fsl_task_list:\n",
2060 |     "    model.train()\n",
2061 |     "    (train_iter, dev_iter, test_iter) = fsl_ds_iters[task_id]\n",
2062 |     "    train_iter.init_epoch()\n",
2063 |     "    batch = next(iter(train_iter))\n",
2064 |     "    for i in range(fsl_epochs):\n",
2065 |     "        logits = model(batch.text)\n",
2066 |     "        loss = criterion(logits.view(-1, num_labels), batch.label.data.view(-1))\n",
2067 |     "        n_correct = (torch.max(logits, 1)[1].view(batch.label.size()).data == batch.label.data).sum()\n",
2068 |     "        n_size = batch.batch_size\n",
2069 |     "        train_acc = 100. * n_correct / n_size\n",
2070 |     "        loss = criterion(logits.view(-1, num_labels), batch.label.data.view(-1))\n",
2071 |     "        loss.backward()\n",
2072 |     "        opt.step()\n",
2073 |     "        opt.zero_grad()\n",
2074 |     "        logger.info(\"  Task id: %d, fsl epoch: %d, Acc: %f, loss: %f\", task_id, i, train_acc, loss)\n",
2075 |     "\n",
2076 |     "    model.eval()\n",
2077 |     "    test_iter.init_epoch()\n",
2078 |     "    n_correct = 0\n",
2079 |     "    n_size = 0\n",
2080 |     "    for test_batch_idx, test_batch in enumerate(test_iter):\n",
2081 |     "        with torch.no_grad():\n",
2082 |     "            logits = model(test_batch.text)\n",
2083 |     "        loss = criterion(logits.view(-1, num_labels), test_batch.label.data.view(-1))\n",
2084 |     "        n_correct += (torch.max(logits, 1)[1].view(test_batch.label.size()).data == test_batch.label.data).sum()\n",
2085 |     "        n_size += test_batch.batch_size\n",
2086 |     "    test_acc = 100.* n_correct/n_size\n",
2087 |     "    logger.info(\"FSL test Number: %d, Accuracy: %f\",n_size, test_acc)\n",
2088 |     "    Total_acc += test_acc\n",
2089 |     "    model.load_state_dict(weight_before)\n",
2090 |     "\n",
2091 |     "print(\"Mean Accuracy is : \", float(Total_acc)/fsl_num_tasks)\n"
2092 |    ]
2093 |   }
2094 |  ],
2095 |  "metadata": {
2096 |   "kernelspec": {
2097 |    "display_name": "Python 3",
2098 |    "language": "python",
2099 |    "name": "python3"
2100 |   },
2101 |   "language_info": {
2102 |    "codemirror_mode": {
2103 |     "name": "ipython",
2104 |     "version": 3
2105 |    },
2106 |    "file_extension": ".py",
2107 |    "mimetype": "text/x-python",
2108 |    "name": "python",
2109 |    "nbconvert_exporter": "python",
2110 |    "pygments_lexer": "ipython3",
2111 |    "version": "3.6.5"
2112 |   },
2113 |   "latex_envs": {
2114 |    "LaTeX_envs_menu_present": true,
2115 |    "autoclose": false,
2116 |    "autocomplete": true,
2117 |    "bibliofile": "biblio.bib",
2118 |    "cite_by": "apalike",
2119 |    "current_citInitial": 1,
2120 |    "eqLabelWithNumbers": true,
2121 |    "eqNumInitial": 1,
2122 |    "hotkeys": {
2123 |     "equation": "Ctrl-E",
2124 |     "itemize": "Ctrl-I"
2125 |    },
2126 |    "labels_anchors": false,
2127 |    "latex_user_defs": false,
2128 |    "report_style_numbering": false,
2129 |    "user_envs_cfg": false
2130 |   }
2131 |  },
2132 |  "nbformat": 4,
2133 |  "nbformat_minor": 2
2134 | }
2135 | 


--------------------------------------------------------------------------------
/examples/KNNClassifierDemo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# K-NN Classifier - Example"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 13,
 13 |    "metadata": {
 14 |     "ExecuteTime": {
 15 |      "end_time": "2019-09-14T05:58:22.844762Z",
 16 |      "start_time": "2019-09-14T05:58:22.841213Z"
 17 |     }
 18 |    },
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "import fsText"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 18,
 27 |    "metadata": {
 28 |     "ExecuteTime": {
 29 |      "end_time": "2019-09-14T05:59:21.382000Z",
 30 |      "start_time": "2019-09-14T05:59:21.375627Z"
 31 |     }
 32 |    },
 33 |    "outputs": [
 34 |     {
 35 |      "data": {
 36 |       "text/plain": [
 37 |        "fsText.Classifier.CosineClassifier"
 38 |       ]
 39 |      },
 40 |      "execution_count": 18,
 41 |      "metadata": {},
 42 |      "output_type": "execute_result"
 43 |     }
 44 |    ],
 45 |    "source": [
 46 |     "fsText.Classifier.CosineClassifier"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 17,
 52 |    "metadata": {
 53 |     "ExecuteTime": {
 54 |      "end_time": "2019-09-14T05:58:53.112061Z",
 55 |      "start_time": "2019-09-14T05:58:53.096141Z"
 56 |     }
 57 |    },
 58 |    "outputs": [
 59 |     {
 60 |      "ename": "ImportError",
 61 |      "evalue": "cannot import name 'RFClassifier'",
 62 |      "output_type": "error",
 63 |      "traceback": [
 64 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 65 |       "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
 66 |       "\u001b[0;32m<ipython-input-17-d04b84bb3f16>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m#from fsText import KNNClassifier\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mfsText\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mClassifier\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRFClassifier\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 67 |       "\u001b[0;31mImportError\u001b[0m: cannot import name 'RFClassifier'"
 68 |      ]
 69 |     }
 70 |    ],
 71 |    "source": [
 72 |     "#from fsText import KNNClassifier\n",
 73 |     "from fsText.Classifier import RFClassifier\n",
 74 |     "import pandas as pd\n",
 75 |     "import numpy as np"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 2,
 81 |    "metadata": {
 82 |     "ExecuteTime": {
 83 |      "end_time": "2019-09-13T21:43:07.573588Z",
 84 |      "start_time": "2019-09-13T21:41:24.320231Z"
 85 |     }
 86 |    },
 87 |    "outputs": [
 88 |     {
 89 |      "name": "stdout",
 90 |      "output_type": "stream",
 91 |      "text": [
 92 |       "Loading pre-trained Word2Vec model...\n"
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "clf = KNNClassifier()"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 3,
103 |    "metadata": {
104 |     "ExecuteTime": {
105 |      "end_time": "2019-09-13T21:43:07.642012Z",
106 |      "start_time": "2019-09-13T21:43:07.576367Z"
107 |     }
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "df = pd.read_csv('examples/datasets/stack.csv')\n",
112 |     "\n",
113 |     "def gen_sample(sample_size, num_classes):\n",
114 |     "    \n",
115 |     "    df_1 = df[(df[\"Label\"]<num_classes + 1)].reset_index().drop([\"index\"], axis=1).reset_index().drop([\"index\"], axis=1)\n",
116 |     "    train = df_1[df_1[\"Label\"] == np.unique(df_1['Label'])[0]].sample(sample_size)\n",
117 |     "    \n",
118 |     "    train_index = train.index.tolist()\n",
119 |     "    \n",
120 |     "    for i in range(1,num_classes):\n",
121 |     "        train_2 = df_1[df_1[\"Label\"] == np.unique(df_1['Label'])[i]].sample(sample_size)\n",
122 |     "        train = pd.concat([train, train_2], axis=0)\n",
123 |     "        train_index.extend(train_2.index.tolist())\n",
124 |     "        \n",
125 |     "    test = df_1[~df_1.index.isin(train_index)]\n",
126 |     "\n",
127 |     "    return train, test"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 8,
133 |    "metadata": {
134 |     "ExecuteTime": {
135 |      "end_time": "2019-09-13T21:43:21.436978Z",
136 |      "start_time": "2019-09-13T21:43:21.404884Z"
137 |     }
138 |    },
139 |    "outputs": [
140 |     {
141 |      "data": {
142 |       "text/html": [
143 |        "<div>\n",
144 |        "<style scoped>\n",
145 |        "    .dataframe tbody tr th:only-of-type {\n",
146 |        "        vertical-align: middle;\n",
147 |        "    }\n",
148 |        "\n",
149 |        "    .dataframe tbody tr th {\n",
150 |        "        vertical-align: top;\n",
151 |        "    }\n",
152 |        "\n",
153 |        "    .dataframe thead th {\n",
154 |        "        text-align: right;\n",
155 |        "    }\n",
156 |        "</style>\n",
157 |        "<table border=\"1\" class=\"dataframe\">\n",
158 |        "  <thead>\n",
159 |        "    <tr style=\"text-align: right;\">\n",
160 |        "      <th></th>\n",
161 |        "      <th>Text</th>\n",
162 |        "      <th>Label</th>\n",
163 |        "    </tr>\n",
164 |        "  </thead>\n",
165 |        "  <tbody>\n",
166 |        "    <tr>\n",
167 |        "      <th>3958</th>\n",
168 |        "      <td>WP_Insert_Post and GUID Issue [Wordpress]</td>\n",
169 |        "      <td>1</td>\n",
170 |        "    </tr>\n",
171 |        "    <tr>\n",
172 |        "      <th>2540</th>\n",
173 |        "      <td>How can I debug WordPress in IIS?</td>\n",
174 |        "      <td>1</td>\n",
175 |        "    </tr>\n",
176 |        "    <tr>\n",
177 |        "      <th>3594</th>\n",
178 |        "      <td>wordpress: how to get x youtube thumbnails of ...</td>\n",
179 |        "      <td>1</td>\n",
180 |        "    </tr>\n",
181 |        "    <tr>\n",
182 |        "      <th>3240</th>\n",
183 |        "      <td>Where to place a query to show only one post i...</td>\n",
184 |        "      <td>1</td>\n",
185 |        "    </tr>\n",
186 |        "    <tr>\n",
187 |        "      <th>3638</th>\n",
188 |        "      <td>Wordpress Blog RSS Feed Problems</td>\n",
189 |        "      <td>1</td>\n",
190 |        "    </tr>\n",
191 |        "    <tr>\n",
192 |        "      <th>3702</th>\n",
193 |        "      <td>Excluding one category in Wordpress</td>\n",
194 |        "      <td>1</td>\n",
195 |        "    </tr>\n",
196 |        "    <tr>\n",
197 |        "      <th>1720</th>\n",
198 |        "      <td>debuging \"register_activation_hook\" in wordpress</td>\n",
199 |        "      <td>1</td>\n",
200 |        "    </tr>\n",
201 |        "    <tr>\n",
202 |        "      <th>3622</th>\n",
203 |        "      <td>Wordpress \"Read more\" is not working</td>\n",
204 |        "      <td>1</td>\n",
205 |        "    </tr>\n",
206 |        "    <tr>\n",
207 |        "      <th>3895</th>\n",
208 |        "      <td>Create blog post simply and easily</td>\n",
209 |        "      <td>1</td>\n",
210 |        "    </tr>\n",
211 |        "    <tr>\n",
212 |        "      <th>3727</th>\n",
213 |        "      <td>Why is IE7 rendering these differently?</td>\n",
214 |        "      <td>1</td>\n",
215 |        "    </tr>\n",
216 |        "    <tr>\n",
217 |        "      <th>3900</th>\n",
218 |        "      <td>Using AJAX to load WordPress pages</td>\n",
219 |        "      <td>1</td>\n",
220 |        "    </tr>\n",
221 |        "    <tr>\n",
222 |        "      <th>3310</th>\n",
223 |        "      <td>WordPress Monthly Archive by Year</td>\n",
224 |        "      <td>1</td>\n",
225 |        "    </tr>\n",
226 |        "    <tr>\n",
227 |        "      <th>3521</th>\n",
228 |        "      <td>Is there an easier way to add menu items to a ...</td>\n",
229 |        "      <td>1</td>\n",
230 |        "    </tr>\n",
231 |        "    <tr>\n",
232 |        "      <th>538</th>\n",
233 |        "      <td>Wordpress: How can i move my index (blogpage) ...</td>\n",
234 |        "      <td>1</td>\n",
235 |        "    </tr>\n",
236 |        "    <tr>\n",
237 |        "      <th>3286</th>\n",
238 |        "      <td>wp_list_categories does not show current category</td>\n",
239 |        "      <td>1</td>\n",
240 |        "    </tr>\n",
241 |        "    <tr>\n",
242 |        "      <th>388</th>\n",
243 |        "      <td>What is the default URL for APEX for an Oracle...</td>\n",
244 |        "      <td>2</td>\n",
245 |        "    </tr>\n",
246 |        "    <tr>\n",
247 |        "      <th>2147</th>\n",
248 |        "      <td>Oracle DB: How can I write query ignoring case?</td>\n",
249 |        "      <td>2</td>\n",
250 |        "    </tr>\n",
251 |        "    <tr>\n",
252 |        "      <th>1332</th>\n",
253 |        "      <td>Oracle converts empty string to null but JPA d...</td>\n",
254 |        "      <td>2</td>\n",
255 |        "    </tr>\n",
256 |        "    <tr>\n",
257 |        "      <th>575</th>\n",
258 |        "      <td>Is there a way to do full text search of all o...</td>\n",
259 |        "      <td>2</td>\n",
260 |        "    </tr>\n",
261 |        "    <tr>\n",
262 |        "      <th>1943</th>\n",
263 |        "      <td>Faster 'select distinct thing_id,thing_name fr...</td>\n",
264 |        "      <td>2</td>\n",
265 |        "    </tr>\n",
266 |        "    <tr>\n",
267 |        "      <th>1564</th>\n",
268 |        "      <td>When did oracle start supporting \"top\": select...</td>\n",
269 |        "      <td>2</td>\n",
270 |        "    </tr>\n",
271 |        "    <tr>\n",
272 |        "      <th>2063</th>\n",
273 |        "      <td>Return an Oracle Ref Cursor to a SqlServer T-S...</td>\n",
274 |        "      <td>2</td>\n",
275 |        "    </tr>\n",
276 |        "    <tr>\n",
277 |        "      <th>2541</th>\n",
278 |        "      <td>How do I insert sysdate into a column using OD...</td>\n",
279 |        "      <td>2</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>981</th>\n",
283 |        "      <td>Is it possible to kill a single query in oracl...</td>\n",
284 |        "      <td>2</td>\n",
285 |        "    </tr>\n",
286 |        "    <tr>\n",
287 |        "      <th>2282</th>\n",
288 |        "      <td>Calculate difference between 2 date / times in...</td>\n",
289 |        "      <td>2</td>\n",
290 |        "    </tr>\n",
291 |        "    <tr>\n",
292 |        "      <th>404</th>\n",
293 |        "      <td>Oracle date</td>\n",
294 |        "      <td>2</td>\n",
295 |        "    </tr>\n",
296 |        "    <tr>\n",
297 |        "      <th>2218</th>\n",
298 |        "      <td>Compare strings by their written representatio...</td>\n",
299 |        "      <td>2</td>\n",
300 |        "    </tr>\n",
301 |        "    <tr>\n",
302 |        "      <th>1081</th>\n",
303 |        "      <td>Allow Oracle User to connect from one IP addre...</td>\n",
304 |        "      <td>2</td>\n",
305 |        "    </tr>\n",
306 |        "    <tr>\n",
307 |        "      <th>2346</th>\n",
308 |        "      <td>What's the equivalent of Oracle's to_char in A...</td>\n",
309 |        "      <td>2</td>\n",
310 |        "    </tr>\n",
311 |        "    <tr>\n",
312 |        "      <th>871</th>\n",
313 |        "      <td>Oracle: how to use updateXML to update multipl...</td>\n",
314 |        "      <td>2</td>\n",
315 |        "    </tr>\n",
316 |        "    <tr>\n",
317 |        "      <th>338</th>\n",
318 |        "      <td>subversion diff including new files</td>\n",
319 |        "      <td>3</td>\n",
320 |        "    </tr>\n",
321 |        "    <tr>\n",
322 |        "      <th>663</th>\n",
323 |        "      <td>Unlocking a SVN working copy which has unversi...</td>\n",
324 |        "      <td>3</td>\n",
325 |        "    </tr>\n",
326 |        "    <tr>\n",
327 |        "      <th>898</th>\n",
328 |        "      <td>Windows Backup for SVN Repositories</td>\n",
329 |        "      <td>3</td>\n",
330 |        "    </tr>\n",
331 |        "    <tr>\n",
332 |        "      <th>2376</th>\n",
333 |        "      <td>Using svn:ignore to ignore everything but cert...</td>\n",
334 |        "      <td>3</td>\n",
335 |        "    </tr>\n",
336 |        "    <tr>\n",
337 |        "      <th>1323</th>\n",
338 |        "      <td>How can I only commit property changes without...</td>\n",
339 |        "      <td>3</td>\n",
340 |        "    </tr>\n",
341 |        "    <tr>\n",
342 |        "      <th>2098</th>\n",
343 |        "      <td>How to forbit subversion commits to svn:extern...</td>\n",
344 |        "      <td>3</td>\n",
345 |        "    </tr>\n",
346 |        "    <tr>\n",
347 |        "      <th>136</th>\n",
348 |        "      <td>How can I speed up SVN updates?</td>\n",
349 |        "      <td>3</td>\n",
350 |        "    </tr>\n",
351 |        "    <tr>\n",
352 |        "      <th>57</th>\n",
353 |        "      <td>Begining SVN</td>\n",
354 |        "      <td>3</td>\n",
355 |        "    </tr>\n",
356 |        "    <tr>\n",
357 |        "      <th>1013</th>\n",
358 |        "      <td>Is there any way to only update added files?</td>\n",
359 |        "      <td>3</td>\n",
360 |        "    </tr>\n",
361 |        "    <tr>\n",
362 |        "      <th>1137</th>\n",
363 |        "      <td>Change Revesion Number in Subversion, even if ...</td>\n",
364 |        "      <td>3</td>\n",
365 |        "    </tr>\n",
366 |        "    <tr>\n",
367 |        "      <th>2027</th>\n",
368 |        "      <td>svn create tag problem</td>\n",
369 |        "      <td>3</td>\n",
370 |        "    </tr>\n",
371 |        "    <tr>\n",
372 |        "      <th>708</th>\n",
373 |        "      <td>Is there some way to commit a file \"partially\"...</td>\n",
374 |        "      <td>3</td>\n",
375 |        "    </tr>\n",
376 |        "    <tr>\n",
377 |        "      <th>124</th>\n",
378 |        "      <td>Free Online SVN repositories</td>\n",
379 |        "      <td>3</td>\n",
380 |        "    </tr>\n",
381 |        "    <tr>\n",
382 |        "      <th>302</th>\n",
383 |        "      <td>Create a tag upon every build of the application?</td>\n",
384 |        "      <td>3</td>\n",
385 |        "    </tr>\n",
386 |        "    <tr>\n",
387 |        "      <th>958</th>\n",
388 |        "      <td>Subversion plugin to Visual Studio?</td>\n",
389 |        "      <td>3</td>\n",
390 |        "    </tr>\n",
391 |        "    <tr>\n",
392 |        "      <th>3200</th>\n",
393 |        "      <td>How to view error messages from ruby CGI app o...</td>\n",
394 |        "      <td>4</td>\n",
395 |        "    </tr>\n",
396 |        "    <tr>\n",
397 |        "      <th>2822</th>\n",
398 |        "      <td>PHP using too much memory</td>\n",
399 |        "      <td>4</td>\n",
400 |        "    </tr>\n",
401 |        "    <tr>\n",
402 |        "      <th>1831</th>\n",
403 |        "      <td>Getting Apache to execute command on every pag...</td>\n",
404 |        "      <td>4</td>\n",
405 |        "    </tr>\n",
406 |        "    <tr>\n",
407 |        "      <th>2901</th>\n",
408 |        "      <td>XAMPP: I edited PHP.ini, and now Apache crashes</td>\n",
409 |        "      <td>4</td>\n",
410 |        "    </tr>\n",
411 |        "    <tr>\n",
412 |        "      <th>195</th>\n",
413 |        "      <td>Restrict Apache to only allow access using SSL...</td>\n",
414 |        "      <td>4</td>\n",
415 |        "    </tr>\n",
416 |        "    <tr>\n",
417 |        "      <th>3623</th>\n",
418 |        "      <td>Adding slashes to the end of directories + mor...</td>\n",
419 |        "      <td>4</td>\n",
420 |        "    </tr>\n",
421 |        "    <tr>\n",
422 |        "      <th>689</th>\n",
423 |        "      <td>IIS equivalent of VirtualHost in Apache</td>\n",
424 |        "      <td>4</td>\n",
425 |        "    </tr>\n",
426 |        "    <tr>\n",
427 |        "      <th>2272</th>\n",
428 |        "      <td>Why do some page requests hang when fetching i...</td>\n",
429 |        "      <td>4</td>\n",
430 |        "    </tr>\n",
431 |        "    <tr>\n",
432 |        "      <th>1361</th>\n",
433 |        "      <td>Setting a header in apache</td>\n",
434 |        "      <td>4</td>\n",
435 |        "    </tr>\n",
436 |        "    <tr>\n",
437 |        "      <th>1874</th>\n",
438 |        "      <td>Apache Perl http headers problem</td>\n",
439 |        "      <td>4</td>\n",
440 |        "    </tr>\n",
441 |        "    <tr>\n",
442 |        "      <th>3444</th>\n",
443 |        "      <td>.htaccess mod-rewrite how to</td>\n",
444 |        "      <td>4</td>\n",
445 |        "    </tr>\n",
446 |        "    <tr>\n",
447 |        "      <th>2684</th>\n",
448 |        "      <td>Problem with URL rewriting for same .php page</td>\n",
449 |        "      <td>4</td>\n",
450 |        "    </tr>\n",
451 |        "    <tr>\n",
452 |        "      <th>299</th>\n",
453 |        "      <td>Top &amp; httpd - demystifying what is actually ru...</td>\n",
454 |        "      <td>4</td>\n",
455 |        "    </tr>\n",
456 |        "    <tr>\n",
457 |        "      <th>2857</th>\n",
458 |        "      <td>Weird behaviour with two Trac instances under ...</td>\n",
459 |        "      <td>4</td>\n",
460 |        "    </tr>\n",
461 |        "    <tr>\n",
462 |        "      <th>3205</th>\n",
463 |        "      <td>defer processing during apache page render</td>\n",
464 |        "      <td>4</td>\n",
465 |        "    </tr>\n",
466 |        "  </tbody>\n",
467 |        "</table>\n",
468 |        "</div>"
469 |       ],
470 |       "text/plain": [
471 |        "                                                   Text  Label\n",
472 |        "3958          WP_Insert_Post and GUID Issue [Wordpress]      1\n",
473 |        "2540                  How can I debug WordPress in IIS?      1\n",
474 |        "3594  wordpress: how to get x youtube thumbnails of ...      1\n",
475 |        "3240  Where to place a query to show only one post i...      1\n",
476 |        "3638                   Wordpress Blog RSS Feed Problems      1\n",
477 |        "3702                Excluding one category in Wordpress      1\n",
478 |        "1720   debuging \"register_activation_hook\" in wordpress      1\n",
479 |        "3622               Wordpress \"Read more\" is not working      1\n",
480 |        "3895                 Create blog post simply and easily      1\n",
481 |        "3727            Why is IE7 rendering these differently?      1\n",
482 |        "3900                 Using AJAX to load WordPress pages      1\n",
483 |        "3310                  WordPress Monthly Archive by Year      1\n",
484 |        "3521  Is there an easier way to add menu items to a ...      1\n",
485 |        "538   Wordpress: How can i move my index (blogpage) ...      1\n",
486 |        "3286  wp_list_categories does not show current category      1\n",
487 |        "388   What is the default URL for APEX for an Oracle...      2\n",
488 |        "2147    Oracle DB: How can I write query ignoring case?      2\n",
489 |        "1332  Oracle converts empty string to null but JPA d...      2\n",
490 |        "575   Is there a way to do full text search of all o...      2\n",
491 |        "1943  Faster 'select distinct thing_id,thing_name fr...      2\n",
492 |        "1564  When did oracle start supporting \"top\": select...      2\n",
493 |        "2063  Return an Oracle Ref Cursor to a SqlServer T-S...      2\n",
494 |        "2541  How do I insert sysdate into a column using OD...      2\n",
495 |        "981   Is it possible to kill a single query in oracl...      2\n",
496 |        "2282  Calculate difference between 2 date / times in...      2\n",
497 |        "404                                         Oracle date      2\n",
498 |        "2218  Compare strings by their written representatio...      2\n",
499 |        "1081  Allow Oracle User to connect from one IP addre...      2\n",
500 |        "2346  What's the equivalent of Oracle's to_char in A...      2\n",
501 |        "871   Oracle: how to use updateXML to update multipl...      2\n",
502 |        "338                 subversion diff including new files      3\n",
503 |        "663   Unlocking a SVN working copy which has unversi...      3\n",
504 |        "898                 Windows Backup for SVN Repositories      3\n",
505 |        "2376  Using svn:ignore to ignore everything but cert...      3\n",
506 |        "1323  How can I only commit property changes without...      3\n",
507 |        "2098  How to forbit subversion commits to svn:extern...      3\n",
508 |        "136                     How can I speed up SVN updates?      3\n",
509 |        "57                                         Begining SVN      3\n",
510 |        "1013       Is there any way to only update added files?      3\n",
511 |        "1137  Change Revesion Number in Subversion, even if ...      3\n",
512 |        "2027                             svn create tag problem      3\n",
513 |        "708   Is there some way to commit a file \"partially\"...      3\n",
514 |        "124                        Free Online SVN repositories      3\n",
515 |        "302   Create a tag upon every build of the application?      3\n",
516 |        "958                 Subversion plugin to Visual Studio?      3\n",
517 |        "3200  How to view error messages from ruby CGI app o...      4\n",
518 |        "2822                          PHP using too much memory      4\n",
519 |        "1831  Getting Apache to execute command on every pag...      4\n",
520 |        "2901    XAMPP: I edited PHP.ini, and now Apache crashes      4\n",
521 |        "195   Restrict Apache to only allow access using SSL...      4\n",
522 |        "3623  Adding slashes to the end of directories + mor...      4\n",
523 |        "689             IIS equivalent of VirtualHost in Apache      4\n",
524 |        "2272  Why do some page requests hang when fetching i...      4\n",
525 |        "1361                         Setting a header in apache      4\n",
526 |        "1874                   Apache Perl http headers problem      4\n",
527 |        "3444                       .htaccess mod-rewrite how to      4\n",
528 |        "2684      Problem with URL rewriting for same .php page      4\n",
529 |        "299   Top & httpd - demystifying what is actually ru...      4\n",
530 |        "2857  Weird behaviour with two Trac instances under ...      4\n",
531 |        "3205         defer processing during apache page render      4"
532 |       ]
533 |      },
534 |      "execution_count": 8,
535 |      "metadata": {},
536 |      "output_type": "execute_result"
537 |     }
538 |    ],
539 |    "source": [
540 |     "train, test = gen_sample(15,4)\n",
541 |     "train"
542 |    ]
543 |   },
544 |   {
545 |    "cell_type": "code",
546 |    "execution_count": 9,
547 |    "metadata": {
548 |     "ExecuteTime": {
549 |      "end_time": "2019-09-13T21:43:22.008108Z",
550 |      "start_time": "2019-09-13T21:43:21.998952Z"
551 |     },
552 |     "scrolled": true
553 |    },
554 |    "outputs": [
555 |     {
556 |      "data": {
557 |       "text/plain": [
558 |        "3958            WP_Insert_Post and GUID Issue [Wordpress]\n",
559 |        "2540                    How can I debug WordPress in IIS?\n",
560 |        "3594    wordpress: how to get x youtube thumbnails of ...\n",
561 |        "3240    Where to place a query to show only one post i...\n",
562 |        "3638                     Wordpress Blog RSS Feed Problems\n",
563 |        "3702                  Excluding one category in Wordpress\n",
564 |        "1720     debuging \"register_activation_hook\" in wordpress\n",
565 |        "3622                 Wordpress \"Read more\" is not working\n",
566 |        "3895                   Create blog post simply and easily\n",
567 |        "3727              Why is IE7 rendering these differently?\n",
568 |        "3900                   Using AJAX to load WordPress pages\n",
569 |        "3310                    WordPress Monthly Archive by Year\n",
570 |        "3521    Is there an easier way to add menu items to a ...\n",
571 |        "538     Wordpress: How can i move my index (blogpage) ...\n",
572 |        "3286    wp_list_categories does not show current category\n",
573 |        "388     What is the default URL for APEX for an Oracle...\n",
574 |        "2147      Oracle DB: How can I write query ignoring case?\n",
575 |        "1332    Oracle converts empty string to null but JPA d...\n",
576 |        "575     Is there a way to do full text search of all o...\n",
577 |        "1943    Faster 'select distinct thing_id,thing_name fr...\n",
578 |        "1564    When did oracle start supporting \"top\": select...\n",
579 |        "2063    Return an Oracle Ref Cursor to a SqlServer T-S...\n",
580 |        "2541    How do I insert sysdate into a column using OD...\n",
581 |        "981     Is it possible to kill a single query in oracl...\n",
582 |        "2282    Calculate difference between 2 date / times in...\n",
583 |        "404                                           Oracle date\n",
584 |        "2218    Compare strings by their written representatio...\n",
585 |        "1081    Allow Oracle User to connect from one IP addre...\n",
586 |        "2346    What's the equivalent of Oracle's to_char in A...\n",
587 |        "871     Oracle: how to use updateXML to update multipl...\n",
588 |        "338                   subversion diff including new files\n",
589 |        "663     Unlocking a SVN working copy which has unversi...\n",
590 |        "898                   Windows Backup for SVN Repositories\n",
591 |        "2376    Using svn:ignore to ignore everything but cert...\n",
592 |        "1323    How can I only commit property changes without...\n",
593 |        "2098    How to forbit subversion commits to svn:extern...\n",
594 |        "136                       How can I speed up SVN updates?\n",
595 |        "57                                           Begining SVN\n",
596 |        "1013         Is there any way to only update added files?\n",
597 |        "1137    Change Revesion Number in Subversion, even if ...\n",
598 |        "2027                               svn create tag problem\n",
599 |        "708     Is there some way to commit a file \"partially\"...\n",
600 |        "124                          Free Online SVN repositories\n",
601 |        "302     Create a tag upon every build of the application?\n",
602 |        "958                   Subversion plugin to Visual Studio?\n",
603 |        "3200    How to view error messages from ruby CGI app o...\n",
604 |        "2822                            PHP using too much memory\n",
605 |        "1831    Getting Apache to execute command on every pag...\n",
606 |        "2901      XAMPP: I edited PHP.ini, and now Apache crashes\n",
607 |        "195     Restrict Apache to only allow access using SSL...\n",
608 |        "3623    Adding slashes to the end of directories + mor...\n",
609 |        "689               IIS equivalent of VirtualHost in Apache\n",
610 |        "2272    Why do some page requests hang when fetching i...\n",
611 |        "1361                           Setting a header in apache\n",
612 |        "1874                     Apache Perl http headers problem\n",
613 |        "3444                         .htaccess mod-rewrite how to\n",
614 |        "2684        Problem with URL rewriting for same .php page\n",
615 |        "299     Top & httpd - demystifying what is actually ru...\n",
616 |        "2857    Weird behaviour with two Trac instances under ...\n",
617 |        "3205           defer processing during apache page render\n",
618 |        "Name: Text, dtype: object"
619 |       ]
620 |      },
621 |      "execution_count": 9,
622 |      "metadata": {},
623 |      "output_type": "execute_result"
624 |     }
625 |    ],
626 |    "source": [
627 |     "train['Text']"
628 |    ]
629 |   },
630 |   {
631 |    "cell_type": "code",
632 |    "execution_count": 10,
633 |    "metadata": {
634 |     "ExecuteTime": {
635 |      "end_time": "2019-09-13T21:43:22.351746Z",
636 |      "start_time": "2019-09-13T21:43:22.336298Z"
637 |     }
638 |    },
639 |    "outputs": [],
640 |    "source": [
641 |     "clf.fit(train['Text'], train['Label'])"
642 |    ]
643 |   },
644 |   {
645 |    "cell_type": "code",
646 |    "execution_count": 11,
647 |    "metadata": {
648 |     "ExecuteTime": {
649 |      "end_time": "2019-09-13T21:43:24.164718Z",
650 |      "start_time": "2019-09-13T21:43:23.820541Z"
651 |     }
652 |    },
653 |    "outputs": [
654 |     {
655 |      "data": {
656 |       "text/plain": [
657 |        "0.5032994923857868"
658 |       ]
659 |      },
660 |      "execution_count": 11,
661 |      "metadata": {},
662 |      "output_type": "execute_result"
663 |     }
664 |    ],
665 |    "source": [
666 |     "from sklearn.metrics import accuracy_score\n",
667 |     "accuracy_score(clf.predict(test['Text']), test['Label'])"
668 |    ]
669 |   },
670 |   {
671 |    "cell_type": "code",
672 |    "execution_count": null,
673 |    "metadata": {},
674 |    "outputs": [],
675 |    "source": []
676 |   }
677 |  ],
678 |  "metadata": {
679 |   "kernelspec": {
680 |    "display_name": "Python 3",
681 |    "language": "python",
682 |    "name": "python3"
683 |   },
684 |   "language_info": {
685 |    "codemirror_mode": {
686 |     "name": "ipython",
687 |     "version": 3
688 |    },
689 |    "file_extension": ".py",
690 |    "mimetype": "text/x-python",
691 |    "name": "python",
692 |    "nbconvert_exporter": "python",
693 |    "pygments_lexer": "ipython3",
694 |    "version": "3.6.5"
695 |   },
696 |   "latex_envs": {
697 |    "LaTeX_envs_menu_present": true,
698 |    "autoclose": false,
699 |    "autocomplete": true,
700 |    "bibliofile": "biblio.bib",
701 |    "cite_by": "apalike",
702 |    "current_citInitial": 1,
703 |    "eqLabelWithNumbers": true,
704 |    "eqNumInitial": 1,
705 |    "hotkeys": {
706 |     "equation": "Ctrl-E",
707 |     "itemize": "Ctrl-I"
708 |    },
709 |    "labels_anchors": false,
710 |    "latex_user_defs": false,
711 |    "report_style_numbering": false,
712 |    "user_envs_cfg": false
713 |   }
714 |  },
715 |  "nbformat": 4,
716 |  "nbformat_minor": 2
717 | }
718 | 


--------------------------------------------------------------------------------
/examples/sub.csv:
--------------------------------------------------------------------------------
 1 | ,Email Address,Address,Phone Number,MEMBER_RATING,CONFIRM_TIME,LATITUDE,LONGITUDE,TIMEZONE,CC,REGION,LAST_CHANGED,LEID,EUID,NOTES,TAGS,PURCHASE_LINK,CREDIT_CARD
 2 | 0,kashif.kudalkar@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-13 23:14:20,21.0711000,75.2886000,Asia/Kolkata,IN,MH,2019-08-13 (23:14:20),361672977,042eb5272e,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 3 | 1,chendaye666@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-15 22:50:21,37.4056000,'-122.0780000,America/Los_Angeles,US,CA,2019-08-15 (22:50:21),363571854,6709be1433,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 4 | 2,18611646660@qq.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-05 12:51:52,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-09-05 (12:51:52),373111117,bff7653dbe,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 5 | 3,ShurmanV@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-27 17:54:05,49.4296000,32.0528000,Asia/Harbin,ua,71,2019-08-27 (17:54:05),369432473,db46b86530,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 6 | 4,wqw547243068@163.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-06 08:35:03,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-09-06 (08:35:03),373187729,4f66319ae9,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 7 | 5,eeijcea@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-05 11:09:57,23.2235000,'-106.4180000,America/Mazatlan,MX,SIN,2019-09-05 (11:09:57),373083229,d4b8de536f,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 8 | 6,343468823@qq.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-04 03:03:16,31.8613000,117.2750000,Asia/Shanghai,CN,AH,2019-09-04 (03:03:16),372773637,413edbacef,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
 9 | 7,dhyun.kang@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-06 21:46:19,40.7147000,'-74.3629000,America/New_York,US,NJ,2019-08-06 (21:46:19),359091805,0d0bf5001c,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
10 | 8,rhugg59@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-09 17:33:01,33.6784000,'-111.8070000,America/Phoenix,US,AZ,2019-09-09 (17:33:01),373410137,e59576a437,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
11 | 9,hongyunnchen@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-06 03:20:47,22.6732000,120.4780000,Asia/Taipei,TW,PIF,2019-09-06 (03:20:47),373149681,8e3ee6c295,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
12 | 10,xixiwuwu@163.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-11 09:40:47,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-08-11 (09:40:47),360283457,51af2e5a17,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
13 | 11,derekjr560@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-07 14:01:56,5.5590000,'-0.2007000,Africa/Accra,GH,AA,2019-09-07 (14:01:56),373283073,ce6e6dce02,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
14 | 12,fjlikun@qq.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-05 23:33:03,26.0685000,119.3030000,Asia/Shanghai,CN,FJ,2019-09-05 (23:33:03),373137301,45f8189dc4,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
15 | 13,zanchen2@hotmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-06 04:00:55,34.2655000,108.8830000,Asia/Chongqing,CN,SN,2019-08-06 (04:00:55),358728749,a61873014b,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
16 | 14,earlev4@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-01 12:31:16,39.6754000,'-104.9620000,America/Denver,US,CO,2019-09-01 (12:31:16),372463825,e834c06d2c,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
17 | 15,bandagr@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-28 16:05:15,55.5198000,13.2377000,Asia/Kolkata,se,m,2019-08-28 (16:05:15),369713945,33426ec803,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
18 | 16,jussstsushant@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-07 13:58:35,25.6196000,85.1538000,Asia/Kolkata,IN,BR,2019-09-07 (13:58:35),373283029,c8e6684aec,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
19 | 17,kelyan.morfouesse@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-05 04:08:48,48.8608000,2.3423400,Europe/Paris,FR,IDF,2019-09-05 (04:08:48),373004845,49f91f6ac1,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
20 | 18,mael.fabien@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-27 05:36:36,48.9470000,2.1464700,Asia/Chongqing,fr,idf,2019-07-27 (05:36:36),356463985,b7dd1d207d,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
21 | 19,stephane_mulard@hotmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-14 16:49:58,48.9470000,2.1464700,Asia/Chongqing,fr,idf,2019-08-14 (16:49:58),361926909,ca0ffcc58f,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
22 | 20,auroua@yeah.net,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-04 20:00:05,34.2655000,108.8830000,Asia/Chongqing,CN,SN,2019-08-04 (20:00:05),358446729,b7e22acef6,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
23 | 21,ronald@elmit.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-09 21:23:47,25.0389000,121.5090000,Asia/Taipei,TW,TPE,2019-09-09 (21:23:47),373456505,ad186f0918,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
24 | 22,srikant.gopalan@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-25 08:16:44,28.1145000,'-82.3639000,America/Kentucky/Monticello,US,FL,2019-08-25 (08:16:44),368724765,4995289de9,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
25 | 23,liuxingyu009@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-09 19:30:51,30.6694000,104.0710000,Asia/Chongqing,CN,SC,2019-09-09 (19:30:51),373454953,55bc179aa1,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
26 | 24,15201856170@163.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-03 08:49:42,37.5626000,'-121.9980000,America/Los_Angeles,US,CA,2019-08-03 (08:49:42),358277673,41e80f754a,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
27 | 25,xrickliao@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-09 15:00:34,25.0389000,121.5090000,America/Los_Angeles,tw,tpe,2019-08-09 (15:00:34),360087061,ee7e10f87d,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
28 | 26,nefilim1@ieee.org,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-07 11:56:11,47.0263000,'-122.7960000,America/Los_Angeles,US,WA,2019-09-07 (11:56:11),373278465,40235c867a,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
29 | 27,v.l.kuznetsov1@yandex.ru,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-05 15:43:45,55.8102000,37.7135000,Europe/Moscow,RU,MOW,2019-09-05 (15:43:45),373127257,fc9923a200,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
30 | 28,phuthai@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-19 22:08:36,13.7174000,100.5130000,Asia/Kolkata,th,10,2019-08-19 (22:08:36),364797781,bd6f5bd954,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
31 | 29,afrazchelsea@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-06 16:58:28,18.5130000,73.8788000,Asia/Kolkata,IN,MH,2019-09-06 (16:58:28),373255249,244b117ed1,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
32 | 30,kmakeit@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-30 10:14:57,38.8648000,'-77.1947000,America/New_York,US,VA,2019-07-30 (10:14:57),357171281,231bdbec4f,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
33 | 31,rsoankamble@outlook.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-09 19:34:59,42.4589000,'-83.3468000,248/947,us,mi,2019-08-09 (19:34:59),360135589,731f959388,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
34 | 32,huyen.t.t.tran.22.12@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-04 04:48:09,52.4900000,13.4577000,Europe/Berlin,DE,BE,2019-09-04 (04:48:09),372780881,18da3faec9,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
35 | 33,harryji830@126.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-12 05:31:15,31.2358000,121.4830000,America/Los_Angeles,cn,sh,2019-08-12 (05:31:15),360644021,e70cb885e3,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
36 | 34,pottstimothy@yahoo.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-29 19:28:31,47.7381000,'-117.4460000,America/Los_Angeles,US,WA,2019-07-29 (19:28:31),357033489,2498623063,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
37 | 35,rajmail16@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-10 20:27:26,41.5486000,'-8.4290900,Europe/Lisbon,PT,03,2019-09-10 (20:27:26),373664513,a12cac6c6d,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
38 | 36,pxy0592@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-30 09:06:22,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-07-30 (09:06:22),357127833,de32445ce7,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
39 | 37,daniel.kang@prudential.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-12 10:43:44,40.8196000,'-74.1630000,America/New_York,US,NJ,2019-08-12 (10:43:44),360956865,b13a1ed4a8,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
40 | 38,syrine8bettaieb@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-19 07:12:06,36.8330000,10.2170000,Africa/Tunis,TN,11,2019-08-19 (07:12:06),364226557,e51c83ffed,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
41 | 39,pav-sahnyuk@yandex.ru,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-09 16:47:38,55.7551000,37.6342000,Europe/Moscow,RU,MOW,2019-09-09 (16:47:38),373387653,4e709460a3,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
42 | 40,anandkumar604@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-10 12:22:16,12.9534000,77.5956000,Asia/Kolkata,IN,KA,2019-08-10 (12:22:16),360219573,98fce76c6f,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
43 | 41,nagarjunaabhishek@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-06 04:11:31,12.9687000,77.5877000,Asia/Kolkata,IN,KA,2019-09-06 (04:11:31),373150313,76cf4844c0,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
44 | 42,leodenale@yahoo.com.br,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-11 06:49:51,'-30.0334000,'-51.1336000,America/Sao_Paulo,BR,RS,2019-09-11 (06:49:51),373699485,ca93ba9140,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
45 | 43,vkazei@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-24 07:25:20,21.5000000,39.1670000,Asia/Shanghai,sa,02,2019-08-24 (07:25:20),368621981,b11ae5075e,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
46 | 44,siqi_9@163.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-05 21:49:21,31.7664000,119.9480000,Asia/Shanghai,CN,JS,2019-08-05 (21:49:21),358716589,303b383e18,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
47 | 45,akram.nejat10i@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-03 10:04:29,34.6435000,50.8903000,Asia/Tehran,IR,26,2019-08-03 (10:04:29),358293893,973cd850c0,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
48 | 46,m18813055625@163.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-07 21:31:30,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-08-07 (21:31:31),359492165,f472c12ba6,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
49 | 47,law_kwok_keung@hotmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-22 11:19:10,22.2759000,114.1670000,Asia/Chongqing,HK,NO REGION,2019-08-22 (11:19:10),368047929,aace16964b,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
50 | 48,mike.trizna@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-07 15:37:45,38.9812000,'-77.3873000,America/New_York,US,VA,2019-08-07 (15:37:45),359304553,688d64a405,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
51 | 49,zllin001@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-07 23:59:57,25.0389000,121.5090000,Asia/Taipei,TW,TPE,2019-09-07 (23:59:57),373286913,1a45c3bc50,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
52 | 50,ZOONINWESTMELBOURNE@YAHOO.COM,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-09 12:56:11,39.3279000,'-76.7440000,America/New_York,US,MD,2019-09-09 (12:56:11),373362541,013a96a595,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
53 | 51,joechen@nttu.edu.tw,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-05 22:23:18,25.0389000,121.5090000,Asia/Taipei,TW,TPE,2019-09-05 (22:23:18),373136385,09fa659b08,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
54 | 52,ravikantsingh.jsr@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-27 22:44:19,16.3127000,80.4459000,Asia/Kolkata,IN,AP,2019-08-27 (22:44:19),369452325,97a7d5162e,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
55 | 53,leibingye@outlook.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-04 21:06:01,23.1255000,113.2370000,Asia/Chongqing,CN,GD,2019-08-04 (21:06:01),358449465,52aa29b650,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
56 | 54,gawangilbert@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-01 15:28:07,'-23.5156000,'-46.6295000,America/Sao_Paulo,BR,SP,2019-08-01 (15:28:07),357898005,d4cc5addf5,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
57 | 55,sandy16x@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-30 06:07:57,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-07-30 (06:07:57),357092353,dff9e58fae,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
58 | 56,chuanliang.jiang@usaa.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-31 12:14:28,29.5397000,'-98.5519000,America/Chicago,US,TX,2019-07-31 (12:14:28),357697657,d849c49877,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
59 | 57,gjdm@libertysurf.fr,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-12 03:32:25,36.5809000,'-4.5937500,Asia/Harbin,es,ma,2019-08-12 (03:32:25),360536705,f1a321af0d,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
60 | 58,ayhamkan@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-28 15:40:03,52.5250000,13.3980000,Asia/Harbin,de,be,2019-08-28 (15:40:03),369709981,f326866ff9,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
61 | 59,1280397840@qq.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-07-30 05:12:11,39.9125000,116.3890000,Asia/Harbin,CN,BJ,2019-07-30 (05:12:11),357090841,f47b2a4825,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
62 | 60,fahaddad@microsoft.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-05 21:36:54,53.3667000,'-1.5074200,Europe/London,UK,SHF,2019-09-05 (21:36:54),373135545,2701181402,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
63 | 61,sharjeel400@live.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-06 15:14:29,33.6000000,73.0830000,Asia/Karachi,PK,PB,2019-09-06 (15:14:29),373253413,47710b3709,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
64 | 62,antony.harnist@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-09 05:25:55,49.6124000,6.1295000,Europe/Luxembourg,LU,LU,2019-09-09 (05:25:55),373322617,627bd7209a,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
65 | 63,phdstuff@yahoo.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-09-11 03:44:16,24.9056000,67.0822000,Asia/Karachi,PK,SD,2019-09-11 (03:44:16),373691621,7c3f83c4cf,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
66 | 64,vicchugu@yahoo.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-31 23:55:15,37.4171000,'-122.1320000,America/Los_Angeles,US,CA,2019-08-31 (23:55:15),372414469,e5ea41f3db,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
67 | 65,hmicheal50@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-06 02:47:22,9.0231000,38.7468000,Africa/Addis_Ababa,ET,AA,2019-08-06 (02:47:22),358724721,bdec9263cb,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
68 | 66,iamaplayer@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-09-04 01:46:30,36.1130000,103.5990000,Asia/Chongqing,CN,GS,2019-09-04 (01:46:30),372772149,6f15522ab2,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
69 | 67,anhpt.fit@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-14 12:12:26,59.9390000,30.3158000,America/Los_Angeles,ru,spe,2019-08-14 (12:12:26),361844085,291daf0df4,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
70 | 68,fdsa@fds.net,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-05 15:46:43,37.8185000,'-121.9690000,America/Los_Angeles,US,CA,2019-08-05 (15:46:43),358639145,8450488e23,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
71 | 69,steliosiordanis@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-10 09:36:28,37.9872000,23.7341000,Europe/Athens,GR,I,2019-08-10 (09:36:28),360172037,c786f095fe,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
72 | 70,mrr2172@columbia.edu,United States,Num : +33 6 78 04 52 11,Grade : 2,2019-08-15 12:03:06,42.3508000,'-71.1184000,America/New_York,US,MA,2019-08-15 (12:03:06),363494933,dac3ea0114,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
73 | 71,mamyezoo@gmail.com,United States,Num : +33 6 78 04 52 11,Grade : 3,2019-08-26 08:02:29,15.5518000,32.5324000,,sd,kh,2019-08-26 (08:02:29),368961137,d45924e85d,,#NewSub #ProdCat_1,https%3A%2F%2Fwww.urlencoder.org%2F,1234-4568-9012 022
74 | 


--------------------------------------------------------------------------------
/fsText/CosineClassifier.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from sklearn.model_selection import train_test_split
  4 | import gensim.downloader as api
  5 | from scipy import spatial
  6 | import re
  7 | from sklearn.neighbors import KNeighborsClassifier
  8 | from sklearn.ensemble import RandomForestClassifier
  9 | from sklearn import preprocessing
 10 | 
 11 | class process_txt:
 12 | 
 13 |     def __init__(self):
 14 | 
 15 |         print("Loading pre-trained Word2Vec model...")
 16 |         self.model = api.load("word2vec-google-news-300")
 17 |         self.le = preprocessing.LabelEncoder()
 18 | 
 19 |     def clean_line(self, line):
 20 | 
 21 |         clean_line = ""
 22 | 
 23 |         line = line.replace("’", "")
 24 |         line = line.replace("'", "")
 25 |         line = line.replace("-", " ")  # replace hyphens with spaces
 26 |         line = line.replace("\t", " ")
 27 |         line = line.replace("\n", " ")
 28 |         line = line.lower()
 29 | 
 30 |         for char in line:
 31 |             if char in "qwertyuiopasdfghjklzxcvbnm ":
 32 |                 clean_line += char
 33 |             else:
 34 |                 clean_line += " "
 35 | 
 36 |         clean_line = re.sub(" +", " ", clean_line)  # delete extra spaces
 37 |         if clean_line[0] == " ":
 38 |             clean_line = clean_line[1:]
 39 |         return clean_line
 40 | 
 41 |     def preprocess(self, txt):
 42 | 
 43 |         txt = txt.apply(lambda x: self.clean_line(x))
 44 | 
 45 |         return txt
 46 | 
 47 |     def filter_text(self, raw_text):
 48 | 
 49 |         """ 
 50 |         Excluding unknown words and get corresponding token
 51 |         """
 52 |         raw_text = raw_text.split()
 53 | 
 54 |         return list(filter(lambda x: x in self.model.vocab, raw_text))
 55 | 
 56 |     def transform_text(self, txt):
 57 | 
 58 |         tokens = self.filter_text(txt)
 59 | 
 60 |         if not tokens:
 61 |             return np.zeros(self.model.vector_size)
 62 | 
 63 |         text_vector = np.mean(self.model[tokens], axis=0)
 64 | 
 65 |         return np.array(text_vector)
 66 | 
 67 |     def label_encoder(self, y_train):
 68 |         return self.le.fit_transform(y_train)
 69 | 
 70 | 
 71 | class CosineClassifier():
 72 | 
 73 |     def __init__(self):
 74 | 
 75 |         self.preprocess = process_txt()
 76 |         
 77 | 
 78 |     def fit(self, X_train, y_train):
 79 | 
 80 |         X_train = self.preprocess.preprocess(X_train)
 81 |         X_train = X_train.apply(lambda x : self.preprocess.transform_text(x)).values
 82 | 
 83 |         y_train = self.preprocess.label_encoder(y_train)
 84 | 
 85 |         self.classes = np.unique(y_train)
 86 | 
 87 |         mean_embedding = {}
 88 |         for cl in self.classes :
 89 |             mean_embedding[cl] = np.mean((X_train[y_train == cl]), axis=0)
 90 | 
 91 |         self.embedding_fit = mean_embedding
 92 | 
 93 |     def classify_txt(self, txt):
 94 | 
 95 |         best_dist = 1
 96 |         best_label = -1
 97 | 
 98 |         for cl in self.classes :
 99 | 
100 |             dist = spatial.distance.cosine(
101 |                 self.preprocess.transform_text(txt), self.embedding_fit[cl]
102 |             )
103 | 
104 |             if dist <= best_dist:
105 |                 best_dist = dist
106 |                 best_label = cl
107 | 
108 |         return best_label
109 | 
110 |     def predict(self, X_test):
111 | 
112 |         X_test = self.preprocess.preprocess(X_test)
113 |         y_pred = np.array([self.classify_txt(t) for t in X_test])
114 |         y_pred = self.preprocess.le.inverse_transform(y_pred)
115 | 
116 |         return y_pred
117 | 


--------------------------------------------------------------------------------
/fsText/KNNClassifier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from sklearn.model_selection import train_test_split
 4 | import gensim.downloader as api
 5 | import re
 6 | from sklearn.neighbors import KNeighborsClassifier
 7 | from sklearn import preprocessing
 8 | 
 9 | class process_txt:
10 | 
11 |     def __init__(self):
12 | 
13 |         print("Loading pre-trained Word2Vec model...")
14 |         self.model = api.load("word2vec-google-news-300")
15 |         self.le = preprocessing.LabelEncoder()
16 | 
17 |     def clean_line(self, line):
18 | 
19 |         clean_line = ""
20 | 
21 |         line = line.replace("’", "")
22 |         line = line.replace("'", "")
23 |         line = line.replace("-", " ")  # replace hyphens with spaces
24 |         line = line.replace("\t", " ")
25 |         line = line.replace("\n", " ")
26 |         line = line.lower()
27 | 
28 |         for char in line:
29 |             if char in "qwertyuiopasdfghjklzxcvbnm ":
30 |                 clean_line += char
31 |             else:
32 |                 clean_line += " "
33 | 
34 |         clean_line = re.sub(" +", " ", clean_line)  # delete extra spaces
35 |         if clean_line[0] == " ":
36 |             clean_line = clean_line[1:]
37 |         return clean_line
38 | 
39 |     def preprocess(self, txt):
40 | 
41 |         txt = txt.apply(lambda x: self.clean_line(x))
42 | 
43 |         return txt
44 | 
45 |     def filter_text(self, raw_text):
46 | 
47 |         """ 
48 |         Excluding unknown words and get corresponding token
49 |         """
50 |         raw_text = raw_text.split()
51 | 
52 |         return list(filter(lambda x: x in self.model.vocab, raw_text))
53 | 
54 |     def transform_text(self, txt):
55 | 
56 |         tokens = self.filter_text(txt)
57 | 
58 |         if not tokens:
59 |             return np.zeros(self.model.vector_size)
60 | 
61 |         text_vector = np.mean(self.model[tokens], axis=0)
62 | 
63 |         return np.array(text_vector)
64 | 
65 |     def label_encoder(self, y_train):
66 |         return self.le.fit_transform(y_train)
67 | 
68 | 
69 | class KNNClassifier():
70 | 
71 |     def __init__(self):
72 | 
73 |         self.preprocess = process_txt()
74 | 
75 |     def fit(self, X_train, y_train):
76 | 
77 |         X_train = self.preprocess.preprocess(X_train)
78 |         X_train = X_train.apply(lambda x : self.preprocess.transform_text(x)).values
79 | 
80 |         y_train = self.preprocess.label_encoder(y_train)
81 |         unique, counts = np.unique(y_train, return_counts=True)
82 | 
83 |         sample_size=min(counts)
84 | 
85 |         clf = KNeighborsClassifier(n_neighbors=sample_size, p=2)
86 |         clf.fit(list(X_train), y_train)
87 |         self.clf = clf
88 | 
89 |     def predict(self, X_test):
90 | 
91 |         X_test = self.preprocess.preprocess(X_test)
92 |         X_test = [self.preprocess.transform_text(txt) for txt in X_test]
93 | 
94 |         y_pred = self.clf.predict(X_test)
95 |         y_pred = self.preprocess.le.inverse_transform(y_pred)
96 | 
97 |         return y_pred
98 | 


--------------------------------------------------------------------------------
/fsText/RFClassifier.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from sklearn.model_selection import train_test_split
 4 | import gensim.downloader as api
 5 | import re
 6 | from sklearn.ensemble import RandomForestClassifier
 7 | from sklearn import preprocessing
 8 | 
 9 | class process_txt:
10 | 
11 |     def __init__(self):
12 | 
13 |         print("Loading pre-trained Word2Vec model...")
14 |         self.model = api.load("word2vec-google-news-300")
15 |         self.le = preprocessing.LabelEncoder()
16 | 
17 |     def clean_line(self, line):
18 | 
19 |         clean_line = ""
20 | 
21 |         line = line.replace("’", "")
22 |         line = line.replace("'", "")
23 |         line = line.replace("-", " ")  # replace hyphens with spaces
24 |         line = line.replace("\t", " ")
25 |         line = line.replace("\n", " ")
26 |         line = line.lower()
27 | 
28 |         for char in line:
29 |             if char in "qwertyuiopasdfghjklzxcvbnm ":
30 |                 clean_line += char
31 |             else:
32 |                 clean_line += " "
33 | 
34 |         clean_line = re.sub(" +", " ", clean_line)  # delete extra spaces
35 |         if clean_line[0] == " ":
36 |             clean_line = clean_line[1:]
37 |         return clean_line
38 | 
39 |     def preprocess(self, txt):
40 | 
41 |         txt = txt.apply(lambda x: self.clean_line(x))
42 | 
43 |         return txt
44 | 
45 |     def filter_text(self, raw_text):
46 | 
47 |         """ 
48 |         Excluding unknown words and get corresponding token
49 |         """
50 |         raw_text = raw_text.split()
51 | 
52 |         return list(filter(lambda x: x in self.model.vocab, raw_text))
53 | 
54 |     def transform_text(self, txt):
55 | 
56 |         tokens = self.filter_text(txt)
57 | 
58 |         if not tokens:
59 |             return np.zeros(self.model.vector_size)
60 | 
61 |         text_vector = np.mean(self.model[tokens], axis=0)
62 | 
63 |         return np.array(text_vector)
64 | 
65 |     def label_encoder(self, y_train):
66 |         return self.le.fit_transform(y_train)
67 | 
68 | 
69 | class RFClassifier():
70 | 
71 |     def __init__(self):
72 | 
73 |         self.preprocess = process_txt()
74 | 
75 |     def fit(self, X_train, y_train):
76 | 
77 |         X_train = self.preprocess.preprocess(X_train)
78 |         X_train = X_train.apply(lambda x : self.preprocess.transform_text(x)).values
79 | 
80 |         y_train = self.preprocess.label_encoder(y_train)
81 |         unique, counts = np.unique(y_train, return_counts=True)
82 | 
83 |         sample_size=min(counts)
84 | 
85 |         clf = RandomForestClassifier(n_estimators=150)
86 |         clf.fit(list(X_train), y_train)
87 |         self.clf = clf
88 | 
89 |     def predict(self, X_test):
90 | 
91 |         X_test = self.preprocess.preprocess(X_test)
92 |         X_test = [self.preprocess.transform_text(txt) for txt in X_test]
93 | 
94 |         y_pred = self.clf.predict(X_test)
95 |         y_pred = self.preprocess.le.inverse_transform(y_pred)
96 | 
97 |         return y_pred
98 | 


--------------------------------------------------------------------------------
/fsText/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/fsText/__pycache__/CosineClassifier.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/fsText/__pycache__/CosineClassifier.cpython-36.pyc


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gensim
2 | scipy
3 | numpy
4 | pandas
5 | scikit_learn
6 | 


--------------------------------------------------------------------------------
/resources/images/nlp_fs_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/images/nlp_fs_4.png


--------------------------------------------------------------------------------
/resources/images/nlp_fs_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/images/nlp_fs_6.png


--------------------------------------------------------------------------------
/resources/images/perf_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/images/perf_1.png


--------------------------------------------------------------------------------
/resources/images/perf_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/images/perf_2.png


--------------------------------------------------------------------------------
/resources/papers/DataAugmentation/1804.08166.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/papers/DataAugmentation/1804.08166.pdf


--------------------------------------------------------------------------------
/resources/papers/DataAugmentation/1901.11196.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/papers/DataAugmentation/1901.11196.pdf


--------------------------------------------------------------------------------
/resources/papers/FewShot/1710.10280.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/papers/FewShot/1710.10280.pdf


--------------------------------------------------------------------------------
/resources/papers/FewShot/1804.02063.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/papers/FewShot/1804.02063.pdf


--------------------------------------------------------------------------------
/resources/papers/FewShot/1908.08788.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fsText-suite/fsText/e19ca5774307862abf324c39b5b4bafab7e7d1f5/resources/papers/FewShot/1908.08788.pdf


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | 
 3 | setup(
 4 |       name = 'fsText',         # How you named your package folder (MyLib)
 5 |       packages = ['fsText'],   # Chose the same as "name"
 6 |       version = '0.4',      # Start with a small number and increase it with every change you make
 7 |       license='MIT',        # Chose a license from here: https://help.github.com/articles/licensing-a-repository
 8 |       description = 'Few Shot Text Classification',   # Give a short description about your library
 9 |       author = 'André, Matyas, Maël',                   # Type in your name
10 |       author_email = 'mael.fabien@gmail.com',      # Type in your E-Mail
11 |       url = 'https://github.com/maelfabien/fsText',   # Provide either the link to your github or to your website
12 |       download_url = 'https://github.com/maelfabien/fsText/archive/v_04.tar.gz',    # I explain this later on
13 |       keywords = ['Few', 'Shot', 'Text', 'Classification'],   # Keywords that define your package best
14 |       install_requires=[
15 |                         'gensim',
16 |                         'scipy',
17 |                         'numpy',
18 |                         'pandas',
19 |                         'scikit_learn'
20 |                         ],
21 |       classifiers=[
22 |                    'Development Status :: 3 - Alpha',      # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
23 |                    'Intended Audience :: Developers',      # Define that your audience are developers
24 |                    'Topic :: Software Development :: Build Tools',
25 |                    'License :: OSI Approved :: MIT License',   # Again, pick a license
26 |                    'Programming Language :: Python :: 3',      #Specify which pyhton versions that you want to support
27 |                    'Programming Language :: Python :: 3.4',
28 |                    'Programming Language :: Python :: 3.5',
29 |                    'Programming Language :: Python :: 3.6',
30 |                    ],
31 |       )
32 | 


--------------------------------------------------------------------------------