├── setup.cfg
├── logo.png
├── textgenie
│   ├── __init__.py
│   ├── textgenie.py
│   └── grammar_utils.py
├── examples
│   ├── basic.py
│   └── examples.ipynb
├── setup.py
├── sentences_aug.txt
├── .gitignore
├── README.md
└── LICENSE
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hetpandya/textgenie/HEAD/logo.png
--------------------------------------------------------------------------------
/textgenie/__init__.py:
--------------------------------------------------------------------------------
1 | from .textgenie import TextGenie
2 |
3 | __version__ = "0.1.9.7b"
4 | __author__ = "Het Pandya"
5 | __license__ = "MIT"
6 |
--------------------------------------------------------------------------------
/examples/basic.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from textgenie import TextGenie
4 |
5 | t5_model = "hetpandya/t5-base-tapaco"
6 | bert_model = "microsoft/deberta-v3-large"
7 |
8 | textgenie = TextGenie(t5_model, bert_model, spacy_model_name="en_core_web_lg", device="cuda")
9 |
10 | # Augment a list of sentences
11 | sentences = [
12 | "The video was posted on Facebook by Alex.",
13 | "I plan to run it again this time",
14 | ]
15 |
16 | results = textgenie.magic_lamp(
17 | sentences, "paraphrase: ", n_mask_predictions=5, convert_to_active=True, add_suffix_token=False
18 | )
19 |
20 | print(results)
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import re
3 |
4 | def get_property(prop, project):
5 | """
6 | Credits: https://stackoverflow.com/a/41110107
7 | """
8 | result = re.search(r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop), open(project + '/__init__.py').read())
9 | return result.group(1)
10 |
11 |
12 | with open("README.md", "r") as f:
13 | long_description = f.read()
14 |
15 | setup(
16 | name="textgenie",
17 | version=get_property('__version__', "textgenie"),
18 | description="A python library to augment text data using NLP.",
19 | long_description=long_description,
20 | long_description_content_type="text/markdown",
21 | author="Het Pandya",
22 | url="http://github.com/hetpandya/TextGenie",
23 | author_email="hetpandya6797@gmail.com",
24 | license="MIT",
25 | install_requires=[
26 | "torch>=1.5.0",
27 | "transformers",
28 | "sentencepiece",
29 | "spacy",
30 | "tqdm",
31 | "pandas",
32 | "Pattern @ git+https://github.com/clips/pattern.git",
33 | ],
34 | packages=["textgenie"],
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/sentences_aug.txt:
--------------------------------------------------------------------------------
1 | Theही was posted on Facebook by Alex.
2 | The video was posted on Facebook by Alex.
3 | The video has been posted by Alex on Facebook.
4 | The attaches was posted on Facebook by Alex.
5 | This video was posted on Facebook by Alex.
6 | The video was posted on Facebook by segregated.
7 | Theanje was posted on Facebook by Alex.
8 | The video was posted to Facebook by Alex.
9 | The video was posted onanje by Alex.
10 | The video was posted on Facebook byihan.
11 | Alex posted the video on Facebook.
12 | The video was posted on Jenelle by Alex.
13 | The video was posted on Facebook by attaches.
14 | The video was posted on attaches by Alex.
15 | The video was posted on합 by Alex.
16 | Theći was posted on Facebook by Alex.
17 | The video was posted in Facebook by Alex.
18 | The video was posted on Facebook by hordes.
19 | The video was posted on Facebook bycardi.
20 | The video was posted onrified by Alex.
21 | The minecraft was posted on Facebook by Alex.
22 | I plan to run next time there this time, going one again?
23 | I want to do it again, no less, in parallel.
24 | So I like him.
25 | I plan to run it again this Tiguan
26 | I plan to run it again this minecraft
27 | I plan to run it again this morgan
28 | I plan to run it again thisgenous
29 | I'll run anything same again tonight.
30 | I plan to run it again this time
31 | I plan to run it again this Jenelle
32 | I planing to run that again: "Farefall”?
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://en.wikipedia.org/wiki/MIT_License)
3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | # TextGenie
13 |
14 | TextGenie is a text data augmentation library that helps you augment your text dataset by generating similar samples, giving you a larger and more robust dataset to train better models. It also handles labeled datasets, keeping track of each sample's label while generating new samples from it.
15 |
16 | It uses various Natural Language Processing methods such as paraphrase generation, BERT mask filling and conversion of passive-voice sentences to active voice. The library currently supports the `English` language.
17 |
18 | ## Installation
19 | ```
20 | pip install textgenie
21 | ```
22 |
23 | ## Example
24 | ```python
25 | from textgenie import TextGenie
26 |
27 | textgenie = TextGenie("hetpandya/t5-small-tapaco", "bert-base-uncased")
28 |
29 | # Augment a list of sentences
30 | sentences = [
31 | "The video was posted on Facebook by Alex.",
32 | "I plan to run it again this time",
33 | ]
34 | textgenie.magic_lamp(
35 | sentences, "paraphrase: ", n_mask_predictions=5, convert_to_active=True
36 | )
37 |
38 | # Augment data in a txt file
39 | textgenie.magic_lamp(
40 | "sentences.txt", "paraphrase: ", n_mask_predictions=5, convert_to_active=True
41 | )
42 |
43 | # Augment data in a csv file with labels
44 | textgenie.magic_lamp(
45 | "sentences.csv",
46 | "paraphrase: ",
47 | n_mask_predictions=5,
48 | convert_to_active=True,
49 | label_column="Label",
50 | data_column="Text",
51 | column_names=["Text", "Label"],
52 | )
53 | ```
54 | Examples can be found in the examples [notebook](https://github.com/hetpandya/textgenie/blob/main/examples/examples.ipynb).
55 |
56 | ## Usage
57 |
58 | - Initializing the augmentor:
59 | ```textgenie = TextGenie(paraphrase_model_name='model_name',mask_model_name='model_name',spacy_model_name="model_name",device="cpu")```
60 | - Parameters:
61 | - *paraphrase_model_name*:
62 | - The name of the T5 paraphrase model.
63 |       - A list of pretrained models for paraphrase generation can be found [here](https://github.com/hetpandya/paraphrase-datasets-pretrained-models#pretrained-models)
64 | - *mask_model_name*:
65 |       - The BERT model that will be used to fill masks. Mask filling is disabled by default and is enabled by passing the name of the BERT model to be used. A list of mask filling models can be found [here](https://huggingface.co/models?filter=en&pipeline_tag=fill-mask)
66 | - *spacy_model_name*:
67 | - Name of the Spacy model. Available models can be found [here](https://spacy.io/models). The default value is set to *en_core_web_sm*.
68 | - *device*:
69 | - The device where the model will be loaded. The default value is set to *cpu*.
70 | - Methods:
71 | - augment_sent_mask_filling():
72 | - Generate augmented data using BERT mask filling.
73 | - Parameters:
74 | - *sent*:
75 | - The sentence on which augmentation has to be applied.
76 | - *n_mask_predictions*:
77 |         - The number of predictions the BERT mask filling model should generate. The default value is set to *5*.
78 | - augment_sent_t5():
79 | - Generate augmented data using T5 paraphrasing model.
80 | - Parameters:
81 | - *sent*:
82 | - The sentence on which augmentation has to be applied.
83 | - *prefix*:
84 | - The prefix for the T5 model input.
85 | - *n_predictions*:
86 |         - The number of augmented sentences the function should return. The default value is set to *5*.
87 | - *top_k*:
88 |         - The *top_k* sampling value passed to the T5 model during generation. The default value is set to *120*.
89 | - *max_length*:
90 | - The max length of the sentence to feed to the model. The default value is set to *256*.
91 | - convert_to_active():
92 | - Converts a sentence to active voice, if found in passive voice. Otherwise returns the same sentence.
93 | - Parameters:
94 | - *sent*:
95 | - The sentence that has to be converted.
96 | - magic_once():
97 |     - This is a wrapper method for the *augment_sent_mask_filling()*, *augment_sent_t5()* and *convert_to_active()* methods. Using it, a sentence can be augmented with all of the above techniques at once (see the usage sketch after this list).
98 |     - Since this method operates on individual sentences, it can easily be combined with other packages.
99 | - Parameters:
100 | - *sent*:
101 | - The sentence that has to be augmented.
102 | - *paraphrase_prefix*:
103 | - The prefix for the T5 model input.
104 | - *n_paraphrase_predictions*:
105 |         - The number of augmented sentences the function should return. The default value is set to *5*.
106 | - *paraphrase_top_k*:
107 |         - The *top_k* sampling value passed to the T5 model during generation. The default value is set to *120*.
108 | - *paraphrase_max_length*:
109 | - The max length of the sentence to feed to the model. The default value is set to *256*.
110 | - *n_mask_predictions*:
111 |         - The number of predictions the BERT mask filling model should generate. The default value is set to *None*.
112 | - *convert_to_active*:
113 | - If the sentence should be converted to active voice. The default value is set to *True*.
114 | - magic_lamp():
115 |     - This method can be used to augment a whole dataset. Currently accepted dataset formats are: `txt`, `csv`, `tsv` and `list`.
116 |     - If the dataset is a `list` or a `txt` file, a list of augmented sentences is returned and a `txt` file named *sentences_aug.txt* is saved with the augmented output.
117 |     - If the dataset is a labeled `csv` or `tsv` file, the data is augmented while preserving the label of each new sample, and a pandas dataframe of the augmented data is returned. A `tsv` file named `original_file_name_aug.tsv` is also generated with the augmented output.
118 | - Parameters:
119 | - *sentences*:
120 | - The dataset that has to be augmented. This can be a `Python List`, a `txt`, `csv` or `tsv` file.
121 | - *paraphrase_prefix*:
122 | - The prefix for the T5 model input.
123 | - *n_paraphrase_predictions*:
124 |         - The number of augmented sentences the function should return. The default value is set to *5*.
125 | - *paraphrase_top_k*:
126 |         - The *top_k* sampling value passed to the T5 model during generation. The default value is set to *120*.
127 | - *paraphrase_max_length*:
128 | - The max length of the sentence to feed to the model. The default value is set to *256*.
129 | - *n_mask_predictions*:
130 |         - The number of predictions the BERT mask filling model should generate. The default value is set to *None*.
131 | - *convert_to_active*:
132 | - If the sentence should be converted to active voice. The default value is set to *True*.
133 | - *label_column*:
134 | - The name of the column that contains labeled data. The default value is set to *None*. This parameter is not required to be set if the dataset is in a `Python List` or a `txt` file.
135 | - *data_column*:
136 | - The name of the column that contains data. The default value is set to *None*. This parameter too is not required if the dataset is a `Python List` or a `txt` file.
137 | - *column_names*:
138 |         - If the `csv` or `tsv` file does not have column names, a Python list has to be passed to give the columns names. The default value is set to *None*, since this function also accepts a `Python List` or a `txt` file, but the parameter has to be set when a `csv` or `tsv` file without a header is used.
139 |
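For reference, below is a minimal sketch of calling the individual methods directly (using the same models as the example above). Exact outputs will vary, since paraphrase generation is sampled:

```python
from textgenie import TextGenie

textgenie = TextGenie("hetpandya/t5-small-tapaco", "bert-base-uncased")

sent = "The video was posted on Facebook by Alex."

# Paraphrases generated by the T5 model
paraphrases = textgenie.augment_sent_t5(sent, "paraphrase: ", n_predictions=5)

# Variations generated by masking extracted keywords and letting BERT fill them
mask_filled = textgenie.augment_sent_mask_filling(sent, n_mask_predictions=5)

# Passive voice converted to active voice
active = textgenie.convert_to_active(sent)

# All of the above techniques applied to a single sentence
augmented = textgenie.magic_once(sent, "paraphrase: ", n_mask_predictions=5)

print(paraphrases, mask_filled, active, augmented, sep="\n")
```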
140 |
141 | ## References
142 | [Passive To Active](https://github.com/DanManN/pass2act) licensed under the Apache License 2.0
143 |
144 | ## Links
145 | Please find an in-depth explanation of the library [on my blog](https://towardsdatascience.com/textgenie-augmenting-your-text-dataset-with-just-2-lines-of-code-23ce883a0715).
146 |
147 | ## License
148 | Please check `LICENSE` for more details.
149 |
150 |
--------------------------------------------------------------------------------
/textgenie/textgenie.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 | from .grammar_utils import pass2act, is_passive
4 | from transformers import T5ForConditionalGeneration, T5Tokenizer
5 | from string import punctuation
6 | import os
7 | import pandas as pd
8 |
9 |
10 | def set_seed(seed):
11 | torch.manual_seed(seed)
12 |
13 |
14 | set_seed(42)
15 |
16 |
17 | class TextGenie:
18 | def __init__(
19 | self,
20 | paraphrase_model_name,
21 | mask_model_name=None,
22 | spacy_model_name="en_core_web_sm",
23 | device="cpu",
24 | ):
25 | tqdm.write("Loading Paraphrase Model..")
26 | self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
27 | paraphrase_model_name
28 | )
29 | self.paraphrase_tokenizer = T5Tokenizer.from_pretrained(paraphrase_model_name)
30 | self.paraphrase_model = self.paraphrase_model.to(device)
31 | self.device = device
32 |
33 | import spacy
34 | self.nlp = spacy.load(spacy_model_name)
35 |
36 | if mask_model_name:
37 | tqdm.write("Loading Mask Fill Model..")
38 | from transformers import pipeline
39 | from string import punctuation
40 |
41 | self.mask_augmenter = pipeline("fill-mask", model=mask_model_name)
42 |
43 | def extract_keywords(self, sentence):
44 | result = []
45 | pos_tag = ["PROPN", "NOUN", "ADJ"]
46 | consider_tags = ["NUM"]
47 | pos_tag = pos_tag + consider_tags
48 |
49 | doc = self.nlp(sentence)
50 |
51 | for token in doc:
52 | if (
53 | token.text in self.nlp.Defaults.stop_words or token.text in punctuation
54 | ) and token.pos_ not in consider_tags:
55 | continue
56 | if token.pos_ in pos_tag:
57 | result.append(token.text)
58 | return list(set(result))
59 |
60 | def augment_sent_mask_filling(self, sent, n_mask_predictions=5):
61 | keywords = self.extract_keywords(sent)
62 | augmented_sents = []
63 | for keyword in keywords:
64 | masked_sent = sent.replace(keyword, self.mask_augmenter.tokenizer.mask_token, 1)
65 | augmented_sents.extend(
66 | [
67 | generated_sent["sequence"]
68 | for generated_sent in self.mask_augmenter(
69 | masked_sent, top_k=n_mask_predictions
70 | )
71 | if generated_sent["sequence"].lower() != sent.lower()
72 | ]
73 | )
74 | return augmented_sents
75 |
76 | def augment_sent_t5(self, sent, prefix, n_predictions=5, top_k=120, max_length=256, add_suffix_token=True):
77 |         text = prefix + sent + (" </s>" if add_suffix_token else "")
78 | encoding = self.paraphrase_tokenizer.encode_plus(
79 | text, pad_to_max_length=True, return_tensors="pt"
80 | )
81 | input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding[
82 | "attention_mask"
83 | ].to(self.device)
84 |
85 | beam_outputs = self.paraphrase_model.generate(
86 | input_ids=input_ids,
87 | attention_mask=attention_masks,
88 | do_sample=True,
89 | max_length=max_length,
90 | top_k=top_k,
91 | top_p=0.98,
92 | early_stopping=True,
93 | num_return_sequences=n_predictions,
94 | )
95 |
96 | final_outputs = []
97 | for beam_output in beam_outputs:
98 | generated_sent = self.paraphrase_tokenizer.decode(
99 | beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True
100 | )
101 | if (
102 | generated_sent.lower() != sent.lower()
103 | and generated_sent not in final_outputs
104 | ):
105 | final_outputs.append(generated_sent)
106 | return final_outputs
107 |
108 | def convert_to_active(self, sent):
109 | if is_passive(sent, nlp=self.nlp):
110 | return pass2act(sent, nlp=self.nlp)
111 | else:
112 | return sent
113 |
114 | def magic_once(
115 | self,
116 | sent,
117 | paraphrase_prefix,
118 | n_paraphrase_predictions=5,
119 | paraphrase_top_k=120,
120 | paraphrase_max_length=256,
121 | n_mask_predictions=None,
122 | convert_to_active=True,
123 | add_suffix_token=True,
124 | ):
125 | sent = sent.strip()
126 | output = []
127 | output.append(sent)
128 | output += self.augment_sent_t5(
129 | sent,
130 | paraphrase_prefix,
131 | n_paraphrase_predictions,
132 | paraphrase_top_k,
133 |             paraphrase_max_length, add_suffix_token,
134 | )
135 | if n_mask_predictions and isinstance(n_mask_predictions, int):
136 | output += self.augment_sent_mask_filling(sent, n_mask_predictions)
137 | if convert_to_active:
138 | active_voice = self.convert_to_active(sent)
139 | if active_voice.lower() != sent.lower():
140 | output.append(active_voice)
141 | return list(set(output))
142 |
143 | def magic_lamp(
144 | self,
145 | sentences,
146 | paraphrase_prefix,
147 | n_paraphrase_predictions=5,
148 | paraphrase_top_k=120,
149 | paraphrase_max_length=256,
150 | n_mask_predictions=None,
151 | convert_to_active=True,
152 | label_column=None,
153 | data_column=None,
154 | column_names=None,
155 | add_suffix_token=True,
156 | ):
157 | all_sentences = None
158 | with_labels = False
159 | out_file = os.path.join(os.getcwd(), "sentences_aug.txt")
160 |
161 | if isinstance(sentences, str):
162 | sentences = os.path.join(os.getcwd(), sentences)
163 | if sentences.endswith(".txt"):
164 | all_sentences = open(sentences).read().strip().split("\n")
165 | elif sentences.endswith(".csv") or sentences.endswith(".tsv"):
166 | if not label_column:
167 | raise Exception(
168 | "Please provide the column name that contains labels using the 'label_column' parameter."
169 | )
170 | if not data_column:
171 | raise Exception(
172 | "Please provide the column name that contains data using the 'data_column' parameter."
173 | )
174 | if column_names and not isinstance(column_names, list):
175 | raise Exception("Please provide column names in a python list.")
176 | out_file = (
177 | sentences.replace(".csv", "").replace(".tsv", "") + "_aug.tsv"
178 | )
179 | with_labels = True
180 | if sentences.endswith(".csv"):
181 | if column_names:
182 | all_sentences = pd.read_csv(sentences, names=column_names)
183 | else:
184 | all_sentences = pd.read_csv(sentences)
185 | elif sentences.endswith(".tsv"):
186 | if column_names:
187 | all_sentences = pd.read_csv(
188 | sentences, names=column_names, sep="\t"
189 | )
190 | else:
191 |                         all_sentences = pd.read_csv(sentences, sep="\t")
192 | if label_column not in all_sentences.columns:
193 | raise Exception(
194 | "Please provide label column name for the dataset using the 'label_column' parameter. If already provided, please check for typos in the name of the label column."
195 | )
196 | if data_column not in all_sentences.columns:
197 | raise Exception(
198 | "Please provide data column name for the dataset using the 'data_column' parameter. If already provided, please check for typos in the name of the data column."
199 | )
200 | labels = all_sentences[label_column].unique()
201 |
202 | if all_sentences.iloc[0].tolist() == column_names:
203 |                     all_sentences = all_sentences.drop(0).reset_index(drop=True)
204 | augmented_data = []
205 |
206 | for ix in tqdm(range(all_sentences.shape[0])):
207 | sent = all_sentences[data_column][ix].strip()
208 | label = all_sentences[label_column][ix].strip()
209 | aug_sent = self.magic_once(
210 | sent,
211 | paraphrase_prefix,
212 | n_paraphrase_predictions,
213 | paraphrase_top_k,
214 | paraphrase_max_length,
215 | n_mask_predictions,
216 | convert_to_active,
217 | add_suffix_token,
218 | )
219 | aug_sent = [[s, label] for s in aug_sent]
220 | augmented_data.extend(aug_sent)
221 | augmented_data = pd.DataFrame(
222 | data=augmented_data, columns=["Text", "Label"]
223 | )
224 | augmented_data.to_csv(out_file, sep="\t", index=None)
225 | else:
226 | raise Exception(
227 |                     "Unsupported file format. Currently, the following formats are supported: list/txt/csv/tsv"
228 | )
229 | elif isinstance(sentences, list):
230 | all_sentences = sentences
231 | if all_sentences is None:
232 | raise Exception("Error: No sentences found.")
233 | if not with_labels:
234 | augmented_data = []
235 | for sent in tqdm(all_sentences):
236 | augmented_data.extend(
237 | self.magic_once(
238 | sent,
239 | paraphrase_prefix,
240 | n_paraphrase_predictions,
241 | paraphrase_top_k,
242 | paraphrase_max_length,
243 | n_mask_predictions,
244 | convert_to_active,
245 | add_suffix_token,
246 | )
247 | )
248 | with open(out_file, "w") as f:
249 | for line in augmented_data:
250 | f.write(line + "\n")
251 | tqdm.write(f"\nCompleted writing output to {out_file}.")
252 | return augmented_data
253 |
--------------------------------------------------------------------------------
/textgenie/grammar_utils.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | from spacy.matcher import Matcher
3 |
4 | try:
5 | import pattern.en as en
6 | from pattern.en import lexeme
7 | except LookupError:
8 | import nltk
9 |
10 | nltk.download("omw-1.4")
11 | import pattern.en as en
12 | from pattern.en import lexeme
13 |
14 | noundict = {
15 | "i": "me",
16 | "we": "us",
17 | "you": "you",
18 | "he": "him",
19 | "she": "her",
20 | "they": "them",
21 | "them": "they",
22 | "her": "she",
23 | "him": "he",
24 | "us": "we",
25 | "me": "i",
26 | }
27 |
28 |
29 | def nouninv(noun):
30 | n = noun.lower()
31 | if n in noundict:
32 | return noundict[n]
33 | return noun
34 |
35 |
36 | def pattern_stopiteration_workaround():
37 | try:
38 | print(lexeme("check"))
39 | except:
40 | pass
41 |
42 |
43 | pattern_stopiteration_workaround()
44 |
45 |
46 | def pass2act(doc, nlp, rec=False):
47 | """
48 | Author : Daniel Nohimovich & Zhekai Jin (Scott)
49 | Course : ECE 467 Natural Language Processing
50 | Instructor : Professor Carl Sable
51 | https://github.com/DanManN/pass2act
52 |     All rights reserved by the original author.
53 | This source code is licensed under the Apache License 2.0 found in the
54 | LICENSE file in the root directory of this source tree.
55 | """
56 | parse = nlp(doc)
57 | newdoc = ""
58 | for sent in parse.sents:
59 |
60 | # Init parts of sentence to capture:
61 | subjpass = ""
62 | subj = ""
63 | verb = ""
64 | verbaspect = ""
65 | verbtense = ""
66 | adverb = {"bef": "", "aft": ""}
67 | part = ""
68 | prep = ""
69 | agent = ""
70 | aplural = False
71 | advcltree = None
72 | aux = list(list(nlp(". .").sents)[0]) # start with 2 'null' elements
73 | xcomp = ""
74 | punc = "."
75 | # Analyse dependency tree:
76 | for word in sent:
77 | if word.dep_ == "advcl":
78 | if word.head.dep_ in ("ROOT", "auxpass"):
79 | advcltree = word.subtree
80 | if word.dep_ == "nsubjpass":
81 | if word.head.dep_ == "ROOT":
82 | subjpass = "".join(
83 | w.text_with_ws.lower()
84 | if w.tag_ not in ("NNP", "NNPS")
85 | else w.text_with_ws
86 | for w in word.subtree
87 | ).strip()
88 | if word.dep_ == "nsubj":
89 | subj = "".join(
90 | w.text_with_ws.lower()
91 | if w.tag_ not in ("NNP", "NNPS")
92 | else w.text_with_ws
93 | for w in word.subtree
94 | ).strip()
95 | if word.head.dep_ == "auxpass":
96 | if word.head.head.dep_ == "ROOT":
97 | subjpass = subj
98 | if word.dep_ in ("advmod", "npadvmod", "oprd"):
99 | if word.head.dep_ == "ROOT":
100 | if verb == "":
101 | adverb["bef"] = "".join(
102 | w.text_with_ws.lower()
103 | if w.tag_ not in ("NNP", "NNPS")
104 | else w.text_with_ws
105 | for w in word.subtree
106 | ).strip()
107 | else:
108 | adverb["aft"] = "".join(
109 | w.text_with_ws.lower()
110 | if w.tag_ not in ("NNP", "NNPS")
111 | else w.text_with_ws
112 | for w in word.subtree
113 | ).strip()
114 | if word.dep_ == "auxpass":
115 | if word.head.dep_ == "ROOT":
116 | if not subjpass:
117 | subjpass = subj
118 | if word.dep_ in ("aux", "auxpass", "neg"):
119 | if word.head.dep_ == "ROOT":
120 | aux += [word]
121 | if word.dep_ == "ROOT":
122 | verb = word.text
123 | if word.tag_ == "VB":
124 | verbtense = en.INFINITIVE
125 | elif word.tag_ == "VBD":
126 | verbtense = en.PAST
127 | elif word.tag_ == "VBG":
128 | verbtense = en.PRESENT
129 | verbaspect = en.PROGRESSIVE
130 | elif word.tag_ == "VBN":
131 | verbtense = en.PAST
132 | else:
133 | verbtense = en.tenses(word.text)[0][0]
134 | if word.dep_ == "prt":
135 | if word.head.dep_ == "ROOT":
136 | part = "".join(
137 | w.text_with_ws.lower()
138 | if w.tag_ not in ("NNP", "NNPS")
139 | else w.text_with_ws
140 | for w in word.subtree
141 | ).strip()
142 | if word.dep_ == "prep":
143 | if word.head.dep_ == "ROOT":
144 | prep = "".join(
145 | w.text_with_ws.lower()
146 | if w.tag_ not in ("NNP", "NNPS")
147 | else w.text_with_ws
148 | for w in word.subtree
149 | ).strip()
150 | if word.dep_.endswith("obj"):
151 | if word.head.dep_ == "agent":
152 | if word.head.head.dep_ == "ROOT":
153 | agent = "".join(
154 | w.text + ", "
155 | if w.dep_ == "appos"
156 | else (
157 | w.text_with_ws.lower()
158 | if w.tag_ not in ("NNP", "NNPS")
159 | else w.text_with_ws
160 | )
161 | for w in word.subtree
162 | ).strip()
163 | aplural = word.tag_ in ("NNS", "NNPS")
164 | if word.dep_ in ("xcomp", "ccomp", "conj"):
165 | if word.head.dep_ == "ROOT":
166 | xcomp = "".join(
167 | w.text_with_ws.lower()
168 | if w.tag_ not in ("NNP", "NNPS")
169 | else w.text_with_ws
170 | for w in word.subtree
171 | ).strip()
172 | that = xcomp.startswith("that")
173 |                     xcomp = pass2act(xcomp, nlp, True).strip(" .")
174 | if not xcomp.startswith("that") and that:
175 | xcomp = "that " + xcomp
176 | if word.dep_ == "punct" and not rec:
177 | if word.text != '"':
178 | punc = word.text
179 |
180 | # exit if not passive:
181 | if subjpass == "":
182 | newdoc += str(sent) + " "
183 | continue
184 |
185 | # if no agent is found:
186 | if agent == "":
187 | # what am I gonna do? BITconEEEEEEECT!!!!
188 | newdoc += str(sent) + " "
189 | continue
190 |
191 | # invert nouns:
192 | agent = nouninv(agent)
193 | subjpass = nouninv(subjpass)
194 |
195 |         # conjugate the verb phrase from the auxiliary chain:
196 | auxstr = ""
197 | num = en.SINGULAR if not aplural or agent in ("he", "she") else en.PLURAL
198 | aux.append(aux[0])
199 | verbaspect = None
200 | for (pp, p, a, n) in zip(aux, aux[1:], aux[2:], aux[3:]):
201 | if a.lemma_ == ".":
202 | continue
203 |
204 | if a.lemma_ == "not":
205 | if p.lemma_ == "be":
206 | if n.lemma_ == "be":
207 | verbtense = en.tenses(a.text)[0][0]
208 | auxstr += (
209 | en.conjugate(
210 | "be", tense=en.tenses(p.text)[0][0], number=num
211 | )
212 | + " "
213 | )
214 | verbaspect = en.PROGRESSIVE
215 | else:
216 | auxstr += (
217 | en.conjugate(
218 | "do", tense=en.tenses(p.text)[0][0], number=num
219 | )
220 | + " "
221 | )
222 | verbtense = en.INFINITIVE
223 | auxstr += "not "
224 | elif a.lemma_ == "be":
225 | if p.lemma_ == "be":
226 | verbtense = en.tenses(a.text)[0][0]
227 | auxstr += (
228 | en.conjugate("be", tense=en.tenses(a.text)[0][0], number=num)
229 | + " "
230 | )
231 | verbaspect = en.PROGRESSIVE
232 | elif p.tag_ == "MD":
233 | verbtense = en.INFINITIVE
234 | elif a.lemma_ == "have":
235 |                 num = en.PLURAL if p.tag_ == "MD" else num
236 | auxstr += (
237 | en.conjugate("have", tense=en.tenses(a.text)[0][0], number=num)
238 | + " "
239 | )
240 | if n.lemma_ == "be":
241 | verbaspect = en.PROGRESSIVE
242 | verbtense = en.tenses(n.text)[0][0]
243 | else:
244 | auxstr += a.text_with_ws
245 | auxstr = auxstr.lower().strip()
246 |
247 | if verbaspect:
248 | verb = en.conjugate(verb, tense=verbtense, aspect=verbaspect)
249 | else:
250 | verb = en.conjugate(verb, tense=verbtense)
251 |
252 | advcl = ""
253 | if advcltree:
254 | for w in advcltree:
255 | if w.pos_ == "VERB" and en.tenses(w.text)[0][4] == en.PROGRESSIVE:
256 | advcl += (
257 | "which "
258 | + en.conjugate(w.text, tense=en.tenses(verb)[0][0])
259 | + " "
260 | )
261 | else:
262 | advcl += w.text_with_ws
263 |
264 | newsent = (
265 | " ".join(
266 | list(
267 | filter(
268 | None,
269 | [
270 | agent,
271 | auxstr,
272 | adverb["bef"],
273 | verb,
274 | part,
275 | subjpass,
276 | adverb["aft"],
277 | advcl,
278 | prep,
279 | xcomp,
280 | ],
281 | )
282 | )
283 | )
284 | + punc
285 | )
286 | if not rec:
287 | newsent = newsent[0].upper() + newsent[1:]
288 | newdoc += newsent + " "
289 | return newdoc
290 |
291 |
292 | def is_passive(sentence, nlp):
293 | doc = nlp(sentence)
294 | passive_rule = [
295 | {"DEP": "nsubjpass"},
296 | {"DEP": "aux", "OP": "*"},
297 | {"DEP": "auxpass"},
298 | {"TAG": "VBN"},
299 | ]
300 |
301 | matcher = Matcher(nlp.vocab)
302 |
303 | matcher.add("Passive", [passive_rule])
304 | matches = matcher(doc)
305 | if matches:
306 | return True
307 | else:
308 | return False
309 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/examples/examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "textgenie-examples.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "code",
21 | "metadata": {
22 | "colab": {
23 | "base_uri": "https://localhost:8080/",
24 | "height": 1000
25 | },
26 | "id": "4b69iyum-FxW",
27 | "outputId": "8fb63125-6ac3-4fc8-c4c1-20fcfea72e31"
28 | },
29 | "source": [
30 | "!pip install --upgrade textgenie"
31 | ],
32 | "execution_count": 10,
33 | "outputs": [
34 | {
35 | "output_type": "stream",
36 | "text": [
37 | "Collecting git+https://github.com/hetpandya/textgenie.git\n",
38 | " Cloning https://github.com/hetpandya/textgenie.git to /tmp/pip-req-build-42y38bmw\n",
39 | " Running command git clone -q https://github.com/hetpandya/textgenie.git /tmp/pip-req-build-42y38bmw\n",
40 | "Requirement already satisfied, skipping upgrade: torch>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (1.9.0+cu102)\n",
41 | "Requirement already satisfied, skipping upgrade: transformers in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (4.7.0)\n",
42 | "Requirement already satisfied, skipping upgrade: sentencepiece in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (0.1.96)\n",
43 | "Requirement already satisfied, skipping upgrade: spacy in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (2.2.4)\n",
44 | "Requirement already satisfied, skipping upgrade: tqdm in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (4.41.1)\n",
45 | "Requirement already satisfied, skipping upgrade: pattern in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (3.6)\n",
46 | "Requirement already satisfied, skipping upgrade: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=1.5.0->textgenie==0.1.2) (3.7.4.3)\n",
47 | "Requirement already satisfied, skipping upgrade: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (2019.12.20)\n",
48 | "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (2.23.0)\n",
49 | "Requirement already satisfied, skipping upgrade: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (1.19.5)\n",
50 | "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (4.5.0)\n",
51 | "Requirement already satisfied, skipping upgrade: filelock in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (3.0.12)\n",
52 | "Requirement already satisfied, skipping upgrade: huggingface-hub==0.0.8 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (0.0.8)\n",
53 | "Requirement already satisfied, skipping upgrade: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (0.10.3)\n",
54 | "Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (3.13)\n",
55 | "Requirement already satisfied, skipping upgrade: packaging in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (20.9)\n",
56 | "Requirement already satisfied, skipping upgrade: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (0.0.45)\n",
57 | "Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.1.3)\n",
58 | "Requirement already satisfied, skipping upgrade: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (7.4.0)\n",
59 | "Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (2.0.5)\n",
60 | "Requirement already satisfied, skipping upgrade: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (0.4.1)\n",
61 | "Requirement already satisfied, skipping upgrade: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.0.0)\n",
62 | "Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.0.5)\n",
63 | "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (57.0.0)\n",
64 | "Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.0.5)\n",
65 | "Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (0.8.2)\n",
66 | "Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (3.0.5)\n",
67 | "Requirement already satisfied, skipping upgrade: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (4.6.3)\n",
68 | "Requirement already satisfied, skipping upgrade: future in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (0.16.0)\n",
69 | "Requirement already satisfied, skipping upgrade: feedparser in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (6.0.7)\n",
70 | "Requirement already satisfied, skipping upgrade: lxml in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (4.2.6)\n",
71 | "Requirement already satisfied, skipping upgrade: nltk in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (3.2.5)\n",
72 | "Requirement already satisfied, skipping upgrade: cherrypy in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (18.6.0)\n",
73 | "Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (1.4.1)\n",
74 | "Requirement already satisfied, skipping upgrade: pdfminer.six in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (20201018)\n",
75 | "Requirement already satisfied, skipping upgrade: python-docx in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (0.8.11)\n",
76 | "Requirement already satisfied, skipping upgrade: backports.csv in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (1.0.7)\n",
77 | "Requirement already satisfied, skipping upgrade: mysqlclient in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (2.0.3)\n",
78 | "Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (3.0.4)\n",
79 | "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (2021.5.30)\n",
80 | "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (1.24.3)\n",
81 | "Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (2.10)\n",
82 | "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers->textgenie==0.1.2) (3.4.1)\n",
83 | "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers->textgenie==0.1.2) (2.4.7)\n",
84 | "Requirement already satisfied, skipping upgrade: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->textgenie==0.1.2) (1.0.1)\n",
85 | "Requirement already satisfied, skipping upgrade: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->textgenie==0.1.2) (1.15.0)\n",
86 | "Requirement already satisfied, skipping upgrade: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->textgenie==0.1.2) (7.1.2)\n",
87 | "Requirement already satisfied, skipping upgrade: sgmllib3k in /usr/local/lib/python3.7/dist-packages (from feedparser->pattern->textgenie==0.1.2) (1.0.0)\n",
88 | "Requirement already satisfied, skipping upgrade: cheroot>=8.2.1 in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (8.5.2)\n",
89 | "Requirement already satisfied, skipping upgrade: zc.lockfile in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (2.0)\n",
90 | "Requirement already satisfied, skipping upgrade: jaraco.collections in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (3.3.0)\n",
91 | "Requirement already satisfied, skipping upgrade: more-itertools in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (8.8.0)\n",
92 | "Requirement already satisfied, skipping upgrade: portend>=2.1.1 in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (2.7.1)\n",
93 | "Requirement already satisfied, skipping upgrade: sortedcontainers in /usr/local/lib/python3.7/dist-packages (from pdfminer.six->pattern->textgenie==0.1.2) (2.4.0)\n",
94 | "Requirement already satisfied, skipping upgrade: cryptography in /usr/local/lib/python3.7/dist-packages (from pdfminer.six->pattern->textgenie==0.1.2) (3.4.7)\n",
95 | "Requirement already satisfied, skipping upgrade: jaraco.functools in /usr/local/lib/python3.7/dist-packages (from cheroot>=8.2.1->cherrypy->pattern->textgenie==0.1.2) (3.3.0)\n",
96 | "Requirement already satisfied, skipping upgrade: jaraco.classes in /usr/local/lib/python3.7/dist-packages (from jaraco.collections->cherrypy->pattern->textgenie==0.1.2) (3.2.1)\n",
97 | "Requirement already satisfied, skipping upgrade: jaraco.text in /usr/local/lib/python3.7/dist-packages (from jaraco.collections->cherrypy->pattern->textgenie==0.1.2) (3.5.0)\n",
98 | "Requirement already satisfied, skipping upgrade: tempora>=1.8 in /usr/local/lib/python3.7/dist-packages (from portend>=2.1.1->cherrypy->pattern->textgenie==0.1.2) (4.1.1)\n",
99 | "Requirement already satisfied, skipping upgrade: cffi>=1.12 in /usr/local/lib/python3.7/dist-packages (from cryptography->pdfminer.six->pattern->textgenie==0.1.2) (1.14.5)\n",
100 | "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.7/dist-packages (from tempora>=1.8->portend>=2.1.1->cherrypy->pattern->textgenie==0.1.2) (2018.9)\n",
101 | "Requirement already satisfied, skipping upgrade: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.12->cryptography->pdfminer.six->pattern->textgenie==0.1.2) (2.20)\n",
102 | "Building wheels for collected packages: textgenie\n",
103 | " Building wheel for textgenie (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
104 | " Created wheel for textgenie: filename=textgenie-0.1.2-cp37-none-any.whl size=8705 sha256=f8cb665c1c8c93f7792b85d66124020d326625f21d43873fb6d0db71f67437af\n",
105 | " Stored in directory: /tmp/pip-ephem-wheel-cache-8giac7ls/wheels/35/24/87/4f20f5d3fa823cf98bf2d27bb95281c19c3436f82888aa6adc\n",
106 | "Successfully built textgenie\n",
107 | "Installing collected packages: textgenie\n",
108 | " Found existing installation: textgenie 0.1.1\n",
109 | " Uninstalling textgenie-0.1.1:\n",
110 | " Successfully uninstalled textgenie-0.1.1\n",
111 | "Successfully installed textgenie-0.1.2\n"
112 | ],
113 | "name": "stdout"
114 | },
115 | {
116 | "output_type": "display_data",
117 | "data": {
118 | "application/vnd.colab-display-data+json": {
119 | "pip_warning": {
120 | "packages": [
121 | "textgenie"
122 | ]
123 | }
124 | }
125 | },
126 | "metadata": {
127 | "tags": []
128 | }
129 | }
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "metadata": {
135 | "id": "_h3lGuIV-KKI"
136 | },
137 | "source": [
138 | "from textgenie import TextGenie"
139 | ],
140 | "execution_count": 1,
141 | "outputs": []
142 | },
143 | {
144 | "cell_type": "code",
145 | "metadata": {
146 | "colab": {
147 | "base_uri": "https://localhost:8080/"
148 | },
149 | "id": "Tg2igVb6-UxL",
150 | "outputId": "790b4af8-9eb8-42ed-cf01-cb9cb5555e72"
151 | },
152 | "source": [
153 | "textgenie = TextGenie(\"ramsrigouthamg/t5_paraphraser\",'bert-base-uncased')"
154 | ],
155 | "execution_count": 2,
156 | "outputs": [
157 | {
158 | "output_type": "stream",
159 | "text": [
160 | "Loading Paraphrase Model..\n",
161 | "Loading Mask Fill Model..\n"
162 | ],
163 | "name": "stdout"
164 | },
165 | {
166 | "output_type": "stream",
167 | "text": [
168 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
169 | "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
170 | "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
171 | ],
172 | "name": "stderr"
173 | }
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "metadata": {
179 | "colab": {
180 | "base_uri": "https://localhost:8080/"
181 | },
182 | "id": "P06v8CkK-vHu",
183 | "outputId": "bec9a13a-58cd-4f91-bf1c-f2f652fff9e5"
184 | },
185 | "source": [
186 | "# Augment a list of sentences\n",
187 | "sentences = [\"The video was posted on Facebook by Alex.\",\"I plan to run it again this time\"]\n",
188 | "textgenie.magic_lamp(sentences,\"paraphrase: \",n_paraphrase_predictions=15,n_mask_predictions=15,convert_to_active=True)"
189 | ],
190 | "execution_count": 5,
191 | "outputs": [
192 | {
193 | "output_type": "stream",
194 | "text": [
195 | "\r 0%| | 0/2 [00:00, ?it/s]/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2111: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
196 | " FutureWarning,\n",
197 | "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:191: UserWarning: This sequence already has . In future versions this behavior may lead to duplicated eos tokens being added.\n",
198 | " f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added.\"\n",
199 | "100%|██████████| 2/2 [00:18<00:00, 9.20s/it]"
200 | ],
201 | "name": "stderr"
202 | },
203 | {
204 | "output_type": "stream",
205 | "text": [
206 | "\n",
207 | "Completed writing output to /content/sentences_aug.txt.\n"
208 | ],
209 | "name": "stdout"
210 | },
211 | {
212 | "output_type": "stream",
213 | "text": [
214 | "\n"
215 | ],
216 | "name": "stderr"
217 | },
218 | {
219 | "output_type": "execute_result",
220 | "data": {
221 | "text/plain": [
222 | "['the clip was posted on facebook by alex.',\n",
223 | " 'the video was posted on facebook by youtube.',\n",
224 | " 'the event was posted on facebook by alex.',\n",
225 | " 'the text was posted on facebook by alex.',\n",
226 | " 'the cover was posted on facebook by alex.',\n",
227 | " 'the story was posted on facebook by alex.',\n",
228 | " 'the article was posted on facebook by alex.',\n",
229 | " 'the film was posted on facebook by alex.',\n",
230 | " 'the video was posted on facebook by twitter.',\n",
231 | " 'Is it true that the video, posted in Facebook, was created by Alex?',\n",
232 | " 'the photo was posted on facebook by alex.',\n",
233 | " 'What videos have you seen on FaceBook (Alex)?',\n",
234 | " 'If I could capture this on Facebook, what would be the point of posting an Instagram video?',\n",
235 | " 'I just saw the video posted on Facebook by Alex Wenzel. This will impact to tell you more than we realise.',\n",
236 | " 'the video was posted on blogs by alex.',\n",
237 | " 'the video was posted on facebook by mtv.',\n",
238 | " 'the video was posted on youtube by alex.',\n",
239 | " 'the video was posted on tv by alex.',\n",
240 | " 'the video was posted on facebook by anonymous.',\n",
241 | " 'the single was posted on facebook by alex.',\n",
242 | " 'the video was posted on vine by alex.',\n",
243 | " 'the video was posted on facebook by rihanna.',\n",
244 | " 'the video was posted on facebook by members.',\n",
245 | " 'the video was posted on twitch by alex.',\n",
246 | " 'the video was posted on facebook by others.',\n",
247 | " 'the video was posted on amazon by alex.',\n",
248 | " 'the video was posted on facebook by fans.',\n",
249 | " 'Alex posted the video on Facebook. ',\n",
250 | " 'The video was posted on Facebook by Alex Pease.',\n",
251 | " 'the video was posted on facebook by fox.',\n",
252 | " 'the video was posted on facebook by her.',\n",
253 | " 'the video was posted on google by alex.',\n",
254 | " 'the video was posted on facebook by himself.',\n",
255 | " 'the song was posted on facebook by alex.',\n",
256 | " 'the video was posted on itunes by alex.',\n",
257 | " 'the video was posted on video by alex.',\n",
258 | " 'the video was posted on twitter by alex.',\n",
259 | " 'the video was posted on facebook by rt.',\n",
260 | " 'the video was posted on myspace by alex.',\n",
261 | " 'the video was posted on site by alex.',\n",
262 | " 'the trailer was posted on facebook by alex.',\n",
263 | " \"Watch Alex Rodriguez' Facebook video here.\",\n",
264 | " 'the video was posted on facebook by friends.',\n",
265 | " \"This video was posted on Facebook by Alex. I've been using a mobile app for the past few days but just cant seem to find the time to download it.\",\n",
266 | " 'the track was posted on facebook by alex.',\n",
267 | " 'the video was posted on mtv by alex.',\n",
268 | " 'the album was posted on facebook by alex.',\n",
269 | " 'This is a viral video I uploaded to Facebook and showed on my Facebook profile.',\n",
270 | " 'The video was posted on Facebook by Alex.',\n",
271 | " 'the announcement was posted on facebook by alex.',\n",
272 | " 'the video was posted on facebook by them.',\n",
273 | " 'the video was posted on internet by alex.',\n",
274 | " 'the video was posted on facebook by him.',\n",
275 | " 'i plan to run it again this month',\n",
276 | " 'i plan to run it again this.',\n",
277 | " 'i plan to run it again this week',\n",
278 | " 'I plan to run it again this time this time.',\n",
279 | " 'Is it possible to run it again after it starts if you want it again?',\n",
280 | " 'I plan to run it again this time this time this time I know the plot. I guess they will be able to continue to run it then.',\n",
281 | " 'i plan to run it again this ;',\n",
282 | " \"I plan to run this again this time this time around. I'll be writing more frequently than I have the time and the plan is much less complex.\",\n",
283 | " 'I plan to run it again this time this time this time.',\n",
284 | " \"I plan to run it again this time this time this time. I'm sure that I'll be able to find a runner again.\",\n",
285 | " 'I plan to run it again this time',\n",
286 | " \"I plan to run it again this time this time again this time I can't remember whether I really needed to keep it running but its good enough.\",\n",
287 | " 'I will run it again. I plan to run it again this time.',\n",
288 | " \"I'll run it now again but the second time I've completed.\",\n",
289 | " 'I plan to run it again this time this time. It actually helps me. So, I will run it again.',\n",
290 | " 'i plan to run it again this day',\n",
291 | " 'i plan to run it again this...',\n",
292 | " 'i plan to run it again this year',\n",
293 | " 'i plan to run it again this summer',\n",
294 | " 'What is the plan to run it again?',\n",
295 | " 'I plan to run It again this time now this time in Linux.',\n",
296 | " 'How is this book going to be run again?',\n",
297 | " 'i plan to run it again this season',\n",
298 | " \"I plan to run it again this time I'm not going to run it again this time. If I didn't run it, I don't expect to miss out on it.\",\n",
299 | " 'i plan to run it again this weekend',\n",
300 | " 'i plan to run it again this!',\n",
301 | " 'i plan to run it again this semester',\n",
302 | " 'i plan to run it again this?',\n",
303 | " 'i plan to run it again this morning',\n",
304 | " 'I can always run it again, I just want to try putting it back. This time a fortnight after.']"
305 | ]
306 | },
307 | "metadata": {
308 | "tags": []
309 | },
310 | "execution_count": 5
311 | }
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "metadata": {
317 | "colab": {
318 | "base_uri": "https://localhost:8080/"
319 | },
320 | "id": "fN8rA8bc-cF1",
321 | "outputId": "93d4aafb-5027-4c13-9f49-c7a54777cc7e"
322 | },
323 | "source": [
324 | "%%writefile sentences.txt\n",
325 | "At dinner, six shrimp were eaten by Harry.\n",
326 | "Beautiful giraffes roam the savannah."
327 | ],
328 | "execution_count": 6,
329 | "outputs": [
330 | {
331 | "output_type": "stream",
332 | "text": [
333 | "Writing sentences.txt\n"
334 | ],
335 | "name": "stdout"
336 | }
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "metadata": {
342 | "colab": {
343 | "base_uri": "https://localhost:8080/"
344 | },
345 | "id": "TQLNyJyH-t_T",
346 | "outputId": "f3765c08-2fd7-4b3f-bc33-6413fe94e4c4"
347 | },
348 | "source": [
349 | "# Augment data in a txt file\n",
350 | "textgenie.magic_lamp(\"sentences.txt\",\"paraphrase: \",n_mask_predictions=5,convert_to_active=True)"
351 | ],
352 | "execution_count": 7,
353 | "outputs": [
354 | {
355 | "output_type": "stream",
356 | "text": [
357 | "\r 0%| | 0/2 [00:00, ?it/s]/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2111: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
358 | " FutureWarning,\n",
359 | "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:191: UserWarning: This sequence already has . In future versions this behavior may lead to duplicated eos tokens being added.\n",
360 | " f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added.\"\n",
361 | "100%|██████████| 2/2 [00:07<00:00, 3.70s/it]"
362 | ],
363 | "name": "stderr"
364 | },
365 | {
366 | "output_type": "stream",
367 | "text": [
368 | "\n",
369 | "Completed writing output to /content/sentences_aug.txt.\n"
370 | ],
371 | "name": "stdout"
372 | },
373 | {
374 | "output_type": "stream",
375 | "text": [
376 | "\n"
377 | ],
378 | "name": "stderr"
379 | },
380 | {
381 | "output_type": "execute_result",
382 | "data": {
383 | "text/plain": [
384 | "['at lunch, six shrimp were eaten by harry.',\n",
385 | " 'at dinner, six shrimp were eaten by hand.',\n",
386 | " 'at night, six shrimp were eaten by harry.',\n",
387 | " 'At dinner, Harry was having 6 shrimps.',\n",
388 | " 'At dinner, Harry ate six shrimp.',\n",
389 | " 'at least, six shrimp were eaten by harry.',\n",
390 | " 'at dinner, six shrimp were eaten by him.',\n",
391 | " 'at dinner, six shrimp were eaten by chicken.',\n",
392 | " 'at dinner, six pancakes were eaten by harry.',\n",
393 | " 'at dinner, six shrimp were eaten by everyone.',\n",
394 | " 'at dinner, his shrimp were eaten by harry.',\n",
395 | " \"During Harry's dinner, he ate eight shrimp.\",\n",
396 | " 'at dinner, her shrimp were eaten by harry.',\n",
397 | " 'at dinner, these shrimp were eaten by harry.',\n",
398 | " 'at dinner, six eggs were eaten by harry.',\n",
399 | " 'Harry ate six shrimp at dinner. ',\n",
400 | " 'Harry ate six shrimp.',\n",
401 | " 'at first, six shrimp were eaten by harry.',\n",
402 | " 'at dinner, the shrimp were eaten by harry.',\n",
403 | " 'at dinner, some shrimp were eaten by harry.',\n",
404 | " 'at dinner, six sandwiches were eaten by harry.',\n",
405 | " \"During Harry's dinner, six shrimp were eaten by Harry.\",\n",
406 | " 'At dinner, six shrimp were eaten by Harry.',\n",
407 | " 'at dinner, six dishes were eaten by harry.',\n",
408 | " 'at dinner, six meals were eaten by harry.',\n",
409 | " 'at dinner, six shrimp were eaten by themselves.',\n",
410 | " 'How many beautiful giraffes do you see in the savannah?',\n",
411 | " 'black giraffes roam the savannah.',\n",
412 | " 'little giraffes roam the savannah.',\n",
413 | " 'the giraffes roam the savannah.',\n",
414 | " 'beautiful giraffes roam the park.',\n",
415 | " 'beautiful butterflies roam the savannah.',\n",
416 | " 'beautiful giraffes roam the land.',\n",
417 | " 'Beautiful giraffes roam the savannah.',\n",
418 | " 'large giraffes roam the savannah.',\n",
419 | " 'What are some impressive giraffes that roam the Savannah?',\n",
420 | " 'beautiful birds roam the savannah.',\n",
421 | " 'beautiful giraffes roam the streets.',\n",
422 | " 'beautiful giraffes roam the grounds.',\n",
423 | " 'beautiful animals roam the savannah.',\n",
424 | " 'In winter, in the middle of nowhere, a giraffe roams the Sabana Desert. What do they do?',\n",
425 | " 'beautiful creatures roam the savannah.',\n",
426 | " 'Beautiful giraffes roam the Savanna.',\n",
427 | " 'wild giraffes roam the savannah.',\n",
428 | " 'beautiful women roam the savannah.',\n",
429 | " 'beautiful giraffes roam the beach.']"
430 | ]
431 | },
432 | "metadata": {
433 | "tags": []
434 | },
435 | "execution_count": 7
436 | }
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "metadata": {
442 | "colab": {
443 | "base_uri": "https://localhost:8080/"
444 | },
445 | "id": "mzoLK-mq_H3e",
446 | "outputId": "eea8ad04-5cad-4094-b9d7-0dd601cec684"
447 | },
448 | "source": [
449 | "%%writefile dataset.csv\n",
450 | "Sue changed the flat tire., Label1\n",
451 | "The crew paved the entire stretch of highway., Label2\n",
452 | "The critic wrote a scathing review., Label1\n",
453 | "I will clean the house every Saturday., Label2 "
454 | ],
455 | "execution_count": 8,
456 | "outputs": [
457 | {
458 | "output_type": "stream",
459 | "text": [
460 | "Writing dataset.csv\n"
461 | ],
462 | "name": "stdout"
463 | }
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "metadata": {
469 | "colab": {
470 | "base_uri": "https://localhost:8080/"
471 | },
472 | "id": "xzW11dvT_eNw",
473 | "outputId": "cf322632-0ef5-46df-a672-9d955dcc1e20"
474 | },
475 | "source": [
476 | "# Augment data in a csv file with labels\n",
477 | "augmented_dataset = textgenie.magic_lamp(\"dataset.csv\",\"paraphrase: \",n_paraphrase_predictions=15,n_mask_predictions=15,convert_to_active=True,label_column=\"Label\",column_names=[\"Text\",\"Label\"])"
478 | ],
479 | "execution_count": 3,
480 | "outputs": [
481 | {
482 | "output_type": "stream",
483 | "text": [
484 | "\r 0%| | 0/4 [00:00, ?it/s]/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2111: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
485 | " FutureWarning,\n",
486 | "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:191: UserWarning: This sequence already has . In future versions this behavior may lead to duplicated eos tokens being added.\n",
487 | " f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added.\"\n",
488 | "100%|██████████| 4/4 [00:30<00:00, 7.73s/it]"
489 | ],
490 | "name": "stderr"
491 | },
492 | {
493 | "output_type": "stream",
494 | "text": [
495 | "\n",
496 | "Completed writing output to /content/dataset_aug.csv.\n"
497 | ],
498 | "name": "stdout"
499 | },
500 | {
501 | "output_type": "stream",
502 | "text": [
503 | "\n"
504 | ],
505 | "name": "stderr"
506 | }
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "metadata": {
512 | "colab": {
513 | "base_uri": "https://localhost:8080/",
514 | "height": 402
515 | },
516 | "id": "VdejwBeM_uWQ",
517 | "outputId": "23326c3a-a75d-4c9f-9d5d-09b58b37f252"
518 | },
519 | "source": [
520 | "augmented_dataset"
521 | ],
522 | "execution_count": 4,
523 | "outputs": [
524 | {
525 | "output_type": "execute_result",
526 | "data": {
527 | "text/html": [
528 | "\n",
529 | "\n",
542 | "
\n",
543 | " \n",
544 | " \n",
545 | " | \n",
546 | " Text | \n",
547 | " Label | \n",
548 | "
\n",
549 | " \n",
550 | " \n",
551 | " \n",
552 | " | 0 | \n",
553 | " i changed the flat tire. | \n",
554 | " Label1 | \n",
555 | "
\n",
556 | " \n",
557 | " | 1 | \n",
558 | " Sue changed my flat tire. | \n",
559 | " Label1 | \n",
560 | "
\n",
561 | " \n",
562 | " | 2 | \n",
563 | " In the end Sue changed the flat tire. | \n",
564 | " Label1 | \n",
565 | "
\n",
566 | " \n",
567 | " | 3 | \n",
568 | " She changed my tire to flat. How can she fix t... | \n",
569 | " Label1 | \n",
570 | "
\n",
571 | " \n",
572 | " | 4 | \n",
573 | " reacher changed the flat tire. | \n",
574 | " Label1 | \n",
575 | "
\n",
576 | " \n",
577 | " | ... | \n",
578 | " ... | \n",
579 | " ... | \n",
580 | "
\n",
581 | " \n",
582 | " | 224 | \n",
583 | " i will clean the house every day. | \n",
584 | " Label2 | \n",
585 | "
\n",
586 | " \n",
587 | " | 225 | \n",
588 | " i will clean the house every evening. | \n",
589 | " Label2 | \n",
590 | "
\n",
591 | " \n",
592 | " | 226 | \n",
593 | " I can clean the house every Saturday. I make a... | \n",
594 | " Label2 | \n",
595 | "
\n",
596 | " \n",
597 | " | 227 | \n",
598 | " I plan to clean the house every weekend. How d... | \n",
599 | " Label2 | \n",
600 | "
\n",
601 | " \n",
602 | " | 228 | \n",
603 | " I plan to clean our house every weekend. It's ... | \n",
604 | " Label2 | \n",
605 | "
\n",
606 | " \n",
607 | "
\n",
608 | "
229 rows × 2 columns
\n",
609 | "
"
610 | ],
611 | "text/plain": [
612 | " Text Label\n",
613 | "0 i changed the flat tire. Label1\n",
614 | "1 Sue changed my flat tire. Label1\n",
615 | "2 In the end Sue changed the flat tire. Label1\n",
616 | "3 She changed my tire to flat. How can she fix t... Label1\n",
617 | "4 reacher changed the flat tire. Label1\n",
618 | ".. ... ...\n",
619 | "224 i will clean the house every day. Label2\n",
620 | "225 i will clean the house every evening. Label2\n",
621 | "226 I can clean the house every Saturday. I make a... Label2\n",
622 | "227 I plan to clean the house every weekend. How d... Label2\n",
623 | "228 I plan to clean our house every weekend. It's ... Label2\n",
624 | "\n",
625 | "[229 rows x 2 columns]"
626 | ]
627 | },
628 | "metadata": {
629 | "tags": []
630 | },
631 | "execution_count": 4
632 | }
633 | ]
634 | }
635 | ]
636 | }
--------------------------------------------------------------------------------