├── setup.cfg
├── logo.png
├── textgenie
│   ├── __init__.py
│   ├── textgenie.py
│   └── grammar_utils.py
├── examples
│   ├── basic.py
│   └── examples.ipynb
├── setup.py
├── sentences_aug.txt
├── .gitignore
├── README.md
└── LICENSE
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hetpandya/textgenie/HEAD/logo.png
--------------------------------------------------------------------------------
/textgenie/__init__.py:
--------------------------------------------------------------------------------
1 | from .textgenie import TextGenie
2 |
3 | __version__ = "0.1.9.7b"
4 | __author__ = "Het Pandya"
5 | __license__ = "MIT"
6 |
--------------------------------------------------------------------------------
/examples/basic.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from textgenie import TextGenie
4 |
5 | t5_model = "hetpandya/t5-base-tapaco"
6 | bert_model = "microsoft/deberta-v3-large"
7 |
8 | textgenie = TextGenie(t5_model, bert_model, spacy_model_name="en_core_web_lg", device="cuda")
9 |
10 | # Augment a list of sentences
11 | sentences = [
12 | "The video was posted on Facebook by Alex.",
13 | "I plan to run it again this time",
14 | ]
15 |
16 | results = textgenie.magic_lamp(
17 | sentences, "paraphrase: ", n_mask_predictions=5, convert_to_active=True, add_suffix_token=False
18 | )
19 |
20 | print(results)
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import re
3 |
4 | def get_property(prop, project):
5 | """
6 | Credits: https://stackoverflow.com/a/41110107
7 | """
8 | result = re.search(r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop), open(project + '/__init__.py').read())
9 | return result.group(1)
10 |
11 |
12 | with open("README.md", "r") as f:
13 | long_description = f.read()
14 |
15 | setup(
16 | name="textgenie",
17 | version=get_property('__version__', "textgenie"),
18 | description="A python library to augment text data using NLP.",
19 | long_description=long_description,
20 | long_description_content_type="text/markdown",
21 | author="Het Pandya",
22 | url="http://github.com/hetpandya/TextGenie",
23 | author_email="hetpandya6797@gmail.com",
24 | license="MIT",
25 | install_requires=[
26 | "torch>=1.5.0",
27 | "transformers",
28 | "sentencepiece",
29 | "spacy",
30 | "tqdm",
31 | "pandas",
32 | "Pattern @ git+https://github.com/clips/pattern.git",
33 | ],
34 | packages=["textgenie"],
35 | )
36 |
37 |
--------------------------------------------------------------------------------
/sentences_aug.txt:
--------------------------------------------------------------------------------
1 | Theही was posted on Facebook by Alex.
2 | The video was posted on Facebook by Alex.
3 | The video has been posted by Alex on Facebook.
4 | The attaches was posted on Facebook by Alex.
5 | This video was posted on Facebook by Alex.
6 | The video was posted on Facebook by segregated.
7 | Theanje was posted on Facebook by Alex.
8 | The video was posted to Facebook by Alex.
9 | The video was posted onanje by Alex.
10 | The video was posted on Facebook byihan.
11 | Alex posted the video on Facebook.
12 | The video was posted on Jenelle by Alex.
13 | The video was posted on Facebook by attaches.
14 | The video was posted on attaches by Alex.
15 | The video was posted on합 by Alex.
16 | Theći was posted on Facebook by Alex.
17 | The video was posted in Facebook by Alex.
18 | The video was posted on Facebook by hordes.
19 | The video was posted on Facebook bycardi.
20 | The video was posted onrified by Alex.
21 | The minecraft was posted on Facebook by Alex.
22 | I plan to run next time there this time, going one again?
23 | I want to do it again, no less, in parallel.
24 | So I like him.
25 | I plan to run it again this Tiguan
26 | I plan to run it again this minecraft
27 | I plan to run it again this morgan
28 | I plan to run it again thisgenous
29 | I'll run anything same again tonight.
30 | I plan to run it again this time
31 | I plan to run it again this Jenelle
32 | I planing to run that again: "Farefall”?
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://en.wikipedia.org/wiki/MIT_License)
3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | # TextGenie
13 |
14 | TextGenie is a text data augmentation library that helps you augment your text dataset by generating similar samples, giving you a larger and more robust dataset to train better models. It also handles labeled datasets, keeping track of each sample's label while generating new samples from it.
15 |
16 | It uses various Natural Language Processing methods such as paraphrase generation, BERT mask filling and conversion of passive-voice sentences to active voice. The library currently supports the `English` language.
17 |
18 | ## Installation
19 | ```
20 | pip install textgenie
21 | ```
22 |
23 | ## Example
24 | ```python
25 | from textgenie import TextGenie
26 |
27 | textgenie = TextGenie("hetpandya/t5-small-tapaco", "bert-base-uncased")
28 |
29 | # Augment a list of sentences
30 | sentences = [
31 | "The video was posted on Facebook by Alex.",
32 | "I plan to run it again this time",
33 | ]
34 | textgenie.magic_lamp(
35 | sentences, "paraphrase: ", n_mask_predictions=5, convert_to_active=True
36 | )
37 |
38 | # Augment data in a txt file
39 | textgenie.magic_lamp(
40 | "sentences.txt", "paraphrase: ", n_mask_predictions=5, convert_to_active=True
41 | )
42 |
43 | # Augment data in a csv file with labels
44 | textgenie.magic_lamp(
45 | "sentences.csv",
46 | "paraphrase: ",
47 | n_mask_predictions=5,
48 | convert_to_active=True,
49 | label_column="Label",
50 | data_column="Text",
51 | column_names=["Text", "Label"],
52 | )
53 | ```
54 | Examples can be found in the examples [notebook](https://github.com/hetpandya/textgenie/blob/main/examples/examples.ipynb).
55 |
56 | ## Usage
57 |
58 | - Initializing the augmentor:
59 | ```textgenie = TextGenie(paraphrase_model_name='model_name',mask_model_name='model_name',spacy_model_name="model_name",device="cpu")```
60 | - Parameters:
61 | - *paraphrase_model_name*:
62 | - The name of the T5 paraphrase model.
63 |       - A list of pretrained models for paraphrase generation can be found [here](https://github.com/hetpandya/paraphrase-datasets-pretrained-models#pretrained-models)
64 | - *mask_model_name*:
65 |       - The BERT model that will be used to fill masks. Mask filling is disabled by default and is enabled by passing the name of the BERT model to be used. A list of mask filling models can be found [here](https://huggingface.co/models?filter=en&pipeline_tag=fill-mask)
66 | - *spacy_model_name*:
67 | - Name of the Spacy model. Available models can be found [here](https://spacy.io/models). The default value is set to *en_core_web_sm*.
68 | - *device*:
69 | - The device where the model will be loaded. The default value is set to *cpu*.
70 | - Methods:
71 | - augment_sent_mask_filling():
72 | - Generate augmented data using BERT mask filling.
73 | - Parameters:
74 | - *sent*:
75 | - The sentence on which augmentation has to be applied.
76 | - *n_mask_predictions*:
77 |         - The number of predictions the BERT mask filling model should generate. The default value is set to *5*.
78 | - augment_sent_t5():
79 | - Generate augmented data using T5 paraphrasing model.
80 | - Parameters:
81 | - *sent*:
82 | - The sentence on which augmentation has to be applied.
83 | - *prefix*:
84 | - The prefix for the T5 model input.
85 | - *n_predictions*:
86 |         - The number of augmented sentences the function should return. The default value is set to *5*.
87 | - *top_k*:
88 |         - The *top_k* sampling value passed to the T5 model during generation. The default value is set to *120*.
89 | - *max_length*:
90 | - The max length of the sentence to feed to the model. The default value is set to *256*.
91 | - convert_to_active():
92 | - Converts a sentence to active voice, if found in passive voice. Otherwise returns the same sentence.
93 | - Parameters:
94 | - *sent*:
95 | - The sentence that has to be converted.
96 | - magic_once():
97 |     - This is a wrapper method for the *augment_sent_mask_filling()*, *augment_sent_t5()* and *convert_to_active()* methods. Using it, a sentence can be augmented with all of the above techniques at once (see the usage sketch after this list).
98 |     - Since this method operates on individual sentences, it can easily be combined with other packages.
99 | - Parameters:
100 | - *sent*:
101 | - The sentence that has to be augmented.
102 | - *paraphrase_prefix*:
103 | - The prefix for the T5 model input.
104 | - *n_paraphrase_predictions*:
105 |         - The number of augmented sentences the function should return. The default value is set to *5*.
106 | - *paraphrase_top_k*:
107 |         - The *top_k* sampling value passed to the T5 model during generation. The default value is set to *120*.
108 | - *paraphrase_max_length*:
109 | - The max length of the sentence to feed to the model. The default value is set to *256*.
110 | - *n_mask_predictions*:
111 |         - The number of predictions the BERT mask filling model should generate. The default value is set to *None*.
112 | - *convert_to_active*:
113 | - If the sentence should be converted to active voice. The default value is set to *True*.
114 | - magic_lamp():
115 |     - This method can be used to augment a whole dataset. Currently accepted dataset formats are: `txt`, `csv`, `tsv` and `list`.
116 |     - If the dataset is a `list` or a `txt` file, a list of augmented sentences is returned and a `txt` file named *sentences_aug.txt* is saved with the augmented output.
117 |     - If the dataset is a labeled `csv` or `tsv` file, the data is augmented while preserving the label of each new sample, and a pandas dataframe of the augmented data is returned. A `tsv` file named `original_file_name_aug.tsv` is also generated with the augmented output.
118 | - Parameters:
119 | - *sentences*:
120 | - The dataset that has to be augmented. This can be a `Python List`, a `txt`, `csv` or `tsv` file.
121 | - *paraphrase_prefix*:
122 | - The prefix for the T5 model input.
123 | - *n_paraphrase_predictions*:
124 |         - The number of augmented sentences the function should return. The default value is set to *5*.
125 | - *paraphrase_top_k*:
126 |         - The *top_k* sampling value passed to the T5 model during generation. The default value is set to *120*.
127 | - *paraphrase_max_length*:
128 | - The max length of the sentence to feed to the model. The default value is set to *256*.
129 | - *n_mask_predictions*:
130 |         - The number of predictions the BERT mask filling model should generate. The default value is set to *None*.
131 | - *convert_to_active*:
132 | - If the sentence should be converted to active voice. The default value is set to *True*.
133 | - *label_column*:
134 | - The name of the column that contains labeled data. The default value is set to *None*. This parameter is not required to be set if the dataset is in a `Python List` or a `txt` file.
135 | - *data_column*:
136 | - The name of the column that contains data. The default value is set to *None*. This parameter too is not required if the dataset is a `Python List` or a `txt` file.
137 | - *column_names*:
138 |         - If the `csv` or `tsv` file does not have column names, a Python list has to be passed to give the columns names. The default value is set to *None*, since this function also accepts a `Python List` or a `txt` file, but the parameter has to be set when a `csv` or `tsv` file without a header is used.
139 |
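For reference, below is a minimal sketch of calling the individual methods directly (using the same models as the example above). Exact outputs will vary, since paraphrase generation is sampled:

```python
from textgenie import TextGenie

textgenie = TextGenie("hetpandya/t5-small-tapaco", "bert-base-uncased")

sent = "The video was posted on Facebook by Alex."

# Paraphrases generated by the T5 model
paraphrases = textgenie.augment_sent_t5(sent, "paraphrase: ", n_predictions=5)

# Variations generated by masking extracted keywords and letting BERT fill them
mask_filled = textgenie.augment_sent_mask_filling(sent, n_mask_predictions=5)

# Passive voice converted to active voice
active = textgenie.convert_to_active(sent)

# All of the above techniques applied to a single sentence
augmented = textgenie.magic_once(sent, "paraphrase: ", n_mask_predictions=5)

print(paraphrases, mask_filled, active, augmented, sep="\n")
```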
140 |
141 | ## References
142 | [Passive To Active](https://github.com/DanManN/pass2act) licensed under the Apache License 2.0
143 |
144 | ## Links
145 | Please find an in-depth explanation of the library [on my blog](https://towardsdatascience.com/textgenie-augmenting-your-text-dataset-with-just-2-lines-of-code-23ce883a0715).
146 |
147 | ## License
148 | Please check `LICENSE` for more details.
149 |
150 |
--------------------------------------------------------------------------------
/textgenie/textgenie.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 | from .grammar_utils import pass2act, is_passive
4 | from transformers import T5ForConditionalGeneration, T5Tokenizer
5 | from string import punctuation
6 | import os
7 | import pandas as pd
8 |
9 |
10 | def set_seed(seed):
11 | torch.manual_seed(seed)
12 |
13 |
14 | set_seed(42)
15 |
16 |
17 | class TextGenie:
18 | def __init__(
19 | self,
20 | paraphrase_model_name,
21 | mask_model_name=None,
22 | spacy_model_name="en_core_web_sm",
23 | device="cpu",
24 | ):
25 | tqdm.write("Loading Paraphrase Model..")
26 | self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
27 | paraphrase_model_name
28 | )
29 | self.paraphrase_tokenizer = T5Tokenizer.from_pretrained(paraphrase_model_name)
30 | self.paraphrase_model = self.paraphrase_model.to(device)
31 | self.device = device
32 |
33 | import spacy
34 | self.nlp = spacy.load(spacy_model_name)
35 |
36 | if mask_model_name:
37 | tqdm.write("Loading Mask Fill Model..")
38 | from transformers import pipeline
39 | from string import punctuation
40 |
41 | self.mask_augmenter = pipeline("fill-mask", model=mask_model_name)
42 |
43 | def extract_keywords(self, sentence):
44 | result = []
45 | pos_tag = ["PROPN", "NOUN", "ADJ"]
46 | consider_tags = ["NUM"]
47 | pos_tag = pos_tag + consider_tags
48 |
49 | doc = self.nlp(sentence)
50 |
51 | for token in doc:
52 | if (
53 | token.text in self.nlp.Defaults.stop_words or token.text in punctuation
54 | ) and token.pos_ not in consider_tags:
55 | continue
56 | if token.pos_ in pos_tag:
57 | result.append(token.text)
58 | return list(set(result))
59 |
60 | def augment_sent_mask_filling(self, sent, n_mask_predictions=5):
61 | keywords = self.extract_keywords(sent)
62 | augmented_sents = []
63 | for keyword in keywords:
64 | masked_sent = sent.replace(keyword, self.mask_augmenter.tokenizer.mask_token, 1)
65 | augmented_sents.extend(
66 | [
67 | generated_sent["sequence"]
68 | for generated_sent in self.mask_augmenter(
69 | masked_sent, top_k=n_mask_predictions
70 | )
71 | if generated_sent["sequence"].lower() != sent.lower()
72 | ]
73 | )
74 | return augmented_sents
75 |
76 | def augment_sent_t5(self, sent, prefix, n_predictions=5, top_k=120, max_length=256, add_suffix_token=True):
77 |         text = prefix + sent + (" </s>" if add_suffix_token else "")
78 | encoding = self.paraphrase_tokenizer.encode_plus(
79 | text, pad_to_max_length=True, return_tensors="pt"
80 | )
81 | input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding[
82 | "attention_mask"
83 | ].to(self.device)
84 |
85 | beam_outputs = self.paraphrase_model.generate(
86 | input_ids=input_ids,
87 | attention_mask=attention_masks,
88 | do_sample=True,
89 | max_length=max_length,
90 | top_k=top_k,
91 | top_p=0.98,
92 | early_stopping=True,
93 | num_return_sequences=n_predictions,
94 | )
95 |
96 | final_outputs = []
97 | for beam_output in beam_outputs:
98 | generated_sent = self.paraphrase_tokenizer.decode(
99 | beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True
100 | )
101 | if (
102 | generated_sent.lower() != sent.lower()
103 | and generated_sent not in final_outputs
104 | ):
105 | final_outputs.append(generated_sent)
106 | return final_outputs
107 |
108 | def convert_to_active(self, sent):
109 | if is_passive(sent, nlp=self.nlp):
110 | return pass2act(sent, nlp=self.nlp)
111 | else:
112 | return sent
113 |
114 | def magic_once(
115 | self,
116 | sent,
117 | paraphrase_prefix,
118 | n_paraphrase_predictions=5,
119 | paraphrase_top_k=120,
120 | paraphrase_max_length=256,
121 | n_mask_predictions=None,
122 | convert_to_active=True,
123 | add_suffix_token=True,
124 | ):
125 | sent = sent.strip()
126 | output = []
127 | output.append(sent)
128 | output += self.augment_sent_t5(
129 | sent,
130 | paraphrase_prefix,
131 | n_paraphrase_predictions,
132 | paraphrase_top_k,
133 |             paraphrase_max_length, add_suffix_token,
134 | )
135 | if n_mask_predictions and isinstance(n_mask_predictions, int):
136 | output += self.augment_sent_mask_filling(sent, n_mask_predictions)
137 | if convert_to_active:
138 | active_voice = self.convert_to_active(sent)
139 | if active_voice.lower() != sent.lower():
140 | output.append(active_voice)
141 | return list(set(output))
142 |
143 | def magic_lamp(
144 | self,
145 | sentences,
146 | paraphrase_prefix,
147 | n_paraphrase_predictions=5,
148 | paraphrase_top_k=120,
149 | paraphrase_max_length=256,
150 | n_mask_predictions=None,
151 | convert_to_active=True,
152 | label_column=None,
153 | data_column=None,
154 | column_names=None,
155 | add_suffix_token=True,
156 | ):
157 | all_sentences = None
158 | with_labels = False
159 | out_file = os.path.join(os.getcwd(), "sentences_aug.txt")
160 |
161 | if isinstance(sentences, str):
162 | sentences = os.path.join(os.getcwd(), sentences)
163 | if sentences.endswith(".txt"):
164 | all_sentences = open(sentences).read().strip().split("\n")
165 | elif sentences.endswith(".csv") or sentences.endswith(".tsv"):
166 | if not label_column:
167 | raise Exception(
168 | "Please provide the column name that contains labels using the 'label_column' parameter."
169 | )
170 | if not data_column:
171 | raise Exception(
172 | "Please provide the column name that contains data using the 'data_column' parameter."
173 | )
174 | if column_names and not isinstance(column_names, list):
175 | raise Exception("Please provide column names in a python list.")
176 | out_file = (
177 | sentences.replace(".csv", "").replace(".tsv", "") + "_aug.tsv"
178 | )
179 | with_labels = True
180 | if sentences.endswith(".csv"):
181 | if column_names:
182 | all_sentences = pd.read_csv(sentences, names=column_names)
183 | else:
184 | all_sentences = pd.read_csv(sentences)
185 | elif sentences.endswith(".tsv"):
186 | if column_names:
187 | all_sentences = pd.read_csv(
188 | sentences, names=column_names, sep="\t"
189 | )
190 | else:
191 |                         all_sentences = pd.read_csv(sentences, sep="\t")
192 | if label_column not in all_sentences.columns:
193 | raise Exception(
194 | "Please provide label column name for the dataset using the 'label_column' parameter. If already provided, please check for typos in the name of the label column."
195 | )
196 | if data_column not in all_sentences.columns:
197 | raise Exception(
198 | "Please provide data column name for the dataset using the 'data_column' parameter. If already provided, please check for typos in the name of the data column."
199 | )
200 | labels = all_sentences[label_column].unique()
201 |
202 | if all_sentences.iloc[0].tolist() == column_names:
203 |                     all_sentences = all_sentences.drop(0).reset_index(drop=True)
204 | augmented_data = []
205 |
206 | for ix in tqdm(range(all_sentences.shape[0])):
207 | sent = all_sentences[data_column][ix].strip()
208 | label = all_sentences[label_column][ix].strip()
209 | aug_sent = self.magic_once(
210 | sent,
211 | paraphrase_prefix,
212 | n_paraphrase_predictions,
213 | paraphrase_top_k,
214 | paraphrase_max_length,
215 | n_mask_predictions,
216 | convert_to_active,
217 | add_suffix_token,
218 | )
219 | aug_sent = [[s, label] for s in aug_sent]
220 | augmented_data.extend(aug_sent)
221 | augmented_data = pd.DataFrame(
222 | data=augmented_data, columns=["Text", "Label"]
223 | )
224 | augmented_data.to_csv(out_file, sep="\t", index=None)
225 | else:
226 | raise Exception(
227 |                     "Unsupported file format. Currently, the following formats are supported: list/txt/csv/tsv"
228 | )
229 | elif isinstance(sentences, list):
230 | all_sentences = sentences
231 | if all_sentences is None:
232 | raise Exception("Error: No sentences found.")
233 | if not with_labels:
234 | augmented_data = []
235 | for sent in tqdm(all_sentences):
236 | augmented_data.extend(
237 | self.magic_once(
238 | sent,
239 | paraphrase_prefix,
240 | n_paraphrase_predictions,
241 | paraphrase_top_k,
242 | paraphrase_max_length,
243 | n_mask_predictions,
244 | convert_to_active,
245 | add_suffix_token,
246 | )
247 | )
248 | with open(out_file, "w") as f:
249 | for line in augmented_data:
250 | f.write(line + "\n")
251 | tqdm.write(f"\nCompleted writing output to {out_file}.")
252 | return augmented_data
253 |
--------------------------------------------------------------------------------
/textgenie/grammar_utils.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | from spacy.matcher import Matcher
3 |
4 | try:
5 | import pattern.en as en
6 | from pattern.en import lexeme
7 | except LookupError:
8 | import nltk
9 |
10 | nltk.download("omw-1.4")
11 | import pattern.en as en
12 | from pattern.en import lexeme
13 |
14 | noundict = {
15 | "i": "me",
16 | "we": "us",
17 | "you": "you",
18 | "he": "him",
19 | "she": "her",
20 | "they": "them",
21 | "them": "they",
22 | "her": "she",
23 | "him": "he",
24 | "us": "we",
25 | "me": "i",
26 | }
27 |
28 |
29 | def nouninv(noun):
30 | n = noun.lower()
31 | if n in noundict:
32 | return noundict[n]
33 | return noun
34 |
35 |
36 | def pattern_stopiteration_workaround():
37 | try:
38 | print(lexeme("check"))
39 | except:
40 | pass
41 |
42 |
43 | pattern_stopiteration_workaround()
44 |
45 |
46 | def pass2act(doc, nlp, rec=False):
47 | """
48 | Author : Daniel Nohimovich & Zhekai Jin (Scott)
49 | Course : ECE 467 Natural Language Processing
50 | Instructor : Professor Carl Sable
51 | https://github.com/DanManN/pass2act
52 |     All rights reserved by the original author.
53 | This source code is licensed under the Apache License 2.0 found in the
54 | LICENSE file in the root directory of this source tree.
55 | """
56 | parse = nlp(doc)
57 | newdoc = ""
58 | for sent in parse.sents:
59 |
60 | # Init parts of sentence to capture:
61 | subjpass = ""
62 | subj = ""
63 | verb = ""
64 | verbaspect = ""
65 | verbtense = ""
66 | adverb = {"bef": "", "aft": ""}
67 | part = ""
68 | prep = ""
69 | agent = ""
70 | aplural = False
71 | advcltree = None
72 | aux = list(list(nlp(". .").sents)[0]) # start with 2 'null' elements
73 | xcomp = ""
74 | punc = "."
75 | # Analyse dependency tree:
76 | for word in sent:
77 | if word.dep_ == "advcl":
78 | if word.head.dep_ in ("ROOT", "auxpass"):
79 | advcltree = word.subtree
80 | if word.dep_ == "nsubjpass":
81 | if word.head.dep_ == "ROOT":
82 | subjpass = "".join(
83 | w.text_with_ws.lower()
84 | if w.tag_ not in ("NNP", "NNPS")
85 | else w.text_with_ws
86 | for w in word.subtree
87 | ).strip()
88 | if word.dep_ == "nsubj":
89 | subj = "".join(
90 | w.text_with_ws.lower()
91 | if w.tag_ not in ("NNP", "NNPS")
92 | else w.text_with_ws
93 | for w in word.subtree
94 | ).strip()
95 | if word.head.dep_ == "auxpass":
96 | if word.head.head.dep_ == "ROOT":
97 | subjpass = subj
98 | if word.dep_ in ("advmod", "npadvmod", "oprd"):
99 | if word.head.dep_ == "ROOT":
100 | if verb == "":
101 | adverb["bef"] = "".join(
102 | w.text_with_ws.lower()
103 | if w.tag_ not in ("NNP", "NNPS")
104 | else w.text_with_ws
105 | for w in word.subtree
106 | ).strip()
107 | else:
108 | adverb["aft"] = "".join(
109 | w.text_with_ws.lower()
110 | if w.tag_ not in ("NNP", "NNPS")
111 | else w.text_with_ws
112 | for w in word.subtree
113 | ).strip()
114 | if word.dep_ == "auxpass":
115 | if word.head.dep_ == "ROOT":
116 | if not subjpass:
117 | subjpass = subj
118 | if word.dep_ in ("aux", "auxpass", "neg"):
119 | if word.head.dep_ == "ROOT":
120 | aux += [word]
121 | if word.dep_ == "ROOT":
122 | verb = word.text
123 | if word.tag_ == "VB":
124 | verbtense = en.INFINITIVE
125 | elif word.tag_ == "VBD":
126 | verbtense = en.PAST
127 | elif word.tag_ == "VBG":
128 | verbtense = en.PRESENT
129 | verbaspect = en.PROGRESSIVE
130 | elif word.tag_ == "VBN":
131 | verbtense = en.PAST
132 | else:
133 | verbtense = en.tenses(word.text)[0][0]
134 | if word.dep_ == "prt":
135 | if word.head.dep_ == "ROOT":
136 | part = "".join(
137 | w.text_with_ws.lower()
138 | if w.tag_ not in ("NNP", "NNPS")
139 | else w.text_with_ws
140 | for w in word.subtree
141 | ).strip()
142 | if word.dep_ == "prep":
143 | if word.head.dep_ == "ROOT":
144 | prep = "".join(
145 | w.text_with_ws.lower()
146 | if w.tag_ not in ("NNP", "NNPS")
147 | else w.text_with_ws
148 | for w in word.subtree
149 | ).strip()
150 | if word.dep_.endswith("obj"):
151 | if word.head.dep_ == "agent":
152 | if word.head.head.dep_ == "ROOT":
153 | agent = "".join(
154 | w.text + ", "
155 | if w.dep_ == "appos"
156 | else (
157 | w.text_with_ws.lower()
158 | if w.tag_ not in ("NNP", "NNPS")
159 | else w.text_with_ws
160 | )
161 | for w in word.subtree
162 | ).strip()
163 | aplural = word.tag_ in ("NNS", "NNPS")
164 | if word.dep_ in ("xcomp", "ccomp", "conj"):
165 | if word.head.dep_ == "ROOT":
166 | xcomp = "".join(
167 | w.text_with_ws.lower()
168 | if w.tag_ not in ("NNP", "NNPS")
169 | else w.text_with_ws
170 | for w in word.subtree
171 | ).strip()
172 | that = xcomp.startswith("that")
173 |                     xcomp = pass2act(xcomp, nlp, True).strip(" .")
174 | if not xcomp.startswith("that") and that:
175 | xcomp = "that " + xcomp
176 | if word.dep_ == "punct" and not rec:
177 | if word.text != '"':
178 | punc = word.text
179 |
180 | # exit if not passive:
181 | if subjpass == "":
182 | newdoc += str(sent) + " "
183 | continue
184 |
185 | # if no agent is found:
186 | if agent == "":
187 | # what am I gonna do? BITconEEEEEEECT!!!!
188 | newdoc += str(sent) + " "
189 | continue
190 |
191 | # invert nouns:
192 | agent = nouninv(agent)
193 | subjpass = nouninv(subjpass)
194 |
195 |         # conjugate the verb phrase from the auxiliary chain:
196 | auxstr = ""
197 | num = en.SINGULAR if not aplural or agent in ("he", "she") else en.PLURAL
198 | aux.append(aux[0])
199 | verbaspect = None
200 | for (pp, p, a, n) in zip(aux, aux[1:], aux[2:], aux[3:]):
201 | if a.lemma_ == ".":
202 | continue
203 |
204 | if a.lemma_ == "not":
205 | if p.lemma_ == "be":
206 | if n.lemma_ == "be":
207 | verbtense = en.tenses(a.text)[0][0]
208 | auxstr += (
209 | en.conjugate(
210 | "be", tense=en.tenses(p.text)[0][0], number=num
211 | )
212 | + " "
213 | )
214 | verbaspect = en.PROGRESSIVE
215 | else:
216 | auxstr += (
217 | en.conjugate(
218 | "do", tense=en.tenses(p.text)[0][0], number=num
219 | )
220 | + " "
221 | )
222 | verbtense = en.INFINITIVE
223 | auxstr += "not "
224 | elif a.lemma_ == "be":
225 | if p.lemma_ == "be":
226 | verbtense = en.tenses(a.text)[0][0]
227 | auxstr += (
228 | en.conjugate("be", tense=en.tenses(a.text)[0][0], number=num)
229 | + " "
230 | )
231 | verbaspect = en.PROGRESSIVE
232 | elif p.tag_ == "MD":
233 | verbtense = en.INFINITIVE
234 | elif a.lemma_ == "have":
235 |                 num = en.PLURAL if p.tag_ == "MD" else num
236 | auxstr += (
237 | en.conjugate("have", tense=en.tenses(a.text)[0][0], number=num)
238 | + " "
239 | )
240 | if n.lemma_ == "be":
241 | verbaspect = en.PROGRESSIVE
242 | verbtense = en.tenses(n.text)[0][0]
243 | else:
244 | auxstr += a.text_with_ws
245 | auxstr = auxstr.lower().strip()
246 |
247 | if verbaspect:
248 | verb = en.conjugate(verb, tense=verbtense, aspect=verbaspect)
249 | else:
250 | verb = en.conjugate(verb, tense=verbtense)
251 |
252 | advcl = ""
253 | if advcltree:
254 | for w in advcltree:
255 | if w.pos_ == "VERB" and en.tenses(w.text)[0][4] == en.PROGRESSIVE:
256 | advcl += (
257 | "which "
258 | + en.conjugate(w.text, tense=en.tenses(verb)[0][0])
259 | + " "
260 | )
261 | else:
262 | advcl += w.text_with_ws
263 |
264 | newsent = (
265 | " ".join(
266 | list(
267 | filter(
268 | None,
269 | [
270 | agent,
271 | auxstr,
272 | adverb["bef"],
273 | verb,
274 | part,
275 | subjpass,
276 | adverb["aft"],
277 | advcl,
278 | prep,
279 | xcomp,
280 | ],
281 | )
282 | )
283 | )
284 | + punc
285 | )
286 | if not rec:
287 | newsent = newsent[0].upper() + newsent[1:]
288 | newdoc += newsent + " "
289 | return newdoc
290 |
291 |
292 | def is_passive(sentence, nlp):
293 | doc = nlp(sentence)
294 | passive_rule = [
295 | {"DEP": "nsubjpass"},
296 | {"DEP": "aux", "OP": "*"},
297 | {"DEP": "auxpass"},
298 | {"TAG": "VBN"},
299 | ]
300 |
301 | matcher = Matcher(nlp.vocab)
302 |
303 | matcher.add("Passive", [passive_rule])
304 | matches = matcher(doc)
305 | if matches:
306 | return True
307 | else:
308 | return False
309 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/examples/examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "textgenie-examples.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "code",
21 | "metadata": {
22 | "colab": {
23 | "base_uri": "https://localhost:8080/",
24 | "height": 1000
25 | },
26 | "id": "4b69iyum-FxW",
27 | "outputId": "8fb63125-6ac3-4fc8-c4c1-20fcfea72e31"
28 | },
29 | "source": [
30 | "!pip install --upgrade textgenie"
31 | ],
32 | "execution_count": 10,
33 | "outputs": [
34 | {
35 | "output_type": "stream",
36 | "text": [
37 | "Collecting git+https://github.com/hetpandya/textgenie.git\n",
38 | " Cloning https://github.com/hetpandya/textgenie.git to /tmp/pip-req-build-42y38bmw\n",
39 | " Running command git clone -q https://github.com/hetpandya/textgenie.git /tmp/pip-req-build-42y38bmw\n",
40 | "Requirement already satisfied, skipping upgrade: torch>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (1.9.0+cu102)\n",
41 | "Requirement already satisfied, skipping upgrade: transformers in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (4.7.0)\n",
42 | "Requirement already satisfied, skipping upgrade: sentencepiece in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (0.1.96)\n",
43 | "Requirement already satisfied, skipping upgrade: spacy in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (2.2.4)\n",
44 | "Requirement already satisfied, skipping upgrade: tqdm in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (4.41.1)\n",
45 | "Requirement already satisfied, skipping upgrade: pattern in /usr/local/lib/python3.7/dist-packages (from textgenie==0.1.2) (3.6)\n",
46 | "Requirement already satisfied, skipping upgrade: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=1.5.0->textgenie==0.1.2) (3.7.4.3)\n",
47 | "Requirement already satisfied, skipping upgrade: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (2019.12.20)\n",
48 | "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (2.23.0)\n",
49 | "Requirement already satisfied, skipping upgrade: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (1.19.5)\n",
50 | "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (4.5.0)\n",
51 | "Requirement already satisfied, skipping upgrade: filelock in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (3.0.12)\n",
52 | "Requirement already satisfied, skipping upgrade: huggingface-hub==0.0.8 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (0.0.8)\n",
53 | "Requirement already satisfied, skipping upgrade: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (0.10.3)\n",
54 | "Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (3.13)\n",
55 | "Requirement already satisfied, skipping upgrade: packaging in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (20.9)\n",
56 | "Requirement already satisfied, skipping upgrade: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers->textgenie==0.1.2) (0.0.45)\n",
57 | "Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.1.3)\n",
58 | "Requirement already satisfied, skipping upgrade: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (7.4.0)\n",
59 | "Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (2.0.5)\n",
60 | "Requirement already satisfied, skipping upgrade: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (0.4.1)\n",
61 | "Requirement already satisfied, skipping upgrade: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.0.0)\n",
62 | "Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.0.5)\n",
63 | "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (57.0.0)\n",
64 | "Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (1.0.5)\n",
65 | "Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (0.8.2)\n",
66 | "Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy->textgenie==0.1.2) (3.0.5)\n",
67 | "Requirement already satisfied, skipping upgrade: beautifulsoup4 in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (4.6.3)\n",
68 | "Requirement already satisfied, skipping upgrade: future in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (0.16.0)\n",
69 | "Requirement already satisfied, skipping upgrade: feedparser in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (6.0.7)\n",
70 | "Requirement already satisfied, skipping upgrade: lxml in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (4.2.6)\n",
71 | "Requirement already satisfied, skipping upgrade: nltk in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (3.2.5)\n",
72 | "Requirement already satisfied, skipping upgrade: cherrypy in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (18.6.0)\n",
73 | "Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (1.4.1)\n",
74 | "Requirement already satisfied, skipping upgrade: pdfminer.six in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (20201018)\n",
75 | "Requirement already satisfied, skipping upgrade: python-docx in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (0.8.11)\n",
76 | "Requirement already satisfied, skipping upgrade: backports.csv in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (1.0.7)\n",
77 | "Requirement already satisfied, skipping upgrade: mysqlclient in /usr/local/lib/python3.7/dist-packages (from pattern->textgenie==0.1.2) (2.0.3)\n",
78 | "Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (3.0.4)\n",
79 | "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (2021.5.30)\n",
80 | "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (1.24.3)\n",
81 | "Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers->textgenie==0.1.2) (2.10)\n",
82 | "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers->textgenie==0.1.2) (3.4.1)\n",
83 | "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers->textgenie==0.1.2) (2.4.7)\n",
84 | "Requirement already satisfied, skipping upgrade: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->textgenie==0.1.2) (1.0.1)\n",
85 | "Requirement already satisfied, skipping upgrade: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->textgenie==0.1.2) (1.15.0)\n",
86 | "Requirement already satisfied, skipping upgrade: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers->textgenie==0.1.2) (7.1.2)\n",
87 | "Requirement already satisfied, skipping upgrade: sgmllib3k in /usr/local/lib/python3.7/dist-packages (from feedparser->pattern->textgenie==0.1.2) (1.0.0)\n",
88 | "Requirement already satisfied, skipping upgrade: cheroot>=8.2.1 in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (8.5.2)\n",
89 | "Requirement already satisfied, skipping upgrade: zc.lockfile in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (2.0)\n",
90 | "Requirement already satisfied, skipping upgrade: jaraco.collections in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (3.3.0)\n",
91 | "Requirement already satisfied, skipping upgrade: more-itertools in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (8.8.0)\n",
92 | "Requirement already satisfied, skipping upgrade: portend>=2.1.1 in /usr/local/lib/python3.7/dist-packages (from cherrypy->pattern->textgenie==0.1.2) (2.7.1)\n",
93 | "Requirement already satisfied, skipping upgrade: sortedcontainers in /usr/local/lib/python3.7/dist-packages (from pdfminer.six->pattern->textgenie==0.1.2) (2.4.0)\n",
94 | "Requirement already satisfied, skipping upgrade: cryptography in /usr/local/lib/python3.7/dist-packages (from pdfminer.six->pattern->textgenie==0.1.2) (3.4.7)\n",
95 | "Requirement already satisfied, skipping upgrade: jaraco.functools in /usr/local/lib/python3.7/dist-packages (from cheroot>=8.2.1->cherrypy->pattern->textgenie==0.1.2) (3.3.0)\n",
96 | "Requirement already satisfied, skipping upgrade: jaraco.classes in /usr/local/lib/python3.7/dist-packages (from jaraco.collections->cherrypy->pattern->textgenie==0.1.2) (3.2.1)\n",
97 | "Requirement already satisfied, skipping upgrade: jaraco.text in /usr/local/lib/python3.7/dist-packages (from jaraco.collections->cherrypy->pattern->textgenie==0.1.2) (3.5.0)\n",
98 | "Requirement already satisfied, skipping upgrade: tempora>=1.8 in /usr/local/lib/python3.7/dist-packages (from portend>=2.1.1->cherrypy->pattern->textgenie==0.1.2) (4.1.1)\n",
99 | "Requirement already satisfied, skipping upgrade: cffi>=1.12 in /usr/local/lib/python3.7/dist-packages (from cryptography->pdfminer.six->pattern->textgenie==0.1.2) (1.14.5)\n",
100 | "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.7/dist-packages (from tempora>=1.8->portend>=2.1.1->cherrypy->pattern->textgenie==0.1.2) (2018.9)\n",
101 | "Requirement already satisfied, skipping upgrade: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.12->cryptography->pdfminer.six->pattern->textgenie==0.1.2) (2.20)\n",
102 | "Building wheels for collected packages: textgenie\n",
103 | " Building wheel for textgenie (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
104 | " Created wheel for textgenie: filename=textgenie-0.1.2-cp37-none-any.whl size=8705 sha256=f8cb665c1c8c93f7792b85d66124020d326625f21d43873fb6d0db71f67437af\n",
105 | " Stored in directory: /tmp/pip-ephem-wheel-cache-8giac7ls/wheels/35/24/87/4f20f5d3fa823cf98bf2d27bb95281c19c3436f82888aa6adc\n",
106 | "Successfully built textgenie\n",
107 | "Installing collected packages: textgenie\n",
108 | " Found existing installation: textgenie 0.1.1\n",
109 | " Uninstalling textgenie-0.1.1:\n",
110 | " Successfully uninstalled textgenie-0.1.1\n",
111 | "Successfully installed textgenie-0.1.2\n"
112 | ],
113 | "name": "stdout"
114 | },
115 | {
116 | "output_type": "display_data",
117 | "data": {
118 | "application/vnd.colab-display-data+json": {
119 | "pip_warning": {
120 | "packages": [
121 | "textgenie"
122 | ]
123 | }
124 | }
125 | },
126 | "metadata": {
127 | "tags": []
128 | }
129 | }
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "metadata": {
135 | "id": "_h3lGuIV-KKI"
136 | },
137 | "source": [
138 | "from textgenie import TextGenie"
139 | ],
140 | "execution_count": 1,
141 | "outputs": []
142 | },
143 | {
144 | "cell_type": "code",
145 | "metadata": {
146 | "colab": {
147 | "base_uri": "https://localhost:8080/"
148 | },
149 | "id": "Tg2igVb6-UxL",
150 | "outputId": "790b4af8-9eb8-42ed-cf01-cb9cb5555e72"
151 | },
152 | "source": [
153 | "textgenie = TextGenie(\"ramsrigouthamg/t5_paraphraser\",'bert-base-uncased')"
154 | ],
155 | "execution_count": 2,
156 | "outputs": [
157 | {
158 | "output_type": "stream",
159 | "text": [
160 | "Loading Paraphrase Model..\n",
161 | "Loading Mask Fill Model..\n"
162 | ],
163 | "name": "stdout"
164 | },
165 | {
166 | "output_type": "stream",
167 | "text": [
168 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
169 | "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
170 | "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
171 | ],
172 | "name": "stderr"
173 | }
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "metadata": {
179 | "colab": {
180 | "base_uri": "https://localhost:8080/"
181 | },
182 | "id": "P06v8CkK-vHu",
183 | "outputId": "bec9a13a-58cd-4f91-bf1c-f2f652fff9e5"
184 | },
185 | "source": [
186 | "# Augment a list of sentences\n",
187 | "sentences = [\"The video was posted on Facebook by Alex.\",\"I plan to run it again this time\"]\n",
188 | "textgenie.magic_lamp(sentences,\"paraphrase: \",n_paraphrase_predictions=15,n_mask_predictions=15,convert_to_active=True)"
189 | ],
190 | "execution_count": 5,
191 | "outputs": [
192 | {
193 | "output_type": "stream",
194 | "text": [
195 | "\r 0%| | 0/2 [00:00, ?it/s]/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2111: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
196 | " FutureWarning,\n",
197 | "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:191: UserWarning: This sequence already has . In future versions this behavior may lead to duplicated eos tokens being added.\n",
198 | " f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added.\"\n",
199 | "100%|██████████| 2/2 [00:18<00:00, 9.20s/it]"
200 | ],
201 | "name": "stderr"
202 | },
203 | {
204 | "output_type": "stream",
205 | "text": [
206 | "\n",
207 | "Completed writing output to /content/sentences_aug.txt.\n"
208 | ],
209 | "name": "stdout"
210 | },
211 | {
212 | "output_type": "stream",
213 | "text": [
214 | "\n"
215 | ],
216 | "name": "stderr"
217 | },
218 | {
219 | "output_type": "execute_result",
220 | "data": {
221 | "text/plain": [
222 | "['the clip was posted on facebook by alex.',\n",
223 | " 'the video was posted on facebook by youtube.',\n",
224 | " 'the event was posted on facebook by alex.',\n",
225 | " 'the text was posted on facebook by alex.',\n",
226 | " 'the cover was posted on facebook by alex.',\n",
227 | " 'the story was posted on facebook by alex.',\n",
228 | " 'the article was posted on facebook by alex.',\n",
229 | " 'the film was posted on facebook by alex.',\n",
230 | " 'the video was posted on facebook by twitter.',\n",
231 | " 'Is it true that the video, posted in Facebook, was created by Alex?',\n",
232 | " 'the photo was posted on facebook by alex.',\n",
233 | " 'What videos have you seen on FaceBook (Alex)?',\n",
234 | " 'If I could capture this on Facebook, what would be the point of posting an Instagram video?',\n",
235 | " 'I just saw the video posted on Facebook by Alex Wenzel. This will impact to tell you more than we realise.',\n",
236 | " 'the video was posted on blogs by alex.',\n",
237 | " 'the video was posted on facebook by mtv.',\n",
238 | " 'the video was posted on youtube by alex.',\n",
239 | " 'the video was posted on tv by alex.',\n",
240 | " 'the video was posted on facebook by anonymous.',\n",
241 | " 'the single was posted on facebook by alex.',\n",
242 | " 'the video was posted on vine by alex.',\n",
243 | " 'the video was posted on facebook by rihanna.',\n",
244 | " 'the video was posted on facebook by members.',\n",
245 | " 'the video was posted on twitch by alex.',\n",
246 | " 'the video was posted on facebook by others.',\n",
247 | " 'the video was posted on amazon by alex.',\n",
248 | " 'the video was posted on facebook by fans.',\n",
249 | " 'Alex posted the video on Facebook. ',\n",
250 | " 'The video was posted on Facebook by Alex Pease.',\n",
251 | " 'the video was posted on facebook by fox.',\n",
252 | " 'the video was posted on facebook by her.',\n",
253 | " 'the video was posted on google by alex.',\n",
254 | " 'the video was posted on facebook by himself.',\n",
255 | " 'the song was posted on facebook by alex.',\n",
256 | " 'the video was posted on itunes by alex.',\n",
257 | " 'the video was posted on video by alex.',\n",
258 | " 'the video was posted on twitter by alex.',\n",
259 | " 'the video was posted on facebook by rt.',\n",
260 | " 'the video was posted on myspace by alex.',\n",
261 | " 'the video was posted on site by alex.',\n",
262 | " 'the trailer was posted on facebook by alex.',\n",
263 | " \"Watch Alex Rodriguez' Facebook video here.\",\n",
264 | " 'the video was posted on facebook by friends.',\n",
265 | " \"This video was posted on Facebook by Alex. I've been using a mobile app for the past few days but just cant seem to find the time to download it.\",\n",
266 | " 'the track was posted on facebook by alex.',\n",
267 | " 'the video was posted on mtv by alex.',\n",
268 | " 'the album was posted on facebook by alex.',\n",
269 | " 'This is a viral video I uploaded to Facebook and showed on my Facebook profile.',\n",
270 | " 'The video was posted on Facebook by Alex.',\n",
271 | " 'the announcement was posted on facebook by alex.',\n",
272 | " 'the video was posted on facebook by them.',\n",
273 | " 'the video was posted on internet by alex.',\n",
274 | " 'the video was posted on facebook by him.',\n",
275 | " 'i plan to run it again this month',\n",
276 | " 'i plan to run it again this.',\n",
277 | " 'i plan to run it again this week',\n",
278 | " 'I plan to run it again this time this time.',\n",
279 | " 'Is it possible to run it again after it starts if you want it again?',\n",
280 | " 'I plan to run it again this time this time this time I know the plot. I guess they will be able to continue to run it then.',\n",
281 | " 'i plan to run it again this ;',\n",
282 | " \"I plan to run this again this time this time around. I'll be writing more frequently than I have the time and the plan is much less complex.\",\n",
283 | " 'I plan to run it again this time this time this time.',\n",
284 | " \"I plan to run it again this time this time this time. I'm sure that I'll be able to find a runner again.\",\n",
285 | " 'I plan to run it again this time',\n",
286 | " \"I plan to run it again this time this time again this time I can't remember whether I really needed to keep it running but its good enough.\",\n",
287 | " 'I will run it again. I plan to run it again this time.',\n",
288 | " \"I'll run it now again but the second time I've completed.\",\n",
289 | " 'I plan to run it again this time this time. It actually helps me. So, I will run it again.',\n",
290 | " 'i plan to run it again this day',\n",
291 | " 'i plan to run it again this...',\n",
292 | " 'i plan to run it again this year',\n",
293 | " 'i plan to run it again this summer',\n",
294 | " 'What is the plan to run it again?',\n",
295 | " 'I plan to run It again this time now this time in Linux.',\n",
296 | " 'How is this book going to be run again?',\n",
297 | " 'i plan to run it again this season',\n",
298 | " \"I plan to run it again this time I'm not going to run it again this time. If I didn't run it, I don't expect to miss out on it.\",\n",
299 | " 'i plan to run it again this weekend',\n",
300 | " 'i plan to run it again this!',\n",
301 | " 'i plan to run it again this semester',\n",
302 | " 'i plan to run it again this?',\n",
303 | " 'i plan to run it again this morning',\n",
304 | " 'I can always run it again, I just want to try putting it back. This time a fortnight after.']"
305 | ]
306 | },
307 | "metadata": {
308 | "tags": []
309 | },
310 | "execution_count": 5
311 | }
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "metadata": {
317 | "colab": {
318 | "base_uri": "https://localhost:8080/"
319 | },
320 | "id": "fN8rA8bc-cF1",
321 | "outputId": "93d4aafb-5027-4c13-9f49-c7a54777cc7e"
322 | },
323 | "source": [
324 | "%%writefile sentences.txt\n",
325 | "At dinner, six shrimp were eaten by Harry.\n",
326 | "Beautiful giraffes roam the savannah."
327 | ],
328 | "execution_count": 6,
329 | "outputs": [
330 | {
331 | "output_type": "stream",
332 | "text": [
333 | "Writing sentences.txt\n"
334 | ],
335 | "name": "stdout"
336 | }
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "metadata": {
342 | "colab": {
343 | "base_uri": "https://localhost:8080/"
344 | },
345 | "id": "TQLNyJyH-t_T",
346 | "outputId": "f3765c08-2fd7-4b3f-bc33-6413fe94e4c4"
347 | },
348 | "source": [
349 | "# Augment data in a txt file\n",
350 | "textgenie.magic_lamp(\"sentences.txt\",\"paraphrase: \",n_mask_predictions=5,convert_to_active=True)"
351 | ],
352 | "execution_count": 7,
353 | "outputs": [
354 | {
355 | "output_type": "stream",
356 | "text": [
357 | "\r 0%| | 0/2 [00:00, ?it/s]/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2111: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
358 | " FutureWarning,\n",
359 | "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:191: UserWarning: This sequence already has . In future versions this behavior may lead to duplicated eos tokens being added.\n",
360 | " f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added.\"\n",
361 | "100%|██████████| 2/2 [00:07<00:00, 3.70s/it]"
362 | ],
363 | "name": "stderr"
364 | },
365 | {
366 | "output_type": "stream",
367 | "text": [
368 | "\n",
369 | "Completed writing output to /content/sentences_aug.txt.\n"
370 | ],
371 | "name": "stdout"
372 | },
373 | {
374 | "output_type": "stream",
375 | "text": [
376 | "\n"
377 | ],
378 | "name": "stderr"
379 | },
380 | {
381 | "output_type": "execute_result",
382 | "data": {
383 | "text/plain": [
384 | "['at lunch, six shrimp were eaten by harry.',\n",
385 | " 'at dinner, six shrimp were eaten by hand.',\n",
386 | " 'at night, six shrimp were eaten by harry.',\n",
387 | " 'At dinner, Harry was having 6 shrimps.',\n",
388 | " 'At dinner, Harry ate six shrimp.',\n",
389 | " 'at least, six shrimp were eaten by harry.',\n",
390 | " 'at dinner, six shrimp were eaten by him.',\n",
391 | " 'at dinner, six shrimp were eaten by chicken.',\n",
392 | " 'at dinner, six pancakes were eaten by harry.',\n",
393 | " 'at dinner, six shrimp were eaten by everyone.',\n",
394 | " 'at dinner, his shrimp were eaten by harry.',\n",
395 | " \"During Harry's dinner, he ate eight shrimp.\",\n",
396 | " 'at dinner, her shrimp were eaten by harry.',\n",
397 | " 'at dinner, these shrimp were eaten by harry.',\n",
398 | " 'at dinner, six eggs were eaten by harry.',\n",
399 | " 'Harry ate six shrimp at dinner. ',\n",
400 | " 'Harry ate six shrimp.',\n",
401 | " 'at first, six shrimp were eaten by harry.',\n",
402 | " 'at dinner, the shrimp were eaten by harry.',\n",
403 | " 'at dinner, some shrimp were eaten by harry.',\n",
404 | " 'at dinner, six sandwiches were eaten by harry.',\n",
405 | " \"During Harry's dinner, six shrimp were eaten by Harry.\",\n",
406 | " 'At dinner, six shrimp were eaten by Harry.',\n",
407 | " 'at dinner, six dishes were eaten by harry.',\n",
408 | " 'at dinner, six meals were eaten by harry.',\n",
409 | " 'at dinner, six shrimp were eaten by themselves.',\n",
410 | " 'How many beautiful giraffes do you see in the savannah?',\n",
411 | " 'black giraffes roam the savannah.',\n",
412 | " 'little giraffes roam the savannah.',\n",
413 | " 'the giraffes roam the savannah.',\n",
414 | " 'beautiful giraffes roam the park.',\n",
415 | " 'beautiful butterflies roam the savannah.',\n",
416 | " 'beautiful giraffes roam the land.',\n",
417 | " 'Beautiful giraffes roam the savannah.',\n",
418 | " 'large giraffes roam the savannah.',\n",
419 | " 'What are some impressive giraffes that roam the Savannah?',\n",
420 | " 'beautiful birds roam the savannah.',\n",
421 | " 'beautiful giraffes roam the streets.',\n",
422 | " 'beautiful giraffes roam the grounds.',\n",
423 | " 'beautiful animals roam the savannah.',\n",
424 | " 'In winter, in the middle of nowhere, a giraffe roams the Sabana Desert. What do they do?',\n",
425 | " 'beautiful creatures roam the savannah.',\n",
426 | " 'Beautiful giraffes roam the Savanna.',\n",
427 | " 'wild giraffes roam the savannah.',\n",
428 | " 'beautiful women roam the savannah.',\n",
429 | " 'beautiful giraffes roam the beach.']"
430 | ]
431 | },
432 | "metadata": {
433 | "tags": []
434 | },
435 | "execution_count": 7
436 | }
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "metadata": {
442 | "colab": {
443 | "base_uri": "https://localhost:8080/"
444 | },
445 | "id": "mzoLK-mq_H3e",
446 | "outputId": "eea8ad04-5cad-4094-b9d7-0dd601cec684"
447 | },
448 | "source": [
449 | "%%writefile dataset.csv\n",
450 | "Sue changed the flat tire., Label1\n",
451 | "The crew paved the entire stretch of highway., Label2\n",
452 | "The critic wrote a scathing review., Label1\n",
453 | "I will clean the house every Saturday., Label2 "
454 | ],
455 | "execution_count": 8,
456 | "outputs": [
457 | {
458 | "output_type": "stream",
459 | "text": [
460 | "Writing dataset.csv\n"
461 | ],
462 | "name": "stdout"
463 | }
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "metadata": {
469 | "colab": {
470 | "base_uri": "https://localhost:8080/"
471 | },
472 | "id": "xzW11dvT_eNw",
473 | "outputId": "cf322632-0ef5-46df-a672-9d955dcc1e20"
474 | },
475 | "source": [
476 | "# Augment data in a csv file with labels\n",
477 | "augmented_dataset = textgenie.magic_lamp(\"dataset.csv\",\"paraphrase: \",n_paraphrase_predictions=15,n_mask_predictions=15,convert_to_active=True,label_column=\"Label\",column_names=[\"Text\",\"Label\"])"
478 | ],
479 | "execution_count": 3,
480 | "outputs": [
481 | {
482 | "output_type": "stream",
483 | "text": [
484 | "\r 0%| | 0/4 [00:00, ?it/s]/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2111: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
485 | " FutureWarning,\n",
486 | "/usr/local/lib/python3.7/dist-packages/transformers/models/t5/tokenization_t5.py:191: UserWarning: This sequence already has . In future versions this behavior may lead to duplicated eos tokens being added.\n",
487 | " f\"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added.\"\n",
488 | "100%|██████████| 4/4 [00:30<00:00, 7.73s/it]"
489 | ],
490 | "name": "stderr"
491 | },
492 | {
493 | "output_type": "stream",
494 | "text": [
495 | "\n",
496 | "Completed writing output to /content/dataset_aug.csv.\n"
497 | ],
498 | "name": "stdout"
499 | },
500 | {
501 | "output_type": "stream",
502 | "text": [
503 | "\n"
504 | ],
505 | "name": "stderr"
506 | }
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "metadata": {
512 | "colab": {
513 | "base_uri": "https://localhost:8080/",
514 | "height": 402
515 | },
516 | "id": "VdejwBeM_uWQ",
517 | "outputId": "23326c3a-a75d-4c9f-9d5d-09b58b37f252"
518 | },
519 | "source": [
520 | "augmented_dataset"
521 | ],
522 | "execution_count": 4,
523 | "outputs": [
524 | {
525 | "output_type": "execute_result",
526 | "data": {
527 | "text/html": [
528 | "\n",
529 | "\n",
542 | "
\n",
543 | " \n",
544 | " \n",
545 | " | \n",
546 | " Text | \n",
547 | " Label | \n",
548 | "
\n",
549 | " \n",
550 | " \n",
551 | " \n",
552 | " | 0 | \n",
553 | " i changed the flat tire. | \n",
554 | " Label1 | \n",
555 | "
\n",
556 | " \n",
557 | " | 1 | \n",
558 | " Sue changed my flat tire. | \n",
559 | " Label1 | \n",
560 | "
\n",
561 | " \n",
562 | " | 2 | \n",
563 | " In the end Sue changed the flat tire. | \n",
564 | " Label1 | \n",
565 | "
\n",
566 | " \n",
567 | " | 3 | \n",
568 | " She changed my tire to flat. How can she fix t... | \n",
569 | " Label1 | \n",
570 | "
\n",
571 | " \n",
572 | " | 4 | \n",
573 | " reacher changed the flat tire. | \n",
574 | " Label1 | \n",
575 | "
\n",
576 | " \n",
577 | " | ... | \n",
578 | " ... | \n",
579 | " ... | \n",
580 | "
\n",
581 | " \n",
582 | " | 224 | \n",
583 | " i will clean the house every day. | \n",
584 | " Label2 | \n",
585 | "
\n",
586 | " \n",
587 | " | 225 | \n",
588 | " i will clean the house every evening. | \n",
589 | " Label2 | \n",
590 | "
\n",
591 | " \n",
592 | " | 226 | \n",
593 | " I can clean the house every Saturday. I make a... | \n",
594 | " Label2 | \n",
595 | "
\n",
596 | " \n",
597 | " | 227 | \n",
598 | " I plan to clean the house every weekend. How d... | \n",
599 | " Label2 | \n",
600 | "
\n",
601 | " \n",
602 | " | 228 | \n",
603 | " I plan to clean our house every weekend. It's ... | \n",
604 | " Label2 | \n",
605 | "
\n",
606 | " \n",
607 | "
\n",
608 | "
229 rows × 2 columns
\n",
609 | "
"
610 | ],
611 | "text/plain": [
612 | " Text Label\n",
613 | "0 i changed the flat tire. Label1\n",
614 | "1 Sue changed my flat tire. Label1\n",
615 | "2 In the end Sue changed the flat tire. Label1\n",
616 | "3 She changed my tire to flat. How can she fix t... Label1\n",
617 | "4 reacher changed the flat tire. Label1\n",
618 | ".. ... ...\n",
619 | "224 i will clean the house every day. Label2\n",
620 | "225 i will clean the house every evening. Label2\n",
621 | "226 I can clean the house every Saturday. I make a... Label2\n",
622 | "227 I plan to clean the house every weekend. How d... Label2\n",
623 | "228 I plan to clean our house every weekend. It's ... Label2\n",
624 | "\n",
625 | "[229 rows x 2 columns]"
626 | ]
627 | },
628 | "metadata": {
629 | "tags": []
630 | },
631 | "execution_count": 4
632 | }
633 | ]
634 | }
635 | ]
636 | }
--------------------------------------------------------------------------------