├── README.md ├── requirements.txt ├── rpunct ├── __init__.py ├── punctuate.py └── utils.py ├── setup.py ├── tests └── sample_text.txt └── training ├── prep_data.py └── train.py /README.md: -------------------------------------------------------------------------------- 1 | # ✏️ rpunct - Restore Punctuation 2 | [![forthebadge](https://forthebadge.com/images/badges/made-with-crayons.svg)]() 3 | 4 | This repo contains code for punctuation restoration. 5 | 6 | This package is intended for direct use as a punctuation restoration model for general English. Alternatively, you can use this for further fine-tuning on domain-specific texts for punctuation restoration tasks. 7 | It uses HuggingFace's `bert-base-uncased` model weights that have been fine-tuned for punctuation restoration. 8 | 9 | Punctuation restoration works on arbitrarily large text. 10 | It uses the GPU if one is available and otherwise defaults to the CPU. 11 | 12 | Punctuation and casing we restore: 13 | * Upper-casing 14 | * Period: **.** 15 | * Exclamation: **!** 16 | * Question Mark: **?** 17 | * Comma: **,** 18 | * Colon: **:** 19 | * Semi-colon: **;** 20 | * Apostrophe: **'** 21 | * Dash: **-** 22 | 23 | --------------------------- 24 | ## 🚀 Usage 25 | **Below is a quick way to get up and running with the model.** 26 | 1. First, install the package. 27 | ```bash 28 | pip install rpunct 29 | ``` 30 | 2. Sample Python code. 31 | ```python 32 | from rpunct import RestorePuncts 33 | # The default language is 'english' 34 | rpunct = RestorePuncts() 35 | rpunct.punctuate("""in 2018 cornell researchers built a high-powered detector that in combination with an algorithm-driven process called ptychography set a world record 36 | by tripling the resolution of a state-of-the-art electron microscope as successful as it was that approach had a weakness it only worked with ultrathin samples that were 37 | a few atoms thick anything thicker would cause the electrons to scatter in ways that could not be disentangled now a team again led by david muller the samuel b eckert 38 | professor of engineering has bested its own record by a factor of two with an electron microscope pixel array detector empad that incorporates even more sophisticated 39 | 3d reconstruction algorithms the resolution is so fine-tuned the only blurring that remains is the thermal jiggling of the atoms themselves""") 40 | # Outputs the following: 41 | # In 2018, Cornell researchers built a high-powered detector that, in combination with an algorithm-driven process called Ptychography, set a world record by tripling the 42 | # resolution of a state-of-the-art electron microscope. As successful as it was, that approach had a weakness. It only worked with ultrathin samples that were a few atoms 43 | # thick. Anything thicker would cause the electrons to scatter in ways that could not be disentangled. Now, a team again led by David Muller, the Samuel B. 44 | # Eckert Professor of Engineering, has bested its own record by a factor of two with an Electron microscope pixel array detector empad that incorporates even more 45 | # sophisticated 3d reconstruction algorithms. The resolution is so fine-tuned the only blurring that remains is the thermal jiggling of the atoms themselves. 
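# You can also skip automatic language detection by passing lang='en' explicitly
# (detection only runs on inputs longer than 10 characters, so very short inputs need this):
# rpunct.punctuate("this is a short sample", lang='en')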
46 | ``` 47 | 48 | ----------------------------------------------- 49 | ## 🎯 Accuracy 50 | Here is the number of product reviews we used for fine-tuning the model: 51 | 52 | | Language | Number of text samples | 53 | | -------- | ----------------- | 54 | | English | 560,000 | 55 | 56 | We found the best convergence around _**3 epochs**_, which is what is presented here and available for download. 57 | 58 | ----------------------------------------------- 59 | The fine-tuned model obtained the following accuracy on 45,990 held-out text samples: 60 | 61 | | Accuracy | Overall F1 | Eval Support | 62 | | -------- | ---------------------- | ------------------- | 63 | | 91% | 90% | 45,990 | 64 | 65 | ----------------------------------------------- 66 | ## 💻🎯 Further Fine-Tuning 67 | 68 | To start fine-tuning or training, please look into the `training/train.py` file. 69 | Running `python training/train.py` will replicate the results of this model. 70 | 71 | ----------------------------------------------- 72 | ## ☕ Contact 73 | Contact [Daulet Nurmanbetov](daulet.nurmanbetov@gmail.com) for questions, feedback and/or requests for similar models. 74 | 75 | ----------------------------------------------- 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langdetect==1.0.9 2 | pandas==1.2.4 3 | simpletransformers==0.61.4 4 | six==1.16.0 5 | torch==1.8.1 6 | -------------------------------------------------------------------------------- /rpunct/__init__.py: -------------------------------------------------------------------------------- 1 | from .punctuate import RestorePuncts 2 | -------------------------------------------------------------------------------- /rpunct/punctuate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 💾⚙️🔮 3 | 4 | __author__ = "Daulet N." 5 | __email__ = "daulet.nurmanbetov@gmail.com" 6 | 7 | import logging 8 | from langdetect import detect 9 | from simpletransformers.ner import NERModel 10 | 11 | 12 | class RestorePuncts: 13 | def __init__(self, wrds_per_pred=250): 14 | self.wrds_per_pred = wrds_per_pred 15 | self.overlap_wrds = 30 16 | self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U'] 17 | self.model = NERModel("bert", "felflare/bert-restore-punctuation", labels=self.valid_labels, 18 | args={"silent": True, "max_seq_length": 512}) 19 | 20 | def punctuate(self, text: str, lang: str = ''): 21 | """ 22 | Performs punctuation restoration on arbitrarily large text. 23 | Detects if the input is not English; if non-English text is detected, predictions are terminated. 24 | Override this check by supplying `lang='en'`. 25 | 26 | Args: 27 | - text (str): Text to punctuate; can be anything from a few words to arbitrarily large text. 28 | - lang (str): Explicit language of the input text. 29 | """ 30 | if not lang and len(text) > 10: 31 | lang = detect(text) 32 | if lang != 'en': 33 | raise Exception(f"""Non-English text detected. Restore Punctuation works only for English. 34 | If you are certain the input is English, pass argument lang='en' to this function. 
35 | Punctuate received: {text}""") 36 | 37 | # Split up large text into BERT-digestible chunks 38 | splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds) 39 | # predict slices 40 | # full_preds_lst contains tuples of labels and logits 41 | full_preds_lst = [self.predict(i['text']) for i in splits] 42 | # extract predictions, and discard logits 43 | preds_lst = [i[0][0] for i in full_preds_lst] 44 | # join text slices 45 | combined_preds = self.combine_results(text, preds_lst) 46 | # create punctuated prediction 47 | punct_text = self.punctuate_texts(combined_preds) 48 | return punct_text 49 | 50 | def predict(self, input_slice): 51 | """ 52 | Passes the unpunctuated text to the model for punctuation. 53 | """ 54 | predictions, raw_outputs = self.model.predict([input_slice]) 55 | return predictions, raw_outputs 56 | 57 | @staticmethod 58 | def split_on_toks(text, length, overlap): 59 | """ 60 | Splits text into predefined slices of overlapping text with indexes (offsets) 61 | that tie back to the original text. 62 | This is done to bypass the 512-token limit on transformer models by sequentially 63 | feeding chunks of < 512 toks. 64 | Example output: 65 | [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}] 66 | """ 67 | wrds = text.replace('\n', ' ').split(" ") 68 | resp = [] 69 | lst_chunk_idx = 0 70 | i = 0 71 | 72 | while True: 73 | # words in the chunk and the overlapping portion 74 | wrds_len = wrds[(length * i):(length * (i + 1))] 75 | wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)] 76 | wrds_split = wrds_len + wrds_ovlp 77 | 78 | # Break loop if no more words 79 | if not wrds_split: 80 | break 81 | 82 | wrds_str = " ".join(wrds_split) 83 | nxt_chunk_start_idx = len(" ".join(wrds_len)) 84 | lst_char_idx = len(" ".join(wrds_split)) 85 | 86 | resp_obj = { 87 | "text": wrds_str, 88 | "start_idx": lst_chunk_idx, 89 | "end_idx": lst_char_idx + lst_chunk_idx, 90 | } 91 | 92 | resp.append(resp_obj) 93 | lst_chunk_idx += nxt_chunk_start_idx + 1 94 | i += 1 95 | logging.info(f"Sliced transcript into {len(resp)} slices.") 96 | return resp 97 | 98 | @staticmethod 99 | def combine_results(full_text: str, text_slices): 100 | """ 101 | Given the full text and the predictions for each slice, combines the predictions into a single text again. 
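Here `text_slices` is a list of per-slice predictions, where each prediction is a list of single-item dicts mapping a word to its predicted label (the format returned by `predict`).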
102 | Performs validation that the text was combined correctly. 103 | """ 104 | split_full_text = full_text.replace('\n', ' ').split(" ") 105 | split_full_text = [i for i in split_full_text if i] 106 | split_full_text_len = len(split_full_text) 107 | output_text = [] 108 | index = 0 109 | 110 | if len(text_slices[-1]) <= 3 and len(text_slices) > 1: 111 | text_slices = text_slices[:-1] 112 | 113 | for _slice in text_slices: 114 | slice_wrds = len(_slice) 115 | for ix, wrd in enumerate(_slice): 116 | # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index]) 117 | if index == split_full_text_len: 118 | break 119 | 120 | if split_full_text[index] == str(list(wrd.keys())[0]) and \ 121 | ix <= slice_wrds - 3 and text_slices[-1] != _slice: 122 | index += 1 123 | pred_item_tuple = list(wrd.items())[0] 124 | output_text.append(pred_item_tuple) 125 | elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice: 126 | index += 1 127 | pred_item_tuple = list(wrd.items())[0] 128 | output_text.append(pred_item_tuple) 129 | assert [i[0] for i in output_text] == split_full_text 130 | return output_text 131 | 132 | @staticmethod 133 | def punctuate_texts(full_pred: list): 134 | """ 135 | Given a list of predictions from the model, applies the predictions to the text, 136 | thus punctuating it. 137 | """ 138 | punct_resp = "" 139 | for i in full_pred: 140 | word, label = i 141 | if label[-1] == "U": 142 | punct_wrd = word.capitalize() 143 | else: 144 | punct_wrd = word 145 | 146 | if label[0] != "O": 147 | punct_wrd += label[0] 148 | 149 | punct_resp += punct_wrd + " " 150 | punct_resp = punct_resp.strip() 151 | # Append a trailing period if one doesn't exist. 152 | if punct_resp[-1].isalnum(): 153 | punct_resp += "." 154 | return punct_resp 155 | 156 | 157 | if __name__ == "__main__": 158 | punct_model = RestorePuncts() 159 | # read test file 160 | with open('../tests/sample_text.txt', 'r') as fp: 161 | test_sample = fp.read() 162 | # predict text and print 163 | punctuated = punct_model.punctuate(test_sample) 164 | print(punctuated) 165 | -------------------------------------------------------------------------------- /rpunct/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 💾⚙️🔮 3 | 4 | __author__ = "Daulet N." 5 | __email__ = "daulet.nurmanbetov@gmail.com" 6 | 7 | def prepare_unpunct_text(text): 8 | """ 9 | Given a text, normalizes it to subsequently restore punctuation 10 | """ 11 | formatted_txt = text.replace('\n', '').strip() 12 | formatted_txt = formatted_txt.lower() 13 | formatted_txt_lst = formatted_txt.split(" ") 14 | punct_strp_txt = [strip_punct(i) for i in formatted_txt_lst] 15 | normalized_txt = " ".join([i for i in punct_strp_txt if i]) 16 | return normalized_txt 17 | 18 | def strip_punct(wrd): 19 | """ 20 | Given a word, strips non-alphanumeric characters that precede and follow it 21 | """ 22 | if not wrd: 23 | return wrd 24 | 25 | while not wrd[-1:].isalnum(): 26 | if not wrd: 27 | break 28 | wrd = wrd[:-1] 29 | 30 | while not wrd[:1].isalnum(): 31 | if not wrd: 32 | break 33 | wrd = wrd[1:] 34 | return wrd 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 💾⚙️🔮 3 | 4 | __author__ = "Daulet N." 
5 | __email__ = "daulet.nurmanbetov@gmail.com" 6 | 7 | from setuptools import find_packages, setup 8 | 9 | with open("README.md", "r") as fh: 10 | long_description = fh.read() 11 | 12 | with open("requirements.txt", "r") as fh: 13 | requirements = fh.readlines() 14 | requirements = [i.strip() for i in requirements] 15 | 16 | setup( 17 | name="rpunct", 18 | version="1.0.2", 19 | author="Daulet Nurmanbetov", 20 | author_email="daulet.nurmanbetov@gmail.com", 21 | description="An easy-to-use package to restore punctuation of text.", 22 | long_description=long_description, 23 | long_description_content_type="text/markdown", 24 | url="https://github.com/Felflare/rpunct", 25 | packages=find_packages(), 26 | classifiers=[ 27 | "Intended Audience :: Science/Research", 28 | "License :: OSI Approved :: MIT License", 29 | "Programming Language :: Python :: 3", 30 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 31 | ], 32 | python_requires=">=3.6", 33 | install_requires=requirements, 34 | ) -------------------------------------------------------------------------------- /tests/sample_text.txt: -------------------------------------------------------------------------------- 1 | a rising price does not tell you something is working if you don’t believe that’s true consider what happened to holders of tulip bulbs in the 1600s that’s how michael w green sees cryptocurrency it’s like driving uphill with no brakes and those relentlessly boosting it refuse to acknowledge its profound vulnerabilities and downsides what happens for example when you lose your crypto-key what if a 51 attack happens in which 51 of the processing power in a blockchain network come under attack by a group with immense computing power this is to say nothing of the environmental aspect of bitcoin mining which uses more energy than entire nations or the fact that groups like the al-qassam brigades are using cryptocurrency to finance their operations and solicit donations michael has been an investor for more than 30 years he recently joined simplify where he is introducing new innovations in etfs he has previously been at logica capital advisers where he was the chief strategist thiel macro where he managed the personal capital of peter thiel and canyon capital advisors a $23 billion multi-strategy hedge fund based in los angeles he also founded ice farm capital a discretionary global macro hedge fund seeded by soros fund management i came across michael in this debate billed as the best bitcoin debate ever recorded it’s really worthwhile as is following michael on twitter earlier today balaji s srinivasan made the case for crypto here’s michael with the case against the bitcoin universe is awash in “old man shouts at bitcoin memes in response to charlie munger’s recent assertion that the explosive growth of bitcoin is contrary to the interests of civilization i like memes as much as the next middle-aged guy interested in tech but in my view the snarky reaction to munger speaks to something quite deep a divergent view about the nature of civilization bitcoin advocates like my debate partner balaji s srinivasan generally fall into a category of “techno-progressives individuals who believe that the combination of unfettered free markets and technology inevitably advance society here’s how balaji put it in 2015 squint a few years out to visualize the complete marginalization of today’s establishment the future is nationalists vs technologists a full-throated jealous defender of borders language and culture or a rootless 
cosmopolitan with a laptop bent on callow disruption the primary axis won’t be left versus right he argued but the cloud versus the land “you have your elections and we have our search engines vote how you want we will move where we like. in other words the nation-state is necessarily fighting a rear-guard anti-progress agenda and the individual assuming they are technologically savvy and wealthy is necessarily unbound as a “sovereign individual (a bit of a bible to the bitcoiners) by stripping the provision of “money from the state bitcoin has become the ultimate expression of freedom from government interference based on this belief system bitcoin must by definition be a social good munger in contrast is expressing contrasting beliefs about human nature and about the nature of politics and markets namely that our inevitable flaws including our lack of perfect foresight require guardrails to reduce the risk of unintended consequences like for example speculation in “low-risk mortgages that drove a global financial crisis without borders languages culture and yes fiat currency our civilization can be easily placed at risk whether the techno-progressive belief system or munger’s is correct is uncertain at this point i am no apologist for the existing system the united states government has made and continues to make unconscionable choices that harm our society especially the young a cursory review of the student loan debacle provides more than enough evidence to make that plain such choices are negatively impacting our ability to work towards our common goals a condition i discuss at length in recent media appearances those concerns do not change the fact that a system built around bitcoin despite its obvious technological innovation is fundamentally flawed a real world game of monopoly where the objective becomes to take wealth from others rather than work both individually and collectively to raise living standards for all i want to specify that i am referring to the rule set of bitcoin specifically rather than broader innovations in cryptocurrency while many other cryptocurrency experiments have flaws as the early leader to the space bitcoin has created unique vulnerabilities by growing rapidly in an environment without a regulatory regime prepared for it what’s worse is that its proponents are more than willing to conceal the facts to promote its success for this reason alone we should be skeptical indeed the bitcoin narrative is built on a foundation of half-truths untruths social darwinism cynicism an odd comfort with criminality and nonchalance about the security provided by the nation-state crypto advocates are fun to follow on twitter but they won’t tell you the following 1 money exists for one purpose to cancel debt written on the front of every dollar bill is the phrase “this note is legal tender for all debts public and private. 
your taxes are paid in fiat currency your mortgage is payable in fiat currency and the brief liability created by the purchase of your morning coffee is settled in fiat currency any dispute surrounding these debts will be resolved in a court system that will allow settlement in fiat currency or place you in jail (or worse if you refuse to honor that settlement the police courts and military are funded via the issuance and receipt (in the form of taxes and fees of fiat currency in contrast bitcoin is a speculative asset that like all assets requires systems of law and force to protect it i recently debated the chief strategist of kraken among the largest crypto exchanges on this subject he conceded that bitcoin required protection from malicious actors and suggested u.s government intervention to protect his investments there are no atheists in foxholes 2 i am not an apologist for american hegemony and all the behaviors it has enabled but imagine the counterfactual over the course of the 20th century the relative standard of living of those who lived under the protective umbrella of pax americana exploded relative to those living under the competing soviet or chinese systems while techno-optimists will suggest that the counterfactual is utopian the evidence on the ground is far darker i would encourage a read of the work of radigan carter a pseudonymous (and disenchanted u.s special forces operative who has written eloquently on the subject and has argued that a world without u.s leadership is a world even he would be afraid of (radigan is uncertain about crypto and holds a small allocation. 3 china iran and russia are playing the dominant role in the world of cryptocurrency in the last week of april mining pools based in china accounted for roughly 90 of the processing power (“hash rate” in the bitcoin network roughly three weeks ago a power outage in the xinjiang region of china resulted in a plunge in global bitcoin processing bitcoin mining the process of record keeping for the “immutable chain of record on which the bitcoin network depends is dominated by entities in countries with the stated objective to harm the interests of the united states bitcoin proponents continuously assure us that this is “just about to change, but the data has not shifted in a meaningful manner in the last five years this is not a decentralized system it is centralized in the countries that seek our destruction 4 when peter thiel floated the idea a few weeks ago that perhaps “bitcoin should also be thought of in part as a chinese financial weapon against the u.s., he wasn’t mincing words “it threatens fiat money but it especially threatens the u.s dollar, he said with china having banned domestic ownership and usage of bitcoin while dominating bitcoin production and encouraging foreign speculation in the asset this seems a reasonable avenue of exploration 5 those bullish on crypto love to point out that there is more criminal activity occurring in u.s dollars than in bitcoin given the dominance of the dollar in economic activity this should surprise absolutely no one what they leave out is that roughly 40 of crypto transactions are used for illicit activity the data on crypto usage in criminal activity is intentionally obscured within the industry while statistics like <1 are regularly reported this data mixes crypto exchange volumes (similar to the trading volumes on the new york stock exchange but admittedly manipulated with end economic activity using the most recent estimates for final purchases 
involving crypto (similar to the calculation of gdp) the roughly $400 million per month in illicit activity for 2020 would equate to greater than 40 of all bitcoin usage this week’s attack on a u.s energy pipeline carried out by a terrorist group operating out of russia and requesting ransomware payment in bitcoin suggests munger is not unfair in his characterization of bitcoin as useful to extortionists 6 bitcoin mining is remarkably energy intensive even bitcoin’s most ardent proponents acknowledge that bitcoin energy consumption has now reached levels roughly equivalent to that of sweden elon musk who agreed to accept bitcoin for tesla purchases as of this week has abandoned that plan as the company came under fire for the environmental damage created by bitcoin mining he is far from alone bitcoin usage for purchases of goods and services has stagnated.the latest data available from statista suggests that the largest use for bitcoin in end markets is for prepaid gift cards followed by payment for internet (largely pornography and gambling and vpn services after stagnating for years at around $4 billion per year historical data on “merchant volumes has largely disappeared from the data record odd for a transparent “ledger of record. 7 bitcoin marketing is designed to mislead with an expressed refusal to report anything negative a strategy frequently used to great effect in silicon valley to promote entities of dubious value like wework if that seems far-fetched i would encourage readers to watch the shenanigans of michael “bitstein goldstein in his 2019 presentation “the art of bitcoin rhetoric how to meme bitcoin to the moon. or perhaps read elaine ou’s “reject nocoiner orthodoxy. 8 just because the price of something is going up does not mean there it has an underlying value nothing makes that case clearer than the story of dogecoin the latest crypto sensation dogecoin exposes both the current mania and the irrelevance of many of bitcoin’s claimed attributes it is not scarce nor secure nor limited elon musk admitted on “saturday night live that it’s a “hustle. and yet the price of dogecoin has risen far more rapidly in recent months than bitcoin 9 while arguing for a utopian future bitcoin proponents embrace the time-tested methods of exploiting societal fear to drive adoption as social animals we are terrified of being left behind by the tribe the language of bitcoin “not gonna make it or “have fun staying poor are intentionally designed to exploit this amygdala response and drive participation in a scheme that relies entirely on driving additional participation to generate gains for current participants 10 the final lie is the most important “bitcoin is unstoppable. 
over a year ago some talented technologists proposed a mechanism by which a state actor could disable the bitcoin network at remarkably low cost this challenge went ignored until i raised it in a recent debate with promoter anthony “pomp pompliano in a break with bitcoin promotion orthodoxy this was acknowledged as valid by the bitcoin maximalist andreas antonopolous and a solution to the threat remains elusive why are you unaware of this because the objective of bitcoin promoters is to drive speculative activity into bitcoin rather than offer you an informed choice with a focus on delegitimizing government substituting offensive memes for culture crowding out legitimate energy uses and supporting anarchy in the name of individual “freedom to generate speculative profits i’d suggest that despite his advanced age charlie munger not to mention gary gensler janet yellen peter thiel and nassim taleb still gets it -------------------------------------------------------------------------------- /training/prep_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 💾⚙️🔮 3 | 4 | __author__ = "Daulet N." 5 | __email__ = "daulet.nurmanbetov@gmail.com" 6 | 7 | import os 8 | import re 9 | import json 10 | import random 11 | import pandas as pd 12 | 13 | 14 | def create_train_datasets(): 15 | output_file_names = [] 16 | download_df() 17 | for i in ['yelp_polarity_reviews_train.csv', 'yelp_polarity_reviews_test.csv']: 18 | name = i.split(".")[0] 19 | split_nm = name.split("_")[-1] 20 | df_name = name.split("_")[0] 21 | create_rpunct_dataset(i, f"{name}_data.json") 22 | output_file_names.append(f"{df_name}_{split_nm}.txt") 23 | create_training_samples(f"{name}_data.json", f"{df_name}_{split_nm}.txt") 24 | return output_file_names 25 | 26 | 27 | def download_df(dir_path=''): 28 | import tensorflow_datasets as tfds 29 | data_type = ['train', 'test'] 30 | ds = tfds.load('yelp_polarity_reviews', split=data_type, shuffle_files=True) 31 | for split_name, split_ds in zip(data_type, ds):  # one dataset per requested split 32 | split_df = tfds.as_dataframe(split_ds) 33 | csv_path = os.path.join(dir_path, f'yelp_polarity_reviews_{split_name}.csv') 34 | split_df.to_csv(csv_path, index=False) 35 | 36 | 37 | def create_record(row): 38 | """ 39 | Create labels for the punctuation restoration task for each token. 
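Each label has two characters: the first is the punctuation mark that follows the token ('O' if none), and the second is 'U' if the token starts with a capital letter, otherwise 'O'. For example, the token "Today," becomes the word "today" with the label ",U".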
40 | """ 41 | pattern = re.compile("[\W_]+") 42 | new_obs = [] 43 | 44 | observation = eval(row).decode().replace('\\n', ' ').split() 45 | 46 | for obs in observation: 47 | text_obs = obs.lower() 48 | text_obs = pattern.sub('', text_obs) 49 | 50 | if not text_obs: 51 | continue 52 | if not obs[-1].isalnum(): 53 | new_lab = obs[-1] 54 | else: 55 | new_lab = "O" 56 | if obs[0].isupper(): 57 | new_lab += "U" 58 | else: 59 | new_lab += "O" 60 | 61 | new_obs.append({'sentence_id': 0, 'words': text_obs, 'labels': new_lab}) 62 | return new_obs 63 | 64 | 65 | def create_rpunct_dataset(orig_yelp_dataframe, rpunct_dataset_path='rpunct_data.json'): 66 | df = pd.read_csv(orig_yelp_dataframe) 67 | # Filter to only positive examples 68 | df = df[df['label'] == 1].reset_index(drop=True) 69 | # Dataframe Shape 70 | print(f"Dataframe samples: {df.shape}") 71 | 72 | all_records = [] 73 | for i in range(df.shape[0]): 74 | orig_row = df['text'][i] 75 | records = create_record(orig_row) 76 | all_records.extend(records) 77 | 78 | with open(rpunct_dataset_path, 'w') as fp: 79 | json.dump(all_records, fp) 80 | 81 | 82 | def create_training_samples(json_loc_file, file_out_nm='train_data', num_splits=5): 83 | """ 84 | Given a looong list of tokens, splits them into 500 token chunks 85 | thus creating observations. This is for fine-tuning with simpletransformers 86 | later on. 87 | """ 88 | random.seed(1337) 89 | observations = [] 90 | _round = 0 91 | 92 | while _round < num_splits: 93 | with open(json_loc_file, 'r') as fp: 94 | all_records = json.load(fp) 95 | 96 | size = len(all_records) // num_splits 97 | all_records = all_records[size * _round:size * (_round + 1)] 98 | splits = create_tokenized_obs(all_records) 99 | full_data = pd.DataFrame(all_records) 100 | del all_records 101 | 102 | for i in splits: 103 | data_slice = full_data.iloc[i[0]:i[1], ] 104 | observations.append(data_slice.values.tolist()) 105 | _round += 1 106 | random.shuffle(observations) 107 | 108 | with open(f'{file_out_nm}_{_round}.txt', 'w') as fp2: 109 | json.dump(observations, fp2) 110 | 111 | del full_data 112 | del observations 113 | 114 | 115 | def create_tokenized_obs(input_list, num_toks=500, offset=250): 116 | """ 117 | Given a large set of tokens, determines splits of 118 | 500 token sized observations, with an offset(sliding window) of 250 tokens. 119 | It is important that first token is capitalized and we fed as many tokens as possible. 120 | In a real use-case we will not know where splits are so we'll just feed all tokens till limit. 121 | """ 122 | start = -1 123 | loop_end = -1 124 | appends = [] 125 | for ix, i in enumerate(input_list): 126 | if ix == loop_end: 127 | start = -1 128 | if i['labels'][-1] == "U" and start == -1: 129 | start = ix 130 | end = ix + num_toks 131 | appends.append((start, end)) 132 | loop_end = start + offset 133 | 134 | return appends 135 | 136 | if __name__ == "__main__": 137 | output_file_names = create_train_datasets() 138 | print(f"Created following files: {output_file_names}") 139 | -------------------------------------------------------------------------------- /training/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 💾⚙️🔮 3 | 4 | __author__ = "Daulet N." 
5 | __email__ = "daulet.nurmanbetov@gmail.com" 6 | 7 | import json 8 | from simpletransformers.ner import NERModel 9 | 10 | VALID_LABELS = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U'] 11 | 12 | 13 | def e2e_train(): 14 | prepare_data() 15 | steps, tr_details = train_model() 16 | print(f"Steps: {steps}; Train details: {tr_details}") 17 | 18 | 19 | def train_model(): 20 | """ 21 | Trains the simpletransformers NER model 22 | """ 23 | # Create a NERModel 24 | model = NERModel("bert", "bert-base-uncased", 25 | args={"overwrite_output_dir": True, 26 | "num_train_epochs": 3, 27 | "max_seq_length": 512, 28 | "lazy_loading": True}, 29 | labels=VALID_LABELS) 30 | 31 | # Train the model 32 | steps, tr_details = model.train_model('rpunct_train_set.txt') 33 | return steps, tr_details 34 | 35 | 36 | def prepare_data(): 37 | """ 38 | Prepares data from the original text into CoNLL-formatted datasets ready for training. 39 | In addition, constrains the label space to only the labels we care about. 40 | """ 41 | token_data = load_datasets(['yelp_train_1.txt', 'yelp_train_2.txt', 'yelp_train_3.txt', 'yelp_train_4.txt']) 42 | clean_up_labels(token_data, VALID_LABELS) 43 | eval_set = token_data[-int(len(token_data) * 0.10):] 44 | train_set = token_data[:int(len(token_data) * 0.90)] 45 | create_text_file(train_set, 'rpunct_train_set.txt') 46 | create_text_file(eval_set, 'rpunct_test_set.txt') 47 | 48 | 49 | def load_datasets(dataset_paths): 50 | """ 51 | Given a list of data paths, returns a single data object containing all data slices 52 | """ 53 | token_data = [] 54 | for d_set in dataset_paths: 55 | with open(d_set, 'r') as fp: 56 | data_slice = json.load(fp) 57 | token_data.extend(data_slice) 58 | del data_slice 59 | return token_data 60 | 61 | 62 | def get_label_stats(dataset): 63 | """ 64 | Generates frequency of different labels in the dataset. 65 | """ 66 | calcs = {} 67 | for i in dataset: 68 | for tok in i: 69 | if tok[2] not in calcs.keys(): 70 | calcs[tok[2]] = 1 71 | else: 72 | calcs[tok[2]] += 1 73 | print(calcs) 74 | return calcs 75 | 76 | 77 | def clean_up_labels(dataset, valid_labels): 78 | """ 79 | Given a list of valid labels, cleans up the dataset 80 | by limiting it to only the labels available. 81 | 82 | In addition, prepares observations for training. 83 | """ 84 | for ix, i in enumerate(dataset): 85 | for tok in i: 86 | tok[0] = ix 87 | if tok[2] not in valid_labels: 88 | case = tok[2][-1] 89 | tok[2] = f"O{case}" 90 | if len(tok[2]) < 2: 91 | tok[2] = "OO" 92 | 93 | 94 | def create_text_file(dataset, name): 95 | """ 96 | Create a CoNLL NER format file 97 | """ 98 | with open(name, 'w') as fp: 99 | for obs in dataset: 100 | for tok in obs: 101 | line = tok[1] + " " + tok[2] + '\n' 102 | fp.write(line) 103 | fp.write('\n') 104 | 105 | 106 | if __name__ == "__main__": 107 | print("Training the model.") 108 | e2e_train() 109 | --------------------------------------------------------------------------------