├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── rrnlp ├── __init__.py └── models │ ├── PICO_tagger.py │ ├── RCT_classifier.py │ ├── RoB_classifier.py │ ├── RoB_classifier_LR.py │ ├── __init__.py │ ├── encoder.py │ ├── ev_inf_classifier.py │ ├── sample_size_extractor.py │ ├── study_design_classifier.py │ ├── util │ ├── __init__.py │ ├── index_numbers.py │ ├── minimap │ │ ├── __init__.py │ │ ├── cui_to_mh.pck │ │ ├── cui_to_mh_supp.pck │ │ ├── ignorelist.txt │ │ ├── minimap.py │ │ ├── prepositions_conjunctions.txt │ │ ├── str_to_cui.pck │ │ ├── str_to_cui_supp.pck │ │ └── subtrees.pck │ └── schwartz_hearst.py │ └── weights │ ├── .gitkeep │ └── weights_manifest.json └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 byron wallace 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RRnlp 2 | 3 | This library provides (easy!) access to a suite of models for extracting key data from abstracts of randomized controlled trials (RCTs). 4 | 5 | In particular, `rrnlp` features lightweight variants of the models defined in Trialstreamer (https://trialstreamer.robotreviewer.net/; https://academic.oup.com/jamia/article/27/12/1903/5907063). However, the models here — all save for the sample size extractor constructed as linear layers on top of `SciBERT` representations, with only minimal fine tuning of `SciBERT` layers — are still experimental, and may not be as performant as the models used in Trialstreamer (yet!). See below for example usage. 6 | 7 | # Use 8 | 9 | ```python 10 | import rrnlp 11 | 12 | trial_reader = rrnlp.TrialReader() 13 | 14 | ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 15 | "ab": '''Background: Current strategies for preventing severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are limited to nonpharmacologic interventions. 
Hydroxychloroquine has been proposed as a postexposure therapy to prevent coronavirus disease 2019 (Covid-19), but definitive evidence is lacking.\n\nMethods: We conducted an open-label, cluster-randomized trial involving asymptomatic contacts of patients with polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, Spain. We randomly assigned clusters of contacts to the hydroxychloroquine group (which received the drug at a dose of 800 mg once, followed by 400 mg daily for 6 days) or to the usual-care group (which received no specific therapy). The primary outcome was PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary outcome was SARS-CoV-2 infection, defined by symptoms compatible with Covid-19 or a positive PCR test regardless of symptoms. Adverse events were assessed for up to 28 days.\n\nResults: The analysis included 2314 healthy contacts of 672 index case patients with Covid-19 who were identified between March 17 and April 28, 2020. A total of 1116 contacts were randomly assigned to receive hydroxychloroquine and 1198 to receive usual care. Results were similar in the hydroxychloroquine and usual-care groups with respect to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 to 1.42]). In addition, hydroxychloroquine was not associated with a lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 17.8%, respectively). The incidence of adverse events was higher in the hydroxychloroquine group than in the usual-care group (56.1% vs. 5.9%), but no treatment-related serious adverse events were reported.\n\nConclusions: Postexposure therapy with hydroxychloroquine did not prevent SARS-CoV-2 infection or symptomatic Covid-19 in healthy persons exposed to a PCR-positive case patient. 
(Funded by the crowdfunding campaign YoMeCorono and others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.).'''} 16 | 17 | preds = trial_reader.read_trial(ti_abs) 18 | ``` 19 | 20 | Should yield the following dictionary 21 | 22 | ```python 23 | import pprint 24 | pp = pprint.PrettyPrinter(width=200) 25 | pp.pprint(preds) 26 | 27 | {'bias_ab_bot': {'prob_low_rob': 0.14128409107623344}, 28 | 'pico_span_bot': {'i': ['hydroxychloroquine', 'Hydroxychloroquine', 'usual care', 'drug', 'usual-care group (which received no specific therapy', 'hydroxychloroquine group'], 29 | 'i_mesh': [{'cui': 'C0020336', 'mesh_term': 'Hydroxychloroquine', 'mesh_ui': 'D006886'}, 30 | {'cui': 'C0013227', 'mesh_term': 'Pharmaceutical Preparations', 'mesh_ui': 'D004364'}, 31 | {'cui': 'C1257890', 'mesh_term': 'Population Groups', 'mesh_ui': 'D044382'}, 32 | {'cui': 'C0087111', 'mesh_term': 'Therapeutics', 'mesh_ui': 'D013812'}], 33 | 'o': ['PCR-confirmed, symptomatic Covid-19', 34 | 'SARS-CoV-2 infection', 35 | 'incidence of adverse events', 36 | 'symptomatic Covid-19', 37 | 'Adverse', 38 | 'serious adverse events', 39 | 'Covid-19 or a positive PCR test', 40 | 'SARS-CoV-2', 41 | 'incidence of PCR-confirmed, symptomatic Covid-19', 42 | 'incidence of SARS-CoV-2 transmission', 43 | 'symptoms'], 44 | 'o_mesh': [{'cui': 'C0032520', 'mesh_term': 'Polymerase Chain Reaction', 'mesh_ui': 'D016133'}, 45 | {'cui': 'TS-COV19', 'mesh_term': 'COVID-19', 'mesh_ui': 'C000657245'}, 46 | {'cui': 'C1175743', 'mesh_term': 'SARS Virus', 'mesh_ui': 'D045473'}, 47 | {'cui': 'C3714514', 'mesh_term': 'Infection', 'mesh_ui': 'D007239'}, 48 | {'cui': 'C0021149', 'mesh_term': 'Incidence', 'mesh_ui': 'D015994'}, 49 | {'cui': 'C0040722', 'mesh_term': 'transmission', 'mesh_ui': 'Q000635'}, 50 | {'cui': 'C0683368', 'mesh_term': 'symptoms', 'mesh_ui': 'Q000175'}], 51 | 'p': ['2314 healthy contacts of 672 index case patients with Covid-19 who were identified between March 17 and April 28, 2020', 52 | 'asymptomatic contacts of patients with polymerase-chain-reaction', 53 | 'healthy persons', 54 | 'Covid-19 in Catalonia, Spain', 55 | 'PCR-positive'], 56 | 'p_mesh': [{'cui': 'C0600653', 'mesh_term': 'Index', 'mesh_ui': 'D020481'}, 57 | {'cui': 'C0030705', 'mesh_term': 'Patient', 'mesh_ui': 'D010361'}, 58 | {'cui': 'TS-COV19', 'mesh_term': 'COVID-19', 'mesh_ui': 'C000657245'}, 59 | {'cui': 'C0032520', 'mesh_term': 'Polymerase Chain Reaction', 'mesh_ui': 'D016133'}, 60 | {'cui': 'C0027361', 'mesh_term': 'Person', 'mesh_ui': 'D009272'}, 61 | {'cui': 'C0037747', 'mesh_term': 'Spain', 'mesh_ui': 'D013030'}]}, 62 | 'punchline_bot': {'effect': '— no diff', 63 | 'punchline_text': 'Results were similar in the hydroxychloroquine and usual-care groups with respect to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 6.2%, ' 64 | 'respectively; risk ratio, 0.86 [95% confidence interval, 0.52 to 1.42]).'}, 65 | 'rct_bot': {'is_rct': True, 'prob_rct': 0.6828127889603965, 'scores': {'is_rct_balanced': True, 'is_rct_precise': True, 'is_rct_sensitive': True}}, 66 | 'sample_size_bot': {'num_randomized': '2314'}} 67 | ``` 68 | 69 | # Installing 70 | 71 | *NOTE*: As of mid-October 2021, installing `rrnlp` via `pip` does not cooperate well with python 3.10; we suggest using 3.9. 72 | 73 | The easiest way to install the latest version is via `pip`. 74 | 75 | ```bash 76 | pip install rrnlp 77 | ``` 78 | 79 | (Model weights will then be downloaded as needed when you import `rrnlp`.) 
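If you only need a subset of the extractors, the `TrialReader` constructor (see `rrnlp/__init__.py`) also accepts an explicit task list and target device, which avoids downloading and loading weights you won't use. A minimal sketch (the abstract text here is a placeholder; the task names come from `TrialReader.task_loaders`):

```python
import rrnlp

# Load just the RCT classifier and the sample size extractor, on CPU.
trial_reader = rrnlp.TrialReader(tasks=["rct_bot", "sample_size_bot"], device="cpu")

ti_abs = {"ti": "Example trial title", "ab": "Example abstract text ..."}
preds = trial_reader.read_trial(ti_abs, task_list=["rct_bot", "sample_size_bot"])
```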
We suggest using a custom environment, so if you're using `conda` this might look something like 80 | 81 | ```bash 82 | conda create --name rrnlp python=3.9 83 | conda activate rrnlp 84 | pip install rrnlp 85 | ``` 86 | 87 | Alternatively, if you want to use the bleeding-edge (for better or worse) you can try installing directly via `git` 88 | 89 | ```bash 90 | pip install git+https://github.com/bwallace/RRnlp.git 91 | ``` 92 | 93 | (Or can `clone` and then `install .` locally.) 94 | 95 | 96 | # Citation 97 | 98 | This set of models is a compilation of several different lines of work. If you use this and find it useful for your work, please consider citing (some subset of) the following. 99 | 100 | For the overall system: 101 | 102 | ``` 103 | Marshall, I.J., Nye, B., Kuiper, J., Noel-Storr, A., Marshall, R., Maclean, R., Soboczenski, F., Nenkova, A., Thomas, J. and Wallace, B.C., 2020. Trialstreamer: A living, automatically updated database of clinical trial reports. Journal of the American Medical Informatics Association, 27(12), pp.1903-1912. 104 | 105 | Nye, B.E., Nenkova, A., Marshall, I.J. and Wallace, B.C., 2020, July. Trialstreamer: mapping and browsing medical evidence in real-time. In Proceedings of the conference. Association for Computational Linguistics. North American Chapter. Meeting (Vol. 2020, p. 63). 106 | ``` 107 | 108 | For the "inference" component specifically ("punchlines" and directionality): 109 | 110 | ``` 111 | Eric Lehman, Jay DeYoung, Regina Barzilay, and Byron C. Wallace. Inferring Which Medical Treatments Work from Reports of Clinical Trials. In Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL), pages 3705–3717, 2019. 112 | 113 | Jay DeYoung, Eric Lehman, Benjamin Nye, Iain Marshall, and Byron C. Wallace. Evidence Inference 2.0: More Data, Better Models. In Proceedings of BioNLP; co-located with the Association for Computational Linguistics (ACL), 2020. 114 | ``` 115 | 116 | If you are using the PICO snippets 117 | 118 | ``` 119 | Benjamin Nye, Jessy Li, Roma Patel, Yinfei Yang, Iain Marshall, Ani Nenkova, and Byron C. Wallace. A Corpus with Multi-Level Annotations of Patients, Interventions and Outcomes to Support Language Processing for Medical Literature. In Proceedings of the Conference of the Association for Computational Linguistics (ACL), pages 197–207, 2018. 120 | ``` 121 | 122 | For the RCT classifier 123 | 124 | ``` 125 | Marshall, Iain J., Anna Noel‐Storr, Joël Kuiper, James Thomas, and Byron C. Wallace. "Machine learning for identifying randomized controlled trials: an evaluation and practitioner's guide." Research Synthesis Methods 9, no. 4 (2018): 602-614. 126 | ``` 127 | 128 | And for risk of bias 129 | 130 | ``` 131 | Iain J. Marshall, Joël Kuiper, and Byron C. Wallace. RobotReviewer: Evaluation of a System for Automatically Assessing Bias in Clinical Trials. Journal of the American Medical Informatics Association (JAMIA), 23(1):193–201, 2016. 132 | ``` 133 | 134 | # Support 135 | 136 | This work has been supported by National Institutes of Health (NIH) under the National Library of Medicine (NLM), grant R01-LM012086 and by the National Science Foundation (NSF) under Grant 1750978: "CAREER: Structured Scientific Evidence Extraction: Models and Corpora". The work has also been partially supported by the UK Medical Research Council (MRC), through its Skills Development Fellowship program, fellowship MR/N015185/1. 
137 | 138 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gensim>=4.1.2,<=4.2.0 2 | numpy>=1.22.2,<=1.23.2 3 | regex>=2021.4.4,<=2022.8.17 4 | requests>=2.25.1,<=2.28.1 5 | scipy>=1.6.3,<=1.8.3 6 | scikit-learn==1.1.2 7 | spacy==3.0.6 8 | torch>=1.8.1,<=1.12.1 9 | transformers>=4.0.0,<=4.21.3 10 | -------------------------------------------------------------------------------- /rrnlp/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A module that ties together the constituent models into a single 3 | interface; will pull everything it can from an input article. 4 | 5 | Note: if you find this useful, please see: 6 | https://github.com/bwallace/RRnlp#citation. 7 | ''' 8 | from typing import Type, Tuple, List 9 | 10 | import warnings 11 | 12 | import rrnlp 13 | 14 | from rrnlp.models import PICO_tagger, ev_inf_classifier, \ 15 | sample_size_extractor, RoB_classifier_LR, \ 16 | RCT_classifier, study_design_classifier, get_device \ 17 | 18 | class TrialReader: 19 | task_loaders = { 20 | "rct_bot": RCT_classifier.AbsRCTBot, 21 | "pico_span_bot": PICO_tagger.PICOBot, 22 | "punchline_bot": ev_inf_classifier.EvInfBot, 23 | "bias_ab_bot": RoB_classifier_LR.AbsRoBBot, 24 | "sample_size_bot": sample_size_extractor.MLPSampleSizeClassifier, 25 | "study_design_bot": study_design_classifier.AbsStudyDesignBot 26 | } 27 | 28 | 29 | def __init__(self, tasks=None, device='auto'): 30 | if tasks is None: 31 | tasks = TrialReader.task_loaders.keys() 32 | else: 33 | assert all([task in TrialReader.task_loaders for task in tasks]) 34 | 35 | self.models = {task: TrialReader.task_loaders[task](device=get_device(device)) for task in tasks} 36 | 37 | def read_trial(self, ab: dict, process_rcts_only=True, 38 | task_list=None) -> Type[dict]: 39 | """ 40 | The default behaviour is that non-RCTs do not have all extractions done (to save time). 41 | If you wish to use all the models anyway (which might not behave entirely as expected) 42 | then set `process_rcts_only=False`. 43 | """ 44 | 45 | if task_list is None: 46 | task_list = ["rct_bot", "pico_span_bot", "punchline_bot", 47 | "bias_ab_bot", "sample_size_bot"] 48 | 49 | return_dict = {} 50 | return_dict["rct_bot"] = {"is_rct": False} 51 | 52 | if process_rcts_only: 53 | task_list.remove('rct_bot') 54 | # First: is this an RCT? If not, the rest of the models do not make 55 | # a lot of sense so we will warn the user 56 | return_dict["rct_bot"] = self.models['rct_bot'].predict_for_ab(ab) 57 | 58 | if not return_dict["rct_bot"]["is_rct"]: 59 | if process_rcts_only: 60 | warnings.warn('''Predicted as non-RCT, so rest of models not run. 
Re-run 61 | with `process_rcts_only=False` to get all predictions.''') 62 | else: 63 | warnings.filterwarnings('once', 'The input does not appear to describe an RCT;' 64 | 'interpret predictions accordingly.') 65 | 66 | if (not process_rcts_only) or return_dict["rct_bot"]["is_rct"]: 67 | 68 | for task in task_list: 69 | return_dict[task] = self.models[task].predict_for_ab(ab) 70 | 71 | return return_dict 72 | 73 | 74 | # For e.g.: 75 | # import rrnlp 76 | # trial_reader = rrnlp.TrialReader() 77 | 78 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 79 | # "ab": """ Background: Current strategies for preventing severe acute 80 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 81 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 82 | # been proposed as a postexposure therapy to prevent coronavirus 83 | # disease 2019 (Covid-19), but definitive evidence is lacking. 84 | 85 | # Methods: We conducted an open-label, cluster-randomized trial 86 | # involving asymptomatic contacts of patients with 87 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 88 | # Spain. We randomly assigned clusters of contacts to the 89 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 90 | # once, followed by 400 mg daily for 6 days) or to the usual-care 91 | # group (which received no specific therapy). The primary outcome was 92 | # PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 93 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 94 | # Covid-19 or a positive PCR test regardless of symptoms. Adverse 95 | # events were assessed for up to 28 days.\n\nResults: The analysis 96 | # included 2314 healthy contacts of 672 index case patients with 97 | # Covid-19 who were identified between March 17 and April 28, 2020. A 98 | # total of 1116 contacts were randomly assigned to receive 99 | # hydroxychloroquine and 1198 to receive usual care. Results were 100 | # similar in the hydroxychloroquine and usual-care groups with respect 101 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 102 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 103 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 104 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 105 | # 17.8%, respectively). The incidence of adverse events was higher in 106 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 107 | # 5.9%), but no treatment-related serious adverse events were 108 | # reported.\n\nConclusions: Postexposure therapy with 109 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 110 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 111 | # case patient. (Funded by the crowdfunding campaign YoMeCorono and 112 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 113 | # """ 114 | # } 115 | # preds = trial_reader.read_trial(ti_abs) 116 | -------------------------------------------------------------------------------- /rrnlp/models/PICO_tagger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module extracts descriptions (snippets) that describe the trial 3 | Population, Interventions/Comparators, and Outcomes (PICO elements) 4 | from abstracts of RCT reports. 5 | 6 | Reference: 7 | 8 | Nye, B., Li, J.J., Patel, R., Yang, Y., Marshall, I.J., Nenkova, A. 9 | and Wallace, B.C. 
10 | A corpus with multi-level annotations of patients, interventions and 11 | outcomes to support language processing for medical literature. 12 | In Proceedings of Association for Computational Linguistics (ACL), 2018. 13 | ''' 14 | 15 | 16 | import os 17 | import string 18 | from typing import Type, Tuple, List 19 | 20 | import torch 21 | from transformers import BertForTokenClassification 22 | 23 | import rrnlp 24 | from rrnlp.models import encoder, get_device 25 | from rrnlp.models.util.minimap import minimap 26 | from rrnlp.models.util.schwartz_hearst import extract_abbreviation_definition_pairs 27 | 28 | 29 | 30 | 31 | weights_path = rrnlp.models.weights_path 32 | doi = rrnlp.models.files_needed['PICO_tagger']['zenodo'] 33 | 34 | # this dictionary specifies paths to the (torch) weights on disk for 35 | # the P, I, O models (both the classifier or 'clf' layer and the 36 | # (custom, top layers of the) encoders. 37 | weights_paths = { 38 | "p" : {"clf": os.path.join(weights_path, f"{doi}_population_clf.pt"), 39 | "encoder" : os.path.join(weights_path, f"{doi}_population_encoder_custom.pt")}, 40 | "i" : {"clf": os.path.join(weights_path, f"{doi}_interventions_clf.pt"), 41 | "encoder" : os.path.join(weights_path, f"{doi}_interventions_encoder_custom.pt")}, 42 | "o" : {"clf": os.path.join(weights_path, f"{doi}_outcomes_clf.pt"), 43 | "encoder" : os.path.join(weights_path, f"{doi}_outcomes_encoder_custom.pt")} 44 | } 45 | 46 | ids2tags = { 47 | "p" : {0:'pop', 1:'O'}, 48 | "i" : {0:'intervention', 1:'O'}, 49 | "o" : {0:'outcome', 1:'O'} 50 | } 51 | 52 | def get_tagging_model(element: str, device=None) -> Type[BertForTokenClassification]: 53 | ''' Load in and return a tagger for a given element ''' 54 | 55 | assert(element in ids2tags.keys()) 56 | device = get_device(device) 57 | 58 | # note that we assume the models were trained under I/O 59 | # encoding such that num_labels is 2 60 | model = BertForTokenClassification.from_pretrained('allenai/scibert_scivocab_uncased', 61 | num_labels=2) 62 | model = model.to(device) 63 | 64 | 65 | # load in the correct top layer weights 66 | clf_weights_path = weights_paths[element]['clf'] 67 | model.classifier.load_state_dict(torch.load(clf_weights_path, 68 | map_location=torch.device(device))) 69 | 70 | 71 | encoder_weights_path = weights_paths[element]['encoder'] 72 | custom_encoder_layers = torch.load(encoder_weights_path, 73 | map_location=torch.device(device)) 74 | encoder.load_encoder_layers(model.bert, encoder.get_muppet(), custom_encoder_layers) 75 | 76 | return model 77 | 78 | def print_labels(tokens: List[str], labels: List[str]) -> List[str]: 79 | ''' Helper to gather strings assigned labels ''' 80 | all_strs, cur_str = [], [] 81 | cur_lbl = "O" 82 | for token, lbl in zip(tokens, labels): 83 | if lbl != "O": 84 | cur_str.append(token) 85 | cur_lbl = lbl 86 | elif cur_lbl != "O": 87 | str_ = " ".join(cur_str) 88 | all_strs.append(str_) 89 | cur_str = [] 90 | cur_lbl = "O" 91 | 92 | return all_strs 93 | 94 | def predict_for_str(model: Type[BertForTokenClassification], string: str, 95 | id2tag: dict, print_tokens: bool=True, o_lbl:str="O", 96 | return_strings_only: bool=True, 97 | device=None) -> list: 98 | ''' 99 | Make predictions for the input text using the given tagging model. 
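    With `return_strings_only=True` (the default) this returns just the
    extracted snippet strings; otherwise it returns (word, predicted label)
    pairs. Illustrative call (a sketch only; the exact spans returned depend
    on the trained weights):

        model = get_tagging_model('p')
        spans = predict_for_str(model, abstract_text, ids2tags['p'])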
100 | ''' 101 | if device is None: 102 | device = get_device() 103 | 104 | model.eval() 105 | words = string.split(" ") 106 | 107 | x = encoder.tokenize([words]) 108 | 109 | with torch.no_grad(): 110 | preds = model(torch.tensor(x['input_ids']).to(device))['logits'].cpu().numpy().argmax(axis=2) 111 | preds = [id2tag[p] for p in preds[0]] 112 | 113 | cur_w_idx = None 114 | word_preds = [] 115 | for pred, word_idx in zip(preds, x.word_ids()): 116 | if word_idx != cur_w_idx and word_idx is not None: 117 | word_preds.append(pred) 118 | cur_w_idx = word_idx 119 | 120 | words_and_preds = list(zip(words, word_preds)) 121 | if return_strings_only: 122 | return print_labels(words, word_preds) 123 | 124 | return words_and_preds 125 | 126 | def cleanup(spans: List[str]) -> List[str]: 127 | ''' 128 | A helper (static) function for prettifying / deduplicating the 129 | PICO snippets extracted by the model. 130 | ''' 131 | def clean_span(s): 132 | s_clean = s.strip() 133 | # remove punctuation 134 | s_clean = s_clean.strip(string.punctuation) 135 | 136 | # remove 'Background:' when we pick it up 137 | s_clean = s_clean.replace("Background", "") 138 | return s_clean 139 | 140 | 141 | cleaned_spans = [clean_span(s) for s in spans] 142 | # remove empty 143 | cleaned_spans = [s for s in cleaned_spans if s] 144 | # dedupe 145 | return list(set(cleaned_spans)) 146 | 147 | 148 | class PICOBot: 149 | ''' Lightweight class that holds taggers for all elements ''' 150 | def __init__(self, device='auto'): 151 | self.PICO_models = {} 152 | for element in ['p', 'i', 'o']: 153 | self.PICO_models[element] = get_tagging_model(element, device=device) 154 | 155 | 156 | def predict_for_ab(self, ab: dict) -> dict: 157 | 158 | ti_abs = ab['ab'].strip() 159 | 160 | preds_d = {} 161 | 162 | 163 | abbrev_dict = extract_abbreviation_definition_pairs(doc_text=ti_abs) 164 | 165 | for element, model in self.PICO_models.items(): 166 | 167 | id2tag = ids2tags[element] 168 | predicted_spans = cleanup(predict_for_str(model, ti_abs, id2tag)) 169 | MeSH_terms = minimap.get_unique_terms(predicted_spans, abbrevs=abbrev_dict) 170 | preds_d[element] = predicted_spans 171 | preds_d[f"{element}_mesh"] = MeSH_terms 172 | 173 | return preds_d 174 | 175 | 176 | ''' 177 | e.g., 178 | 179 | import PICO_tagger 180 | bot = PICO_tagger.PICOBot() 181 | ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 182 | "ab": """ Background: Current strategies for preventing severe acute 183 | respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 184 | limited to nonpharmacologic interventions. Hydroxychloroquine has 185 | been proposed as a postexposure therapy to prevent coronavirus 186 | disease 2019 (Covid-19), but definitive evidence is lacking. 187 | 188 | Methods: We conducted an open-label, cluster-randomized trial 189 | involving asymptomatic contacts of patients with 190 | polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 191 | Spain. We randomly assigned clusters of contacts to the 192 | hydroxychloroquine group (which received the drug at a dose of 800 mg 193 | once, followed by 400 mg daily for 6 days) or to the usual-care 194 | group (which received no specific therapy). The primary outcome was 195 | PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 196 | outcome was SARS-CoV-2 infection, defined by symptoms compatible with 197 | Covid-19 or a positive PCR test regardless of symptoms. 
Adverse 198 | events were assessed for up to 28 days.\n\nResults: The analysis 199 | included 2314 healthy contacts of 672 index case patients with 200 | Covid-19 who were identified between March 17 and April 28, 2020. A 201 | total of 1116 contacts were randomly assigned to receive 202 | hydroxychloroquine and 1198 to receive usual care. Results were 203 | similar in the hydroxychloroquine and usual-care groups with respect 204 | to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 205 | 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 206 | to 1.42]). In addition, hydroxychloroquine was not associated with a 207 | lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 208 | 17.8%, respectively). The incidence of adverse events was higher in 209 | the hydroxychloroquine group than in the usual-care group (56.1% vs. 210 | 5.9%), but no treatment-related serious adverse events were 211 | reported.\n\nConclusions: Postexposure therapy with 212 | hydroxychloroquine did not prevent SARS-CoV-2 infection or 213 | symptomatic Covid-19 in healthy persons exposed to a PCR-positive 214 | case patient. (Funded by the crowdfunding campaign YoMeCorono and 215 | others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 216 | """ 217 | } 218 | 219 | 220 | 221 | preds = bot.predict_for_ab(ti_abs) 222 | ''' 223 | -------------------------------------------------------------------------------- /rrnlp/models/RCT_classifier.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module classifies input abstracts as describing a Randomized Controlled Trial 3 | (in humans) or not. 4 | 5 | For reference (and citation), see: 6 | 7 | Marshall, Iain J., Anna Noel‐Storr, Joël Kuiper, James Thomas, and Byron C. Wallace. 8 | "Machine learning for identifying randomized controlled trials: an evaluation and 9 | practitioner's guide." Research synthesis methods 9, no. 4 (2018): 602-614. 10 | ''' 11 | 12 | import os 13 | from typing import Type, Tuple, List 14 | 15 | import torch 16 | from transformers import BertForSequenceClassification 17 | 18 | import rrnlp 19 | from rrnlp.models import encoder, get_device 20 | 21 | 22 | import pickle 23 | 24 | 25 | 26 | # Thresholds evaluated via bootstrap on Clinical hedges 27 | thresholds = {'bert': {'precise': 0.007859864302367195, 28 | 'sensitive': 0.0027666038410490913, 29 | 'balanced': 0.005165116927458068}, 30 | 'bert_ptyp': {'precise': 0.04210370868268101, 31 | 'sensitive': 0.040919136397870086, 32 | 'balanced': 0.0034764010192827734}} 33 | 34 | weights_path = rrnlp.models.weights_path 35 | doi = rrnlp.models.files_needed['RCT_classifier']['zenodo'] 36 | 37 | # These are the paths to the classifier (clf) and (custom; top-k layer) 38 | # encoder weights for the RCT model. 39 | clf_weights_path = os.path.join(weights_path, f"{doi}_RCT_overall_abs_clf.pt") 40 | # Task-specific weights for the encoder 41 | shared_encoder_weights_path = os.path.join(weights_path, f"{doi}_RCT_encoder_custom.pt") 42 | 43 | with open(os.path.join(weights_path, f"{doi}_bert_LR.pck"), 'rb') as f: 44 | lr = pickle.load(f) 45 | 46 | def get_RCT_model(device='auto') -> Type[BertForSequenceClassification]: 47 | ''' Load in and return RCT model weights. 
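    The base SciBERT encoder is loaded from the Hugging Face hub; the
    classifier head and a small set of task-specific encoder layers are then
    overwritten with the weights fetched from Zenodo (via
    encoder.load_encoder_layers).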
''' 48 | device = get_device(device) 49 | 50 | # Note that we assume the models were trained under I/O encoding 51 | # such that num_labels is 2 52 | model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 53 | num_labels=2) 54 | 55 | 56 | # Read in encoder: a mix of shared weights and custom 57 | custom_encoder_layers = torch.load(shared_encoder_weights_path, 58 | map_location=torch.device(device)) 59 | 60 | encoder.load_encoder_layers(model.bert, encoder.get_muppet(), custom_encoder_layers) 61 | 62 | # Load in the correct top layer (classifier) weights 63 | model.classifier.load_state_dict(torch.load(clf_weights_path, 64 | map_location=torch.device(device))) 65 | model.to(device) 66 | return model 67 | 68 | 69 | class AbsRCTBot: 70 | ''' Lightweight container class that holds RCT model ''' 71 | def __init__(self, device='auto'): 72 | self.RCT_model = get_RCT_model(get_device(device)) 73 | self.RCT_model.eval() 74 | 75 | def classify(self, raw_bert_score: float) -> dict: 76 | """ 77 | gets balanced classification, but also returns full scores 78 | """ 79 | prob_rct = lr.predict_proba([[raw_bert_score]])[:,1] 80 | 81 | scores = {} 82 | for t in ['sensitive', 'balanced', 'precise']: 83 | scores[f"is_rct_{t}"] = bool((prob_rct > thresholds['bert'][t])[0]) 84 | 85 | 86 | return {"is_rct": scores['is_rct_balanced'], "prob_rct": float((prob_rct)[0]), "scores": scores} 87 | 88 | def predict_for_ab(self, ab: dict) -> float: 89 | ti_and_abs = ab['ti'] + ' ' + ab['ab'] 90 | ''' Predicts p(low risk of bias) for input abstract ''' 91 | x = encoder.tokenize(ti_and_abs, is_split_into_words=False) 92 | 93 | with torch.no_grad(): 94 | 95 | x_input_ids = torch.tensor(x['input_ids']).to(self.RCT_model.device).unsqueeze(dim=0) 96 | attention_mask= torch.tensor(x['attention_mask']).to(self.RCT_model.device).unsqueeze(dim=0) 97 | 98 | logits = self.RCT_model(x_input_ids, attention_mask=attention_mask)['logits'].cpu() 99 | probs = torch.nn.functional.softmax(logits, dim=1).numpy() 100 | 101 | raw_rct_score = probs[0][1] 102 | return self.classify(raw_rct_score) 103 | 104 | def make_preds_for_abstract(self, ti_and_abs: str) -> float: 105 | self.predict_for_doc(ti_and_abs) 106 | 107 | 108 | ### 109 | # e.g. 110 | # 111 | # import RCT_classifier 112 | # RCT_bot = RCT_classifier.AbsRCTBot() 113 | # 114 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 115 | # "ab": """ Background: Current strategies for preventing severe acute 116 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 117 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 118 | # been proposed as a postexposure therapy to prevent coronavirus 119 | # disease 2019 (Covid-19), but definitive evidence is lacking. 120 | 121 | # Methods: We conducted an open-label, cluster-randomized trial 122 | # involving asymptomatic contacts of patients with 123 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 124 | # Spain. We randomly assigned clusters of contacts to the 125 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 126 | # once, followed by 400 mg daily for 6 days) or to the usual-care 127 | # group (which received no specific therapy). The primary outcome was 128 | # PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 129 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 130 | # Covid-19 or a positive PCR test regardless of symptoms. 
Adverse 131 | # events were assessed for up to 28 days.\n\nResults: The analysis 132 | # included 2314 healthy contacts of 672 index case patients with 133 | # Covid-19 who were identified between March 17 and April 28, 2020. A 134 | # total of 1116 contacts were randomly assigned to receive 135 | # hydroxychloroquine and 1198 to receive usual care. Results were 136 | # similar in the hydroxychloroquine and usual-care groups with respect 137 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 138 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 139 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 140 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 141 | # 17.8%, respectively). The incidence of adverse events was higher in 142 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 143 | # 5.9%), but no treatment-related serious adverse events were 144 | # reported.\n\nConclusions: Postexposure therapy with 145 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 146 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 147 | # case patient. (Funded by the crowdfunding campaign YoMeCorono and 148 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 149 | # """ 150 | # } 151 | # pred_RCT = RCT_bot.predict_for_doc(ti_abs) 152 | 153 | -------------------------------------------------------------------------------- /rrnlp/models/RoB_classifier.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module performs "risk of bias" assessment for reports of RCTs, albeit using 3 | only *abstracts*; this is therefore a very simplified RoB scheme, with a single, 4 | overall "risk" assessed, as opposed to doing this per domain (as in the Cochrane 5 | tool). 6 | 7 | For reference (and citation), see: 8 | 9 | Marshall, Iain J., Joël Kuiper, and Byron C. Wallace. 10 | "RobotReviewer: evaluation of a system for automatically 11 | assessing bias in clinical trials." 12 | Journal of the American Medical Informatics Association 23, no. 1 13 | 2016: 193-201. 14 | 15 | And more about the abstract-only approach here: 16 | 17 | Marshall, I.J., Nye, B., Kuiper, J., Noel-Storr, A., Marshall, R., 18 | Maclean, R., Soboczenski, F., Nenkova, A., Thomas, J. and 19 | Wallace, B.C. 20 | "Trialstreamer: A living, automatically updated database of clinical trial reports." 21 | Journal of the American Medical Informatics Association, 27(12), 22 | 2020: pp.1903-1912. 23 | ''' 24 | 25 | import os 26 | from typing import Type, Tuple, List 27 | 28 | import torch 29 | from transformers import BertForSequenceClassification 30 | 31 | import rrnlp 32 | from rrnlp.models import encoder, get_device 33 | 34 | weights_path = rrnlp.models.weights_path 35 | doi = rrnlp.models.files_needed['RoB_classifier']['zenodo'] 36 | 37 | # These are the paths to the classifier (clf) and (custom; top-k layer) 38 | # encoder weights for the RoB model. 39 | clf_weights_path = os.path.join(weights_path, f"{doi}_RoB_overall_abs_clf.pt") 40 | # Task-specific weights for the encoder 41 | shared_encoder_weights_path = os.path.join(weights_path, f"{doi}_RoB_encoder_custom.pt") 42 | 43 | def get_RoB_model(device='auto') -> Type[BertForSequenceClassification]: 44 | ''' Load in and return RoB model weights. 
''' 45 | device = get_device('device') 46 | 47 | # Note that we assume the models were trained under I/O encoding 48 | # such that num_labels is 2 49 | model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 50 | num_labels=2) 51 | 52 | 53 | # Read in encoder: a mix of shared weights and custom 54 | custom_encoder_layers = torch.load(shared_encoder_weights_path, 55 | map_location=torch.device(device)) 56 | 57 | encoder.load_encoder_layers(model.bert, encoder.get_muppet(), custom_encoder_layers) 58 | 59 | # Load in the correct top layer (classifier) weights 60 | model.classifier.load_state_dict(torch.load(clf_weights_path, 61 | map_location=torch.device(device))) 62 | return model 63 | 64 | 65 | class AbsRoBBot: 66 | ''' Lightweight container class that holds RoB model ''' 67 | def __init__(self, device=None): 68 | self.RoB_model = get_RoB_model(device=device) 69 | self.RoB_model.eval() 70 | 71 | def predict_for_ab(self, ab: dict) -> dict: 72 | ti_and_abs = ab['ti'] + ' ' + ab['ab'] 73 | ''' Predicts p(low risk of bias) for input abstract ''' 74 | x = encoder.tokenize(ti_and_abs, is_split_into_words=False) 75 | 76 | with torch.no_grad(): 77 | 78 | x_input_ids = torch.tensor(x['input_ids']).to(self.RoB_model.device).unsqueeze(dim=0) 79 | attention_mask= torch.tensor(x['attention_mask'])\ 80 | .to(self.RoB_model.device).unsqueeze(dim=0) 81 | 82 | logits = self.RoB_model(x_input_ids, attention_mask=attention_mask)['logits'].cpu() 83 | probs = torch.nn.functional.softmax(logits, dim=1).numpy() 84 | 85 | prob_low_risk = probs[0][1] 86 | return {"prob_low_rob": prob_low_risk} 87 | 88 | def make_preds_for_abstract(self, ti_and_abs: str) -> float: 89 | self.predict_for_doc(ti_and_abs) 90 | 91 | 92 | ### 93 | # e.g. 94 | # 95 | # import RoB_classifier 96 | # RoB_bot = RoB_classifier.AbsRoBBot() 97 | # 98 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 99 | # "ab": """ Background: Current strategies for preventing severe acute 100 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 101 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 102 | # been proposed as a postexposure therapy to prevent coronavirus 103 | # disease 2019 (Covid-19), but definitive evidence is lacking. 104 | 105 | # Methods: We conducted an open-label, cluster-randomized trial 106 | # involving asymptomatic contacts of patients with 107 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 108 | # Spain. We randomly assigned clusters of contacts to the 109 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 110 | # once, followed by 400 mg daily for 6 days) or to the usual-care 111 | # group (which received no specific therapy). The primary outcome was 112 | # PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 113 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 114 | # Covid-19 or a positive PCR test regardless of symptoms. Adverse 115 | # events were assessed for up to 28 days.\n\nResults: The analysis 116 | # included 2314 healthy contacts of 672 index case patients with 117 | # Covid-19 who were identified between March 17 and April 28, 2020. A 118 | # total of 1116 contacts were randomly assigned to receive 119 | # hydroxychloroquine and 1198 to receive usual care. 
Results were 120 | # similar in the hydroxychloroquine and usual-care groups with respect 121 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 122 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 123 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 124 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 125 | # 17.8%, respectively). The incidence of adverse events was higher in 126 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 127 | # 5.9%), but no treatment-related serious adverse events were 128 | # reported.\n\nConclusions: Postexposure therapy with 129 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 130 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 131 | # case patient. (Funded by the crowdfunding campaign YoMeCorono and 132 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 133 | # """ 134 | # } 135 | # 136 | # pred_low_RoB = RoB_bot.predict_for_doc(ti_abs) 137 | -------------------------------------------------------------------------------- /rrnlp/models/RoB_classifier_LR.py: -------------------------------------------------------------------------------- 1 | """ 2 | the BiasAbRobot class takes the *abstract* of a clinical trial as 3 | input as a string, and returns bias information as a dict which 4 | can be easily converted to JSON. 5 | 6 | V2.0 7 | 8 | Returns an indicative probability that the article is at low risk of bias, based on the abstract alone. 9 | 10 | 11 | 12 | 13 | """ 14 | 15 | # Authors: Iain Marshall 16 | # Joel Kuiper 17 | # Byron Wallce 18 | 19 | import json 20 | import uuid 21 | import os 22 | import rrnlp 23 | from sklearn.feature_extraction.text import HashingVectorizer 24 | import pickle 25 | import numpy as np 26 | import re 27 | import scipy 28 | from scipy.sparse import hstack 29 | import rrnlp 30 | 31 | 32 | weights_path = rrnlp.models.weights_path 33 | 34 | doi = rrnlp.models.files_needed['RoB_classifier_LR']['zenodo'] 35 | 36 | class AbsRoBBot: 37 | ''' Lightweight container class that holds RoB logistic regression model ''' 38 | 39 | def __init__(self, device=None): 40 | 41 | with open(os.path.join(weights_path, f'{doi}_bias_prob_clf.pck'), 'rb') as f: 42 | self.clf = pickle.load(f) 43 | 44 | self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english') 45 | 46 | 47 | def predict_for_ab(self, ab: dict) -> dict: 48 | 49 | """ 50 | Annotate abstract of clinical trial report 51 | 52 | """ 53 | ti_and_abs = ab['ti'] + " " + ab['ab'] 54 | X = self.vec.transform([ti_and_abs]) 55 | 56 | probs = self.clf.predict_proba(X)[:,1].tolist() 57 | 58 | return {"prob_low_rob": probs[0]} 59 | 60 | 61 | 62 | ### 63 | # e.g. 64 | # 65 | # import RoB_classifier_LR 66 | # RoB_bot = RoB_classifier_LR.AbsRoBBot() 67 | # 68 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 69 | # "ab": """ Background: Current strategies for preventing severe acute 70 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 71 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 72 | # been proposed as a postexposure therapy to prevent coronavirus 73 | # disease 2019 (Covid-19), but definitive evidence is lacking. 74 | 75 | # Methods: We conducted an open-label, cluster-randomized trial 76 | # involving asymptomatic contacts of patients with 77 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 78 | # Spain. 
We randomly assigned clusters of contacts to the 79 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 80 | # once, followed by 400 mg daily for 6 days) or to the usual-care 81 | # group (which received no specific therapy). The primary outcome was 82 | # PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 83 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 84 | # Covid-19 or a positive PCR test regardless of symptoms. Adverse 85 | # events were assessed for up to 28 days.\n\nResults: The analysis 86 | # included 2314 healthy contacts of 672 index case patients with 87 | # Covid-19 who were identified between March 17 and April 28, 2020. A 88 | # total of 1116 contacts were randomly assigned to receive 89 | # hydroxychloroquine and 1198 to receive usual care. Results were 90 | # similar in the hydroxychloroquine and usual-care groups with respect 91 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 92 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 93 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 94 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 95 | # 17.8%, respectively). The incidence of adverse events was higher in 96 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 97 | # 5.9%), but no treatment-related serious adverse events were 98 | # reported.\n\nConclusions: Postexposure therapy with 99 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 100 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 101 | # case patient. (Funded by the crowdfunding campaign YoMeCorono and 102 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 103 | # """ 104 | # } 105 | # 106 | # pred_low_RoB = RoB_bot.predict_for_doc(ti_abs) 107 | -------------------------------------------------------------------------------- /rrnlp/models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import sys 5 | import tarfile 6 | import requests 7 | import urllib 8 | import rrnlp 9 | import torch 10 | 11 | 12 | weights_path = os.path.join(os.path.dirname(rrnlp.__file__), 13 | "models", "weights") 14 | 15 | def get_device(device='auto'): 16 | if device == 'auto' or device is None: 17 | return torch.cuda.current_device() if torch.cuda.is_available() else "cpu" 18 | elif isinstance(device, (torch.device, int)): 19 | return device 20 | elif 'cuda' in device: 21 | if torch.cuda.is_available(): 22 | return torch.cuda.current_device() 23 | else: 24 | raise Exception('requested a GPU but none available!') 25 | else: 26 | return torch.device(device) 27 | 28 | 29 | with open(os.path.join(weights_path, "weights_manifest.json"), 'r') as f: 30 | files_needed = json.load(f) 31 | 32 | 33 | # Helper for fetching files. 
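# (The manifest loaded above maps each model name to a Zenodo record id and
# the list of weight files it needs; those files are fetched into
# rrnlp/models/weights/ by the download loop below.)
# `reporthook` is passed to urllib.request.urlretrieve to print rough
# progress/speed information while a file is being fetched.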
34 | def reporthook(count, block_size, total_size): 35 | # shamelessly stolen: https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html 36 | global start_time 37 | if count == 0: 38 | start_time = time.time() 39 | return 40 | duration = time.time() - start_time 41 | progress_size = int(count * block_size) 42 | speed = int(progress_size / (1024 * duration)) 43 | percent = int(count * block_size * 100 / total_size) 44 | sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" % 45 | (percent, progress_size / (1024 * 1024), speed, duration)) 46 | sys.stdout.flush() 47 | 48 | 49 | ### 50 | # Download all model weights if necessary 51 | for model_name, model_data in files_needed.items(): 52 | 53 | for f in model_data['files']: 54 | 55 | url = f"https://zenodo.org/record/{model_data['zenodo']}/files/" + f 56 | 57 | f_path = os.path.join(weights_path, f"{model_data['zenodo']}_{f}") 58 | 59 | if not os.path.exists(f_path): 60 | import urllib.request 61 | print(f"Attempting to fetch weights from Zenodo {url}...") 62 | # TODO this is slow so should probably add a progress bar; 63 | # at present it just kinda sits there for a long time. 64 | 65 | urllib.request.urlretrieve(url, f_path, reporthook) 66 | if os.path.exists(f_path): 67 | print("success!") 68 | else: 69 | raise Exception(f"Sorry; unable to download data needed for the {model_name} model ({f}) - you will be unable to use this model.") 70 | 71 | 72 | ### 73 | # Finally, grab scispacy models required. Note that we do this mainly 74 | # to make pip cooperate, since these are requirements not available 75 | # on pypi. 76 | spacy_weights_path = os.path.join(weights_path, "en_core_sci_sm-0.4.0", 77 | "en_core_sci_sm", "en_core_sci_sm-0.4.0") 78 | if not os.path.exists(spacy_weights_path): 79 | print("downloading scispacy models...") 80 | # @TODO probably factor this out somewhere 81 | en_core_sci_sm_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz" 82 | file_stream = requests.get(en_core_sci_sm_url, stream=True) 83 | tarred = tarfile.open(fileobj=file_stream.raw, mode="r|gz") 84 | tarred.extractall(path=weights_path) 85 | print("ok!") 86 | -------------------------------------------------------------------------------- /rrnlp/models/encoder.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | The main purpose of this is to hold a single instance of a muppet for 4 | other modules to reference and share. 5 | ''' 6 | import os 7 | from collections import OrderedDict 8 | from typing import Type, Tuple, List 9 | 10 | import torch 11 | import transformers 12 | transformers.logging.set_verbosity_error() 13 | from transformers import AutoModel, AutoTokenizer, BertModel 14 | 15 | import numpy as np 16 | 17 | import spacy 18 | 19 | import rrnlp 20 | spacy_weights_path = os.path.join(rrnlp.models.weights_path, 21 | "en_core_sci_sm-0.4.0", "en_core_sci_sm", 22 | "en_core_sci_sm-0.4.0") 23 | nlp = spacy.load(spacy_weights_path) 24 | 25 | # Change as appropriate! 26 | tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased') 27 | muppet = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased') 28 | #tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract') 29 | #muppet = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract') 30 | 31 | MAX_LEN = 512 32 | 33 | # Freeze encoder layers by default. 
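# (Setting requires_grad=False keeps the shared SciBERT weights fixed;
# task-specific models selectively re-enable training of the top layers via
# unfreeze_last_k_layers below.)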
34 | for param in muppet.parameters(): 35 | param.requires_grad = False 36 | 37 | def get_muppet() -> Type[BertModel]: 38 | return muppet 39 | 40 | def tokenize(texts: List[str], is_split_into_words: bool=True): 41 | ''' 42 | Assumes texts is a list of texts that have been **split into words**, like: 43 | 44 | [['Impact', 'of', 'early', 'mobilization', 'on', 'glycemic', ... ], 45 | ['Results', 'of', 'the', 'EICESS-92', ...] 46 | ] 47 | 48 | Unless this arg is False, in which case assumes these are just vanilla text 49 | inputs. 50 | ''' 51 | if is_split_into_words: 52 | return tokenizer(texts, is_split_into_words=True, 53 | return_offsets_mapping=True, padding=True, 54 | truncation=True, max_length=MAX_LEN) 55 | 56 | return tokenizer(texts, padding=True, truncation=True, max_length=MAX_LEN) 57 | 58 | 59 | ''' 60 | Helper methods for accessing/unfreezing layers. 61 | ''' 62 | def get_top_k_BERT_layers(bert_inst: Type[BertModel], k:int, 63 | n_encoder_layers:int=12) -> dict: 64 | ''' 65 | Return top $k$ layer parameters from the given (Bert) encoder 66 | in a parameters dictionary. 67 | ''' 68 | layer_indices = [n_encoder_layers-j for j in range(1, k+1)] 69 | layer_d = OrderedDict() 70 | 71 | for params_name, param in bert_inst.named_parameters(): 72 | if "encoder.layer" in params_name and int(params_name.split(".")[2]) in layer_indices: 73 | layer_d[params_name] = param 74 | 75 | return layer_d 76 | 77 | def unfreeze_last_k_layers(bert_inst: Type[BertModel], k: int, 78 | n_encoder_layers:int =12) -> None: 79 | ''' Unfreezes the top k layers; modifies given BERT instance. ''' 80 | encoder_layers_to_unfreeze = get_top_k_BERT_layers(bert_inst, k) 81 | for layer_name, layer_params in encoder_layers_to_unfreeze.items(): 82 | layer_params.requires_grad = True 83 | 84 | 85 | def load_encoder_layers(bespoke_muppet: Type[BertModel], 86 | shared_muppet: Type[BertModel], 87 | custom_layers: dict): 88 | ''' 89 | Update the target ('bespoke') BERT (or similar) encoder (first arg) such 90 | that it will comprise model parameter values equal to whatever is in 91 | "custom layers" for all layers that this StateDict contains, and values 92 | equal to those in the 'shared muppet' (another BERT or similar) otherwise. 93 | Modifies in-place (by val); returns None. 94 | ''' 95 | updated_sd = bespoke_muppet.state_dict() 96 | 97 | for layer_name, layer_params in bespoke_muppet.state_dict().items(): 98 | 99 | if layer_name in custom_layers.keys(): 100 | updated_sd[layer_name] = custom_layers[layer_name] 101 | else: 102 | updated_sd[layer_name] = shared_muppet.state_dict()[layer_name] 103 | 104 | bespoke_muppet.load_state_dict(updated_sd) 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /rrnlp/models/ev_inf_classifier.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module performs the "evidence inference" task, using a simple 3 | "pipelined" approach in which we first try and identify a "punchline" 4 | sentence, and then infer the directionality of the evidence that 5 | seems to be reported in this. 6 | 7 | References: 8 | 9 | Inferring Which Medical Treatments Work from Reports of Clinical Trials. 10 | Eric Lehman, Jay DeYoung, Regina Barzilay, and Byron C. Wallace. 11 | Proceedings of the North American Chapter of the Association for Computational 12 | Linguistics (NAACL), 2019. 13 | 14 | Evidence Inference 2.0: More Data, Better Models. Jay DeYoung, Eric Lehman, 15 | Iain J. Marshall, and Byron C. 
Wallace. 16 | Proceedings of BioNLP (co-located with ACL), 2020. 17 | ''' 18 | 19 | import os 20 | import sys 21 | from typing import Type, Tuple, List 22 | 23 | import numpy as np 24 | 25 | import torch 26 | from transformers import BertForSequenceClassification 27 | 28 | import rrnlp 29 | from rrnlp.models import encoder, get_device 30 | 31 | weights_path = rrnlp.models.weights_path 32 | 33 | doi = rrnlp.models.files_needed['ev_inf_classifier']['zenodo'] 34 | 35 | # Paths to model weights for both the "punchline" extractor model and the 36 | # "inference" model. Both comprise custom encoder layers and a top layer 37 | # weight vector. 38 | clf_punchline_weights_path = os.path.join(weights_path, f"{doi}_evidence_identification_clf.pt") 39 | shared_enc_punchline_weights_path = os.path.join(weights_path, f"{doi}_evidence_identification_encoder_custom.pt") 40 | 41 | clf_inference_weights_path = os.path.join(weights_path, f"{doi}_inference_clf.pt") 42 | shared_enc_inference_weights_path = os.path.join(weights_path, f"{doi}_inference_encoder_custom.pt") 43 | 44 | def get_punchline_extractor(device='auto') -> Type[BertForSequenceClassification]: 45 | ''' 46 | Returns the 'punchline' extractor, which seeks out sentences that seem to convey 47 | main findings. 48 | ''' 49 | device = get_device(device=device) 50 | model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 51 | num_labels=2) 52 | model = model.to(device) 53 | 54 | # Overwrite some of the encoder layers with custom weights. 55 | custom_encoder_layers = torch.load(shared_enc_punchline_weights_path, map_location=torch.device(device)) 56 | encoder.load_encoder_layers(model.bert, encoder.get_muppet(), custom_encoder_layers) 57 | 58 | # Load in the correct top layer weights. 59 | model.classifier.load_state_dict(torch.load(clf_punchline_weights_path, 60 | map_location=torch.device(device))) 61 | 62 | return model 63 | 64 | 65 | def get_inference_model(device='auto') -> Type[BertForSequenceClassification]: 66 | ''' 67 | This is a three-way classification model that attempts to classify punchline 68 | sentences as reporting a result where the intervention resulted in a sig. 69 | decrease, no diff, or sig. increase w/r/t the outcome measured. 70 | ''' 71 | device = get_device(device) 72 | model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', 73 | num_labels=3) 74 | model = model.to(device) 75 | # Overwrite some of the encoder layers with custom weights 76 | custom_encoder_layers = torch.load(shared_enc_inference_weights_path, 77 | map_location=torch.device(device)) 78 | encoder.load_encoder_layers(model.bert, encoder.get_muppet(), custom_encoder_layers) 79 | 80 | # Load in the correct top layer weights 81 | model.classifier.load_state_dict(torch.load(clf_inference_weights_path, 82 | map_location=torch.device(device))) 83 | return model 84 | 85 | 86 | class PunchlineExtractorBot: 87 | ''' Lightweight container class for extracting punchlines. 
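    predict_for_sentences scores a list of sentences, returning an (n, 2)
    array of softmax probabilities in which column 1 is the "punchline"
    probability; predict_for_ab splits an abstract into sentences with
    scispacy and returns the highest-scoring sentence together with its score.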
''' 88 | 89 | def __init__(self, device='auto'): 90 | self.punchline_extractor_model = get_punchline_extractor(device=device) 91 | self.punchline_extractor_model.eval() 92 | 93 | def predict_for_sentences(self, sents: List[str]) -> Type[np.array]: 94 | 95 | x = encoder.tokenize(sents, is_split_into_words=False) 96 | 97 | with torch.no_grad(): 98 | 99 | x_input_ids = torch.tensor(x['input_ids']).to(self.punchline_extractor_model.device) 100 | attention_mask= torch.tensor(x['attention_mask']).to(self.punchline_extractor_model.device) 101 | 102 | logits = self.punchline_extractor_model(x_input_ids, attention_mask=attention_mask)['logits'].cpu() 103 | probs = torch.nn.functional.softmax(logits, dim=1).numpy() 104 | 105 | return probs 106 | 107 | def predict_for_ab(self, ab: dict) -> Tuple[str, float]: 108 | ti_and_abs = ab['ti'] + ' ' + ab['ab'] 109 | # Split into sentences via scispacy 110 | sentences = [s.text.strip() for s in encoder.nlp(ti_and_abs).sents] 111 | # filter newline sentences 112 | sentences = list(filter(lambda s: len(s.strip()) > 0, sentences)) 113 | # Make punchline predictions 114 | pred_probs = self.predict_for_sentences(sentences) 115 | best_sent_idx = np.argmax(pred_probs[:,1]) 116 | # Retrieve highest scoring sentence 117 | best_sent = sentences[best_sent_idx] 118 | return best_sent, pred_probs[best_sent_idx][1] 119 | 120 | 121 | class InferenceBot: 122 | ''' Container for *inference* model which classifies punchlines. ''' 123 | def __init__(self, device='auto'): 124 | self.inference_model = get_inference_model(device=device) 125 | self.inference_model.eval() 126 | 127 | def predict_for_sentence(self, sent: str) -> Type[np.array]: 128 | ''' 129 | Make a threeway pred for the given sentence: Is this punchline 130 | reporting a sig. decrease (-1), no diff (0), or sig increase (1)? 131 | ''' 132 | if type(sent) == str: 133 | sent = [sent] 134 | 135 | x = encoder.tokenize(sent, is_split_into_words=False) 136 | 137 | with torch.no_grad(): 138 | 139 | x_input_ids = torch.tensor(x['input_ids']).to(self.inference_model.device) 140 | attention_mask= torch.tensor(x['attention_mask']).to(self.inference_model.device) 141 | 142 | logits = self.inference_model(x_input_ids, attention_mask=attention_mask)['logits'].cpu() 143 | probs = torch.nn.functional.softmax(logits, dim=1).numpy() 144 | 145 | return probs 146 | 147 | class EvInfBot: 148 | ''' Composes the punchline extractor and inference model. ''' 149 | def __init__(self, device='auto'): 150 | self.pl_bot = PunchlineExtractorBot(device=device) 151 | self.inf_bot = InferenceBot(device=device) 152 | 153 | self.direction_strs = ["↓ sig. decrease", "— no diff", "↑ sig. increase"] 154 | 155 | def predict_for_ab(self, ab: dict) -> Tuple[str, str]: 156 | 157 | 158 | # Get punchline. 159 | pl_sent, pred_probs = self.pl_bot.predict_for_ab(ab) 160 | 161 | # Infer direction. 162 | direction_probs = self.inf_bot.predict_for_sentence(pl_sent) 163 | return {"punchline_text": pl_sent, "effect": self.direction_strs[np.argmax(direction_probs)]} 164 | 165 | ### 166 | # e.g. 
167 | # 168 | # from rrnlp.models import ev_inf_classifier 169 | # pl_bot = ev_inf_classifier.PunchlineExtractorBot() 170 | # 171 | # sentence = ["patients in group b died more often"] 172 | # pred_punchline = pl_bot.predict_for_sentences(sentence) 173 | # 174 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 175 | # "ab": """ Background: Current strategies for preventing severe acute 176 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 177 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 178 | # been proposed as a postexposure therapy to prevent coronavirus 179 | # disease 2019 (Covid-19), but definitive evidence is lacking. 180 | 181 | # Methods: We conducted an open-label, cluster-randomized trial 182 | # involving asymptomatic contacts of patients with 183 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 184 | # Spain. We randomly assigned clusters of contacts to the 185 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 186 | # once, followed by 400 mg daily for 6 days) or to the usual-care 187 | # group (which received no specific therapy). The primary outcome was 188 | # PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 189 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 190 | # Covid-19 or a positive PCR test regardless of symptoms. Adverse 191 | # events were assessed for up to 28 days.\n\nResults: The analysis 192 | # included 2314 healthy contacts of 672 index case patients with 193 | # Covid-19 who were identified between March 17 and April 28, 2020. A 194 | # total of 1116 contacts were randomly assigned to receive 195 | # hydroxychloroquine and 1198 to receive usual care. Results were 196 | # similar in the hydroxychloroquine and usual-care groups with respect 197 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 198 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 199 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 200 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 201 | # 17.8%, respectively). The incidence of adverse events was higher in 202 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 203 | # 5.9%), but no treatment-related serious adverse events were 204 | # reported.\n\nConclusions: Postexposure therapy with 205 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 206 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 207 | # case patient. (Funded by the crowdfunding campaign YoMeCorono and 208 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 209 | # """ 210 | # } 211 | # sent, prob = pl_bot.predict_for_ab(ti_abs) 212 | # 213 | # inf_bot = ev_inf_classifier.InferenceBot() 214 | # inf_bot.predict_for_sentence(sent) 215 | # 216 | # OR in one swoop... 217 | # 218 | # ev_bot = ev_inf_classifier.EvInfBot() 219 | # ev_bot.predict_for_ab(ti_abs) 220 | 221 | -------------------------------------------------------------------------------- /rrnlp/models/sample_size_extractor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module responsible for attempting to extract sample sizes 3 | (number randomized) from abstracts. We do this by identifying 4 | integer tokens in inputs, and then assembling bespoke feature 5 | vectors for each of these, and running these through a simple 6 | MLP. 
This module relies on static word vectors. 7 | ''' 8 | 9 | import operator 10 | import os 11 | import sys 12 | import typing 13 | import time 14 | import urllib 15 | from typing import Type, Tuple, List 16 | 17 | import numpy as np 18 | 19 | import gensim 20 | 21 | import torch 22 | from torch import nn 23 | 24 | import rrnlp 25 | from rrnlp.models.util import index_numbers 26 | from rrnlp.models import encoder 27 | 28 | weights_path = rrnlp.models.weights_path 29 | doi = rrnlp.models.files_needed['sample_size_extractor']['zenodo'] 30 | word_embeddings_path = os.path.join(weights_path, f"{doi}_PubMed-w2v.bin") # note that this is not DOI'ed - but fetched from the gensim source 31 | MLP_weights_path = os.path.join(weights_path, f"{doi}_sample_size_weights.pt") 32 | 33 | 34 | def replace_n_equals(abstract_tokens: List[str]) -> List[str]: 35 | ''' Helper to replace "n=" occurences ''' 36 | for j, t in enumerate(abstract_tokens): 37 | if "n=" in t.lower(): 38 | # Also replace closing paren, if present 39 | t_n = t.split("=")[1].replace(")", "") 40 | abstract_tokens[j] = t_n 41 | return abstract_tokens 42 | 43 | 44 | class MLPSampleSize(nn.Module): 45 | ''' A very simple MLP for sample size extraction. ''' 46 | def __init__(self, n_input_features: int=912, n_hidden: int=256): 47 | super().__init__() 48 | self.layers = nn.Sequential( 49 | nn.Linear(n_input_features, n_hidden), 50 | nn.ReLU(), 51 | nn.Linear(n_hidden, 1), 52 | nn.Sigmoid() 53 | ) 54 | 55 | 56 | def forward(self, X: Type[torch.Tensor]) -> Type[torch.FloatTensor]: 57 | ''' 58 | Returns a probability that the input tokens represented by rows 59 | of X are sample sizes. 60 | ''' 61 | return self.layers(X) 62 | 63 | 64 | class MLPSampleSizeClassifier: 65 | ''' 66 | This class wraps a simple window-based (torch) MLP and bespoke 67 | feature extraction functions, etc. 68 | ''' 69 | def __init__(self, device='auto'): 70 | 71 | self.nlp = encoder.nlp 72 | # This is for POS tags 73 | self.PoS_tags_to_indices = {} 74 | self.tag_names = [u'""', u'#', u'$', u"''", u',', u'-LRB-', u'-RRB-', u'.', u':', u'ADD', u'AFX', u'BES', u'CC', u'CD', u'DT', u'EX', u'FW', u'GW', u'HVS', u'HYPH', u'IN', u'JJ', u'JJR', u'JJS', u'LS', u'MD', u'NFP', u'NIL', u'NN', u'NNP', u'NNPS', u'NNS', u'PDT', u'POS', u'PRP', u'PRP$', u'RB', u'RBR', u'RBS', u'RP', u'SP', u'SYM', u'TO', u'UH', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ', u'WDT', u'WP', u'WP$', u'WRB', u'XX', u'``'] 75 | for idx, tag in enumerate(self.tag_names): 76 | self.PoS_tags_to_indices[tag] = idx 77 | 78 | self.n_tags = len(self.tag_names)#len(self.nlp.tagger.tag_names) 79 | 80 | # Threshold governing whether to abstain from predicting 81 | # this as a sample size altogether (for highest scoring 82 | # integer). As always, this was definitely set in a totally 83 | # scientifically sound way ;). 84 | self.magic_threshold = 0.0205 # @TODO revisit 85 | 86 | self.number_tagger = index_numbers.NumberTagger() 87 | 88 | self.model = MLPSampleSize() 89 | self.model.load_state_dict(torch.load(MLP_weights_path )) 90 | self.word_embeddings = load_trained_w2v_model(word_embeddings_path) 91 | 92 | 93 | def PoS_tags_to_one_hot(self, tag: str) -> Type[np.array]: 94 | ''' 95 | Helper to map from string tags to one hot vectors encoding them. 
96 | ''' 97 | one_hot = np.zeros(self.n_tags) 98 | if tag in self.PoS_tags_to_indices: 99 | one_hot[self.PoS_tags_to_indices[tag]] = 1.0 100 | else: 101 | pass 102 | return one_hot 103 | 104 | 105 | def featurize_for_input(self, X: List[dict]) -> List[Type[torch.Tensor]]: 106 | ''' 107 | Map from a list of dictionaries mapping features to values to a 108 | torch Tensor representing the given input. 109 | ''' 110 | Xv = [] 111 | 112 | left_token_inputs, left_PoS, right_token_inputs, right_PoS, other_inputs = [], [], [], [], [] 113 | 114 | # Consider just setting to zeros? 115 | unk_vec = np.mean(self.word_embeddings.vectors, axis=0) 116 | 117 | # Helper to grab embeddings for words where available, unk 118 | # vector otherwise 119 | def get_w_embedding(w): 120 | try: 121 | return self.word_embeddings[w] 122 | except: 123 | return unk_vec 124 | 125 | # Iterate over all instances in input, map from dictionaries of features to 126 | # tensors that encode them. 127 | for x in X: 128 | 129 | left_embeds = np.concatenate([get_w_embedding(w_i) for w_i in x["left_word"]]) 130 | right_embeds = np.concatenate([get_w_embedding(w_i) for w_i in x["right_word"]]) 131 | 132 | left_pos = self.PoS_tags_to_one_hot(x["left_PoS"]) 133 | right_pos = self.PoS_tags_to_one_hot(x["left_PoS"]) 134 | 135 | other_features = np.array(x["other_features"]) 136 | 137 | xv = np.concatenate([left_embeds, right_embeds, left_pos, right_pos]) 138 | 139 | Xv.append(torch.tensor(xv)) 140 | 141 | return Xv 142 | 143 | 144 | def predict_for_ab(self, ab: dict) -> typing.Union[str, None]: 145 | ''' 146 | Given an abstract, this returns either the extracted sample 147 | size, or None if this cannot be located. 148 | ''' 149 | abstract_text = ab['ab'] 150 | 151 | abstract_text_w_numbers = self.number_tagger.swap(abstract_text) 152 | abstract_tokens, POS_tags = tokenize_abstract(abstract_text_w_numbers, self.nlp) 153 | abstract_tokens = replace_n_equals(abstract_tokens) 154 | 155 | # Extract dictionaries of features for each token in the abstract 156 | abstract_features, numeric_token_indices = abstract2features(abstract_tokens, POS_tags) 157 | 158 | # If there are no numbers in the input text, then just give up 159 | if len(numeric_token_indices) == 0: 160 | return {"num_randomized": None} 161 | 162 | # Convert to a m x d Tensor (m = number of tokens; d = input dims) 163 | X = torch.vstack(self.featurize_for_input(abstract_features)).float() 164 | 165 | # Make prediction, retrieve associated token 166 | preds = self.model(X).detach().numpy() 167 | most_likely_idx = np.argmax(preds) 168 | 169 | # Abstain from returning a token if the *best* scoring token is beneath 170 | # a somewhat arbitrarily chosen threshold (since not all abstracts will 171 | # contain sample sizes.) 172 | if preds[most_likely_idx] >= self.magic_threshold: 173 | return {"num_randomized": abstract_tokens[numeric_token_indices[most_likely_idx]]} 174 | else: 175 | return {"num_randomized": None} 176 | 177 | 178 | def load_trained_w2v_model(path: str) -> Type[gensim.models.keyedvectors.KeyedVectors]: 179 | ''' Load in and return word vectors at the given path. 
''' 180 | m = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) 181 | return m 182 | 183 | def y_to_bin(y: List[str]) -> Type[np.array]: 184 | y_bin = np.zeros(len(y)) 185 | for idx, y_i in enumerate(y): 186 | if y_i == "N": 187 | y_bin[idx] = 1.0 188 | return y_bin 189 | 190 | def _is_an_int(s: str) -> bool: 191 | try: 192 | int(s) 193 | return True 194 | except: 195 | return False 196 | 197 | 198 | def tokenize_abstract(abstract: str, nlp=None) -> Tuple[List[str], List[str]]: 199 | ''' 200 | Tokenizes given abstract string, returns tokens and inferred PoS tags. 201 | ''' 202 | tokens, POS_tags = [], [] 203 | ab = nlp(abstract) 204 | for word in ab: 205 | tokens.append(word.text) 206 | POS_tags.append(word.tag_) 207 | 208 | return tokens, POS_tags 209 | 210 | def abstract2features(abstract_tokens: List[str], POS_tags: List[str]) \ 211 | -> Tuple[List[dict], List[int]]: 212 | ''' 213 | Given a tokenized input abstract (and associated list of predicted 214 | PoS tags), this function assembles a dictionary of artisinal features 215 | extracted from the inputs relevant to predicting whether the 216 | constituent tokens are sample sizes or not. We consider *only* integer 217 | candidates as potential sample sizes, and return features for these 218 | as well as their indices. 219 | ''' 220 | 221 | #### 222 | # Some of the features we use rely on 'global' info, 223 | # so we take a pass over the entire abstract here 224 | # to extract what we need: 225 | # 1. keep track of all numbers in the abstract 226 | # 2. keep track of indices where "years" mentioned 227 | # 3. keep track of indices where "patients" mentioned 228 | # the latter because years are a potential source of 229 | # confusion! 230 | years_tokens = ["years", "year"] 231 | patients_tokens = ["patients", "subjects", "participants"] 232 | all_nums_in_abstract, years_indices, patient_indices = [], [], [] 233 | for idx, t in enumerate(abstract_tokens): 234 | t_lower = t.lower() 235 | 236 | if t_lower in years_tokens: 237 | years_indices.append(idx) 238 | 239 | if t_lower in patients_tokens: 240 | patient_indices.append(idx) 241 | 242 | try: 243 | num = int(t) 244 | all_nums_in_abstract.append(num) 245 | except: 246 | pass 247 | 248 | # Note that we keep track of all candidates/numbers 249 | # and pass this back. 250 | x, numeric_token_indices = [], [] 251 | for word_idx in range(len(abstract_tokens)): 252 | if (_is_an_int(abstract_tokens[word_idx])): 253 | numeric_token_indices.append(word_idx) 254 | features = word2features(abstract_tokens, POS_tags, word_idx, 255 | all_nums_in_abstract, years_indices, 256 | patient_indices) 257 | x.append(features) 258 | 259 | # Here x is a list of dictionaries encoding features (key/val pairs) 260 | # for all *numerical* tokens we identified — we treat these as 261 | # candidate sample size tokens to be scored; the corresponding 262 | # indices for these candidates are stored in the second return 263 | # value, ie., numeric_token_indices. 
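# (For reference, each of these feature dictionaries, built by word2features
# below, holds the two tokens on either side of the candidate integer, the
# PoS tags of the immediately adjacent tokens, and four binary "other"
# features: largest number in the abstract, a nearby "years" mention,
# whether the value looks like a calendar year, and a following
# "patients"/"subjects"/"participants" mention.)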
264 | return x, numeric_token_indices 265 | 266 | 267 | def get_window_indices(all_tokens: List[str], i: int, window_size: int)\ 268 | -> Tuple[int, int]: 269 | lower_idx = max(0, i-window_size) 270 | upper_idx = min(i+window_size, len(all_tokens)-1) 271 | return (lower_idx, upper_idx) 272 | 273 | def word2features(abstract_tokens: List[str], POS_tags: List[str], i:int, 274 | all_nums_in_abstract: List[int], years_indices: List[int], 275 | patient_indices: List[int], window_size_for_years: int = 5, 276 | window_size_patient_mention: int = 4) -> dict: 277 | ''' 278 | Returns a dictionary of features for the token at position i, using 279 | the global (abstract) information provided in the given input 280 | lists. 281 | 282 | @TODO this function is a mess and should be rewritten. 283 | ''' 284 | ll_word, l_word, r_word, rr_word = "", "", "", "" 285 | 286 | l_POS, r_POS = "", "" 287 | t_word = abstract_tokens[i] 288 | 289 | if i > 1: 290 | ll_word = abstract_tokens[i-2].lower() 291 | else: 292 | ll_word = "BoS" 293 | 294 | if i > 0: 295 | l_word = abstract_tokens[i-1].lower() 296 | l_POS = POS_tags[i-1] 297 | else: 298 | l_word = "BoS" 299 | l_POS = "XX" # i.e., unknown 300 | 301 | if i < len(abstract_tokens)-2: 302 | rr_word = abstract_tokens[i+2].lower() 303 | else: 304 | r_word = "LoS" 305 | 306 | if i < len(abstract_tokens)-1: 307 | r_word = abstract_tokens[i+1].lower() 308 | r_POS = POS_tags[i+1] 309 | else: 310 | r_word = "LoS" 311 | r_POS = "XX" 312 | 313 | target_num = int(t_word) 314 | # Add a feature for being largest in document 315 | biggest_num_in_abstract = 0.0 316 | if target_num >= max(all_nums_in_abstract): 317 | biggest_num_in_abstract = 1.0 318 | 319 | # This feature encodes whether "year" or "years" is mentioned 320 | # within window_size_for_years tokens of the target (i) 321 | years_mention_within_window = 0.0 322 | lower_idx, upper_idx = get_window_indices(abstract_tokens, i, window_size_for_years) 323 | for year_idx in years_indices: 324 | if lower_idx < year_idx <= upper_idx: 325 | years_mention_within_window = 1.0 326 | break 327 | 328 | # Ditto the above, but for "patients" 329 | patients_mention_follows_within_window = 0.0 330 | _, upper_idx = get_window_indices(abstract_tokens, i, window_size_patient_mention) 331 | for patient_idx in patient_indices: 332 | if i < patient_idx <= upper_idx: 333 | patients_mention_follows_within_window = 1.0 334 | break 335 | 336 | # Some adhocery (craft feature engineering!) 337 | target_looks_like_a_year = 0.0 338 | lower_year, upper_year = 1940, 2020 # totally made up. 339 | if lower_year <= target_num <= upper_year: 340 | target_looks_like_a_year = 1.0 341 | 342 | return {"left_word":[ll_word, l_word], 343 | "right_word":[rr_word, r_word], 344 | "left_PoS":l_POS, "right_PoS":r_POS, 345 | "other_features":[biggest_num_in_abstract, years_mention_within_window, 346 | target_looks_like_a_year, 347 | patients_mention_follows_within_window]} 348 | 349 | 350 | def example(): 351 | from rrnlp.models import sample_size_extractor 352 | ss = sample_size_extractor.MLPSampleSizeClassifier() 353 | 354 | ab = '''Background: World Health Organization expert groups recommended mortality trials of four repurposed antiviral drugs - remdesivir, hydroxychloroquine, lopinavir, and interferon beta-1a - in patients hospitalized with coronavirus disease 2019 (Covid-19). 
Methods: We randomly assigned inpatients with Covid-19 equally between one of the trial drug regimens that was locally available and open control (up to five options, four active and the local standard of care). The intention-to-treat primary analyses examined in-hospital mortality in the four pairwise comparisons of each trial drug and its control (drug available but patient assigned to the same care without that drug). Rate ratios for death were calculated with stratification according to age and status regarding mechanical ventilation at trial entry. Results: At 405 hospitals in 30 countries, 11,330 adults underwent randomization; 2750 were assigned to receive remdesivir, 954 to hydroxychloroquine, 1411 to lopinavir (without interferon), 2063 to interferon (including 651 to interferon plus lopinavir), and 4088 to no trial drug. Adherence was 94 to 96% midway through treatment, with 2 to 6% crossover. In total, 1253 deaths were reported (median day of death, day 8; interquartile range, 4 to 14). The Kaplan-Meier 28-day mortality was 11.8% (39.0% if the patient was already receiving ventilation at randomization and 9.5% otherwise). Death occurred in 301 of 2743 patients receiving remdesivir and in 303 of 2708 receiving its control (rate ratio, 0.95; 95% confidence interval [CI], 0.81 to 1.11; P = 0.50), in 104 of 947 patients receiving hydroxychloroquine and in 84 of 906 receiving its control (rate ratio, 1.19; 95% CI, 0.89 to 1.59; P = 0.23), in 148 of 1399 patients receiving lopinavir and in 146 of 1372 receiving its control (rate ratio, 1.00; 95% CI, 0.79 to 1.25; P = 0.97), and in 243 of 2050 patients receiving interferon and in 216 of 2050 receiving its control (rate ratio, 1.16; 95% CI, 0.96 to 1.39; P = 0.11). No drug definitely reduced mortality, overall or in any subgroup, or reduced initiation of ventilation or hospitalization duration. Conclusions: These remdesivir, hydroxychloroquine, lopinavir, and interferon regimens had little or no effect on hospitalized patients with Covid-19, as indicated by overall mortality, initiation of ventilation, and duration of hospital stay. (Funded by the World Health Organization; ISRCTN Registry number, ISRCTN83971151; ClinicalTrials.gov number, NCT04315948.).''' 355 | 356 | ss.predict_for_ab(ab) 357 | 358 | 359 | ### 360 | # e.g. 361 | # 362 | # import sample_size_extractor 363 | # sample_size_bot = sample_size_extractor.MLPSampleSizeClassifier() 364 | # 365 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 366 | # "ab": """ Background: Current strategies for preventing severe acute 367 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 368 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 369 | # been proposed as a postexposure therapy to prevent coronavirus 370 | # disease 2019 (Covid-19), but definitive evidence is lacking. 371 | 372 | # Methods: We conducted an open-label, cluster-randomized trial 373 | # involving asymptomatic contacts of patients with 374 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 375 | # Spain. We randomly assigned clusters of contacts to the 376 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 377 | # once, followed by 400 mg daily for 6 days) or to the usual-care 378 | # group (which received no specific therapy). The primary outcome was 379 | # PCR-confirmed, symptomatic Covid-19 within 14 days. 
The secondary 380 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 381 | # Covid-19 or a positive PCR test regardless of symptoms. Adverse 382 | # events were assessed for up to 28 days.\n\nResults: The analysis 383 | # included 2314 healthy contacts of 672 index case patients with 384 | # Covid-19 who were identified between March 17 and April 28, 2020. A 385 | # total of 1116 contacts were randomly assigned to receive 386 | # hydroxychloroquine and 1198 to receive usual care. Results were 387 | # similar in the hydroxychloroquine and usual-care groups with respect 388 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 389 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 390 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 391 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 392 | # 17.8%, respectively). The incidence of adverse events was higher in 393 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 394 | # 5.9%), but no treatment-related serious adverse events were 395 | # reported.\n\nConclusions: Postexposure therapy with 396 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 397 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 398 | # case patient. (Funded by the crowdfunding campaign YoMeCorono and 399 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 400 | # """ 401 | # } 402 | # 403 | # sample_size = sample_size_bot.predict_for_doc(ti_abs) 404 | -------------------------------------------------------------------------------- /rrnlp/models/study_design_classifier.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module predicts the most likely study design for a given article. 3 | This version uses a series of simple logistic regression classifier. 4 | ''' 5 | 6 | import os 7 | import pickle 8 | from sklearn.feature_extraction.text import HashingVectorizer 9 | from sklearn.linear_model import SGDClassifier 10 | import rrnlp 11 | 12 | 13 | weights_path = rrnlp.models.weights_path 14 | doi = rrnlp.models.files_needed['study_design_classifier']['zenodo'] 15 | 16 | 17 | 18 | vec = HashingVectorizer(ngram_range=(1, 4), stop_words='english') 19 | 20 | 21 | 22 | 23 | study_designs = {"sr": "Systematic review", 24 | 'cohort': "Cohort study", 25 | "consensus": "Consensus statement", 26 | "ct": "Clinical trial (non-randomized)", 27 | "ct_protocol": "Clinical trial protocol", 28 | "guideline": "Clinical guideline", 29 | "qual": "Qualitative study", 30 | "rct": "Randomized controlled trial"} 31 | 32 | 33 | study_design_clfs = {} 34 | 35 | 36 | def get_models(): 37 | ''' Load in and return RCT model weights. 
''' 38 | 39 | models = {} 40 | 41 | for sd in study_designs.keys(): 42 | with open(os.path.join(weights_path, f"{doi}_{sd}_lr.pck"), 'rb') as f: 43 | models[sd] = pickle.load(f) 44 | 45 | return models 46 | 47 | 48 | class AbsStudyDesignBot: 49 | ''' Lightweight container class that holds study design model ''' 50 | def __init__(self, device=None): 51 | self.models = get_models() 52 | 53 | def predict_for_ab(self, ab: dict) -> float: 54 | ti_and_abs = ab['ti'] + ' ' + ab['ab'] 55 | ''' Predicts p(low risk of bias) for input abstract ''' 56 | 57 | probs = [] 58 | 59 | x = vec.transform([ti_and_abs]) 60 | for sd, clf in self.models.items(): 61 | pred = clf.predict_proba(x)[:,1] 62 | probs.append((sd, float(pred[0]))) 63 | 64 | out = {} 65 | 66 | most_likely = max(probs, key=lambda x: x[1]) 67 | if most_likely[1] >= 0.5: 68 | out['study_design'] = most_likely[0] 69 | else: 70 | out['study_design'] = 'unknown' 71 | 72 | for sd, pred in probs: 73 | out[f"prob_{sd}"] = pred 74 | out[f"is_{sd}"] = bool(pred >=0.5) 75 | 76 | 77 | return out 78 | 79 | 80 | ### 81 | # e.g. 82 | # 83 | # from rrnlp import study_design_classifier 84 | # sd_bot = study_design_classifier.AbsStudyDesignBot() 85 | 86 | # ti_abs = {"ti": 'A Cluster-Randomized Trial of Hydroxychloroquine for Prevention of Covid-19', 87 | # "ab": """ Background: Current strategies for preventing severe acute 88 | # respiratory syndrome coronavirus 2 (SARS-CoV-2) infection are 89 | # limited to nonpharmacologic interventions. Hydroxychloroquine has 90 | # been proposed as a postexposure therapy to prevent coronavirus 91 | # disease 2019 (Covid-19), but definitive evidence is lacking. 92 | 93 | # Methods: We conducted an open-label, cluster-randomized trial 94 | # involving asymptomatic contacts of patients with 95 | # polymerase-chain-reaction (PCR)-confirmed Covid-19 in Catalonia, 96 | # Spain. We randomly assigned clusters of contacts to the 97 | # hydroxychloroquine group (which received the drug at a dose of 800 mg 98 | # once, followed by 400 mg daily for 6 days) or to the usual-care 99 | # group (which received no specific therapy). The primary outcome was 100 | # PCR-confirmed, symptomatic Covid-19 within 14 days. The secondary 101 | # outcome was SARS-CoV-2 infection, defined by symptoms compatible with 102 | # Covid-19 or a positive PCR test regardless of symptoms. Adverse 103 | # events were assessed for up to 28 days.\n\nResults: The analysis 104 | # included 2314 healthy contacts of 672 index case patients with 105 | # Covid-19 who were identified between March 17 and April 28, 2020. A 106 | # total of 1116 contacts were randomly assigned to receive 107 | # hydroxychloroquine and 1198 to receive usual care. Results were 108 | # similar in the hydroxychloroquine and usual-care groups with respect 109 | # to the incidence of PCR-confirmed, symptomatic Covid-19 (5.7% and 110 | # 6.2%, respectively; risk ratio, 0.86 [95% confidence interval, 0.52 111 | # to 1.42]). In addition, hydroxychloroquine was not associated with a 112 | # lower incidence of SARS-CoV-2 transmission than usual care (18.7% and 113 | # 17.8%, respectively). The incidence of adverse events was higher in 114 | # the hydroxychloroquine group than in the usual-care group (56.1% vs. 115 | # 5.9%), but no treatment-related serious adverse events were 116 | # reported.\n\nConclusions: Postexposure therapy with 117 | # hydroxychloroquine did not prevent SARS-CoV-2 infection or 118 | # symptomatic Covid-19 in healthy persons exposed to a PCR-positive 119 | # case patient. 
(Funded by the crowdfunding campaign YoMeCorono and 120 | # others; BCN-PEP-CoV2 ClinicalTrials.gov number, NCT04304053.). 121 | # """ 122 | # } 123 | # pred_sd = sd_bot.predict_for_ab(ti_abs) 124 | 125 | 126 | -------------------------------------------------------------------------------- /rrnlp/models/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/__init__.py -------------------------------------------------------------------------------- /rrnlp/models/util/index_numbers.py: -------------------------------------------------------------------------------- 1 | # three million, two hundred and fourteen thousand, one hundred and twelve 2 | 3 | # 3276191 4 | 5 | import collections 6 | import re 7 | import timeit #for testing 8 | 9 | 10 | # TODO 11 | # improve handling of the word 'a' 12 | # ideally will operate as number 1 in front of hundred, thousand etc. 13 | # but not flag as a number otherwise 14 | 15 | class Indexer(): 16 | """ 17 | base class for various text taggers 18 | 19 | takes in text; main data structure is a list of tuples 20 | [tag, start, end] 21 | 22 | where: tag is any data type 23 | start and end are integers representing the start and end indices in the string 24 | """ 25 | 26 | def __init__(self): 27 | pass 28 | 29 | def tag(self, text): 30 | pass 31 | 32 | 33 | class WordTagger(Indexer): 34 | """ 35 | simple regular expression word tokenizer 36 | """ 37 | 38 | def tag(self, text): 39 | self.tags = self.get_words(text) 40 | 41 | def get_words(self, text): 42 | return [(m.group(), m.start(), m.end()) for m in re.finditer("([\.\,\;']|[a-z0-9]+)", text, re.IGNORECASE) if m.group() not in ['and', ',']] 43 | 44 | 45 | class NumberTagger(WordTagger): 46 | 47 | def __init__(self): 48 | self.load_numberwords() 49 | Indexer.__init__(self) 50 | 51 | def load_numberwords(self): 52 | self.numberwords = { 53 | # 'a': 1, 54 | 'one': 1, 55 | 'two': 2, 56 | 'three': 3, 57 | 'four': 4, 58 | 'five': 5, 59 | 'six': 6, 60 | 'seven': 7, 61 | 'eight': 8, 62 | 'nine': 9, 63 | 'ten': 10, 64 | 'eleven': 11, 65 | 'twelve': 12, 66 | 'thirteen': 13, 67 | 'fourteen': 14, 68 | 'fifteen': 15, 69 | 'sixteen': 16, 70 | 'seventeen':17, 71 | 'eighteen': 18, 72 | 'nineteen': 19, 73 | 'twenty': 20, 74 | 'thirty': 30, 75 | 'forty': 40, 76 | 'fifty': 50, 77 | 'sixty': 60, 78 | 'seventy': 70, 79 | 'eighty': 80, 80 | 'ninety': 90, 81 | 'hundred': 100, 82 | 'thousand': 1000, 83 | 'million': 1000000, 84 | 'billion': 1000000000, 85 | 'trillion': 1000000000000 86 | } 87 | 88 | def swap(self, text): 89 | """ 90 | returns string with number words replaced with digits 91 | """ 92 | text = re.sub(r"(?<=[0-9])[\s\,](?=[0-9])", "", text) 93 | tags = self.tag(text) 94 | # tags.sort(key=lambda (number, start, end): start) # get tags and sort by start index 95 | tags.sort(key=lambda indices: indices[1]) 96 | 97 | output_list = [] 98 | progress_index = 0 99 | 100 | for (number, start_index, end_index) in tags: 101 | output_list.append(text[progress_index:start_index]) # add the unedited string from the last marker up to the number 102 | output_list.append(str(number)) # add the string digits of the number 103 | progress_index = end_index # skip the marker forward to the end of the original number words 104 | 105 | output_list.append(text[progress_index:]) # if no tags, this will append the whole unchanged string 106 | 107 | return ''.join(output_list) 108 | 109 | 110 
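# Illustrative example (hypothetical input, not taken from the repo):
# swap() first deletes separators that appear between digits
# ("10,000" -> "10000") and then replaces runs of number words with their
# integer value, e.g.
#   NumberTagger().swap("we enrolled twenty four patients")
#   returns "we enrolled 24 patients"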
| def tag(self, text): 111 | """ 112 | produces a list of tuples (number, start_index, end_index) 113 | """ 114 | words = self.get_words(text) 115 | words.reverse() 116 | 117 | number_parts = [] 118 | number_parts_index = -1 119 | 120 | last_word_was_a_number = False 121 | 122 | # first get groups of consecutive numbers from the reversed word list 123 | 124 | 125 | 126 | for word, start, end in words: 127 | 128 | word_num = self.numberwords.get(word.lower()) 129 | 130 | if word_num is None: 131 | last_word_was_a_number = False 132 | else: 133 | if last_word_was_a_number == False: 134 | number_parts.append([]) 135 | number_parts_index += 1 136 | last_word_was_a_number = True 137 | 138 | number_parts[number_parts_index].append((word_num, start, end)) 139 | 140 | output = [] 141 | 142 | 143 | # then calculate the number for each part 144 | 145 | for number_part in number_parts: 146 | number = self.recursive_nums([word_num for word_num, start, end in number_part]) 147 | start = min([start for word_num, start, end in number_part]) 148 | end = max([end for word_num, start, end in number_part]) 149 | 150 | output.append((number, start, end)) 151 | return(output) 152 | 153 | def recursive_nums(self, numlist): 154 | 155 | # first split list up 156 | 157 | tens_index = 0 158 | tens = [100, 1000, 1000000, 1000000000, 1000000000000] 159 | 160 | current_multiplier = 1 161 | 162 | split_list = collections.defaultdict(list) 163 | 164 | for num in numlist: 165 | if num in tens[tens_index:]: 166 | tens_index = tens.index(num)+1 167 | current_multiplier = num 168 | else: 169 | split_list[current_multiplier].append(num) 170 | 171 | counter = 0 172 | 173 | # then sum up the component parts 174 | 175 | for multiplier, numbers in split_list.items(): 176 | # check if multiples of ten left 177 | 178 | for number in numbers: 179 | if number in tens: 180 | counter += multiplier * self.recursive_nums(numbers) 181 | break 182 | else: 183 | counter += multiplier * sum(numbers) 184 | 185 | return counter 186 | 187 | 188 | 189 | # counter = 0 190 | 191 | # for i, num in enumerate(numlist): 192 | # if num % 10 == 0: 193 | # counter += (num * recursive_nums(numlist[i+1:])) 194 | # else: 195 | # counter += num 196 | 197 | # return counter 198 | 199 | _swap_num = NumberTagger().swap 200 | def swap_num(text): 201 | return _swap_num(text) 202 | 203 | 204 | 205 | def test(t): 206 | b = t.tag("""Specific immunotherapy is still widely used in grass-pollen allergy, 207 | but its side effects may limit its use. We tested the safety and efficacy of a 208 | formalinized high-molecular-weight allergoid prepared from a mixed grass-pollen 209 | extract with two injection schedules in a double-blind, placebo-controlled study. 210 | Eighteen patients received placebo, 19 received the low-dose schedule (maximal 211 | dose: 2000 PNU) and 20 received the high-dose schedule (maximal dose: 10,000 PNU). 212 | Only one patient presented a systemic reaction of moderate severity for a dose 213 | of 1200 PNU. Before the onset of the pollen season, patients had a nasal challenge 214 | with orchard grass-pollen grains, a skin test titration, and the titration 215 | of serum-specific IgG. Both groups of patients presented a significant reduction in nasal 216 | and skin sensitivities and a significant increase in IgG compared to placebo. 
Symptoms and 217 | medications for rhinitis and asthma were studied during the season, and both groups receiving 218 | allergoids had a significant reduction of symptom-medication scores for nasal and bronchial 219 | symptoms. There was a highly significant correlation between nasal symptom-medication 220 | scores during the season and the results of nasal challenges. High-molecular-weight 221 | allergoids are safe and effective.""") 222 | 223 | 224 | 225 | 226 | def main(): 227 | 228 | t = NumberTagger() 229 | 230 | # b = t.swap("""Specific immunotherapy is still widely used in grass-pollen allergy, but its side effects may limit its use. We tested the safety and efficacy of a formalinized high-molecular-weight allergoid prepared from a mixed grass-pollen extract with two injection schedules in a double-blind, placebo-controlled study. Eighteen patients received placebo, 19 received the low-dose schedule (maximal dose: 2000 PNU) and 20 received the high-dose schedule (maximal dose: 10,000 PNU). Only one patient presented a systemic reaction of moderate severity for a dose of 1200 PNU. Before the onset of the pollen season, patients had a nasal challenge with orchard grass-pollen grains, a skin test titration, and the titration of serum-specific IgG. Both groups of patients presented a significant reduction in nasal and skin sensitivities and a significant increase in IgG compared to placebo. Symptoms and medications for rhinitis and asthma were studied during the season, and both groups receiving allergoids had a significant reduction of symptom-medication scores for nasal and bronchial symptoms. There was a highly significant correlation between nasal symptom-medication scores during the season and the results of nasal challenge High-molecular-weight allergoids are safe and effective.""") 231 | b = t.swap('three million, two hundred and fourteen thousand, one hundred and twelve') 232 | print (b) 233 | 234 | 235 | 236 | # three million, two hundred and fourteen thousand, one hundred and twelve 237 | # testnums = [12, 100, 1, 1000, 14, 100, 2, 1000000, 3] 238 | # testnums = [100, 2, 1000000, 40] 239 | # output 3214112 240 | # testnums = [1, 90, 100, 1, 1000, 6 ,70, 100, 2, 1000000, 3] 241 | 242 | # testnums = [6, 70, 100, 2] 243 | 244 | # testnums = [1, 90] 245 | # testanswer = 3276191 246 | 247 | # print recursive_nums(testnums) 248 | 249 | 250 | if __name__ == '__main__': 251 | main() 252 | -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/minimap/__init__.py -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/cui_to_mh.pck: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/minimap/cui_to_mh.pck -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/cui_to_mh_supp.pck: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/minimap/cui_to_mh_supp.pck -------------------------------------------------------------------------------- 
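The minimap module that follows maps free-text spans to MeSH terms using the bundled `str_to_cui*.pck` and `cui_to_mh*.pck` lookup tables. A minimal usage sketch, assuming the package is installed together with these pickles and the scispacy pipeline loaded by `rrnlp.models.encoder`:

```python
from rrnlp.models.util.minimap import minimap

# Tag MeSH concepts in a snippet of text (example sentence is made up).
matches = minimap.minimap("Patients with type 2 diabetes received metformin.")
for m in matches:
    # Each match dict includes, among other fields, the matched span of the
    # input text and the corresponding UMLS CUI.
    print(m["source_text"], m["cui"])

# Collapse matches from several strings into a de-duplicated list of terms.
unique_terms = minimap.get_unique_terms(["type 2 diabetes", "metformin therapy"])
```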
/rrnlp/models/util/minimap/ignorelist.txt: -------------------------------------------------------------------------------- 1 | we 2 | a 3 | suffer 4 | suffering 5 | power 6 | aged 7 | care 8 | practice 9 | period -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/minimap.py: -------------------------------------------------------------------------------- 1 | # 2 | # minimap 3 | # 4 | import os 5 | import pickle 6 | 7 | import spacy 8 | from spacy.tokens import Doc 9 | from itertools import chain 10 | 11 | import rrnlp 12 | from rrnlp.models import encoder 13 | nlp = encoder.nlp 14 | 15 | minimap_path = os.path.join(os.path.dirname(rrnlp.__file__), 16 | "models", "util", "minimap") 17 | 18 | 19 | 20 | 21 | #nlp = spacy.load("en_core_web_sm") 22 | 23 | # ignore list 24 | with open(os.path.join(minimap_path, 'ignorelist.txt'), 'r') as f: 25 | ignores = set((l.strip() for l in f)) 26 | 27 | 28 | with open(os.path.join(minimap_path, 'str_to_cui.pck'), 'rb') as f: 29 | str_to_cui = pickle.load(f) 30 | 31 | 32 | with open(os.path.join(minimap_path, 'cui_to_mh.pck'), 'rb') as f: 33 | cui_to_mh = pickle.load(f) 34 | 35 | 36 | # add manual extras 37 | with open(os.path.join(minimap_path, 'str_to_cui_supp.pck'), 'rb') as f: 38 | str_to_cui_supp = pickle.load(f) 39 | str_to_cui.update(str_to_cui_supp) 40 | 41 | 42 | with open(os.path.join(minimap_path, 'cui_to_mh_supp.pck'), 'rb') as f: 43 | cui_to_mh_supp = pickle.load(f) 44 | cui_to_mh.update(cui_to_mh_supp) 45 | 46 | 47 | 48 | # some extra filtering rules to improve precision 49 | 50 | drop_terms = set() 51 | 52 | for k, v in str_to_cui.items(): 53 | # strings which are too ambiguous (too many CUIs... 15 from experimentation) 54 | if len(set(v))>15: 55 | drop_terms.add(k) 56 | 57 | 58 | for k, v in str_to_cui.items(): 59 | # strings which are too short to be informative (2 chars or less tends to generate nonsense CUIs) 60 | if len(k)<=2: 61 | drop_terms.add(k) 62 | 63 | for t in drop_terms: 64 | str_to_cui.pop(t) 65 | 66 | 67 | # regular expressions and text processing functions 68 | 69 | import re 70 | 71 | with open(os.path.join(minimap_path,'prepositions_conjunctions.txt'), 'r') as f: 72 | prep_conj = [l.strip() for l in f] 73 | 74 | prep_conj_re = re.compile(r'\b({})\b'.format('|'.join(prep_conj))) 75 | nos_ignore = re.compile(r'\bNOS\b') # note do after lowercase 76 | pos_ignore = re.compile(r"(?<=\w)(\'s?)\b") 77 | left_paren = re.compile(r"^\[(X|V|D|M|EDTA|SO|Q)\]") 78 | paren = re.compile(r"[\(\[]\w+[\)\]]") 79 | strip_space = re.compile(r"\s+") 80 | 81 | def remove_nos(text): 82 | return nos_ignore.sub(' ', text) 83 | 84 | def remove_pos(text): 85 | return pos_ignore.sub('', text) 86 | 87 | def syn_uninv(text): 88 | try: 89 | inversion_point = text.index(', ') 90 | except ValueError: 91 | # not found 92 | return text 93 | 94 | if inversion_point+2 == len(text): 95 | # i.e. 
if the ', ' is at the end of the string 96 | return text 97 | 98 | if prep_conj_re.search(text[inversion_point+2:]): 99 | return text 100 | else: 101 | return text[inversion_point+2:] + " " + text[:inversion_point] 102 | 103 | def ne_parentheticals(text_str): 104 | text_str = left_paren.sub('', text_str) 105 | text_str = paren.sub('', text_str) 106 | return text_str 107 | 108 | def get_lemma(t): 109 | if t.text in exceptions: 110 | return exceptions[t.text] 111 | else: 112 | return t.lemma_ 113 | 114 | # pipelines 115 | 116 | def minimap(text_str, chunks=False, abbrevs=None): 117 | return matcher(pipeline(text_str, umls_mode=False, abbrevs=abbrevs), chunks=chunks) 118 | 119 | 120 | def pipeline(text_str, umls_mode=True, abbrevs=None): 121 | 122 | # sub out abbreviations if abbreviation dict given 123 | if abbrevs: 124 | for abbrev, expansion in abbrevs.items(): 125 | try: 126 | text_str = re.sub(r"\b" + re.escape(abbrev) + r"\b", expansion, text_str) 127 | 128 | except: 129 | print(f"Regex error caused for one abstract! (for text string '{text_str}')") 130 | print(f"and abbreviation dictionary '{abbrevs}'") 131 | # to avoid weird errors in abbreviations generating error causing regex strings (which are not causing a named exception) 132 | continue 133 | 134 | # 1. removal of parentheticals 135 | # if umls_mode: 136 | text_str = ne_parentheticals(text_str) 137 | 138 | # hyphens to spaces 139 | text_str = text_str.replace('-', ' ') 140 | # 3. conversion to lowercase 141 | # text_str = text_str.lower() 142 | # 2. syntactic uninverstion 143 | if umls_mode: 144 | text_str = syn_uninv(text_str) 145 | # 4. stripping of possessives 146 | text_str = remove_pos(text_str) 147 | # strip NOS's 148 | if umls_mode: 149 | text_str = remove_nos(text_str) 150 | # last... 
remove any multiple spaces, or starting/ending with space 151 | text_str = strip_space.sub(' ', text_str) 152 | text_str = text_str.strip() 153 | return text_str 154 | 155 | 156 | 157 | from itertools import chain 158 | 159 | 160 | def matcher(text, chunks=False): 161 | doc = nlp(text.lower()) 162 | 163 | if chunks: 164 | return list(chain.from_iterable(matcher(np.text, chunks=False) for np in doc.noun_chunks)) 165 | tokens = [t.text.lower() for t in doc] 166 | lemmas = [t.lemma_ for t in doc if t.text.lower()] 167 | lemmas = [l for l in lemmas if l != '-PRON-'] 168 | 169 | 170 | matches = [] 171 | max_len = len(doc) 172 | window = max_len 173 | 174 | 175 | while window: 176 | 177 | for i in range(max_len - window + 1): 178 | window_text = ' '.join(tokens[i:i+window]) 179 | window_lemma = ' '.join(lemmas[i:i+window]) 180 | 181 | 182 | if window_lemma and window_lemma in str_to_cui and window_lemma not in ignores and window_text \ 183 | not in nlp.Defaults.stop_words: 184 | 185 | 186 | for entry in str_to_cui[window_lemma]: 187 | mh = cui_to_mh[entry].copy() 188 | mh['start_idx'] = i 189 | mh['end_idx'] = i+window 190 | mh['source_text'] = doc[mh['start_idx']:mh['end_idx']].text 191 | matches.append(mh) 192 | 193 | window -= 1 194 | 195 | 196 | 197 | matches.sort(key=lambda x: (x['start_idx'], -x['end_idx'])) 198 | 199 | 200 | 201 | filtered_terms = [] 202 | 203 | right_border = 0 204 | for match in matches: 205 | if match['start_idx'] >= right_border: 206 | filtered_terms.append(match) 207 | right_border = match['end_idx'] 208 | 209 | return filtered_terms 210 | 211 | 212 | def get_unique_terms(l, abbrevs=None): 213 | 214 | terms = [minimap(s, abbrevs=abbrevs) for s in l] 215 | flat_terms = [item for sublist in terms for item in sublist] 216 | encountered_terms = set() 217 | unique_terms = [] 218 | for term in flat_terms: 219 | if term['cui'] not in encountered_terms: 220 | term.pop('start_idx') 221 | term.pop('end_idx') 222 | term.pop('source_text') 223 | unique_terms.append(term) 224 | encountered_terms.add(term['cui']) 225 | return unique_terms 226 | 227 | -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/prepositions_conjunctions.txt: -------------------------------------------------------------------------------- 1 | aboard 2 | about 3 | above 4 | across 5 | after 6 | against 7 | along 8 | alongside 9 | amid 10 | among 11 | amongst 12 | apropos 13 | around 14 | as 15 | astride 16 | at 17 | bar 18 | before 19 | behind 20 | below 21 | beneath 22 | beside 23 | besides 24 | between 25 | beyond 26 | but 27 | by 28 | circa 29 | come 30 | despite 31 | down 32 | during 33 | except 34 | for 35 | from 36 | in 37 | inside 38 | into 39 | less 40 | like 41 | minus 42 | near 43 | nearer 44 | nearest 45 | notwithstanding 46 | of 47 | off 48 | on 49 | onto 50 | opposite 51 | out 52 | outside 53 | over 54 | past 55 | per 56 | plus 57 | post 58 | pre 59 | pro 60 | re 61 | sans 62 | save 63 | short 64 | since 65 | than 66 | through 67 | throughout 68 | till 69 | to 70 | toward 71 | towards 72 | under 73 | underneath 74 | unlike 75 | until 76 | unto 77 | up 78 | upon 79 | upside 80 | versus 81 | vs 82 | v 83 | via 84 | with 85 | within 86 | without 87 | worth 88 | and 89 | but 90 | because 91 | although 92 | or 93 | provided that 94 | as long as 95 | in order that 96 | in order to 97 | nor 98 | but also -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/str_to_cui.pck: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/minimap/str_to_cui.pck -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/str_to_cui_supp.pck: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/minimap/str_to_cui_supp.pck -------------------------------------------------------------------------------- /rrnlp/models/util/minimap/subtrees.pck: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/util/minimap/subtrees.pck -------------------------------------------------------------------------------- /rrnlp/models/util/schwartz_hearst.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import regex 3 | import sys 4 | 5 | """ 6 | A Python 3 refactoring of Vincent Van Asch's Python 2 code at 7 | 8 | http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py 9 | 10 | Based on 11 | 12 | A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text 13 | A. Schwartz and M. Hearst 14 | Biocomputing, 2003, pp 451-462. 15 | 16 | """ 17 | 18 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class Candidate(str): 23 | def __init__(self, value): 24 | super().__init__() 25 | self.start = 0 26 | self.stop = 0 27 | 28 | def set_position(self, start, stop): 29 | self.start = start 30 | self.stop = stop 31 | 32 | 33 | def yield_lines_from_file(file_path): 34 | with open(file_path, 'rb') as f: 35 | for line in f: 36 | try: 37 | line = line.decode('utf-8') 38 | except UnicodeDecodeError: 39 | line = line.decode('latin-1').encode('utf-8').decode('utf-8') 40 | line = line.strip() 41 | yield line 42 | 43 | 44 | def yield_lines_from_doc(doc_text): 45 | for line in doc_text.split("\n"): 46 | yield line.strip() 47 | 48 | 49 | def best_candidates(sentence): 50 | """ 51 | :param sentence: line read from input file 52 | :return: a Candidate iterator 53 | """ 54 | 55 | if '(' in sentence: 56 | # Check some things first 57 | if sentence.count('(') != sentence.count(')'): 58 | raise ValueError("Unbalanced parentheses: {}".format(sentence)) 59 | 60 | if sentence.find('(') > sentence.find(')'): 61 | raise ValueError("First parentheses is right: {}".format(sentence)) 62 | 63 | closeindex = -1 64 | while 1: 65 | # Look for open parenthesis 66 | openindex = sentence.find('(', closeindex + 1) 67 | 68 | if openindex == -1: break 69 | 70 | # Look for closing parentheses 71 | closeindex = openindex + 1 72 | open = 1 73 | skip = False 74 | while open: 75 | try: 76 | char = sentence[closeindex] 77 | except IndexError: 78 | # We found an opening bracket but no associated closing bracket 79 | # Skip the opening bracket 80 | skip = True 81 | break 82 | if char == '(': 83 | open += 1 84 | elif char in [')', ';', ':']: 85 | open -= 1 86 | closeindex += 1 87 | 88 | if skip: 89 | closeindex = openindex + 1 90 | continue 91 | 92 | # Output if conditions are met 93 | start = openindex + 1 94 | stop = closeindex - 1 95 | candidate = sentence[start:stop] 96 | 97 | # Take into account whitespace that should be removed 
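# (The arithmetic just below shifts the recorded start/stop offsets inward
# by the number of leading/trailing whitespace characters, so the stored
# candidate span excludes that padding.)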
98 | start = start + len(candidate) - len(candidate.lstrip()) 99 | stop = stop - len(candidate) + len(candidate.rstrip()) 100 | candidate = sentence[start:stop] 101 | 102 | if conditions(candidate): 103 | new_candidate = Candidate(candidate) 104 | new_candidate.set_position(start, stop) 105 | yield new_candidate 106 | 107 | 108 | def conditions(candidate): 109 | """ 110 | Based on Schwartz&Hearst 111 | 112 | 2 <= len(str) <= 10 113 | len(tokens) <= 2 114 | re.search('\p{L}', str) 115 | str[0].isalnum() 116 | 117 | and extra: 118 | if it matches (\p{L}\.?\s?){2,} 119 | it is a good candidate. 120 | 121 | :param candidate: candidate abbreviation 122 | :return: True if this is a good candidate 123 | """ 124 | viable = True 125 | if regex.match('(\p{L}\.?\s?){2,}', candidate.lstrip()): 126 | viable = True 127 | if len(candidate) < 2 or len(candidate) > 10: 128 | viable = False 129 | if len(candidate.split()) > 2: 130 | viable = False 131 | if not regex.search('\p{L}', candidate): 132 | viable = False 133 | if not candidate[0].isalnum(): 134 | viable = False 135 | 136 | return viable 137 | 138 | 139 | def get_definition(candidate, sentence): 140 | """ 141 | Takes a candidate and a sentence and returns the definition candidate. 142 | 143 | The definintion candidate is the set of tokens (in front of the candidate) 144 | that starts with a token starting with the first character of the candidate 145 | 146 | :param candidate: candidate abbreviation 147 | :param sentence: current sentence (single line from input file) 148 | :return: candidate definition for this abbreviation 149 | """ 150 | # Take the tokens in front of the candidate 151 | tokens = regex.split(r'[\s\-]', sentence[:candidate.start - 2].lower()) 152 | # the char that we are looking for 153 | key = candidate[0].lower() 154 | 155 | # Count the number of tokens that start with the same character as the candidate 156 | firstchars = [t[0] for t in tokens] 157 | 158 | definition_freq = firstchars.count(key) 159 | candidate_freq = candidate.lower().count(key) 160 | 161 | # Look for the list of tokens in front of candidate that 162 | # have a sufficient number of tokens starting with key 163 | if candidate_freq <= definition_freq: 164 | # we should at least have a good number of starts 165 | count = 0 166 | start = 0 167 | startindex = len(firstchars) - 1 168 | while count < candidate_freq: 169 | if abs(start) > len(firstchars): 170 | raise ValueError("candiate {} not found".format(candidate)) 171 | start -= 1 172 | # Look up key in the definition 173 | try: 174 | startindex = firstchars.index(key, len(firstchars) + start) 175 | except ValueError: 176 | pass 177 | 178 | # Count the number of keys in definition 179 | count = firstchars[startindex:].count(key) 180 | 181 | # We found enough keys in the definition so return the definition as a definition candidate 182 | start = len(' '.join(tokens[:startindex])) 183 | stop = candidate.start - 1 184 | candidate = sentence[start:stop] 185 | 186 | # Remove whitespace 187 | start = start + len(candidate) - len(candidate.lstrip()) 188 | stop = stop - len(candidate) + len(candidate.rstrip()) 189 | candidate = sentence[start:stop] 190 | 191 | new_candidate = Candidate(candidate) 192 | new_candidate.set_position(start, stop) 193 | return new_candidate 194 | 195 | else: 196 | raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate') 197 | 198 | 199 | def select_definition(definition, abbrev): 200 | """ 201 | Takes a definition candidate and an abbreviation 
candidate 202 | and returns True if the chars in the abbreviation occur in the definition 203 | 204 | Based on 205 | A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst 206 | :param definition: candidate definition 207 | :param abbrev: candidate abbreviation 208 | :return: 209 | """ 210 | 211 | if len(definition) < len(abbrev): 212 | raise ValueError('Abbreviation is longer than definition') 213 | 214 | if abbrev in definition.split(): 215 | raise ValueError('Abbreviation is full word of definition') 216 | 217 | sindex = -1 218 | lindex = -1 219 | 220 | while 1: 221 | try: 222 | longchar = definition[lindex].lower() 223 | except IndexError: 224 | raise 225 | 226 | shortchar = abbrev[sindex].lower() 227 | 228 | if not shortchar.isalnum(): 229 | sindex -= 1 230 | 231 | if sindex == -1 * len(abbrev): 232 | if shortchar == longchar: 233 | if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum(): 234 | break 235 | else: 236 | lindex -= 1 237 | else: 238 | lindex -= 1 239 | if lindex == -1 * (len(definition) + 1): 240 | raise ValueError("definition {} was not found in {}".format(abbrev, definition)) 241 | 242 | else: 243 | if shortchar == longchar: 244 | sindex -= 1 245 | lindex -= 1 246 | else: 247 | lindex -= 1 248 | 249 | new_candidate = Candidate(definition[lindex:len(definition)]) 250 | new_candidate.set_position(definition.start, definition.stop) 251 | definition = new_candidate 252 | 253 | tokens = len(definition.split()) 254 | length = len(abbrev) 255 | 256 | if tokens > min([length + 5, length * 2]): 257 | raise ValueError("did not meet min(|A|+5, |A|*2) constraint") 258 | 259 | # Do not return definitions that contain unbalanced parentheses 260 | if definition.count('(') != definition.count(')'): 261 | raise ValueError("Unbalanced parentheses not allowed in a definition") 262 | 263 | return definition 264 | 265 | 266 | def extract_abbreviation_definition_pairs(file_path=None, doc_text=None): 267 | abbrev_map = dict() 268 | omit = 0 269 | written = 0 270 | if file_path: 271 | sentence_iterator = enumerate(yield_lines_from_file(file_path)) 272 | elif doc_text: 273 | sentence_iterator = enumerate(yield_lines_from_doc(doc_text)) 274 | else: 275 | return abbrev_map 276 | 277 | for i, sentence in sentence_iterator: 278 | try: 279 | for candidate in best_candidates(sentence): 280 | try: 281 | definition = get_definition(candidate, sentence) 282 | except (ValueError, IndexError) as e: 283 | log.debug("{} Omitting candidate {}. Reason: {}".format(i, candidate, e.args[0])) 284 | omit += 1 285 | else: 286 | try: 287 | definition = select_definition(definition, candidate) 288 | except (ValueError, IndexError) as e: 289 | log.debug("{} Omitting definition {} for candidate {}. 
Reason: {}".format(i, definition, candidate, e.args[0])) 290 | omit += 1 291 | else: 292 | abbrev_map[candidate] = definition 293 | written += 1 294 | except (ValueError, IndexError) as e: 295 | log.debug("{} Error processing sentence {}: {}".format(i, sentence, e.args[0])) 296 | log.debug("{} abbreviations detected and kept ({} omitted)".format(written, omit)) 297 | return abbrev_map 298 | 299 | 300 | if __name__ == '__main__': 301 | print(extract_abbreviation_definition_pairs(file_path=sys.argv[1])) 302 | -------------------------------------------------------------------------------- /rrnlp/models/weights/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bwallace/RRnlp/e1a26b4ed1c8d65f2c2e2558dc9f0918572306d0/rrnlp/models/weights/.gitkeep -------------------------------------------------------------------------------- /rrnlp/models/weights/weights_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "PICO_tagger": 3 | {"zenodo": "5139998", 4 | "files": ["interventions_clf.pt", "population_clf.pt", "outcomes_clf.pt", 5 | "interventions_encoder_custom.pt", "outcomes_encoder_custom.pt", "population_encoder_custom.pt"]}, 6 | "ev_inf_classifier": {"zenodo": "5139998", 7 | "files": ["evidence_identification_clf.pt", "inference_clf.pt", 8 | "evidence_identification_encoder_custom.pt", "inference_encoder_custom.pt"]}, 9 | "RCT_classifier": {"zenodo": "5139998", 10 | "files": ["RCT_overall_abs_clf.pt", "RCT_encoder_custom.pt", "bert_LR.pck"]}, 11 | "RoB_classifier_LR": {"zenodo": "5139998", 12 | "files": ["bias_prob_clf.pck"]}, 13 | "RoB_classifier": {"zenodo": "5139998", 14 | "files": ["RoB_overall_abs_clf.pt", "RoB_encoder_custom.pt"]}, 15 | "sample_size_extractor": {"zenodo": "5139998", 16 | "files": ["sample_size_weights.pt", "RoB_encoder_custom.pt", "PubMed-w2v.bin"]}, 17 | "study_design_classifier": 18 | {"zenodo": "5939332", 19 | "files": ["cohort_lr.pck", "consensus_lr.pck", "ct_lr.pck", "ct_protocol_lr.pck", "guideline_lr.pck", 20 | "qual_lr.pck", "rct_lr.pck", "sr_lr.pck"] 21 | } 22 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | path_to_weights = os.path.join('rrnlp', 'models', 'weights') 5 | path_to_minimap = os.path.join('rrnlp', 'models', 'util', 'minimap') 6 | 7 | def package_files(directory): 8 | paths = [] 9 | for (path, directories, filenames) in os.walk(directory): 10 | for filename in filenames: 11 | paths.append(os.path.join('..', path, filename)) 12 | return paths 13 | 14 | extra_files = ["../requirements.txt", 15 | os.path.join("../", path_to_weights, "weights_manifest.json")] 16 | extra_files.extend(package_files(path_to_minimap)) 17 | 18 | with open('requirements.txt') as f: 19 | required = f.read().splitlines() 20 | 21 | setup(name='rrnlp', 22 | version='1.0.3', 23 | description='NLP for EBM', 24 | url='https://github.com/bwallace/RRnlp', 25 | author='Byron Wallace, Iain Marshall', 26 | author_email='b.wallace@northeastern.edu', 27 | license='MIT', 28 | packages=find_packages(), 29 | package_data={'': extra_files}, 30 | install_requires=required, 31 | zip_safe=False) 32 | --------------------------------------------------------------------------------
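Finally, `weights_manifest.json` above records, for each model, the Zenodo deposit its serialized weights come from and the individual files required; the model modules then look those files up under `rrnlp.models.weights_path` with the deposit id prefixed to the file name (e.g. `f"{doi}_sample_size_weights.pt"` in `sample_size_extractor.py`). A small sketch of resolving those paths, assuming `weights_path` points at the packaged `rrnlp/models/weights/` directory (these attributes are defined in `rrnlp/models/__init__.py`, whose contents are not shown above):

```python
import json
import os

import rrnlp

# Assumption: weights_manifest.json ships inside the directory that
# rrnlp.models.weights_path points at, as the model modules suggest.
manifest_file = os.path.join(rrnlp.models.weights_path, "weights_manifest.json")
with open(manifest_file) as f:
    manifest = json.load(f)

for model_name, entry in manifest.items():
    for fname in entry["files"]:
        # Local weight files are prefixed with the Zenodo deposit id,
        # mirroring e.g. f"{doi}_sample_size_weights.pt" above.
        print(model_name,
              os.path.join(rrnlp.models.weights_path, f"{entry['zenodo']}_{fname}"))
```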