├── rutransform ├── constraints │ ├── constraints │ │ ├── __init__.py │ │ ├── named_entities.py │ │ ├── multihop.py │ │ ├── jeopardy.py │ │ └── referents.py │ ├── __init__.py │ ├── utils.py │ └── constraint.py ├── transformations │ ├── transformers │ │ ├── __init__.py │ │ ├── bae.py │ │ ├── spelling.py │ │ ├── eda.py │ │ ├── paraphrasis.py │ │ └── distraction.py │ ├── transformations │ │ ├── __init__.py │ │ ├── bae │ │ │ ├── __init__.py │ │ │ └── bae.py │ │ ├── eda │ │ │ ├── __init__.py │ │ │ └── eda.py │ │ ├── distraction │ │ │ ├── __init__.py │ │ │ └── sentence_additions.py │ │ ├── paraphrasis │ │ │ ├── __init__.py │ │ │ ├── paraphraser.py │ │ │ └── back_translation.py │ │ └── spelling │ │ │ ├── __init__.py │ │ │ ├── change_char_case.py │ │ │ ├── butter_fingers.py │ │ │ └── emojify.py │ ├── __init__.py │ ├── load_transformers.py │ ├── README.md │ ├── utils.py │ └── dataset_transformer.py ├── utils │ ├── __init__.py │ └── args.py └── __init__.py ├── requirements.txt ├── test_data ├── openbook.json ├── rwsd.json ├── winograd.json ├── worldtree.json ├── chegeka.json ├── danet_qa.json ├── sit_ethics.json ├── per_ethics.json └── multiq.json ├── setup.py ├── LICENSE └── README.md /rutransform/constraints/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rutransform/transformations/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rutransform/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import args 2 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rutransform/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import utils, constraints, transformations 2 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/bae/__init__.py: -------------------------------------------------------------------------------- 1 | from .bae import * 2 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/eda/__init__.py: -------------------------------------------------------------------------------- 1 | from .eda import * 2 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/distraction/__init__.py: -------------------------------------------------------------------------------- 1 | from .sentence_additions import * 2 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/paraphrasis/__init__.py: -------------------------------------------------------------------------------- 1 | from .back_translation import * 2 | from .paraphraser import * 3 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/spelling/__init__.py: -------------------------------------------------------------------------------- 1 | from .butter_fingers import * 2 | from .change_char_case import * 3 | from .emojify import * 4 | -------------------------------------------------------------------------------- /rutransform/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | from .constraint import Constraint 2 | from .utils import * 3 | from .constraints.jeopardy import * 4 | from .constraints.named_entities import * 5 | from .constraints.referents import * 6 | from .constraints.multihop import * 7 | -------------------------------------------------------------------------------- /rutransform/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from .transformations import * 3 | 4 | from .transformer import Transformer 5 | from .transformers.distraction import * 6 | from .transformers.bae import * 7 | from .transformers.eda import * 8 | from .transformers.paraphrasis import * 9 | from .transformers.spelling import * 10 | 11 | from .load_transformers import * 12 | 13 | from .dataset_transformer import * 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bert-score==0.3.11 2 | datasets==1.10.0 3 | nltk==3.6.7 4 | numpy>=1.19.2 5 | pandas>=1.0.1 6 | textattack==0.3.4 7 | tensorflow_text==2.7.3 8 | torch==1.10.2 9 | torchtext==0.11.2 10 | transformers==4.12.2 11 | tqdm==4.49.0 12 | sentencepiece==0.1.96 13 | scikit-multilearn==0.2.0 14 | spacy>=3.2.0 15 | russian_paraphrasers==0.0.3 16 | razdel==0.5.0 17 | rusenttokenize==0.0.5 18 | nlg-eval @ git+https://github.com/Maluuba/nlg-eval.git@master 19 | ru_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz -------------------------------------------------------------------------------- /test_data/openbook.json: -------------------------------------------------------------------------------- 1 | {"ID":"14-1486","question":"Когда вы нажимаете на тормоз в машине, из-за трения (A) машина летит (B) 
скорость не меняется (C) скорость падает (D) скорость увеличивается","answer":"C"} 2 | {"ID":"8-430","question":"Несмотря на то, что некоторые думают, наша планета вращается вокруг этого объекта (A) плутон (B) луна (C) млечный путь (D) солнце","answer":"D"} 3 | {"ID":"828","question":"Люди худеют, когда (A) им не хватает еды (B) они слишком много едят (C) они переедают (D) они едят пищу","answer":"A"} 4 | {"ID":"14-788","question":"Что сделано из минералов? (A) Стоунхендж (B) автомобильные двигатели (C) спирт (D) электричество","answer":"A"} 5 | {"ID":"7-1160","question":"Карелия - край гигантских озер, где (A) ледники вырывали озера (B) много купающихся (C) ледники плавают в озерах (D) озера заполнены угрями","answer":"A"} 6 | -------------------------------------------------------------------------------- /test_data/rwsd.json: -------------------------------------------------------------------------------- 1 | {"idx":253,"target":{"span1_text":"статью","span2_text":"читает ее","span1_index":7,"span2_index":9},"label":false,"text":"Сара взяла в библиотеке книгу, чтобы написать статью. Она читает ее, когда приходит с работы."} 2 | {"idx":326,"target":{"span1_text":"Фред","span2_text":"он вернулся","span1_index":0,"span2_index":10},"label":false,"text":"Фред смотрел телевизор, пока Джордж выходил купить продуктов. Через час он вернулся."} 3 | {"idx":377,"target":{"span1_text":"печенья с шоколадной крошкой","span2_text":"больше их","span1_index":3,"span2_index":16},"label":false,"text":"Всем понравились овсяные печенья, и только некоторым – печенья с шоколадной крошкой. В следующий раз надо больше их сделать."} 4 | {"idx":8,"target":{"span1_text":"Женя","span2_text":"она получила","span1_index":0,"span2_index":6},"label":true,"text":"Женя поблагодарила Сашу за помощь, которую она получила."} 5 | {"idx":475,"target":{"span1_text":"Донной","span2_text":"ее сосредоточенность","span1_index":3,"span2_index":5},"label":true,"text":"Лили заговорила с Донной, нарушив ее сосредоточенность."} -------------------------------------------------------------------------------- /rutransform/constraints/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from spacy.language import Language 3 | 4 | 5 | def parse_reference(text: str, spacy_model: Language) -> List[str]: 6 | """ 7 | Extract morphological features of the antecedents 8 | 9 | Parameters 10 | ---------- 11 | text: str 12 | anaphor 13 | spacy_model: spacy.language.Language 14 | spacy model to be used for morphological analysis 15 | 16 | Returns 17 | ------- 18 | List[str] 19 | Number and/or Gender of the anaphor parameter strings for Matcher 20 | """ 21 | out = {} 22 | morph = spacy_model(text)[0].morph 23 | case = morph.get("Case") 24 | if len(case) > 0: 25 | case = case[0] 26 | out["case"] = f"Case={case}" 27 | gender = morph.get("Gender") 28 | if len(gender) > 0: 29 | gender = gender[0] 30 | out["gender"] = f"Gender={gender}" 31 | number = morph.get("Number") 32 | if len(number) > 0: 33 | number = number[0] 34 | out["number"] = f"Number={number}" 35 | return out 36 | -------------------------------------------------------------------------------- /rutransform/transformations/load_transformers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pkgutil 3 | from pathlib import Path 4 | from importlib import import_module 5 | import inspect 6 | from rutransform.transformations import Transformer 7 | from 
rutransform.transformations.utils import SentenceOperation 8 | 9 | 10 | def load_transformers(): 11 | search = "transformers" 12 | package_dir = Path(__file__).resolve() 13 | transformations_dir = package_dir.parent.joinpath(search) 14 | a = pkgutil.iter_modules(path=[transformations_dir]) 15 | 16 | transform_dict = {} 17 | for (_, folder, _) in a: 18 | 19 | t = import_module(f"rutransform.transformations.transformers.{folder}") 20 | 21 | for name, obj in inspect.getmembers(t): 22 | if ( 23 | inspect.isclass(obj) 24 | and issubclass(obj, Transformer) 25 | and not issubclass(obj, SentenceOperation) 26 | ): 27 | try: 28 | info = obj.transform_info().items() 29 | for transformation, _ in info: 30 | transform_dict[transformation] = obj 31 | except NotImplementedError: 32 | pass 33 | 34 | return transform_dict 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | import glob 3 | import os 4 | 5 | 6 | def process_requirements(fname): 7 | path = os.path.join(os.path.dirname(__file__), fname) 8 | with open(path, "r", encoding="utf-8") as f: 9 | requirements = f.read() 10 | processed_requirements = [x for x in requirements.strip().split("\n")] 11 | return processed_requirements 12 | 13 | 14 | with open("README.md", "r") as fh: 15 | long_description = fh.read() 16 | 17 | 18 | setuptools.setup( 19 | name="rutransform", 20 | version="0.0.1", 21 | author="evtaktasheva", 22 | author_email="evtaktasheva@gmail.com", 23 | description="Adversarial text perturbation framework for Russian", 24 | long_description=long_description, 25 | long_description_content_type="text/markdown", 26 | url="https://github.com/RussianNLP/rutransform", 27 | packages=setuptools.find_packages(), 28 | package_data={"": ["*.json"]}, 29 | include_package_data=True, 30 | license='Apache License 2.0', 31 | classifiers=[ 32 | "Programming Language :: Python :: 3", 33 | "License :: OSI Approved :: Apache Software License", 34 | "Operating System :: OS Independent", 35 | ], 36 | install_requires=process_requirements("requirements.txt"), 37 | ) 38 | -------------------------------------------------------------------------------- /rutransform/constraints/constraint.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from abc import abstractmethod 3 | from spacy.language import Language 4 | 5 | 6 | class Constraint: 7 | """ 8 | Base class for transformation constraints 9 | 10 | Attributes 11 | ---------- 12 | name: str 13 | name of the constraint 14 | 15 | Methods 16 | ------- 17 | @abstractmethod 18 | patterns(text, spacy_model) 19 | Creates spacy.Matcher patterns to extract stopwords 20 | """ 21 | 22 | def __init__(self, name: str) -> None: 23 | """ 24 | Parameters 25 | ---------- 26 | name: str 27 | name of the constraint 28 | """ 29 | self.name = name 30 | 31 | @abstractmethod 32 | def patterns( 33 | self, text: Optional[dict], spacy_model: Optional[Language] 34 | ) -> List[List[dict]]: 35 | """ 36 | Creates spacy.Matcher patterns to extract stopwords 37 | 38 | Parameters 39 | ---------- 40 | text: dict 41 | dataset object in dict form 42 | spacy_model: spacy.language.Language 43 | spacy model to be used for morphological analysis 44 | 45 | Returns 46 | ------- 47 | List[List[dict]] 48 | list of spacy.Matcher patterns that match the constraint 49 | """ 50 | raise NotImplementedError 51 |
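# ------------------------------------------------------------------------------
# Editor's example (illustrative sketch only, not part of constraint.py or the
# released package): how the abstract Constraint above is meant to be subclassed.
# The class name ProtectNumbers, the sample sentence, and the Matcher usage are
# assumptions made for illustration; only Constraint itself and the spacy.Matcher
# pattern format come from the code above.
from typing import List, Optional

import spacy
from spacy.language import Language
from spacy.matcher import Matcher

from rutransform.constraints import Constraint


class ProtectNumbers(Constraint):
    """Protects number-like tokens from being perturbed."""

    def __init__(self) -> None:
        super().__init__(name="protect_numbers")

    def patterns(
        self, text: Optional[dict] = None, spacy_model: Optional[Language] = None
    ) -> List[List[dict]]:
        # a single Matcher pattern: any token spaCy marks as number-like
        return [[{"LIKE_NUM": True}]]


# The returned patterns plug directly into a spacy Matcher; the base-class
# docstring suggests this is how the framework collects the stop words that a
# transformation must leave untouched:
nlp = spacy.load("ru_core_news_sm")
matcher = Matcher(nlp.vocab)
matcher.add("protect_numbers", ProtectNumbers().patterns())
doc = nlp("Фильм вышел в 1967 году")
stop_words = [doc[start:end].text for _, start, end in matcher(doc)]  # -> ['1967']
# ------------------------------------------------------------------------------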
-------------------------------------------------------------------------------- /test_data/winograd.json: -------------------------------------------------------------------------------- 1 | {"text":"\"А для госрегистрации понадобится только декларация о постройке, которую вы заполняете самостоятельно (бланк выдадут в местной администрации, также его можно скачать на сайте Росрегистрации)","answer":"постройке","label":0,"options":["госрегистрации","декларация","постройке","администрации","Росрегистрации"],"reference":"которая","homonymia_type":1.2} 2 | {"text":"На втором месте оказалась 16-летняя алина из румынии, которая получила 4 тысячи долларов","answer":"румынии","label":0,"options":["алина","румынии","тысячи"],"reference":"которая","homonymia_type":1.1} 3 | {"text":"\"Чего стоила, например, мёртвенно-бледная Морена - Хранительница Смерти, которую доставили в зал на золотых носилках скелеты, щёлкая челюстями в такт каждого шага","answer":"Хранительница","label":0,"options":["Морена","Хранительница","Смерти","челюстями"],"reference":"которая","homonymia_type":1.4} 4 | {"text":"\"Маша, водя пальчиком по его лицу, стала говорить об одушевленности пола, о личности, которая выражает себя прикосновением…","answer":"личности","label":1,"options":["Маша","одушевленности","личности"],"reference":"которая","homonymia_type":1.2} 5 | {"text":"\"Мифологии, скреплявшей нацию, нужен был набор идеологически выдержанных легенд, а не правда о войне, которая могла подорвать самые основы брежневской Системы","answer":"правда","label":1,"options":["Мифологии","нацию","легенд","правда","войне","основы","Системы"],"reference":"которая","homonymia_type":1.2} 6 | -------------------------------------------------------------------------------- /rutransform/transformations/README.md: -------------------------------------------------------------------------------- 1 | # Transformations 2 | 3 | ## Word-Level Transformations 4 | 5 | Type | Transformation | Example | 6 | :-------|:------------------------|:----------------------------------------------------------------| 7 | Spelling|`ButterFingers` | This is a se**m**tence **r**o test t**j**e code | 8 | Spelling|`Case` | This is a sentence to tes**T** t**H**e c**OD**e | 9 | Modality|`Emojify` | This is a sentence to test the 👩‍💻 | 10 | Context |`BAE``INSERT` | This is a sentence to test the **given** code | 11 | Context |`BAE``REPLACE`| This is a sentence to check the code | 12 | 13 | ## Sentence-Level Transformations 14 | 15 | Type |Transformation | Example | 16 | :----------|:-----------------------|:-------------------------------------------------------------------| 17 | Random |`EDA``DELETE`| This **_** a sentence to test the code | 18 | Random |`EDA``SWAP` | **code** is a sentence to test the **This** | 19 | Paraphrasis|`Paraphraser` | **I am just testing the code** | 20 | Paraphrasis|`BackTranslation` | **This sentence tests the code** | 21 | Distraction|`AddSent` | This is a sentence to test the code **, if you want to delete it** | 22 | -------------------------------------------------------------------------------- /test_data/worldtree.json: -------------------------------------------------------------------------------- 1 | {"question":"Когда мороженое не кладут в морозильную камеру, мороженое превращается из ___. 
(A) твердого тела в газ (B) газа в жидкость (C) твердого тела в жидкость (D) жидкости в газ","answer":"C","exam_name":"Virginia Standards of Learning - Science","school_grade":5,"knowledge_type":"CAUSAL,EXAMPLE"} 2 | {"question":"За сколько времени Земля совершит семь оборотов вокруг своей оси? (A) один день (B) одна неделя (C) один месяц (D) один год","answer":"B","exam_name":"NYSEDREGENTS","school_grade":4,"knowledge_type":"MODEL,QUANT"} 3 | {"question":"Студент толкает красную игрушечную машинку по деревянному полу. Затем студент толкает ту же красную игрушечную машинку по цементному полу. Какой вопрос студент, скорее всего, исследует? (A) С какой скоростью обычно катится игрушечная машинка? (B) Как лучше всего заставить игрушечный автомобиль катиться? (C) Какая поверхность позволяет игрушечной машинке катиться дальше? (D) Влияет ли цвет на расстояние, на которое катится игрушечная машинка?","answer":"C","exam_name":"Alaska Dept. of Education & Early Development","school_grade":4,"knowledge_type":"MODEL"} 4 | {"question":"Животные используют ресурсы окружающей среды, чтобы выжить. Какие из следующих ресурсов животные используют для получения энергии? (A) воздух (B) еда (C) убежище (D) вода","answer":"B","exam_name":"Maryland School Assessment - Science","school_grade":4,"knowledge_type":"PROCESS"} 5 | {"question":"Чем похожи испарение и конденсация? (A) Оба вызывают снижение температуры воздуха. (B) Оба вызывают повышение температуры воздуха. (C) Оба вызваны потеплением атмосферы. (D) Оба вызваны изменениями тепловой энергии.","answer":"D","exam_name":"North Carolina READY End-of-Grade Assessment","school_grade":5,"knowledge_type":"CAUSAL"} 6 | -------------------------------------------------------------------------------- /test_data/chegeka.json: -------------------------------------------------------------------------------- 1 | {"question_id":27039,"question":"Именно В ЭТОМ ГОДУ родилась ровно половина всех людей, побывавших на Луне.","answer":"1930","topic":"Год лошади","author":"Кирилл Богловский","tour_name":"IX Кубок Европы по интеллектуальным играм среди студентов (Петродворец). Своя игра","tour_link":"https:\/\/db.chgk.info\/tour\/eu13stsv"} 2 | {"question_id":20030,"question":"Во время соревнований по стрельбе пошел сильный дождь. Был сооружен специальный навес, под который пускали, впрочем, не всех спортсменов, а только ИХ.","answer":"Шведы","topic":"Олимпийские игры 1912 года в Стокгольме","author":"Евгений Поникаров","tour_name":"I Чемпионат России по \"Своей игре\" (Великие Луки)","tour_link":"https:\/\/db.chgk.info\/tour\/russv07"} 3 | {"question_id":26414,"question":"Роберт Смит в 1872 году в книге \"Начало химической климатологии\" впервые употребил ЭТОТ ТЕРМИН.","answer":"Кислотный дождь","topic":"Осадки","author":"Евгений Шляхов, Владимир Островский, Александра Косолапова, Дмитрий Стрильчук","tour_name":"Молодежный чемпионат Украины по своей игре - 2013","tour_link":"https:\/\/db.chgk.info\/tour\/uksv13st"} 4 | {"question_id":11121,"question":"\"Энфилд Роуд\".","answer":"Ливерпуль","topic":"СТАДИОНЫ (называется стадион, нужно назвать город, в котором он расположен)","author":"Борис Шойхет","tour_name":"Пятый Международный летний лагерь немецких знатоков \"Хоббах - 2003\". 
Спортивное Jeopardy.","tour_link":"https:\/\/db.chgk.info\/tour\/hobb03je"} 5 | {"question_id":10970,"question":"Цитата из ЭТОГО произведения: \"Газет он не читал, в театр не ходил, а жена сбежала от него с тенором оперы Зимина в 1913 году, оставив ему записку такого содержания: \"Невыносимую дрожь отвращения возбуждают во мне твои лягушки. Я всю жизнь буду несчастна из-за них\"\".","answer":"\"Роковые яйца\" (профессор Персиков) Булгакова","topic":"ПЕРСИКИ","author":"Борис Шойхет","tour_name":"Фестиваль \"Кронах-2003\". Jeopardy.","tour_link":"https:\/\/db.chgk.info\/tour\/kronah03"} 6 | -------------------------------------------------------------------------------- /rutransform/constraints/constraints/named_entities.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from spacy.language import Language 3 | 4 | from rutransform.constraints import Constraint 5 | 6 | 7 | class NamedEntities(Constraint): 8 | """ 9 | Named entities constraint 10 | 11 | Matches all the named entities in text 12 | 13 | Attributes 14 | ---------- 15 | name: str 16 | name of the constraint (is always 'named_entities') 17 | entity_types: List[str], optional 18 | list of named entity types to include (default is None) 19 | matches all types if not provided 20 | 21 | Methods 22 | ------- 23 | patterns(text, spacy_model) 24 | Creates spacy.Matcher patterns to extract stopwords 25 | """ 26 | 27 | def __init__(self, entity_types: Optional[List[str]] = None) -> None: 28 | """ 29 | Parameters 30 | ---------- 31 | entity_types: List[str], optional 32 | list of named entity types to include (default is None) 33 | matches all types if not provided 34 | """ 35 | super().__init__(name="named_entities") 36 | self.entity_types = entity_types 37 | 38 | def patterns( 39 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None 40 | ) -> List[List[dict]]: 41 | """ 42 | Creates spacy.Matcher patterns to extract stopwords 43 | 44 | Parameters 45 | ---------- 46 | text: dict 47 | ! exists for compatability, always ignored ! 48 | dataset object in dict form (default is None) 49 | spacy_model: spacy.language.Language 50 | ! exists for compatability, always ignored ! 51 | spacy model to be uses for morphological analysis (default is None) 52 | 53 | Returns 54 | ------- 55 | List[List[dict]] 56 | list of spacy.Matcher patterns matching named entities 57 | """ 58 | if self.entity_types is None: 59 | patterns = [[{"ENT_TYPE": "", "OP": "!"}]] 60 | else: 61 | patterns = [[{"ENT_TYPE": {"IN": self.entity_types}, "OP": "!"}]] 62 | return patterns 63 | -------------------------------------------------------------------------------- /rutransform/transformations/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union, NamedTuple 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from datasets import Dataset 6 | from spacy.language import Language 7 | from rutransform.utils.args import TransformArguments 8 | 9 | 10 | class TransformResult(NamedTuple): 11 | transformed_dataset: Union[pd.DataFrame, Dataset] 12 | scores: np.array 13 | score: float 14 | std: float 15 | 16 | 17 | class SentenceOperation(object): 18 | """ 19 | Generic operation class. 
20 | 21 | Attributes 22 | ---------- 23 | args: TransformArguments 24 | parameters of the transformation 25 | seed: int 26 | seed to freeze everything (default is 42) 27 | max_outputs: int 28 | maximum number of the transformed sentences (default is 1) 29 | device: str 30 | the device used during transformation (default is 'cpu') 31 | spacy_model: spacy.language.Language 32 | spacy model used for tokenization 33 | 34 | Methods 35 | ------- 36 | generate(sentence, stop_words, prob) 37 | Transforms the sentence 38 | """ 39 | 40 | def __init__( 41 | self, 42 | args: TransformArguments, 43 | seed: int = 42, 44 | max_outputs: int = 1, 45 | device: str = "cpu", 46 | spacy_model: Optional[Language] = None, 47 | ) -> None: 48 | """ 49 | Parameters 50 | ---------- 51 | args: TransformArguments 52 | parameters of the transformation 53 | seed: int 54 | seed to freeze everything (default is 42) 55 | max_outputs: int 56 | maximum number of the transformed sentences (default is 1) 57 | device: str 58 | the device used during transformation (default is 'cpu') 59 | spacy_model: spacy.language.Language 60 | spacy model used for tokenization 61 | """ 62 | self.args = args 63 | self.seed = seed 64 | self.max_outputs = max_outputs 65 | self.device = device 66 | self.spacy_model = spacy_model 67 | 68 | def generate( 69 | self, 70 | sentence: str, 71 | stop_words: Optional[List[Union[int, str]]] = None, 72 | prob: Optional[float] = None, 73 | ) -> List[str]: 74 | """ 75 | Transforms the sentence 76 | 77 | Parameters 78 | ---------- 79 | sentence: str 80 | sentence to transform 81 | stop_words: List[Union[int, str]], optional 82 | stop_words to ignore during transformation (default is None) 83 | prob: float, optional 84 | probability of the transformation (default is None) 85 | 86 | Returns 87 | ------- 88 | list 89 | list of transformed sentences 90 | """ 91 | raise NotImplementedError 92 | -------------------------------------------------------------------------------- /rutransform/constraints/constraints/multihop.py: -------------------------------------------------------------------------------- 1 | from spacy.language import Language 2 | 3 | from typing import Optional, List, Dict, Any 4 | 5 | from rutransform.constraints import Constraint 6 | 7 | 8 | class Multihop(Constraint): 9 | """ 10 | Constraints for multihop QA tasks 11 | 12 | Matches all the bridge and main answers important 13 | for hops 14 | 15 | Attributes 16 | ---------- 17 | name: str 18 | name of the constraint (is always 'multihop') 19 | bridge_answers_col: str 20 | name of the column containing bridge answers 21 | main_answers_col: str 22 | name of the column containing main question answers 23 | 24 | Methods 25 | ------- 26 | extract_words(answers) 27 | Parses answer dictionary and extracts all tokens 28 | patterns(text, spacy_model) 29 | Creates spacy.Matcher patterns to extract stopwords 30 | """ 31 | 32 | def __init__(self, bridge_answers_col: str, main_answers_col: str) -> None: 33 | """ 34 | Parameters 35 | ---------- 36 | bridge_answers_col: str 37 | name of the column containing bridge answers 38 | main_answers_col: str 39 | name of the column containing main question answers 40 | """ 41 | super().__init__(name="multihop") 42 | self.bridge_answers_col = bridge_answers_col 43 | self.main_answers_col = main_answers_col 44 | 45 | def extract_words(self, answers: Dict[str, Any]) -> List[str]: 46 | """ 47 | Parses answer dictionary and extracts all tokens 48 | 49 | Parameters 50 | ---------- 51 | answers: Dict[str, Any] 52 | answers dictionary 53 | 54 |
Returns 55 | ------- 56 | List[str] 57 | list of tokens in the answer 58 | """ 59 | stop_words = [] 60 | for answer in answers: 61 | stop_words.extend(answer["segment"].split()) 62 | return stop_words 63 | 64 | def patterns( 65 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None 66 | ) -> List[List[dict]]: 67 | """ 68 | Creates spacy.Matcher patterns to extract stopwords 69 | 70 | Parameters 71 | ---------- 72 | text: dict 73 | dataset object in dict form (default is None) 74 | spacy_model: spacy.language.Language 75 | spacy model to be used for morphological analysis (default is None) 76 | 77 | Returns 78 | ------- 79 | List[List[dict]] 80 | list of spacy.Matcher patterns matching entities important for hops 81 | """ 82 | stop_words = self.extract_words(text[self.bridge_answers_col]) 83 | stop_words += self.extract_words(text[self.main_answers_col]) 84 | 85 | stop_words = list(set(stop_words)) 86 | patterns = [[{"TEXT": {"IN": stop_words}}]] 87 | 88 | return patterns 89 | -------------------------------------------------------------------------------- /rutransform/constraints/constraints/jeopardy.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from spacy.language import Language 3 | 4 | from rutransform.constraints import Constraint 5 | 6 | 7 | class Jeopardy(Constraint): 8 | """ 9 | Jeopardy type constraints, including: 10 | - Noun Phrases such as THIS FILM, THIS ACTOR, both UPPER and lower cased 11 | - 'X' 12 | - «Named Entity in quotation marks» 13 | 14 | Attributes 15 | ---------- 16 | name: str 17 | name of the constraint (is always 'jeopardy') 18 | lemmas: List[str], optional 19 | lemmas to include in the patterns (default is None) 20 | used to define the list of DET that can be used in 21 | jeopardy questions (e.g. if we want to include 'this' but not 'that') 22 | 23 | Methods 24 | ------- 25 | patterns(text, spacy_model) 26 | Creates spacy.Matcher patterns to extract stopwords 27 | """ 28 | 29 | def __init__(self, lemmas: Optional[List[str]] = None) -> None: 30 | """ 31 | Parameters 32 | ---------- 33 | lemmas: List[str] 34 | lemmas to include in the patterns (default is None) 35 | used to define the list of DET that can be used in 36 | jeopardy questions (e.g. if we want to include 'this' but not 'that') 37 | """ 38 | super().__init__(name="jeopardy") 39 | self.lemmas = lemmas 40 | 41 | def patterns( 42 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None 43 | ) -> List[List[dict]]: 44 | """ 45 | Creates spacy.Matcher patterns to extract stopwords 46 | 47 | Parameters 48 | ---------- 49 | text: dict 50 | ! exists for compatibility, always ignored ! 51 | dataset object in dict form (default is None) 52 | spacy_model: spacy.language.Language 53 | ! exists for compatibility, always ignored !
54 | spacy model to be used for morphological analysis (default is None) 55 | 56 | Returns 57 | ------- 58 | List[List[dict]] 59 | list of spacy.Matcher patterns matching jeopardy questions 60 | """ 61 | 62 | patterns = [ 63 | [ 64 | { 65 | "IS_UPPER": True, 66 | "OP": "+", 67 | "POS": {"IN": ["NOUN", "PROPN", "DET", "PRON"]}, 68 | } 69 | ], 70 | [ 71 | {"IS_UPPER": True, "POS": {"NOT_IN": ["ADP"]}}, 72 | {"POS": "ADJ", "OP": "*"}, 73 | {"POS": "NOUN", "OP": "+"}, 74 | ], 75 | [ 76 | {"TEXT": "«"}, 77 | {"IS_TITLE": True}, 78 | {"TEXT": {"REGEX": "\w|\d|['?!.]"}, "OP": "*"}, 79 | {"TEXT": "»"}, 80 | ], 81 | ] 82 | 83 | if self.lemmas is None: 84 | self.lemmas = [ 85 | "его", 86 | "ему", 87 | "её", 88 | "икс", 89 | "ими", 90 | "их", 91 | "него", 92 | "ней", 93 | "неё", 94 | "ним", 95 | "них", 96 | "нём", 97 | "он", 98 | "она", 99 | "они", 100 | "оно", 101 | "такой", 102 | "это", 103 | "этот", 104 | ] 105 | patterns.append( 106 | [ 107 | {"LEMMA": {"IN": self.lemmas}}, 108 | {"POS": "ADJ", "OP": "*"}, 109 | {"POS": "NOUN", "OP": "+"}, 110 | ] 111 | ) 112 | 113 | return patterns 114 | -------------------------------------------------------------------------------- /rutransform/constraints/constraints/referents.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from spacy.language import Language 3 | 4 | from rutransform.constraints import Constraint 5 | from rutransform.constraints.utils import parse_reference 6 | 7 | 8 | class Referents(Constraint): 9 | """ 10 | Constraints for coreference resolution tasks 11 | 12 | Matches 13 | - the anaphoric pronoun 14 | - all possible antecedents 15 | - all verbs referring to antecedents and anaphor 16 | 17 | Attributes 18 | ---------- 19 | name: str 20 | name of the constraint (is always 'referents') 21 | reference_col_name: str, optional 22 | name of the column containing anaphor 23 | defaults to 'reference' if not provided 24 | candidates_col_name: str, optional 25 | name of the column containing possible antecedents 26 | defaults to 'Options' or 'options' if not provided 27 | 28 | Methods 29 | ------- 30 | patterns(text, spacy_model) 31 | Creates spacy.Matcher patterns to extract stopwords 32 | """ 33 | 34 | def __init__( 35 | self, 36 | reference_col_name: Optional[str] = None, 37 | candidates_col_name: Optional[str] = None, 38 | ) -> None: 39 | """ 40 | Parameters 41 | ---------- 42 | reference_col_name: str, optional 43 | name of the column containing anaphor 44 | defaults to 'reference' if not provided 45 | candidates_col_name: str, optional 46 | name of the column containing possible antecedents 47 | defaults to 'options' if not provided 48 | """ 49 | super().__init__(name="referents") 50 | self.reference_col_name = reference_col_name 51 | self.candidates_col_name = candidates_col_name 52 | 53 | def patterns( 54 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None 55 | ) -> List[List[dict]]: 56 | """ 57 | Creates spacy.Matcher patterns to extract stopwords 58 | 59 | Parameters 60 | ---------- 61 | text: dict 62 | dataset object in dict form (default is None) 63 | spacy_model: spacy.language.Language 64 | spacy model to be used for morphological analysis (default is None) 65 | 66 | Returns 67 | ------- 68 | List[List[dict]] 69 | list of spacy.Matcher patterns matching antecedents, anaphors and corresponding verbs 70 | """ 71 | if not self.reference_col_name: 72 | if "reference" in text: 73 | self.reference_col_name = "reference" 74 | else: 75 | raise
ValueError( 76 | "Column 'reference' not found in pd.DataFrame columns. " 77 | + "Rename the text column or provide 'reference_col_name' argument." 78 | ) 79 | if not self.candidates_col_name: 80 | if "options" in text: 81 | self.candidates_col_name = "options" 82 | else: 83 | raise ValueError( 84 | "Column 'options' not found in pd.DataFrame columns. " 85 | + "Rename the text column or provide 'candidates_col_name' argument." 86 | ) 87 | 88 | options = ( 89 | eval(text[self.candidates_col_name]) 90 | if type(text[self.candidates_col_name]) is str 91 | else text[self.candidates_col_name] 92 | ) 93 | morph = parse_reference(text[self.reference_col_name], spacy_model) 94 | referents = [morph.get("number")] + [morph.get("gender")] 95 | referents = [referent for referent in referents if referent] 96 | patterns = [ 97 | [{"TEXT": {"IN": options + text[self.reference_col_name].strip().split()}}], 98 | [{"POS": "VERB", "MORPH": {"IS_SUPERSET": referents}}], 99 | ] 100 | return patterns 101 | -------------------------------------------------------------------------------- /rutransform/utils/args.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from dataclasses import field, dataclass 4 | from typing import Optional, List 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | @dataclass 10 | class TransformArguments: 11 | """ 12 | Arguments pertaining to transformations we are going to apply to data 13 | """ 14 | 15 | transformation: Optional[str] = field( 16 | default=None, metadata={"help": "transformation to use for data augmentation"} 17 | ) 18 | max_outputs: int = field( 19 | default=1, metadata={"help": "maximum number of the transformed sentences"} 20 | ) 21 | probability: float = field( 22 | default=0.1, metadata={"help": "probability of the transformation"} 23 | ) 24 | same_prob: bool = field( 25 | default=True, 26 | metadata={ 27 | "help": "whether to use the same probability for EDA deletion and swap" 28 | }, 29 | ) 30 | del_prob: float = field( 31 | default=0.05, 32 | metadata={ 33 | "help": "probability of the word deletion for EDA transformation " 34 | "Needs to be specified if same_prob parameter is True" 35 | }, 36 | ) 37 | similarity_threshold: float = field( 38 | default=0.8, metadata={"help": "BERTScore value to filter out candidates"} 39 | ) 40 | bae_model: str = field( 41 | default="bert-base-multilingual-cased", 42 | metadata={"help": "BERT model for BAE attack"}, 43 | ) 44 | segment_length: int = field( 45 | default=3, metadata={"help": "minimal segment length for BackTranslationNE"} 46 | ) 47 | bin_p: float = field( 48 | default=1.0, 49 | metadata={ 50 | "help": "parameter of the binomial distribution for BackTranslationNE" 51 | }, 52 | ) 53 | generator: str = field( 54 | default="gpt3", 55 | metadata={ 56 | "help": "generator model: 'gpt2' = sberbank-ai/rugpt2large, " 57 | "'gpt3' = sberbank-ai/rugpt3small_based_on_gpt2, " 58 | "'mt5-small' = google/mt5-small, 'mt5-base' = google/mt5-base, " 59 | "'mt5-large' = google/mt5-large" 60 | }, 61 | ) 62 | prompt_text: str = field( 63 | default=" Парафраза:", metadata={"help": "prompt for text generation"} 64 | ) 65 | prompt: bool = field( 66 | default=False, metadata={"help": "whether to use a prompt for generation"} 67 | ) 68 | num_beams: Optional[int] = field( 69 | default=None, metadata={"help": "number of beams for beam search"} 70 | ) 71 | early_stopping: bool = field( 72 | default=False, 73 | metadata={"help": "whether to stop when beam hypotheses reached the 
EOS token"}, 74 | ) 75 | no_repeat_ngram_size: Optional[int] = field( 76 | default=None, metadata={"help": "n-gram penalty for beam search generation"} 77 | ) 78 | do_sample: bool = field(default=False, metadata={"help": "whether to do sampling"}) 79 | temperature: Optional[float] = field( 80 | default=None, metadata={"help": "temperature for text generation"} 81 | ) 82 | top_k: Optional[int] = field( 83 | default=None, metadata={"help": "top-k sampling parameter for text generation"} 84 | ) 85 | top_p: Optional[float] = field( 86 | default=None, metadata={"help": "top-p sampling parameter for text generation"} 87 | ) 88 | repetition_penalty: Optional[float] = field( 89 | default=None, 90 | metadata={"help": "repetition penalty parameter for text generation"}, 91 | ) 92 | threshold: Optional[float] = field( 93 | default=None, metadata={"help": "threshold parameter to filter candidates"} 94 | ) 95 | max_length: int = field( 96 | default=50, metadata={"help": "maximum length of the generated text"} 97 | ) 98 | 99 | def __post_init__(self): 100 | if self.transformation is None: 101 | logger.warning("No transformation was passed.") 102 | -------------------------------------------------------------------------------- /test_data/danet_qa.json: -------------------------------------------------------------------------------- 1 | {"question":"Был ли у обломова сын?","passage":"В браке с Пшеницыной у Обломова родился сын, названный в честь друга Андреем. После смерти Обломова чета Штольцев взяла его сына на воспитание. Завершает всё тихая сцена в загородном доме Штольцев, вызывающая сомнение в их взаимном счастье, и маленький Андрюша, бегущий по полю на встречу с матерью. Олег Табаков — Илья Ильич Обломов Юрий Богатырёв — Андрей Иванович Штольц Андрей Попов — Захар Трофимович, слуга Обломова Елена Соловей — Ольга Сергеевна Ильинская Авангард Леонтьев — Иван Алексеевич Алексеев Андрей Разумовский — Илюша Обломов Олег Козлов — Андрюша Штольц Елена Клещевская — Катя, горничная Ольги Галина Шостко — тётка Ольги Глеб Стриженов — барон фон Лангваген Евгений Стеблов — отец Обломова Евгения Глушенко — мать Обломова Николай Пастухов — Иван Богданович, отец Штольца В эпизодах В фильме использована музыка Беллини, Сергея Рахманинова. Фильм снимался в деревне Сенькино Серпуховского района, Московской области.","label":true,"idx":680} 2 | {"question":"Должен ли цвет чехла соответствовать цвету церковного праздника?","passage":"Начиная с XV—XVI веков престолы делают либо в виде каменных монолитов, либо из дерева, в виде рамы с крышкой сверху, которую покрывают снаружи облачением из ткани. Облачение представляет собой чехол из дорогой парчевой ткани, надеваемый на престол. Весьма часто, но необязательно цвет облачения престола соответствует богослужебному цвету церковного праздника. Облачения могут быть не только из ткани, например парчи, но и представлять собой богато украшенный золотом и камнями металлический, даже золотой, футляр для престола. Престол храма Святой Софии в Константинополе был сделан из золота и драгоценных камней.","label":false,"idx":1643} 3 | {"question":"Состоит ли албания в евросоюзе?","passage":"Вступление Албании в Европейский союз — процедура принятия Албании в Европейский союз. В 2000 году Албания была официально признана в качестве «потенциальной страны-кандидата» на вступление в Европейский союз, в 2003 году стороны начали переговоры по подписанию Соглашения о стабилизации и ассоциации. 
Соглашение было подписано 12 июня 2006 года, тем самым завершив первый важный шаг к полноправному членству Албании в ЕС. 1 апреля 2009 года Албания вступила в военный альянс НАТО, а 28 апреля 2009 года Албания подала заявку на вступление в Европейский союз. 8 ноября 2010 года Совет Европейского союза одобрил введение безвизового режима для граждан Албании желающих посетить страны Европейского союза с туристической целью, и 10 декабря 2010 года это решение официально вступило в силу.","label":false,"idx":729} 4 | {"question":"Был ли автомобиль принцессы дианы в дтп?","passage":"Несмотря на продолжительные реанимационные попытки, включая внутренний массаж сердца, она умерла в 4:00 утра, поскольку данные ранения оказались несовместимыми с жизнью. Позднее в то же утро, Жан-Пьер Шевенман , французский премьер-министр Лионель Жоспен, Бернадетт Ширак , и Бернар Кушнер , посетили палату, где лежало тело Дианы и отдали последнюю дань уважения. После их визитов, Англиканский Архидиакон Франции, отец Мартин Дрейпер, прочитал благодарственные молитвы из Книги Общей Молитвы. Около 2:00 ночи, бывший муж Дианы, Чарльз, принц Уэльский, и две её старшие сестры, Сара Маккоркодейл и Джейн Феллоуз, прибыли в Париж; они оставили её тело спустя 90 минут. Первоначальные сообщения в СМИ заявили, что автомобиль Дианы столкнулся с столбом со скоростью 190 км\/ч , и что стрелка спидометра застряла в этом положении.","label":true,"idx":7} 5 | {"question":"Обязательно ли содержание послания, которое несет искусство аудитории, является научным?","passage":"Средство коммуникации. В своей простейшей форме искусство представляет собой средство коммуникации. Как и большинство прочих способов коммуникации, оно несет в себе намерение передать информацию аудитории. Например, научная иллюстрация — тоже форма искусства, существующая для передачи информации. Ещё один пример такого рода — географические карты. Однако содержание послания не обязательно бывает научным. Искусство позволяет передавать не только объективную информацию, но и эмоции, настроение, чувства.","label":false,"idx":1252} 6 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/bae/bae.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | from spacy.language import Language 3 | 4 | from textattack.augmentation import Augmenter as TAAugmenter 5 | from textattack.transformations import WordSwapMaskedLM 6 | from textattack.constraints.pre_transformation.stopword_modification import ( 7 | StopwordModification, 8 | ) 9 | 10 | from rutransform.utils.args import TransformArguments 11 | from rutransform.transformations.utils import SentenceOperation 12 | 13 | 14 | class BAE(SentenceOperation): 15 | """ 16 | BERT masked language model transformation attack from 17 | "BAE: BERT-based Adversarial Examples for Text Classification" 18 | (Garg & Ramakrishnan, 2019). 19 | 20 | Attributes 21 | ---------- 22 | args: TransformArguments 23 | parameters of the transformation 24 | seed: int 25 | seed to freeze everything (default is 42) 26 | max_outputs: int 27 | maximum number of the transfromed sentences (default is 1) 28 | device: str 29 | the device used during transformation (default is 'cpu') 30 | spacy_model: spacy.language.Language 31 | ! exists for compatability, always ignored ! 
32 | spacy model used for tokenization 33 | 34 | Methods 35 | ------- 36 | generate(sentence, stop_words, prob) 37 | Transforms the sentence 38 | """ 39 | 40 | def __init__( 41 | self, 42 | args: TransformArguments, 43 | seed: int = 42, 44 | max_outputs: int = 1, 45 | device: str = "cpu", 46 | spacy_model: Optional[Language] = None, 47 | ) -> None: 48 | """ 49 | Parameters 50 | ---------- 51 | args: TransformArguments 52 | parameters of the transformation 53 | seed: int 54 | seed to freeze everything (default is 42) 55 | max_outputs: int 56 | maximum number of the transformed sentences (default is 1) 57 | device: str 58 | the device used during transformation (default is 'cpu') 59 | spacy_model: spacy.language.Language 60 | spacy model used for tokenization 61 | """ 62 | super().__init__( 63 | args=args, 64 | seed=seed, 65 | max_outputs=max_outputs, 66 | device=device, 67 | spacy_model=spacy_model, 68 | ) 69 | 70 | self.transformation = WordSwapMaskedLM( 71 | method="bae", 72 | masked_language_model=self.args.bae_model, 73 | tokenizer=self.args.bae_model, 74 | ) 75 | 76 | def generate( 77 | self, 78 | sentence: str, 79 | stop_words: Optional[List[Union[int, str]]] = None, 80 | prob: Optional[float] = None, 81 | ) -> List[str]: 82 | """ 83 | Transforms the sentence 84 | 85 | Parameters 86 | ---------- 87 | sentence: str 88 | sentence to transform 89 | stop_words: List[Union[int, str]], optional 90 | stop_words to ignore during transformation (default is None) 91 | prob: float, optional 92 | ! exists for compatibility, always ignored ! 93 | probability of the transformation (default is None) 94 | 95 | Returns 96 | ------- 97 | list 98 | list of transformed sentences 99 | """ 100 | if stop_words is not None: 101 | constraints = [StopwordModification(stop_words)] 102 | 103 | augmenter = TAAugmenter( 104 | transformation=self.transformation, 105 | transformations_per_example=self.max_outputs, 106 | constraints=constraints, 107 | ) 108 | else: 109 | augmenter = TAAugmenter( 110 | transformation=self.transformation, 111 | transformations_per_example=self.max_outputs, 112 | ) 113 | 114 | perturbed = augmenter.augment(sentence) 115 | 116 | return perturbed 117 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/paraphrasis/paraphraser.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | from typing import List, Optional, Union 4 | from spacy.language import Language 5 | 6 | from russian_paraphrasers import GPTParaphraser, Mt5Paraphraser 7 | 8 | from rutransform.utils.args import TransformArguments 9 | from rutransform.transformations.utils import SentenceOperation 10 | 11 | transformers.logging.set_verbosity_error() 12 | 13 | 14 | class Paraphraser(SentenceOperation): 15 | """ 16 | Generates diverse linguistic variations of the contexts 17 | through paraphrasis using a ruGPT or ruMT5 model 18 | 19 | Attributes 20 | ---------- 21 | args: TransformArguments 22 | parameters of the transformation 23 | seed: int 24 | seed to freeze everything (default is 42) 25 | max_outputs: int 26 | maximum number of the transformed sentences (default is 1) 27 | device: str 28 | the device used during transformation (default is 'cpu') 29 | spacy_model: spacy.language.Language 30 | spacy model used for tokenization 31 | 32 | Methods 33 | ------- 34 | generate(sentence, stop_words, prob) 35 | Transforms the sentence 36 | """ 37 | 38 | def __init__( 39 | self, 40 | args:
TransformArguments, 41 | seed: int = 42, 42 | max_outputs: int = 1, 43 | device: str = "cpu", 44 | spacy_model: Optional[Language] = None, 45 | ) -> None: 46 | """ 47 | Parameters 48 | ---------- 49 | args: TransformArguments 50 | parameters of the transformation 51 | seed: int 52 | seed to freeze everything (default is 42) 53 | max_outputs: int 54 | maximum number of the transfromed sentences (default is 1) 55 | device: str 56 | the device used during transformation (default is 'cpu') 57 | spacy_model: spacy.language.Language 58 | ! exists for compatability, always ignored ! 59 | spacy model used for tokenization 60 | """ 61 | super().__init__( 62 | args=args, 63 | seed=seed, 64 | max_outputs=max_outputs, 65 | device=device, 66 | spacy_model=spacy_model, 67 | ) 68 | 69 | if "gpt" in self.args.generator: 70 | self.paraphraser = GPTParaphraser( 71 | model_name=self.args.generator, range_cand=True, make_eval=False 72 | ) 73 | else: 74 | self.paraphraser = Mt5Paraphraser( 75 | model_name=self.args.generator, range_cand=True, make_eval=False 76 | ) 77 | 78 | def generate( 79 | self, 80 | sentence: str, 81 | stop_words: Optional[List[Union[int, str]]] = None, 82 | prob: Optional[float] = None, 83 | ) -> List[str]: 84 | """ 85 | Transforms the sentence 86 | 87 | Parameters 88 | ---------- 89 | sentence: str 90 | sentence to transform 91 | stop_words: List[int], optional 92 | ! exists for compatability, always ignored ! 93 | stop_words to ignore during transformation (default is None) 94 | prob: float, optional 95 | ! exists for compatability, always ignored ! 96 | probability of the transformation (default is None) 97 | 98 | Returns 99 | ------- 100 | list 101 | list of transformed sentences 102 | """ 103 | transformed = self.paraphraser.generate( 104 | sentence, 105 | n=self.max_outputs, 106 | temperature=self.args.temperature, 107 | top_k=self.args.top_k, 108 | top_p=self.args.top_p, 109 | max_length=self.args.max_length, 110 | repetition_penalty=self.args.repetition_penalty, 111 | threshold=self.args.threshold, 112 | ) 113 | best = transformed["results"][0]["best_candidates"] 114 | if best: 115 | return best 116 | else: 117 | return transformed["results"][0]["predictions"] 118 | -------------------------------------------------------------------------------- /test_data/sit_ethics.json: -------------------------------------------------------------------------------- 1 | {"source":"lenta","text":"Около 800 антиглобалистов заблокировали несколько улиц, ведущих к городскому порту. Видео с места событий публикует Ruptly. Полиция попыталась остановить мирную акцию протеста, однако демонстранты продолжили шествие. У парка Альтер Эльб протестующих разгоняли водометами, передает корреспондент РИА Новости. Отмечается, что на подобные меры власти пошли после того, как сквозь крики и свист четыре раза послышались звуки, похожие на выстрелы. Вечером 6 июля, по оценкам полиции, на улицах города протестовали около шести тысяч человек. В результате стычек, произошедших в ночь на 7 июля, ранения получили более 100 сотрудников правоохранительных органов. Столкновения антиглобалистов с полицией длятся уже несколько дней. 6 июля манифестанты объявили о начале акции «Адский пикник». Для обеспечения безопасности лидеров стран «Большой двадцатки» в Гамбург со всей Германии были переброшены около 20 тысяч полицейских. 
Стражи порядка запросили подкрепление, когда поняли, что недооценили угрозу.","sit_virtue":1,"sit_moral":0,"sit_law":1,"sit_justice":0,"sit_util":0,"label":[1,0,1,0,0]} 2 | {"source":"taiga_kp","text":"В Кирове спрогнозировали начало эпидемии гриппа. Данные регионального Роспотребнадзора за неделю с 9 по 15 января говорят, что в городе наблюдаются первые признаки массовой заболеваемости. К врачам за этот период обратились 4662 заболевших. Этот показатель увеличился в 1,5 раза, причем для всех возрастов. Больше всего простуду подхватили дети до 2 лет, а также малыши с 3 до 6 лет. При этом ОРВИ массово накрыла две кировские школы. В них два класса вывели на карантин. У всех обратившихся к медикам берут анализы для лабораторных исследований. Последние показали, что люди подхватывают вирус «гонконгского» гриппа. Только за неделю его обнаружили у 19 заболевших, в их число попали 12 детей. Специалисты уже сейчас советуют городским властям подготовить план профилактики для борьбы с болезнью на период эпидемии. Школам и детсадам необходимо ввести дополнительные санитарные меры.","sit_virtue":0,"sit_moral":0,"sit_law":0,"sit_justice":0,"sit_util":0,"label":[0,0,0,0,0]} 3 | {"source":"meduza","text":"В городе Санта-Клара (штат Калифорния) произошел сбой в системе пожарной сигнализации местного аэропорта, и пена для тушения пожаров выплеснулась из специального ангара на улицы, прилегающие к аэропорту. «Это выглядело, будто на пенной вечеринке что-то пошло не так», — рассказал один из свидетелей происшествия. Владельцы ангара заявили, что пена не токсична, но может вызывать аллергию при попадании на кожу. Вскоре после разлива пены нашелся смельчак, который проехал через облако пены на велосипеде. После этого полиция заблокировала улицы, заполненные пеной. Но у прохожих пенное облако вызывало восторг. «Вероятно, самая крутая вещь, которую я когда-либо видела. Противопожарная пена повсюду. Не завидую тому, кто будет все это убирать»","sit_virtue":0,"sit_moral":0,"sit_law":0,"sit_justice":0,"sit_util":0,"label":[0,0,0,0,0]} 4 | {"source":"meduza","text":"Директор проекта Tor (анонимной интернет-сети) Роджер Динглдин обвинил ФБР в заказе взлома сервиса. По его словам, атаку на Tor провел Университет Карнеги-Меллон, за что получил от ФБР как минимум миллион долларов. В университете это не подтвердили. Атака на Tor проводилась с февраля по июль 2014 года. Как заявил Динглдин, в результате злоумышленникам удалось получить данные о пользователях (в том числе и о тех, кто не нарушал закона). О том, что ФБР удалось установить личности некоторых пользователей Tor, говорится в материалах дела Silk Road 2 — магазина в анонимной сети, где можно было свободно купить наркотики. В материалах дела отмечается, что за взломом стоит некий университет, но его название не уточняется. Tor — это сервис для анонимного доступа в интернет и, как следствие, обхода блокировок. Через Tor можно зайти на любой заблокированный в России ресурс. Некоторые страны (например, Китай и Белоруссия) пытаются блокировать Tor.","sit_virtue":1,"sit_moral":0,"sit_law":0,"sit_justice":0,"sit_util":0,"label":[1,0,0,0,0]} 5 | {"source":"taiga_kp","text":"Модель Анастасия Квитко уже давно поражает пользователей соцсетей пикантными фотографиями. Девушку заметили даже мировые таблоиды. А на этой неделе интернет-звезду показали в эфире Первого канала, где она познакомилась с Анастасией Семенович, и получила приглашение сняться в клипе Александра Реввы. Однако впервые в сети появился снимок Насти без нижнего белья. 
Скандальное фото опубликовано на ее официальном сайте в разделе «Мода». На нем калининградская Ким стоит в легкой джинсовой курточке и игриво демонстрирует обнаженную пятую точку. Кстати, именно на сайте можно узнать, что в англоязычной версии «Википедии» появилась статья про нашу землячку. В одном из интервью Настя рассказала, что в США была ограблена под дулом пистолета. Любопытно, но настоящую Ким Кардашьян тоже ограбили в 2016-м году в отеле Парижа и, по словам светской львицы, на нее также направили пистолет.","sit_virtue":1,"sit_moral":0,"sit_law":1,"sit_justice":0,"sit_util":1,"label":[1,0,1,0,1]} 6 | -------------------------------------------------------------------------------- /test_data/per_ethics.json: -------------------------------------------------------------------------------- 1 | {"source":"lenta","text":"Большинство респондентов «Левада-Центра», участвовавших в опросе о протестных акциях 5 мая, узнали о случившемся от самих социологов (61 процент), передает ТАСС. В курсе событий были 39 процентов опрошенных. Тех, кто испытал злость к митингующим (3 процента), оказалось втрое больше (1 процент), позавидовавших им. Почти каждый пятый опрошенный убежден, что участникам несогласованных мероприятий заплатили. Однако тех, кто считает подобные действия обоснованными, больше — 32 процента. 20 процентов респондентов заявили, что митингующие выходили в поддержку блогера Алексея Навального. Симпатию и уважение к участникам протестов выразили 12 и 17 процентов соответственно. Исследование проводилось в последних числах мая среди 1,6 тысячи совершеннолетних респондентов из 136 населенных пунктов в 52 субъектах страны. 5 мая в различных городах России прошли протестные акции. В Москве в ней участвовали не менее 1,5 тысячи человек, 300 из них были задержаны. В Петербурге полиция насчитала 2 тысячи митингующих.","per_virtue":0,"per_moral":0,"per_law":0,"per_justice":1,"per_util":0,"label":[0,0,0,1,0]} 2 | {"source":"lenta","text":"В ЮАР обманутая девушка пришла на свадьбу своего парня в облачении невесты и сорвала церемонию. Видео поделился в своем Twitter-аккаунте пользователь @ kokhi. Ролик был снят в городе Бенони на северо-востоке страны. Женщина в фате и белом платье явилась на свадьбу и встала перед алтарем вместе с женихом и его избранницей. Мужчина шумно ругается с ней и пытается прогнать, в то время как настоящая невеста терпеливо ждет начала церемонии. За кадром слышны крики гостей: некоторые из них удивлены, а другие поддерживают устроившую скандал женщину. В подписи к посту гостью назвали любовницей жениха, которая и не подозревала, что является не единственной его девушкой. Комментаторы восхитились решимостью женщины. «Я надеюсь, что на каждого изменяющего мужчину найдется такая любовница», — написала одна из них. Пользователь @A dile__R задался вопросом, как жених собирается объяснить ситуацию настоящей невесте.","per_virtue":0,"per_moral":1,"per_law":1,"per_justice":1,"per_util":0,"label":[0,1,1,1,0]} 3 | {"source":"interfax","text":"Москва. 24 августа. INTERFAX.RU - Лучшая команда Европы по итогам прошлого года каталонская \"Барселона\" выиграла первый трофей в новом сезоне - Кубок Испании. В этом мини-турнире участвуют всего две команды - победитель национального первенства по футболу и обладатель Кубка. По итогам двух матчей \"Барселона\" выиграла у \"Атлетико\". В первом поединке каталонцы взяли верх над соперником в гостях со счетом 2:1, а во второй встрече и вовсе разгромили \"Атлетико\" - 3:0. 
Голы у \"Барселоны\" забили Месси (дважды) и Кркич.","per_virtue":1,"per_moral":1,"per_law":1,"per_justice":1,"per_util":1,"label":[1,1,1,1,1]} 4 | {"source":"gazeta","text":"Главный тренер женской сборной России по керлингу Томас Липс заявил, что отсутствие стабильного результата отечественных керлингисток связано с их недостаточной уверенностью в собственных силах. «Если команда три раза подряд добиралась до полуфинала чемпионата Европы, значит, она умеет играть в керлинг, — сказал Липс. — Значит, все, что остается, это понять, как сделать так, чтобы они играли стабильно. Я думаю, что наша команда способна играть очень хорошо, но у нее постоянно то взлет, то падение, один матч девушки проводят хорошо, другой плохо и так далее. Дело в недостаточной уверенности в себе». По мнению швейцарца, для того, чтобы заставить российских спортсменок быть увереннее себе, с ними нужно больше общаться. «Нужно просто разговаривать с девушками, уважать их, воспринимать как обычных нормальных людей, беседовать с ними, узнавать о них больше — не только о том, что касается керлинга. Я хочу знать, что они делают в свободное время, что они думают. Важно знать, как с ними общаться, ведь у нас в команде у всех разные характеры. Я не могу говорить со всеми игроками на одном и том же языке. С кем-то я должен быть более обходительным, с кем-то разговаривать более спокойно. Главное, чтобы игроки понимали меня», — заявил специалист.","per_virtue":1,"per_moral":1,"per_law":1,"per_justice":0,"per_util":0,"label":[1,1,1,0,0]} 5 | {"source":"lenta","text":"В Рио-де-Жанейро неизвестный попытался ограбить школу боевых искусств. В итоге ему пришлось спасаться бегством от спортсменов, занимавшихся там джиу-джитсу. Инцидент попал в объектив камеры видеонаблюдения. На кадрах видно, как молодой человек заходит в помещение и после короткого разговора с девушкой-администратором перепрыгивает через турникет и пробегает в помещение школы. Однако уже через несколько секунд он выбегает обратно, преследуемый шестерыми спортсменами в кимоно, у которых в это время было занятие по джиу-джитсу. Как уточняет The Daily Mail, грабитель отобрал у администратора мобильный телефон и угрожал ее убить, тогда девушка начала звать на помощь. Догнать нападавшего защитники девушки так и не смогли: он скрылся с места преступления на велосипеде вместе со своим сообщником. 
Полицейским также пока не удалось задержать грабителя, однако, по словам представителя ведомства, они нашли одну сандалию, которую тот потерял во время погони.","per_virtue":0,"per_moral":0,"per_law":0,"per_justice":0,"per_util":0,"label":[0,0,0,0,0]} 6 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/spelling/change_char_case.py: -------------------------------------------------------------------------------- 1 | import random 2 | import spacy 3 | from typing import List, Optional, Union 4 | from spacy.language import Language 5 | 6 | from rutransform.utils.args import TransformArguments 7 | from rutransform.transformations.utils import SentenceOperation 8 | 9 | """ 10 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/change_char_case 11 | """ 12 | 13 | 14 | def change_char_case( 15 | text: str, 16 | spacy_model: Language, 17 | prob: float = 0.1, 18 | seed: int = 42, 19 | max_outputs: int = 1, 20 | stop_words: List[str] = None, 21 | ) -> List[str]: 22 | """ 23 | Changes character cases randomly 24 | 25 | Parameters 26 | ---------- 27 | text: str 28 | text to transform 29 | spacy_model: spacy.language.Language 30 | spacy model used for lemmatization 31 | prob: float 32 | probabilty of the transformation (default is 0.1) 33 | seed: int 34 | seed to freeze everything (default is 42) 35 | max_outputs: int 36 | maximum number of the returned sentences (default is 1) 37 | stop_words: List[str], optional 38 | stop words to ignore during transformation (default is None) 39 | 40 | Returns 41 | ------- 42 | List[str] 43 | list of transformed sentences 44 | """ 45 | if stop_words is None: 46 | stop_words = [] 47 | 48 | random.seed(seed) 49 | results = [] 50 | split_text = [token.text for token in spacy_model(text)] 51 | for _ in range(max_outputs): 52 | result = [] 53 | for w, word in enumerate(split_text): 54 | if word in stop_words: 55 | new_word = word 56 | else: 57 | new_word = "" 58 | for c in word: 59 | if random.uniform(0, 1) < prob: 60 | if c.isupper(): 61 | new_word += c.lower() 62 | elif c.islower(): 63 | new_word += c.upper() 64 | else: 65 | new_word += c 66 | result.append(new_word) 67 | result = " ".join(result) 68 | results.append(result) 69 | return results 70 | 71 | 72 | class ChangeCharCase(SentenceOperation): 73 | """ 74 | Changes character cases randomly 75 | 76 | Attributes 77 | ---------- 78 | args: TransformArguments 79 | parameters of the transformation 80 | seed: int 81 | seed to freeze everything (default is 42) 82 | max_outputs: int 83 | maximum number of the transfromed sentences (default is 1) 84 | device: str 85 | ! exists for compatability, always ignored ! 86 | the device used during transformation (default is 'cpu') 87 | spacy_model: spacy.language.Language 88 | spacy model used for tokenization 89 | 90 | Methods 91 | ------- 92 | generate(sentence, stop_words, prob) 93 | Transforms the sentence 94 | """ 95 | 96 | def __init__( 97 | self, 98 | args: TransformArguments, 99 | seed: int = 42, 100 | max_outputs: int = 1, 101 | device: Optional[str] = None, 102 | spacy_model: Optional[Language] = None, 103 | ) -> None: 104 | """ 105 | Parameters 106 | ---------- 107 | args: TransformArguments 108 | parameters of the transformation 109 | seed: int 110 | seed to freeze everything (default is 42) 111 | max_outputs: int 112 | maximum number of the transfromed sentences (default is 1) 113 | device: str 114 | ! exists for compatability, always ignored ! 
115 | the device used during transformation (default is None) 116 | spacy_model: spacy.language.Language 117 | spacy model used for tokenization 118 | """ 119 | if spacy_model is None: 120 | spacy_model = spacy.load("ru_core_news_sm") 121 | 122 | super().__init__( 123 | args=args, 124 | seed=seed, 125 | max_outputs=max_outputs, 126 | device=device, 127 | spacy_model=spacy_model, 128 | ) 129 | 130 | def generate( 131 | self, 132 | sentence: str, 133 | stop_words: Optional[List[Union[int, str]]] = None, 134 | prob: Optional[float] = None, 135 | ) -> List[str]: 136 | """ 137 | Transforms the sentence 138 | 139 | If 'prob' argument is not None, ignores the probability provided in the arguments. 140 | 141 | Parameters 142 | ---------- 143 | sentence: str 144 | sentence to transform 145 | stop_words: List[str], optional 146 | stop_words to ignore during transformation (default is None) 147 | prob: float, optional 148 | probability of the transformation (default is None) 149 | 150 | Returns 151 | ------- 152 | list 153 | list of transformed sentences 154 | """ 155 | transformed = change_char_case( 156 | text=sentence, 157 | spacy_model=self.spacy_model, 158 | prob=(self.args.probability if not prob else prob), 159 | seed=self.seed, 160 | max_outputs=self.max_outputs, 161 | stop_words=stop_words, 162 | ) 163 | return transformed 164 | -------------------------------------------------------------------------------- /test_data/multiq.json: -------------------------------------------------------------------------------- 1 | {"support_text":"Павел Михайлович Харчик ( 5 апреля 1979, Душанбе, СССР) — туркменский и российский футболист, вратарь.","main_text":"Душанбе ( — букв. \"понедельник\") — столица Таджикистана, город республиканского значения, самый крупный научно-культурный, политический, экономический, промышленный и административный центр страны. В Душанбе расположены высшие органы государственной власти страны. Население города на 1 января 2019 года составляло 846,4 тыс. чел., он является центром Душанбинской агломерации. С 12 января 2017 года мэром города Душанбе является Рустам Эмомали.","question":"Кто является главой администрации в городе, где родился Павел Михайлович Харчик?","bridge_answers":[{"label":"passage","offset":42,"length":7,"segment":"Душанбе"}],"main_answers":[{"label":"passage","offset":430,"length":14,"segment":"Рустам Эмомали"}]} 2 | {"support_text":"Евгений Константинович Забиякин (7 (20 марта) 1909, Казань — 29 апреля 2001, Санкт-Петербург) — театральный актёр, заслуженный артист Таджикской ССР (1967).","main_text":"Казань — город в России, столица Республики Татарстан, крупный порт на левом берегу реки Волги при впадении в неё реки Казанки. Крупнейший по численности населения город в Приволжском федеральном округе и на реке Волге.\\n\\nАдминистративное устройство.\\nВ Казани расположены органы власти собственно городского муниципального образования, а также Республики Татарстан. Мэр и Городская дума. Главой города является мэр. Должность мэра в Казани является избираемой депутатами Городской думы из своего числа. С 17 ноября 2005 года эту должность занимает И. Р. Метшин. Свои сессии Дума проводит в городской Ратуше. Городская дума является нормотворческим органом города Казани, выборы в Думу проводятся раз в 5 лет. 
Председателем Городской думы является мэр, помимо него в Президиум Думы входят также два его заместителя, старший по возрасту депутат и главы семи постоянных комиссий:.","question":"Кто является главой администрации в городе, где родился Евгений Константинович Забиякин?","bridge_answers":[{"label":"passage","offset":52,"length":6,"segment":"Казань"}],"main_answers":[{"label":"passage","offset":550,"length":12,"segment":"И. Р. Метшин"}]} 3 | {"support_text":"Евгений Васильевич Неронович (1888—1918) — украинский политический и общественный деятель, революционер. Член Украинской Центральной Рады. С марта 1918 года — народный секретарь военных дел советского правительства Украины.\\n\\nБиография.\\nЕвгений Неронович родился 1888 года возле города Пирятин (Полтавская губерния). Получил среднее образование в Полтавской мужской гимназии и на электротехническом факультете Петербургского технологического института.","main_text":"Пирятин — город в Полтавской области Украины. Входит в Лубенский район. До 2020 года был административным центром упразднённого Пирятинского района, в котором составлял Пирятинский городской совет, в который, кроме того, входили сёла Верхояровка, Голобородько, Замостище, Заречье, Ивженки, Калинов Мост, Александровка, Ровное и Могилевщина.","question":"Как называется законодательный орган города, где родился Евгений Васильевич Неронович?","bridge_answers":[{"label":"passage","offset":288,"length":7,"segment":"Пирятин"}],"main_answers":[{"label":"passage","offset":181,"length":15,"segment":"городской совет"}]} 4 | {"support_text":"Гурий Константинович Щенников (17 ноября 1931, Челябинск — 6 сентября 2010, Санкт-Петербург) — советский и российский , доктор филологических наук (1982), профессор (1983). Заслуженный деятель науки Российской Федерации (1995).\\n\\nНаучная деятельность.\\nАкадемик РАН Г. М. Фридлендер полагал, что его монография \"Достоевский и русский реализм\" \"может быть с полным правом причислена к высшим достижениям советской историко-литературной науки 80-х годов в целом\". В 1990-х годах Г. К. Щенников организовал широкую группу российских ученых, работающих над проблемой \"Достоевский и русская культура\". Результатом этой работы явились книги \"Творчество Достоевского: искусство синтеза\" (1991), два выпуска коллективной монографии \"Достоевский и национальная культура\" (1994—1996) и крупный словарь-справочник \"Достоевский: Эстетика и поэтика\" (Челябинск, 1997) — 240 статей, в работе над которыми участвовали 37 авторов из различных вузов страны. Г. К. Щенников был также редактором сборника \"Русская литература 1870—1890-х гг.\". . Под его руководством начато издание полного собрания сочинений Д. Н. Мамина-Сибиряка, а также подготовлено 13 кандидатских и 5 докторских диссертаций.","main_text":"Челябинск — город в Российской Федерации, административный центр Челябинской области, седьмой по количеству жителей, шестнадцатый по занимаемой площади городской округ с внутригородским делением. Второй по величине культурный, экономический, деловой и политический центр УрФО. Население — человек .\\n\\nЭкономика.\\n816 аварий в 2015 году произошло из-за ям, а сам Челябинск, по данным компании \"АльфаСтрахование\" занимает 8 место среди российских городов по аварийности. Челябинские дороги были подвергнуты резкой критике премьер-министром России Дмитрием Медведевым. Также дороги Челябинска были названы Общероссийским народным фронтом одними из самых худших и опасных в стране. 
Крайне низкое качество дорожной инфраструктуры Челябинска постоянно подвергается критике со стороны известных гостей города, в частности, тележурналист Владимир Соловьёв и музыкант Баста заявили, что в Челябинске самые плохие дороги в стране. Из-за разбитых дорог Челябинска главе города Евгению Тефтелеву прокуратурой было внесено представление. Евгений Тефтелев и губернатор области Борис Дубровский соглашаются с тем, что городская дорожная сеть находится в крайне неудовлетворительном и обветшалом состоянии.","question":"Кто является главой администрации в городе, где родился Гурий Константинович Щенников?","bridge_answers":[{"label":"passage","offset":47,"length":9,"segment":"Челябинск"}],"main_answers":[{"label":"passage","offset":969,"length":17,"segment":"Евгению Тефтелеву"}]} 5 | {"support_text":"Рамитница — река в России, протекает в Опаринском районе Кировской области. Устье реки находится в 14 км по правому берегу реки Белая. Длина реки составляет 12 км.","main_text":"Белая — река в России, протекает в Опаринском районе Кировской области. Устье реки находится в 52 км по левому берегу реки Волманга. Длина реки составляет 40 км, площадь бассейна 316 кв км. Белая берёт исток в лесах близ точки, где сходятся Кировская, Вологодская и Костромская область в 22 км к северо-западу от посёлка Верхняя Волманга. Река течёт на юго-восток по ненаселённому заболоченному лесу. Впадает в Волмангу в посёлке Верхняя Волманга.","question":"С какой рекой сливается река, в которую впадает Рамитница?","bridge_answers":[{"label":"passage","offset":128,"length":5,"segment":"Белая"}],"main_answers":[{"label":"passage","offset":123,"length":8,"segment":"Волманга"}]} 6 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/spelling/butter_fingers.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | import spacy 4 | from typing import List, Optional, Union 5 | from spacy.language import Language 6 | 7 | from rutransform.utils.args import TransformArguments 8 | from rutransform.transformations.utils import SentenceOperation 9 | 10 | """ 11 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/butter_fingers_perturbation 12 | """ 13 | 14 | 15 | def butter_finger( 16 | text: str, 17 | spacy_model: Language, 18 | prob: float = 0.1, 19 | seed: int = 42, 20 | max_outputs: int = 1, 21 | stop_words: List[int] = None, 22 | ) -> List[str]: 23 | """ 24 | Adds typos to text the sentence using keyboard distance 25 | 26 | Parameters 27 | ---------- 28 | text: str 29 | text to transform 30 | spacy_model: spacy.language.Language 31 | spacy model used for lemmatization 32 | prob: float 33 | probability of the transformation (default is 0.1) 34 | seed: int 35 | seed to freeze everything (default is 42) 36 | max_outputs: int 37 | maximum number of the returned sentences (default is 1) 38 | stop_words: List[int], optional 39 | stop words to ignore during transformation (default is None) 40 | 41 | Returns 42 | ------- 43 | List[str] 44 | list of transformed sentences 45 | """ 46 | random.seed(seed) 47 | key_approx = { 48 | "й": "йцфыувяч", 49 | "ц": "цйуыфвкасч", 50 | "у": "уцкыавйфячсмпе", 51 | "к": "куевпацычсмпе", 52 | "е": "екнарпувсмитог", 53 | "н": "негпоркамитош", 54 | "г": "гншрлоепитьдщ", 55 | "ш": "шгщодлнртьдз", 56 | "щ": "щшзлдгоь", 57 | "з": "здщхэшл", 58 | "х": "хзъэж\щдю.", 59 | "ъ": "ъх\зэж.", 60 | "ф": "фйыяцчцвсу", 61 | "ы": "ыцчфвкам", 
62 | "в": "вусыафйпим", 63 | "а": "авпкмцычнрт", 64 | "п": "пеиарувснот", 65 | "р": "рнтпоакмлшь", 66 | "о": "орлтгпеидщь", 67 | "л": "лодштнрт", 68 | "д": "дщльзгот", 69 | "ж": "жз.дэх\ю", 70 | "э": "эхж\зъ.", 71 | "я": "яфчымву", 72 | "ч": "чясывимакуцй", 73 | "с": "счмваяыцукпи", 74 | "м": "мсаипчвукент", 75 | "и": "имтпрсаенгт", 76 | "т": "тиьромпегшл", 77 | "ь": "ьтлодщшл", 78 | "б": "блдьюож", 79 | "ю": "юджб.ьл", 80 | " ": " ", 81 | } 82 | if stop_words is None: 83 | stop_words = [] 84 | 85 | transformed_texts = [] 86 | split_text = [token.text for token in spacy_model(text)] 87 | for _ in itertools.repeat(None, max_outputs): 88 | butter_text = [] 89 | for w, word in enumerate(split_text): 90 | if w in stop_words: 91 | butter_text.append(word) 92 | else: 93 | new_word = "" 94 | for letter in word: 95 | lcletter = letter.lower() 96 | if lcletter not in key_approx.keys(): 97 | new_letter = lcletter 98 | else: 99 | if random.uniform(0, 1) <= prob: 100 | new_letter = random.choice(key_approx[lcletter]) 101 | else: 102 | new_letter = lcletter 103 | # go back to original case 104 | if not lcletter == letter: 105 | new_letter = new_letter.upper() 106 | new_word += new_letter 107 | butter_text.append(new_word) 108 | transformed_texts.append(" ".join(butter_text)) 109 | return transformed_texts 110 | 111 | 112 | class ButterFingersTransformation(SentenceOperation): 113 | """ 114 | Add typos to text the sentence using keyboard distance 115 | 116 | Attributes 117 | ---------- 118 | args: TransformArguments 119 | parameters of the transformation 120 | seed: int 121 | seed to freeze everything (default is 42) 122 | max_outputs: int 123 | maximum number of the transfromed sentences (default is 1) 124 | device: str 125 | ! exists for compatability, always ignored ! 126 | the device used during transformation (default is 'cpu') 127 | spacy_model: spacy.language.Language 128 | spacy model used for tokenization 129 | 130 | Methods 131 | ------- 132 | generate(sentence, stop_words, prob) 133 | Transforms the sentence 134 | """ 135 | 136 | def __init__( 137 | self, 138 | args: TransformArguments, 139 | seed: int = 42, 140 | max_outputs: int = 1, 141 | device: Optional[str] = None, 142 | spacy_model: Optional[Language] = None, 143 | ) -> None: 144 | """ 145 | Parameters 146 | ---------- 147 | args: TransformArguments 148 | parameters of the transformation 149 | seed: int 150 | seed to freeze everything (default is 42) 151 | max_outputs: int 152 | maximum number of the transfromed sentences (default is 1) 153 | device: str 154 | ! exists for compatability, always ignored ! 155 | the device used during transformation (default is 'cpu') 156 | spacy_model: spacy.language.Language 157 | spacy model used for tokenization 158 | """ 159 | if spacy_model is None: 160 | spacy_model = spacy.load("ru_core_news_sm") 161 | 162 | super().__init__( 163 | args=args, 164 | seed=seed, 165 | max_outputs=max_outputs, 166 | device=device, 167 | spacy_model=spacy_model, 168 | ) 169 | 170 | def generate( 171 | self, 172 | sentence: str, 173 | stop_words: Optional[List[Union[int, str]]] = None, 174 | prob: Optional[float] = None, 175 | ) -> List[str]: 176 | """ 177 | Transforms the sentence 178 | 179 | If 'prob' argument is not None, ignores the probability provided in the arguments. 
180 | 181 | Parameters 182 | ---------- 183 | sentence: str 184 | sentence to transform 185 | stop_words: List[int], optional 186 | stop words to ignore during transformation (default is None) 187 | prob: float, optional 188 | probability of the transformation (default is None) 189 | 190 | Returns 191 | ------- 192 | list 193 | list of transformed sentences 194 | """ 195 | transformed = butter_finger( 196 | text=sentence, 197 | spacy_model=self.spacy_model, 198 | prob=(self.args.probability if not prob else prob), 199 | seed=self.seed, 200 | max_outputs=self.max_outputs, 201 | stop_words=stop_words, 202 | ) 203 | return transformed 204 | -------------------------------------------------------------------------------- /rutransform/transformations/transformers/bae.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | from rutransform.transformations.transformations.bae import * 4 | from rutransform.transformations import Transformer 5 | from rutransform.constraints import Constraint 6 | from rutransform.utils.args import TransformArguments 7 | from rutransform.transformations.utils import SentenceOperation 8 | 9 | 10 | class BAETransformer(Transformer): 11 | """ 12 | BAE (BERT-based Adversarial Examples) attack (https://arxiv.org/abs/2004.01970) 13 | 14 | Generates adversarial examples using contextual perturbations 15 | from a BERT masked language model. 16 | 17 | Utilizes the constraints provided by the user to extract task-specific 18 | stopwords to which the transformations do not apply. 19 | Uses a similarity metric (BERTScore) to filter out sentences 20 | whose similarity score falls below the threshold specified in 21 | TransformArguments. 22 | 23 | Attributes 24 | ---------- 25 | transformation_type: str 26 | type of the transformations supported by the transformer 27 | transformations: List[str] 28 | list of transformations to apply to data 29 | task_type: str 30 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
31 | args: TransformArguments 32 | parameters of the transformation 33 | text_col: str, optional 34 | name of the column containing text to transform (default is 'text') 35 | label_col: str, optional 36 | name of the target column (default is 'label') 37 | seed: int 38 | seed to freeze everything (default is 42) 39 | device: str 40 | device used during transformation (default is 'cpu') 41 | constraints: List[Constraint], optional 42 | list of transformation constraints (default is None) 43 | spacy_model: spacy.language.Language 44 | spacy model used for tokenization (default is 'ru_core_news_sm') 45 | bert_scorer: BERTScorer 46 | similarity metric class used to filter transformed texts (default is None) 47 | 48 | Methods 49 | ------- 50 | @staticmethod 51 | transform_info: Dict[str, SentenceOperation] 52 | dictionary mapping transformations and SentenceOperation classes 53 | load_transformations() 54 | Loads all the transformations required 55 | @abstractmethod 56 | transform(sentence) 57 | Applies the transformations to input 58 | sent_split(text) 59 | Splits text into sentences 60 | @staticmethod 61 | get_ids(matches) 62 | Returns ids of stopwords 63 | _transform_text(transformer, sentences, reference, stop_words, prob) 64 | Applies the transformations to long text and filters the transformed texts 65 | _transform_sentence(self, transformer, sentence, stop_words, prob) 66 | Applies the transformations to sentence and filters the transformed sentences 67 | _list_stop_words(sentence, return_ids) 68 | Extracts stopwords matching the constraints 69 | _drop_duplicates(reference, candidates, scores) 70 | Returns transformed sentences without duplicates 71 | _filter_candidates(candidates, reference, context) 72 | Filters out sentences based on the similarity score 73 | _sample_to_max_outputs(sentences, scores) 74 | Returns the desired number of the transformed sentences 75 | _update_data(org_sentence, transformed) 76 | Updates the dataset object 77 | """ 78 | 79 | def __init__( 80 | self, 81 | transformations: List[str], 82 | task_type: str, 83 | args: TransformArguments, 84 | text_col: Optional[str] = "text", 85 | label_col: Optional[str] = "label", 86 | seed: int = 42, 87 | device: str = "cpu", 88 | constraints: Optional[List[Constraint]] = None, 89 | ) -> None: 90 | """ 91 | Parameters 92 | ---------- 93 | transformations: List[str] 94 | list of transformations to apply to data 95 | task_type: str 96 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 
97 | args: TransformArguments 98 | parameters of the transformation 99 | text_col: str, optional 100 | name of the column containing text to transform (default is 'text') 101 | label_col: str, optional 102 | name of the target column (default is 'label') 103 | seed: int 104 | seed to freeze everything (default is 42) 105 | device: str 106 | device used during transformation (default is 'cpu') 107 | constraints: List[Constraint], optional 108 | list of transformation constraints (default is None) 109 | """ 110 | super().__init__( 111 | transformation_type="bae", 112 | transformations=transformations, 113 | task_type=task_type, 114 | args=args, 115 | text_col=text_col, 116 | label_col=label_col, 117 | seed=seed, 118 | device=device, 119 | constraints=constraints, 120 | ) 121 | 122 | self.transformers = self.load_transformations() 123 | 124 | @staticmethod 125 | def transform_info() -> Dict[str, Optional[SentenceOperation]]: 126 | """ 127 | Information about the transformations used by the transformer 128 | 129 | Returns 130 | ------- 131 | Dict[str, Optional[SentenceOperation]] 132 | dictionary storing transformation info 133 | """ 134 | info = {"bae": BAE} 135 | 136 | return info 137 | 138 | def transform(self, sentence: Dict[str, Any]) -> List[Dict[str, Any]]: 139 | """ 140 | Applies the transformations to input 141 | 142 | Parameters 143 | ---------- 144 | sentence: Dict[str, Any] 145 | dataset object in dict form 146 | 147 | Returns 148 | ------- 149 | Dict[str, Any] 150 | Transformed dataset object 151 | """ 152 | transformed_data = [] 153 | scores = [] 154 | for transform_name, transformer in self.transformers.items(): 155 | stop_words = self._list_stop_words(sentence, return_ids=False) 156 | sentences = self.sent_split(sentence[self.text_col]) 157 | if len(sentences) > 1: 158 | transform_sent, sent_scores = self._transform_text( 159 | transformer, sentences, sentence[self.text_col], stop_words 160 | ) 161 | else: 162 | transform_sent, sent_scores = self._transform_sentence( 163 | transformer, sentence[self.text_col], stop_words 164 | ) 165 | transformed = self._update_data(sentence, transform_sent) 166 | transformed_data.extend(transformed) 167 | scores.extend(sent_scores) 168 | 169 | return transformed_data, scores 170 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/spelling/emojify.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import spacy 4 | from json import load 5 | from typing import Dict, List, Optional, Union 6 | 7 | from spacy.language import Language 8 | 9 | from rutransform.utils.args import TransformArguments 10 | from rutransform.transformations.utils import SentenceOperation 11 | 12 | """ 13 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/emojify 14 | """ 15 | 16 | 17 | def emojify( 18 | sentence: str, 19 | word_to_emoji: Dict[str, str], 20 | spacy_model: Language, 21 | prob: float = 0.1, 22 | seed: int = 0, 23 | max_outputs: int = 1, 24 | stop_words: Optional[List[str]] = None, 25 | ) -> List[str]: 26 | """ 27 | Randomly replaces tokens with corresponding emojis 28 | 29 | Parameters 30 | ---------- 31 | sentence: str 32 | sentence to transform 33 | word_to_emoji: Dict[str, str] 34 | dictionary with emojis and their meanings 35 | spacy_model: spacy.language.Language 36 | spacy model used for lemmatization 37 | prob: float 38 | probabilty of the transformation (default is 0.1) 39 | 
seed: int 40 | seed to freeze everything (default is 42) 41 | max_outputs: int 42 | maximum number of the returned sentences (default is 1) 43 | stop_words: List[str], optional 44 | stop words to ignore during transformation (default is None) 45 | 46 | Returns 47 | ------- 48 | List[str] 49 | list of transformed sentences 50 | """ 51 | random.seed(seed) 52 | doc = spacy_model(sentence) 53 | results = [] 54 | 55 | if stop_words is None: 56 | stop_words = [] 57 | 58 | for _ in range(max_outputs): 59 | 60 | # Reconstruct the sentence with replaced lemma 61 | transformed_sentence = "" 62 | 63 | for t, token in enumerate(doc): 64 | lemma = token.lemma_.lower() 65 | 66 | if t in stop_words: 67 | transformed_sentence += token.text_with_ws 68 | 69 | # Handle numeric tokens 70 | elif lemma.isnumeric(): 71 | if random.uniform(0, 1) < prob: 72 | for digit in list(lemma): 73 | emoji = digit 74 | if digit in word_to_emoji: 75 | emoji = random.choice(word_to_emoji[digit]) 76 | transformed_sentence += emoji 77 | 78 | if " " in token.text_with_ws: 79 | transformed_sentence += " " 80 | 81 | else: 82 | transformed_sentence += token.text_with_ws 83 | 84 | elif lemma in word_to_emoji: 85 | # We have `prob` chance to replace this token with emoji 86 | if random.uniform(0, 1) < prob: 87 | 88 | # Randomly choose a emoji candidate for this token 89 | emoji = random.choice(word_to_emoji[lemma]) 90 | transformed_sentence += emoji 91 | 92 | if " " in token.text_with_ws: 93 | transformed_sentence += " " 94 | 95 | else: 96 | transformed_sentence += token.text_with_ws 97 | 98 | else: 99 | # If lemma is not in the emoji dictionary, we keep it the same 100 | transformed_sentence += token.text_with_ws 101 | 102 | results.append(transformed_sentence) 103 | 104 | return results 105 | 106 | 107 | class EmojifyTransformation(SentenceOperation): 108 | """ 109 | Augments the input sentence by swapping words 110 | into emojis with similar meanings 111 | 112 | Attributes 113 | ---------- 114 | args: TransformArguments 115 | parameters of the transformation 116 | spacy_model: spacy.language.Language 117 | spacy model used for tokenization 118 | seed: int 119 | seed to freeze everything (default is 42) 120 | max_outputs: int 121 | maximum number of the transfromed sentences (default is 1) 122 | device: str 123 | ! exists for compatability, always ignored ! 124 | the device used during transformation (default is 'cpu') 125 | 126 | Methods 127 | ------- 128 | generate(sentence, stop_words, prob) 129 | Transforms the sentence 130 | """ 131 | 132 | def __init__( 133 | self, 134 | args: TransformArguments, 135 | spacy_model: Optional[Language] = None, 136 | seed: int = 42, 137 | max_outputs: int = 1, 138 | device: Optional[str] = None, 139 | ) -> None: 140 | """ 141 | Parameters 142 | ---------- 143 | args: TransformArguments 144 | parameters of the transformation 145 | spacy_model: spacy.language.Language 146 | spacy model used for tokenization 147 | seed: int 148 | seed to freeze everything (default is 42) 149 | max_outputs: int 150 | maximum number of the transfromed sentences (default is 1) 151 | device: str 152 | ! exists for compatability, always ignored ! 
153 | the device used during transformation (default is None) 154 | """ 155 | if spacy_model is None: 156 | spacy_model = spacy.load("ru_core_news_sm") 157 | 158 | super().__init__( 159 | args=args, 160 | seed=seed, 161 | max_outputs=max_outputs, 162 | device=device, 163 | spacy_model=spacy_model, 164 | ) 165 | 166 | emoji_dict_path = "emoji_dict_ru.json" 167 | # Load the emoji dictionary 168 | dict_path = os.path.join( 169 | os.path.dirname(os.path.abspath(__file__)), emoji_dict_path 170 | ) 171 | self.word_to_emoji = load(open(dict_path, "r")) 172 | 173 | def generate( 174 | self, 175 | sentence: str, 176 | stop_words: Optional[List[Union[int, str]]] = None, 177 | prob: Optional[float] = None, 178 | ) -> List[str]: 179 | """ 180 | Transforms the sentence 181 | 182 | If 'prob' argument is not None, ignores the probability provided in the arguments. 183 | 184 | Parameters 185 | ---------- 186 | sentence: str 187 | sentence to transform 188 | stop_words: List[Union[int, str]], optional 189 | stop_words to ignore during transformation (default is None) 190 | prob: float, optional 191 | probability of the transformation (default is None) 192 | 193 | Returns 194 | ------- 195 | list 196 | list of transformed sentences 197 | """ 198 | transformed = emojify( 199 | sentence=sentence, 200 | word_to_emoji=self.word_to_emoji, 201 | spacy_model=self.spacy_model, 202 | prob=(self.args.probability if not prob else prob), 203 | seed=self.seed, 204 | max_outputs=self.max_outputs, 205 | stop_words=stop_words, 206 | ) 207 | return transformed 208 | -------------------------------------------------------------------------------- /rutransform/transformations/dataset_transformer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, NamedTuple 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from datasets import Dataset 6 | from tqdm.auto import tqdm 7 | 8 | from rutransform.transformations import Transformer 9 | from rutransform.transformations.utils import * 10 | from rutransform.constraints import * 11 | from rutransform.utils.args import TransformArguments 12 | from rutransform.transformations import load_transformers 13 | 14 | 15 | class DatasetTransformer(object): 16 | """ 17 | Class for dataset transformation 18 | 19 | Attributes 20 | ---------- 21 | dataset: Union[Dataset, pd.DataFrame] 22 | dataset to transform 23 | args: TransformArguments 24 | transformation parameters 25 | text_col: str, optional 26 | name of the column containing text to transform (default is 'text') 27 | label_col: str, optional 28 | name of the target column (default is 'label') 29 | task_type: str, optional 30 | type of the task (default is None) 31 | if dataset_name in original tasks, 32 | defaults to task_type of the dataset 33 | seed: int, optional 34 | seed to freeze everything (default is 42) 35 | device: str 36 | the device used during transformation (default is 'cpu') 37 | return_type: str 38 | type of the transformed dataset (default is 'hf') 39 | - if 'pd' - returns pandas.DataFrame 40 | - if 'hf' - returns HuggingFace Dataset 41 | custom_constraints: List[Constraint] 42 | list of custom constraints for transformation (defaul is None) 43 | if not provided, uses contrsaints for task_type, 44 | else uses only custom ones 45 | transform_dict: dict 46 | dictionary containing Transformer classes by transformation 47 | provided in utils.constants 48 | transform_info: 49 | dictionary mapping transformations and SentenceOperation classes 50 | provided in 
utils.constants 51 | 52 | Methods 53 | ------- 54 | load_transformer() 55 | Loads the transformer used for transformation 56 | transform() 57 | Transforms dataset 58 | 59 | """ 60 | 61 | def __init__( 62 | self, 63 | dataset: Union[Dataset, pd.DataFrame], 64 | args: TransformArguments, 65 | text_col: str = "text", 66 | label_col: str = "label", 67 | task_type: Optional[str] = None, 68 | seed: int = 42, 69 | device: str = "cpu", 70 | return_type: str = "hf", 71 | custom_constraints: List[Constraint] = None, 72 | use_constraints: bool = True, 73 | ) -> None: 74 | """ 75 | dataset: Union[Dataset, pd.DataFrame] 76 | dataset to transform 77 | args: TransformArguments 78 | transformation parameters 79 | text_col: str, optional 80 | name of the column containing text to transform (default is 'text') 81 | label_col: str, optional 82 | name of the target column (default is 'label') 83 | task_type: str, optional 84 | type of the task 85 | seed: int, optional 86 | seed to freeze everything (default is 42) 87 | device: str 88 | the device used during transformation (default is 'cpu') 89 | return_type: str 90 | type of the transformed dataset (default is 'hf') 91 | - if 'pd' - returns pandas.DataFrame 92 | - if 'hf' - returns HuggingFace Dataset 93 | custom_constraints: List[Constraint] 94 | list of custom constraints for transformation (defaul is None) 95 | if not provided, uses contrsaints for task_type, 96 | else uses only custom ones 97 | """ 98 | self.dataset = dataset 99 | self.args = args 100 | self.text_col = text_col 101 | self.label_col = label_col 102 | self.task_type = task_type 103 | self.seed = seed 104 | self.device = device 105 | self.return_type = return_type 106 | self.custom_constraints = custom_constraints 107 | self.use_constraints = use_constraints 108 | 109 | self.transform_dict = load_transformers() 110 | self.transformer = self.load_transformer() 111 | 112 | def load_transformer(self) -> Transformer: 113 | """ 114 | Loads the transformer used for transformation. 115 | Initializes task_type and default constraints for task. 116 | 117 | Returns 118 | ------- 119 | Transformer 120 | initialized Transformer class 121 | """ 122 | if self.args.transformation not in self.transform_dict: 123 | raise ValueError( 124 | "Invalid transformation name: %s" % self.args.transformation 125 | ) 126 | 127 | if self.use_constraints: 128 | if self.custom_constraints is not None: 129 | constraints = self.custom_constraints 130 | else: 131 | constraints = [] 132 | if self.task_type == "multichoice_qa": 133 | constraints = [NamedEntities()] 134 | elif self.task_type == "winograd": 135 | constraints = [NamedEntities(), Referents()] 136 | elif self.task_type == "jeopardy": 137 | constraints = [NamedEntities(), Jeopardy()] 138 | elif self.task_type == "multihop": 139 | constraints = [ 140 | NamedEntities(), 141 | Multihop("bridge_answers", "main_answers"), 142 | ] 143 | else: 144 | constraints = None 145 | 146 | transformer = self.transform_dict[self.args.transformation]( 147 | transformations=[self.args.transformation], 148 | task_type=self.task_type, 149 | args=self.args, 150 | text_col=self.text_col, 151 | label_col=self.label_col, 152 | seed=self.seed, 153 | device=self.device, 154 | constraints=constraints, 155 | ) 156 | 157 | return transformer 158 | 159 | def transform(self) -> TransformResult: 160 | """ 161 | Transforms dataset 162 | 163 | Applies provided transformations to dataset. 164 | Uses constraints to ensure the quality of the transformation. 
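To show how the class above is typically driven end to end, here is a small illustrative sketch. The TransformArguments constructor keywords (`transformation`, `probability`) are assumed from the attributes used elsewhere in this repository, and the column names match the default `text_col`/`label_col`; the rest follows the signatures and return values shown in this file.

# Illustrative end-to-end sketch of DatasetTransformer (assumed TransformArguments keywords).
import pandas as pd
from rutransform.utils.args import TransformArguments
from rutransform.transformations import DatasetTransformer

df = pd.DataFrame({"text": ["Первый пример.", "Второй пример."], "label": [0, 1]})
args = TransformArguments(transformation="butter_fingers", probability=0.1)  # assumed keywords

transformer = DatasetTransformer(
    dataset=df,
    args=args,
    task_type="classification",
    return_type="pd",  # return a pandas.DataFrame instead of a HuggingFace Dataset
    device="cpu",
)
result = transformer.transform()
print(result.score, result.std)            # mean and std of the BERTScore similarity
print(result.transformed_dataset.head())   # transformed copy of the input dataset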
165 | 166 | Returns 167 | ------- 168 | TransformResult 169 | result of the transformation, including 170 | - transformed dataset 171 | type provided during initialization (self.return_type) 172 | - similarity scores of each transformed text (BERT-score) 173 | - mean similarity score (BERT-score) 174 | - standard deviation of the similarity scores 175 | 176 | """ 177 | 178 | if type(self.dataset) is pd.DataFrame: 179 | dataset = Dataset.from_pandas(self.dataset) 180 | else: 181 | dataset = self.dataset 182 | 183 | transformed = [] 184 | scores = [] 185 | for sent_ind, sentence in tqdm( 186 | enumerate(dataset), total=len(dataset), desc="Transforming data" 187 | ): 188 | transformed_sentence, sent_scores = self.transformer.transform(sentence) 189 | transformed.extend(transformed_sentence) 190 | scores.extend(sent_scores) 191 | 192 | transformed = pd.DataFrame(transformed) 193 | 194 | if self.return_type == "hf": 195 | transformed = Dataset.from_pandas(transformed) 196 | 197 | scores = np.array(scores) 198 | 199 | return TransformResult( 200 | transformed_dataset=transformed, 201 | scores=scores, 202 | score=scores.mean(), 203 | std=np.std(scores), 204 | ) 205 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/paraphrasis/back_translation.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import spacy 3 | from typing import List, Tuple, Optional, Union 4 | 5 | import numpy as np 6 | from spacy.language import Language 7 | from transformers import MarianTokenizer, MarianMTModel 8 | 9 | from rutransform.utils.args import TransformArguments 10 | from rutransform.transformations.utils import SentenceOperation 11 | 12 | 13 | """ 14 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/back_translation_ner 15 | """ 16 | 17 | 18 | class BackTranslationNER(SentenceOperation): 19 | """ 20 | Generates diverse linguistic variations of the contexts 21 | around the entity mention(s) through back-translation 22 | ru -> en -> ru using Heksinki-NLP/opus-mt models 23 | 24 | Attributes 25 | ---------- 26 | args: TransformArguments 27 | parameters of the transformation 28 | seed: int 29 | seed to freeze everything (default is 42) 30 | max_outputs: int 31 | maximum number of the transfromed sentences (default is 1) 32 | device: str 33 | the device used during transformation (default is 'cpu') 34 | spacy_model: spacy.language.Language 35 | spacy model used for tokenization 36 | 37 | Methods 38 | ------- 39 | spacy_tagger(text, stop_words) 40 | Tokenizes the sentence and extract entity mentions 41 | translation_pipeline(text) 42 | Passes the text in source languages through the intermediate 43 | translations 44 | create_segments(tokens, tags) 45 | Creates segments for translation 46 | generate(sentence, stop_words, prob) 47 | Transforms the sentence 48 | """ 49 | 50 | def __init__( 51 | self, 52 | args: TransformArguments, 53 | seed: int = 42, 54 | max_outputs: int = 1, 55 | device: str = "cpu", 56 | spacy_model: Optional[Language] = None, 57 | ) -> None: 58 | """ 59 | Parameters 60 | ---------- 61 | args: TransformArguments 62 | parameters of the transformation 63 | seed: int 64 | seed to freeze everything (default is 42) 65 | max_outputs: int 66 | maximum number of the transfromed sentences (default is 1) 67 | device: str 68 | the device used during transformation (default is 'cpu') 69 | spacy_model: spacy.language.Language 70 | spacy model used 
for tokenization 71 | """ 72 | if spacy_model is None: 73 | spacy_model = spacy.load("ru_core_news_sm") 74 | 75 | super().__init__( 76 | args=args, 77 | seed=seed, 78 | max_outputs=max_outputs, 79 | device=device, 80 | spacy_model=spacy_model, 81 | ) 82 | 83 | np.random.seed(self.seed) 84 | mname_ru2en = "Helsinki-NLP/opus-mt-ru-en" 85 | mname_en2ru = "Helsinki-NLP/opus-mt-en-ru" 86 | self.tokenizer_ru2en = MarianTokenizer.from_pretrained(mname_ru2en) 87 | self.tokenizer_en2ru = MarianTokenizer.from_pretrained(mname_en2ru) 88 | self.model_ru2en = MarianMTModel.from_pretrained(mname_ru2en).to(self.device) 89 | self.model_en2ru = MarianMTModel.from_pretrained(mname_en2ru).to(self.device) 90 | self.spacy_model = spacy_model 91 | 92 | def spacy_tagger( 93 | self, text: str, stop_words: Optional[List[str]] 94 | ) -> Tuple[List[str], List[str]]: 95 | """ 96 | Tokenizes the sentence and extract entity mentions 97 | 98 | Parameters 99 | ---------- 100 | text: str 101 | text to tokenize 102 | stop_words: List[int], optional 103 | stop_words to ignore during transformation (default is None) 104 | 105 | Returns 106 | ------- 107 | Tuple[List[str], List[str]] 108 | tokenized text, BIO-annotated text 109 | """ 110 | doc = self.spacy_model(text) 111 | ner = [] 112 | tokenized = [] 113 | for t, token in enumerate(doc): 114 | tokenized.append(token.text) 115 | if token.ent_type_: 116 | ner.append(token.ent_type_) 117 | elif stop_words is not None and t in stop_words: 118 | ner.append("B") 119 | else: 120 | ner.append("O") 121 | return tokenized, ner 122 | 123 | def translation_pipeline(self, text: str) -> str: 124 | """ 125 | Passes the text in source languages through the intermediate 126 | translations 127 | 128 | Parameters 129 | ---------- 130 | text: str 131 | text to translate 132 | 133 | Returns 134 | ------- 135 | str 136 | back-translated text 137 | """ 138 | ru2en_inputids = self.tokenizer_ru2en.encode(text, return_tensors="pt") 139 | ru2en_inputids = ru2en_inputids.to(self.device) 140 | outputs_ru2en = self.model_ru2en.generate(ru2en_inputids) 141 | text_trans = self.tokenizer_ru2en.decode( 142 | outputs_ru2en[0], skip_special_tokens=True 143 | ) 144 | en2ru_inputids = self.tokenizer_en2ru.encode(text_trans, return_tensors="pt") 145 | en2ru_inputids = en2ru_inputids.to(self.device) 146 | outputs_en2ru = self.model_en2ru.generate(en2ru_inputids) 147 | text_trans = self.tokenizer_en2ru.decode( 148 | outputs_en2ru[0], skip_special_tokens=True 149 | ) 150 | return text_trans 151 | 152 | @staticmethod 153 | def create_segments( 154 | tokens: List[str], tags: List[str] 155 | ) -> Tuple[List[List[str]], List[List[str]]]: 156 | """ 157 | Creates segments for translation 158 | 159 | A segment is defined as a consecutive sequence of same tag/label 160 | 161 | Parameters 162 | ---------- 163 | tokens: List[str] 164 | tokenized text 165 | tags: List[str] 166 | BIO-annotated text 167 | 168 | Returns 169 | ------- 170 | Tuple[List[List[str]], List[List[str]]] 171 | segments of the text and its BIO-annotation 172 | """ 173 | segment_tokens, segment_tags = [], [] 174 | tags_idxs = [(i, t) for i, t in enumerate(tags)] 175 | groups = [ 176 | list(g) 177 | for _, g in itertools.groupby(tags_idxs, lambda s: s[1].split("-")[-1]) 178 | ] 179 | for group in groups: 180 | idxs = [i[0] for i in group] 181 | segment_tokens.append([tokens[idx] for idx in idxs]) 182 | segment_tags.append([tags[idx] for idx in idxs]) 183 | 184 | return segment_tokens, segment_tags 185 | 186 | def generate( 187 | self, 188 | sentence: str, 
189 | stop_words: Optional[List[Union[int, str]]] = None, 190 | prob: Optional[float] = None, 191 | ) -> List[str]: 192 | """ 193 | Transforms the sentence 194 | 195 | Parameters 196 | ---------- 197 | sentence: str 198 | sentence to transform 199 | stop_words: List[int], optional 200 | stop_words to ignore during transformation (default is None) 201 | prob: float, optional 202 | ! exists for compatability, always ignored ! 203 | probability of the transformation (default is None) 204 | 205 | Returns 206 | ------- 207 | list 208 | list of transformed sentences 209 | """ 210 | 211 | # tag sentence to extract entity mentions 212 | token_sequence, tag_sequence = self.spacy_tagger(sentence, stop_words) 213 | 214 | assert len(token_sequence) == len( 215 | tag_sequence 216 | ), f"token_sequence and tag_sequence should have same length! {len(token_sequence)}!={len(tag_sequence)}" 217 | 218 | transformations = [] 219 | segment_tokens, segment_tags = BackTranslationNER.create_segments( 220 | token_sequence, tag_sequence 221 | ) 222 | for _ in range(self.max_outputs): 223 | tokens = [] 224 | for s_token, s_tag in zip(segment_tokens, segment_tags): 225 | if len(s_token) >= 100: 226 | segment_text = " ".join(s_token) 227 | tokens.extend([segment_text]) 228 | continue 229 | translate_segment = np.random.binomial(1, p=self.args.bin_p) 230 | if ( 231 | s_tag[0] != "O" 232 | or len(s_token) < self.args.segment_length 233 | or not translate_segment 234 | ): 235 | tokens.extend(s_token) 236 | continue 237 | segment_text = " ".join(s_token) 238 | segment_translation = self.translation_pipeline(segment_text) 239 | tokens.extend([segment_translation]) 240 | 241 | transformations.append(" ".join(tokens)) 242 | 243 | return transformations 244 | -------------------------------------------------------------------------------- /rutransform/transformations/transformers/spelling.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple 2 | 3 | from rutransform.transformations.transformations.spelling import * 4 | from rutransform.transformations import Transformer 5 | from rutransform.constraints import Constraint 6 | from rutransform.utils.args import TransformArguments 7 | from rutransform.transformations.utils import SentenceOperation 8 | 9 | 10 | class SpellingTransformer(Transformer): 11 | """ 12 | Spelling transformations 13 | 14 | Adds noise to text by mimicking common spelling mistakes, 15 | such as case alteration and keyboard distance character swaps, 16 | and emoji replacement 17 | 18 | Uses similarity metric (BERTScore) to filter the sentences, 19 | similarity score of which are less than a threshold (specified in 20 | TransformArguments). 21 | 22 | Attributes 23 | ---------- 24 | transformation_type: str 25 | type of the transformations supported by the transformer 26 | transformations: List[str] 27 | list of transformations to apply to data 28 | task_type: str 29 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 
30 | args: TransformArguments 31 | parameters of the transformation 32 | text_col: str, optional 33 | name of the column containing text to transform (default is 'text') 34 | label_col: str, optional 35 | name of the target column (default is 'label') 36 | seed: int 37 | seed to freeze everything (default is 42) 38 | device: str 39 | device used during transformation (default is 'cpu') 40 | constraints: List[Constraint], optional 41 | list of transformation constraints (default is None) 42 | spacy_model: spacy.language.Language 43 | spacy model used for tokenization (default is 'ru_core_news_sm') 44 | bert_scorer: BERTScorer 45 | similarity metric class used to filter transformed texts (default is None) 46 | 47 | Methods 48 | ------- 49 | @staticmethod 50 | transform_info: Dict[str, SentenceOperation] 51 | dictionary mapping transformations and SentenceOperation classes 52 | load_transformations() 53 | Loads all the transformations required 54 | @abstractmethod 55 | transform(sentence) 56 | Applies the transformations to input 57 | sent_split(text) 58 | Splits text into sentences 59 | @staticmethod 60 | get_ids(matches) 61 | Returns ids of stopwords 62 | _transform_text(transformer, sentences, reference, stop_words, prob) 63 | Applies the transformations to long text and filters the transformed texts 64 | _transform_sentence(self, transformer, sentence, stop_words, prob) 65 | Applies the transformations to sentence and filters the transformed sentences 66 | _apply_transformation(transformer, sentence, sentences, reference, stop_words, prob) 67 | Applies the transformations to text until the transformed text passes 68 | the similarity threshold 69 | _list_stop_words(sentence, return_ids) 70 | Extracts stopwords matching the constraints 71 | _drop_duplicates(reference, candidates, scores) 72 | Returns transformed sentences without duplicates 73 | _filter_candidates(candidates, reference, context) 74 | Filters out sentences based on the similarity score 75 | _sample_to_max_outputs(sentences, scores) 76 | Returns the desired number of the transformed sentences 77 | _update_data(org_sentence, transformed) 78 | Updates the dataset object 79 | """ 80 | 81 | def __init__( 82 | self, 83 | transformations: List[str], 84 | task_type: str, 85 | args: TransformArguments, 86 | text_col: Optional[str] = "text", 87 | label_col: Optional[str] = "label", 88 | seed: int = 42, 89 | device: str = "cpu", 90 | constraints: Optional[List[Constraint]] = None, 91 | ) -> None: 92 | """ 93 | Parameters 94 | ---------- 95 | transformations: List[str] 96 | list of transformations to apply to data 97 | task_type: str 98 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 
99 | args: TransformArguments 100 | parameters of the transformation 101 | text_col: str, optional 102 | name of the column containing text to transform (default is 'text') 103 | label_col: str, optional 104 | name of the target column (default is 'label') 105 | seed: int 106 | seed to freeze everything (default is 42) 107 | device: str 108 | device used during transformation (default is 'cpu') 109 | constraints: List[Constraint], optional 110 | list of transformation constraints (default is None) 111 | """ 112 | super().__init__( 113 | transformation_type="spelling", 114 | transformations=transformations, 115 | task_type=task_type, 116 | args=args, 117 | text_col=text_col, 118 | label_col=label_col, 119 | seed=seed, 120 | device=device, 121 | constraints=constraints, 122 | ) 123 | 124 | self.transformers = self.load_transformations() 125 | 126 | @staticmethod 127 | def transform_info() -> Dict[str, Optional[SentenceOperation]]: 128 | """ 129 | Information about the transformations used by the transformer 130 | 131 | Returns 132 | ------- 133 | Dict[str, Optional[SentenceOperation]] 134 | dictionary storing transformation info 135 | """ 136 | info = { 137 | "butter_fingers": ButterFingersTransformation, 138 | "case": ChangeCharCase, 139 | "emojify": EmojifyTransformation, 140 | } 141 | 142 | return info 143 | 144 | def _apply_transformation( 145 | self, 146 | transformer: SentenceOperation, 147 | sentence: str, 148 | sentences: Optional[List[str]] = None, 149 | stop_words: Optional[List[Union[str, int, List[Union[str, int]]]]] = None, 150 | prob: Optional[float] = None, 151 | ) -> Tuple[List[str], List[float]]: 152 | """ 153 | Applies the transformations to text until the transformed text passes 154 | the similarity threshold 155 | 156 | Parameters 157 | ---------- 158 | transformer: SentenceOperation 159 | transformer used for transformation 160 | sentence: str 161 | original sentence 162 | sentences: List[str], optional 163 | list of sentences of the text to transform if working 164 | with long texts (default is None) 165 | stop_words: List[Union[str, int]], optional 166 | stop_words for the transformation (default is None) 167 | prob: float, optional 168 | probability of the transformation (default is None) 169 | 170 | Returns 171 | ------- 172 | Tuple[List[str], List[float]] 173 | list of transformed texts and their similarity scores 174 | """ 175 | 176 | if sentences is not None: 177 | transform_sent, sent_scores = self._transform_text( 178 | transformer, sentences, sentence, stop_words=stop_words, prob=prob 179 | ) 180 | prob = prob / 2 181 | count = 0 182 | while prob > 0.01 and count < 3: 183 | if len(transform_sent) == 1 and transform_sent[0] == sentence: 184 | transform_sent, sent_scores = self._transform_text( 185 | transformer, sentences, sentence, prob=prob 186 | ) 187 | count += 1 188 | prob = prob / 2 189 | else: 190 | break 191 | return transform_sent, sent_scores 192 | 193 | transform_sent, sent_scores = self._transform_sentence( 194 | transformer, sentence, stop_words=stop_words, prob=prob 195 | ) 196 | prob = prob / 2 197 | count = 0 198 | while prob > 0.01 and count < 3: 199 | if len(transform_sent) == 1 and transform_sent[0] == sentence: 200 | transform_sent, sent_scores = self._transform_sentence( 201 | transformer, sentence, stop_words=stop_words, prob=prob 202 | ) 203 | prob = prob / 2 204 | count += 1 205 | else: 206 | break 207 | 208 | return transform_sent, sent_scores 209 | 210 | def transform(self, sentence: Dict[str, Any]) -> List[Dict[str, Any]]: 211 | """ 
212 | Applies the transformations to input 213 | 214 | Parameters 215 | ---------- 216 | sentence: dict 217 | dataset object in dict form 218 | 219 | Returns 220 | ------- 221 | dict 222 | Transformed dataset object 223 | """ 224 | transformed_data = [] 225 | scores = [] 226 | for transform_name, transformer in self.transformers.items(): 227 | if self.task_type in ["multihop", "multichoice_qa"]: 228 | stop_words = self._list_stop_words(sentence) 229 | sentences = self.sent_split(sentence[self.text_col]) 230 | transform_sent, sent_scores = self._apply_transformation( 231 | transformer, 232 | sentence[self.text_col], 233 | sentences=sentences, 234 | stop_words=stop_words, 235 | prob=self.args.probability, 236 | ) 237 | 238 | else: 239 | stop_words = self._list_stop_words(sentence) 240 | transform_sent, sent_scores = self._apply_transformation( 241 | transformer, 242 | sentence[self.text_col], 243 | stop_words=stop_words, 244 | prob=self.args.probability, 245 | ) 246 | 247 | transformed = self._update_data(sentence, transform_sent, transform_name) 248 | transformed_data.extend(transformed) 249 | scores.extend(sent_scores) 250 | 251 | return transformed_data, scores 252 | -------------------------------------------------------------------------------- /rutransform/transformations/transformers/eda.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Tuple 2 | 3 | from rutransform.transformations.transformations.eda import * 4 | from rutransform.transformations import Transformer 5 | from rutransform.constraints import Constraint 6 | from rutransform.utils.args import TransformArguments 7 | from rutransform.transformations.utils import SentenceOperation 8 | 9 | 10 | class EDATransformer(Transformer): 11 | """ 12 | Easy Data Augmentation transformation 13 | 14 | 15 | Takes a sentence as input and applies random word swaps and deletions 16 | to transform it. 17 | 18 | Utilizes the constraints provided by the user to extract task-specific 19 | stopwords to which the transformations do not apply. 20 | Uses a similarity metric (BERTScore) to filter out sentences 21 | whose similarity score falls below the threshold specified in 22 | TransformArguments. 23 | 24 | Attributes 25 | ---------- 26 | transformation_type: str 27 | type of the transformations supported by the transformer 28 | transformations: List[str] 29 | list of transformations to apply to data 30 | task_type: str 31 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
32 | args: TransformArguments 33 | parameters of the transformation 34 | text_col: str, optional 35 | name of the column containing text to transform (default is 'text') 36 | label_col: str, optional 37 | name of the target column (default is 'label') 38 | seed: int 39 | seed to freeze everything (default is 42) 40 | device: str 41 | device used during transformation (default is 'cpu') 42 | constraints: List[Constraint] 43 | list of transformation constraints 44 | spacy_model: spacy.language.Language 45 | spacy model used for tokenization (default is 'ru_core_news_sm') 46 | bert_scorer: BERTScorer 47 | similarity metric class used to filter transformed texts (default is None) 48 | 49 | Methods 50 | ------- 51 | @staticmethod 52 | transform_info: Dict[str, SentenceOperation] 53 | dictionary mapping transformations and SentenceOperation classes 54 | load_transformations() 55 | Loads all the transformations required 56 | @abstractmethod 57 | transform(sentence) 58 | Applies the transformations to input 59 | sent_split(text) 60 | Splits text into sentences 61 | @staticmethod 62 | get_ids(matches) 63 | Returns ids of stopwords 64 | _transform_text(transformer, sentences, reference, stop_words, prob) 65 | Applies the transformations to long text and filters the transformed texts 66 | _transform_sentence(self, transformer, sentence, stop_words, prob) 67 | Applies the transformations to sentence and filters the transformed sentences 68 | _apply_transformation(transformer, sentence, sentences, reference, stop_words, prob) 69 | Applies the transformations to text until the transformed text passes 70 | the similarity threshold 71 | _list_stop_words(sentence, return_ids) 72 | Extracts stopwords matching the constraints 73 | _drop_duplicates(reference, candidates, scores) 74 | Returns transformed sentences without duplicates 75 | _filter_candidates(candidates, reference, context) 76 | Filters out sentences based on the similarity score 77 | _sample_to_max_outputs(sentences, scores) 78 | Returns the desired number of the transformed sentences 79 | _update_data(org_sentence, transformed) 80 | Updates the dataset object 81 | """ 82 | 83 | def __init__( 84 | self, 85 | transformations: List[str], 86 | task_type: str, 87 | args: TransformArguments, 88 | text_col: Optional[str] = "text", 89 | label_col: Optional[str] = "label", 90 | seed: int = 42, 91 | device: str = "cpu", 92 | constraints=None, 93 | ) -> None: 94 | """ 95 | Parameters 96 | ---------- 97 | transformations: List[str] 98 | list of transformations to apply to data 99 | task_type: str 100 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 
101 | args: TransformArguments 102 | parameters of the transformation 103 | text_col: str, optional 104 | name of the column containing text to transform (default is 'text') 105 | label_col: str, optional 106 | name of the target column (default is 'label') 107 | seed: int 108 | seed to freeze everything (default is 42) 109 | device: str 110 | device used during transformation (default is 'cpu') 111 | constraints: List[Constraint] 112 | list of transformation constraints (default is None) 113 | """ 114 | super().__init__( 115 | transformation_type="eda", 116 | transformations=transformations, 117 | task_type=task_type, 118 | args=args, 119 | text_col=text_col, 120 | label_col=label_col, 121 | seed=seed, 122 | device=device, 123 | constraints=constraints, 124 | ) 125 | 126 | self.transformers = self.load_transformations() 127 | 128 | @staticmethod 129 | def transform_info() -> Dict[str, Optional[SentenceOperation]]: 130 | """ 131 | Information about the transformations used by the transformer 132 | 133 | Returns 134 | ------- 135 | Dict[str, Optional[SentenceOperation]] 136 | dictionary storing transformation info 137 | """ 138 | info = {"eda": RandomEDA} 139 | 140 | return info 141 | 142 | def _apply_transformation( 143 | self, 144 | transformer: SentenceOperation, 145 | sentence: str, 146 | sentences: Optional[List[str]] = None, 147 | stop_words: Optional[List[Union[str, int, List[Union[str, int]]]]] = None, 148 | prob: Optional[float] = None, 149 | ) -> Tuple[List[str], List[float]]: 150 | """ 151 | Applies the transformations to text until the transformed text passes 152 | the similarity threshold 153 | 154 | Parameters 155 | ---------- 156 | transformer: SentenceOperation 157 | transformer used for transformation 158 | sentence: str 159 | original sentence 160 | sentences: List[str], optional 161 | list of sentences of the text to transform if working 162 | with long texts (default is None) 163 | stop_words: List[Union[str, int]], optional 164 | stop_words for the transformation (default is None) 165 | prob: float, optional 166 | probability of the transformation (default is None) 167 | 168 | Returns 169 | ------- 170 | Tuple[List[str], List[float]] 171 | list of transformed texts and their similarity scores 172 | """ 173 | 174 | if sentences is not None: 175 | transform_sent, sent_scores = self._transform_text( 176 | transformer=transformer, 177 | sentences=sentences, 178 | reference=sentence, 179 | stop_words=stop_words, 180 | prob=prob, 181 | ) 182 | prob = prob / 2 183 | count = 0 184 | while prob > 0.01 and count < 3: 185 | if len(transform_sent) == 1 and sent_scores[0] == 1: 186 | transform_sent, sent_scores = self._transform_text( 187 | transformer=transformer, 188 | sentences=sentences, 189 | reference=sentence, 190 | prob=prob, 191 | ) 192 | count += 1 193 | prob = prob / 2 194 | else: 195 | break 196 | return transform_sent, sent_scores 197 | 198 | transform_sent, sent_scores = self._transform_sentence( 199 | transformer=transformer, sentence=sentence, stop_words=stop_words, prob=prob 200 | ) 201 | prob = prob / 2 202 | count = 0 203 | while prob > 0.01 and count < 3: 204 | if len(transform_sent) == 1 and sent_scores[0] == 1: 205 | transform_sent, sent_scores = self._transform_sentence( 206 | transformer=transformer, 207 | sentence=sentence, 208 | stop_words=stop_words, 209 | prob=prob, 210 | ) 211 | prob = prob / 2 212 | count += 1 213 | else: 214 | break 215 | 216 | return transform_sent, sent_scores 217 | 218 | def transform(self, sentence: Dict[str, Any]) -> 
List[Dict[str, Any]]: 219 | """ 220 | Applies the transformations to input 221 | 222 | Parameters 223 | ---------- 224 | sentence: Dict[str, Any] 225 | dataset object in dict form 226 | 227 | Returns 228 | ------- 229 | Dict[str, Any] 230 | Transformed dataset object 231 | """ 232 | transformed_data = [] 233 | scores = [] 234 | for transform_name, transformer in self.transformers.items(): 235 | stop_words = self._list_stop_words(sentence) 236 | sentences = self.sent_split(sentence[self.text_col]) 237 | if len(sentences) > 1: 238 | transform_sent, sent_scores = self._apply_transformation( 239 | transformer=transformer, 240 | sentence=sentence[self.text_col], 241 | sentences=sentences, 242 | stop_words=stop_words, 243 | prob=self.args.probability, 244 | ) 245 | else: 246 | transform_sent, sent_scores = self._apply_transformation( 247 | transformer=transformer, 248 | sentence=sentence[self.text_col], 249 | stop_words=stop_words, 250 | prob=self.args.probability, 251 | ) 252 | transformed = self._update_data(sentence, transform_sent) 253 | transformed_data.extend(transformed) 254 | scores.extend(sent_scores) 255 | 256 | return transformed_data, scores 257 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/distraction/sentence_additions.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Optional, Union 3 | from spacy.language import Language 4 | from transformers import ( 5 | TextGenerationPipeline, 6 | set_seed, 7 | MT5ForConditionalGeneration, 8 | T5ForConditionalGeneration, 9 | AutoTokenizer, 10 | AutoModelWithLMHead, 11 | MT5Tokenizer, 12 | ) 13 | 14 | from rutransform.utils.args import TransformArguments 15 | from rutransform.transformations.utils import SentenceOperation 16 | 17 | """ 18 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/sentence_additions 19 | """ 20 | 21 | 22 | def clean(text: str) -> str: 23 | """ 24 | Cleans text from unwanted characters created 25 | by the generator model 26 | 27 | Parameters 28 | ---------- 29 | text: str 30 | generated text to clean 31 | 32 | Returns 33 | ------- 34 | str 35 | clean string of text 36 | """ 37 | if len(text) > 1: 38 | text = text.split("===")[0] 39 | text = " ".join(text.split("\n\n")[:2]) 40 | text = text.replace("\\n", "\n") 41 | text = text.replace("", "") 42 | text = text.replace("&", "&") 43 | text = text.replace("lt;", "") 44 | text = text.replace("gt;", "") 45 | text = text.split("< EOS>")[0] 46 | text = text.split("")[0] 47 | text = text.replace("< EOS>", " ") 48 | text = text.replace("", "") 49 | text = text.replace("", "") 50 | text = text.replace("", " ") 51 | text = text.replace("< BOS>", " ") 52 | text = text.replace("", " ") 53 | text = text.replace("< SHORT>", " ") 54 | text = text.replace("", " ") 55 | text = text.replace("", " ") 56 | text = text.replace("< LONG>", " ") 57 | text = text.replace(" ul ", "\n") 58 | text = text.replace(" pre ", " ") 59 | text = text.replace(r" /pre ", " ") 60 | text = text.replace(r" / pre ", " ") 61 | text = text.replace(r"/code", "\n/code\n") 62 | text = text.replace(r"/ code", "\n/code\n") 63 | text = text.replace(" code", "\ncode\n") 64 | text = text.replace(" hr ", " ") 65 | text = text.replace(" e f ", "\n") 66 | text = text.replace("/h1", "\n") 67 | text = text.replace("nbsp;", " ") 68 | text = text.replace("/blockquote", "\n") 69 | text = text.replace(" +", " ") 70 | text = text.replace("‍", 
"") 71 | text = text.replace(".<", ".") 72 | text = text.replace("/", ".") 73 | text = text.replace("tml", "") 74 | text = text.replace("", "") 81 | return text.strip() 82 | 83 | 84 | class SentenceAdditions(SentenceOperation): 85 | """ 86 | Adds generated sentence into provided sentences 87 | or paragraph to create adversarial examples. 88 | 89 | Attributes 90 | ---------- 91 | args: TransformArguments 92 | parameters of the transformation 93 | seed: int 94 | seed to freeze everything (default is 42) 95 | max_outputs: int 96 | maximum number of the transfromed sentences (default is 1) 97 | device: str 98 | the device used during transformation (default is 'cpu') 99 | spacy_model: spacy.language.Language 100 | ! exists for compatability, always ignored ! 101 | spacy model used for tokenization 102 | 103 | Methods 104 | ------- 105 | get_model_path() 106 | Converts model name to model path 107 | generate(sentence, stop_words) 108 | Transforms the sentence 109 | """ 110 | 111 | def __init__( 112 | self, 113 | args: TransformArguments, 114 | seed: int = 42, 115 | max_outputs: int = 1, 116 | device: str = "cpu", 117 | spacy_model: Optional[Language] = None, 118 | ) -> None: 119 | """ 120 | Parameters 121 | ---------- 122 | args: TransformArguments 123 | parameters of the transformation 124 | seed: int 125 | seed to freeze everything (default is 42) 126 | max_outputs: int 127 | maximum number of the transfromed sentences (default is 1) 128 | device: str 129 | the device used during transformation (default is 'cpu') 130 | spacy_model: spacy.language.Language 131 | ! exists for compatability, always ignored ! 132 | spacy model used for tokenization 133 | """ 134 | super().__init__( 135 | args=args, 136 | seed=seed, 137 | max_outputs=max_outputs, 138 | device=device, 139 | spacy_model=spacy_model, 140 | ) 141 | 142 | model_name = self.get_model_path() 143 | if "mt5" in self.args.generator: 144 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 145 | self.model = MT5ForConditionalGeneration.from_pretrained( 146 | model_name, pad_token_id=self.tokenizer.eos_token_id 147 | ).to(self.device) 148 | elif "t5" in self.args.generator: 149 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 150 | self.model = T5ForConditionalGeneration.from_pretrained( 151 | model_name, pad_token_id=self.tokenizer.eos_token_id 152 | ).to(self.device) 153 | else: 154 | tokenizer = ( 155 | MT5Tokenizer.from_pretrained(model_name) 156 | if model_name == "THUMT/mGPT" 157 | else AutoTokenizer.from_pretrained(model_name) 158 | ) 159 | model = AutoModelWithLMHead.from_pretrained( 160 | model_name, pad_token_id=tokenizer.eos_token_id 161 | ) 162 | self.generator = TextGenerationPipeline( 163 | model=model, 164 | tokenizer=tokenizer, 165 | device=(-1 if self.device == "cpu" else 0), 166 | ) 167 | 168 | def get_model_path(self) -> str: 169 | """ 170 | Converts model name to model path 171 | 172 | Returns 173 | ------- 174 | str 175 | path to model in the HuggingFace library 176 | """ 177 | model_dict = { 178 | "gpt2": "sberbank-ai/rugpt2_large", 179 | "gpt3": "sberbank-ai/rugpt3large_based_on_gpt2", 180 | "mt5-base": "google/mt5-base", 181 | "mt5-small": "google/mt5-small", 182 | "mt5-large": "google/mt5-large", 183 | } 184 | return ( 185 | model_dict[self.args.generator] 186 | if self.args.generator in model_dict 187 | else self.args.generator 188 | ) 189 | 190 | def generate( 191 | self, 192 | sentence: str, 193 | stop_words: Optional[List[Union[int, str]]] = None, 194 | prob: Optional[float] = None, 195 | ) -> 
List[str]: 196 | """ 197 | Transforms the sentence 198 | 199 | Parameters 200 | ---------- 201 | sentence: str 202 | sentence to transform 203 | stop_words: List[Union[int, str]], optional 204 | ! exists for compatability, always ignored ! 205 | stop_words to ignore during transformation (default is None) 206 | prob: float, optional 207 | ! exists for compatability, always ignored ! 208 | probability of the transformation (default is None) 209 | 210 | Returns 211 | ------- 212 | list 213 | list of transformed sentences 214 | """ 215 | if self.max_outputs == 1: 216 | set_seed(self.seed) 217 | 218 | if self.args.prompt: 219 | sentence = sentence + self.args.prompt_text 220 | 221 | transformed = [] 222 | for _ in range(self.max_outputs): 223 | if "t5" in self.args.generator: 224 | encoding = self.tokenizer.encode_plus( 225 | sentence, pad_to_max_length=True, return_tensors="pt" 226 | ) 227 | input_ids, attention_masks = ( 228 | encoding["input_ids"].to(self.device), 229 | encoding["attention_mask"].to(self.device), 230 | ) 231 | 232 | beam_outputs = self.model.generate( 233 | input_ids=input_ids, 234 | attention_mask=attention_masks, 235 | do_sample=self.args.do_sample, 236 | max_length=self.args.max_length, 237 | temperature=self.args.temperature, 238 | top_k=self.args.top_k, 239 | top_p=self.args.top_p, 240 | early_stopping=self.args.early_stopping, 241 | num_return_sequences=1, 242 | repetition_penalty=self.args.repetition_penalty, 243 | ) 244 | 245 | for output in beam_outputs: 246 | sent = self.tokenizer.decode( 247 | output, 248 | skip_special_tokens=True, 249 | clean_up_tokenization_spaces=True, 250 | ) 251 | transformed.append(sent) 252 | else: 253 | outputs = self.generator( 254 | sentence, 255 | max_length=self.args.max_length, 256 | skip_special_tokens=True, 257 | num_return_sequences=1, 258 | num_beams=self.args.num_beams, 259 | early_stopping=self.args.early_stopping, 260 | no_repeat_ngram_size=self.args.no_repeat_ngram_size, 261 | top_k=self.args.top_k, 262 | top_p=self.args.top_p, 263 | temperature=self.args.temperature, 264 | do_sample=self.args.do_sample, 265 | repetition_penalty=self.args.repetition_penalty, 266 | ) 267 | 268 | for sents_with_additions in outputs: 269 | for key, value in sents_with_additions.items(): 270 | transformed.append(clean(value)) 271 | return transformed 272 | -------------------------------------------------------------------------------- /rutransform/transformations/transformations/eda/eda.py: -------------------------------------------------------------------------------- 1 | import random 2 | import spacy 3 | from random import shuffle 4 | from typing import List, Optional, Union 5 | from nltk.corpus import stopwords 6 | from string import punctuation 7 | 8 | from spacy.language import Language 9 | 10 | from rutransform.utils.args import TransformArguments 11 | from rutransform.transformations.utils import SentenceOperation 12 | 13 | STOPWORDS = stopwords.words("russian") 14 | 15 | """ 16 | Adapted from https://github.com/jasonwei20/eda_nlp 17 | """ 18 | 19 | 20 | def tokenize(text: str, spacy_model: Language) -> str: 21 | """ 22 | Tokenizes text 23 | 24 | Parameters 25 | ---------- 26 | text: str 27 | text to tokenize 28 | spacy_model: spacy.language.Language 29 | spacy model used for tokenization 30 | 31 | Returns 32 | ------- 33 | str 34 | tokenized text 35 | """ 36 | return " ".join([token.text for token in spacy_model(text)]) 37 | 38 | 39 | def random_deletion( 40 | words: List[str], p: float, seed: int, stop_words: Optional[List[int]] = 
None 41 | ) -> List[str]: 42 | """ 43 | Randomly deletes words from the sentence with probability p 44 | 45 | Parameters 46 | ---------- 47 | words: List[str] 48 | list of tokens in the sentence 49 | p: float 50 | probability of the deletion 51 | seed: int 52 | seed to freeze everything 53 | stop_words: List[int], optional 54 | stop_words to ignore during deletion (default is None) 55 | 56 | Returns 57 | ------- 58 | List[str] 59 | transformed sentence in tokens 60 | """ 61 | random.seed(seed) 62 | if stop_words is None: 63 | stop_words = [] 64 | 65 | # if there's only one word, don't delete it 66 | if len(words) <= 1: 67 | return words 68 | 69 | # randomly delete words with probability p 70 | new_words = [] 71 | for idx, word in enumerate(words): 72 | if idx in stop_words: 73 | new_words.append(word) 74 | continue 75 | r = random.uniform(0, 1) 76 | if r > p: 77 | new_words.append(word) 78 | 79 | # if you end up deleting all words, just return a random word 80 | if len(new_words) == 0: 81 | rand_int = random.randint(0, len(words) - 1) 82 | return [words[rand_int]] 83 | 84 | if new_words == words: 85 | stopwords = [ 86 | i 87 | for (i, word) in enumerate(words) 88 | if (word in STOPWORDS and i not in stop_words) 89 | ] 90 | if len(stopwords) > 0: 91 | random_idx = random.choice(stopwords) 92 | new_words.pop(random_idx) 93 | 94 | return new_words 95 | 96 | 97 | def random_swap( 98 | words: List[str], n: int, seed: int, stop_words: Optional[List[int]] = None 99 | ) -> List[str]: 100 | """ 101 | Randomly swaps two words in the sentence n times 102 | 103 | Parameters 104 | ---------- 105 | words: List[str] 106 | list of tokens in the sentence 107 | n: int 108 | number of swaps 109 | seed: int 110 | seed to freeze everything 111 | stop_words: List[int], optional 112 | stop_words to ignore during swaps (default is None) 113 | 114 | Returns 115 | ------- 116 | List[str] 117 | transformed sentence in tokens 118 | """ 119 | new_words = words.copy() 120 | for _ in range(n): 121 | new_words = swap_word(new_words, seed, stop_words=stop_words) 122 | return new_words 123 | 124 | 125 | def swap_word( 126 | words: List[str], seed: int, stop_words: Optional[List[int]] = None 127 | ) -> List[str]: 128 | """ 129 | Randomly swap two words in the sentence 130 | 131 | Parameters 132 | ---------- 133 | words: List[str] 134 | list of tokens in the sentence 135 | seed: int 136 | seed to freeze everything 137 | stop_words: List[int], optional 138 | stop_words to ignore during swaps (default is None) 139 | 140 | Returns 141 | ------- 142 | List[str] 143 | transformed sentence in tokens 144 | """ 145 | if stop_words is None: 146 | stop_words = [] 147 | 148 | new_words = words.copy() 149 | random.seed(seed) 150 | allowed_ids = [i for (i, word) in enumerate(words) if i not in stop_words] 151 | if len(allowed_ids) >= 2: 152 | random_idx_1 = random.choice(allowed_ids) # test 153 | else: 154 | return new_words 155 | 156 | random_idx_2 = random_idx_1 157 | counter = 0 158 | while random_idx_2 == random_idx_1: 159 | random_idx_2 = random.choice(allowed_ids) 160 | counter += 1 161 | if counter > 3: 162 | return new_words 163 | 164 | new_words[random_idx_1], new_words[random_idx_2] = ( 165 | new_words[random_idx_2], 166 | new_words[random_idx_1], 167 | ) 168 | 169 | # if we did not swap any of the words swap any articles, pronouns, etc. 
170 | if new_words == words: 171 | stopwords = [ 172 | i 173 | for (i, word) in enumerate(new_words) 174 | if (word in STOPWORDS and i not in stop_words) 175 | ] 176 | if len(stopwords) > 1: 177 | random_idx_1, random_idx_2 = random.sample(stopwords, k=2) 178 | new_words[random_idx_1], new_words[random_idx_2] = ( 179 | new_words[random_idx_2], 180 | new_words[random_idx_1], 181 | ) 182 | return new_words 183 | 184 | 185 | def eda( 186 | sentence: str, 187 | spacy_model: Language, 188 | alpha_rs: float = 0.1, 189 | p_rd: float = 0.1, 190 | num_aug: int = 1, 191 | seed: int = 42, 192 | stop_words: Optional[List[int]] = None, 193 | ) -> List[str]: 194 | """ 195 | Applies Easy Data Augmentations (random deletion and random swaps) to text 196 | 197 | Parameters 198 | ---------- 199 | sentence: str 200 | text to transform 201 | spacy_model: spacy.language.Language 202 | spacy model used for tokenization 203 | alpha_rs: float 204 | probability of word swap (default is 0.1) 205 | p_rd: float 206 | probability of word deletion (default is 0.1) 207 | num_aug: int 208 | maximum number of the transformed sentences (default is 1) 209 | seed: int 210 | seed to freeze everything (default is 42) 211 | stop_words: List[int], optional 212 | stop_words to ignore during swaps (default is None) 213 | 214 | Returns 215 | ------- 216 | List[str] 217 | list of transformed sentences 218 | """ 219 | random.seed(seed) 220 | sentence = tokenize(sentence, spacy_model) 221 | words = sentence.split() 222 | words = [word for word in words if word is not ""] 223 | num_words = len(words) 224 | augmented_sentences = [] 225 | num_new_per_technique = int(num_aug / 4) + 1 226 | n_rs = max(1, int(alpha_rs * num_words)) 227 | 228 | # random swap 229 | for _ in range(num_new_per_technique): 230 | a_words = random_swap(words, n_rs, seed, stop_words=stop_words) 231 | augmented_sentences.append(" ".join(a_words)) 232 | 233 | # random deletion 234 | for _ in range(num_new_per_technique): 235 | a_words = random_deletion(words, p_rd, seed, stop_words=stop_words) 236 | augmented_sentences.append(" ".join(a_words)) 237 | 238 | shuffle(augmented_sentences) 239 | 240 | # trim to the the desired number of augmented sentences 241 | augmented_sentences = [s for s in augmented_sentences if s != sentence][:num_aug] 242 | 243 | return augmented_sentences 244 | 245 | 246 | class RandomEDA(SentenceOperation): 247 | """ 248 | Augment data using Easy Data Augmentation techniques 249 | (random deletion and random word swaps) 250 | 251 | Attributes 252 | ---------- 253 | args: TransformArguments 254 | parameters of the transformation 255 | seed: int 256 | seed to freeze everything (default is 42) 257 | max_outputs: int 258 | maximum number of the transfromed sentences (default is 1) 259 | device: str 260 | the device used during transformation (default is 'cpu') 261 | spacy_model: spacy.language.Language 262 | spacy model used for tokenization 263 | 264 | Methods 265 | ------- 266 | generate(sentence, stop_words, prob) 267 | Transforms the sentence 268 | """ 269 | 270 | def __init__( 271 | self, 272 | args: TransformArguments, 273 | seed: int = 42, 274 | max_outputs: int = 1, 275 | device: str = "cpu", 276 | spacy_model: Optional[Language] = None, 277 | ) -> None: 278 | """ 279 | Parameters 280 | ---------- 281 | args: TransformArguments 282 | parameters of the transformation 283 | seed: int 284 | seed to freeze everything (default is 42) 285 | max_outputs: int 286 | maximum number of the transfromed sentences (default is 1) 287 | device: str 288 | 
the device used during transformation (default is 'cpu') 289 | spacy_model: spacy.language.Language 290 | spacy model used for tokenization 291 | """ 292 | if spacy_model is None: 293 | spacy_model = spacy.load("ru_core_news_sm") 294 | 295 | super().__init__( 296 | args=args, 297 | seed=seed, 298 | max_outputs=max_outputs, 299 | device=device, 300 | spacy_model=spacy_model, 301 | ) 302 | 303 | def generate( 304 | self, 305 | sentence: str, 306 | stop_words: Optional[List[Union[int, str]]] = None, 307 | prob: Optional[float] = None, 308 | ) -> List[str]: 309 | """ 310 | Transforms the sentence 311 | 312 | If 'prob' argument is not None, ignores the probabilityprovided in the arguments. 313 | 314 | Parameters 315 | ---------- 316 | sentence: str 317 | sentence to transform 318 | stop_words: List[int], optional 319 | stop_words to ignore during transformation (default is None) 320 | prob: float, optional 321 | probability of the transformation (default is None) 322 | 323 | Returns 324 | ------- 325 | list 326 | list of transformed sentences 327 | """ 328 | if not prob: 329 | alpha_rs = self.args.probability 330 | p_rd = self.args.probability if self.args.same_prob else self.args.del_prob 331 | else: 332 | alpha_rs = prob 333 | p_rd = prob if self.args.same_prob else self.args.del_prob / 2 334 | 335 | transformed = eda( 336 | sentence=sentence, 337 | alpha_rs=alpha_rs, 338 | p_rd=p_rd, 339 | num_aug=self.max_outputs, 340 | seed=self.seed, 341 | spacy_model=self.spacy_model, 342 | stop_words=stop_words, 343 | ) 344 | 345 | return transformed 346 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /rutransform/transformations/transformers/paraphrasis.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from typing import Any, Dict, List, Optional 4 | 5 | from razdel import sentenize 6 | 7 | from rutransform.transformations.transformations.paraphrasis import * 8 | from rutransform.transformations import Transformer 9 | from rutransform.constraints import Constraint 10 | from rutransform.utils.args import TransformArguments 11 | from rutransform.transformations.utils import SentenceOperation 12 | 13 | 14 | class ParaphrasisTransformer(Transformer): 15 | """ 16 | Paraphrasis transformations 17 | 18 | Generates diverse linguistic variations of the contexts 19 | through paraphrasis, back-translation and style-trasfer 20 | 21 | Utilizes constraints provided by the user to extract stopwords 22 | specific for the task, to which the transformations do not apply. 23 | Uses similarity metric (BERTScore) to filter the sentences, 24 | similarity score of which are less than a threshold (specified in 25 | TransformArguments). 26 | 27 | Attributes 28 | ---------- 29 | transformation_type: str 30 | type of the transformations supported by the transformer 31 | transformations: List[str] 32 | list of transformations to apply to data 33 | task_type: str 34 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 35 | args: TransformArguments 36 | parameters of the transformation 37 | text_col: str, optional 38 | name of the column containing text to transform (default is 'text') 39 | label_col: str, optional 40 | name of the target column (default is 'label') 41 | seed: int 42 | seed to freeze everything (default is 42) 43 | device: str 44 | device used during transformation (default is 'cpu') 45 | constraints: List[Constraint], optional 46 | list of transformation constraints (default is None) 47 | spacy_model: spacy.language.Language 48 | spacy model used for tokenization (default is 'ru_core_news_sm') 49 | bert_scorer: BERTScorer 50 | similarity metric class used to filter transformed texts (default is None) 51 | 52 | Methods 53 | ------- 54 | @staticmethod 55 | transform_info: Dict[str, SentenceOperation] 56 | dictionary mapping transformations and SentenceOperation classes 57 | load_transformations() 58 | Loads all the transformations required 59 | @abstractmethod 60 | transform(sentence) 61 | Applies the transformations to input 62 | sent_split(text) 63 | Splits text into sentences 64 | @staticmethod 65 | get_ids(matches) 66 | Returns ids of stopwords 67 | _transform_text(transformer, sentences, reference, stop_words, prob) 68 | Applies the transformations to long text and filters the transformed texts 69 | _transform_sentence(self, transformer, sentence, stop_words, prob) 70 | Applies the transformations to sentence and filters the transformed sentences 71 | _list_stop_words(sentence, return_ids) 72 | Extracts stopwords matching the constraints 73 | _drop_duplicates(reference, candidates, scores) 74 | Returns transformed sentences without duplicates 75 | _filter_candidates(candidates, reference, context) 76 | Filters out sentences based on the similarity score 77 | _sample_to_max_outputs(sentences, scores) 78 | Returns the desired number of the transformed sentences 79 | _update_data(org_sentence, transformed) 80 | Updates the dataset object 81 | """ 82 | 83 | def __init__( 84 | self, 85 | transformations: List[str], 86 | task_type: str, 87 | 
args: TransformArguments, 88 | text_col: Optional[str] = "text", 89 | label_col: Optional[str] = "label", 90 | seed: int = 42, 91 | device: str = "cpu", 92 | constraints: Optional[List[Constraint]] = None, 93 | ) -> None: 94 | """ 95 | Parameters 96 | ---------- 97 | transformations: List[str] 98 | list of transformations to apply to data 99 | task_type: str 100 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 101 | args: TransformArguments 102 | parameters of the transformation 103 | text_col: str, optional 104 | name of the column containing text to transform (default is 'text') 105 | label_col: str, optional 106 | name of the target column (default is 'label') 107 | seed: int 108 | seed to freeze everything (default is 42) 109 | device: str 110 | device used during transformation (default is 'cpu') 111 | constraints: List[Constraint], optional 112 | list of transformation constraints (default is None) 113 | """ 114 | super().__init__( 115 | transformation_type="paraphrasis", 116 | transformations=transformations, 117 | task_type=task_type, 118 | args=args, 119 | text_col=text_col, 120 | label_col=label_col, 121 | seed=seed, 122 | device=device, 123 | constraints=constraints, 124 | ) 125 | 126 | self.transformers = self.load_transformations() 127 | 128 | @staticmethod 129 | def transform_info() -> Dict[str, Optional[SentenceOperation]]: 130 | """ 131 | Information about the transformations used by the transformer 132 | 133 | Returns 134 | ------- 135 | Dict[str, Optional[SentenceOperation]] 136 | dictionary storing transformation info 137 | """ 138 | info = { 139 | "paraphraser": Paraphraser, 140 | "style_transfer": None, 141 | "back_translation": BackTranslationNER, 142 | } 143 | return info 144 | 145 | def _transform_text( 146 | self, 147 | transformer: SentenceOperation, 148 | sentences: List[str], 149 | reference: str, 150 | stop_words: Optional[List[Union[str, int, List[Union[str, int]]]]] = None, 151 | prob: Optional[float] = None, 152 | ) -> Tuple[List[str], List[float]]: 153 | """ 154 | Applies the transformations to long text and filters the transformed texts 155 | 156 | Parameters 157 | ---------- 158 | transformer: SentenceOperation 159 | transformer used for transformation 160 | sentences: List[str] 161 | sentences of the text to transform 162 | reference: str 163 | original sentence to use as a reference for similarity score 164 | stop_words: List[Union[str, int]], optional 165 | stop_words for the transformation (default is None) 166 | prob: float, optional 167 | probability of the transformation (default is None) 168 | used when no transormed sentence passes the similarity 169 | score threshold 170 | 171 | Returns 172 | ------- 173 | Tuple[List[str], List[float]] 174 | list of transformed texts and their similarity scores 175 | """ 176 | transform_text = [] 177 | text_scores = [] 178 | for s, sent in enumerate(sentences): 179 | 180 | if stop_words is not None: 181 | stops = stop_words[s] 182 | else: 183 | stops = None 184 | 185 | if s == 0 and self.task_type == "multichoice": 186 | question_text = [] 187 | sents = [sentence.text for sentence in sentenize(sent)] 188 | 189 | for i, sentence in enumerate(sents): 190 | if random.uniform(0, 1) >= self.args.probability: 191 | question_text.append([sentence] * self.args.max_outputs) 192 | else: 193 | transform_sent = transformer.generate( 194 | sentence=sentence, stop_words=stops, prob=prob 195 | ) 196 | question_text.append( 197 | transform_sent 198 | ) # else random.choice(sentence) 199 | 200 | transformed = 
[] 201 | for m in range(self.args.max_outputs): 202 | transf_sentence = [] 203 | for tr_sent in question_text: 204 | if len(tr_sent) >= m: 205 | transf_sentence.append(tr_sent[m]) 206 | else: 207 | transf_sentence.append(random.choice(tr_sent)) 208 | transformed.append(". ".join(transf_sentence)) 209 | 210 | transform_sent = transformed 211 | transform_sent = self._drop_duplicates(sent, transform_sent) 212 | transform_sent, sent_scores = self._filter_candidates( 213 | candidates=transform_sent, reference=sent 214 | ) 215 | transform_text.append(transform_sent) 216 | text_scores.append(sent_scores) 217 | 218 | elif random.uniform(0, 1) >= self.args.probability: 219 | transform_text.append([sent]) 220 | text_scores.append([1.0]) 221 | 222 | else: 223 | transform_sent = transformer.generate( 224 | sentence=sent, stop_words=stops, prob=prob 225 | ) 226 | transform_sent = self._drop_duplicates(sent, transform_sent) 227 | transform_sent, sent_scores = self._filter_candidates( 228 | candidates=transform_sent, reference=sent 229 | ) 230 | transform_text.append(transform_sent) 231 | text_scores.append(sent_scores) 232 | 233 | transform_sent, scores = self._sample_to_max_outputs( 234 | transform_text, text_scores 235 | ) 236 | transform_sent, scores = self._filter_candidates( 237 | candidates=transform_sent, reference=reference 238 | ) 239 | transform_sent, scores = self._drop_duplicates( 240 | reference, transform_sent, scores 241 | ) 242 | return transform_sent, scores 243 | 244 | def transform(self, sentence: Dict[str, Any]) -> List[Dict[str, Any]]: 245 | """ 246 | Applies the transformations to input 247 | 248 | Parameters 249 | ---------- 250 | sentence: Dict[str, Any] 251 | dataset object in dict form 252 | 253 | Returns 254 | ------- 255 | Dict[str, Any] 256 | Transformed dataset object 257 | """ 258 | transformed_data = [] 259 | scores = [] 260 | for transform_name, transformer in self.transformers.items(): 261 | stop_words = self._list_stop_words(sentence) 262 | sentences = self.sent_split(sentence[self.text_col]) 263 | if len(sentences) > 1: 264 | transform_sent, sent_scores = self._transform_text( 265 | transformer, 266 | sentences, 267 | reference=sentence[self.text_col], 268 | stop_words=stop_words, 269 | ) 270 | if len(transform_sent) == 1 and sent_scores[0] == 1: 271 | transform_sent, sent_scores = self._transform_text( 272 | transformer, 273 | sentences, 274 | reference=sentence[self.text_col], 275 | stop_words=stop_words, 276 | prob=self.args.probability, 277 | ) 278 | else: 279 | transform_sent, sent_scores = self._transform_sentence( 280 | transformer, sentence[self.text_col], stop_words=stop_words 281 | ) 282 | transformed = self._update_data(sentence, transform_sent) 283 | transformed_data.extend(transformed) 284 | scores.extend(sent_scores) 285 | 286 | return transformed_data, scores 287 | -------------------------------------------------------------------------------- /rutransform/transformations/transformers/distraction.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | from typing import Any, Dict, List, Optional, Tuple 4 | from razdel import sentenize 5 | 6 | from rutransform.transformations.transformations.distraction import * 7 | from rutransform.transformations import Transformer 8 | from rutransform.constraints import Constraint 9 | from rutransform.utils.args import TransformArguments 10 | from rutransform.transformations.utils import SentenceOperation 11 | 12 | 13 | class 
AdditionTransformer(Transformer): 14 | """ 15 | Addition transformations 16 | 17 | Generates additional sentence or words at the end of the sentence 18 | 19 | Utilizes constraints provided by the user to extract stopwords 20 | specific for the task, to which the transformations do not apply. 21 | Uses similarity metric (BERTScore) to filter the sentences, 22 | similarity score of which are less than a threshold (specified in 23 | TransformArguments). 24 | 25 | Attributes 26 | ---------- 27 | transformation_type: str 28 | type of the transformations supported by the transformer 29 | transformations: List[str] 30 | list of transformations to apply to data 31 | task_type: str 32 | type of the task (e.g. 'classification', 'multichoice_qa', etc.) 33 | args: TransformArguments 34 | parameters of the transformation 35 | text_col: str, optional 36 | name of the column containing text to transform (default is 'text') 37 | label_col: str, optional 38 | name of the target column (default is 'label') 39 | seed: int 40 | seed to freeze everything (default is 42) 41 | device: str 42 | device used during transformation (default is 'cpu') 43 | constraints: List[Constraint], optional 44 | list of transformation constraints (default is None) 45 | spacy_model: spacy.language.Language 46 | spacy model used for tokenization (default is 'ru_core_news_sm') 47 | transform_info: 48 | dictionary mapping transformations and SentenceOperation classes 49 | provided in utils.constants 50 | bert_scorer: BERTScorer 51 | similarity metric class used to filter transformed texts (default is None) 52 | 53 | Methods 54 | ------- 55 | @staticmethod 56 | transform_info: Dict[str, SentenceOperation] 57 | dictionary mapping transformations and SentenceOperation classes 58 | load_transformations() 59 | Loads all the transformations required 60 | @abstractmethod 61 | transform(sentence) 62 | Applies the transformations to input 63 | sent_split(text) 64 | Splits text into sentences 65 | @staticmethod 66 | get_ids(matches) 67 | Returns ids of stopwords 68 | _transform_text(transformer, sentences, reference, stop_words, prob) 69 | Applies the transformations to long text and filters the transformed texts 70 | _transform_multichoice(transformer, sentence, add_split) 71 | Generates new answer options for multichoice questions and 72 | filters the transformed texts 73 | _transform_sentence(self, transformer, sentence, stop_words, prob) 74 | Applies the transformations to sentence and filters the transformed sentences 75 | _list_stop_words(sentence, return_ids) 76 | Extracts stopwords matching the constraints 77 | _drop_duplicates(reference, candidates, scores) 78 | Returns transformed sentences without duplicates 79 | _filter_candidates(candidates, reference, context) 80 | Filters out sentences based on the similarity score 81 | _sample_to_max_outputs(sentences, scores) 82 | Returns the desired number of the transformed sentences 83 | _update_data(org_sentence, transformed) 84 | Updates the dataset object 85 | """ 86 | 87 | def __init__( 88 | self, 89 | transformations: List[str], 90 | task_type: str, 91 | args: TransformArguments, 92 | text_col: Optional[str] = "text", 93 | label_col: Optional[str] = "label", 94 | seed: int = 42, 95 | device: str = "cpu", 96 | constraints: Optional[List[Constraint]] = None, 97 | ) -> None: 98 | """ 99 | Parameters 100 | ---------- 101 | transformations: List[str] 102 | list of transformations to apply to data 103 | task_type: str 104 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
105 | args: TransformArguments 106 | parameters of the transformation 107 | text_col: str, optional 108 | name of the column containing text to transform (default is 'text') 109 | label_col: str, optional 110 | name of the target column (default is 'label') 111 | seed: int 112 | seed to freeze everything (default is 42) 113 | device: str 114 | device used during transformation (default is 'cpu') 115 | constraints: List[Constraint], optional 116 | list of transformation constraints (default is None) 117 | """ 118 | super().__init__( 119 | transformation_type="addition", 120 | transformations=transformations, 121 | task_type=task_type, 122 | args=args, 123 | text_col=text_col, 124 | label_col=label_col, 125 | seed=seed, 126 | device=device, 127 | constraints=constraints, 128 | ) 129 | 130 | self.transformers = self.load_transformations() 131 | 132 | @staticmethod 133 | def transform_info() -> Dict[str, Optional[SentenceOperation]]: 134 | """ 135 | Information about the transformations used by the transformer 136 | 137 | Returns 138 | ------- 139 | Dict[str, Optional[SentenceOperation]] 140 | dictionary storing transformation info 141 | """ 142 | info = { 143 | "addsent": SentenceAdditions, 144 | } 145 | 146 | return info 147 | 148 | def _transform_multichoice( 149 | self, 150 | transformer: SentenceOperation, 151 | sentence: Dict[str, Any], 152 | add_split: bool = False, 153 | ) -> Tuple[List[str], List[float]]: 154 | """ 155 | Generates new answer options for multichoice questions and 156 | filters the transformed texts 157 | 158 | Parameters 159 | ---------- 160 | transformer: SentenceOperation 161 | transformer used for transformation 162 | sentences: dict 163 | dataset object in dict form 164 | add_split: bool 165 | whether to do additional splitting of the 166 | generated data (default is False) 167 | used to trim the generated text to create 168 | shorter sequences 169 | 170 | Returns 171 | ------- 172 | Tuple[List[str], List[float]] 173 | list of transformed texts and their similarity scores 174 | """ 175 | # split text into context and answer options 176 | sentences = self.sent_split(sentence[self.text_col]) 177 | context = sentences.pop(0) 178 | # get answer index 179 | keys = ["A", "B", "C", "D"] 180 | answer = sentence[self.label_col] 181 | answer = keys.index(answer) if type(answer) == str else answer 182 | 183 | transform_sent = [[context]] 184 | imediate_context = [sentence.text for sentence in sentenize(context)][-1] 185 | 186 | # generate new answers 187 | change_answ = random.choice(range(len(sentences))) 188 | while change_answ == answer: 189 | change_answ = random.choice(range(len(sentences))) 190 | 191 | for s_id, sent in enumerate(sentences): 192 | if s_id == change_answ: 193 | transformed = set(transformer.generate(imediate_context)) 194 | if add_split == "sent": 195 | transf_sent = [] 196 | for s in transformed: 197 | s = s.replace(imediate_context, "").replace("\n", " ") 198 | split_sent = re.split(r"[\.\?!]", s)[0] 199 | if len(split_sent) > 1 and len(split_sent[0]) > 1: 200 | transf_sent.append(split_sent.strip()) 201 | else: 202 | transf_sent.append(" ".join(s.split()[:5]).strip()) 203 | else: 204 | transf_sent = [ 205 | re.split( 206 | r"[\.\?!]", 207 | s.replace(imediate_context, "").replace("\n", " "), 208 | )[0].strip() 209 | for s in transformed 210 | ] 211 | transf_sent = self._drop_duplicates(sent, transf_sent) 212 | transform_sent.append(transf_sent) 213 | else: 214 | transform_sent.append([sent]) 215 | transform_sent = 
self._sample_to_max_outputs(transform_sent) 216 | transform_sent, sent_scores = self._filter_candidates( 217 | transform_sent, sentence[self.text_col] 218 | ) 219 | 220 | return transform_sent, sent_scores 221 | 222 | def transform_sentence( 223 | self, 224 | transformer: SentenceOperation, 225 | sentence: str, 226 | reference: str, 227 | context: Optional[str] = None, 228 | add_split: Optional[str] = None, 229 | ) -> Tuple[List[str], List[float]]: 230 | """ 231 | Applies the transformations to sentence and filters the transformed sentences 232 | 233 | Parameters 234 | ---------- 235 | transformer: SentenceOperation 236 | transformer used for transformation 237 | sentence: str 238 | sentence to transform 239 | reference: 240 | original sentence 241 | context: str, optional 242 | full context (default is None) 243 | add_split: str, optional 244 | type of additional splitting to do (default is None) 245 | - if 'word' returns first 5 words of the generated text 246 | - if 'sent' returns the first generated sentence (sequence to '.') 247 | 248 | Returns 249 | ------- 250 | Tuple[List[str], List[float]] 251 | list of transformed sentences and their similarity scores 252 | """ 253 | transform_sent = transformer.generate(sentence) 254 | 255 | if add_split == "sent": 256 | transformed = [] 257 | for s in transform_sent: 258 | split_sent = re.split( 259 | r"[\.\?!]", s.replace(sentence, "").replace("\n", " ") 260 | ) 261 | if len(split_sent[0]) < 10 and len(split_sent) > 1: 262 | transformed.append((sentence + " " + split_sent[1]).strip()) 263 | else: 264 | transformed.append((sentence + " " + split_sent[0]).strip()) 265 | transform_sent = transformed 266 | elif add_split == "word5": 267 | transform_sent = [ 268 | sentence 269 | + " " 270 | + " ".join( 271 | s.replace(sentence, "").replace("\n", " ").split()[:5] 272 | ).strip() 273 | for s in transform_sent 274 | ] 275 | elif add_split == "word3": 276 | transform_sent = [ 277 | sentence 278 | + " " 279 | + " ".join( 280 | s.replace(sentence, "").replace("\n", " ").split()[:3] 281 | ).strip() 282 | for s in transform_sent 283 | ] 284 | else: 285 | transform_sent = [ 286 | sentence 287 | + " " 288 | + " ".join(s.replace(sentence, "").replace("\n", " ")[:10]).strip() 289 | for s in transform_sent 290 | ] 291 | 292 | transform_sent = self._drop_duplicates(sentence, transform_sent) 293 | transform_sent, sent_scores = self._filter_candidates( 294 | transform_sent, reference, context=context 295 | ) 296 | transform_sent, sent_scores = self._sample_to_max_outputs( 297 | transform_sent, sent_scores 298 | ) 299 | 300 | return transform_sent, sent_scores 301 | 302 | def transform(self, sentence: Dict[str, Any]) -> List[Dict[str, Any]]: 303 | """ 304 | Applies the transformations to input 305 | 306 | Parameters 307 | ---------- 308 | sentence: Dict[str, Any] 309 | dataset object in dict form 310 | 311 | Returns 312 | ------- 313 | Dict[str, Any] 314 | Transformed dataset object 315 | """ 316 | transformed_data = [] 317 | scores = [] 318 | for transform_name, transformer in self.transformers.items(): 319 | if self.task_type == "multichoice_qa": 320 | transform_sent, sent_scores = self._transform_multichoice( 321 | transformer, sentence, add_split="sent" 322 | ) 323 | 324 | if len(transform_sent) == 1 and sent_scores[0] == 1: 325 | transform_sent, sent_scores = self._transform_multichoice( 326 | transformer, sentence, add_split="word" 327 | ) 328 | 329 | else: 330 | split_text = self.sent_split(sentence[self.text_col]) 331 | if len(split_text) > 1: 332 | 
context, text = map( 333 | lambda x: " ".join(x), (split_text[:-2], split_text[-2:]) 334 | ) 335 | else: 336 | text = sentence[self.text_col] 337 | context = None 338 | transform_sent, sent_scores = self.transform_sentence( 339 | transformer, 340 | text, 341 | sentence[self.text_col], 342 | context, 343 | add_split="sent", 344 | ) 345 | for split_type in ["word5", "word3", "char"]: 346 | if len(transform_sent) == 1 and sent_scores[0] == 1: 347 | transform_sent, sent_scores = self.transform_sentence( 348 | transformer, 349 | text, 350 | sentence[self.text_col], 351 | context, 352 | add_split=split_type, 353 | ) 354 | else: 355 | break 356 | 357 | transformed = self._update_data(sentence, transform_sent) 358 | transformed_data.extend(transformed) 359 | scores.extend(sent_scores) 360 | 361 | return transformed_data, scores 362 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RuTransform 2 | 3 | RuTransform is a Python framework for adversarial attacks and text data augmentation for Russian. 4 | 5 | #### Table of contents 6 | 7 | - [Setup & Usage](https://github.com/RussianNLP/rutransform/#setup--usage) 8 | - [Examples](https://github.com/RussianNLP/rutransform/#examples) 9 | - Dataset Transformation on sample data 10 | - Dataset Transformation on your own data 11 | - Custom Constraints 12 | - Sentence Transformations 13 | - Custom Transformations 14 | - [Framework Design](https://github.com/RussianNLP/rutransform/#framework-design) 15 | - [Cite us](https://github.com/RussianNLP/rutransform/#cite-us) 16 | 17 | ## Setup & Usage 18 | 19 | #### Requirements 20 | 21 | Python >= 3.7 22 | 23 | ### Installation 24 | 25 | ``` 26 | git clone https://github.com/RussianNLP/rutransform 27 | cd rutransform 28 | pip install . 29 | ``` 30 | 31 | ### Examples 32 | 33 |
34 | Dataset Transformation (sample data) 35 | 36 | For the full list of supported transformations with examples see [supported transformations](https://github.com/RussianNLP/rutransform/#supported-transformations). 37 | 38 | ``` 39 | import pandas as pd 40 | from rutransform.transformations import DatasetTransformer 41 | from rutransform.utils.args import TransformArguments 42 | 43 | 44 | # load data 45 | dataset = pd.read_json('test_data/worldtree.json', lines=True) 46 | 47 | # load arguments 48 | transformation = "butter_fingers" 49 | probability = 0.3 50 | args = TransformArguments(transformation=transformation, probability=probability) 51 | 52 | # init dataset transformer 53 | tr = DatasetTransformer( 54 | dataset=dataset, 55 | text_col='question', 56 | task_type='multichoice_qa', 57 | args=args, 58 | return_type='pd' # format of the resulting dataset (default is 'hf') 59 | ) 60 | 61 | # run transformation 62 | output = tr.transform() 63 | ``` 64 | `DatasetTransformer` outputs a named tuple with dataset similarity scores: 65 | ``` 66 | score = output.score # mean dataset similarity score: 0.9146944761276246 67 | scores = output.scores # similarity scores for each sentence: array([0.93971652, 0.94295949, 0.8272841 , 0.98828816, 0.87522411]) 68 | std = output.std # std of the similarity scores: 0.05663837594035781 69 | ``` 70 | ...and the transformed dataset: 71 | ``` 72 | transformed_data = output.transformed_dataset 73 | 74 | print('Original data:\n', dataset['question'][0]) 75 | print('Transformed data:\n', transformed_data['question'][0]) 76 | ``` 77 | ``` 78 | Original data: 79 | Когда мороженое не кладут в морозильную камеру, мороженое превращается из ___. (A) твердого тела в газ (B) газа в жидкость (C) твердого тела в жидкость (D) жидкости в газ 80 | Transformed data: 81 | Когда мороженое не кладут в морозильную камеру, мороженое превращается из ___. (A) твердого тела в газ (B) газа в жидвость (C) мвердого тела в дидкость (D) жидкости в гкз 82 | ``` 83 | 84 |
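Since `return_type='pd'` makes `output.transformed_dataset` a regular `pandas.DataFrame`, the perturbed split can be saved with standard pandas I/O. A minimal sketch continuing the snippet above (the output path is only an example):

```
# save the perturbed split in the same JSON-lines format as the input data
transformed_data.to_json(
    'test_data/worldtree_butter_fingers.json',  # hypothetical output path
    orient='records',
    lines=True,
    force_ascii=False,  # keep the Cyrillic text human-readable
)
```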
85 | 86 |
87 | Dataset Transformation (own data) 88 | 89 | 90 | RuTransform can easily be adapted to other tasks. To use the framework on your own data, simply specify the text (`text_col`) and/or target (`label_col`) column names and choose the suitable constraints (pass them into the `custom_constraints` argument). For example, to run transformation on the [DaNetQA](https://russiansuperglue.com/tasks/task_info/DaNetQA) data [(Shavrina et al,. 2020)](https://aclanthology.org/2020.emnlp-main.381/) we choose to perturb the `passage` text and use the `NamedEntities` constraint to preserve proper nouns: 91 | 92 | ``` 93 | import pandas as pd 94 | from rutransform.transformations import DatasetTransformer 95 | from rutransform.utils.args import TransformArguments 96 | from rutransform.constraints import NamedEntities 97 | 98 | # load data 99 | dataset = pd.read_json('test_data/danet_qa.json', lines=True) 100 | 101 | # init arguments 102 | transformation = "back_translation" 103 | probability = 0.5 104 | args = TransformArguments(transformation=transformation, probability=probability) 105 | 106 | # init dataset transformer 107 | tr = DatasetTransformer( 108 | dataset=dataset, 109 | text_col='passage', 110 | args=args, 111 | return_type='pd', 112 | custom_constraints=[NamedEntities()], 113 | device='cuda:0' 114 | ) 115 | 116 | # run transformation 117 | output = tr.transform() 118 | 119 | print('Original data:\n', dataset['passage'][4],) 120 | print('Transformed data:\n', output.transformed_dataset['passage'][4]) 121 | ``` 122 | ``` 123 | Original data: 124 | Средство коммуникации. В своей простейшей форме искусство представляет собой средство коммуникации. Как и большинство прочих способов коммуникации, оно несет в себе намерение передать информацию аудитории. Например, научная иллюстрация — тоже форма искусства, существующая для передачи информации. Ещё один пример такого рода — географические карты. Однако содержание послания не обязательно бывает научным. Искусство позволяет передавать не только объективную информацию, но и эмоции, настроение, чувства. 125 | Transformed data: 126 | Средство коммуникации. В своей простой форме искусство является средством общения. Как и большинство других средств коммуникации, она намерена доводить информацию до сведения аудитории. Например, научная иллюстрация — тоже форма искусства, существующая для передачи информации. Ещё один пример такого рода — географические карты. Однако содержание послания не обязательно бывает научным. Искусство позволяет передавать не только объективную информацию, но и эмоции, настроение, чувства. 127 | ``` 128 |
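The same pattern applies to classification-style tasks: point `text_col` at the input text and `label_col` at the target column. The sketch below is only an illustration; the file name and column names are placeholders rather than part of the bundled test data:

```
import pandas as pd
from rutransform.transformations import DatasetTransformer
from rutransform.utils.args import TransformArguments
from rutransform.constraints import NamedEntities

# hypothetical classification dataset with 'text' and 'label' columns
dataset = pd.read_json('my_classification_data.json', lines=True)

args = TransformArguments(transformation='emojify', probability=0.3)

tr = DatasetTransformer(
    dataset=dataset,
    text_col='text',      # column to perturb
    label_col='label',    # target column
    args=args,
    custom_constraints=[NamedEntities()],
    return_type='pd',
)

output = tr.transform()
```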
129 | 130 |
131 | Custom Constraints 132 | 133 | If the provided constraints are not enough, you can create your own ones by simple class inheritance. For example, to run transformation on the [RWSD](https://russiansuperglue.com/tasks/task_info/RWSD) dataset [(Shavrina et al,. 2020)](https://aclanthology.org/2020.emnlp-main.381/), we create an `RWSDConstraint`: 134 | 135 | ``` 136 | from rutransform.constraints import Constraint 137 | from rutransform.constraints.utils import parse_reference 138 | from typing import List, Optional 139 | from spacy.language import Language 140 | 141 | class RWSDConstraint(Constraint): 142 | def __init__(self, target_col_name: str, reference_key: str, noun_key: str) -> None: 143 | super().__init__(name='rwsd_constraint') 144 | self.target_col_name = target_col_name 145 | self.reference_key = reference_key 146 | self.noun_key = noun_key 147 | 148 | def patterns( 149 | self, text: Optional[dict], spacy_model: Optional[Language] 150 | ) -> List[List[dict]]: 151 | morph = parse_reference(text[self.target_col_name][self.noun_key], spacy_model) 152 | antecedent_feats = list(morph.values()) 153 | patterns = [ 154 | [{"TEXT": {"IN": text[self.target_col_name][self.reference_key].split() + text[self.target_col_name][self.noun_key].split()}}], 155 | [{"POS": {"IN": ["NOUN", 'PROPN']}, "MORPH": {"IS_SUPERSET": antecedent_feats}}], 156 | ] 157 | return patterns 158 | ``` 159 | To use custom constraints during the transformation, pass them into the `custom_constraints` argument: 160 | ``` 161 | import pandas as pd 162 | from rutransform.transformations import DatasetTransformer 163 | from rutransform.utils.args import TransformArguments 164 | 165 | # load data 166 | dataset = pd.read_json('test_data/rwsd.json', lines=True) 167 | 168 | # load arguments 169 | transformation = "eda" 170 | probability = 0.5 171 | args = TransformArguments(transformation=transformation, probability=probability) 172 | 173 | # init dataset transformer 174 | tr = DatasetTransformer( 175 | dataset=dataset, 176 | text_col='text', 177 | args=args, 178 | custom_constraints=[ 179 | RWSDConstraint( 180 | target_col_name='target', reference_key='span2_text', noun_key='span1_text' 181 | ) 182 | ], 183 | return_type='pd' # format of the resulting dataset (default is 'hf') 184 | ) 185 | 186 | # run transformation 187 | output = tr.transform() 188 | 189 | print('Target:', dataset['target'][0]) 190 | print('Original data:\n', dataset['text'][0],) 191 | print('Transformed data:\n', output.transformed_dataset['text'][0]) 192 | ``` 193 | ``` 194 | Target: {'span1_text': 'статью', 'span2_text': 'читает ее', 'span1_index': 7, 'span2_index': 9} 195 | Original data: 196 | Сара взяла в библиотеке книгу, чтобы написать статью. Она читает ее, когда приходит с работы. 197 | Transformed data: 198 | Сара книгу , чтобы написать статью Она читает ее с работы . 199 | ``` 200 | 201 |
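Because `patterns` returns ordinary spaCy `Matcher` patterns, a new constraint can be sanity-checked outside the framework before plugging it in. The snippet below is only a debugging aid, not part of the RuTransform API, and assumes the `RWSDConstraint` and `dataset` from the snippets above are in scope:

```
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('ru_core_news_sm')

row = dataset.iloc[0]  # one RWSD example
constraint = RWSDConstraint(
    target_col_name='target', reference_key='span2_text', noun_key='span1_text'
)

matcher = Matcher(nlp.vocab)
matcher.add('rwsd_constraint', constraint.patterns(row, nlp))

doc = nlp(row['text'])
for _, start, end in matcher(doc):
    print(doc[start:end])  # spans the constraint is meant to keep unchanged
```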
202 | 203 |
204 | Sentence Transformation 205 | 206 | All of the transformations supported by the framework can be applied not only to whole datasets, but also to standalone sentences. 207 | 208 | ``` 209 | from rutransform.transformations import ( 210 | SentenceAdditions, 211 | ButterFingersTransformation, 212 | EmojifyTransformation, 213 | ChangeCharCase, 214 | BackTranslationNER, 215 | Paraphraser, 216 | RandomEDA, 217 | BAE 218 | ) 219 | from rutransform.utils.args import TransformArguments 220 | # initialize the transformation arguments; the transformation name can be left out 221 | args = TransformArguments(probability=0.5) 222 | 223 | # transform the sentence 224 | tr = SentenceAdditions(args=args) 225 | tr.generate('мама мыла раму') 226 | ``` 227 | 228 | ``` 229 | ['мама мыла раму, Мама мыла раму,'] 230 | ``` 231 | 232 | ``` 233 | tr = ButterFingersTransformation(args=args) 234 | tr.generate('мама мыла раму') 235 | ``` 236 | ``` 237 | ['ммаа мырв ламу'] 238 | ``` 239 |
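The same transformers can be run over a small batch of sentences in a plain loop; a minimal sketch continuing the session above (it assumes `EmojifyTransformation` accepts the same `args`-only construction as the transformers shown here):

```
sentences = [
    'мама мыла раму',
    'кошка спит на диване',
]

tr = EmojifyTransformation(args=args)
for sent in sentences:
    # each call returns a list of perturbed candidates for the sentence
    print(tr.generate(sent))
```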
240 | 241 | 242 |
243 | Custom Transformation 244 | 245 | RuTransform allows you to create your own custom transformations. Here is an example of a simple transformation that randomises word order. 246 | 247 | 248 | First, you need to define a `SentenceOperation` class for the transformation, which implements the `__init__` and `generate` functions. 249 | 250 | Note that the function arguments must stay unchanged for compatibility with the framework. We also define a separate function for the transformation itself to keep the code more readable. 251 | 252 | ``` 253 | import random 254 | import spacy 255 | from rutransform.transformations.utils import SentenceOperation 256 | from typing import Optional, List, Union, Dict 257 | 258 | 259 | def random_word_order(sentence, spacy_model, seed, max_outputs): 260 | 261 | """ 262 | Randomise word order 263 | """ 264 | 265 | random.seed(seed) 266 | 267 | if not spacy_model: 268 | spacy_model = spacy.load('ru_core_news_sm') 269 | 270 | tokens = [token.text for token in spacy_model(sentence)] 271 | 272 | return [' '.join(random.sample(tokens, k=len(tokens))) for _ in range(max_outputs)] 273 | 274 | 275 | class RandomWordOrder(SentenceOperation): 276 | def __init__( 277 | self, args, seed=42, 278 | max_outputs=1, device="cpu", 279 | spacy_model=None, 280 | ): 281 | super().__init__( 282 | args=args, 283 | seed=seed, 284 | max_outputs=max_outputs, 285 | device=device, 286 | spacy_model=spacy_model, 287 | ) 288 | 289 | def generate( 290 | self, 291 | sentence: str, 292 | stop_words: Optional[List[Union[int, str]]] = None, 293 | prob: Optional[float] = None, 294 | ) -> List[str]: 295 | 296 | transformed = random_word_order( 297 | sentence=sentence, 298 | seed=self.seed, 299 | spacy_model=self.spacy_model, 300 | max_outputs=self.max_outputs 301 | ) 302 | 303 | return transformed 304 | 305 | ``` 306 | 307 | Now the transformation is ready to use on the sentence level: 308 | 309 | ``` 310 | from rutransform.utils.args import TransformArguments 311 | 312 | args = TransformArguments() 313 | tr = RandomWordOrder(args=args, max_outputs=5) 314 | tr.generate("мама мыла раму") 315 | 316 | ``` 317 | 318 | ``` 319 | ['раму мама мыла', 320 | 'раму мыла мама', 321 | 'мама раму мыла', 322 | 'раму мама мыла', 323 | 'мама раму мыла'] 324 | ``` 325 | 326 | After creating the transformation, you can add it to an existing Transformer by inheriting from its class and overriding the `transform_info` function: 327 | 328 | ``` 329 | from rutransform.transformations import EDATransformer, RandomEDA 330 | 331 | 332 | class CustomEDATransformer(EDATransformer): 333 | def __init__( 334 | self, 335 | transformations: List[str], 336 | task_type: str, 337 | args: TransformArguments, 338 | text_col: Optional[str] = "text", 339 | label_col: Optional[str] = "label", 340 | seed: int = 42, 341 | device: str = "cpu", 342 | constraints=None, 343 | ) -> None: 344 | 345 | super().__init__( 346 | transformations=transformations, 347 | task_type=task_type, 348 | args=args, 349 | text_col=text_col, 350 | label_col=label_col, 351 | seed=seed, 352 | device=device, 353 | constraints=constraints 354 | ) 355 | @staticmethod 356 | def transform_info() -> Dict[str, Optional[SentenceOperation]]: 357 | 358 | info = {"eda": RandomEDA, "word_order": RandomWordOrder} 359 | 360 | return info 361 | 362 | ``` 363 | 364 | ...or create a Transformer from scratch by inheriting the `Transformer` class and defining several functions: 365 | - `transform_info`: a staticmethod, must return a dictionary {transformation name: corresponding SentenceOperation class}.
It is used to load the list of all the available transformations 366 | - `_apply_transformation`: a function that applies the transformations to text until the transformed text passes the similarity threshold and returns a list of transformed texts and their similarity scores 367 | - `transform` (optional): a function that takes a sentence as input and transforms it 368 | 369 | For more information on the `Transformer` class and its structure see [here](rutransform/transformations/transformer.py). A rough skeleton of a from-scratch Transformer is also sketched at the end of this section. 370 | 371 | 372 | Once you have created the Transformer, add it to the [rutransform/transformations/transformers](rutransform/transformations/transformers) folder and edit the [`__init__.py`](rutransform/transformations/__init__.py) file. 373 | 374 | Now your transformation is ready for use! 375 |
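To make this interface concrete, here is a rough, hypothetical skeleton of such a from-scratch Transformer. The class name is made up, `RandomWordOrder` is assumed to be in scope from the snippets above, and the helper-method signatures are only placeholders; the exact interface lives in [transformer.py](rutransform/transformations/transformer.py):

```
from typing import Dict, Optional

from rutransform.transformations import Transformer
from rutransform.transformations.utils import SentenceOperation


class WordOrderTransformer(Transformer):  # hypothetical name

    @staticmethod
    def transform_info() -> Dict[str, Optional[SentenceOperation]]:
        # transformation name -> SentenceOperation class
        return {"word_order": RandomWordOrder}

    def _apply_transformation(self, *args, **kwargs):
        # apply the transformation until the output passes the similarity
        # threshold; see transformer.py for the exact signature
        raise NotImplementedError

    def transform(self, sentence):
        # optional: task-specific handling of a single dataset object
        raise NotImplementedError
```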
376 | 377 | ## Framework Design 378 | 379 | ### Supported Transformations 380 | 381 | Following the generally accepted typology ([Zhang et al., 2020](https://arxiv.org/pdf/1901.06796.pdf); [Wang et al., 2021b](https://aclanthology.org/2022.naacl-main.339/)), we divide the transformations included in the framework into two types, depending on their target. 382 | 383 | #### Word-Level Transformations 384 | 385 | Word-level perturbations utilize several strategies to perturb tokens, ranging from the imitation of typos to synonym replacement: 386 | 387 | Type | Transformation | Paper/Source | Main Idea | Example | 388 | :-------|:------------------------|:-------------|:----------|:------------------------------------------------------------------| 389 | Spelling|ButterFingers (`butter_fingers`) | [(Dhole, 2021)](https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/butter_fingers_perturbation) | Adds noise to data by mimicking spelling mistakes made by humans through character swaps based on their keyboard distance | This is a se**m**tence **r**o test t**j**e code | 390 | Spelling|Case (`case`) | [(Z. Wang, 2021)](https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/change_char_case) | Adds noise to data through case alteration | This is a sentence to tes**T** t**H**e c**OD**e | 391 | Modality|Emojify (`emojify`) | [(Wang, 2021)](https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/emojify) | Replaces the input words with the corresponding emojis, preserving their original meaning | This is a sentence to test the 👩‍💻 | 392 | Context |BAE (`bae`) | [(Garg and Ramakrishnan, 2020)](https://arxiv.org/abs/2004.01970) | Inserts and replaces tokens in the original text by masking a portion of the text and using masked LMs to generate contextually appropriate words | This is a sentence to test the **given** code | 393 | 394 | #### Sentence-Level Transformations 395 | In contrast to word-level perturbations, sentence-level perturbation techniques affect the syntactic structure: 396 | 397 | Type |Transformation | Paper/Source | Main Idea | Example | 398 | :----------|:-----------------------|:-------------|:----------|:------------------------------------------------------------------| 399 | Random |EDADELETE (`eda`) | [(Wei and Zou, 2019)](https://aclanthology.org/D19-1670.pdf) | Randomly deletes tokens in the text | This **_** a sentence to test the code | 400 | Random |EDASWAP (`eda`) | [(Wei and Zou, 2019)](https://aclanthology.org/D19-1670.pdf) | Randomly swaps tokens in the text | **code** is a sentence to test the **This** | 401 | Paraphrasis|Paraphraser (`paraphraser`) | [(Fenogenova, 2021)](http://bsnlp.cs.helsinki.fi/papers-2021/2021.bsnlp-1.2.pdf) | Generates variations of the context via simple paraphrasing | **I am just testing the code** | 402 | Paraphrasis|BackTranslation (`back_translation`) | [(Yaseen and Langer, 2021)](https://arxiv.org/abs/2108.11703) | Generates variations of the context through back-translation (ru -> en -> ru) | **This sentence tests the code** | 403 | Distraction|AddSent (`add_sent`) | [(Jia and Liang, 2017)](https://aclanthology.org/D17-1215/) | Generates extra words or a sentence at the end of the text. For multiple choice QA tasks it replaces one or more choice options with a generated one | This is a sentence to test the code **, if you want to delete it** | 404 | 405 | The examples are given in English for illustration purposes.
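The identifiers in parentheses (e.g. `butter_fingers`, `eda`, `add_sent`) are the values accepted by `TransformArguments.transformation`, so sweeping several perturbations over one dataset is just a loop. A small sketch reusing the sample data from the examples above:

```
import pandas as pd
from rutransform.transformations import DatasetTransformer
from rutransform.utils.args import TransformArguments

dataset = pd.read_json('test_data/worldtree.json', lines=True)

results = {}
for name in ['butter_fingers', 'emojify', 'eda', 'back_translation']:
    args = TransformArguments(transformation=name, probability=0.3)
    tr = DatasetTransformer(
        dataset=dataset,
        text_col='question',
        task_type='multichoice_qa',
        args=args,
        return_type='pd',
    )
    # named tuple with the mean score, per-sentence scores, and the perturbed split
    results[name] = tr.transform()
```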
406 | 407 | ### Probability thresholds 408 | 409 | The degree of the input modification can be controlled with an adversarial probability threshold, which serves as a hyperparameter. The higher the probability, the more the input gets modified. 410 | 411 | ### Constraints 412 | 413 | RuTransform's attacks and perturbations should not drastically change the input's meaning. To ensure this, the framework supports rule-based constraints that keep the linguistic structure and task-specific aspects unchanged. For instance, it is crucial to leave named entities in QA tasks untouched and not to modify the syntactic structure and anaphors when perturbing coreference resolution examples. 414 | 415 | Name| Description | Additional Requirements | Example | 416 | :---|:------------|:------------------------|:--------| 417 | `Jeopardy` | Jeopardy-type constraints, including (1) Noun Phrases such as THIS FILM, THIS ACTOR, both UPPER and lower cased, (2) 'X', (3) «Named Entity in parentheses» | - | For the first time, **THIS soda** appeared in 1958 in Spain; the name of the drink is translated from the Esperanto language as **“amazing”**.| 418 | `NamedEntities`|Matches all the named entities in text| - |The singer from **Turkey** who impressed us all.| 419 | `Multihop`| Constraints for multihop QA tasks. Matches all the bridge and main answers important for hops | - | `Question:` Where is the source of the river, the tributary of which is the Getar, located? `Supporting Text:` The **Getar** is a river in Armenia. It originates in the Kotayk region, flows through the central part of Yerevan and flows into **the Hrazdan**. `Main Text:` **The Hrazdan**, a river in Armenia, is the left tributary of the Aras. It originates at the northwest extremity of Lake **Sevan**, near the city of **Sevan**. `Answer:` Sevan | 420 | `Referents` | Constraints for coreference resolution tasks. Matches (1) the anaphoric pronoun, (2) all possible antecedents, and (3) all verbs referring to the antecedents and the anaphor | Markup of the possible antecedents and anaphors | The **singer** from **Turkey** **who** **impressed** us all.| 421 | 422 | ### Semantic filtering 423 | 424 | We follow [Wang et al., 2021](https://arxiv.org/abs/2111.02840) in filtering the adversarial examples with BERTScore [(Zhang et al., 2019)](https://arxiv.org/abs/1904.09675), a BERT-based text similarity metric [(Devlin et al., 2019)](https://aclanthology.org/N19-1423.pdf). We measure the semantic similarity between the original input and the adversarial output and keep the examples with the highest similarity scores. In cases when the score is lower than a specified threshold, we iteratively decrease the adversarial probability threshold and re-score the new adversarial examples. 425 | 426 | 427 | ## Cite us 428 | 429 | ``` 430 | @article{taktasheva2022tape, 431 | title={TAPE: Assessing Few-shot Russian Language Understanding}, 432 | author={Taktasheva, Ekaterina and Shavrina, Tatiana and Fenogenova, Alena and Shevelev, Denis and Katricheva, Nadezhda and Tikhonova, Maria and Akhmetgareeva, Albina and Zinkevich, Oleg and Bashmakova, Anastasiia and Iordanskaia, Svetlana and others}, 433 | journal={arXiv preprint arXiv:2210.12813}, 434 | year={2022} 435 | } 436 | ``` 437 | 438 | ## License 439 | 440 | All the code is available under the Apache 2.0 license. 441 | --------------------------------------------------------------------------------