├── rutransform
│   ├── constraints
│   │   ├── constraints
│   │   │   ├── __init__.py
│   │   │   ├── named_entities.py
│   │   │   ├── multihop.py
│   │   │   ├── jeopardy.py
│   │   │   └── referents.py
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   └── constraint.py
│   ├── transformations
│   │   ├── transformers
│   │   │   ├── __init__.py
│   │   │   ├── bae.py
│   │   │   ├── spelling.py
│   │   │   ├── eda.py
│   │   │   ├── paraphrasis.py
│   │   │   └── distraction.py
│   │   ├── transformations
│   │   │   ├── __init__.py
│   │   │   ├── bae
│   │   │   │   ├── __init__.py
│   │   │   │   └── bae.py
│   │   │   ├── eda
│   │   │   │   ├── __init__.py
│   │   │   │   └── eda.py
│   │   │   ├── distraction
│   │   │   │   ├── __init__.py
│   │   │   │   └── sentence_additions.py
│   │   │   ├── paraphrasis
│   │   │   │   ├── __init__.py
│   │   │   │   ├── paraphraser.py
│   │   │   │   └── back_translation.py
│   │   │   └── spelling
│   │   │       ├── __init__.py
│   │   │       ├── change_char_case.py
│   │   │       ├── butter_fingers.py
│   │   │       └── emojify.py
│   │   ├── __init__.py
│   │   ├── load_transformers.py
│   │   ├── README.md
│   │   ├── utils.py
│   │   └── dataset_transformer.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── args.py
│   └── __init__.py
├── requirements.txt
├── test_data
│   ├── openbook.json
│   ├── rwsd.json
│   ├── winograd.json
│   ├── worldtree.json
│   ├── chegeka.json
│   ├── danet_qa.json
│   ├── sit_ethics.json
│   ├── per_ethics.json
│   └── multiq.json
├── setup.py
├── LICENSE
└── README.md
/rutransform/constraints/constraints/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rutransform/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from . import args
2 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rutransform/__init__.py:
--------------------------------------------------------------------------------
1 | from . import utils, constraints, transformations
2 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/bae/__init__.py:
--------------------------------------------------------------------------------
1 | from .bae import *
2 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/eda/__init__.py:
--------------------------------------------------------------------------------
1 | from .eda import *
2 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/distraction/__init__.py:
--------------------------------------------------------------------------------
1 | from .sentence_additions import *
2 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/paraphrasis/__init__.py:
--------------------------------------------------------------------------------
1 | from .back_translation import *
2 | from .paraphraser import *
3 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/spelling/__init__.py:
--------------------------------------------------------------------------------
1 | from .butter_fingers import *
2 | from .change_char_case import *
3 | from .emojify import *
4 |
--------------------------------------------------------------------------------
/rutransform/constraints/__init__.py:
--------------------------------------------------------------------------------
1 | from .constraint import Constraint
2 | from .utils import *
3 | from .constraints.jeopardy import *
4 | from .constraints.named_entities import *
5 | from .constraints.referents import *
6 | from .constraints.multihop import *
7 |
--------------------------------------------------------------------------------
/rutransform/transformations/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 | from .transformations import *
3 |
4 | from .transformer import Transformer
5 | from .transformers.distraction import *
6 | from .transformers.bae import *
7 | from .transformers.eda import *
8 | from .transformers.paraphrasis import *
9 | from .transformers.spelling import *
10 |
11 | from .load_transformers import *
12 |
13 | from .dataset_transformer import *
14 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bert-score==0.3.11
2 | datasets==1.10.0
3 | nltk==3.6.7
4 | numpy>=1.19.2
5 | pandas>=1.0.1
6 | textattack==0.3.4
7 | tensorflow_text==2.7.3
8 | torch==1.10.2
9 | torchtext==0.11.2
10 | transformers==4.12.2
11 | tqdm==4.49.0
12 | sentencepiece==0.1.96
13 | scikit-multilearn==0.2.0
14 | spacy>=3.2.0
15 | russian_paraphrasers==0.0.3
16 | razdel==0.5.0
17 | rusenttokenize==0.0.5
18 | nlg-eval @ git+https://github.com/Maluuba/nlg-eval.git@master
19 | ru_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz
--------------------------------------------------------------------------------
/test_data/openbook.json:
--------------------------------------------------------------------------------
1 | {"ID":"14-1486","question":"Когда вы нажимаете на тормоз в машине, из-за трения (A) машина летит (B) скорость не меняется (C) скорость падает (D) скорость увеличивается","answer":"C"}
2 | {"ID":"8-430","question":"Несмотря на то, что некоторые думают, наша планета вращается вокруг этого объекта (A) плутон (B) луна (C) млечный путь (D) солнце","answer":"D"}
3 | {"ID":"828","question":"Люди худеют, когда (A) им не хватает еды (B) они слишком много едят (C) они переедают (D) они едят пищу","answer":"A"}
4 | {"ID":"14-788","question":"Что сделано из минералов? (A) Стоунхендж (B) автомобильные двигатели (C) спирт (D) электричество","answer":"A"}
5 | {"ID":"7-1160","question":"Карелия - край гигантских озер, где (A) ледники вырывали озера (B) много купающихся (C) ледники плавают в озерах (D) озера заполнены угрями","answer":"A"}
6 |
--------------------------------------------------------------------------------
/test_data/rwsd.json:
--------------------------------------------------------------------------------
1 | {"idx":253,"target":{"span1_text":"статью","span2_text":"читает ее","span1_index":7,"span2_index":9},"label":false,"text":"Сара взяла в библиотеке книгу, чтобы написать статью. Она читает ее, когда приходит с работы."}
2 | {"idx":326,"target":{"span1_text":"Фред","span2_text":"он вернулся","span1_index":0,"span2_index":10},"label":false,"text":"Фред смотрел телевизор, пока Джордж выходил купить продуктов. Через час он вернулся."}
3 | {"idx":377,"target":{"span1_text":"печенья с шоколадной крошкой","span2_text":"больше их","span1_index":3,"span2_index":16},"label":false,"text":"Всем понравились овсяные печенья, и только некоторым – печенья с шоколадной крошкой. В следующий раз надо больше их сделать."}
4 | {"idx":8,"target":{"span1_text":"Женя","span2_text":"она получила","span1_index":0,"span2_index":6},"label":true,"text":"Женя поблагодарила Сашу за помощь, которую она получила."}
5 | {"idx":475,"target":{"span1_text":"Донной","span2_text":"ее сосредоточенность","span1_index":3,"span2_index":5},"label":true,"text":"Лили заговорила с Донной, нарушив ее сосредоточенность."}
--------------------------------------------------------------------------------
/rutransform/constraints/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from spacy.language import Language
3 |
4 |
5 | def parse_reference(text: str, spacy_model: Language) -> Dict[str, str]:
6 | """
7 | Extract morphological features of the anaphor
8 |
9 | Parameters
10 | ----------
11 | text: str
12 | anaphor
13 | spacy_model: spacy.language.Language
14 | spacy model to be used for morphological analysis
15 |
16 | Returns
17 | -------
18 | Dict[str, str]
19 | dict mapping 'case', 'gender' and 'number' to morphological feature strings of the anaphor, ready for use in Matcher patterns
20 | """
21 | out = {}
22 | morph = spacy_model(text)[0].morph
23 | case = morph.get("Case")
24 | if len(case) > 0:
25 | case = case[0]
26 | out["case"] = f"Case={case}"
27 | gender = morph.get("Gender")
28 | if len(gender) > 0:
29 | gender = gender[0]
30 | out["gender"] = f"Gender={gender}"
31 | number = morph.get("Number")
32 | if len(number) > 0:
33 | number = number[0]
34 | out["number"] = f"Number={number}"
35 | return out
36 |
--------------------------------------------------------------------------------
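
Usage sketch (hypothetical, not part of the repository): calling parse_reference directly. It assumes the ru_core_news_sm model pinned in requirements.txt is installed; the exact feature values depend on the spaCy pipeline.
--------------------------------------------------------------------------------
import spacy

from rutransform.constraints.utils import parse_reference

# the Russian pipeline listed in requirements.txt
nlp = spacy.load("ru_core_news_sm")

# morphological features of the anaphor "которая" (fem., sing.),
# returned as strings that can be used inside spacy.Matcher patterns
features = parse_reference("которая", nlp)
print(features)  # e.g. {'case': 'Case=Nom', 'gender': 'Gender=Fem', 'number': 'Number=Sing'}
--------------------------------------------------------------------------------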
/rutransform/transformations/load_transformers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pkgutil
3 | from pathlib import Path
4 | from importlib import import_module
5 | import inspect
6 | from rutransform.transformations import Transformer
7 | from rutransform.transformations.utils import SentenceOperation
8 |
9 |
10 | def load_transformers():
11 | search = "transformers"
12 | package_dir = Path(__file__).resolve()
13 | transformations_dir = package_dir.parent.joinpath(search)
14 | modules = pkgutil.iter_modules(path=[transformations_dir])
15 |
16 | transform_dict = {}
17 | for (_, folder, _) in modules:
18 |
19 | t = import_module(f"rutransform.transformations.transformers.{folder}")
20 |
21 | for name, obj in inspect.getmembers(t):
22 | if (
23 | inspect.isclass(obj)
24 | and issubclass(obj, Transformer)
25 | and not issubclass(obj, SentenceOperation)
26 | ):
27 | try:
28 | info = obj.transform_info().items()
29 | for transformation, _ in info:
30 | transform_dict[transformation] = obj
31 | except NotImplementedError:
32 | pass
33 |
34 | return transform_dict
35 |
--------------------------------------------------------------------------------
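
Usage sketch (hypothetical, not part of the repository): consuming the registry built by load_transformers. The keys come from each Transformer's transform_info(), so the concrete name used below is an assumption.
--------------------------------------------------------------------------------
from rutransform.transformations import load_transformers

# maps transformation names (declared in Transformer.transform_info()) to Transformer classes
registry = load_transformers()
print(sorted(registry))  # the transformation names available in this installation

# hypothetical key; pick one of the names printed above
transformer_cls = registry.get("butter_fingers")
--------------------------------------------------------------------------------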
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | import glob
3 | import os
4 |
5 |
6 | def process_requirements(fname):
7 | path = os.path.join(os.path.dirname(__file__), fname)
8 | with open(path, "r", encoding="utf-8") as f:
9 | requirements = f.read()
10 | processed_requirements = requirements.strip().splitlines()
11 | return processed_requirements
12 |
13 |
14 | with open("README.md", "r") as fh:
15 | long_description = fh.read()
16 |
17 |
18 | setuptools.setup(
19 | name="rutransform",
20 | version="0.0.1",
21 | author="evtaktasheva",
22 | author_email="evtaktasheva@gmail.com",
23 | description="Adversarial text perturbation framework for Russian",
24 | long_description=long_description,
25 | long_description_content_type="text/markdown",
26 | url="https://github.com/RussianNLP/rutransform",
27 | packages=setuptools.find_packages(),
28 | package_data={"": ["*.json"]},
29 | include_package_data=True,
30 | license='Apache License 2.0',
31 | classifiers=[
32 | "Programming Language :: Python :: 3",
33 | "License :: OSI Approved :: Apache Software License",
34 | "Operating System :: OS Independent",
35 | ],
36 | install_requires=process_requirements("requirements.txt"),
37 | )
38 |
--------------------------------------------------------------------------------
/rutransform/constraints/constraint.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from abc import abstractmethod
3 | from spacy.language import Language
4 |
5 |
6 | class Constraint:
7 | """
8 | Base class for transformation constraints
9 |
10 | Attributes
11 | ----------
12 | name: str
13 | name of the constraint
14 |
15 | Methods
16 | -------
17 | @abstractmethod
18 | patterns(text, spacy_model)
19 | Creates spacy.Matcher patterns to extract stopwords
20 | """
21 |
22 | def __init__(self, name: str) -> None:
23 | """
24 | Parameters
25 | ----------
26 | name: str
27 | name of the constraint
28 | """
29 | self.name = name
30 |
31 | @abstractmethod
32 | def patterns(
33 | self, text: Optional[dict], spacy_model: Optional[Language]
34 | ) -> List[List[dict]]:
35 | """
36 | Creates spacy.Matcher patterns to extract stopwords
37 |
38 | Parameters
39 | ----------
40 | text: dict
41 | dataset object in dict form
42 | spacy_model: spacy.language.Language
43 | spacy model to be used for morphological analysis
44 |
45 | Returns
46 | -------
47 | List[List[dict]]
48 | list of spacy.Matcher patterns that match the constraint
49 | """
50 | raise NotImplementedError
51 |
--------------------------------------------------------------------------------
/test_data/winograd.json:
--------------------------------------------------------------------------------
1 | {"text":"\"А для госрегистрации понадобится только декларация о постройке, которую вы заполняете самостоятельно (бланк выдадут в местной администрации, также его можно скачать на сайте Росрегистрации)","answer":"постройке","label":0,"options":["госрегистрации","декларация","постройке","администрации","Росрегистрации"],"reference":"которая","homonymia_type":1.2}
2 | {"text":"На втором месте оказалась 16-летняя алина из румынии, которая получила 4 тысячи долларов","answer":"румынии","label":0,"options":["алина","румынии","тысячи"],"reference":"которая","homonymia_type":1.1}
3 | {"text":"\"Чего стоила, например, мёртвенно-бледная Морена - Хранительница Смерти, которую доставили в зал на золотых носилках скелеты, щёлкая челюстями в такт каждого шага","answer":"Хранительница","label":0,"options":["Морена","Хранительница","Смерти","челюстями"],"reference":"которая","homonymia_type":1.4}
4 | {"text":"\"Маша, водя пальчиком по его лицу, стала говорить об одушевленности пола, о личности, которая выражает себя прикосновением…","answer":"личности","label":1,"options":["Маша","одушевленности","личности"],"reference":"которая","homonymia_type":1.2}
5 | {"text":"\"Мифологии, скреплявшей нацию, нужен был набор идеологически выдержанных легенд, а не правда о войне, которая могла подорвать самые основы брежневской Системы","answer":"правда","label":1,"options":["Мифологии","нацию","легенд","правда","войне","основы","Системы"],"reference":"которая","homonymia_type":1.2}
6 |
--------------------------------------------------------------------------------
/rutransform/transformations/README.md:
--------------------------------------------------------------------------------
1 | # Transformations
2 |
3 | ## Word-Level Transformations
4 |
5 | Type | Transformation | Example |
6 | :-------|:------------------------|:----------------------------------------------------------------|
7 | Spelling|`ButterFingers` | This is a se**m**tence **r**o test t**j**e code |
8 | Spelling|`Case` | This is a sentence to tes**T** t**H**e c**OD**e |
9 | Modality|`Emojify` | This is a sentence to test the 👩💻 |
10 | Context |`BAE` `INSERT` | This is a sentence to test the **given** code |
11 | Context |`BAE` `REPLACE` | This is a sentence to **check** the code |
12 |
13 | ## Sentence-Level Transformations
14 |
15 | Type |Transformation | Example |
16 | :----------|:-----------------------|:-------------------------------------------------------------------|
17 | Random |`EDA` `DELETE` | This **_** a sentence to test the code |
18 | Random |`EDA` `SWAP` | **code** is a sentence to test the **This** |
19 | Paraphrasis|`Paraphraser` | **I am just testing the code** |
20 | Paraphrasis|`BackTranslation` | **This sentence tests the code** |
21 | Distraction|`AddSent` | This is a sentence to test the code **, if you want to delete it** |
22 |
--------------------------------------------------------------------------------
/test_data/worldtree.json:
--------------------------------------------------------------------------------
1 | {"question":"Когда мороженое не кладут в морозильную камеру, мороженое превращается из ___. (A) твердого тела в газ (B) газа в жидкость (C) твердого тела в жидкость (D) жидкости в газ","answer":"C","exam_name":"Virginia Standards of Learning - Science","school_grade":5,"knowledge_type":"CAUSAL,EXAMPLE"}
2 | {"question":"За сколько времени Земля совершит семь оборотов вокруг своей оси? (A) один день (B) одна неделя (C) один месяц (D) один год","answer":"B","exam_name":"NYSEDREGENTS","school_grade":4,"knowledge_type":"MODEL,QUANT"}
3 | {"question":"Студент толкает красную игрушечную машинку по деревянному полу. Затем студент толкает ту же красную игрушечную машинку по цементному полу. Какой вопрос студент, скорее всего, исследует? (A) С какой скоростью обычно катится игрушечная машинка? (B) Как лучше всего заставить игрушечный автомобиль катиться? (C) Какая поверхность позволяет игрушечной машинке катиться дальше? (D) Влияет ли цвет на расстояние, на которое катится игрушечная машинка?","answer":"C","exam_name":"Alaska Dept. of Education & Early Development","school_grade":4,"knowledge_type":"MODEL"}
4 | {"question":"Животные используют ресурсы окружающей среды, чтобы выжить. Какие из следующих ресурсов животные используют для получения энергии? (A) воздух (B) еда (C) убежище (D) вода","answer":"B","exam_name":"Maryland School Assessment - Science","school_grade":4,"knowledge_type":"PROCESS"}
5 | {"question":"Чем похожи испарение и конденсация? (A) Оба вызывают снижение температуры воздуха. (B) Оба вызывают повышение температуры воздуха. (C) Оба вызваны потеплением атмосферы. (D) Оба вызваны изменениями тепловой энергии.","answer":"D","exam_name":"North Carolina READY End-of-Grade Assessment","school_grade":5,"knowledge_type":"CAUSAL"}
6 |
--------------------------------------------------------------------------------
/test_data/chegeka.json:
--------------------------------------------------------------------------------
1 | {"question_id":27039,"question":"Именно В ЭТОМ ГОДУ родилась ровно половина всех людей, побывавших на Луне.","answer":"1930","topic":"Год лошади","author":"Кирилл Богловский","tour_name":"IX Кубок Европы по интеллектуальным играм среди студентов (Петродворец). Своя игра","tour_link":"https:\/\/db.chgk.info\/tour\/eu13stsv"}
2 | {"question_id":20030,"question":"Во время соревнований по стрельбе пошел сильный дождь. Был сооружен специальный навес, под который пускали, впрочем, не всех спортсменов, а только ИХ.","answer":"Шведы","topic":"Олимпийские игры 1912 года в Стокгольме","author":"Евгений Поникаров","tour_name":"I Чемпионат России по \"Своей игре\" (Великие Луки)","tour_link":"https:\/\/db.chgk.info\/tour\/russv07"}
3 | {"question_id":26414,"question":"Роберт Смит в 1872 году в книге \"Начало химической климатологии\" впервые употребил ЭТОТ ТЕРМИН.","answer":"Кислотный дождь","topic":"Осадки","author":"Евгений Шляхов, Владимир Островский, Александра Косолапова, Дмитрий Стрильчук","tour_name":"Молодежный чемпионат Украины по своей игре - 2013","tour_link":"https:\/\/db.chgk.info\/tour\/uksv13st"}
4 | {"question_id":11121,"question":"\"Энфилд Роуд\".","answer":"Ливерпуль","topic":"СТАДИОНЫ (называется стадион, нужно назвать город, в котором он расположен)","author":"Борис Шойхет","tour_name":"Пятый Международный летний лагерь немецких знатоков \"Хоббах - 2003\". Спортивное Jeopardy.","tour_link":"https:\/\/db.chgk.info\/tour\/hobb03je"}
5 | {"question_id":10970,"question":"Цитата из ЭТОГО произведения: \"Газет он не читал, в театр не ходил, а жена сбежала от него с тенором оперы Зимина в 1913 году, оставив ему записку такого содержания: \"Невыносимую дрожь отвращения возбуждают во мне твои лягушки. Я всю жизнь буду несчастна из-за них\"\".","answer":"\"Роковые яйца\" (профессор Персиков) Булгакова","topic":"ПЕРСИКИ","author":"Борис Шойхет","tour_name":"Фестиваль \"Кронах-2003\". Jeopardy.","tour_link":"https:\/\/db.chgk.info\/tour\/kronah03"}
6 |
--------------------------------------------------------------------------------
/rutransform/constraints/constraints/named_entities.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from spacy.language import Language
3 |
4 | from rutransform.constraints import Constraint
5 |
6 |
7 | class NamedEntities(Constraint):
8 | """
9 | Named entities constraint
10 |
11 | Matches all the named entities in text
12 |
13 | Attributes
14 | ----------
15 | name: str
16 | name of the constraint (is always 'named_entities')
17 | entity_types: List[str], optional
18 | list of named entity types to include (default is None)
19 | matches all types if not provided
20 |
21 | Methods
22 | -------
23 | patterns(text, spacy_model)
24 | Creates spacy.Matcher patterns to extract stopwords
25 | """
26 |
27 | def __init__(self, entity_types: Optional[List[str]] = None) -> None:
28 | """
29 | Parameters
30 | ----------
31 | entity_types: List[str], optional
32 | list of named entity types to include (default is None)
33 | matches all types if not provided
34 | """
35 | super().__init__(name="named_entities")
36 | self.entity_types = entity_types
37 |
38 | def patterns(
39 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None
40 | ) -> List[List[dict]]:
41 | """
42 | Creates spacy.Matcher patterns to extract stopwords
43 |
44 | Parameters
45 | ----------
46 | text: dict
47 | ! exists for compatibility, always ignored !
48 | dataset object in dict form (default is None)
49 | spacy_model: spacy.language.Language
50 | ! exists for compatibility, always ignored !
51 | spacy model to be used for morphological analysis (default is None)
52 |
53 | Returns
54 | -------
55 | List[List[dict]]
56 | list of spacy.Matcher patterns matching named entities
57 | """
58 | if self.entity_types is None:
59 | patterns = [[{"ENT_TYPE": "", "OP": "!"}]]
60 | else:
61 | patterns = [[{"ENT_TYPE": {"IN": self.entity_types}, "OP": "!"}]]
62 | return patterns
63 |
--------------------------------------------------------------------------------
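
Usage sketch (hypothetical, not part of the repository): feeding NamedEntities patterns into spacy.Matcher to collect spans the transformations should leave untouched; assumes ru_core_news_sm is installed.
--------------------------------------------------------------------------------
import spacy
from spacy.matcher import Matcher

from rutransform.constraints import NamedEntities

nlp = spacy.load("ru_core_news_sm")
matcher = Matcher(nlp.vocab)
# default patterns: every token with a non-empty entity type, i.e. all named entities
matcher.add("named_entities", NamedEntities().patterns())

doc = nlp("Карелия - край гигантских озер.")
stop_spans = [doc[start:end] for _, start, end in matcher(doc)]
--------------------------------------------------------------------------------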
/rutransform/transformations/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union, NamedTuple
2 |
3 | import pandas as pd
4 | import numpy as np
5 | from datasets import Dataset
6 | from spacy.language import Language
7 | from rutransform.utils.args import TransformArguments
8 |
9 |
10 | class TransformResult(NamedTuple):
11 | transformed_dataset: Union[pd.DataFrame, Dataset]
12 | scores: np.array
13 | score: float
14 | std: float
15 |
16 |
17 | class SentenceOperation(object):
18 | """
19 | Generic operation class.
20 |
21 | Attributes
22 | ----------
23 | args: TransformArguments
24 | parameters of the transformation
25 | seed: int
26 | seed to freeze everything (default is 42)
27 | max_outputs: int
28 | maximum number of the transformed sentences (default is 1)
29 | device: str
30 | the device used during transformation (default is 'cpu')
31 | spacy_model: spacy.language.Language
32 | spacy model used for tokenization
33 |
34 | Methods
35 | -------
36 | generate(sentence, stop_words)
37 | Transforms the sentence
38 | """
39 |
40 | def __init__(
41 | self,
42 | args: TransformArguments,
43 | seed: int = 42,
44 | max_outputs: int = 1,
45 | device: str = "cpu",
46 | spacy_model: Optional[Language] = None,
47 | ) -> None:
48 | """
49 | Parameters
50 | ----------
51 | args: TransformArguments
52 | parameters of the transformation
53 | seed: int
54 | seed to freeze everything (default is 42)
55 | max_outputs: int
56 | maximum number of the transformed sentences (default is 1)
57 | device: str
58 | the device used during transformation (default is 'cpu')
59 | spacy_model: spacy.language.Language
60 | spacy model used for tokenization
61 | """
62 | self.args = args
63 | self.seed = seed
64 | self.max_outputs = max_outputs
65 | self.device = device
66 | self.spacy_model = spacy_model
67 |
68 | def generate(
69 | self,
70 | sentence: str,
71 | stop_words: Optional[List[Union[int, str]]] = None,
72 | prob: Optional[float] = None,
73 | ) -> List[str]:
74 | """
75 | Transforms the sentence
76 |
77 | Parameters
78 | ----------
79 | sentence: str
80 | sentence to transform
81 | stop_words: List[Union[int, str]], optional
82 | stop_words to ignore during transformation (default is None)
83 | prob: float, optional
84 | probability of the transformation (default is None)
85 |
86 | Returns
87 | -------
88 | list
89 | list of transformed sentences
90 | """
91 | raise NotImplementedError
92 |
--------------------------------------------------------------------------------
/rutransform/constraints/constraints/multihop.py:
--------------------------------------------------------------------------------
1 | from spacy.language import Language
2 |
3 | from typing import Optional, List, Dict, Any
4 |
5 | from rutransform.constraints import Constraint
6 |
7 |
8 | class Multihop(Constraint):
9 | """
10 | Constraints for multihop QA tasks
11 |
12 | Matches all the bridge and main answers important
13 | for hops
14 |
15 | Attributes
16 | ----------
17 | name: str
18 | name of the constraint (is always 'multihop')
19 | bridge_answers_col: str
20 | name of the column containing bridge answers
21 | main_answers_col: str
22 | name of the column containing main question answers
23 |
24 | Methods
25 | -------
26 | extract_words(answer)
27 | Parses answer dictionary and extracts all tokens
28 | patterns(text, spacy_model)
29 | Creates spacy.Matcher patterns to extract stopwords
30 | """
31 |
32 | def __init__(self, bridge_answers_col: str, main_answers_col: str) -> None:
33 | """
34 | Parameters
35 | ----------
36 | bridge_answers_col: str
37 | name of the column containing bridge answers
38 | main_answers_col: str
39 | name of the column containing main question answers
40 | """
41 | super().__init__(name="multihop")
42 | self.bridge_answers_col = bridge_answers_col
43 | self.main_answers_col = main_answers_col
44 |
45 | def extract_words(self, answers: Dict[str, Any]) -> List[str]:
46 | """
47 | Parses answer dictionary and extracts all tokens
48 |
49 | Parameters
50 | ----------
51 | answers: Dict[str, Any]
52 | answers dictionary
53 |
54 | Returns
55 | -------
56 | List[str]
57 | list of tokens in the answer
58 | """
59 | stop_words = []
60 | for answer in answers:
61 | stop_words.extend(answer["segment"].split())
62 | return stop_words
63 |
64 | def patterns(
65 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None
66 | ) -> List[List[dict]]:
67 | """
68 | Creates spacy.Matcher patterns to extract stopwords
69 |
70 | Parameters
71 | ----------
72 | text: dict
73 | dataset object in dict form (default is None)
74 | spacy_model: spacy.language.Language
75 | spacy model to be used for morphological analysis (default is None)
76 |
77 | Returns
78 | -------
79 | List[List[dict]]
80 | list of spacy.Matcher patterns matching entities important for hops
81 | """
82 | stop_words = self.extract_words(text[self.bridge_answers_col])
83 | stop_words += self.extract_words(text[self.main_answers_col])
84 |
85 | stop_words = list(set(stop_words))
86 | patterns = [[{"TEXT": {"IN": stop_words}}]]
87 |
88 | return patterns
89 |
--------------------------------------------------------------------------------
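
Illustration (hypothetical, not part of the repository): extract_words on answer objects shaped like MultiQ records; the column names and answer values below are invented, only the "segment" field matters.
--------------------------------------------------------------------------------
from rutransform.constraints import Multihop

constraint = Multihop(bridge_answers_col="bridge_answers", main_answers_col="main_answers")

# hypothetical answer objects with the "segment" field expected by extract_words
answers = [{"segment": "Лев Толстой"}, {"segment": "Ясная Поляна"}]
print(constraint.extract_words(answers))  # ['Лев', 'Толстой', 'Ясная', 'Поляна']
--------------------------------------------------------------------------------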
/rutransform/constraints/constraints/jeopardy.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from spacy.language import Language
3 |
4 | from rutransform.constraints import Constraint
5 |
6 |
7 | class Jeopardy(Constraint):
8 | """
9 | Jeopardy type constraints, including:
10 | - Noun Phrases such as THIS FILM, THIS ACTOR, both UPPER and lower cased
11 | - 'X'
12 | - «Named Entity in quotation marks»
13 |
14 | Attributes
15 | ----------
16 | name: str
17 | name of the constraint (is always 'jeopardy')
18 | lemmas: List[str], optional
19 | lemmas to include in the patterns (default is None)
20 | used to define the list of DET that can be used in
21 | jeopardy questions (e.g. if we want to include 'this' but not 'that')
22 |
23 | Methods
24 | -------
25 | patterns(text, spacy_model)
26 | Creates spacy.Matcher patterns to extract stopwords
27 | """
28 |
29 | def __init__(self, lemmas: Optional[List[str]] = None) -> None:
30 | """
31 | Parameters
32 | ----------
33 | lemmas: List[str]
34 | lemmas to include in the patterns (default is None)
35 | used to define the list of DET that can be used in
36 | jeopardy questions (e.g. if we want to include 'this' but not 'that')
37 | """
38 | super().__init__(name="jeopardy")
39 | self.lemmas = lemmas
40 |
41 | def patterns(
42 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None
43 | ) -> List[List[dict]]:
44 | """
45 | Creates spacy.Matcher patterns to extract stopwords
46 |
47 | Parameters
48 | ----------
49 | text: dict
50 | ! exists for compatibility, always ignored !
51 | dataset object in dict form (default is None)
52 | spacy_model: spacy.language.Language
53 | ! exists for compatibility, always ignored !
54 | spacy model to be used for morphological analysis (default is None)
55 |
56 | Returns
57 | -------
58 | List[List[dict]]
59 | list of spacy.Matcher patterns matching jeopardy questions
60 | """
61 |
62 | patterns = [
63 | [
64 | {
65 | "IS_UPPER": True,
66 | "OP": "+",
67 | "POS": {"IN": ["NOUN", "PROPN", "DET", "PRON"]},
68 | }
69 | ],
70 | [
71 | {"IS_UPPER": True, "POS": {"NOT_IN": ["ADP"]}},
72 | {"POS": "ADJ", "OP": "*"},
73 | {"POS": "NOUN", "OP": "+"},
74 | ],
75 | [
76 | {"TEXT": "«"},
77 | {"IS_TITLE": True},
78 | {"TEXT": {"REGEX": "\w|\d|['?!.]"}, "OP": "*"},
79 | {"TEXT": "»"},
80 | ],
81 | ]
82 |
83 | if self.lemmas is None:
84 | self.lemmas = [
85 | "его",
86 | "ему",
87 | "её",
88 | "икс",
89 | "ими",
90 | "их",
91 | "него",
92 | "ней",
93 | "неё",
94 | "ним",
95 | "них",
96 | "нём",
97 | "он",
98 | "она",
99 | "они",
100 | "оно",
101 | "такой",
102 | "это",
103 | "этот",
104 | ]
105 | patterns.append(
106 | [
107 | {"LEMMA": {"IN": self.lemmas}},
108 | {"POS": "ADJ", "OP": "*"},
109 | {"POS": "NOUN", "OP": "+"},
110 | ]
111 | )
112 |
113 | return patterns
114 |
--------------------------------------------------------------------------------
/rutransform/constraints/constraints/referents.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from spacy.language import Language
3 |
4 | from rutransform.constraints import Constraint
5 | from rutransform.constraints.utils import parse_reference
6 |
7 |
8 | class Referents(Constraint):
9 | """
10 | Constraints for coreference resolution tasks
11 |
12 | Matches
13 | - the anaphoric pronoun
14 | - all possible antecedents
15 | - all verbs referring to antecedents and anaphor
16 |
17 | Attributes
18 | ----------
19 | name: str
20 | name of the constraint (is always 'referents')
21 | reference_col_name: str, optional
22 | name of the column containing anaphor
23 | defaults to 'reference' if not provided
24 | candidates_col_name: str, optional
25 | name of the column containig possible antecedents
26 | defaults to 'Options' or 'options' if not provided
27 |
28 | Methods
29 | -------
30 | patterns(text, spacy_model)
31 | Creates spacy.Matcher patterns to extract stopwords
32 | """
33 |
34 | def __init__(
35 | self,
36 | reference_col_name: Optional[str] = None,
37 | candidates_col_name: Optional[str] = None,
38 | ) -> None:
39 | """
40 | Parameters
41 | ----------
42 | reference_col_name: str, optional
43 | name of the column containing anaphor
44 | defaults to 'reference' if not provided
45 | candidates_col_name: str, optional
46 | name of the column containig possible antecedents
47 | defaults to 'options' if not provided
48 | """
49 | super().__init__(name="referents")
50 | self.reference_col_name = reference_col_name
51 | self.candidates_col_name = candidates_col_name
52 |
53 | def patterns(
54 | self, text: Optional[dict] = None, spacy_model: Optional[Language] = None
55 | ) -> List[List[dict]]:
56 | """
57 | Creates spacy.Matcher patterns to extract stopwords
58 |
59 | Parameters
60 | ----------
61 | text: dict
62 | dataset object in dict form (default is None)
63 | spacy_model: spacy.language.Language
64 | spacy model to be used for morphological analysis (default is None)
65 |
66 | Returns
67 | -------
68 | List[List[dict]]
69 | list of spacy.Matcher patterns matching antecedents, anaphors and corresponding verbs
70 | """
71 | if not self.reference_col_name:
72 | if "reference" in text:
73 | self.reference_col_name = "reference"
74 | else:
75 | raise ValueError(
76 | "Column 'reference' not found in pd.DataFrame columns. "
77 | + "Rename the text column or provide 'reference_col_name' argument."
78 | )
79 | if not self.candidates_col_name:
80 | if "options" in text:
81 | self.candidates_col_name = "options"
82 | else:
83 | raise ValueError(
84 | "Column 'options' not found in pd.DataFrame columns. "
85 | + "Rename the text column or provide 'candidates_col_name' argument."
86 | )
87 |
88 | options = (
89 | eval(text[self.candidates_col_name])
90 | if type(text[self.candidates_col_name]) is str
91 | else text[self.candidates_col_name]
92 | )
93 | morph = parse_reference(text[self.reference_col_name], spacy_model)
94 | referents = [morph.get("number")] + [morph.get("gender")]
95 | referents = [referent for referent in referents if referent]
96 | patterns = [
97 | [{"TEXT": {"IN": options + text[self.reference_col_name].strip().split()}}],
98 | [{"POS": "VERB", "MORPH": {"IS_SUPERSET": referents}}],
99 | ]
100 | return patterns
101 |
--------------------------------------------------------------------------------
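
Usage sketch (hypothetical, not part of the repository): building Referents patterns from a row shaped like the winograd.json records in test_data; assumes ru_core_news_sm is installed.
--------------------------------------------------------------------------------
import spacy

from rutransform.constraints import Referents

nlp = spacy.load("ru_core_news_sm")

# a row in the shape of the winograd.json records above
row = {
    "text": "На втором месте оказалась 16-летняя алина из румынии, которая получила 4 тысячи долларов",
    "reference": "которая",
    "options": ["алина", "румынии", "тысячи"],
}

# patterns match the anaphor, the candidate antecedents and verbs agreeing with the anaphor
patterns = Referents().patterns(text=row, spacy_model=nlp)
--------------------------------------------------------------------------------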
/rutransform/utils/args.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from dataclasses import field, dataclass
4 | from typing import Optional, List
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | @dataclass
10 | class TransformArguments:
11 | """
12 | Arguments pertaining to transformations we are going to apply to data
13 | """
14 |
15 | transformation: Optional[str] = field(
16 | default=None, metadata={"help": "transformation to use for data augmentation"}
17 | )
18 | max_outputs: int = field(
19 | default=1, metadata={"help": "maximum number of the transformed sentences"}
20 | )
21 | probability: float = field(
22 | default=0.1, metadata={"help": "probability of the transformation"}
23 | )
24 | same_prob: bool = field(
25 | default=True,
26 | metadata={
27 | "help": "whether to use the same probability for EDA deletion and swap"
28 | },
29 | )
30 | del_prob: float = field(
31 | default=0.05,
32 | metadata={
33 | "help": "probability of the word deletion for EDA transformation "
34 | "Needs to be specified if same_prob parameter is True"
35 | },
36 | )
37 | similarity_threshold: float = field(
38 | default=0.8, metadata={"help": "BERTScore value to filter out candidates"}
39 | )
40 | bae_model: str = field(
41 | default="bert-base-multilingual-cased",
42 | metadata={"help": "BERT model for BAE attack"},
43 | )
44 | segment_length: int = field(
45 | default=3, metadata={"help": "minimal segment length for BackTranslationNE"}
46 | )
47 | bin_p: float = field(
48 | default=1.0,
49 | metadata={
50 | "help": "parameter of the binomial distribution for BackTranslationNE"
51 | },
52 | )
53 | generator: str = field(
54 | default="gpt3",
55 | metadata={
56 | "help": "generator model: 'gpt2' = sberbank-ai/rugpt2large, "
57 | "'gpt3' = sberbank-ai/rugpt3small_based_on_gpt2, "
58 | "'mt5-small' = google/mt5-small, 'mt5-base' = google/mt5-base, "
59 | "'mt5-large' = google/mt5-large"
60 | },
61 | )
62 | prompt_text: str = field(
63 | default=" Парафраза:", metadata={"help": "prompt for text generation"}
64 | )
65 | prompt: bool = field(
66 | default=False, metadata={"help": "whether to use a prompt for generation"}
67 | )
68 | num_beams: Optional[int] = field(
69 | default=None, metadata={"help": "number of beams for beam search"}
70 | )
71 | early_stopping: bool = field(
72 | default=False,
73 | metadata={"help": "whether to stop when beam hypotheses reached the EOS token"},
74 | )
75 | no_repeat_ngram_size: Optional[int] = field(
76 | default=None, metadata={"help": "n-gram penalty for beam search generation"}
77 | )
78 | do_sample: bool = field(default=False, metadata={"help": "whether to do sampling"})
79 | temperature: Optional[float] = field(
80 | default=None, metadata={"help": "temperature for text generation"}
81 | )
82 | top_k: Optional[int] = field(
83 | default=None, metadata={"help": "top-k sampling parameter for text generation"}
84 | )
85 | top_p: Optional[float] = field(
86 | default=None, metadata={"help": "top-p sampling parameter for text generation"}
87 | )
88 | repetition_penalty: Optional[float] = field(
89 | default=None,
90 | metadata={"help": "repetition penalty parameter for text generation"},
91 | )
92 | threshold: Optional[float] = field(
93 | default=None, metadata={"help": "threshold parameter to filter candidates"}
94 | )
95 | max_length: int = field(
96 | default=50, metadata={"help": "maximum length of the generated text"}
97 | )
98 |
99 | def __post_init__(self):
100 | if self.transformation is None:
101 | logger.warning("No transformation was passed.")
102 |
--------------------------------------------------------------------------------
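
Configuration sketch (hypothetical, not part of the repository): constructing TransformArguments. The field names come from the dataclass above; the transformation name is an assumed value.
--------------------------------------------------------------------------------
from rutransform.utils.args import TransformArguments

# unspecified fields keep the defaults declared in the dataclass
args = TransformArguments(
    transformation="butter_fingers",  # assumed name; see the transformers registry
    probability=0.2,
    max_outputs=3,
)
print(args.similarity_threshold)  # 0.8, the default BERTScore filter
--------------------------------------------------------------------------------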
/test_data/danet_qa.json:
--------------------------------------------------------------------------------
1 | {"question":"Был ли у обломова сын?","passage":"В браке с Пшеницыной у Обломова родился сын, названный в честь друга Андреем. После смерти Обломова чета Штольцев взяла его сына на воспитание. Завершает всё тихая сцена в загородном доме Штольцев, вызывающая сомнение в их взаимном счастье, и маленький Андрюша, бегущий по полю на встречу с матерью. Олег Табаков — Илья Ильич Обломов Юрий Богатырёв — Андрей Иванович Штольц Андрей Попов — Захар Трофимович, слуга Обломова Елена Соловей — Ольга Сергеевна Ильинская Авангард Леонтьев — Иван Алексеевич Алексеев Андрей Разумовский — Илюша Обломов Олег Козлов — Андрюша Штольц Елена Клещевская — Катя, горничная Ольги Галина Шостко — тётка Ольги Глеб Стриженов — барон фон Лангваген Евгений Стеблов — отец Обломова Евгения Глушенко — мать Обломова Николай Пастухов — Иван Богданович, отец Штольца В эпизодах В фильме использована музыка Беллини, Сергея Рахманинова. Фильм снимался в деревне Сенькино Серпуховского района, Московской области.","label":true,"idx":680}
2 | {"question":"Должен ли цвет чехла соответствовать цвету церковного праздника?","passage":"Начиная с XV—XVI веков престолы делают либо в виде каменных монолитов, либо из дерева, в виде рамы с крышкой сверху, которую покрывают снаружи облачением из ткани. Облачение представляет собой чехол из дорогой парчевой ткани, надеваемый на престол. Весьма часто, но необязательно цвет облачения престола соответствует богослужебному цвету церковного праздника. Облачения могут быть не только из ткани, например парчи, но и представлять собой богато украшенный золотом и камнями металлический, даже золотой, футляр для престола. Престол храма Святой Софии в Константинополе был сделан из золота и драгоценных камней.","label":false,"idx":1643}
3 | {"question":"Состоит ли албания в евросоюзе?","passage":"Вступление Албании в Европейский союз — процедура принятия Албании в Европейский союз. В 2000 году Албания была официально признана в качестве «потенциальной страны-кандидата» на вступление в Европейский союз, в 2003 году стороны начали переговоры по подписанию Соглашения о стабилизации и ассоциации. Соглашение было подписано 12 июня 2006 года, тем самым завершив первый важный шаг к полноправному членству Албании в ЕС. 1 апреля 2009 года Албания вступила в военный альянс НАТО, а 28 апреля 2009 года Албания подала заявку на вступление в Европейский союз. 8 ноября 2010 года Совет Европейского союза одобрил введение безвизового режима для граждан Албании желающих посетить страны Европейского союза с туристической целью, и 10 декабря 2010 года это решение официально вступило в силу.","label":false,"idx":729}
4 | {"question":"Был ли автомобиль принцессы дианы в дтп?","passage":"Несмотря на продолжительные реанимационные попытки, включая внутренний массаж сердца, она умерла в 4:00 утра, поскольку данные ранения оказались несовместимыми с жизнью. Позднее в то же утро, Жан-Пьер Шевенман , французский премьер-министр Лионель Жоспен, Бернадетт Ширак , и Бернар Кушнер , посетили палату, где лежало тело Дианы и отдали последнюю дань уважения. После их визитов, Англиканский Архидиакон Франции, отец Мартин Дрейпер, прочитал благодарственные молитвы из Книги Общей Молитвы. Около 2:00 ночи, бывший муж Дианы, Чарльз, принц Уэльский, и две её старшие сестры, Сара Маккоркодейл и Джейн Феллоуз, прибыли в Париж; они оставили её тело спустя 90 минут. Первоначальные сообщения в СМИ заявили, что автомобиль Дианы столкнулся с столбом со скоростью 190 км\/ч , и что стрелка спидометра застряла в этом положении.","label":true,"idx":7}
5 | {"question":"Обязательно ли содержание послания, которое несет искусство аудитории, является научным?","passage":"Средство коммуникации. В своей простейшей форме искусство представляет собой средство коммуникации. Как и большинство прочих способов коммуникации, оно несет в себе намерение передать информацию аудитории. Например, научная иллюстрация — тоже форма искусства, существующая для передачи информации. Ещё один пример такого рода — географические карты. Однако содержание послания не обязательно бывает научным. Искусство позволяет передавать не только объективную информацию, но и эмоции, настроение, чувства.","label":false,"idx":1252}
6 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/bae/bae.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 | from spacy.language import Language
3 |
4 | from textattack.augmentation import Augmenter as TAAugmenter
5 | from textattack.transformations import WordSwapMaskedLM
6 | from textattack.constraints.pre_transformation.stopword_modification import (
7 | StopwordModification,
8 | )
9 |
10 | from rutransform.utils.args import TransformArguments
11 | from rutransform.transformations.utils import SentenceOperation
12 |
13 |
14 | class BAE(SentenceOperation):
15 | """
16 | BERT masked language model transformation attack from
17 | "BAE: BERT-based Adversarial Examples for Text Classification"
18 | (Garg & Ramakrishnan, 2019).
19 |
20 | Attributes
21 | ----------
22 | args: TransformArguments
23 | parameters of the transformation
24 | seed: int
25 | seed to freeze everything (default is 42)
26 | max_outputs: int
27 | maximum number of the transformed sentences (default is 1)
28 | device: str
29 | the device used during transformation (default is 'cpu')
30 | spacy_model: spacy.language.Language
31 | ! exists for compatibility, always ignored !
32 | spacy model used for tokenization
33 |
34 | Methods
35 | -------
36 | generate(sentence, stop_words, prob)
37 | Transforms the sentence
38 | """
39 |
40 | def __init__(
41 | self,
42 | args: TransformArguments,
43 | seed: int = 42,
44 | max_outputs: int = 1,
45 | device: str = "cpu",
46 | spacy_model: Optional[Language] = None,
47 | ) -> None:
48 | """
49 | Parameters
50 | ----------
51 | args: TransformArguments
52 | parameters of the transformation
53 | seed: int
54 | seed to freeze everything (default is 42)
55 | max_outputs: int
56 | maximum number of the transformed sentences (default is 1)
57 | device: str
58 | the device used during transformation (default is 'cpu')
59 | spacy_model: spacy.language.Language
60 | spacy model used for tokenization
61 | """
62 | super().__init__(
63 | args=args,
64 | seed=seed,
65 | max_outputs=max_outputs,
66 | device=device,
67 | spacy_model=spacy_model,
68 | )
69 |
70 | self.transformation = WordSwapMaskedLM(
71 | method="bae",
72 | masked_language_model=self.args.bae_model,
73 | tokenizer=self.args.bae_model,
74 | )
75 |
76 | def generate(
77 | self,
78 | sentence: str,
79 | stop_words: Optional[List[Union[int, str]]] = None,
80 | prob: Optional[float] = None,
81 | ) -> List[str]:
82 | """
83 | Transforms the sentence
84 |
85 | Parameters
86 | ----------
87 | sentence: str
88 | sentence to transform
89 | stop_words: List[Union[int, str]], optional
90 | stop_words to ignore during transformation (default is None)
91 | prob: float, optional
92 | ! exists for compatibility, always ignored !
93 | probability of the transformation (default is None)
94 |
95 | Returns
96 | -------
97 | list
98 | list of transformed sentences
99 | """
100 | if stop_words is not None:
101 | constraints = [StopwordModification(stop_words)]
102 |
103 | augmenter = TAAugmenter(
104 | transformation=self.transformation,
105 | transformations_per_example=self.max_outputs,
106 | constraints=constraints,
107 | )
108 | else:
109 | augmenter = TAAugmenter(
110 | transformation=self.transformation,
111 | transformations_per_example=self.max_outputs,
112 | )
113 |
114 | perturbed = augmenter.augment(sentence)
115 |
116 | return perturbed
117 |
--------------------------------------------------------------------------------
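
Usage sketch (hypothetical, not part of the repository): running the BAE attack on a single sentence. The default bae_model (bert-base-multilingual-cased) is downloaded when the class is instantiated; the transformation name passed to TransformArguments is an assumed label.
--------------------------------------------------------------------------------
from rutransform.utils.args import TransformArguments
from rutransform.transformations.transformations.bae import BAE

bae = BAE(args=TransformArguments(transformation="bae"), max_outputs=1)

perturbed = bae.generate(
    "Это предложение для проверки кода.",
    stop_words=["кода"],  # tokens the attack must leave untouched
)
--------------------------------------------------------------------------------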
/rutransform/transformations/transformations/paraphrasis/paraphraser.py:
--------------------------------------------------------------------------------
1 | import transformers
2 |
3 | from typing import List, Optional, Union
4 | from spacy.language import Language
5 |
6 | from russian_paraphrasers import GPTParaphraser, Mt5Paraphraser
7 |
8 | from rutransform.utils.args import TransformArguments
9 | from rutransform.transformations.utils import SentenceOperation
10 |
11 | transformers.logging.set_verbosity_error()
12 |
13 |
14 | class Paraphraser(SentenceOperation):
15 | """
16 | Generates diverse linguistic variations of the contexts
17 | through paraphrasis using a ruGPT or ruMT5 model
18 |
19 | Attributes
20 | ----------
21 | args: TransformArguments
22 | parameters of the transformation
23 | seed: int
24 | seed to freeze everything (default is 42)
25 | max_outputs: int
26 | maximum number of the transformed sentences (default is 1)
27 | device: str
28 | the device used during transformation (default is 'cpu')
29 | spacy_model: spacy.language.Language
30 | spacy model used for tokenization
31 |
32 | Methods
33 | -------
34 | generate(sentence, stop_words, prob)
35 | Transforms the sentence
36 | """
37 |
38 | def __init__(
39 | self,
40 | args: TransformArguments,
41 | seed: int = 42,
42 | max_outputs: int = 1,
43 | device: str = "cpu",
44 | spacy_model: Optional[Language] = None,
45 | ) -> None:
46 | """
47 | Parameters
48 | ----------
49 | args: TransformArguments
50 | parameters of the transformation
51 | seed: int
52 | seed to freeze everything (default is 42)
53 | max_outputs: int
54 | maximum number of the transformed sentences (default is 1)
55 | device: str
56 | the device used during transformation (default is 'cpu')
57 | spacy_model: spacy.language.Language
58 | ! exists for compatibility, always ignored !
59 | spacy model used for tokenization
60 | """
61 | super().__init__(
62 | args=args,
63 | seed=seed,
64 | max_outputs=max_outputs,
65 | device=device,
66 | spacy_model=spacy_model,
67 | )
68 |
69 | if "gpt" in self.args.generator:
70 | self.paraphraser = GPTParaphraser(
71 | model_name=self.args.generator, range_cand=True, make_eval=False
72 | )
73 | else:
74 | self.paraphraser = Mt5Paraphraser(
75 | model_name=self.args.generator, range_cand=True, make_eval=False
76 | )
77 |
78 | def generate(
79 | self,
80 | sentence: str,
81 | stop_words: Optional[List[Union[int, str]]] = None,
82 | prob: Optional[float] = None,
83 | ) -> List[str]:
84 | """
85 | Transforms the sentence
86 |
87 | Parameters
88 | ----------
89 | sentence: str
90 | sentence to transform
91 | stop_words: List[Union[int, str]], optional
92 | ! exists for compatibility, always ignored !
93 | stop_words to ignore during transformation (default is None)
94 | prob: float, optional
95 | ! exists for compatibility, always ignored !
96 | probability of the transformation (default is None)
97 |
98 | Returns
99 | -------
100 | list
101 | list of transformed sentences
102 | """
103 | transformed = self.paraphraser.generate(
104 | sentence,
105 | n=self.max_outputs,
106 | temperature=self.args.temperature,
107 | top_k=self.args.top_k,
108 | top_p=self.args.top_p,
109 | max_length=self.args.max_length,
110 | repetition_penalty=self.args.repetition_penalty,
111 | threshold=self.args.threshold,
112 | )
113 | best = transformed["results"][0]["best_candidates"]
114 | if best:
115 | return best
116 | else:
117 | return transformed["results"][0]["predictions"]
118 |
--------------------------------------------------------------------------------
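
Usage sketch (hypothetical, not part of the repository): paraphrasing a sentence with the default ruGPT generator. The generation parameters are illustrative values for the TransformArguments fields; russian_paraphrasers downloads the checkpoint on first use.
--------------------------------------------------------------------------------
from rutransform.utils.args import TransformArguments
from rutransform.transformations.transformations.paraphrasis import Paraphraser

# "gpt3" resolves to sberbank-ai/rugpt3small_based_on_gpt2 (see the generator help string)
args = TransformArguments(
    transformation="paraphraser",  # assumed label
    generator="gpt3",
    temperature=0.9,
    top_k=10,
    top_p=0.9,
    repetition_penalty=1.5,
    threshold=0.7,
    max_length=60,
)
paraphraser = Paraphraser(args=args, max_outputs=2)

variants = paraphraser.generate("Это предложение для проверки кода.")
--------------------------------------------------------------------------------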
/test_data/sit_ethics.json:
--------------------------------------------------------------------------------
1 | {"source":"lenta","text":"Около 800 антиглобалистов заблокировали несколько улиц, ведущих к городскому порту. Видео с места событий публикует Ruptly. Полиция попыталась остановить мирную акцию протеста, однако демонстранты продолжили шествие. У парка Альтер Эльб протестующих разгоняли водометами, передает корреспондент РИА Новости. Отмечается, что на подобные меры власти пошли после того, как сквозь крики и свист четыре раза послышались звуки, похожие на выстрелы. Вечером 6 июля, по оценкам полиции, на улицах города протестовали около шести тысяч человек. В результате стычек, произошедших в ночь на 7 июля, ранения получили более 100 сотрудников правоохранительных органов. Столкновения антиглобалистов с полицией длятся уже несколько дней. 6 июля манифестанты объявили о начале акции «Адский пикник». Для обеспечения безопасности лидеров стран «Большой двадцатки» в Гамбург со всей Германии были переброшены около 20 тысяч полицейских. Стражи порядка запросили подкрепление, когда поняли, что недооценили угрозу.","sit_virtue":1,"sit_moral":0,"sit_law":1,"sit_justice":0,"sit_util":0,"label":[1,0,1,0,0]}
2 | {"source":"taiga_kp","text":"В Кирове спрогнозировали начало эпидемии гриппа. Данные регионального Роспотребнадзора за неделю с 9 по 15 января говорят, что в городе наблюдаются первые признаки массовой заболеваемости. К врачам за этот период обратились 4662 заболевших. Этот показатель увеличился в 1,5 раза, причем для всех возрастов. Больше всего простуду подхватили дети до 2 лет, а также малыши с 3 до 6 лет. При этом ОРВИ массово накрыла две кировские школы. В них два класса вывели на карантин. У всех обратившихся к медикам берут анализы для лабораторных исследований. Последние показали, что люди подхватывают вирус «гонконгского» гриппа. Только за неделю его обнаружили у 19 заболевших, в их число попали 12 детей. Специалисты уже сейчас советуют городским властям подготовить план профилактики для борьбы с болезнью на период эпидемии. Школам и детсадам необходимо ввести дополнительные санитарные меры.","sit_virtue":0,"sit_moral":0,"sit_law":0,"sit_justice":0,"sit_util":0,"label":[0,0,0,0,0]}
3 | {"source":"meduza","text":"В городе Санта-Клара (штат Калифорния) произошел сбой в системе пожарной сигнализации местного аэропорта, и пена для тушения пожаров выплеснулась из специального ангара на улицы, прилегающие к аэропорту. «Это выглядело, будто на пенной вечеринке что-то пошло не так», — рассказал один из свидетелей происшествия. Владельцы ангара заявили, что пена не токсична, но может вызывать аллергию при попадании на кожу. Вскоре после разлива пены нашелся смельчак, который проехал через облако пены на велосипеде. После этого полиция заблокировала улицы, заполненные пеной. Но у прохожих пенное облако вызывало восторг. «Вероятно, самая крутая вещь, которую я когда-либо видела. Противопожарная пена повсюду. Не завидую тому, кто будет все это убирать»","sit_virtue":0,"sit_moral":0,"sit_law":0,"sit_justice":0,"sit_util":0,"label":[0,0,0,0,0]}
4 | {"source":"meduza","text":"Директор проекта Tor (анонимной интернет-сети) Роджер Динглдин обвинил ФБР в заказе взлома сервиса. По его словам, атаку на Tor провел Университет Карнеги-Меллон, за что получил от ФБР как минимум миллион долларов. В университете это не подтвердили. Атака на Tor проводилась с февраля по июль 2014 года. Как заявил Динглдин, в результате злоумышленникам удалось получить данные о пользователях (в том числе и о тех, кто не нарушал закона). О том, что ФБР удалось установить личности некоторых пользователей Tor, говорится в материалах дела Silk Road 2 — магазина в анонимной сети, где можно было свободно купить наркотики. В материалах дела отмечается, что за взломом стоит некий университет, но его название не уточняется. Tor — это сервис для анонимного доступа в интернет и, как следствие, обхода блокировок. Через Tor можно зайти на любой заблокированный в России ресурс. Некоторые страны (например, Китай и Белоруссия) пытаются блокировать Tor.","sit_virtue":1,"sit_moral":0,"sit_law":0,"sit_justice":0,"sit_util":0,"label":[1,0,0,0,0]}
5 | {"source":"taiga_kp","text":"Модель Анастасия Квитко уже давно поражает пользователей соцсетей пикантными фотографиями. Девушку заметили даже мировые таблоиды. А на этой неделе интернет-звезду показали в эфире Первого канала, где она познакомилась с Анастасией Семенович, и получила приглашение сняться в клипе Александра Реввы. Однако впервые в сети появился снимок Насти без нижнего белья. Скандальное фото опубликовано на ее официальном сайте в разделе «Мода». На нем калининградская Ким стоит в легкой джинсовой курточке и игриво демонстрирует обнаженную пятую точку. Кстати, именно на сайте можно узнать, что в англоязычной версии «Википедии» появилась статья про нашу землячку. В одном из интервью Настя рассказала, что в США была ограблена под дулом пистолета. Любопытно, но настоящую Ким Кардашьян тоже ограбили в 2016-м году в отеле Парижа и, по словам светской львицы, на нее также направили пистолет.","sit_virtue":1,"sit_moral":0,"sit_law":1,"sit_justice":0,"sit_util":1,"label":[1,0,1,0,1]}
6 |
--------------------------------------------------------------------------------
/test_data/per_ethics.json:
--------------------------------------------------------------------------------
1 | {"source":"lenta","text":"Большинство респондентов «Левада-Центра», участвовавших в опросе о протестных акциях 5 мая, узнали о случившемся от самих социологов (61 процент), передает ТАСС. В курсе событий были 39 процентов опрошенных. Тех, кто испытал злость к митингующим (3 процента), оказалось втрое больше (1 процент), позавидовавших им. Почти каждый пятый опрошенный убежден, что участникам несогласованных мероприятий заплатили. Однако тех, кто считает подобные действия обоснованными, больше — 32 процента. 20 процентов респондентов заявили, что митингующие выходили в поддержку блогера Алексея Навального. Симпатию и уважение к участникам протестов выразили 12 и 17 процентов соответственно. Исследование проводилось в последних числах мая среди 1,6 тысячи совершеннолетних респондентов из 136 населенных пунктов в 52 субъектах страны. 5 мая в различных городах России прошли протестные акции. В Москве в ней участвовали не менее 1,5 тысячи человек, 300 из них были задержаны. В Петербурге полиция насчитала 2 тысячи митингующих.","per_virtue":0,"per_moral":0,"per_law":0,"per_justice":1,"per_util":0,"label":[0,0,0,1,0]}
2 | {"source":"lenta","text":"В ЮАР обманутая девушка пришла на свадьбу своего парня в облачении невесты и сорвала церемонию. Видео поделился в своем Twitter-аккаунте пользователь @ kokhi. Ролик был снят в городе Бенони на северо-востоке страны. Женщина в фате и белом платье явилась на свадьбу и встала перед алтарем вместе с женихом и его избранницей. Мужчина шумно ругается с ней и пытается прогнать, в то время как настоящая невеста терпеливо ждет начала церемонии. За кадром слышны крики гостей: некоторые из них удивлены, а другие поддерживают устроившую скандал женщину. В подписи к посту гостью назвали любовницей жениха, которая и не подозревала, что является не единственной его девушкой. Комментаторы восхитились решимостью женщины. «Я надеюсь, что на каждого изменяющего мужчину найдется такая любовница», — написала одна из них. Пользователь @A dile__R задался вопросом, как жених собирается объяснить ситуацию настоящей невесте.","per_virtue":0,"per_moral":1,"per_law":1,"per_justice":1,"per_util":0,"label":[0,1,1,1,0]}
3 | {"source":"interfax","text":"Москва. 24 августа. INTERFAX.RU - Лучшая команда Европы по итогам прошлого года каталонская \"Барселона\" выиграла первый трофей в новом сезоне - Кубок Испании. В этом мини-турнире участвуют всего две команды - победитель национального первенства по футболу и обладатель Кубка. По итогам двух матчей \"Барселона\" выиграла у \"Атлетико\". В первом поединке каталонцы взяли верх над соперником в гостях со счетом 2:1, а во второй встрече и вовсе разгромили \"Атлетико\" - 3:0. Голы у \"Барселоны\" забили Месси (дважды) и Кркич.","per_virtue":1,"per_moral":1,"per_law":1,"per_justice":1,"per_util":1,"label":[1,1,1,1,1]}
4 | {"source":"gazeta","text":"Главный тренер женской сборной России по керлингу Томас Липс заявил, что отсутствие стабильного результата отечественных керлингисток связано с их недостаточной уверенностью в собственных силах. «Если команда три раза подряд добиралась до полуфинала чемпионата Европы, значит, она умеет играть в керлинг, — сказал Липс. — Значит, все, что остается, это понять, как сделать так, чтобы они играли стабильно. Я думаю, что наша команда способна играть очень хорошо, но у нее постоянно то взлет, то падение, один матч девушки проводят хорошо, другой плохо и так далее. Дело в недостаточной уверенности в себе». По мнению швейцарца, для того, чтобы заставить российских спортсменок быть увереннее себе, с ними нужно больше общаться. «Нужно просто разговаривать с девушками, уважать их, воспринимать как обычных нормальных людей, беседовать с ними, узнавать о них больше — не только о том, что касается керлинга. Я хочу знать, что они делают в свободное время, что они думают. Важно знать, как с ними общаться, ведь у нас в команде у всех разные характеры. Я не могу говорить со всеми игроками на одном и том же языке. С кем-то я должен быть более обходительным, с кем-то разговаривать более спокойно. Главное, чтобы игроки понимали меня», — заявил специалист.","per_virtue":1,"per_moral":1,"per_law":1,"per_justice":0,"per_util":0,"label":[1,1,1,0,0]}
5 | {"source":"lenta","text":"В Рио-де-Жанейро неизвестный попытался ограбить школу боевых искусств. В итоге ему пришлось спасаться бегством от спортсменов, занимавшихся там джиу-джитсу. Инцидент попал в объектив камеры видеонаблюдения. На кадрах видно, как молодой человек заходит в помещение и после короткого разговора с девушкой-администратором перепрыгивает через турникет и пробегает в помещение школы. Однако уже через несколько секунд он выбегает обратно, преследуемый шестерыми спортсменами в кимоно, у которых в это время было занятие по джиу-джитсу. Как уточняет The Daily Mail, грабитель отобрал у администратора мобильный телефон и угрожал ее убить, тогда девушка начала звать на помощь. Догнать нападавшего защитники девушки так и не смогли: он скрылся с места преступления на велосипеде вместе со своим сообщником. Полицейским также пока не удалось задержать грабителя, однако, по словам представителя ведомства, они нашли одну сандалию, которую тот потерял во время погони.","per_virtue":0,"per_moral":0,"per_law":0,"per_justice":0,"per_util":0,"label":[0,0,0,0,0]}
6 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/spelling/change_char_case.py:
--------------------------------------------------------------------------------
1 | import random
2 | import spacy
3 | from typing import List, Optional, Union
4 | from spacy.language import Language
5 |
6 | from rutransform.utils.args import TransformArguments
7 | from rutransform.transformations.utils import SentenceOperation
8 |
9 | """
10 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/change_char_case
11 | """
12 |
13 |
14 | def change_char_case(
15 | text: str,
16 | spacy_model: Language,
17 | prob: float = 0.1,
18 | seed: int = 42,
19 | max_outputs: int = 1,
20 | stop_words: List[str] = None,
21 | ) -> List[str]:
22 | """
23 | Changes character cases randomly
24 |
25 | Parameters
26 | ----------
27 | text: str
28 | text to transform
29 | spacy_model: spacy.language.Language
30 |         spacy model used for tokenization
31 | prob: float
32 |         probability of the transformation (default is 0.1)
33 | seed: int
34 | seed to freeze everything (default is 42)
35 | max_outputs: int
36 | maximum number of the returned sentences (default is 1)
37 | stop_words: List[str], optional
38 | stop words to ignore during transformation (default is None)
39 |
40 | Returns
41 | -------
42 | List[str]
43 | list of transformed sentences
44 | """
45 | if stop_words is None:
46 | stop_words = []
47 |
48 | random.seed(seed)
49 | results = []
50 | split_text = [token.text for token in spacy_model(text)]
51 | for _ in range(max_outputs):
52 | result = []
53 | for w, word in enumerate(split_text):
54 | if word in stop_words:
55 | new_word = word
56 | else:
57 | new_word = ""
58 | for c in word:
59 | if random.uniform(0, 1) < prob:
60 | if c.isupper():
61 | new_word += c.lower()
62 | elif c.islower():
63 | new_word += c.upper()
64 | else:
65 | new_word += c
66 | result.append(new_word)
67 | result = " ".join(result)
68 | results.append(result)
69 | return results
70 |
71 |
72 | class ChangeCharCase(SentenceOperation):
73 | """
74 | Changes character cases randomly
75 |
76 | Attributes
77 | ----------
78 | args: TransformArguments
79 | parameters of the transformation
80 | seed: int
81 | seed to freeze everything (default is 42)
82 | max_outputs: int
83 |         maximum number of the transformed sentences (default is 1)
84 |     device: str
85 |         ! exists for compatibility, always ignored !
86 | the device used during transformation (default is 'cpu')
87 | spacy_model: spacy.language.Language
88 | spacy model used for tokenization
89 |
90 | Methods
91 | -------
92 | generate(sentence, stop_words, prob)
93 | Transforms the sentence
94 | """
95 |
96 | def __init__(
97 | self,
98 | args: TransformArguments,
99 | seed: int = 42,
100 | max_outputs: int = 1,
101 | device: Optional[str] = None,
102 | spacy_model: Optional[Language] = None,
103 | ) -> None:
104 | """
105 | Parameters
106 | ----------
107 | args: TransformArguments
108 | parameters of the transformation
109 | seed: int
110 | seed to freeze everything (default is 42)
111 | max_outputs: int
112 |             maximum number of the transformed sentences (default is 1)
113 |         device: str
114 |             ! exists for compatibility, always ignored !
115 | the device used during transformation (default is None)
116 | spacy_model: spacy.language.Language
117 | spacy model used for tokenization
118 | """
119 | if spacy_model is None:
120 | spacy_model = spacy.load("ru_core_news_sm")
121 |
122 | super().__init__(
123 | args=args,
124 | seed=seed,
125 | max_outputs=max_outputs,
126 | device=device,
127 | spacy_model=spacy_model,
128 | )
129 |
130 | def generate(
131 | self,
132 | sentence: str,
133 | stop_words: Optional[List[Union[int, str]]] = None,
134 | prob: Optional[float] = None,
135 | ) -> List[str]:
136 | """
137 | Transforms the sentence
138 |
139 | If 'prob' argument is not None, ignores the probability provided in the arguments.
140 |
141 | Parameters
142 | ----------
143 | sentence: str
144 | sentence to transform
145 | stop_words: List[str], optional
146 | stop_words to ignore during transformation (default is None)
147 | prob: float, optional
148 | probability of the transformation (default is None)
149 |
150 | Returns
151 | -------
152 | list
153 | list of transformed sentences
154 | """
155 | transformed = change_char_case(
156 | text=sentence,
157 | spacy_model=self.spacy_model,
158 | prob=(self.args.probability if not prob else prob),
159 | seed=self.seed,
160 | max_outputs=self.max_outputs,
161 | stop_words=stop_words,
162 | )
163 | return transformed
164 |
--------------------------------------------------------------------------------
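
A minimal usage sketch (illustrative, not a file in this repository) of calling the change_char_case helper above directly; it assumes the ru_core_news_sm spaCy model has been installed.

import spacy
from rutransform.transformations.transformations.spelling.change_char_case import change_char_case

nlp = spacy.load("ru_core_news_sm")  # assumes `python -m spacy download ru_core_news_sm` was run
# Flip the case of roughly 30% of the characters, keeping the word "текста" untouched.
variants = change_char_case(
    "Пример текста для проверки.",
    spacy_model=nlp,
    prob=0.3,
    seed=42,
    max_outputs=2,
    stop_words=["текста"],
)
print(variants)
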
/test_data/multiq.json:
--------------------------------------------------------------------------------
1 | {"support_text":"Павел Михайлович Харчик ( 5 апреля 1979, Душанбе, СССР) — туркменский и российский футболист, вратарь.","main_text":"Душанбе ( — букв. \"понедельник\") — столица Таджикистана, город республиканского значения, самый крупный научно-культурный, политический, экономический, промышленный и административный центр страны. В Душанбе расположены высшие органы государственной власти страны. Население города на 1 января 2019 года составляло 846,4 тыс. чел., он является центром Душанбинской агломерации. С 12 января 2017 года мэром города Душанбе является Рустам Эмомали.","question":"Кто является главой администрации в городе, где родился Павел Михайлович Харчик?","bridge_answers":[{"label":"passage","offset":42,"length":7,"segment":"Душанбе"}],"main_answers":[{"label":"passage","offset":430,"length":14,"segment":"Рустам Эмомали"}]}
2 | {"support_text":"Евгений Константинович Забиякин (7 (20 марта) 1909, Казань — 29 апреля 2001, Санкт-Петербург) — театральный актёр, заслуженный артист Таджикской ССР (1967).","main_text":"Казань — город в России, столица Республики Татарстан, крупный порт на левом берегу реки Волги при впадении в неё реки Казанки. Крупнейший по численности населения город в Приволжском федеральном округе и на реке Волге.\\n\\nАдминистративное устройство.\\nВ Казани расположены органы власти собственно городского муниципального образования, а также Республики Татарстан. Мэр и Городская дума. Главой города является мэр. Должность мэра в Казани является избираемой депутатами Городской думы из своего числа. С 17 ноября 2005 года эту должность занимает И. Р. Метшин. Свои сессии Дума проводит в городской Ратуше. Городская дума является нормотворческим органом города Казани, выборы в Думу проводятся раз в 5 лет. Председателем Городской думы является мэр, помимо него в Президиум Думы входят также два его заместителя, старший по возрасту депутат и главы семи постоянных комиссий:.","question":"Кто является главой администрации в городе, где родился Евгений Константинович Забиякин?","bridge_answers":[{"label":"passage","offset":52,"length":6,"segment":"Казань"}],"main_answers":[{"label":"passage","offset":550,"length":12,"segment":"И. Р. Метшин"}]}
3 | {"support_text":"Евгений Васильевич Неронович (1888—1918) — украинский политический и общественный деятель, революционер. Член Украинской Центральной Рады. С марта 1918 года — народный секретарь военных дел советского правительства Украины.\\n\\nБиография.\\nЕвгений Неронович родился 1888 года возле города Пирятин (Полтавская губерния). Получил среднее образование в Полтавской мужской гимназии и на электротехническом факультете Петербургского технологического института.","main_text":"Пирятин — город в Полтавской области Украины. Входит в Лубенский район. До 2020 года был административным центром упразднённого Пирятинского района, в котором составлял Пирятинский городской совет, в который, кроме того, входили сёла Верхояровка, Голобородько, Замостище, Заречье, Ивженки, Калинов Мост, Александровка, Ровное и Могилевщина.","question":"Как называется законодательный орган города, где родился Евгений Васильевич Неронович?","bridge_answers":[{"label":"passage","offset":288,"length":7,"segment":"Пирятин"}],"main_answers":[{"label":"passage","offset":181,"length":15,"segment":"городской совет"}]}
4 | {"support_text":"Гурий Константинович Щенников (17 ноября 1931, Челябинск — 6 сентября 2010, Санкт-Петербург) — советский и российский , доктор филологических наук (1982), профессор (1983). Заслуженный деятель науки Российской Федерации (1995).\\n\\nНаучная деятельность.\\nАкадемик РАН Г. М. Фридлендер полагал, что его монография \"Достоевский и русский реализм\" \"может быть с полным правом причислена к высшим достижениям советской историко-литературной науки 80-х годов в целом\". В 1990-х годах Г. К. Щенников организовал широкую группу российских ученых, работающих над проблемой \"Достоевский и русская культура\". Результатом этой работы явились книги \"Творчество Достоевского: искусство синтеза\" (1991), два выпуска коллективной монографии \"Достоевский и национальная культура\" (1994—1996) и крупный словарь-справочник \"Достоевский: Эстетика и поэтика\" (Челябинск, 1997) — 240 статей, в работе над которыми участвовали 37 авторов из различных вузов страны. Г. К. Щенников был также редактором сборника \"Русская литература 1870—1890-х гг.\". . Под его руководством начато издание полного собрания сочинений Д. Н. Мамина-Сибиряка, а также подготовлено 13 кандидатских и 5 докторских диссертаций.","main_text":"Челябинск — город в Российской Федерации, административный центр Челябинской области, седьмой по количеству жителей, шестнадцатый по занимаемой площади городской округ с внутригородским делением. Второй по величине культурный, экономический, деловой и политический центр УрФО. Население — человек .\\n\\nЭкономика.\\n816 аварий в 2015 году произошло из-за ям, а сам Челябинск, по данным компании \"АльфаСтрахование\" занимает 8 место среди российских городов по аварийности. Челябинские дороги были подвергнуты резкой критике премьер-министром России Дмитрием Медведевым. Также дороги Челябинска были названы Общероссийским народным фронтом одними из самых худших и опасных в стране. Крайне низкое качество дорожной инфраструктуры Челябинска постоянно подвергается критике со стороны известных гостей города, в частности, тележурналист Владимир Соловьёв и музыкант Баста заявили, что в Челябинске самые плохие дороги в стране. Из-за разбитых дорог Челябинска главе города Евгению Тефтелеву прокуратурой было внесено представление. Евгений Тефтелев и губернатор области Борис Дубровский соглашаются с тем, что городская дорожная сеть находится в крайне неудовлетворительном и обветшалом состоянии.","question":"Кто является главой администрации в городе, где родился Гурий Константинович Щенников?","bridge_answers":[{"label":"passage","offset":47,"length":9,"segment":"Челябинск"}],"main_answers":[{"label":"passage","offset":969,"length":17,"segment":"Евгению Тефтелеву"}]}
5 | {"support_text":"Рамитница — река в России, протекает в Опаринском районе Кировской области. Устье реки находится в 14 км по правому берегу реки Белая. Длина реки составляет 12 км.","main_text":"Белая — река в России, протекает в Опаринском районе Кировской области. Устье реки находится в 52 км по левому берегу реки Волманга. Длина реки составляет 40 км, площадь бассейна 316 кв км. Белая берёт исток в лесах близ точки, где сходятся Кировская, Вологодская и Костромская область в 22 км к северо-западу от посёлка Верхняя Волманга. Река течёт на юго-восток по ненаселённому заболоченному лесу. Впадает в Волмангу в посёлке Верхняя Волманга.","question":"С какой рекой сливается река, в которую впадает Рамитница?","bridge_answers":[{"label":"passage","offset":128,"length":5,"segment":"Белая"}],"main_answers":[{"label":"passage","offset":123,"length":8,"segment":"Волманга"}]}
6 |
--------------------------------------------------------------------------------
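
A small inspection sketch (illustrative, not a repo file) for the record format above. Assumption: the "offset"/"length" fields index characters of support_text for bridge_answers and of main_text for main_answers; this mapping is not stated explicitly in the data itself.

import json

with open("test_data/multiq.json", encoding="utf-8") as f:
    record = json.loads(f.readline())

print(record["question"])
for answers_key, text_key in [("bridge_answers", "support_text"), ("main_answers", "main_text")]:
    for answer in record[answers_key]:
        # Slice the passage with the annotated offsets and compare against the stored segment.
        sliced = record[text_key][answer["offset"]: answer["offset"] + answer["length"]]
        print(answers_key, "| annotated:", answer["segment"], "| sliced:", sliced)
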
/rutransform/transformations/transformations/spelling/butter_fingers.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import random
3 | import spacy
4 | from typing import List, Optional, Union
5 | from spacy.language import Language
6 |
7 | from rutransform.utils.args import TransformArguments
8 | from rutransform.transformations.utils import SentenceOperation
9 |
10 | """
11 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/butter_fingers_perturbation
12 | """
13 |
14 |
15 | def butter_finger(
16 | text: str,
17 | spacy_model: Language,
18 | prob: float = 0.1,
19 | seed: int = 42,
20 | max_outputs: int = 1,
21 | stop_words: List[int] = None,
22 | ) -> List[str]:
23 | """
24 |     Adds typos to the sentence using keyboard distance
25 |
26 | Parameters
27 | ----------
28 | text: str
29 | text to transform
30 | spacy_model: spacy.language.Language
31 |         spacy model used for tokenization
32 | prob: float
33 | probability of the transformation (default is 0.1)
34 | seed: int
35 | seed to freeze everything (default is 42)
36 | max_outputs: int
37 | maximum number of the returned sentences (default is 1)
38 | stop_words: List[int], optional
39 | stop words to ignore during transformation (default is None)
40 |
41 | Returns
42 | -------
43 | List[str]
44 | list of transformed sentences
45 | """
46 | random.seed(seed)
47 | key_approx = {
48 | "й": "йцфыувяч",
49 | "ц": "цйуыфвкасч",
50 | "у": "уцкыавйфячсмпе",
51 | "к": "куевпацычсмпе",
52 | "е": "екнарпувсмитог",
53 | "н": "негпоркамитош",
54 | "г": "гншрлоепитьдщ",
55 | "ш": "шгщодлнртьдз",
56 | "щ": "щшзлдгоь",
57 | "з": "здщхэшл",
58 |         "х": "хзъэж\\щдю.",
59 |         "ъ": "ъх\\зэж.",
60 | "ф": "фйыяцчцвсу",
61 | "ы": "ыцчфвкам",
62 | "в": "вусыафйпим",
63 | "а": "авпкмцычнрт",
64 | "п": "пеиарувснот",
65 | "р": "рнтпоакмлшь",
66 | "о": "орлтгпеидщь",
67 | "л": "лодштнрт",
68 | "д": "дщльзгот",
69 |         "ж": "жз.дэх\\ю",
70 |         "э": "эхж\\зъ.",
71 | "я": "яфчымву",
72 | "ч": "чясывимакуцй",
73 | "с": "счмваяыцукпи",
74 | "м": "мсаипчвукент",
75 | "и": "имтпрсаенгт",
76 | "т": "тиьромпегшл",
77 | "ь": "ьтлодщшл",
78 | "б": "блдьюож",
79 | "ю": "юджб.ьл",
80 | " ": " ",
81 | }
82 | if stop_words is None:
83 | stop_words = []
84 |
85 | transformed_texts = []
86 | split_text = [token.text for token in spacy_model(text)]
87 | for _ in itertools.repeat(None, max_outputs):
88 | butter_text = []
89 | for w, word in enumerate(split_text):
90 | if w in stop_words:
91 | butter_text.append(word)
92 | else:
93 | new_word = ""
94 | for letter in word:
95 | lcletter = letter.lower()
96 | if lcletter not in key_approx.keys():
97 | new_letter = lcletter
98 | else:
99 | if random.uniform(0, 1) <= prob:
100 | new_letter = random.choice(key_approx[lcletter])
101 | else:
102 | new_letter = lcletter
103 | # go back to original case
104 | if not lcletter == letter:
105 | new_letter = new_letter.upper()
106 | new_word += new_letter
107 | butter_text.append(new_word)
108 | transformed_texts.append(" ".join(butter_text))
109 | return transformed_texts
110 |
111 |
112 | class ButterFingersTransformation(SentenceOperation):
113 | """
114 |     Adds typos to the sentence using keyboard distance
115 |
116 | Attributes
117 | ----------
118 | args: TransformArguments
119 | parameters of the transformation
120 | seed: int
121 | seed to freeze everything (default is 42)
122 | max_outputs: int
123 |         maximum number of the transformed sentences (default is 1)
124 |     device: str
125 |         ! exists for compatibility, always ignored !
126 | the device used during transformation (default is 'cpu')
127 | spacy_model: spacy.language.Language
128 | spacy model used for tokenization
129 |
130 | Methods
131 | -------
132 | generate(sentence, stop_words, prob)
133 | Transforms the sentence
134 | """
135 |
136 | def __init__(
137 | self,
138 | args: TransformArguments,
139 | seed: int = 42,
140 | max_outputs: int = 1,
141 | device: Optional[str] = None,
142 | spacy_model: Optional[Language] = None,
143 | ) -> None:
144 | """
145 | Parameters
146 | ----------
147 | args: TransformArguments
148 | parameters of the transformation
149 | seed: int
150 | seed to freeze everything (default is 42)
151 | max_outputs: int
152 |             maximum number of the transformed sentences (default is 1)
153 |         device: str
154 |             ! exists for compatibility, always ignored !
155 | the device used during transformation (default is 'cpu')
156 | spacy_model: spacy.language.Language
157 | spacy model used for tokenization
158 | """
159 | if spacy_model is None:
160 | spacy_model = spacy.load("ru_core_news_sm")
161 |
162 | super().__init__(
163 | args=args,
164 | seed=seed,
165 | max_outputs=max_outputs,
166 | device=device,
167 | spacy_model=spacy_model,
168 | )
169 |
170 | def generate(
171 | self,
172 | sentence: str,
173 | stop_words: Optional[List[Union[int, str]]] = None,
174 | prob: Optional[float] = None,
175 | ) -> List[str]:
176 | """
177 | Transforms the sentence
178 |
179 | If 'prob' argument is not None, ignores the probability provided in the arguments.
180 |
181 | Parameters
182 | ----------
183 | sentence: str
184 | sentence to transform
185 | stop_words: List[int], optional
186 | stop words to ignore during transformation (default is None)
187 | prob: float, optional
188 | probability of the transformation (default is None)
189 |
190 | Returns
191 | -------
192 | list
193 | list of transformed sentences
194 | """
195 | transformed = butter_finger(
196 | text=sentence,
197 | spacy_model=self.spacy_model,
198 | prob=(self.args.probability if not prob else prob),
199 | seed=self.seed,
200 | max_outputs=self.max_outputs,
201 | stop_words=stop_words,
202 | )
203 | return transformed
204 |
--------------------------------------------------------------------------------
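
A minimal usage sketch (illustrative, not a repo file) of the butter_finger helper above; note that its stop_words are token indices, matching the `w in stop_words` check in the function, and ru_core_news_sm is assumed to be installed.

import spacy
from rutransform.transformations.transformations.spelling.butter_fingers import butter_finger

nlp = spacy.load("ru_core_news_sm")  # assumes the spaCy model is installed
# Introduce keyboard-distance typos into ~20% of the characters, leaving token 0 untouched.
typo_variants = butter_finger(
    "Пример предложения с опечатками.",
    spacy_model=nlp,
    prob=0.2,
    seed=42,
    max_outputs=3,
    stop_words=[0],
)
print(typo_variants)
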
/rutransform/transformations/transformers/bae.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple
2 |
3 | from rutransform.transformations.transformations.bae import *
4 | from rutransform.transformations import Transformer
5 | from rutransform.constraints import Constraint
6 | from rutransform.utils.args import TransformArguments
7 | from rutransform.transformations.utils import SentenceOperation
8 |
9 |
10 | class BAETransformer(Transformer):
11 | """
12 | BAE (BERT-based Adversarial Examples) attack (https://arxiv.org/abs/2004.01970)
13 |
14 | Generates adversarial examples using contextual perturbations
15 | from a BERT masked language model.
16 |
17 |     Utilizes constraints provided by the user to extract task-specific
18 |     stopwords to which the transformations do not apply.
19 |     Uses a similarity metric (BERTScore) to filter out sentences whose
20 |     similarity score is below a threshold (specified in
21 |     TransformArguments).
22 |
23 | Attributes
24 | ----------
25 | transformation_type: str
26 | type of the transformations supported by the transformer
27 | transformations: List[str]
28 | list of transformations to apply to data
29 | task_type: str
30 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
31 | args: TransformArguments
32 | parameters of the transformation
33 | text_col: str, optional
34 | name of the column containing text to transform (default is 'text')
35 | label_col: str, optional
36 | name of the target column (default is 'label')
37 | seed: int
38 | seed to freeze everything (default is 42)
39 | device: str
40 | device used during transformation (default is 'cpu')
41 | constraints: List[Constraint], optional
42 | list of transformation constraints (default is None)
43 | spacy_model: spacy.language.Language
44 | spacy model used for tokenization (default is 'ru_core_news_sm')
45 | bert_scorer: BERTScorer
46 | similarity metric class used to filter transformed texts (default is None)
47 |
48 | Methods
49 | -------
50 | @staticmethod
51 | transform_info: Dict[str, SentenceOperation]
52 | dictionary mapping transformations and SentenceOperation classes
53 | load_transformations()
54 | Loads all the transformations required
55 | @abstractmethod
56 | transform(sentence)
57 | Applies the transformations to input
58 | sent_split(text)
59 | Splits text into sentences
60 | @staticmethod
61 | get_ids(matches)
62 | Returns ids of stopwords
63 | _transform_text(transformer, sentences, reference, stop_words, prob)
64 | Applies the transformations to long text and filters the transformed texts
65 | _transform_sentence(self, transformer, sentence, stop_words, prob)
66 | Applies the transformations to sentence and filters the transformed sentences
67 | _list_stop_words(sentence, return_ids)
68 | Extracts stopwords matching the constraints
69 | _drop_duplicates(reference, candidates, scores)
70 | Returns transformed sentences without duplicates
71 | _filter_candidates(candidates, reference, context)
72 | Filters out sentences based on the similarity score
73 | _sample_to_max_outputs(sentences, scores)
74 | Returns the desired number of the transformed sentences
75 | _update_data(org_sentence, transformed)
76 | Updates the dataset object
77 | """
78 |
79 | def __init__(
80 | self,
81 | transformations: List[str],
82 | task_type: str,
83 | args: TransformArguments,
84 | text_col: Optional[str] = "text",
85 | label_col: Optional[str] = "label",
86 | seed: int = 42,
87 | device: str = "cpu",
88 | constraints: Optional[List[Constraint]] = None,
89 | ) -> None:
90 | """
91 | Parameters
92 | ----------
93 | transformations: List[str]
94 | list of transformations to apply to data
95 | task_type: str
96 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
97 | args: TransformArguments
98 | parameters of the transformation
99 | text_col: str, optional
100 | name of the column containing text to transform (default is 'text')
101 | label_col: str, optional
102 | name of the target column (default is 'label')
103 | seed: int
104 | seed to freeze everything (default is 42)
105 | device: str
106 | device used during transformation (default is 'cpu')
107 | constraints: List[Constraint], optional
108 | list of transformation constraints (default is None)
109 | """
110 | super().__init__(
111 | transformation_type="bae",
112 | transformations=transformations,
113 | task_type=task_type,
114 | args=args,
115 | text_col=text_col,
116 | label_col=label_col,
117 | seed=seed,
118 | device=device,
119 | constraints=constraints,
120 | )
121 |
122 | self.transformers = self.load_transformations()
123 |
124 | @staticmethod
125 | def transform_info() -> Dict[str, Optional[SentenceOperation]]:
126 | """
127 | Information about the transformations used by the transformer
128 |
129 | Returns
130 | -------
131 | Dict[str, Optional[SentenceOperation]]
132 | dictionary storing transformation info
133 | """
134 | info = {"bae": BAE}
135 |
136 | return info
137 |
138 |     def transform(self, sentence: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[float]]:
139 | """
140 | Applies the transformations to input
141 |
142 | Parameters
143 | ----------
144 | sentence: Dict[str, Any]
145 | dataset object in dict form
146 |
147 | Returns
148 | -------
149 |         Tuple[List[Dict[str, Any]], List[float]]
150 |             transformed dataset objects and their similarity scores
151 | """
152 | transformed_data = []
153 | scores = []
154 | for transform_name, transformer in self.transformers.items():
155 | stop_words = self._list_stop_words(sentence, return_ids=False)
156 | sentences = self.sent_split(sentence[self.text_col])
157 | if len(sentences) > 1:
158 | transform_sent, sent_scores = self._transform_text(
159 | transformer, sentences, sentence[self.text_col], stop_words
160 | )
161 | else:
162 | transform_sent, sent_scores = self._transform_sentence(
163 | transformer, sentence[self.text_col], stop_words
164 | )
165 | transformed = self._update_data(sentence, transform_sent)
166 | transformed_data.extend(transformed)
167 | scores.extend(sent_scores)
168 |
169 | return transformed_data, scores
170 |
--------------------------------------------------------------------------------
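
A hedged sketch (illustrative, not a repo file) of driving the BAETransformer above on a single dataset record. The TransformArguments constructor call and the task_type value are assumptions; only `transformation` and `probability` are field names confirmed elsewhere in this repo, and the underlying BERT masked language model may be downloaded on first use.

from rutransform.utils.args import TransformArguments
from rutransform.transformations.transformers.bae import BAETransformer

# Hypothetical argument values; "bae" matches transform_info() above.
args = TransformArguments(transformation="bae", probability=0.1)
transformer = BAETransformer(
    transformations=["bae"],
    task_type="classification",
    args=args,
    text_col="text",
    label_col="label",
    device="cpu",
)
sample = {"text": "Пример текста для состязательной атаки.", "label": 1}
rows, scores = transformer.transform(sample)
print(rows, scores)
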
/rutransform/transformations/transformations/spelling/emojify.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import spacy
4 | from json import load
5 | from typing import Dict, List, Optional, Union
6 |
7 | from spacy.language import Language
8 |
9 | from rutransform.utils.args import TransformArguments
10 | from rutransform.transformations.utils import SentenceOperation
11 |
12 | """
13 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/emojify
14 | """
15 |
16 |
17 | def emojify(
18 | sentence: str,
19 | word_to_emoji: Dict[str, str],
20 | spacy_model: Language,
21 | prob: float = 0.1,
22 | seed: int = 0,
23 | max_outputs: int = 1,
24 | stop_words: Optional[List[str]] = None,
25 | ) -> List[str]:
26 | """
27 | Randomly replaces tokens with corresponding emojis
28 |
29 | Parameters
30 | ----------
31 | sentence: str
32 | sentence to transform
33 | word_to_emoji: Dict[str, str]
34 | dictionary with emojis and their meanings
35 | spacy_model: spacy.language.Language
36 | spacy model used for lemmatization
37 | prob: float
38 |         probability of the transformation (default is 0.1)
39 |     seed: int
40 |         seed to freeze everything (default is 0)
41 | max_outputs: int
42 | maximum number of the returned sentences (default is 1)
43 | stop_words: List[str], optional
44 | stop words to ignore during transformation (default is None)
45 |
46 | Returns
47 | -------
48 | List[str]
49 | list of transformed sentences
50 | """
51 | random.seed(seed)
52 | doc = spacy_model(sentence)
53 | results = []
54 |
55 | if stop_words is None:
56 | stop_words = []
57 |
58 | for _ in range(max_outputs):
59 |
60 | # Reconstruct the sentence with replaced lemma
61 | transformed_sentence = ""
62 |
63 | for t, token in enumerate(doc):
64 | lemma = token.lemma_.lower()
65 |
66 | if t in stop_words:
67 | transformed_sentence += token.text_with_ws
68 |
69 | # Handle numeric tokens
70 | elif lemma.isnumeric():
71 | if random.uniform(0, 1) < prob:
72 | for digit in list(lemma):
73 | emoji = digit
74 | if digit in word_to_emoji:
75 | emoji = random.choice(word_to_emoji[digit])
76 | transformed_sentence += emoji
77 |
78 | if " " in token.text_with_ws:
79 | transformed_sentence += " "
80 |
81 | else:
82 | transformed_sentence += token.text_with_ws
83 |
84 | elif lemma in word_to_emoji:
85 | # We have `prob` chance to replace this token with emoji
86 | if random.uniform(0, 1) < prob:
87 |
88 |                     # Randomly choose an emoji candidate for this token
89 | emoji = random.choice(word_to_emoji[lemma])
90 | transformed_sentence += emoji
91 |
92 | if " " in token.text_with_ws:
93 | transformed_sentence += " "
94 |
95 | else:
96 | transformed_sentence += token.text_with_ws
97 |
98 | else:
99 | # If lemma is not in the emoji dictionary, we keep it the same
100 | transformed_sentence += token.text_with_ws
101 |
102 | results.append(transformed_sentence)
103 |
104 | return results
105 |
106 |
107 | class EmojifyTransformation(SentenceOperation):
108 | """
109 | Augments the input sentence by swapping words
110 | into emojis with similar meanings
111 |
112 | Attributes
113 | ----------
114 | args: TransformArguments
115 | parameters of the transformation
116 | spacy_model: spacy.language.Language
117 | spacy model used for tokenization
118 | seed: int
119 | seed to freeze everything (default is 42)
120 | max_outputs: int
121 |         maximum number of the transformed sentences (default is 1)
122 |     device: str
123 |         ! exists for compatibility, always ignored !
124 | the device used during transformation (default is 'cpu')
125 |
126 | Methods
127 | -------
128 | generate(sentence, stop_words, prob)
129 | Transforms the sentence
130 | """
131 |
132 | def __init__(
133 | self,
134 | args: TransformArguments,
135 | spacy_model: Optional[Language] = None,
136 | seed: int = 42,
137 | max_outputs: int = 1,
138 | device: Optional[str] = None,
139 | ) -> None:
140 | """
141 | Parameters
142 | ----------
143 | args: TransformArguments
144 | parameters of the transformation
145 | spacy_model: spacy.language.Language
146 | spacy model used for tokenization
147 | seed: int
148 | seed to freeze everything (default is 42)
149 | max_outputs: int
150 |             maximum number of the transformed sentences (default is 1)
151 |         device: str
152 |             ! exists for compatibility, always ignored !
153 | the device used during transformation (default is None)
154 | """
155 | if spacy_model is None:
156 | spacy_model = spacy.load("ru_core_news_sm")
157 |
158 | super().__init__(
159 | args=args,
160 | seed=seed,
161 | max_outputs=max_outputs,
162 | device=device,
163 | spacy_model=spacy_model,
164 | )
165 |
166 |         # Load the emoji dictionary shipped next to this module
167 |         dict_path = os.path.join(
168 |             os.path.dirname(os.path.abspath(__file__)), "emoji_dict_ru.json"
169 |         )
170 |         with open(dict_path, "r", encoding="utf-8") as emoji_file:
171 |             self.word_to_emoji = load(emoji_file)
172 |
173 | def generate(
174 | self,
175 | sentence: str,
176 | stop_words: Optional[List[Union[int, str]]] = None,
177 | prob: Optional[float] = None,
178 | ) -> List[str]:
179 | """
180 | Transforms the sentence
181 |
182 | If 'prob' argument is not None, ignores the probability provided in the arguments.
183 |
184 | Parameters
185 | ----------
186 | sentence: str
187 | sentence to transform
188 | stop_words: List[Union[int, str]], optional
189 | stop_words to ignore during transformation (default is None)
190 | prob: float, optional
191 | probability of the transformation (default is None)
192 |
193 | Returns
194 | -------
195 | list
196 | list of transformed sentences
197 | """
198 | transformed = emojify(
199 | sentence=sentence,
200 | word_to_emoji=self.word_to_emoji,
201 | spacy_model=self.spacy_model,
202 | prob=(self.args.probability if not prob else prob),
203 | seed=self.seed,
204 | max_outputs=self.max_outputs,
205 | stop_words=stop_words,
206 | )
207 | return transformed
208 |
--------------------------------------------------------------------------------
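
A minimal sketch (illustrative, not a repo file) of the emojify helper above with a toy dictionary; the class itself loads the real mapping from emoji_dict_ru.json. The example assumes ru_core_news_sm is installed and lemmatizes the sample words to their dictionary forms.

import spacy
from rutransform.transformations.transformations.spelling.emojify import emojify

nlp = spacy.load("ru_core_news_sm")  # assumes the spaCy model is installed
# Toy dictionary for illustration; values are lists because emojify samples with random.choice.
word_to_emoji = {"кот": ["🐱"], "дом": ["🏠"]}
print(
    emojify(
        "Кот видит дом.",
        word_to_emoji=word_to_emoji,
        spacy_model=nlp,
        prob=1.0,
        seed=42,
    )
)
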
/rutransform/transformations/dataset_transformer.py:
--------------------------------------------------------------------------------
1 | from typing import List, NamedTuple, Optional, Union
2 |
3 | import pandas as pd
4 | import numpy as np
5 | from datasets import Dataset
6 | from tqdm.auto import tqdm
7 |
8 | from rutransform.transformations import Transformer
9 | from rutransform.transformations.utils import *
10 | from rutransform.constraints import *
11 | from rutransform.utils.args import TransformArguments
12 | from rutransform.transformations import load_transformers
13 |
14 |
15 | class DatasetTransformer(object):
16 | """
17 | Class for dataset transformation
18 |
19 | Attributes
20 | ----------
21 | dataset: Union[Dataset, pd.DataFrame]
22 | dataset to transform
23 | args: TransformArguments
24 | transformation parameters
25 | text_col: str, optional
26 | name of the column containing text to transform (default is 'text')
27 | label_col: str, optional
28 | name of the target column (default is 'label')
29 | task_type: str, optional
30 | type of the task (default is None)
31 | if dataset_name in original tasks,
32 | defaults to task_type of the dataset
33 | seed: int, optional
34 | seed to freeze everything (default is 42)
35 | device: str
36 | the device used during transformation (default is 'cpu')
37 | return_type: str
38 | type of the transformed dataset (default is 'hf')
39 | - if 'pd' - returns pandas.DataFrame
40 | - if 'hf' - returns HuggingFace Dataset
41 | custom_constraints: List[Constraint]
42 |         list of custom constraints for transformation (default is None)
43 |         if not provided, uses the default constraints for task_type,
44 |         else uses only the custom ones
45 | transform_dict: dict
46 | dictionary containing Transformer classes by transformation
47 | provided in utils.constants
48 | transform_info:
49 | dictionary mapping transformations and SentenceOperation classes
50 | provided in utils.constants
51 |
52 | Methods
53 | -------
54 | load_transformer()
55 | Loads the transformer used for transformation
56 | transform()
57 | Transforms dataset
58 |
59 | """
60 |
61 | def __init__(
62 | self,
63 | dataset: Union[Dataset, pd.DataFrame],
64 | args: TransformArguments,
65 | text_col: str = "text",
66 | label_col: str = "label",
67 | task_type: Optional[str] = None,
68 | seed: int = 42,
69 | device: str = "cpu",
70 | return_type: str = "hf",
71 | custom_constraints: List[Constraint] = None,
72 | use_constraints: bool = True,
73 | ) -> None:
74 | """
75 | dataset: Union[Dataset, pd.DataFrame]
76 | dataset to transform
77 | args: TransformArguments
78 | transformation parameters
79 | text_col: str, optional
80 | name of the column containing text to transform (default is 'text')
81 | label_col: str, optional
82 | name of the target column (default is 'label')
83 | task_type: str, optional
84 | type of the task
85 | seed: int, optional
86 | seed to freeze everything (default is 42)
87 | device: str
88 | the device used during transformation (default is 'cpu')
89 | return_type: str
90 | type of the transformed dataset (default is 'hf')
91 | - if 'pd' - returns pandas.DataFrame
92 | - if 'hf' - returns HuggingFace Dataset
93 | custom_constraints: List[Constraint]
94 |             list of custom constraints for transformation (default is None)
95 |             if not provided, uses the default constraints for task_type,
96 |             else uses only the custom ones
97 | """
98 | self.dataset = dataset
99 | self.args = args
100 | self.text_col = text_col
101 | self.label_col = label_col
102 | self.task_type = task_type
103 | self.seed = seed
104 | self.device = device
105 | self.return_type = return_type
106 | self.custom_constraints = custom_constraints
107 | self.use_constraints = use_constraints
108 |
109 | self.transform_dict = load_transformers()
110 | self.transformer = self.load_transformer()
111 |
112 | def load_transformer(self) -> Transformer:
113 | """
114 | Loads the transformer used for transformation.
115 | Initializes task_type and default constraints for task.
116 |
117 | Returns
118 | -------
119 | Transformer
120 | initialized Transformer class
121 | """
122 | if self.args.transformation not in self.transform_dict:
123 | raise ValueError(
124 | "Invalid transformation name: %s" % self.args.transformation
125 | )
126 |
127 | if self.use_constraints:
128 | if self.custom_constraints is not None:
129 | constraints = self.custom_constraints
130 | else:
131 | constraints = []
132 | if self.task_type == "multichoice_qa":
133 | constraints = [NamedEntities()]
134 | elif self.task_type == "winograd":
135 | constraints = [NamedEntities(), Referents()]
136 | elif self.task_type == "jeopardy":
137 | constraints = [NamedEntities(), Jeopardy()]
138 | elif self.task_type == "multihop":
139 | constraints = [
140 | NamedEntities(),
141 | Multihop("bridge_answers", "main_answers"),
142 | ]
143 | else:
144 | constraints = None
145 |
146 | transformer = self.transform_dict[self.args.transformation](
147 | transformations=[self.args.transformation],
148 | task_type=self.task_type,
149 | args=self.args,
150 | text_col=self.text_col,
151 | label_col=self.label_col,
152 | seed=self.seed,
153 | device=self.device,
154 | constraints=constraints,
155 | )
156 |
157 | return transformer
158 |
159 | def transform(self) -> TransformResult:
160 | """
161 | Transforms dataset
162 |
163 | Applies provided transformations to dataset.
164 | Uses constraints to ensure the quality of the transformation.
165 |
166 | Returns
167 | -------
168 | TransformResult
169 | result of the transformation, including
170 | - transformed dataset
171 | type provided during initialization (self.return_type)
172 | - similarity scores of each transformed text (BERT-score)
173 | - mean similarity score (BERT-score)
174 | - standard deviation of the similarity scores
175 |
176 | """
177 |
178 | if type(self.dataset) is pd.DataFrame:
179 | dataset = Dataset.from_pandas(self.dataset)
180 | else:
181 | dataset = self.dataset
182 |
183 | transformed = []
184 | scores = []
185 | for sent_ind, sentence in tqdm(
186 | enumerate(dataset), total=len(dataset), desc="Transforming data"
187 | ):
188 | transformed_sentence, sent_scores = self.transformer.transform(sentence)
189 | transformed.extend(transformed_sentence)
190 | scores.extend(sent_scores)
191 |
192 | transformed = pd.DataFrame(transformed)
193 |
194 | if self.return_type == "hf":
195 | transformed = Dataset.from_pandas(transformed)
196 |
197 | scores = np.array(scores)
198 |
199 | return TransformResult(
200 | transformed_dataset=transformed,
201 | scores=scores,
202 | score=scores.mean(),
203 | std=np.std(scores),
204 | )
205 |
--------------------------------------------------------------------------------
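
An end-to-end sketch (illustrative, not a repo file) of DatasetTransformer on a tiny pandas frame. The TransformArguments call is an assumption (only `transformation` and `probability` are field names confirmed in this repo), and "butter_fingers" is assumed to be a key of load_transformers(), mirroring SpellingTransformer.transform_info().

import pandas as pd
from rutransform.utils.args import TransformArguments
from rutransform.transformations.dataset_transformer import DatasetTransformer

# Hypothetical argument values for illustration.
args = TransformArguments(transformation="butter_fingers", probability=0.1)
data = pd.DataFrame(
    {"text": ["Первый пример текста.", "Второй пример текста."], "label": [0, 1]}
)
transformer = DatasetTransformer(
    dataset=data, args=args, task_type="classification", return_type="pd", device="cpu"
)
result = transformer.transform()
print(result.transformed_dataset.head())
print("mean BERTScore:", result.score, "std:", result.std)
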
/rutransform/transformations/transformations/paraphrasis/back_translation.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import spacy
3 | from typing import List, Tuple, Optional, Union
4 |
5 | import numpy as np
6 | from spacy.language import Language
7 | from transformers import MarianTokenizer, MarianMTModel
8 |
9 | from rutransform.utils.args import TransformArguments
10 | from rutransform.transformations.utils import SentenceOperation
11 |
12 |
13 | """
14 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/back_translation_ner
15 | """
16 |
17 |
18 | class BackTranslationNER(SentenceOperation):
19 | """
20 | Generates diverse linguistic variations of the contexts
21 | around the entity mention(s) through back-translation
22 |     ru -> en -> ru using Helsinki-NLP/opus-mt models
23 |
24 | Attributes
25 | ----------
26 | args: TransformArguments
27 | parameters of the transformation
28 | seed: int
29 | seed to freeze everything (default is 42)
30 | max_outputs: int
31 |         maximum number of the transformed sentences (default is 1)
32 | device: str
33 | the device used during transformation (default is 'cpu')
34 | spacy_model: spacy.language.Language
35 | spacy model used for tokenization
36 |
37 | Methods
38 | -------
39 | spacy_tagger(text, stop_words)
40 |         Tokenizes the sentence and extracts entity mentions
41 | translation_pipeline(text)
42 | Passes the text in source languages through the intermediate
43 | translations
44 | create_segments(tokens, tags)
45 | Creates segments for translation
46 | generate(sentence, stop_words, prob)
47 | Transforms the sentence
48 | """
49 |
50 | def __init__(
51 | self,
52 | args: TransformArguments,
53 | seed: int = 42,
54 | max_outputs: int = 1,
55 | device: str = "cpu",
56 | spacy_model: Optional[Language] = None,
57 | ) -> None:
58 | """
59 | Parameters
60 | ----------
61 | args: TransformArguments
62 | parameters of the transformation
63 | seed: int
64 | seed to freeze everything (default is 42)
65 | max_outputs: int
66 |             maximum number of the transformed sentences (default is 1)
67 | device: str
68 | the device used during transformation (default is 'cpu')
69 | spacy_model: spacy.language.Language
70 | spacy model used for tokenization
71 | """
72 | if spacy_model is None:
73 | spacy_model = spacy.load("ru_core_news_sm")
74 |
75 | super().__init__(
76 | args=args,
77 | seed=seed,
78 | max_outputs=max_outputs,
79 | device=device,
80 | spacy_model=spacy_model,
81 | )
82 |
83 | np.random.seed(self.seed)
84 | mname_ru2en = "Helsinki-NLP/opus-mt-ru-en"
85 | mname_en2ru = "Helsinki-NLP/opus-mt-en-ru"
86 | self.tokenizer_ru2en = MarianTokenizer.from_pretrained(mname_ru2en)
87 | self.tokenizer_en2ru = MarianTokenizer.from_pretrained(mname_en2ru)
88 | self.model_ru2en = MarianMTModel.from_pretrained(mname_ru2en).to(self.device)
89 | self.model_en2ru = MarianMTModel.from_pretrained(mname_en2ru).to(self.device)
90 | self.spacy_model = spacy_model
91 |
92 | def spacy_tagger(
93 |         self, text: str, stop_words: Optional[List[int]]
94 | ) -> Tuple[List[str], List[str]]:
95 | """
96 |         Tokenizes the sentence and extracts entity mentions
97 |
98 | Parameters
99 | ----------
100 | text: str
101 | text to tokenize
102 | stop_words: List[int], optional
103 | stop_words to ignore during transformation (default is None)
104 |
105 | Returns
106 | -------
107 | Tuple[List[str], List[str]]
108 | tokenized text, BIO-annotated text
109 | """
110 | doc = self.spacy_model(text)
111 | ner = []
112 | tokenized = []
113 | for t, token in enumerate(doc):
114 | tokenized.append(token.text)
115 | if token.ent_type_:
116 | ner.append(token.ent_type_)
117 | elif stop_words is not None and t in stop_words:
118 | ner.append("B")
119 | else:
120 | ner.append("O")
121 | return tokenized, ner
122 |
123 | def translation_pipeline(self, text: str) -> str:
124 | """
125 | Passes the text in source languages through the intermediate
126 | translations
127 |
128 | Parameters
129 | ----------
130 | text: str
131 | text to translate
132 |
133 | Returns
134 | -------
135 | str
136 | back-translated text
137 | """
138 | ru2en_inputids = self.tokenizer_ru2en.encode(text, return_tensors="pt")
139 | ru2en_inputids = ru2en_inputids.to(self.device)
140 | outputs_ru2en = self.model_ru2en.generate(ru2en_inputids)
141 | text_trans = self.tokenizer_ru2en.decode(
142 | outputs_ru2en[0], skip_special_tokens=True
143 | )
144 | en2ru_inputids = self.tokenizer_en2ru.encode(text_trans, return_tensors="pt")
145 | en2ru_inputids = en2ru_inputids.to(self.device)
146 | outputs_en2ru = self.model_en2ru.generate(en2ru_inputids)
147 | text_trans = self.tokenizer_en2ru.decode(
148 | outputs_en2ru[0], skip_special_tokens=True
149 | )
150 | return text_trans
151 |
152 | @staticmethod
153 | def create_segments(
154 | tokens: List[str], tags: List[str]
155 | ) -> Tuple[List[List[str]], List[List[str]]]:
156 | """
157 | Creates segments for translation
158 |
159 | A segment is defined as a consecutive sequence of same tag/label
160 |
161 | Parameters
162 | ----------
163 | tokens: List[str]
164 | tokenized text
165 | tags: List[str]
166 | BIO-annotated text
167 |
168 | Returns
169 | -------
170 | Tuple[List[List[str]], List[List[str]]]
171 | segments of the text and its BIO-annotation
172 | """
173 | segment_tokens, segment_tags = [], []
174 | tags_idxs = [(i, t) for i, t in enumerate(tags)]
175 | groups = [
176 | list(g)
177 | for _, g in itertools.groupby(tags_idxs, lambda s: s[1].split("-")[-1])
178 | ]
179 | for group in groups:
180 | idxs = [i[0] for i in group]
181 | segment_tokens.append([tokens[idx] for idx in idxs])
182 | segment_tags.append([tags[idx] for idx in idxs])
183 |
184 | return segment_tokens, segment_tags
185 |
186 | def generate(
187 | self,
188 | sentence: str,
189 | stop_words: Optional[List[Union[int, str]]] = None,
190 | prob: Optional[float] = None,
191 | ) -> List[str]:
192 | """
193 | Transforms the sentence
194 |
195 | Parameters
196 | ----------
197 | sentence: str
198 | sentence to transform
199 | stop_words: List[int], optional
200 | stop_words to ignore during transformation (default is None)
201 | prob: float, optional
202 |             ! exists for compatibility, always ignored !
203 | probability of the transformation (default is None)
204 |
205 | Returns
206 | -------
207 | list
208 | list of transformed sentences
209 | """
210 |
211 | # tag sentence to extract entity mentions
212 | token_sequence, tag_sequence = self.spacy_tagger(sentence, stop_words)
213 |
214 | assert len(token_sequence) == len(
215 | tag_sequence
216 | ), f"token_sequence and tag_sequence should have same length! {len(token_sequence)}!={len(tag_sequence)}"
217 |
218 | transformations = []
219 | segment_tokens, segment_tags = BackTranslationNER.create_segments(
220 | token_sequence, tag_sequence
221 | )
222 | for _ in range(self.max_outputs):
223 | tokens = []
224 | for s_token, s_tag in zip(segment_tokens, segment_tags):
225 | if len(s_token) >= 100:
226 | segment_text = " ".join(s_token)
227 | tokens.extend([segment_text])
228 | continue
229 | translate_segment = np.random.binomial(1, p=self.args.bin_p)
230 | if (
231 | s_tag[0] != "O"
232 | or len(s_token) < self.args.segment_length
233 | or not translate_segment
234 | ):
235 | tokens.extend(s_token)
236 | continue
237 | segment_text = " ".join(s_token)
238 | segment_translation = self.translation_pipeline(segment_text)
239 | tokens.extend([segment_translation])
240 |
241 | transformations.append(" ".join(tokens))
242 |
243 | return transformations
244 |
--------------------------------------------------------------------------------
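
A hedged usage sketch (illustrative, not a repo file) of BackTranslationNER above. The first call downloads the two Helsinki-NLP/opus-mt models; the `bin_p` and `segment_length` values are illustrative, chosen only because generate() reads those TransformArguments fields.

from rutransform.utils.args import TransformArguments
from rutransform.transformations.transformations.paraphrasis.back_translation import BackTranslationNER

# Hypothetical argument values for illustration.
args = TransformArguments(transformation="back_translation", bin_p=0.5, segment_length=3)
bt = BackTranslationNER(args=args, seed=42, max_outputs=1, device="cpu")
print(bt.generate("Москва является столицей России и крупнейшим городом страны."))
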
/rutransform/transformations/transformers/spelling.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple, Union
2 |
3 | from rutransform.transformations.transformations.spelling import *
4 | from rutransform.transformations import Transformer
5 | from rutransform.constraints import Constraint
6 | from rutransform.utils.args import TransformArguments
7 | from rutransform.transformations.utils import SentenceOperation
8 |
9 |
10 | class SpellingTransformer(Transformer):
11 | """
12 | Spelling transformations
13 |
14 |     Adds noise to text by mimicking common spelling mistakes,
15 |     such as case alteration, keyboard-distance character swaps,
16 |     and emoji replacement.
17 |
18 |     Uses a similarity metric (BERTScore) to filter out sentences whose
19 |     similarity score is below a threshold (specified in
20 |     TransformArguments).
21 |
22 | Attributes
23 | ----------
24 | transformation_type: str
25 | type of the transformations supported by the transformer
26 | transformations: List[str]
27 | list of transformations to apply to data
28 | task_type: str
29 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
30 | args: TransformArguments
31 | parameters of the transformation
32 | text_col: str, optional
33 | name of the column containing text to transform (default is 'text')
34 | label_col: str, optional
35 | name of the target column (default is 'label')
36 | seed: int
37 | seed to freeze everything (default is 42)
38 | device: str
39 | device used during transformation (default is 'cpu')
40 | constraints: List[Constraint], optional
41 | list of transformation constraints (default is None)
42 | spacy_model: spacy.language.Language
43 | spacy model used for tokenization (default is 'ru_core_news_sm')
44 | bert_scorer: BERTScorer
45 | similarity metric class used to filter transformed texts (default is None)
46 |
47 | Methods
48 | -------
49 | @staticmethod
50 | transform_info: Dict[str, SentenceOperation]
51 | dictionary mapping transformations and SentenceOperation classes
52 | load_transformations()
53 | Loads all the transformations required
54 | @abstractmethod
55 | transform(sentence)
56 | Applies the transformations to input
57 | sent_split(text)
58 | Splits text into sentences
59 | @staticmethod
60 | get_ids(matches)
61 | Returns ids of stopwords
62 | _transform_text(transformer, sentences, reference, stop_words, prob)
63 | Applies the transformations to long text and filters the transformed texts
64 | _transform_sentence(self, transformer, sentence, stop_words, prob)
65 | Applies the transformations to sentence and filters the transformed sentences
66 | _apply_transformation(transformer, sentence, sentences, reference, stop_words, prob)
67 | Applies the transformations to text until the transformed text passes
68 | the similarity threshold
69 | _list_stop_words(sentence, return_ids)
70 | Extracts stopwords matching the constraints
71 | _drop_duplicates(reference, candidates, scores)
72 | Returns transformed sentences without duplicates
73 | _filter_candidates(candidates, reference, context)
74 | Filters out sentences based on the similarity score
75 | _sample_to_max_outputs(sentences, scores)
76 | Returns the desired number of the transformed sentences
77 | _update_data(org_sentence, transformed)
78 | Updates the dataset object
79 | """
80 |
81 | def __init__(
82 | self,
83 | transformations: List[str],
84 | task_type: str,
85 | args: TransformArguments,
86 | text_col: Optional[str] = "text",
87 | label_col: Optional[str] = "label",
88 | seed: int = 42,
89 | device: str = "cpu",
90 | constraints: Optional[List[Constraint]] = None,
91 | ) -> None:
92 | """
93 | Parameters
94 | ----------
95 | transformations: List[str]
96 | list of transformations to apply to data
97 | task_type: str
98 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
99 | args: TransformArguments
100 | parameters of the transformation
101 | text_col: str, optional
102 | name of the column containing text to transform (default is 'text')
103 | label_col: str, optional
104 | name of the target column (default is 'label')
105 | seed: int
106 | seed to freeze everything (default is 42)
107 | device: str
108 | device used during transformation (default is 'cpu')
109 | constraints: List[Constraint], optional
110 | list of transformation constraints (default is None)
111 | """
112 | super().__init__(
113 | transformation_type="spelling",
114 | transformations=transformations,
115 | task_type=task_type,
116 | args=args,
117 | text_col=text_col,
118 | label_col=label_col,
119 | seed=seed,
120 | device=device,
121 | constraints=constraints,
122 | )
123 |
124 | self.transformers = self.load_transformations()
125 |
126 | @staticmethod
127 | def transform_info() -> Dict[str, Optional[SentenceOperation]]:
128 | """
129 | Information about the transformations used by the transformer
130 |
131 | Returns
132 | -------
133 | Dict[str, Optional[SentenceOperation]]
134 | dictionary storing transformation info
135 | """
136 | info = {
137 | "butter_fingers": ButterFingersTransformation,
138 | "case": ChangeCharCase,
139 | "emojify": EmojifyTransformation,
140 | }
141 |
142 | return info
143 |
144 | def _apply_transformation(
145 | self,
146 | transformer: SentenceOperation,
147 | sentence: str,
148 | sentences: Optional[List[str]] = None,
149 | stop_words: Optional[List[Union[str, int, List[Union[str, int]]]]] = None,
150 | prob: Optional[float] = None,
151 | ) -> Tuple[List[str], List[float]]:
152 | """
153 | Applies the transformations to text until the transformed text passes
154 | the similarity threshold
155 |
156 | Parameters
157 | ----------
158 | transformer: SentenceOperation
159 | transformer used for transformation
160 | sentence: str
161 | original sentence
162 | sentences: List[str], optional
163 | list of sentences of the text to transform if working
164 | with long texts (default is None)
165 | stop_words: List[Union[str, int]], optional
166 | stop_words for the transformation (default is None)
167 | prob: float, optional
168 | probability of the transformation (default is None)
169 |
170 | Returns
171 | -------
172 | Tuple[List[str], List[float]]
173 | list of transformed texts and their similarity scores
174 | """
175 |
176 | if sentences is not None:
177 | transform_sent, sent_scores = self._transform_text(
178 | transformer, sentences, sentence, stop_words=stop_words, prob=prob
179 | )
180 | prob = prob / 2
181 | count = 0
182 | while prob > 0.01 and count < 3:
183 | if len(transform_sent) == 1 and transform_sent[0] == sentence:
184 | transform_sent, sent_scores = self._transform_text(
185 |                     transformer, sentences, sentence, stop_words=stop_words, prob=prob
186 | )
187 | count += 1
188 | prob = prob / 2
189 | else:
190 | break
191 | return transform_sent, sent_scores
192 |
193 | transform_sent, sent_scores = self._transform_sentence(
194 | transformer, sentence, stop_words=stop_words, prob=prob
195 | )
196 | prob = prob / 2
197 | count = 0
198 | while prob > 0.01 and count < 3:
199 | if len(transform_sent) == 1 and transform_sent[0] == sentence:
200 | transform_sent, sent_scores = self._transform_sentence(
201 | transformer, sentence, stop_words=stop_words, prob=prob
202 | )
203 | prob = prob / 2
204 | count += 1
205 | else:
206 | break
207 |
208 | return transform_sent, sent_scores
209 |
210 |     def transform(self, sentence: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[float]]:
211 | """
212 | Applies the transformations to input
213 |
214 | Parameters
215 | ----------
216 | sentence: dict
217 | dataset object in dict form
218 |
219 | Returns
220 | -------
221 |         Tuple[List[Dict[str, Any]], List[float]]
222 |             transformed dataset objects and their similarity scores
223 | """
224 | transformed_data = []
225 | scores = []
226 | for transform_name, transformer in self.transformers.items():
227 | if self.task_type in ["multihop", "multichoice_qa"]:
228 | stop_words = self._list_stop_words(sentence)
229 | sentences = self.sent_split(sentence[self.text_col])
230 | transform_sent, sent_scores = self._apply_transformation(
231 | transformer,
232 | sentence[self.text_col],
233 | sentences=sentences,
234 | stop_words=stop_words,
235 | prob=self.args.probability,
236 | )
237 |
238 | else:
239 | stop_words = self._list_stop_words(sentence)
240 | transform_sent, sent_scores = self._apply_transformation(
241 | transformer,
242 | sentence[self.text_col],
243 | stop_words=stop_words,
244 | prob=self.args.probability,
245 | )
246 |
247 | transformed = self._update_data(sentence, transform_sent, transform_name)
248 | transformed_data.extend(transformed)
249 | scores.extend(sent_scores)
250 |
251 | return transformed_data, scores
252 |
--------------------------------------------------------------------------------
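
A sketch (illustrative, not a repo file) of running SpellingTransformer with all three spelling perturbations on one record. The transformation names follow transform_info() above, while the TransformArguments call itself is an assumption.

from rutransform.utils.args import TransformArguments
from rutransform.transformations.transformers.spelling import SpellingTransformer

# Hypothetical argument values for illustration.
args = TransformArguments(transformation="butter_fingers", probability=0.2)
transformer = SpellingTransformer(
    transformations=["butter_fingers", "case", "emojify"],
    task_type="classification",
    args=args,
)
rows, scores = transformer.transform({"text": "Пример текста для искажения.", "label": 0})
print(len(rows), "transformed rows")
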
/rutransform/transformations/transformers/eda.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Tuple
2 |
3 | from rutransform.transformations.transformations.eda import *
4 | from rutransform.transformations import Transformer
5 | from rutransform.constraints import Constraint
6 | from rutransform.utils.args import TransformArguments
7 | from rutransform.transformations.utils import SentenceOperation
8 |
9 |
10 | class EDATransformer(Transformer):
11 | """
12 | Easy Data Augmentation transformation
13 |
14 |
15 |     Takes a sentence as input and applies random word swaps and deletions
16 | to transform the sentence.
17 |
18 |     Utilizes constraints provided by the user to extract task-specific
19 |     stopwords to which the transformations do not apply.
20 |     Uses a similarity metric (BERTScore) to filter out sentences whose
21 |     similarity score is below a threshold (specified in
22 |     TransformArguments).
23 |
24 | Attributes
25 | ----------
26 | transformation_type: str
27 | type of the transformations supported by the transformer
28 | transformations: List[str]
29 | list of transformations to apply to data
30 | task_type: str
31 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
32 | args: TransformArguments
33 | parameters of the transformation
34 | text_col: str, optional
35 | name of the column containing text to transform (default is 'text')
36 | label_col: str, optional
37 | name of the target column (default is 'label')
38 | seed: int
39 | seed to freeze everything (default is 42)
40 | device: str
41 | device used during transformation (default is 'cpu')
42 | constraints: List[Constraint]
43 | list of transformation constraints
44 | spacy_model: spacy.language.Language
45 | spacy model used for tokenization (default is 'ru_core_news_sm')
46 | bert_scorer: BERTScorer
47 | similarity metric class used to filter transformed texts (default is None)
48 |
49 | Methods
50 | -------
51 | @staticmethod
52 | transform_info: Dict[str, SentenceOperation]
53 |         dictionary mapping transformation names to SentenceOperation classes
54 | load_transformations()
55 | Loads all the transformations required
56 | @abstractmethod
57 | transform(sentence)
58 | Applies the transformations to input
59 | sent_split(text)
60 | Splits text into sentences
61 | @staticmethod
62 | get_ids(matches)
63 | Returns ids of stopwords
64 | _transform_text(transformer, sentences, reference, stop_words, prob)
65 | Applies the transformations to long text and filters the transformed texts
66 | _transform_sentence(self, transformer, sentence, stop_words, prob)
67 | Applies the transformations to sentence and filters the transformed sentences
68 | _apply_transformation(transformer, sentence, sentences, reference, stop_words, prob)
69 | Applies the transformations to text until the transformed text passes
70 | the similarity threshold
71 | _list_stop_words(sentence, return_ids)
72 | Extracts stopwords matching the constraints
73 | _drop_duplicates(reference, candidates, scores)
74 | Returns transformed sentences without duplicates
75 | _filter_candidates(candidates, reference, context)
76 | Filters out sentences based on the similarity score
77 | _sample_to_max_outputs(sentences, scores)
78 | Returns the desired number of the transformed sentences
79 | _update_data(org_sentence, transformed)
80 | Updates the dataset object
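
    Examples
    --------
    A minimal usage sketch. Only the 'transformation' and 'probability'
    fields of TransformArguments are shown; the remaining defaults and the
    exact output are assumptions, so the example is illustrative only.

    >>> from rutransform.utils.args import TransformArguments
    >>> args = TransformArguments(transformation="eda", probability=0.1)
    >>> transformer = EDATransformer(
    ...     transformations=["eda"], task_type="classification", args=args
    ... )  # doctest: +SKIP
    >>> transformed, scores = transformer.transform(
    ...     {"text": "Мама мыла раму каждое утро.", "label": 1}
    ... )  # doctest: +SKIP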
81 | """
82 |
83 | def __init__(
84 | self,
85 | transformations: List[str],
86 | task_type: str,
87 | args: TransformArguments,
88 | text_col: Optional[str] = "text",
89 | label_col: Optional[str] = "label",
90 | seed: int = 42,
91 | device: str = "cpu",
92 | constraints=None,
93 | ) -> None:
94 | """
95 | Parameters
96 | ----------
97 | transformations: List[str]
98 | list of transformations to apply to data
99 | task_type: str
100 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
101 | args: TransformArguments
102 | parameters of the transformation
103 | text_col: str, optional
104 | name of the column containing text to transform (default is 'text')
105 | label_col: str, optional
106 | name of the target column (default is 'label')
107 | seed: int
108 | seed to freeze everything (default is 42)
109 | device: str
110 | device used during transformation (default is 'cpu')
111 | constraints: List[Constraint]
112 | list of transformation constraints (default is None)
113 | """
114 | super().__init__(
115 | transformation_type="eda",
116 | transformations=transformations,
117 | task_type=task_type,
118 | args=args,
119 | text_col=text_col,
120 | label_col=label_col,
121 | seed=seed,
122 | device=device,
123 | constraints=constraints,
124 | )
125 |
126 | self.transformers = self.load_transformations()
127 |
128 | @staticmethod
129 | def transform_info() -> Dict[str, Optional[SentenceOperation]]:
130 | """
131 | Information about the transformations used by the transformer
132 |
133 | Returns
134 | -------
135 | Dict[str, Optional[SentenceOperation]]
136 | dictionary storing transformation info
137 | """
138 | info = {"eda": RandomEDA}
139 |
140 | return info
141 |
142 | def _apply_transformation(
143 | self,
144 | transformer: SentenceOperation,
145 | sentence: str,
146 | sentences: Optional[List[str]] = None,
147 | stop_words: Optional[List[Union[str, int, List[Union[str, int]]]]] = None,
148 | prob: Optional[float] = None,
149 | ) -> Tuple[List[str], List[float]]:
150 | """
151 | Applies the transformations to text until the transformed text passes
152 | the similarity threshold
153 |
154 | Parameters
155 | ----------
156 | transformer: SentenceOperation
157 | transformer used for transformation
158 | sentence: str
159 | original sentence
160 | sentences: List[str], optional
161 | list of sentences of the text to transform if working
162 | with long texts (default is None)
163 | stop_words: List[Union[str, int]], optional
164 | stop_words for the transformation (default is None)
165 | prob: float, optional
166 | probability of the transformation (default is None)
167 |
168 | Returns
169 | -------
170 | Tuple[List[str], List[float]]
171 | list of transformed texts and their similarity scores
172 | """
173 |
174 | if sentences is not None:
175 | transform_sent, sent_scores = self._transform_text(
176 | transformer=transformer,
177 | sentences=sentences,
178 | reference=sentence,
179 | stop_words=stop_words,
180 | prob=prob,
181 | )
182 | prob = prob / 2
183 | count = 0
184 | while prob > 0.01 and count < 3:
185 | if len(transform_sent) == 1 and sent_scores[0] == 1:
186 | transform_sent, sent_scores = self._transform_text(
187 | transformer=transformer,
188 | sentences=sentences,
189 | reference=sentence,
190 | prob=prob,
191 | )
192 | count += 1
193 | prob = prob / 2
194 | else:
195 | break
196 | return transform_sent, sent_scores
197 |
198 | transform_sent, sent_scores = self._transform_sentence(
199 | transformer=transformer, sentence=sentence, stop_words=stop_words, prob=prob
200 | )
201 | prob = prob / 2
202 | count = 0
203 | while prob > 0.01 and count < 3:
204 | if len(transform_sent) == 1 and sent_scores[0] == 1:
205 | transform_sent, sent_scores = self._transform_sentence(
206 | transformer=transformer,
207 | sentence=sentence,
208 | stop_words=stop_words,
209 | prob=prob,
210 | )
211 | prob = prob / 2
212 | count += 1
213 | else:
214 | break
215 |
216 | return transform_sent, sent_scores
217 |
218 | def transform(self, sentence: Dict[str, Any]) -> List[Dict[str, Any]]:
219 | """
220 | Applies the transformations to input
221 |
222 | Parameters
223 | ----------
224 | sentence: Dict[str, Any]
225 | dataset object in dict form
226 |
227 | Returns
228 | -------
229 |         Tuple[List[Dict[str, Any]], List[float]]
230 |             transformed dataset objects and their similarity scores
231 | """
232 | transformed_data = []
233 | scores = []
234 | for transform_name, transformer in self.transformers.items():
235 | stop_words = self._list_stop_words(sentence)
236 | sentences = self.sent_split(sentence[self.text_col])
237 | if len(sentences) > 1:
238 | transform_sent, sent_scores = self._apply_transformation(
239 | transformer=transformer,
240 | sentence=sentence[self.text_col],
241 | sentences=sentences,
242 | stop_words=stop_words,
243 | prob=self.args.probability,
244 | )
245 | else:
246 | transform_sent, sent_scores = self._apply_transformation(
247 | transformer=transformer,
248 | sentence=sentence[self.text_col],
249 | stop_words=stop_words,
250 | prob=self.args.probability,
251 | )
252 | transformed = self._update_data(sentence, transform_sent)
253 | transformed_data.extend(transformed)
254 | scores.extend(sent_scores)
255 |
256 | return transformed_data, scores
257 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/distraction/sentence_additions.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Optional, Union
3 | from spacy.language import Language
4 | from transformers import (
5 | TextGenerationPipeline,
6 | set_seed,
7 | MT5ForConditionalGeneration,
8 | T5ForConditionalGeneration,
9 | AutoTokenizer,
10 | AutoModelWithLMHead,
11 | MT5Tokenizer,
12 | )
13 |
14 | from rutransform.utils.args import TransformArguments
15 | from rutransform.transformations.utils import SentenceOperation
16 |
17 | """
18 | Adapted from https://github.com/GEM-benchmark/NL-Augmenter/tree/main/transformations/sentence_additions
19 | """
20 |
21 |
22 | def clean(text: str) -> str:
23 | """
24 | Cleans text from unwanted characters created
25 | by the generator model
26 |
27 | Parameters
28 | ----------
29 | text: str
30 | generated text to clean
31 |
32 | Returns
33 | -------
34 | str
35 | clean string of text
36 | """
37 | if len(text) > 1:
38 | text = text.split("===")[0]
39 | text = " ".join(text.split("\n\n")[:2])
40 | text = text.replace("\\n", "\n")
41 |         text = text.replace("", "")
42 |         text = text.replace("&amp;", "&")
43 |         text = text.replace("lt;", "")
44 |         text = text.replace("gt;", "")
45 |         text = text.split("< EOS>")[0]
46 |         text = text.split("<EOS>")[0]
47 |         text = text.replace("< EOS>", " ")
48 |         text = text.replace("<EOS>", "")
49 |         text = text.replace("", "")
50 |         text = text.replace("<BOS>", " ")
51 |         text = text.replace("< BOS>", " ")
52 |         text = text.replace("<SHORT>", " ")
53 |         text = text.replace("< SHORT>", " ")
54 |         text = text.replace("", " ")
55 |         text = text.replace("<LONG>", " ")
56 |         text = text.replace("< LONG>", " ")
57 | text = text.replace(" ul ", "\n")
58 | text = text.replace(" pre ", " ")
59 | text = text.replace(r" /pre ", " ")
60 | text = text.replace(r" / pre ", " ")
61 | text = text.replace(r"/code", "\n/code\n")
62 | text = text.replace(r"/ code", "\n/code\n")
63 | text = text.replace(" code", "\ncode\n")
64 | text = text.replace(" hr ", " ")
65 | text = text.replace(" e f ", "\n")
66 | text = text.replace("/h1", "\n")
67 | text = text.replace("nbsp;", " ")
68 | text = text.replace("/blockquote", "\n")
69 | text = text.replace(" +", " ")
70 | text = text.replace("", "")
71 | text = text.replace(".<", ".")
72 | text = text.replace("/", ".")
73 | text = text.replace("tml", "")
74 | text = text.replace("", "")
81 | return text.strip()
82 |
83 |
84 | class SentenceAdditions(SentenceOperation):
85 | """
86 | Adds generated sentence into provided sentences
87 | or paragraph to create adversarial examples.
88 |
89 | Attributes
90 | ----------
91 | args: TransformArguments
92 | parameters of the transformation
93 | seed: int
94 | seed to freeze everything (default is 42)
95 | max_outputs: int
96 |         maximum number of the transformed sentences (default is 1)
97 | device: str
98 | the device used during transformation (default is 'cpu')
99 | spacy_model: spacy.language.Language
100 |         ! exists for compatibility, always ignored !
101 | spacy model used for tokenization
102 |
103 | Methods
104 | -------
105 | get_model_path()
106 | Converts model name to model path
107 | generate(sentence, stop_words)
108 | Transforms the sentence
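
    Examples
    --------
    A minimal sketch. The 'generator' field of TransformArguments and its
    value are assumptions used for illustration only; loading the generation
    model downloads its weights on first use.

    >>> from rutransform.utils.args import TransformArguments
    >>> args = TransformArguments(transformation="addsent", generator="gpt3")  # doctest: +SKIP
    >>> op = SentenceAdditions(args=args, device="cpu")  # doctest: +SKIP
    >>> op.generate("Кошка весь день спала на диване.")  # doctest: +SKIP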
109 | """
110 |
111 | def __init__(
112 | self,
113 | args: TransformArguments,
114 | seed: int = 42,
115 | max_outputs: int = 1,
116 | device: str = "cpu",
117 | spacy_model: Optional[Language] = None,
118 | ) -> None:
119 | """
120 | Parameters
121 | ----------
122 | args: TransformArguments
123 | parameters of the transformation
124 | seed: int
125 | seed to freeze everything (default is 42)
126 | max_outputs: int
127 |             maximum number of the transformed sentences (default is 1)
128 | device: str
129 | the device used during transformation (default is 'cpu')
130 | spacy_model: spacy.language.Language
131 |             ! exists for compatibility, always ignored !
132 | spacy model used for tokenization
133 | """
134 | super().__init__(
135 | args=args,
136 | seed=seed,
137 | max_outputs=max_outputs,
138 | device=device,
139 | spacy_model=spacy_model,
140 | )
141 |
142 | model_name = self.get_model_path()
143 | if "mt5" in self.args.generator:
144 | self.tokenizer = AutoTokenizer.from_pretrained(model_name)
145 | self.model = MT5ForConditionalGeneration.from_pretrained(
146 | model_name, pad_token_id=self.tokenizer.eos_token_id
147 | ).to(self.device)
148 | elif "t5" in self.args.generator:
149 | self.tokenizer = AutoTokenizer.from_pretrained(model_name)
150 | self.model = T5ForConditionalGeneration.from_pretrained(
151 | model_name, pad_token_id=self.tokenizer.eos_token_id
152 | ).to(self.device)
153 | else:
154 | tokenizer = (
155 | MT5Tokenizer.from_pretrained(model_name)
156 | if model_name == "THUMT/mGPT"
157 | else AutoTokenizer.from_pretrained(model_name)
158 | )
159 | model = AutoModelWithLMHead.from_pretrained(
160 | model_name, pad_token_id=tokenizer.eos_token_id
161 | )
162 | self.generator = TextGenerationPipeline(
163 | model=model,
164 | tokenizer=tokenizer,
165 | device=(-1 if self.device == "cpu" else 0),
166 | )
167 |
168 | def get_model_path(self) -> str:
169 | """
170 | Converts model name to model path
171 |
172 | Returns
173 | -------
174 | str
175 | path to model in the HuggingFace library
176 | """
177 | model_dict = {
178 | "gpt2": "sberbank-ai/rugpt2_large",
179 | "gpt3": "sberbank-ai/rugpt3large_based_on_gpt2",
180 | "mt5-base": "google/mt5-base",
181 | "mt5-small": "google/mt5-small",
182 | "mt5-large": "google/mt5-large",
183 | }
184 | return (
185 | model_dict[self.args.generator]
186 | if self.args.generator in model_dict
187 | else self.args.generator
188 | )
189 |
190 | def generate(
191 | self,
192 | sentence: str,
193 | stop_words: Optional[List[Union[int, str]]] = None,
194 | prob: Optional[float] = None,
195 | ) -> List[str]:
196 | """
197 | Transforms the sentence
198 |
199 | Parameters
200 | ----------
201 | sentence: str
202 | sentence to transform
203 | stop_words: List[Union[int, str]], optional
204 |             ! exists for compatibility, always ignored !
205 | stop_words to ignore during transformation (default is None)
206 | prob: float, optional
207 |             ! exists for compatibility, always ignored !
208 | probability of the transformation (default is None)
209 |
210 | Returns
211 | -------
212 | list
213 | list of transformed sentences
214 | """
215 | if self.max_outputs == 1:
216 | set_seed(self.seed)
217 |
218 | if self.args.prompt:
219 | sentence = sentence + self.args.prompt_text
220 |
221 | transformed = []
222 | for _ in range(self.max_outputs):
223 | if "t5" in self.args.generator:
224 | encoding = self.tokenizer.encode_plus(
225 | sentence, pad_to_max_length=True, return_tensors="pt"
226 | )
227 | input_ids, attention_masks = (
228 | encoding["input_ids"].to(self.device),
229 | encoding["attention_mask"].to(self.device),
230 | )
231 |
232 | beam_outputs = self.model.generate(
233 | input_ids=input_ids,
234 | attention_mask=attention_masks,
235 | do_sample=self.args.do_sample,
236 | max_length=self.args.max_length,
237 | temperature=self.args.temperature,
238 | top_k=self.args.top_k,
239 | top_p=self.args.top_p,
240 | early_stopping=self.args.early_stopping,
241 | num_return_sequences=1,
242 | repetition_penalty=self.args.repetition_penalty,
243 | )
244 |
245 | for output in beam_outputs:
246 | sent = self.tokenizer.decode(
247 | output,
248 | skip_special_tokens=True,
249 | clean_up_tokenization_spaces=True,
250 | )
251 | transformed.append(sent)
252 | else:
253 | outputs = self.generator(
254 | sentence,
255 | max_length=self.args.max_length,
256 | skip_special_tokens=True,
257 | num_return_sequences=1,
258 | num_beams=self.args.num_beams,
259 | early_stopping=self.args.early_stopping,
260 | no_repeat_ngram_size=self.args.no_repeat_ngram_size,
261 | top_k=self.args.top_k,
262 | top_p=self.args.top_p,
263 | temperature=self.args.temperature,
264 | do_sample=self.args.do_sample,
265 | repetition_penalty=self.args.repetition_penalty,
266 | )
267 |
268 | for sents_with_additions in outputs:
269 | for key, value in sents_with_additions.items():
270 | transformed.append(clean(value))
271 | return transformed
272 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformations/eda/eda.py:
--------------------------------------------------------------------------------
1 | import random
2 | import spacy
3 | from random import shuffle
4 | from typing import List, Optional, Union
5 | from nltk.corpus import stopwords
6 | from string import punctuation
7 |
8 | from spacy.language import Language
9 |
10 | from rutransform.utils.args import TransformArguments
11 | from rutransform.transformations.utils import SentenceOperation
12 |
13 | STOPWORDS = stopwords.words("russian")
14 |
15 | """
16 | Adapted from https://github.com/jasonwei20/eda_nlp
17 | """
18 |
19 |
20 | def tokenize(text: str, spacy_model: Language) -> str:
21 | """
22 | Tokenizes text
23 |
24 | Parameters
25 | ----------
26 | text: str
27 | text to tokenize
28 | spacy_model: spacy.language.Language
29 | spacy model used for tokenization
30 |
31 | Returns
32 | -------
33 | str
34 | tokenized text
35 | """
36 | return " ".join([token.text for token in spacy_model(text)])
37 |
38 |
39 | def random_deletion(
40 | words: List[str], p: float, seed: int, stop_words: Optional[List[int]] = None
41 | ) -> List[str]:
42 | """
43 | Randomly deletes words from the sentence with probability p
44 |
45 | Parameters
46 | ----------
47 | words: List[str]
48 | list of tokens in the sentence
49 | p: float
50 | probability of the deletion
51 | seed: int
52 | seed to freeze everything
53 | stop_words: List[int], optional
54 | stop_words to ignore during deletion (default is None)
55 |
56 | Returns
57 | -------
58 | List[str]
59 | transformed sentence in tokens
60 | """
61 | random.seed(seed)
62 | if stop_words is None:
63 | stop_words = []
64 |
65 | # if there's only one word, don't delete it
66 | if len(words) <= 1:
67 | return words
68 |
69 | # randomly delete words with probability p
70 | new_words = []
71 | for idx, word in enumerate(words):
72 | if idx in stop_words:
73 | new_words.append(word)
74 | continue
75 | r = random.uniform(0, 1)
76 | if r > p:
77 | new_words.append(word)
78 |
79 | # if you end up deleting all words, just return a random word
80 | if len(new_words) == 0:
81 | rand_int = random.randint(0, len(words) - 1)
82 | return [words[rand_int]]
83 |
84 | if new_words == words:
85 | stopwords = [
86 | i
87 | for (i, word) in enumerate(words)
88 | if (word in STOPWORDS and i not in stop_words)
89 | ]
90 | if len(stopwords) > 0:
91 | random_idx = random.choice(stopwords)
92 | new_words.pop(random_idx)
93 |
94 | return new_words
95 |
96 |
97 | def random_swap(
98 | words: List[str], n: int, seed: int, stop_words: Optional[List[int]] = None
99 | ) -> List[str]:
100 | """
101 | Randomly swaps two words in the sentence n times
102 |
103 | Parameters
104 | ----------
105 | words: List[str]
106 | list of tokens in the sentence
107 | n: int
108 | number of swaps
109 | seed: int
110 | seed to freeze everything
111 | stop_words: List[int], optional
112 | stop_words to ignore during swaps (default is None)
113 |
114 | Returns
115 | -------
116 | List[str]
117 | transformed sentence in tokens
118 | """
119 | new_words = words.copy()
120 | for _ in range(n):
121 | new_words = swap_word(new_words, seed, stop_words=stop_words)
122 | return new_words
123 |
124 |
125 | def swap_word(
126 | words: List[str], seed: int, stop_words: Optional[List[int]] = None
127 | ) -> List[str]:
128 | """
129 | Randomly swap two words in the sentence
130 |
131 | Parameters
132 | ----------
133 | words: List[str]
134 | list of tokens in the sentence
135 | seed: int
136 | seed to freeze everything
137 | stop_words: List[int], optional
138 | stop_words to ignore during swaps (default is None)
139 |
140 | Returns
141 | -------
142 | List[str]
143 | transformed sentence in tokens
144 | """
145 | if stop_words is None:
146 | stop_words = []
147 |
148 | new_words = words.copy()
149 | random.seed(seed)
150 | allowed_ids = [i for (i, word) in enumerate(words) if i not in stop_words]
151 | if len(allowed_ids) >= 2:
152 | random_idx_1 = random.choice(allowed_ids) # test
153 | else:
154 | return new_words
155 |
156 | random_idx_2 = random_idx_1
157 | counter = 0
158 | while random_idx_2 == random_idx_1:
159 | random_idx_2 = random.choice(allowed_ids)
160 | counter += 1
161 | if counter > 3:
162 | return new_words
163 |
164 | new_words[random_idx_1], new_words[random_idx_2] = (
165 | new_words[random_idx_2],
166 | new_words[random_idx_1],
167 | )
168 |
169 |     # if we did not swap any of the words, swap stopwords (articles, pronouns, etc.)
170 | if new_words == words:
171 | stopwords = [
172 | i
173 | for (i, word) in enumerate(new_words)
174 | if (word in STOPWORDS and i not in stop_words)
175 | ]
176 | if len(stopwords) > 1:
177 | random_idx_1, random_idx_2 = random.sample(stopwords, k=2)
178 | new_words[random_idx_1], new_words[random_idx_2] = (
179 | new_words[random_idx_2],
180 | new_words[random_idx_1],
181 | )
182 | return new_words
183 |
184 |
185 | def eda(
186 | sentence: str,
187 | spacy_model: Language,
188 | alpha_rs: float = 0.1,
189 | p_rd: float = 0.1,
190 | num_aug: int = 1,
191 | seed: int = 42,
192 | stop_words: Optional[List[int]] = None,
193 | ) -> List[str]:
194 | """
195 | Applies Easy Data Augmentations (random deletion and random swaps) to text
196 |
197 | Parameters
198 | ----------
199 | sentence: str
200 | text to transform
201 | spacy_model: spacy.language.Language
202 | spacy model used for tokenization
203 | alpha_rs: float
204 | probability of word swap (default is 0.1)
205 | p_rd: float
206 | probability of word deletion (default is 0.1)
207 | num_aug: int
208 | maximum number of the transformed sentences (default is 1)
209 | seed: int
210 | seed to freeze everything (default is 42)
211 | stop_words: List[int], optional
212 | stop_words to ignore during swaps (default is None)
213 |
214 | Returns
215 | -------
216 | List[str]
217 | list of transformed sentences
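
    Examples
    --------
    A rough sketch; the result depends on the seed and the installed
    'ru_core_news_sm' model, so no exact output is shown.

    >>> import spacy
    >>> nlp = spacy.load("ru_core_news_sm")  # doctest: +SKIP
    >>> eda("мама мыла раму вчера вечером", nlp, alpha_rs=0.3, p_rd=0.3, num_aug=2)  # doctest: +SKIP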
218 | """
219 | random.seed(seed)
220 | sentence = tokenize(sentence, spacy_model)
221 | words = sentence.split()
222 |     words = [word for word in words if word != ""]
223 | num_words = len(words)
224 | augmented_sentences = []
225 | num_new_per_technique = int(num_aug / 4) + 1
226 | n_rs = max(1, int(alpha_rs * num_words))
227 |
228 | # random swap
229 | for _ in range(num_new_per_technique):
230 | a_words = random_swap(words, n_rs, seed, stop_words=stop_words)
231 | augmented_sentences.append(" ".join(a_words))
232 |
233 | # random deletion
234 | for _ in range(num_new_per_technique):
235 | a_words = random_deletion(words, p_rd, seed, stop_words=stop_words)
236 | augmented_sentences.append(" ".join(a_words))
237 |
238 | shuffle(augmented_sentences)
239 |
240 |     # trim to the desired number of augmented sentences
241 | augmented_sentences = [s for s in augmented_sentences if s != sentence][:num_aug]
242 |
243 | return augmented_sentences
244 |
245 |
246 | class RandomEDA(SentenceOperation):
247 | """
248 | Augment data using Easy Data Augmentation techniques
249 | (random deletion and random word swaps)
250 |
251 | Attributes
252 | ----------
253 | args: TransformArguments
254 | parameters of the transformation
255 | seed: int
256 | seed to freeze everything (default is 42)
257 | max_outputs: int
258 |         maximum number of the transformed sentences (default is 1)
259 | device: str
260 | the device used during transformation (default is 'cpu')
261 | spacy_model: spacy.language.Language
262 | spacy model used for tokenization
263 |
264 | Methods
265 | -------
266 | generate(sentence, stop_words, prob)
267 | Transforms the sentence
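
    Examples
    --------
    A minimal sketch (assumes the default 'ru_core_news_sm' model is
    installed; the call is illustrative, not an exact expected output):

    >>> from rutransform.utils.args import TransformArguments
    >>> args = TransformArguments(transformation="eda", probability=0.2)
    >>> op = RandomEDA(args=args, max_outputs=2)  # doctest: +SKIP
    >>> op.generate("мама мыла раму вчера вечером")  # doctest: +SKIP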
268 | """
269 |
270 | def __init__(
271 | self,
272 | args: TransformArguments,
273 | seed: int = 42,
274 | max_outputs: int = 1,
275 | device: str = "cpu",
276 | spacy_model: Optional[Language] = None,
277 | ) -> None:
278 | """
279 | Parameters
280 | ----------
281 | args: TransformArguments
282 | parameters of the transformation
283 | seed: int
284 | seed to freeze everything (default is 42)
285 | max_outputs: int
286 |             maximum number of the transformed sentences (default is 1)
287 | device: str
288 | the device used during transformation (default is 'cpu')
289 | spacy_model: spacy.language.Language
290 | spacy model used for tokenization
291 | """
292 | if spacy_model is None:
293 | spacy_model = spacy.load("ru_core_news_sm")
294 |
295 | super().__init__(
296 | args=args,
297 | seed=seed,
298 | max_outputs=max_outputs,
299 | device=device,
300 | spacy_model=spacy_model,
301 | )
302 |
303 | def generate(
304 | self,
305 | sentence: str,
306 | stop_words: Optional[List[Union[int, str]]] = None,
307 | prob: Optional[float] = None,
308 | ) -> List[str]:
309 | """
310 | Transforms the sentence
311 |
312 |         If the 'prob' argument is not None, the probability provided in the arguments is ignored.
313 |
314 | Parameters
315 | ----------
316 | sentence: str
317 | sentence to transform
318 | stop_words: List[int], optional
319 | stop_words to ignore during transformation (default is None)
320 | prob: float, optional
321 | probability of the transformation (default is None)
322 |
323 | Returns
324 | -------
325 | list
326 | list of transformed sentences
327 | """
328 | if not prob:
329 | alpha_rs = self.args.probability
330 | p_rd = self.args.probability if self.args.same_prob else self.args.del_prob
331 | else:
332 | alpha_rs = prob
333 | p_rd = prob if self.args.same_prob else self.args.del_prob / 2
334 |
335 | transformed = eda(
336 | sentence=sentence,
337 | alpha_rs=alpha_rs,
338 | p_rd=p_rd,
339 | num_aug=self.max_outputs,
340 | seed=self.seed,
341 | spacy_model=self.spacy_model,
342 | stop_words=stop_words,
343 | )
344 |
345 | return transformed
346 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformers/paraphrasis.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | from typing import Any, Dict, List, Optional, Tuple, Union
4 |
5 | from razdel import sentenize
6 |
7 | from rutransform.transformations.transformations.paraphrasis import *
8 | from rutransform.transformations import Transformer
9 | from rutransform.constraints import Constraint
10 | from rutransform.utils.args import TransformArguments
11 | from rutransform.transformations.utils import SentenceOperation
12 |
13 |
14 | class ParaphrasisTransformer(Transformer):
15 | """
16 | Paraphrasis transformations
17 |
18 | Generates diverse linguistic variations of the contexts
19 |     through paraphrasis, back-translation and style transfer.
20 | 
21 |     Utilizes constraints provided by the user to extract task-specific
22 |     stopwords, to which the transformations do not apply.
23 |     Uses a similarity metric (BERTScore) to filter out transformed
24 |     sentences whose similarity score falls below a threshold (specified in
25 |     TransformArguments).
26 |
27 | Attributes
28 | ----------
29 | transformation_type: str
30 | type of the transformations supported by the transformer
31 | transformations: List[str]
32 | list of transformations to apply to data
33 | task_type: str
34 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
35 | args: TransformArguments
36 | parameters of the transformation
37 | text_col: str, optional
38 | name of the column containing text to transform (default is 'text')
39 | label_col: str, optional
40 | name of the target column (default is 'label')
41 | seed: int
42 | seed to freeze everything (default is 42)
43 | device: str
44 | device used during transformation (default is 'cpu')
45 | constraints: List[Constraint], optional
46 | list of transformation constraints (default is None)
47 | spacy_model: spacy.language.Language
48 | spacy model used for tokenization (default is 'ru_core_news_sm')
49 | bert_scorer: BERTScorer
50 | similarity metric class used to filter transformed texts (default is None)
51 |
52 | Methods
53 | -------
54 | @staticmethod
55 | transform_info: Dict[str, SentenceOperation]
56 |         dictionary mapping transformation names to SentenceOperation classes
57 | load_transformations()
58 | Loads all the transformations required
59 | @abstractmethod
60 | transform(sentence)
61 | Applies the transformations to input
62 | sent_split(text)
63 | Splits text into sentences
64 | @staticmethod
65 | get_ids(matches)
66 | Returns ids of stopwords
67 | _transform_text(transformer, sentences, reference, stop_words, prob)
68 | Applies the transformations to long text and filters the transformed texts
69 | _transform_sentence(self, transformer, sentence, stop_words, prob)
70 | Applies the transformations to sentence and filters the transformed sentences
71 | _list_stop_words(sentence, return_ids)
72 | Extracts stopwords matching the constraints
73 | _drop_duplicates(reference, candidates, scores)
74 | Returns transformed sentences without duplicates
75 | _filter_candidates(candidates, reference, context)
76 | Filters out sentences based on the similarity score
77 | _sample_to_max_outputs(sentences, scores)
78 | Returns the desired number of the transformed sentences
79 | _update_data(org_sentence, transformed)
80 | Updates the dataset object
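
    Examples
    --------
    A minimal usage sketch; TransformArguments fields beyond 'transformation'
    and 'probability' are assumed, and the paraphrasing models are downloaded
    on first use.

    >>> from rutransform.utils.args import TransformArguments
    >>> args = TransformArguments(transformation="paraphraser", probability=0.5)
    >>> transformer = ParaphrasisTransformer(
    ...     transformations=["paraphraser"], task_type="classification", args=args
    ... )  # doctest: +SKIP
    >>> transformed, scores = transformer.transform(
    ...     {"text": "Он пришёл домой поздно вечером.", "label": 0}
    ... )  # doctest: +SKIP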
81 | """
82 |
83 | def __init__(
84 | self,
85 | transformations: List[str],
86 | task_type: str,
87 | args: TransformArguments,
88 | text_col: Optional[str] = "text",
89 | label_col: Optional[str] = "label",
90 | seed: int = 42,
91 | device: str = "cpu",
92 | constraints: Optional[List[Constraint]] = None,
93 | ) -> None:
94 | """
95 | Parameters
96 | ----------
97 | transformations: List[str]
98 | list of transformations to apply to data
99 | task_type: str
100 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
101 | args: TransformArguments
102 | parameters of the transformation
103 | text_col: str, optional
104 | name of the column containing text to transform (default is 'text')
105 | label_col: str, optional
106 | name of the target column (default is 'label')
107 | seed: int
108 | seed to freeze everything (default is 42)
109 | device: str
110 | device used during transformation (default is 'cpu')
111 | constraints: List[Constraint], optional
112 | list of transformation constraints (default is None)
113 | """
114 | super().__init__(
115 | transformation_type="paraphrasis",
116 | transformations=transformations,
117 | task_type=task_type,
118 | args=args,
119 | text_col=text_col,
120 | label_col=label_col,
121 | seed=seed,
122 | device=device,
123 | constraints=constraints,
124 | )
125 |
126 | self.transformers = self.load_transformations()
127 |
128 | @staticmethod
129 | def transform_info() -> Dict[str, Optional[SentenceOperation]]:
130 | """
131 | Information about the transformations used by the transformer
132 |
133 | Returns
134 | -------
135 | Dict[str, Optional[SentenceOperation]]
136 | dictionary storing transformation info
137 | """
138 | info = {
139 | "paraphraser": Paraphraser,
140 | "style_transfer": None,
141 | "back_translation": BackTranslationNER,
142 | }
143 | return info
144 |
145 | def _transform_text(
146 | self,
147 | transformer: SentenceOperation,
148 | sentences: List[str],
149 | reference: str,
150 | stop_words: Optional[List[Union[str, int, List[Union[str, int]]]]] = None,
151 | prob: Optional[float] = None,
152 | ) -> Tuple[List[str], List[float]]:
153 | """
154 | Applies the transformations to long text and filters the transformed texts
155 |
156 | Parameters
157 | ----------
158 | transformer: SentenceOperation
159 | transformer used for transformation
160 | sentences: List[str]
161 | sentences of the text to transform
162 | reference: str
163 | original sentence to use as a reference for similarity score
164 | stop_words: List[Union[str, int]], optional
165 | stop_words for the transformation (default is None)
166 | prob: float, optional
167 | probability of the transformation (default is None)
168 |             used when no transformed sentence passes the similarity
169 | score threshold
170 |
171 | Returns
172 | -------
173 | Tuple[List[str], List[float]]
174 | list of transformed texts and their similarity scores
175 | """
176 | transform_text = []
177 | text_scores = []
178 | for s, sent in enumerate(sentences):
179 |
180 | if stop_words is not None:
181 | stops = stop_words[s]
182 | else:
183 | stops = None
184 |
185 | if s == 0 and self.task_type == "multichoice":
186 | question_text = []
187 | sents = [sentence.text for sentence in sentenize(sent)]
188 |
189 | for i, sentence in enumerate(sents):
190 | if random.uniform(0, 1) >= self.args.probability:
191 | question_text.append([sentence] * self.args.max_outputs)
192 | else:
193 | transform_sent = transformer.generate(
194 | sentence=sentence, stop_words=stops, prob=prob
195 | )
196 | question_text.append(
197 | transform_sent
198 | ) # else random.choice(sentence)
199 |
200 | transformed = []
201 | for m in range(self.args.max_outputs):
202 | transf_sentence = []
203 | for tr_sent in question_text:
204 |                         if len(tr_sent) > m:
205 | transf_sentence.append(tr_sent[m])
206 | else:
207 | transf_sentence.append(random.choice(tr_sent))
208 | transformed.append(". ".join(transf_sentence))
209 |
210 | transform_sent = transformed
211 | transform_sent = self._drop_duplicates(sent, transform_sent)
212 | transform_sent, sent_scores = self._filter_candidates(
213 | candidates=transform_sent, reference=sent
214 | )
215 | transform_text.append(transform_sent)
216 | text_scores.append(sent_scores)
217 |
218 | elif random.uniform(0, 1) >= self.args.probability:
219 | transform_text.append([sent])
220 | text_scores.append([1.0])
221 |
222 | else:
223 | transform_sent = transformer.generate(
224 | sentence=sent, stop_words=stops, prob=prob
225 | )
226 | transform_sent = self._drop_duplicates(sent, transform_sent)
227 | transform_sent, sent_scores = self._filter_candidates(
228 | candidates=transform_sent, reference=sent
229 | )
230 | transform_text.append(transform_sent)
231 | text_scores.append(sent_scores)
232 |
233 | transform_sent, scores = self._sample_to_max_outputs(
234 | transform_text, text_scores
235 | )
236 | transform_sent, scores = self._filter_candidates(
237 | candidates=transform_sent, reference=reference
238 | )
239 | transform_sent, scores = self._drop_duplicates(
240 | reference, transform_sent, scores
241 | )
242 | return transform_sent, scores
243 |
244 | def transform(self, sentence: Dict[str, Any]) -> List[Dict[str, Any]]:
245 | """
246 | Applies the transformations to input
247 |
248 | Parameters
249 | ----------
250 | sentence: Dict[str, Any]
251 | dataset object in dict form
252 |
253 | Returns
254 | -------
255 |         Tuple[List[Dict[str, Any]], List[float]]
256 |             transformed dataset objects and their similarity scores
257 | """
258 | transformed_data = []
259 | scores = []
260 | for transform_name, transformer in self.transformers.items():
261 | stop_words = self._list_stop_words(sentence)
262 | sentences = self.sent_split(sentence[self.text_col])
263 | if len(sentences) > 1:
264 | transform_sent, sent_scores = self._transform_text(
265 | transformer,
266 | sentences,
267 | reference=sentence[self.text_col],
268 | stop_words=stop_words,
269 | )
270 | if len(transform_sent) == 1 and sent_scores[0] == 1:
271 | transform_sent, sent_scores = self._transform_text(
272 | transformer,
273 | sentences,
274 | reference=sentence[self.text_col],
275 | stop_words=stop_words,
276 | prob=self.args.probability,
277 | )
278 | else:
279 | transform_sent, sent_scores = self._transform_sentence(
280 | transformer, sentence[self.text_col], stop_words=stop_words
281 | )
282 | transformed = self._update_data(sentence, transform_sent)
283 | transformed_data.extend(transformed)
284 | scores.extend(sent_scores)
285 |
286 | return transformed_data, scores
287 |
--------------------------------------------------------------------------------
/rutransform/transformations/transformers/distraction.py:
--------------------------------------------------------------------------------
1 | import random
2 | import re
3 | from typing import Any, Dict, List, Optional, Tuple
4 | from razdel import sentenize
5 |
6 | from rutransform.transformations.transformations.distraction import *
7 | from rutransform.transformations import Transformer
8 | from rutransform.constraints import Constraint
9 | from rutransform.utils.args import TransformArguments
10 | from rutransform.transformations.utils import SentenceOperation
11 |
12 |
13 | class AdditionTransformer(Transformer):
14 | """
15 | Addition transformations
16 |
17 | Generates additional sentence or words at the end of the sentence
18 |
19 |     Utilizes constraints provided by the user to extract task-specific
20 |     stopwords, to which the transformations do not apply.
21 |     Uses a similarity metric (BERTScore) to filter out transformed
22 |     sentences whose similarity score falls below a threshold (specified in
23 |     TransformArguments).
24 |
25 | Attributes
26 | ----------
27 | transformation_type: str
28 | type of the transformations supported by the transformer
29 | transformations: List[str]
30 | list of transformations to apply to data
31 | task_type: str
32 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
33 | args: TransformArguments
34 | parameters of the transformation
35 | text_col: str, optional
36 | name of the column containing text to transform (default is 'text')
37 | label_col: str, optional
38 | name of the target column (default is 'label')
39 | seed: int
40 | seed to freeze everything (default is 42)
41 | device: str
42 | device used during transformation (default is 'cpu')
43 | constraints: List[Constraint], optional
44 | list of transformation constraints (default is None)
45 | spacy_model: spacy.language.Language
46 | spacy model used for tokenization (default is 'ru_core_news_sm')
47 | transform_info:
48 |         dictionary mapping transformation names to SentenceOperation classes
49 |         (provided in utils.constants)
50 | bert_scorer: BERTScorer
51 | similarity metric class used to filter transformed texts (default is None)
52 |
53 | Methods
54 | -------
55 | @staticmethod
56 | transform_info: Dict[str, SentenceOperation]
57 |         dictionary mapping transformation names to SentenceOperation classes
58 | load_transformations()
59 | Loads all the transformations required
60 | @abstractmethod
61 | transform(sentence)
62 | Applies the transformations to input
63 | sent_split(text)
64 | Splits text into sentences
65 | @staticmethod
66 | get_ids(matches)
67 | Returns ids of stopwords
68 | _transform_text(transformer, sentences, reference, stop_words, prob)
69 | Applies the transformations to long text and filters the transformed texts
70 | _transform_multichoice(transformer, sentence, add_split)
71 | Generates new answer options for multichoice questions and
72 | filters the transformed texts
73 | _transform_sentence(self, transformer, sentence, stop_words, prob)
74 | Applies the transformations to sentence and filters the transformed sentences
75 | _list_stop_words(sentence, return_ids)
76 | Extracts stopwords matching the constraints
77 | _drop_duplicates(reference, candidates, scores)
78 | Returns transformed sentences without duplicates
79 | _filter_candidates(candidates, reference, context)
80 | Filters out sentences based on the similarity score
81 | _sample_to_max_outputs(sentences, scores)
82 | Returns the desired number of the transformed sentences
83 | _update_data(org_sentence, transformed)
84 | Updates the dataset object
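
    Examples
    --------
    A minimal usage sketch; the argument values are illustrative assumptions
    and the underlying generator model is downloaded on first use.

    >>> from rutransform.utils.args import TransformArguments
    >>> args = TransformArguments(transformation="addsent", probability=0.5)
    >>> transformer = AdditionTransformer(
    ...     transformations=["addsent"], task_type="classification", args=args
    ... )  # doctest: +SKIP
    >>> transformed, scores = transformer.transform(
    ...     {"text": "Сегодня весь день шёл дождь.", "label": 1}
    ... )  # doctest: +SKIP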
85 | """
86 |
87 | def __init__(
88 | self,
89 | transformations: List[str],
90 | task_type: str,
91 | args: TransformArguments,
92 | text_col: Optional[str] = "text",
93 | label_col: Optional[str] = "label",
94 | seed: int = 42,
95 |         device: str = "cpu",
96 |         constraints: Optional[List[Constraint]] = None,
97 | ) -> None:
98 | """
99 | Parameters
100 | ----------
101 | transformations: List[str]
102 | list of transformations to apply to data
103 | task_type: str
104 | type of the task (e.g. 'classification', 'multichoice_qa', etc.)
105 | args: TransformArguments
106 | parameters of the transformation
107 | text_col: str, optional
108 | name of the column containing text to transform (default is 'text')
109 | label_col: str, optional
110 | name of the target column (default is 'label')
111 | seed: int
112 | seed to freeze everything (default is 42)
113 | device: str
114 | device used during transformation (default is 'cpu')
115 | constraints: List[Constraint], optional
116 | list of transformation constraints (default is None)
117 | """
118 | super().__init__(
119 | transformation_type="addition",
120 | transformations=transformations,
121 | task_type=task_type,
122 | args=args,
123 | text_col=text_col,
124 | label_col=label_col,
125 | seed=seed,
126 | device=device,
127 | constraints=constraints,
128 | )
129 |
130 | self.transformers = self.load_transformations()
131 |
132 | @staticmethod
133 | def transform_info() -> Dict[str, Optional[SentenceOperation]]:
134 | """
135 | Information about the transformations used by the transformer
136 |
137 | Returns
138 | -------
139 | Dict[str, Optional[SentenceOperation]]
140 | dictionary storing transformation info
141 | """
142 | info = {
143 | "addsent": SentenceAdditions,
144 | }
145 |
146 | return info
147 |
148 | def _transform_multichoice(
149 | self,
150 | transformer: SentenceOperation,
151 | sentence: Dict[str, Any],
152 | add_split: bool = False,
153 | ) -> Tuple[List[str], List[float]]:
154 | """
155 | Generates new answer options for multichoice questions and
156 | filters the transformed texts
157 |
158 | Parameters
159 | ----------
160 | transformer: SentenceOperation
161 | transformer used for transformation
162 | sentences: dict
163 | dataset object in dict form
164 | add_split: bool
165 | whether to do additional splitting of the
166 | generated data (default is False)
167 | used to trim the generated text to create
168 | shorter sequences
169 |
170 | Returns
171 | -------
172 | Tuple[List[str], List[float]]
173 | list of transformed texts and their similarity scores
174 | """
175 | # split text into context and answer options
176 | sentences = self.sent_split(sentence[self.text_col])
177 | context = sentences.pop(0)
178 | # get answer index
179 | keys = ["A", "B", "C", "D"]
180 | answer = sentence[self.label_col]
181 | answer = keys.index(answer) if type(answer) == str else answer
182 |
183 | transform_sent = [[context]]
184 | imediate_context = [sentence.text for sentence in sentenize(context)][-1]
185 |
186 | # generate new answers
187 | change_answ = random.choice(range(len(sentences)))
188 | while change_answ == answer:
189 | change_answ = random.choice(range(len(sentences)))
190 |
191 | for s_id, sent in enumerate(sentences):
192 | if s_id == change_answ:
193 | transformed = set(transformer.generate(imediate_context))
194 | if add_split == "sent":
195 | transf_sent = []
196 | for s in transformed:
197 | s = s.replace(imediate_context, "").replace("\n", " ")
198 | split_sent = re.split(r"[\.\?!]", s)[0]
199 | if len(split_sent) > 1 and len(split_sent[0]) > 1:
200 | transf_sent.append(split_sent.strip())
201 | else:
202 | transf_sent.append(" ".join(s.split()[:5]).strip())
203 | else:
204 | transf_sent = [
205 | re.split(
206 | r"[\.\?!]",
207 | s.replace(imediate_context, "").replace("\n", " "),
208 | )[0].strip()
209 | for s in transformed
210 | ]
211 | transf_sent = self._drop_duplicates(sent, transf_sent)
212 | transform_sent.append(transf_sent)
213 | else:
214 | transform_sent.append([sent])
215 | transform_sent = self._sample_to_max_outputs(transform_sent)
216 | transform_sent, sent_scores = self._filter_candidates(
217 | transform_sent, sentence[self.text_col]
218 | )
219 |
220 | return transform_sent, sent_scores
221 |
222 | def transform_sentence(
223 | self,
224 | transformer: SentenceOperation,
225 | sentence: str,
226 | reference: str,
227 | context: Optional[str] = None,
228 | add_split: Optional[str] = None,
229 | ) -> Tuple[List[str], List[float]]:
230 | """
231 | Applies the transformations to sentence and filters the transformed sentences
232 |
233 | Parameters
234 | ----------
235 | transformer: SentenceOperation
236 | transformer used for transformation
237 | sentence: str
238 | sentence to transform
239 |         reference: str
240 | original sentence
241 | context: str, optional
242 | full context (default is None)
243 | add_split: str, optional
244 | type of additional splitting to do (default is None)
245 | - if 'word' returns first 5 words of the generated text
246 | - if 'sent' returns the first generated sentence (sequence to '.')
247 |
248 | Returns
249 | -------
250 | Tuple[List[str], List[float]]
251 | list of transformed sentences and their similarity scores
252 | """
253 | transform_sent = transformer.generate(sentence)
254 |
255 | if add_split == "sent":
256 | transformed = []
257 | for s in transform_sent:
258 | split_sent = re.split(
259 | r"[\.\?!]", s.replace(sentence, "").replace("\n", " ")
260 | )
261 | if len(split_sent[0]) < 10 and len(split_sent) > 1:
262 | transformed.append((sentence + " " + split_sent[1]).strip())
263 | else:
264 | transformed.append((sentence + " " + split_sent[0]).strip())
265 | transform_sent = transformed
266 | elif add_split == "word5":
267 | transform_sent = [
268 | sentence
269 | + " "
270 | + " ".join(
271 | s.replace(sentence, "").replace("\n", " ").split()[:5]
272 | ).strip()
273 | for s in transform_sent
274 | ]
275 | elif add_split == "word3":
276 | transform_sent = [
277 | sentence
278 | + " "
279 | + " ".join(
280 | s.replace(sentence, "").replace("\n", " ").split()[:3]
281 | ).strip()
282 | for s in transform_sent
283 | ]
284 | else:
285 | transform_sent = [
286 | sentence
287 | + " "
288 | + " ".join(s.replace(sentence, "").replace("\n", " ")[:10]).strip()
289 | for s in transform_sent
290 | ]
291 |
292 | transform_sent = self._drop_duplicates(sentence, transform_sent)
293 | transform_sent, sent_scores = self._filter_candidates(
294 | transform_sent, reference, context=context
295 | )
296 | transform_sent, sent_scores = self._sample_to_max_outputs(
297 | transform_sent, sent_scores
298 | )
299 |
300 | return transform_sent, sent_scores
301 |
302 |     def transform(self, sentence: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[float]]:
303 | """
304 |         Applies the transformations to the input
305 |
306 | Parameters
307 | ----------
308 | sentence: Dict[str, Any]
309 | dataset object in dict form
310 |
311 | Returns
312 | -------
313 |         Tuple[List[Dict[str, Any]], List[float]]
314 |             list of transformed dataset objects and their similarity scores
315 | """
316 | transformed_data = []
317 | scores = []
318 | for transform_name, transformer in self.transformers.items():
319 | if self.task_type == "multichoice_qa":
320 | transform_sent, sent_scores = self._transform_multichoice(
321 | transformer, sentence, add_split="sent"
322 | )
323 |
324 | if len(transform_sent) == 1 and sent_scores[0] == 1:
325 | transform_sent, sent_scores = self._transform_multichoice(
326 | transformer, sentence, add_split="word"
327 | )
328 |
329 | else:
330 | split_text = self.sent_split(sentence[self.text_col])
331 | if len(split_text) > 1:
332 | context, text = map(
333 | lambda x: " ".join(x), (split_text[:-2], split_text[-2:])
334 | )
335 | else:
336 | text = sentence[self.text_col]
337 | context = None
338 | transform_sent, sent_scores = self.transform_sentence(
339 | transformer,
340 | text,
341 | sentence[self.text_col],
342 | context,
343 | add_split="sent",
344 | )
345 | for split_type in ["word5", "word3", "char"]:
346 | if len(transform_sent) == 1 and sent_scores[0] == 1:
347 | transform_sent, sent_scores = self.transform_sentence(
348 | transformer,
349 | text,
350 | sentence[self.text_col],
351 | context,
352 | add_split=split_type,
353 | )
354 | else:
355 | break
356 |
357 | transformed = self._update_data(sentence, transform_sent)
358 | transformed_data.extend(transformed)
359 | scores.extend(sent_scores)
360 |
361 | return transformed_data, scores
362 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RuTransform
2 |
3 | RuTransform is a Python framework for adversarial attacks and text data augmentation for Russian.
4 |
5 | #### Table of contents
6 |
7 | - [Setup & Usage](https://github.com/RussianNLP/rutransform/#setup--usage)
8 | - [Examples](https://github.com/RussianNLP/rutransform/#examples)
9 | - Dataset Transformation on sample data
10 | - Dataset Transformation on your own data
11 | - Custom Constraints
12 | - Sentence Transformations
13 | - Custom Transformations
14 | - [Framework Design](https://github.com/RussianNLP/rutransform/#framework-design)
15 | - [Cite us](https://github.com/RussianNLP/rutransform/#cite-us)
16 |
17 | ## Setup & Usage
18 |
19 | ### Requirements
20 |
21 | Python >= 3.7
22 |
23 | ### Installation
24 |
25 | ```
26 | git clone https://github.com/RussianNLP/rutransform
27 | cd rutransform
28 | pip install .
29 | ```
30 |
31 | ### Examples
32 |
33 |
34 | #### Dataset Transformation (sample data)
35 |
36 | For the full list of supported transformations with examples see [supported transformations](https://github.com/RussianNLP/rutransform/#supported-transformations).
37 |
38 | ```
39 | import pandas as pd
40 | from rutransform.transformations import DatasetTransformer
41 | from rutransform.utils.args import TransformArguments
42 |
43 |
44 | # load data
45 | dataset = pd.read_json('test_data/worldtree.json', lines=True)
46 |
47 | # load arguments
48 | transformation = "butter_fingers"
49 | probability = 0.3
50 | args = TransformArguments(transformation=transformation, probability=probability)
51 |
52 | # init dataset transformer
53 | tr = DatasetTransformer(
54 | dataset=dataset,
55 | text_col='question',
56 | task_type='multichoice_qa',
57 | args=args,
58 | return_type='pd' # format of the resulting dataset (default is 'hf')
59 | )
60 |
61 | # run transformation
62 | output = tr.transform()
63 | ```
64 | `DatasetTransformer` outputs a named tuple with dataset similarity scores:
65 | ```
66 | score = output.score # mean dataset similarity score: 0.9146944761276246
67 | scores = output.scores # similarity scores for each sentence: array([0.93971652, 0.94295949, 0.8272841 , 0.98828816, 0.87522411])
68 | std = output.std # std of the similarity scores: 0.05663837594035781
69 | ```
70 | ...and the transformed dataset:
71 | ```
72 | transformed_data = output.transformed_dataset
73 |
74 | print('Original data:\n', dataset['question'][0])
75 | print('Transformed data:\n', transformed_data['question'][0])
76 | ```
77 | ```
78 | Original data:
79 | Когда мороженое не кладут в морозильную камеру, мороженое превращается из ___. (A) твердого тела в газ (B) газа в жидкость (C) твердого тела в жидкость (D) жидкости в газ
80 | Transformed data:
81 | Когда мороженое не кладут в морозильную камеру, мороженое превращается из ___. (A) твердого тела в газ (B) газа в жидвость (C) мвердого тела в дидкость (D) жидкости в гкз
82 | ```
83 |
84 |
85 |
86 |
87 | #### Dataset Transformation (own data)
88 |
89 |
90 | RuTransform can easily be adapted to other tasks. To use the framework on your own data, simply specify the text (`text_col`) and/or target (`label_col`) column names and choose suitable constraints (pass them into the `custom_constraints` argument). For example, to run a transformation on the [DaNetQA](https://russiansuperglue.com/tasks/task_info/DaNetQA) data [(Shavrina et al., 2020)](https://aclanthology.org/2020.emnlp-main.381/) we choose to perturb the `passage` text and use the `NamedEntities` constraint to preserve proper nouns:
91 |
92 | ```
93 | import pandas as pd
94 | from rutransform.transformations import DatasetTransformer
95 | from rutransform.utils.args import TransformArguments
96 | from rutransform.constraints import NamedEntities
97 |
98 | # load data
99 | dataset = pd.read_json('test_data/danet_qa.json', lines=True)
100 |
101 | # init arguments
102 | transformation = "back_translation"
103 | probability = 0.5
104 | args = TransformArguments(transformation=transformation, probability=probability)
105 |
106 | # init dataset transformer
107 | tr = DatasetTransformer(
108 | dataset=dataset,
109 | text_col='passage',
110 | args=args,
111 | return_type='pd',
112 | custom_constraints=[NamedEntities()],
113 | device='cuda:0'
114 | )
115 |
116 | # run transformation
117 | output = tr.transform()
118 |
119 | print('Original data:\n', dataset['passage'][4],)
120 | print('Transformed data:\n', output.transformed_dataset['passage'][4])
121 | ```
122 | ```
123 | Original data:
124 | Средство коммуникации. В своей простейшей форме искусство представляет собой средство коммуникации. Как и большинство прочих способов коммуникации, оно несет в себе намерение передать информацию аудитории. Например, научная иллюстрация — тоже форма искусства, существующая для передачи информации. Ещё один пример такого рода — географические карты. Однако содержание послания не обязательно бывает научным. Искусство позволяет передавать не только объективную информацию, но и эмоции, настроение, чувства.
125 | Transformed data:
126 | Средство коммуникации. В своей простой форме искусство является средством общения. Как и большинство других средств коммуникации, она намерена доводить информацию до сведения аудитории. Например, научная иллюстрация — тоже форма искусства, существующая для передачи информации. Ещё один пример такого рода — географические карты. Однако содержание послания не обязательно бывает научным. Искусство позволяет передавать не только объективную информацию, но и эмоции, настроение, чувства.
127 | ```
128 |
129 |
130 |
131 | #### Custom Constraints
132 |
133 | If the provided constraints are not enough, you can create your own by simple class inheritance. For example, to run a transformation on the [RWSD](https://russiansuperglue.com/tasks/task_info/RWSD) dataset [(Shavrina et al., 2020)](https://aclanthology.org/2020.emnlp-main.381/), we create an `RWSDConstraint`:
134 |
135 | ```
136 | from rutransform.constraints import Constraint
137 | from rutransform.constraints.utils import parse_reference
138 | from typing import List, Optional
139 | from spacy.language import Language
140 |
141 | class RWSDConstraint(Constraint):
142 | def __init__(self, target_col_name: str, reference_key: str, noun_key: str) -> None:
143 | super().__init__(name='rwsd_constraint')
144 | self.target_col_name = target_col_name
145 | self.reference_key = reference_key
146 | self.noun_key = noun_key
147 |
148 | def patterns(
149 | self, text: Optional[dict], spacy_model: Optional[Language]
150 | ) -> List[List[dict]]:
151 | morph = parse_reference(text[self.target_col_name][self.noun_key], spacy_model)
152 | antecedent_feats = list(morph.values())
153 | patterns = [
154 | [{"TEXT": {"IN": text[self.target_col_name][self.reference_key].split() + text[self.target_col_name][self.noun_key].split()}}],
155 | [{"POS": {"IN": ["NOUN", 'PROPN']}, "MORPH": {"IS_SUPERSET": antecedent_feats}}],
156 | ]
157 | return patterns
158 | ```
159 | To use custom constraints during the transformation, pass them into the `custom_constraints` argument:
160 | ```
161 | import pandas as pd
162 | from rutransform.transformations import DatasetTransformer
163 | from rutransform.utils.args import TransformArguments
164 |
165 | # load data
166 | dataset = pd.read_json('test_data/rwsd.json', lines=True)
167 |
168 | # load arguments
169 | transformation = "eda"
170 | probability = 0.5
171 | args = TransformArguments(transformation=transformation, probability=probability)
172 |
173 | # init dataset transformer
174 | tr = DatasetTransformer(
175 | dataset=dataset,
176 | text_col='text',
177 | args=args,
178 | custom_constraints=[
179 | RWSDConstraint(
180 | target_col_name='target', reference_key='span2_text', noun_key='span1_text'
181 | )
182 | ],
183 | return_type='pd' # format of the resulting dataset (default is 'hf')
184 | )
185 |
186 | # run transformation
187 | output = tr.transform()
188 |
189 | print('Target:', dataset['target'][0])
190 | print('Original data:\n', dataset['text'][0],)
191 | print('Transformed data:\n', output.transformed_dataset['text'][0])
192 | ```
193 | ```
194 | Target: {'span1_text': 'статью', 'span2_text': 'читает ее', 'span1_index': 7, 'span2_index': 9}
195 | Original data:
196 | Сара взяла в библиотеке книгу, чтобы написать статью. Она читает ее, когда приходит с работы.
197 | Transformed data:
198 | Сара книгу , чтобы написать статью Она читает ее с работы .
199 | ```
200 |
201 |
202 |
203 |
204 | #### Sentence Transformation
205 |
206 | All of the transformations supported by the framework can be applied not only to whole datasets, but also to individual sentences.
207 |
208 | ```
209 | from rutransform.transformations import (
210 | SentenceAdditions,
211 | ButterFingersTransformation,
212 | EmojifyTransformation,
213 | ChangeCharCase,
214 | BackTranslationNER,
215 | Paraphraser,
216 | RandomEDA,
217 | BAE
218 | )
219 | from rutransform.utils.args import TransformArguments
220 | # initialize the transformation arguments (the transformation name can be left out here)
221 | args = TransformArguments(probability=0.5)
222 |
223 | # transform the sentence
224 | tr = SentenceAdditions(args=args)
225 | tr.generate('мама мыла раму')
226 | ```
227 |
228 | ```
229 | ['мама мыла раму, Мама мыла раму,']
230 | ```
231 |
232 | ```
233 | tr = ButterFingersTransformation(args=args,)
234 | tr.generate('мама мыла раму')
235 | ```
236 | ```
237 | ['ммаа мырв ламу']
238 | ```
239 |
240 |
241 |
242 |
243 | #### Custom Transformation
244 |
245 | RuTransform allows you to create your own custom transformations. Here is an example of a simple transformation that randomises word order.
246 |
247 |
248 | First, you need to define a `SentenceOperation` subclass for the transformation, which implements the `__init__` and `generate` functions.
249 |
250 | Note that the function arguments must stay unchanged for compatibility with the framework. We also define a separate function for the transformation itself to keep the code more readable.
251 |
252 | ```
253 | import random
254 | import spacy
255 | from rutransform.transformations.utils import SentenceOperation
256 | from typing import Optional, List, Union, Dict
257 |
258 |
259 | def random_word_order(sentence, spacy_model, seed, max_outputs):
260 |
261 | """
262 | Randomise word order
263 | """
264 |
265 | random.seed(seed)
266 |
267 | if not spacy_model:
268 | spacy_model = spacy.load('ru_core_news_sm')
269 |
270 | tokens = [token.text for token in spacy_model(sentence)]
271 |
272 | return [' '.join(random.sample(tokens, k=len(tokens))) for _ in range(max_outputs)]
273 |
274 |
275 | class RandomWordOrder(SentenceOperation):
276 | def __init__(
277 | self, args, seed=42,
278 | max_outputs=1, device="cpu",
279 | spacy_model=None,
280 | ):
281 | super().__init__(
282 | args=args,
283 | seed=seed,
284 | max_outputs=max_outputs,
285 | device=device,
286 | spacy_model=spacy_model,
287 | )
288 |
289 | def generate(
290 | self,
291 | sentence: str,
292 | stop_words: Optional[List[Union[int, str]]] = None,
293 | prob: Optional[float] = None,
294 | ) -> List[str]:
295 |
296 | transformed = random_word_order(
297 | sentence=sentence,
298 | seed=self.seed,
299 | spacy_model=self.spacy_model,
300 | max_outputs=self.max_outputs
301 | )
302 |
303 | return transformed
304 |
305 | ```
306 |
307 | Now the transformation is ready to use on the sentence level:
308 |
309 | ```
310 | from rutransform.utils.args import TransformArguments
311 |
312 | args = TransformArguments()
313 | tr = RandomWordOrder(args=args, max_outputs=5)
314 | tr.generate("мама мыла раму")
315 |
316 | ```
317 |
318 | ```
319 | ['раму мама мыла',
320 | 'раму мыла мама',
321 | 'мама раму мыла',
322 | 'раму мама мыла',
323 | 'мама раму мыла']
324 | ```
325 |
326 | After creating the transformation, you can add it to an existing Transformer by simply inheriting the class and changing the `transform_info` function:
327 |
328 | ```
329 | from rutransform.transformations import EDATransformer
330 |
331 |
332 | class EDATransformer(EDATransformer):
333 | def __init__(
334 | self,
335 | transformations: List[str],
336 | task_type: str,
337 | args: TransformArguments,
338 | text_col: Optional[str] = "text",
339 | label_col: Optional[str] = "label",
340 | seed: int = 42,
341 | device: str = "cpu",
342 | constraints=None,
343 | ) -> None:
344 |
345 | super().__init__(
346 | transformations=transformations,
347 | task_type=task_type,
348 | args=args,
349 | text_col=text_col,
350 | label_col=label_col,
351 | seed=seed,
352 | device=device,
353 | constraints=constraints
354 | )
355 |
356 |     @staticmethod
357 |     def transform_info() -> Dict[str, Optional[SentenceOperation]]:
358 | info = {"eda": RandomEDA, "word_order": RandomWordOrder}
359 |
360 | return info
361 | ```
362 |
363 | ...or create a Transformer from scratch by inheriting the `Transformer` class and defining several functions:
364 |
365 | - `transform_info`: a staticmethod, must return a dictionary {transformation name: corresponding SentenceOperation class}. It is used to load the list of all the available transformations
366 | - `_apply_transformation`: a function that applies the transformations to text until the transformed text passes the similarity threshold and returns a list of transformed texts and their similarity scores
367 | - `transform` (optional): a function that takes a sentence as input and transforms it
368 |
369 | For more information on the `Transformer` class and its structure see [here](rutransform/transformations/transformer.py).
370 |
371 |
372 | Once you have created the Transformer, add it to the [rutransform/transformations/transformers](rutransform/transformations/transformers) folder and edit the [`__init__.py`](rutransform/transformations/__init__.py) file.
373 |
374 | Now your transformation is ready for use!
375 |
376 |
377 | ## Framework Design
378 |
379 | ### Supported Transformations
380 |
381 | Following the generally accepted typology ([Zhang et al., 2020](https://arxiv.org/pdf/1901.06796.pdf); [Wang et al., 2021b](https://aclanthology.org/2022.naacl-main.339/)), we divide the transformations included in the framework into two types, depending on their target.
382 |
383 | #### Word-Level Transformations
384 |
385 | Word-level perturbations utilize several strategies to perturb tokens, ranging from imitation of typos to synonym replacement:
386 |
387 | Type | Transformation | Paper/Source | Main Idea |Example |
388 | :-------|:------------------------|:-------------|:----------|:------------------------------------------------------------------|
389 | Spelling|ButterFingers (`butter_fingers`) | [(Dhole, 2021)](https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/butter_fingers_perturbation) | Adds noise to data by mimicking spelling mistakes made by humans through character swaps based on their keyboard distance | This is a se**m**tence **r**o test t**j**e code |
390 | Spelling|Case (`case`) | [(Z. Wang, 2021)](https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/change_char_case) | Adds noise to data through case alteration | This is a sentence to tes**T** t**H**e c**OD**e |
391 | Modality|Emojify (`emojify`) | [(Wang, 2021)](https://github.com/GEM-benchmark/NL-Augmenter/tree/main/nlaugmenter/transformations/emojify)| Replaces the input words with the corresponding emojis, preserving their original meaning | This is a sentence to test the 👩💻 |
392 | Context |BAE (`bae`)| [(Garg and Ramakrishnan, 2020)](https://arxiv.org/abs/2004.01970) | Inserts and replaces tokens in the original text by masking a portion of the text and using masked LMs to generate contextually appropriate words | This is a sentence to test the **given** code |
393 |
394 | #### Sentence-Level Transformations
395 | In contrast to word-level perturbations, sentence-level perturbation techniques affect the syntactic structure:
396 |
397 | Type |Transformation | Paper/Source | Main Idea |Example |
398 | :----------|:-----------------------|:-------------|:----------|:------------------------------------------------------------------|
399 | Random |EDADELETE (`eda`)| [(Wei and Zou, 2019)](https://aclanthology.org/D19-1670.pdf) | Randomly deletes tokens in the text | This **_** a sentence to test the code |
400 | Random |EDASWAP (`eda`) | [(Wei and Zou, 2019)](https://aclanthology.org/D19-1670.pdf) | Randomly swaps tokens in the text | **code** is a sentence to test the **This** |
401 | Paraphrasis|Paraphraser (`paraphraser`) | [(Fenogenova, 2021)](http://bsnlp.cs.helsinki.fi/papers-2021/2021.bsnlp-1.2.pdf) | Generates variations of the context via simple paraphrasing | **I am just testing the code** |
402 | Paraphrasis|BackTranslation (`back_translation`) | [(Yaseen and Langer, 2021)](https://arxiv.org/abs/2108.11703) | Generates variations of the context through back-translation (ru -> en -> ru) |**This sentence tests the code** |
403 | Distraction|AddSent (`add_sent`) | [(Jia and Liang, 2017)](https://aclanthology.org/D17-1215/) | Generates extra words or a sentence at the end of the text. For multiple choice QA tasks it replaces one or more choice options with a generated one | This is a sentence to test the code **, if you want to delete it** |
404 |
405 | The examples are given in English for illustration purposes.
406 |
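For instance, the identifiers in parentheses above are the values accepted by the `transformation` argument of `TransformArguments`, and each transformation can also be called directly through its sentence-level class, mirroring the usage shown in the Examples section. A small sketch (the generated outputs will vary between runs):

```
from rutransform.utils.args import TransformArguments
from rutransform.transformations import EmojifyTransformation, ChangeCharCase

args = TransformArguments(probability=0.5)

# word-level perturbations applied to a single sentence
print(EmojifyTransformation(args=args).generate('мама мыла раму'))
print(ChangeCharCase(args=args).generate('мама мыла раму'))
```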
407 | ### Probability thresholds
408 |
409 | The degree of input modification is controlled by the adversarial probability threshold, which serves as a hyperparameter: the higher the probability, the more the input gets modified.
410 |
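As a rough illustration (a minimal sketch reusing the sample data from the examples above; exact scores will differ between runs), raising `probability` typically lowers the mean similarity score reported by `DatasetTransformer`:

```
import pandas as pd
from rutransform.transformations import DatasetTransformer
from rutransform.utils.args import TransformArguments

dataset = pd.read_json('test_data/worldtree.json', lines=True)

# the same attack at two adversarial probability thresholds
for probability in (0.1, 0.7):
    args = TransformArguments(transformation="butter_fingers", probability=probability)
    tr = DatasetTransformer(
        dataset=dataset,
        text_col='question',
        task_type='multichoice_qa',
        args=args,
        return_type='pd',
    )
    output = tr.transform()
    # higher probability -> stronger perturbation -> lower mean similarity, as a rule
    print(probability, output.score)
```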
411 | ### Constraints
412 |
413 | RuTransform's attacks and perturbations are designed not to change the input's meaning drastically. In addition, the framework supports rule-based constraints that keep the linguistic structure and task-specific aspects unchanged. For instance, it is crucial to leave named entities in QA tasks untouched and not to modify the syntactic structure and anaphors when perturbing coreference resolution examples.
414 |
415 | Name| Description | Additional Requirements | Example |
416 | :---|:------------|:------------------------|:--------|
417 | `Jeopardy` | Jeopardy-type constraints, including (1) Noun Phrases such as THIS FILM, THIS ACTOR, both UPPER and lower cased, (2) 'X', (3) «Named Entity in parentheses» | - | For the first time, **THIS soda** appeared in 1958 in Spain, the name of the drink is translated from the Esperanto language as **“amazing”**.|
418 | `NamedEntities`|Matches all the named entities in text| - |The singer from **Turkey** who impressed us all.|
419 | `Multihop`| Constraints for multihop QA tasks. Matches all the bridge and main answers important for hops | - | `Question:` Where is the source of the river, the tributary of which is the Getar, located? `Supporting Text:` The **Getar** is a river in Armenia. It originates in the Kotayk region, flows through the central part of Yerevan and flows into **the Hrazdan**. `Main Text:` **The Hrazdan**, a river in Armenia, is the left tributary of the Aras. It originates at the northwest extremity of Lake **Sevan**, near the city of **Sevan**. `Answer:` Sevan |
420 | `Referents` | Constraints for coreference resolution tasks. Matches (1) the anaphoric pronoun, (2) all possible antecedents, and (3) all verbs referring to the antecedents and the anaphor | Markup of the possible antecedents and anaphors | The **singer** from **Turkey** **who** **impressed** us all.|
421 |
422 | ### Semantic filtering
423 |
424 | We follow [Wang et al., 2021](https://arxiv.org/abs/2111.02840) in filtering the adversarial examples with BERTScore [(Zhang et al., 2019)](https://arxiv.org/abs/1904.09675), a BERT-based text similarity metric [(Devlin et al., 2019)](https://aclanthology.org/N19-1423.pdf). We measure the semantic similarity between the original input and the adversarial output and keep the examples with the highest similarity score. When the score is lower than a specified threshold, we iteratively decrease the adversarial probability threshold and re-score the new adversarial examples.
425 |
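As an illustration of the filtering idea only, here is a minimal sketch using the standalone `bert_score` package (`pip install bert-score`); RuTransform's internal scorer, model choice and thresholds may differ, and the threshold below is purely hypothetical:

```
from bert_score import score

original = 'мама мыла раму'
candidates = ['ммаа мырв ламу', 'мама мыла раму, Мама мыла раму,']

# BERTScore F1 between each adversarial candidate and the original input
_, _, f1 = score(candidates, [original] * len(candidates), lang='ru')

# keep only candidates above a (hypothetical) similarity threshold
threshold = 0.8
kept = [c for c, s in zip(candidates, f1.tolist()) if s >= threshold]
print(kept)
```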
426 |
427 | ## Cite us
428 |
429 | ```
430 | @article{taktasheva2022tape,
431 | title={TAPE: Assessing Few-shot Russian Language Understanding},
432 | author={Taktasheva, Ekaterina and Shavrina, Tatiana and Fenogenova, Alena and Shevelev, Denis and Katricheva, Nadezhda and Tikhonova, Maria and Akhmetgareeva, Albina and Zinkevich, Oleg and Bashmakova, Anastasiia and Iordanskaia, Svetlana and others},
433 | journal={arXiv preprint arXiv:2210.12813},
434 | year={2022}
435 | }
436 | ```
437 |
438 | ## License
439 |
440 | All the code is available under the Apache 2.0 license.
441 |
--------------------------------------------------------------------------------