├── HyCxG
│   ├── DataProcessor
│   │   ├── FoldWrapper.py
│   │   ├── HyperDataset.py
│   │   └── __init__.py
│   ├── Model
│   │   ├── HyperCxG.py
│   │   ├── HyperGraphATT.py
│   │   ├── Layer
│   │   │   ├── LayerNorm.py
│   │   │   ├── Linear.py
│   │   │   ├── SPHyperGraphLayer.py
│   │   │   ├── __init__.py
│   │   │   └── activate_fn.py
│   │   ├── __init__.py
│   │   └── lm.py
│   ├── README.md
│   ├── README_ZH.md
│   ├── Simuann
│   │   ├── CxGCoverage.py
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── SimuAnneal.py
│   │   └── __init__.py
│   ├── Tokenizer
│   │   ├── BaseTokenizer.py
│   │   ├── CxGProcessor
│   │   │   ├── CxGCore.py
│   │   │   ├── Encoder.py
│   │   │   ├── Loader.py
│   │   │   ├── Parser.py
│   │   │   ├── __init__.py
│   │   │   ├── rdrpos_tagger
│   │   │   │   ├── !READ_ME_ANNOTATE.txt
│   │   │   │   ├── InitialTagger
│   │   │   │   │   ├── InitialTagger.py
│   │   │   │   │   ├── InitialTagger4En.py
│   │   │   │   │   └── InitialTagger4Vn.py
│   │   │   │   ├── SCRDRlearner
│   │   │   │   │   ├── Node.py
│   │   │   │   │   ├── Object.py
│   │   │   │   │   ├── SCRDRTree.py
│   │   │   │   │   └── SCRDRTreeLearner.py
│   │   │   │   ├── Utility
│   │   │   │   │   ├── Config.py
│   │   │   │   │   ├── Eval.py
│   │   │   │   │   ├── LexiconCreator.py
│   │   │   │   │   └── Utils.py
│   │   │   │   └── pSCRDRtagger
│   │   │   │       ├── ExtRDRPOSTagger.py
│   │   │   │       └── RDRPOSTagger.py
│   │   │   └── utils.py
│   │   ├── ModelTokenizer.py
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── Vocab.py
│   │   ├── __init__.py
│   │   ├── constants.py
│   │   └── download_cxgdict.sh
│   ├── Trainer
│   │   ├── HyCxGTrainerABSA.py
│   │   ├── HyCxGTrainerGLUE.py
│   │   ├── Trainer.py
│   │   └── __init__.py
│   ├── config.py
│   ├── dataset
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   └── download_vocab.sh
│   ├── run_hycxg.sh
│   ├── train_hycxg.py
│   └── utils
│       ├── __init__.py
│       ├── argument.py
│       ├── coverage.py
│       ├── data.py
│       ├── define.py
│       ├── hypergraph.py
│       ├── metric.py
│       ├── misc.py
│       ├── operates.py
│       └── optimizers.py
├── LICENSE
├── README.md
├── README_ZH.md
├── data
│   ├── ABSA
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── download_and_process_absa.sh
│   │   └── process_absa.py
│   ├── Colloquial
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── baseline
│   │   │   ├── DGEDT_germeval_gengraph.py
│   │   │   ├── DualGCN_germeval_txt2json.py
│   │   │   ├── KumaGCN_germeval_gengraph.py
│   │   │   └── RGAT_germeval_txt2json.py
│   │   ├── download_and_process_colloquial.sh
│   │   ├── process_germeval.py
│   │   └── process_twitter.py
│   ├── Counterfactual
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── download_and_process_counterfactual.sh
│   │   └── process_counterfactual.py
│   ├── GLUE
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── download_and_process_glue.py
│   │   └── download_and_process_glue.sh
│   ├── Multilingual
│   │   ├── README.md
│   │   ├── README_ZH.md
│   │   ├── baseline
│   │   │   ├── DGEDT_french_dutch_spanish.py
│   │   │   ├── DGEDT_turkish.py
│   │   │   ├── DualGCN_french_dutch_spanish.py
│   │   │   ├── DualGCN_turkish.py
│   │   │   ├── KumaGCN_french_dutch_spanish.py
│   │   │   ├── KumaGCN_turkish.py
│   │   │   ├── RGAT_french_dutch_spanish.py
│   │   │   └── RGAT_turkish.py
│   │   ├── download_and_process_multilingual.sh
│   │   └── process_multilingual.py
│   ├── README.md
│   ├── README_ZH.md
│   ├── data_pipeline.sh
│   └── download_stanfordcore.py
├── figures
│   ├── hycxg-logo.png
│   ├── main-logo.png
│   └── sub-logo.png
├── guidelines
│   ├── README.md
│   └── README_ZH.md
├── requirements.txt
└── tutorials
    ├── 01_cxgtokenizer_tutorial.py
    ├── 02_coverage_solver_tutorial.py
    ├── 03_hypergraph_tutorial.py
    ├── PaperLists.md
    ├── README.md
    └── README_ZH.md
/HyCxG/DataProcessor/FoldWrapper.py:
--------------------------------------------------------------------------------
1 | from argparse import Namespace
2 | from DataProcessor.HyperDataset import HyperDataLM
3 | from copy import deepcopy
4 | from sklearn.model_selection import KFold
5 |
6 | # Note: This module is only utilized for the Counterfactual task
7 | class KFoldWrapper(HyperDataLM):
8 | def __init__(self, args: Namespace, set_name: str, desc: str = 'train', num_workers: int = 1, debug=False):
9 | super(KFoldWrapper, self).__init__(args, set_name, desc, num_workers, debug)
10 | self.curfold_items, self.curfold_labels = deepcopy(self.items), deepcopy(self.labels)
11 | kfold_splitter = KFold(n_splits=args.kfold)
12 | self.grtrain_ids, self.grvalid_ids = self.calculate_ids(kfold_splitter)
13 |
14 | def calculate_ids(self, splitter):
15 | train_ids, valid_ids = [], []
16 | for train_index, valid_index in splitter.split(self.curfold_items):
17 | train_ids.append(train_index)
18 | valid_ids.append(valid_index)
19 | return train_ids, valid_ids
20 |
21 | def set_valid(self):
22 | self.desc = 'valid'
23 |
24 | def set_group(self, index : int):
25 | if index >= self.args.kfold: raise Exception('Error in setting `index`, `index` needs to be lower than %d' % self.args.kfold)
26 | if self.desc == 'train': inds = self.grtrain_ids[index]
27 | elif self.desc == 'valid': inds = self.grvalid_ids[index]
28 | else: raise Exception('Error in setting `desc` mode, you can only choose [`train`, `valid`]')
29 | self.items, self.labels = [self.curfold_items[idx] for idx in inds], [self.curfold_labels[idx] for idx in inds]
30 | print('>> Kfold set the group to %d for %s set, total %d instances.' % (index, self.desc, len(self)))
--------------------------------------------------------------------------------
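For reference, a minimal, self-contained sketch of what `calculate_ids` computes, using toy items in place of the repository's dataset (only `scikit-learn` is needed):

```python
from sklearn.model_selection import KFold

items = [f"sentence-{i}" for i in range(10)]   # toy stand-in for self.curfold_items
splitter = KFold(n_splits=5)                   # corresponds to args.kfold above

train_ids, valid_ids = [], []
for train_index, valid_index in splitter.split(items):
    train_ids.append(train_index)              # one array of train indices per fold
    valid_ids.append(valid_index)              # one array of valid indices per fold

# set_group(0) would then slice items/labels with train_ids[0] or valid_ids[0]
print(train_ids[0], valid_ids[0])
```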
/HyCxG/DataProcessor/__init__.py:
--------------------------------------------------------------------------------
1 | from DataProcessor.HyperDataset import HyperDataLM
2 | from DataProcessor.FoldWrapper import KFoldWrapper
3 |
4 | __all__ = ['HyperDataLM', 'KFoldWrapper']
--------------------------------------------------------------------------------
/HyCxG/Model/HyperCxG.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from argparse import Namespace
4 | from Model.lm import LM
5 | from Model.HyperGraphATT import RHGAT
6 | from Model.Layer.Linear import Linear
7 |
8 | class HyperCxG(nn.Module):
9 | def __init__(self, args : Namespace, device : torch.device):
10 | super(HyperCxG, self).__init__()
11 | self.device = device
12 | # LM model
13 | self.lm = LM(args, device, use_encoder=True, pooler_output=False)
14 | self.lm_dropout = nn.Dropout(args.lm_dropout)
15 | # Edge embedding
16 | self.edgemb = nn.Embedding(args.cxg_vocab_size, args.lm_hidden_size, padding_idx=0)
17 | # Relational hyper-graph attention network
18 | self.hgatt = RHGAT(args, device, args.lm_hidden_size, args.inter_size, args.lm_hidden_size, args.hg_dropout, args.leaky_alpha, args.edge_trans, args.remove_layernorm)
19 | # Classifier
20 | self.classifier = Linear(args.lm_hidden_size, args.num_classes)
21 | self.do_squeeze = args.num_classes == 1 # Combine for this repo
22 |
23 | def forward(self, input: torch.Tensor, attention_mask: torch.Tensor, HT: torch.Tensor, edges: torch.Tensor, adj_matrix: torch.Tensor, node_mask: torch.Tensor, asp_masks: torch.Tensor):
24 | # adj_matrix and asp_masks are not available in this repo
25 | # Encoder
26 | encoded = self.lm(input, attention_mask = attention_mask)
27 | encoded = self.lm_dropout(encoded)
28 | edge_emb = self.edgemb(edges)
29 | # RHGAT
30 | hidden = self.hgatt(encoded, HT, edge_emb)
31 | # Pooling
32 | node_wn = node_mask.sum(dim=1).unsqueeze(-1)
33 | mask = node_mask.unsqueeze(-1).repeat(1, 1, hidden.shape[-1])
34 | final = (hidden * mask).sum(dim=1) / node_wn
35 | outputs = self.classifier(final)
36 | if self.do_squeeze: outputs = outputs.squeeze(-1) # Combine for this repo
37 | return outputs
--------------------------------------------------------------------------------
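The pooling step in `forward` above is a masked mean over node states; a minimal tensor-level sketch with random inputs (shapes are illustrative only):

```python
import torch

hidden = torch.randn(2, 6, 8)                          # (batch, nodes, hidden)
node_mask = torch.tensor([[1., 1., 1., 0., 0., 0.],
                          [1., 1., 1., 1., 0., 0.]])   # 1 = real token node

node_wn = node_mask.sum(dim=1).unsqueeze(-1)           # number of valid nodes per sample
mask = node_mask.unsqueeze(-1).repeat(1, 1, hidden.shape[-1])
pooled = (hidden * mask).sum(dim=1) / node_wn          # (batch, hidden)
print(pooled.shape)                                    # torch.Size([2, 8])
```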
/HyCxG/Model/HyperGraphATT.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from Model.Layer import HGAL
3 | from Model.Layer import LayerNorm
4 |
5 | class RHGAT(nn.Module):
6 | def __init__(self, args, device, input_size, inter_size, output_size, dropout=0.3, alpha=0.2, edge_trans=False, remove_layernorm=False):
7 | super(RHGAT, self).__init__()
8 | self.hgat = HGAL(args, device, input_size, input_size, dropout=dropout, do_scale=True, edge_trans=edge_trans, remove_layernorm=remove_layernorm)
9 | # self.hgat = HGAL_MH(args, device, input_size, input_size, dropout=dropout, do_scale=True, edge_trans=edge_trans, remove_layernorm=remove_layernorm) - Not available in this REPO
10 | self.dropout_1 = nn.Dropout(dropout)
11 | self.dropout_2 = nn.Dropout(dropout)
12 | # FFN + Rs
13 | self.leakyrelu = nn.LeakyReLU(alpha)
14 | self.linear_1 = nn.Linear(input_size, inter_size, bias=True)
15 | self.linear_2 = nn.Linear(inter_size, output_size, bias=True)
16 | self.layer_norm_1 = LayerNorm(args, device, input_size)
17 | self.layer_norm_2 = LayerNorm(args, device, input_size)
18 |
19 | def forward(self, hidden, HT, edge_emb):
20 | inter = self.hgat(hidden, HT, edge_emb)
21 | inter = self.dropout_1(inter)
22 | inter = self.layer_norm_1(inter + hidden)
23 | output = self.dropout_2(self.linear_2(self.leakyrelu(self.linear_1(inter))))
24 | output = self.layer_norm_2(output + inter)
25 | return output
--------------------------------------------------------------------------------
/HyCxG/Model/Layer/LayerNorm.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | import torch
3 | from argparse import Namespace
4 |
5 | class LayerNorm(nn.Module):
6 | def __init__(self, args : Namespace, device : torch.device, hidden_size : int = 768, eps : float =1e-6):
7 | super(LayerNorm, self).__init__()
8 | self.eps = eps
9 | self.args = args
10 | self.device = device
11 | self.gamma = nn.Parameter(torch.ones(hidden_size))
12 | self.beta = nn.Parameter(torch.zeros(hidden_size))
13 |
14 | def forward(self, x):
15 | mean = x.mean(-1, keepdim=True)
16 | std = x.std(-1, keepdim=True)
17 | hidden_states = self.gamma * (x-mean) / (std + self.eps)
18 | return hidden_states + self.beta
--------------------------------------------------------------------------------
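A quick numerical check of the normalization above; note that it divides by `std + eps` rather than `sqrt(var + eps)` as `torch.nn.LayerNorm` does (assumes the repository root is on `PYTHONPATH`):

```python
import torch
from Model.Layer.LayerNorm import LayerNorm

ln = LayerNorm(args=None, device=None, hidden_size=4, eps=1e-6)
x = torch.randn(2, 3, 4)

# At initialization gamma is all ones and beta all zeros, so the output is
# simply the standardized input along the last dimension.
manual = (x - x.mean(-1, keepdim=True)) / (x.std(-1, keepdim=True) + 1e-6)
print(torch.allclose(ln(x), manual))   # True
```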
/HyCxG/Model/Layer/Linear.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 | class Linear(nn.Module):
4 | def __init__(self, Text_InFeature, Text_OutFeature):
5 | super(Linear, self).__init__()
6 | self.linear = nn.Linear(in_features=Text_InFeature, out_features=Text_OutFeature)
7 | self.init_params()
8 |
9 | def init_params(self):
10 | nn.init.kaiming_normal_(self.linear.weight)
11 | nn.init.constant_(self.linear.bias, 0)
12 |
13 | def forward(self, x):
14 | x = self.linear(x)
15 | return x
--------------------------------------------------------------------------------
/HyCxG/Model/Layer/SPHyperGraphLayer.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.nn.parameter import Parameter
6 | from Model.Layer.LayerNorm import LayerNorm
7 |
8 | INF_SUB_NUM = -9e15
9 |
10 | # Simplified version
11 | class HyperGraphAttentionLayer(nn.Module):
12 | def __init__(self, args, device, input_size, output_size, dropout, do_scale=True, edge_trans=False, remove_layernorm=False):
13 | super(HyperGraphAttentionLayer, self).__init__()
14 | self.input_size = input_size
15 | self.output_size = output_size
16 | self.scale = do_scale
17 | self.layernorm = not remove_layernorm
18 | if self.layernorm: self.lnorm = LayerNorm(args, device, input_size)
19 | self.wnk = Parameter(torch.Tensor(self.input_size, self.output_size))
20 | self.wek = Parameter(torch.Tensor(self.output_size, self.output_size))
21 | if edge_trans: self.w_edge = Parameter(torch.Tensor(self.input_size, self.input_size))
22 | else: self.register_parameter('w_edge', None)
23 | self.dropout_emb = nn.Dropout(dropout)
24 | self.dropout = nn.Dropout(dropout)
25 | self.reset_parameters()
26 |
27 | def reset_parameters(self):
28 | stdv = 1. / math.sqrt(self.output_size)
29 | self.wnk.data.uniform_(-stdv, stdv)
30 | self.wek.data.uniform_(-stdv, stdv)
31 | if self.w_edge is not None: self.w_edge.data.uniform_(-stdv, stdv)
32 |
33 | def forward(self, hidden, ht, edge_emb):
34 | if self.layernorm: edge_emb = self.lnorm(edge_emb)
35 | edge_emb = self.dropout_emb(edge_emb)
36 | node_k = hidden.matmul(self.wnk)
37 | if self.w_edge is not None: edge_q = torch.matmul(edge_emb, self.w_edge)
38 | else: edge_q = edge_emb
39 | edge_attnscores = torch.matmul(edge_q, node_k.permute(0, 2, 1))
40 | if self.scale: edge_attnscores = edge_attnscores * (1 / (self.input_size ** (1/2)))
41 | zero_vec = INF_SUB_NUM * torch.ones_like(edge_attnscores)
42 | edge_attnscores = torch.where(ht > 0, edge_attnscores, zero_vec)
43 | attention_edge = F.softmax(edge_attnscores, dim=2)
44 | edge_h = torch.matmul(attention_edge, hidden)
45 | edge_h = self.dropout(edge_h)
46 | edge_h = edge_h + edge_emb
47 | edge_k = edge_h.matmul(self.wek)
48 | node_q = node_k
49 | node_attnscores = torch.matmul(node_q, edge_k.permute(0, 2, 1))
50 | if self.scale: node_attnscores = node_attnscores * (1 / (self.input_size ** (1/2)))
51 | zero_vec = INF_SUB_NUM * torch.ones_like(node_attnscores)
52 | node_attnscores = torch.where(ht.permute(0, 2, 1) > 0, node_attnscores, zero_vec)
53 | attention_node = F.softmax(node_attnscores, dim=1)
54 | node_hidden = torch.matmul(attention_node, edge_h)
55 | return node_hidden
--------------------------------------------------------------------------------
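An illustrative forward pass through the layer with random tensors, to make the expected shapes explicit; the incidence matrix `ht` is hyperedges by nodes, and the repository root is assumed to be on `PYTHONPATH`:

```python
import torch
from Model.Layer.SPHyperGraphLayer import HyperGraphAttentionLayer

B, N, E, d = 2, 5, 3, 16                  # batch, token nodes, hyperedges, hidden size
layer = HyperGraphAttentionLayer(args=None, device=None, input_size=d, output_size=d,
                                 dropout=0.1, do_scale=True, edge_trans=True)

hidden = torch.randn(B, N, d)             # node (token) representations
edge_emb = torch.randn(B, E, d)           # hyperedge (construction) embeddings
ht = torch.zeros(B, E, N)                 # incidence: ht[b, e, n] = 1 if edge e covers node n
ht[:, 0, 0:3] = 1
ht[:, 1, 2:5] = 1
ht[:, 2, 1:4] = 1

out = layer(hidden, ht, edge_emb)         # node -> edge -> node attention
print(out.shape)                          # torch.Size([2, 5, 16])
```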
/HyCxG/Model/Layer/__init__.py:
--------------------------------------------------------------------------------
1 | from Model.Layer.Linear import Linear
2 | from Model.Layer.LayerNorm import LayerNorm
3 | from Model.Layer.SPHyperGraphLayer import HyperGraphAttentionLayer as HGAL
4 |
5 | __all__ = ['Linear', 'LayerNorm', 'HGAL']
--------------------------------------------------------------------------------
/HyCxG/Model/Layer/activate_fn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import math
3 |
4 | def gelu(x):
5 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
--------------------------------------------------------------------------------
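This is the exact (erf-based) GELU; a quick sanity check against PyTorch's built-in implementation:

```python
import torch
import torch.nn.functional as F
from Model.Layer.activate_fn import gelu

x = torch.randn(5)
# F.gelu defaults to the same erf-based formulation, so the two should agree
print(torch.allclose(gelu(x), F.gelu(x), atol=1e-6))   # True
```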
/HyCxG/Model/__init__.py:
--------------------------------------------------------------------------------
1 | from Model.HyperCxG import HyperCxG
2 |
3 | __all__ = ['HyperCxG']
--------------------------------------------------------------------------------
/HyCxG/Model/lm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from transformers import BertModel, RobertaModel
4 | from Model.Layer import *
5 | import argparse
6 |
7 | class LM(nn.Module):
8 | def __init__(self, args : argparse.Namespace, device : torch.device, use_encoder : bool = False, pooler_output : bool = True, all_output : bool = False):
9 | super(LM, self).__init__()
10 | self.device = device
11 | self.use_encoder = use_encoder
12 | self.pooler_output = pooler_output
13 | self.all_output = all_output
14 | self._lm = BertModel.from_pretrained(args.lm_path, cache_dir='.cache/') if args.lm_group == 'BERT' else RobertaModel.from_pretrained(args.lm_path, cache_dir='.cache/')
15 | if args.finetune is not True:
16 | for param in self._lm.base_model.parameters():
17 | param.requires_grad = False
18 | self._lm_output = args.output_hidden_states
19 | self._lm.config.output_hidden_states = self._lm_output
20 | self._fc = Linear(args.lm_hidden_size, args.num_classes)
21 |
22 | def forward(self, inputs : torch.Tensor, attention_mask : torch.Tensor = None) -> tuple:
23 | encode_output = self._lm(inputs, attention_mask=attention_mask)
24 | if self.pooler_output: encoded = encode_output.pooler_output
25 | else: encoded = encode_output.last_hidden_state
26 | if self.use_encoder is not True: output = self._fc(encoded)
27 | else:
28 | if self.all_output: output = (encode_output.pooler_output, encode_output.last_hidden_state)
29 | else: output = encoded
30 | return output
31 |
32 |
--------------------------------------------------------------------------------
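A hedged usage sketch of the `LM` wrapper; the argument values below are hypothetical stand-ins for what `config.py` / `run_hycxg.sh` would normally supply, and the checkpoint is fetched from the Hugging Face hub:

```python
import torch
from argparse import Namespace
from Model.lm import LM

args = Namespace(lm_path='bert-base-uncased', lm_group='BERT', finetune=True,
                 output_hidden_states=False, lm_hidden_size=768, num_classes=3)
lm = LM(args, torch.device('cpu'), use_encoder=True, pooler_output=False)

input_ids = torch.tensor([[101, 7592, 2088, 102]])   # toy token ids ([CLS] ... [SEP])
mask = torch.ones_like(input_ids)
hidden = lm(input_ids, attention_mask=mask)          # last hidden states
print(hidden.shape)                                  # torch.Size([1, 4, 768])
```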
/HyCxG/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/README_ZH.md)
17 |
18 | ## Run HyCxG
19 |
20 | ### Quick start
21 | Before running the HyCxG model, several preparation steps are required. The necessary steps are as follows:
22 | 1. (**Data preparation**) Please prepare the data first. We provide an automatic download and processing script for all datasets; please refer to the [`data`](https://github.com/xlxwalex/HyCxG/tree/main/data) folder for details. Under the default configuration, each processed dataset is saved as a separate folder under `data/dataset`. Please copy all of these data folders (e.g. `JSONABSA_MAMS`) into the `dataset` folder of this directory.
23 | 2. (**Preparation for CxG lists**) In the [`dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) folder under this directory, we provide an automatic download script for the CxG lists. Please refer to the README file in that folder for details. Under the default configuration, the required data files will be downloaded to the corresponding location automatically.
24 | 3. (**Preparation for CxG vocabulary**) In the [`Tokenizer`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) folder under this directory, we provide an automatic download script for the construction vocabulary data. Please refer to the README file in that folder for the command. Likewise, under the default configuration, the required data files will be downloaded to the corresponding location automatically.
25 | 4. (**Run HyCxG**) In [`run_hycxg.sh`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/run_hycxg.sh), we provide the command to run the HyCxG model; it can be adapted to different datasets by modifying the parameters.
26 |
27 | **Note:** The hyper-parameter settings for each task can be found in the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
--------------------------------------------------------------------------------
/HyCxG/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/README_ZH.md)
16 | ## 运行HyCxG
17 |
18 | ### 快速运行指引
19 | 在运行HyCxG模型前,我们需要先进行多种数据的准备,以下为所需的操作步骤:
20 | 1. (**数据集准备**) 请先准备数据,我们提供了数据的自动下载处理脚本,详情请见[`data`](https://github.com/xlxwalex/HyCxG/tree/main/data)文件夹。在默认配置下,处理完的数据会以文件夹的形式保存在`data/dataset`文件夹中,请复制所有的数据文件夹(例如:`JSONABSA_MAMS`等)到本目录下的`dataset`文件夹中
21 | 2. (**构式语法表数据准备**) 在本目录下的[`dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset)文件夹中,我们提供了构式语法列表数据的自动下载脚本,详情请见文件夹中的README文件。在默认配置下,所需数据文件会自动下载到对应位置
22 | 3. (**构式词表数据准备**) 在本目录下的[`Tokenizer`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer)文件夹内,我们提供了构式词表数据的自动下载脚本,执行命令请见文件夹中的README文件。同样在默认配置下,所需数据文件会自动下载到对应位置
23 | 4. (**运行HyCxG**) 在[`run_hycxg.sh`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/run_hycxg.sh)中我们给出了运行模型的命令,可以通过修改参数来适应不同的数据集
24 |
25 | **注意**:各个任务的超参数设置请见[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)。
--------------------------------------------------------------------------------
/HyCxG/Simuann/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/README_ZH.md)
17 |
18 | ## Cond-MC Solver
19 |
20 | The code in this package is utilized to solve the Cond-MC problem in the second section of our paper (Construction Extraction and Selection) via simulated annealing (SA). Please refer to the paper for problem definition and solution steps. Meanwhile, you can browse [`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials) for more detailed information.
21 |
22 | ### Quick Start
23 | We provide several examples in the [`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py) file; you can run the command below to inspect the results (these examples can also serve as a reference for hyper-parameter tuning):
24 |
25 | ```shell
26 | python CxGCoverage.py
27 | ```
28 | **Hyper-parameters:**
29 | + PATTERN_SCORE: This dict holds the scores assigned to construction slots at the different abstraction levels (lexical, POS and semantic-category slots), corresponding to the slot scores defined in the paper.
30 | + COVERAGE_SCORE: This dict stores the weights of the terms in the objective function, corresponding to the weighting coefficients in our paper.
31 |
32 | **Note:** These hyper-parameters are hard-coded, so please modify the two dicts at the top of [`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py) directly.
33 |
34 | ### Acknowledgement
35 | The code in [`SimuAnneal.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/SimuAnneal.py) is adapted from [`simanneal`](https://github.com/perrygeo/simanneal), which provides a convenient framework for solving such problems. We are extremely grateful for the efforts and contributions of that repository's author!
36 |
--------------------------------------------------------------------------------
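For orientation, a minimal sketch of the upstream `simanneal`-style interface this package builds on, using a toy problem with made-up scores; this is not the repository's actual `CxGCoverageProblem`:

```python
import random
from simanneal import Annealer

class ToyCoverage(Annealer):
    """Pick a subset of candidate spans; energy = negative toy coverage score."""
    def __init__(self, state, scores):
        self.scores = scores
        super(ToyCoverage, self).__init__(state)     # state: list of 0/1 picks

    def move(self):
        i = random.randrange(len(self.state))        # flip one selection
        self.state[i] = 1 - self.state[i]

    def energy(self):
        return -sum(s for s, picked in zip(self.scores, self.state) if picked)

solver = ToyCoverage(state=[0, 0, 0, 0], scores=[1.0, 0.5, 2.0, 0.2])
solver.steps = 1000
best_state, best_energy = solver.anneal()
print(best_state, best_energy)
```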
/HyCxG/Simuann/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/README_ZH.md)
16 | ## Cond-MC求解器
17 |
18 | 该部分代码用于对我们论文第二章节(Construction Extraction and Selection)的Cond-MC问题使用模拟退火(Simulated Annealing, SA)的方式进行求解。问题定义以及求解步骤请见论文。更多详细的细节请参考[`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials)。
19 |
20 | ### 快速上手
21 | 我们在[`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py)中提供了多个例子,您可以直接使用以下代码直接运行来浏览结果(同时,这些例子也可以作为超参调整的参考):
22 | ```shell
23 | python CxGCoverage.py
24 | ```
25 | **超参数列表:**
26 | + PATTERN_SCORE:该字典中表示的是构式中不同抽象等级(词汇、词性、语义类别)槽的分数,对应论文中定义的槽分数
27 | + COVERAGE_SCORE:该字典中存储的是目标函数中各项的权重,对应论文中的权重系数
28 |
29 | **注意:** 这些超参数被硬编码在了代码文件中,因此请直接修改[`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py)头部的两个字典。
30 |
31 | ### 致谢
32 | 本部分的[`SimuAnneal`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/SimuAnneal.py)修改自[`simanneal`](https://github.com/perrygeo/simanneal),其提供了一个很方便的框架来对问题进行求解,我们十分感谢该仓库开发者的贡献!
33 |
--------------------------------------------------------------------------------
/HyCxG/Simuann/__init__.py:
--------------------------------------------------------------------------------
1 | from Simuann.CxGCoverage import CxGCoverageProblem as CxGCoverage
2 |
3 | __all__ = ['CxGCoverage']
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/CxGCore.py:
--------------------------------------------------------------------------------
1 | from Tokenizer.CxGProcessor.Loader import Loader
2 | from Tokenizer.CxGProcessor.Encoder import Encoder
3 | from Tokenizer.CxGProcessor.Parser import Parser
4 |
5 | class CxGCore(object):
6 | def __init__(self, args, workers = None, lang='eng'):
7 | self.args = args
8 | self.Loader = Loader(args, lang=lang)
9 | self.Encoder = Encoder(lang=lang)
10 | self.Parser = Parser(self.Loader, self.Encoder, workers=workers)
11 |
12 | def parse_text(self, text):
13 | if isinstance(text, str):
14 | text = [text]
15 | tokens = self.Loader.tokenize(text)
16 | lines, mapper, tokenizer_tokens = self.Loader.load_text(text)
17 | results = self.Parser.parse_lines(lines)
18 | # return results
19 | results_ = {}
20 | for i, res in enumerate(results):
21 | temp = {}
22 | temp["text"] = text[i]
23 | temp["token"] = tokenizer_tokens[i]
24 | temp["cons_idx"] = [ele + 1 for ele in res[0]] # 0 -
25 | temp["cons_start"] = [mapper[ele][0] for ele in res[1]]
26 | temp["cons_end"] = [mapper[ele-1][-1] + 1 for ele in res[2]]
27 | results_[i] = temp
28 | return results_[0]
29 |
30 | def parse_file(self, file):
31 | lines = self.Loader.load_from_file(file)
32 | results = self.Parser.parse_lines(lines)
33 | return results
34 |
35 |
--------------------------------------------------------------------------------
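A hypothetical call showing the fields `parse_text` returns; the argument values are illustrative and assume the CxG dictionaries (see `Tokenizer/download_cxgdict.sh`) are already in place:

```python
from argparse import Namespace
from Tokenizer.CxGProcessor.CxGCore import CxGCore

# Hypothetical paths/values; the real ones come from config.py and the download scripts.
args = Namespace(cxg_vocab_path='dataset/cxgvocab', lm_path='bert-base-uncased',
                 lm_group='BERT', do_lower_case=True)
cxg = CxGCore(args, workers=None, lang='eng')

result = cxg.parse_text('the battery life is great but the screen is dim')
# result['token']      -> wordpiece tokens of the sentence
# result['cons_idx']   -> matched construction ids, shifted by +1 (0 is padding)
# result['cons_start'] -> first wordpiece index of each matched construction
# result['cons_end']   -> one past the last wordpiece index of each match
print(result['cons_idx'], result['cons_start'], result['cons_end'])
```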
/HyCxG/Tokenizer/CxGProcessor/Encoder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn.utils import murmurhash3_32
5 | from Tokenizer.CxGProcessor.rdrpos_tagger.pSCRDRtagger.RDRPOSTagger import RDRPOSTagger
6 | from Tokenizer.CxGProcessor.rdrpos_tagger.Utility.Utils import readDictionary
7 | from Tokenizer.CxGProcessor.Loader import Loader
8 |
9 | class Encoder(object):
10 | def __init__(self, args="", lang='eng'):
11 |
12 | MODEL_STRING = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/{}.RDR".format(lang)))
13 | DICT_STRING = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/{}.DICT".format(lang)))
14 | DICTIONARY_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/{}.clusters.fastText.v2.gz".format(lang)))
15 | pos_list = Loader.pos_list
16 | seed = Loader.seed
17 |
18 | self.args = args
19 | self.pos_dict = {murmurhash3_32(pos, seed=seed): pos for pos in pos_list}
20 | self.word_dict = pd.read_csv(DICTIONARY_FILE, index_col=0).to_dict()["Cluster"]
21 | self.domain_dict = {murmurhash3_32(str(key), seed=seed): self.word_dict[key] for key in self.word_dict.keys()}
22 | self.word_dict = {murmurhash3_32(str(key), seed=0): key for key in self.word_dict.keys()}
23 | self.build_decoder()
24 |
25 | self.DICT = readDictionary(DICT_STRING)
26 | self.r = RDRPOSTagger(word_dict=self.domain_dict, DICT=self.DICT)
27 | self.r.constructSCRDRtreeFromRDRfile(MODEL_STRING)
28 |
29 | def build_decoder(self):
30 | #LEX = 1, POS = 2, CAT = 3
31 | decoding_dict = {}
32 | decoding_dict[1] = self.word_dict
33 | decoding_dict[2] = self.pos_dict
34 | decoding_dict[3] = {key: "<" + str(key) + ">" for key in list(set(self.domain_dict.values()))}
35 | self.decoding_dict = decoding_dict
36 |
37 | def tagline(self, line):
38 | line = self.r.tagRawSentenceHash(line)
39 | return np.array(line)
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/Loader.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from Tokenizer.BaseTokenizer import BasicTokenizer, WordpieceTokenizer
3 | from transformers import AutoTokenizer
4 | from sklearn.utils import murmurhash3_32
5 | from collections import defaultdict
6 | import Tokenizer.CxGProcessor.utils as utils
7 | from Tokenizer.constants import *
8 |
9 | # Load Construction
10 | class Loader(object):
11 | pos_list = ["PROPN", "SYM", "VERB", "DET", "CCONJ", "AUX",
12 | "ADJ", "INTJ", "SCONJ", "PRON", "NUM", "PUNCT",
13 | "ADV", "ADP", "X", "NOUN", "PART"]
14 | seed = 0
15 |
16 | def __init__(self, args, lang='eng'):
17 | self.args = args
18 | self.cons_path = args.cxg_vocab_path + "/construction.pkl" if lang == 'eng' else args.cxg_vocab_path + "/{}.construction.pkl".format(lang)
19 | self.pos_list = Loader.pos_list
20 | self.seed = Loader.seed
21 | self.lmg = args.lm_group
22 | self.cons = self.load_cons()
23 | self.dict_cons = self.load_dict_cons()
24 | self.basic_tokenizer = BasicTokenizer(do_lower_case=self.args.do_lower_case)
25 | self.auto_tokenizer = AutoTokenizer.from_pretrained(args.lm_path)
26 |
27 | def load_dict_cons(self):
28 | encoded_cons = self.cons
29 | dict_cons = dict()
30 | X = list(set([encoded_cons[i][0][0] for i in range(len(encoded_cons))]))
31 | for x in X:
32 | dict_cons[x] = defaultdict(list)
33 | for i, encoded_con in enumerate(encoded_cons):
34 | if encoded_con[0][0] == x:
35 | dict_cons[x][encoded_con[0][1]].append((encoded_con, i))
36 | return X, dict_cons
37 |
38 | def load_cons(self):
39 | if self.cons_path.endswith(".pkl"):
40 | with open(self.cons_path, "rb") as f:
41 | res = pickle.load(f)
42 | else:
43 | cons = self.read_cons()
44 | res = self.encode_cons(cons)
45 | return res
46 |
47 | def load_text(self, text):
48 | tokens = self.tokenize(text)
49 | map_word2_token, tokenizer_tokens = self.map_cxgtoken2plmtoken(tokens)
50 | tokens = self.replace(tokens)
51 | lines = self.tokens2lines(tokens)
52 | return lines, map_word2_token, tokenizer_tokens
53 |
54 | def load_from_file(self, file):
55 | text = []
56 | with open(file, "r") as f:
57 | for line in f.readlines():
58 | if line.strip():
59 | text.append(line.strip())
60 | lines, _, tokenizer_tokens = self.load_text(text)
61 | return lines
62 |
63 | def tokenize(self, text):
64 | if isinstance(text, str):
65 | text = [text]
66 | tokens = []
67 | basic_tokenizer = BasicTokenizer(do_lower_case=self.args.do_lower_case)
68 | for ele in text:
69 | tokens.append(basic_tokenizer.tokenize(ele))
70 | return tokens
71 |
72 | def replace(self, tokens, no_number=True, no_phone=True, no_email=True, no_currency=True):
73 | if no_phone:
74 | tokens = [self.replace_with_phone(token) for token in tokens]
75 | if no_number:
76 | tokens = [self.replace_with_number(token) for token in tokens]
77 | if no_email:
78 | tokens = [self.replace_with_email(token) for token in tokens]
79 | if no_currency:
80 | tokens = [self.replace_with_currency_symbol(token) for token in tokens]
81 | return tokens
82 |
83 | def map_cxgtoken2plmtoken(self, tokens):
84 | accum_idx = 0
85 | mapper, true_tokens = [], []
86 | for token in tokens[0]:
87 | tok = []
88 | wp_tokens = self.auto_tokenizer.tokenize(token) if self.lmg == 'BERT' else self.auto_tokenizer.tokenize(' ' + token)
89 | true_tokens.extend(wp_tokens)
90 | tok.extend(wp_tokens)
91 | mapper.append([accum_idx, accum_idx + len(tok) -1])
92 | accum_idx += len(tok)
93 | return mapper, [true_tokens]
94 |
95 | @staticmethod
96 | def replace_with_number(token, alternative=""):
97 | return [utils.NUMBERS_REGEX.sub(alternative, x) for x in token]
98 |
99 | @staticmethod
100 | def replace_with_currency_symbol(token, alternative=""):
101 | return [utils.CURRENCY_REGEX.sub(alternative, x) for x in token]
102 |
103 | @staticmethod
104 | def replace_with_email(token, alternative=""):
105 | return [utils.EMAIL_REGEX.sub(alternative, x) for x in token]
106 |
107 | @staticmethod
108 | def replace_with_phone(token, alternative=""):
109 | return [utils.PHONE_REGEX.sub(alternative, x) for x in token]
110 |
111 | def tokens2lines(self, tokens):
112 | lines = [" ".join(token) for token in tokens]
113 | return lines
114 |
115 | def read_cons(self):
116 | cons = []
117 | with open(self.cons_path, "r") as f:
118 | for line in f.readlines():
119 | con = line.strip().split("--")
120 | cons.append(con)
121 | return cons
122 |
123 | def write_cons(self, encoded_cons):
124 | path = self.cons_path.replace(".txt", ".pkl")
125 | with open(path, "wb") as f:
126 | pickle.dump(encoded_cons, f)
127 |
128 | def encode_cons(self, cons):
129 | encoded_cons = []
130 | for con in cons:
131 | encoded_cons.append(self.encode_con(con))
132 | return encoded_cons
133 |
134 | def encode_con(self, con):
135 | encoded_con = []
136 | for x in con:
137 | if x.startswith("<"):
138 | encoded_con.append((3, int(x[1:-1])))
139 | elif x in self.pos_list:
140 | encoded_con.append((2, murmurhash3_32(x, seed=self.seed)))
141 | else:
142 | encoded_con.append((1, murmurhash3_32(x, seed=self.seed)))
143 | return tuple(encoded_con)
--------------------------------------------------------------------------------
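A standalone sketch of the word-to-wordpiece span mapping that `map_cxgtoken2plmtoken` performs (BERT-style tokenization, without the leading-space convention used for RoBERTa):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')
words = ['the', 'embeddings', 'work']

accum_idx, mapper, true_tokens = 0, [], []
for word in words:
    wp_tokens = tok.tokenize(word)                       # wordpieces for this word
    true_tokens.extend(wp_tokens)
    mapper.append([accum_idx, accum_idx + len(wp_tokens) - 1])
    accum_idx += len(wp_tokens)

print(true_tokens)   # e.g. ['the', 'em', '##bed', '##ding', '##s', 'work']
print(mapper)        # e.g. [[0, 0], [1, 4], [5, 5]]
```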
/HyCxG/Tokenizer/CxGProcessor/Parser.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 |
3 |
4 | class Parser(object):
5 | def __init__(self, Loader, Encoder, workers=10):
6 | self.Loader = Loader
7 | self.Encoder = Encoder
8 | self.workers = workers
9 |
10 | def parse_lines(self, lines):
11 | if self.workers is not None:
12 | chunk_size = 1000
13 | pool_instance = mp.Pool(processes=self.workers, maxtasksperchild=None)
14 | lines = pool_instance.map(self.Encoder.tagline, lines, chunksize=chunk_size)
15 | pool_instance.close()
16 | pool_instance.join()
17 | pool_instance = mp.Pool(processes=self.workers, maxtasksperchild=None)
18 | results = pool_instance.map(self.match_cons, lines, chunksize=chunk_size)
19 | pool_instance.close()
20 | pool_instance.join()
21 | else:
22 | lines = [self.Encoder.tagline(line) for line in lines]
23 | results = [self.match_cons(line) for line in lines]
24 | results = [self.del_duplicate(res) for res in results]
25 | return results
26 |
27 | def del_duplicate(self, result):
28 | if len(result[0]) <= 1:
29 | return result
30 | s = set()
31 | k = 0
32 | for i in range(len(result[0])):
33 | if (result[1][i], result[2][i]) not in s:
34 | result[0][k] = result[0][i]
35 | result[1][k] = result[1][i]
36 | result[2][k] = result[2][i]
37 | s.add((result[1][i], result[2][i]))
38 | k += 1
39 | for i in range(len(result)):
40 | del(result[i][k:])
41 | return result
42 |
43 | def match_cons(self, line):
44 | cons_idx, cons_start, cons_end = [], [], []
45 | for i, unit in enumerate(line):
46 | candidates = self.get_candidates(unit)
47 | for con, idx in candidates:
48 | match = True
49 | for j in range(1, len(con)):
50 | if i + j < len(line):
51 | if line[i + j][con[j][0] - 1] != con[j][1]:
52 | match = False
53 | break
54 | else:
55 | match = False
56 | break
57 | if match:
58 | cons_idx.append(idx)
59 | cons_start.append(i)
60 | cons_end.append(i + len(con))
61 | return cons_idx, cons_start, cons_end
62 |
63 | def get_candidates(self, unit):
64 | candidates = []
65 | for i in self.Loader.dict_cons[0]:
66 | candidate = self.Loader.dict_cons[1][i][unit[i-1]]
67 | candidates += candidate
68 | return candidates
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
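A toy, self-contained illustration of the slot matching that `match_cons` performs; the integer codes are made up, each token is encoded as a `[LEX, POS, CAT]` triple, and the construction's slot type (1/2/3) selects which field must match (the repository's code additionally skips slot 0, which is pre-matched via the candidate lookup):

```python
line = [[11, 2, 300],    # e.g. a determiner
        [12, 5, 301],    # e.g. a noun
        [13, 5, 301]]    # e.g. another noun
con = ((2, 2), (2, 5), (2, 5))   # a POS-POS-POS construction pattern

def matches_at(line, con, i):
    """Does construction `con` match starting at token i? (simplified check)"""
    if i + len(con) > len(line):
        return False
    return all(line[i + j][con[j][0] - 1] == con[j][1] for j in range(len(con)))

print([i for i in range(len(line)) if matches_at(line, con, i)])   # [0]
```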
/HyCxG/Tokenizer/CxGProcessor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/HyCxG/Tokenizer/CxGProcessor/__init__.py
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/!READ_ME_ANNOTATE.txt:
--------------------------------------------------------------------------------
1 | The main part-of-speech tagging tool used is RDRPOSTagger:
2 |
3 | Copyright (C) 2013-2015 by Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham
4 | RDRPOSTagger's website: http://rdrpostagger.sourceforge.net/
5 |
6 | This version of RDRPOSTagger has been updated to work in Python 3, along with other minor changes.
7 | Original models and new models (e.g., RDR and DICT files) can be found in the main data directory.
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/InitialTagger/InitialTagger.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | def initializeSentence(FREQDICT, sentence):
6 | words = sentence.strip().split()
7 | taggedSen = []
8 | for word in words:
9 | if word in ["“", "”", "\""]:
10 | #taggedSen.append("''/" + FREQDICT["''"])
11 | if "''" in FREQDICT:
12 | taggedSen.append("''/" + FREQDICT["''"])
13 | elif "." in FREQDICT:
14 | taggedSen.append("''/" + FREQDICT["."])
15 | elif "," in FREQDICT:
16 | taggedSen.append("''/" + FREQDICT[","])
17 | else:
18 | print("\n'' is not in the dictionary \nManually add '' with a possible POS tag into the .DICT file!")
19 | taggedSen.append("''/" + FREQDICT["''"])
20 | continue
21 |
22 | tag = ''
23 | decodedW = word
24 | lowerW = decodedW.lower()
25 | if word in FREQDICT:
26 | tag = FREQDICT[word]
27 | elif lowerW in FREQDICT:
28 | tag = FREQDICT[lowerW]
29 | else:
30 | if re.search(r"[0-9]+", word) != None:
31 | tag = FREQDICT["TAG4UNKN-NUM"]
32 | else:
33 | suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
34 | wLength = len(decodedW)
35 | if wLength >= 4:
36 | suffixL3 = ".*" + decodedW[-3:]
37 | suffixL2 = ".*" + decodedW[-2:]
38 | if wLength >= 5:
39 | suffixL4 = ".*" + decodedW[-4:]
40 | if wLength >= 6:
41 | suffixL5 = ".*" + decodedW[-5:]
42 |
43 | if suffixL5 in FREQDICT:
44 | tag = FREQDICT[suffixL5]
45 | elif suffixL4 in FREQDICT:
46 | tag = FREQDICT[suffixL4]
47 | elif suffixL3 in FREQDICT:
48 | tag = FREQDICT[suffixL3]
49 | elif suffixL2 in FREQDICT:
50 | tag = FREQDICT[suffixL2]
51 | elif decodedW[0].isupper():
52 | tag = FREQDICT["TAG4UNKN-CAPITAL"]
53 | else:
54 | tag = FREQDICT["TAG4UNKN-WORD"]
55 |
56 | taggedSen.append(word + "/" + tag)
57 |
58 | return " ".join(taggedSen)
59 |
60 | def initializeCorpus(FREQDICT, inputFile, outputFile):
61 | lines = open(inputFile, "r").readlines()
62 | fileOut = open(outputFile, "w")
63 | for line in lines:
64 | fileOut.write(initializeSentence(FREQDICT, line) + "\n")
65 | fileOut.close()
66 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/InitialTagger/InitialTagger4En.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | def initializeEnSentence(FREQDICT, sentence):
6 | words = sentence.strip().split()
7 | taggedSen = []
8 | for word in words:
9 | if word in ["“", "”", "\""]:
10 | taggedSen.append("''/" + FREQDICT["''"])
11 | continue
12 |
13 | tag = ''
14 | lowerW = word.lower()
15 | if word in FREQDICT:
16 | tag = FREQDICT[word]
17 | elif lowerW in FREQDICT:
18 | tag = FREQDICT[lowerW]
19 | else:
20 | if (re.search(r"([0-9]+-)|(-[0-9]+)", word) != None):
21 | tag = "JJ"
22 | elif (re.search(r"[0-9]+", word) != None):
23 | tag = "CD"
24 | elif (re.search(r'(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)', word) != None):
25 | tag = "NN"
26 | elif (re.search(r'.*s$', word) != None and word[0].islower()):
27 | tag = "NNS"
28 | elif (word[0].isupper()):
29 | tag = "NNP"
30 | elif(re.search(r'(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)', word) != None):
31 | tag = "JJ"
32 | elif (re.search(r'.*ing$', word) != None and word.find("-") < 0):
33 | tag = "VBG"
34 | elif (re.search(r'.*ed$', word) != None and word.find("-") < 0):
35 | tag = "VBN"
36 | elif (re.search(r'(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)', word) != None
37 | or word.find("-") > -1):
38 | tag = "JJ"
39 | elif(re.search(r'.*ly$', word) != None):
40 | tag = "RB"
41 | else:
42 | tag = "NN"
43 |
44 | taggedSen.append(word + "/" + tag)
45 |
46 | return " ".join(taggedSen)
47 |
48 | def initializeEnCorpus(FREQDICT, inputFile, outputFile):
49 | lines = open(inputFile, "r").readlines()
50 | fileOut = open(outputFile, "w")
51 | for line in lines:
52 | fileOut.write(initializeEnSentence(FREQDICT, line) + "\n")
53 | fileOut.close()
54 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/InitialTagger/InitialTagger4Vn.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | def isAbbre(word):
6 |
7 | #word = unicode(word, "utf-8")
8 | for i in range(len(word)):
9 | if isVnLowerChar(word[i]) or word[i] == "_":
10 | return False
11 | return True
12 |
13 | VNUPPERCHARS = [u'Ă', u'Â', u'Đ', u'Ê', u'Ô', u'Ơ', u'Ư']
14 | VNLOWERCHARS = [u'ă', u'â', u'đ', u'ê', u'ô', u'ơ', u'ư']
15 |
16 | def isVnLowerChar(char):
17 | if char.islower() or char in VNLOWERCHARS:
18 | return True;
19 | return False;
20 |
21 | def isVnUpperChar(char):
22 | if char.isupper() or char in VNUPPERCHARS:
23 | return True;
24 | return False;
25 |
26 | def isVnProperNoun(word):
27 | #word = unicode(word, "utf-8")
28 | if (isVnUpperChar(word[0])):
29 | if word.count("_") >= 4:
30 | return True
31 | index = word.find("_")
32 | while index > 0 and index < len(word) - 1:
33 | if isVnLowerChar(word[index + 1]):
34 | return False;
35 | index = word.find("_", index + 1)
36 | return True;
37 | else:
38 | return False;
39 |
40 | def initializeVnSentence(FREQDICT, sentence):
41 | words = sentence.strip().split()
42 | taggedSen = []
43 | for word in words:
44 | if word in ["“", "”", "\""]:
45 | taggedSen.append("''/" + FREQDICT["''"])
46 | continue
47 |
48 | tag = ''
49 | decodedW = word
50 | lowerW = decodedW.lower()
51 | if word in FREQDICT:
52 | tag = FREQDICT[word]
53 | elif lowerW in FREQDICT:
54 | tag = FREQDICT[lowerW]
55 | else:
56 | if (re.search(r"[0-9]+", word) != None):
57 | tag = FREQDICT["TAG4UNKN-NUM"]
58 | elif(len(word) == 1 and isVnUpperChar(word[0])):
59 | tag = "Y"
60 | elif (isAbbre(word)):
61 | tag = "Ny"
62 | elif (isVnProperNoun(word)):
63 | tag = "Np"
64 | else:
65 | suffixL2 = suffixL3 = suffixL4 = suffixL5 = None
66 | wLength = len(decodedW)
67 | if wLength >= 4:
68 | suffixL3 = ".*" + decodedW[-3:]
69 | suffixL2 = ".*" + decodedW[-2:]
70 | if wLength >= 5:
71 | suffixL4 = ".*" + decodedW[-4:]
72 | if wLength >= 6:
73 | suffixL5 = ".*" + decodedW[-5:]
74 |
75 | if suffixL5 in FREQDICT:
76 | tag = FREQDICT[suffixL5]
77 | elif suffixL4 in FREQDICT:
78 | tag = FREQDICT[suffixL4]
79 | elif suffixL3 in FREQDICT:
80 | tag = FREQDICT[suffixL3]
81 | elif suffixL2 in FREQDICT:
82 | tag = FREQDICT[suffixL2]
83 | else:
84 | tag = FREQDICT["TAG4UNKN-WORD"]
85 |
86 | taggedSen.append(word + "/" + tag)
87 |
88 | return " ".join(taggedSen)
89 |
90 | def initializeVnCorpus(FREQDICT, inputFile, outputFile):
91 | lines = open(inputFile, "r").readlines()
92 | fileOut = open(outputFile, "w")
93 | for line in lines:
94 | fileOut.write(initializeVnSentence(FREQDICT, line) + "\n")
95 | fileOut.close()
96 |
97 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/SCRDRlearner/Node.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | class Node:
4 | """
5 | A class to represent the nodes in SCRDR tree
6 | """
7 |
8 | def __init__(self, condition, conclusion, father = None, exceptChild = None, elseChild = None, cornerstoneCases = [], depth = 0):
9 | self.condition = condition
10 | self.conclusion = conclusion
11 | self.exceptChild = exceptChild
12 | self.elseChild = elseChild
13 | self.cornerstoneCases = cornerstoneCases
14 | self.father = father
15 | self.depth = depth
16 |
17 | def satisfied(self, object):
18 | return eval(self.condition)
19 |
20 | def executeConclusion(self, object):
21 | exec(self.conclusion)
22 |
23 | def appendCornerstoneCase(self, object):
24 | self.cornerstoneCases.append(object)
25 |
26 | def check(self, object):
27 | if self.satisfied(object):
28 | self.executeConclusion(object)
29 | if self.exceptChild != None:
30 | self.exceptChild.check(object)
31 | else:
32 | if self.elseChild != None:
33 | self.elseChild.check(object)
34 |
35 | def checkDepth(self, object, length):
36 | if self.depth <= length:
37 | if self.satisfied(object):
38 | self.executeConclusion(object)
39 | if self.exceptChild != None:
40 | self.exceptChild.checkDepth(object, length)
41 | else:
42 | if self.elseChild != None:
43 | self.elseChild.checkDepth(object, length)
44 |
45 | def findRealFather(self):
46 | node = self
47 | fatherNode = node.father
48 | while True and fatherNode != None:
49 | if fatherNode.exceptChild == node:
50 | break
51 | node = fatherNode
52 | fatherNode = node.father
53 | return fatherNode
54 |
55 | def addElseChild(self, node):
56 | fatherNode = self.findRealFather()
57 | for object in fatherNode.cornerstoneCases:
58 | if node.satisfied(object):
59 | print("The new rule fires the cornerstone cases of its father node!!!")
60 | self.findRealFather().cornerstoneCases.remove(object)
61 | self.elseChild = node
62 | return True
63 |
64 | def addExceptChild(self, node):
65 | for object in self.cornerstoneCases:
66 | if node.satisfied(object):
67 | print("The new rule fires the cornerstone cases of its father node!!!")
68 | self.cornerstoneCases.remove(object)
69 | self.exceptChild = node
70 | return True
71 |
72 | def writeToFileWithSeenCases(self, out, depth):
73 | space = tabStr(depth)
74 | out.write(space + self.condition + " : " + self.conclusion + "\n")
75 | for case in self.cornerstoneCases:
76 | out.write(" " + space + "cc: " + case.toStr() + "\n")
77 | if self.exceptChild != None:
78 | self.exceptChild.writeToFile(out, depth + 1)
79 | if self.elseChild != None:
80 | self.elseChild.writeToFile(out, depth)
81 |
82 | def writeToFile(self, out, depth):
83 | space = tabStr(depth)
84 | out.write(space + self.condition + " : " + self.conclusion + "\n")
85 | if self.exceptChild != None:
86 | self.exceptChild.writeToFile(out, depth + 1)
87 | if self.elseChild != None:
88 | self.elseChild.writeToFile(out, depth)
89 |
90 | def tabStr(length):
91 | return "".join(["\t"] * length)
92 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/Utility/Config.py:
--------------------------------------------------------------------------------
1 | #Change the value of NUMBER_OF_PROCESSES to obtain faster tagging process!
2 | NUMBER_OF_PROCESSES = 2
3 |
4 | THRESHOLD = (3, 2)
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/Utility/Eval.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import sys
5 | os.chdir("../")
6 | sys.setrecursionlimit(100000)
7 | sys.path.append(os.path.abspath(""))
8 | os.chdir("./Utility")
9 |
10 | from Utility.Utils import getWordTag, readDictionary
11 |
12 | def computeAccuracy(goldStandardCorpus, taggedCorpus):
13 | tagged = open(taggedCorpus, "r").read().split()
14 | goldStandard = open(goldStandardCorpus, "r").read().split()
15 | if len(tagged) != len(goldStandard):
16 | print("The numbers of word tokens in %s and %s are not equal!" % (goldStandardCorpus, taggedCorpus))
17 | return 0
18 | numwords = 0
19 | count = 0
20 | for i in range(len(tagged)):
21 | numwords += 1
22 | word1, tag1 = getWordTag(tagged[i])
23 | word2, tag2 = getWordTag(goldStandard[i])
24 | if word1 != word2 and word1 != "''" and word2 != "''":
25 | print("Words are not the same in gold standard and tagged corpora, at the index", i)
26 | return 0
27 |
28 | if tag1.lower() == tag2.lower():
29 | count += 1
30 |
31 | return count * 100.0 / numwords
32 |
33 | def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus):
34 | """
35 | Return known-word accuracy, unknown-word accuracy and the overall accuracy
36 | """
37 | tagged = open(taggedCorpus, "r").read().split()
38 | goldStandard = open(goldStandardCorpus, "r").read().split()
39 | if len(tagged) != len(goldStandard):
40 | print("The numbers of word tokens in %s and %s are not equal!" % (goldStandardCorpus, taggedCorpus))
41 | return 0
42 |
43 | fullDICT = readDictionary(fullDictFile)
44 |
45 | numwords = count = 0
46 | countKN = countUNKN = 0
47 | countCorrectKN = countCorrectUNKN = 0
48 |
49 | for i in range(len(tagged)):
50 | numwords += 1
51 | word1, tag1 = getWordTag(tagged[i])
52 | word2, tag2 = getWordTag(goldStandard[i])
53 | if word1 != word2 and word1 != "''" and word2 != "''":
54 | print("Words are not the same in gold standard and tagged corpora, at the index " + str(i))
55 | return 0
56 |
57 | if tag1.lower() == tag2.lower():
58 | count += 1
59 |
60 | if word1 in fullDICT:
61 | countKN += 1
62 | if tag1.lower() == tag2.lower():
63 | countCorrectKN += 1
64 | else:
65 | countUNKN += 1
66 | if tag1.lower() == tag2.lower():
67 | countCorrectUNKN += 1
68 |
69 | if countUNKN == 0:
70 | return countCorrectKN * 100.0 / countKN, 0.0, count * 100.0 / numwords
71 | else:
72 | return countCorrectKN * 100.0 / countKN, countCorrectUNKN * 100.0 / countUNKN, count * 100.0 / numwords
73 |
74 | if __name__ == "__main__":
75 | print(str(computeAccuracy(sys.argv[1], sys.argv[2])) + "%")
76 | pass
77 |
78 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/Utility/Utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import codecs
3 |
4 | def getWordTag(wordTag):
5 | if wordTag == "///":
6 | return "/", "/"
7 | index = wordTag.rfind("/")
8 | if index == -1:
9 | return wordTag, None
10 | word = wordTag[:index].strip()
11 | tag = wordTag[index + 1:].strip()
12 | return word, tag
13 |
14 | def getRawText(inputFile, outFile):
15 | out = open(outFile, "w")
16 | sents = open(inputFile, "r").readlines()
17 | for sent in sents:
18 | wordTags = sent.strip().split()
19 | for wordTag in wordTags:
20 | word, tag = getWordTag(wordTag)
21 | out.write(word + " ")
22 | out.write("\n")
23 | out.close()
24 |
25 | def readDictionary(inputFile):
26 | dictionary = {}
27 | lines = codecs.open(inputFile, "r", encoding = "utf-8", errors = "replace").readlines()
28 | for line in lines:
29 | wordtag = line.strip().split()
30 | dictionary[wordtag[0]] = wordtag[1]
31 | return dictionary
32 |
33 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/pSCRDRtagger/ExtRDRPOSTagger.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import sys
5 | os.chdir("../")
6 | sys.setrecursionlimit(100000)
7 | sys.path.append(os.path.abspath(""))
8 | os.chdir("./pSCRDRtagger")
9 |
10 | from multiprocessing import Pool
11 | from SCRDRlearner.Object import FWObject, getWordTag
12 | from SCRDRlearner.SCRDRTree import SCRDRTree
13 | from SCRDRlearner.SCRDRTreeLearner import SCRDRTreeLearner
14 | from Utility.Config import NUMBER_OF_PROCESSES, THRESHOLD
15 |
16 | def unwrap_self_ExtRDRPOSTagger(arg, **kwarg):
17 | return ExtRDRPOSTagger.tagInitializedSentence(*arg, **kwarg)
18 |
19 | class ExtRDRPOSTagger(SCRDRTree):
20 | def __init__(self):
21 | self.root = None
22 |
23 | def tagInitializedSentence(self, initSen):
24 | wordTags = initSen.replace("“", "''").replace("”", "''").replace("\"", "''").split()
25 | sen = []
26 | for i in range(len(wordTags)):
27 | fwObject = FWObject.getFWObject(wordTags, i)
28 | word, tag = getWordTag(wordTags[i])
29 | node = self.findFiredNode(fwObject)
30 | if node.depth > 0:
31 | sen.append(word + "/" + node.conclusion)
32 | else:# Fired at root, return initialized tag
33 | sen.append(word + "/" + tag)
34 | return " ".join(sen)
35 |
36 | def tagInitializedCorpus(self, inputFile):
37 | lines = open(inputFile, "r").readlines()
38 | #Change the value of NUMBER_OF_PROCESSES to obtain faster tagging process!
39 | pool = Pool(processes = NUMBER_OF_PROCESSES)
40 | taggedLines = pool.map(unwrap_self_ExtRDRPOSTagger, zip([self] * len(lines), lines))
41 | out = open(inputFile + ".TAGGED", "w")
42 | for line in taggedLines:
43 | out.write(line + "\n")
44 | out.close()
45 | print("\nOutput file: " + inputFile + ".TAGGED")
46 |
47 | def printHelp():
48 | print("\n===== Usage =====")
49 | print('\n#1: To train RDRPOSTagger in case of using output from an external initial POS tagger:')
50 | print('\npython ExtRDRPOSTagger.py train PATH-TO-GOLD-STANDARD-TRAINING-CORPUS PATH-TO-TRAINING-CORPUS-INITIALIZED-BY-EXTERNAL-TAGGER')
51 | print('\nExample: python ExtRDRPOSTagger.py train ../data/goldTrain ../data/initTrain')
52 | print('\n#2: To use the trained model for POS tagging on a test corpus where words already are initially tagged by the external initial tagger:')
53 | print('\npython ExtRDRPOSTagger.py tag PATH-TO-TRAINED-MODEL PATH-TO-TEST-CORPUS-INITIALIZED-BY-EXTERNAL-TAGGER')
54 | print('\nExample: python ExtRDRPOSTagger.py tag ../data/initTrain.RDR ../data/initTest')
55 | print('\n#3: Find the full usage at http://rdrpostagger.sourceforge.net !')
56 |
57 | def run(args = sys.argv[1:]):
58 | if (len(args) == 0):
59 | printHelp()
60 | elif args[0].lower() == "train":
61 | try:
62 | print("\n===== Start =====")
63 | print('\nLearn a tree model of rules for POS tagging from %s and %s ' % (args[1], args[2]))
64 | rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1])
65 | rdrTree.learnRDRTree(args[2], args[1])
66 | print("\nWrite the learned tree model to file " + args[2] + ".RDR")
67 | rdrTree.writeToFile(args[2] + ".RDR")
68 | print('\nDone!')
69 | except Exception as e:
70 | print("\nERROR ==> ", e)
71 | printHelp()
72 | elif args[0].lower() == "tag":
73 | try:
74 | r = ExtRDRPOSTagger()
75 | print("\n=> Read a POS tagging model from " + args[1])
76 | r.constructSCRDRtreeFromRDRfile(args[1])
77 | print("\n=> Perform POS tagging on " + args[2])
78 | r.tagInitializedCorpus(args[2])
79 | except Exception as e:
80 | print("\nERROR ==> ", e)
81 | printHelp()
82 | else:
83 | printHelp()
84 | if __name__ == "__main__":
85 | run()
86 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/CxGProcessor/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | EMAIL_REGEX = re.compile(
3 | r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-](@|[(<{\[]at[)>}\]])(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))",
4 | flags=re.IGNORECASE | re.UNICODE,
5 | )
6 | NUMBERS_REGEX = re.compile(
7 | r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
8 | )
9 | CURRENCIES = {
10 | "$": "USD",
11 | "zł": "PLN",
12 | "£": "GBP",
13 | "¥": "JPY",
14 | "฿": "THB",
15 | "₡": "CRC",
16 | "₦": "NGN",
17 | "₩": "KRW",
18 | "₪": "ILS",
19 | "₫": "VND",
20 | "€": "EUR",
21 | "₱": "PHP",
22 | "₲": "PYG",
23 | "₴": "UAH",
24 | "₹": "INR",
25 | }
26 | CURRENCY_REGEX = re.compile(
27 | "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
28 | )
29 | PHONE_REGEX = re.compile(
30 | r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
31 | )
32 |
33 | # text = "10086 yes"
34 | # res = PHONE_REGEX.sub("", text)
35 | # res1 = NUMBERS_REGEX.sub("", text)
--------------------------------------------------------------------------------
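A quick illustrative use of the replacement regexes defined above (assumes the repository root is on `PYTHONPATH`; the comments show the replacements the patterns are intended to produce):

```python
from Tokenizer.CxGProcessor import utils

print(utils.NUMBERS_REGEX.sub('<num>', 'costs 1,200.50 today'))    # costs <num> today
print(utils.CURRENCY_REGEX.sub('<cur>', 'price: $15'))             # price: <cur>15
print(utils.PHONE_REGEX.sub('<phone>', 'call 010-555-0199 now'))   # call <phone> now
```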
/HyCxG/Tokenizer/ModelTokenizer.py:
--------------------------------------------------------------------------------
1 | from Tokenizer.BaseTokenizer import BasicTokenizer, WordpieceTokenizer, Tokenizer
2 | from Tokenizer.constants import *
3 | from Tokenizer.CxGProcessor.CxGCore import CxGCore
4 | from Tokenizer.Vocab import CxGVocab
5 | from transformers import AutoTokenizer
6 |
7 | # Not available in this repo
8 | class BertTokenizer(Tokenizer):
9 | def __init__(self, args):
10 | super().__init__(args, token_mode=CONST_TOKEN_MODE_WORD)
11 | self.basic_tokenizer = BasicTokenizer(do_lower_case=args.do_lower_case)
12 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=UNK_TOKEN)
13 |
14 | def tokenize(self, text) -> list:
15 | tokens, split_tokens = [], []
16 | if isinstance(text, str):
17 | text = [text]
18 | for ele in text:
19 | split_tokens = []
20 | for token in self.basic_tokenizer.tokenize(ele):
21 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
22 | split_tokens.append(sub_token)
23 | tokens.append(split_tokens)
24 | if len(text) == 1:
25 | return split_tokens
26 | return tokens
27 |
28 |
29 | class CxGTokenizer(object):
30 | def __init__(self, args, visible=True, workers=None, lang='eng'):
31 | self.cxg = CxGCore(args, workers=workers, lang=lang)
32 | self.bert = AutoTokenizer.from_pretrained(args.lm_path)
33 | self.cons_vocab = CxGVocab(args.cxg_vocab_path, lang=lang)
34 | self.visible = visible
35 |
36 | def tokenize(self, text, raw=True) -> dict:
37 | results = self.cxg.parse_text(text)
38 | # return results
39 | if raw:
40 | cons_pattern = [self.cons_vocab.cxg_i2c[ele] for ele in results['cons_idx']]
41 | results['cons_pattern'] = cons_pattern
42 | return results
43 | # else branch is not available in the GitHub repo
--------------------------------------------------------------------------------
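A hedged usage sketch of `CxGTokenizer`; the argument values are hypothetical and assume the CxG dictionaries and vocabulary have already been downloaded as described in `Tokenizer/README.md`:

```python
from argparse import Namespace
from Tokenizer.ModelTokenizer import CxGTokenizer

args = Namespace(cxg_vocab_path='dataset/cxgvocab', lm_path='bert-base-uncased',
                 lm_group='BERT', do_lower_case=True)
cxg_tok = CxGTokenizer(args, workers=None, lang='eng')

result = cxg_tok.tokenize('the food was tasty but the service was slow', raw=True)
# In addition to CxGCore.parse_text's fields (token, cons_idx, cons_start, cons_end),
# raw=True adds 'cons_pattern': the human-readable pattern of each matched construction.
print(result['cons_pattern'])
```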
/HyCxG/Tokenizer/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer/README_ZH.md)
17 |
18 | ## Construction dictionaries for CxGTokenizer
19 |
20 | In this repository, we provide mirror data and a download script for the construction dictionaries of six languages (English, French, German, Spanish, Dutch, and Turkish) used by CxGTokenizer, based on the `c2xg` package. For specific usage guidance on CxGTokenizer, please refer to [`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials).
21 |
22 | ### Download data
23 | You can use the following command to download the data (note: all parameters have default values, so the script can be run without any arguments):
24 | ```shell
25 | bash download_cxgdict.sh [--LANGUAGES]
26 | ```
27 | **Parameters:**
28 | + LANGUAGES: The abbreviations of the required languages. To download all languages, pass `all` as the `LANGUAGES` parameter. To download only some of them, list the corresponding language codes from the abbreviations below:
29 | 1. English: `eng`
30 | 2. French: `fra`
31 | 3. German: `deu`
32 | 4. Spanish: `spa`
33 | 5. Dutch: `nld`
34 | 6. Turkish: `tur`
35 |
36 | **Note:** The CxG dictionaries for the selected languages will be downloaded to the `CxGProcessor/data` folder.
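
For example, a minimal invocation that fetches only the English and German dictionaries (the script reads the language codes as positional arguments) is:
```shell
bash download_cxgdict.sh eng deu
```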
37 |
38 | ### Resource of data
39 | Our mirror data is obtained from the `c2xg` package; the original files are available at [c2xg - dictionaries](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/dictionaries). If you also use these construction grammar dictionaries, you can cite the corresponding paper:
40 | ```
41 | @article{dunn2017computational,
42 | title={Computational learning of construction grammars},
43 | author={Dunn, Jonathan},
44 | journal={Language and cognition},
45 | volume={9},
46 | number={2},
47 | pages={254--292},
48 | year={2017},
49 | publisher={Cambridge University Press}
50 | }
51 | ```
52 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer/README_ZH.md)
16 | ## CxGTokenizer所需的构式语法词表数据
17 |
18 | 在本仓库中,我们基于c2xg包提供了六种语言(英语、法语、德语、西班牙语、荷兰语以及土耳其语)的构式语法词表的镜像数据以及下载脚本。关于具体CxGTokenizer的使用指导,请参考[`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials)。
19 |
20 | ### 数据下载
21 | 您可以用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值):
22 | ```shell
23 | bash download_cxgdict.sh [--LANGUAGES]
24 | ```
25 | **参数含义:**
26 | + LANGUAGES: 所需语言的简称,如果需要下载全部语言,使用`all`即可。若只希望下载部分语言,请按照以下对应关系在参数中加入所需语言简称:
27 | 1. 英语:`eng`
28 | 2. 法语:`fra`
29 | 3. 德语:`deu`
30 | 4. 西班牙语:`spa`
31 | 5. 荷兰语:`nld`
32 | 6. 土耳其语:`tur`
33 |
34 | **注意:** 不同语言对应的构式语法词表会被下载到本目录下的`CxGProcessor/data`文件夹中
35 |
36 | ### 数据来源
37 | 本部分数据来自c2xg,我们的镜像数据来源[c2xg - dictionaries](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/dictionaries),如果您也使用了该语法表,您可以引用他们的论文:
38 | ```
39 | @article{dunn2017computational,
40 | title={Computational learning of construction grammars},
41 | author={Dunn, Jonathan},
42 | journal={Language and cognition},
43 | volume={9},
44 | number={2},
45 | pages={254--292},
46 | year={2017},
47 | publisher={Cambridge University Press}
48 | }
49 | ```
50 |
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/Vocab.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | class Vocab(object):
4 | def __init__(self):
5 | # Reserved Vocabulary
6 | self.reserved_vocab_path = os.path.abspath(
7 | os.path.join(os.path.dirname(__file__), "../../models/reserved_vocab.txt"))
8 |
9 | def load(self, vocab_path):
10 | raise NotImplementedError
11 |
12 | def save(self, save_path):
13 | raise NotImplementedError
14 |
15 | class BERTVocab(Vocab):
16 | """
17 | Vocabulary of BERT Tokenizer
18 | """
19 |
20 | def __init__(self, vocab_path='dataset/Vocab/BERT/'):
21 | super().__init__()
22 | self.word_w2i = {}
23 | self.word_i2w = []
24 | self.word_w2c = {}
25 | self.load(vocab_path)
26 |
27 | def load(self, vocab_path):
28 | with open(os.path.join(vocab_path, 'vocab.txt'), mode="r", encoding="utf-8") as reader:
29 | for index, line in enumerate(reader):
30 | w = line.strip("\n").split()[0] if line.strip() else line.strip("\n")
31 | self.word_w2i[w] = index
32 | self.word_i2w.append(w)
33 |
34 | def save(self, save_path):
35 | print("Word Vocabulary size: ", len(self))
36 | with open(save_path, mode="w", encoding="utf-8") as f:
37 | for w in self.word_i2w:
38 | f.write(w + "\n")
39 | print("Word Vocabulary saving done.")
40 |
41 | def get(self, w):
42 | return self.word_w2i[w]
43 |
44 | def __len__(self):
45 | return len(self.word_i2w)
46 |
47 |
48 | class CxGVocab(Vocab):
49 | """
50 | Vocabulary of CxGBERT Tokenizer
51 | """
52 | def __init__(self, vocab_path = "../dataset/CxGBERT/", lang='eng'):
53 | super().__init__()
54 |         # External Construction Vocabulary
55 | self.cxg_c2i = {}
56 | self.cxg_i2c = []
57 | self.cxg_c2c = {}
58 | self.lang = lang
59 | self.load(vocab_path)
60 |
61 | def __len__(self):
62 | return len(self.cxg_i2c)
63 |
64 | def load(self, vocab_path):
65 | with open(os.path.join(vocab_path, 'construction.txt' if self.lang == 'eng' else '{}.construction.txt'.format(self.lang)), mode="r", encoding="utf-8") as reader:
66 | self.cxg_c2i[''] = 0
67 | self.cxg_i2c.append('')
68 | for index, line in enumerate(reader):
69 | w = line.strip("\n").replace(' ', '').split()[0] if line.strip() else line.strip("\n")
70 | self.cxg_c2i[w] = index + 1
71 | self.cxg_i2c.append(w)
72 |
73 | def get(self, w):
74 | return self.cxg_c2i[w]
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | from .ModelTokenizer import CxGTokenizer, BertTokenizer
2 |
3 | __all__ = ['CxGTokenizer', 'BertTokenizer']
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/constants.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/special_tokens_map.json")), mode="r", encoding="utf-8") as f:
5 | special_tokens_map = json.load(f)
6 |
7 | UNK_TOKEN = special_tokens_map["unk_token"]
8 | CLS_TOKEN = special_tokens_map["cls_token"]
9 | SEP_TOKEN = special_tokens_map["sep_token"]
10 | PAD_TOKEN = special_tokens_map["pad_token"]
11 |
12 | CONST_TOKEN_MODE_WORD = 'WORD'
13 | CONST_TOKEN_MODE_CXG = 'CXG'
--------------------------------------------------------------------------------
/HyCxG/Tokenizer/download_cxgdict.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | OUT_PATH=CxGProcessor/data
4 | # (English, Turkish, Dutch, Spanish, French, German)
5 | LANGUAGE_1=$1
6 | LANGUAGE_2=$2
7 | LANGUAGE_3=$3
8 | LANGUAGE_4=$4
9 | LANGUAGE_5=$5
10 | LANGUAGE_6=$6
11 |
12 | if [ -d "$OUT_PATH" ]; then
13 | echo "$OUT_PATH folder exists, pass"
14 | else
15 | echo "$OUT_PATH folder does not exist, please check"
16 | set -e
17 | fi
18 |
19 | if test -z "$LANGUAGE_1"
20 | then
21 | LANGUAGE_1=all
22 | fi
23 |
24 | echo "Downloading construction grammar dictionaries"
25 | echo "Original data for construction grammar dictionaries of languages can be found in (c2xg package) https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/dictionaries"
26 |
27 | for LANG in $LANGUAGE_1 $LANGUAGE_2 $LANGUAGE_3 $LANGUAGE_4 $LANGUAGE_5 $LANGUAGE_6
28 | do
29 | # English
30 | if [[ "$LANG" == "eng" ]] || [[ "$LANG" == "all" ]];then
31 | wget -O $OUT_PATH/eng.DICT https://expic.xlxw.org/hycxg/cxgdict/eng.DICT
32 | wget -O $OUT_PATH/eng.RDR https://expic.xlxw.org/hycxg/cxgdict/eng.RDR
33 | wget -O $OUT_PATH/eng.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/eng.clusters.fastText.v2.gz
34 | fi
35 | # Turkish
36 | if [[ "$LANG" == "tur" ]] || [[ "$LANG" == "all" ]];then
37 | wget -O $OUT_PATH/tur.DICT https://expic.xlxw.org/hycxg/cxgdict/tur.DICT
38 | wget -O $OUT_PATH/tur.RDR https://expic.xlxw.org/hycxg/cxgdict/tur.RDR
39 | wget -O $OUT_PATH/tur.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/tur.clusters.fastText.v2.gz
40 | fi
41 | # French
42 | if [[ "$LANG" == "fra" ]] || [[ "$LANG" == "all" ]];then
43 | wget -O $OUT_PATH/fra.DICT https://expic.xlxw.org/hycxg/cxgdict/fra.DICT
44 | wget -O $OUT_PATH/fra.RDR https://expic.xlxw.org/hycxg/cxgdict/fra.RDR
45 | wget -O $OUT_PATH/fra.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/fra.clusters.fastText.v2.gz
46 | fi
47 | # Spanish
48 | if [[ "$LANG" == "spa" ]] || [[ "$LANG" == "all" ]];then
49 | wget -O $OUT_PATH/spa.DICT https://expic.xlxw.org/hycxg/cxgdict/spa.DICT
50 | wget -O $OUT_PATH/spa.RDR https://expic.xlxw.org/hycxg/cxgdict/spa.RDR
51 | wget -O $OUT_PATH/spa.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/spa.clusters.fastText.v2.gz
52 | fi
53 | # German
54 | if [[ "$LANG" == "deu" ]] || [[ "$LANG" == "all" ]];then
55 | wget -O $OUT_PATH/deu.DICT https://expic.xlxw.org/hycxg/cxgdict/deu.DICT
56 | wget -O $OUT_PATH/deu.RDR https://expic.xlxw.org/hycxg/cxgdict/deu.RDR
57 | wget -O $OUT_PATH/deu.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/deu.clusters.fastText.v2.gz
58 | fi
59 | # Dutch
60 | if [[ "$LANG" == "nld" ]] || [[ "$LANG" == "all" ]];then
61 | wget -O $OUT_PATH/nld.DICT https://expic.xlxw.org/hycxg/cxgdict/nld.DICT
62 | wget -O $OUT_PATH/nld.RDR https://expic.xlxw.org/hycxg/cxgdict/nld.RDR
63 | wget -O $OUT_PATH/nld.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/nld.clusters.fastText.v2.gz
64 | fi
65 | done
66 |
67 | echo "The CxG vocab of languages are stored in $OUT_PATH"
68 |
--------------------------------------------------------------------------------
/HyCxG/Trainer/Trainer.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import DataLoader
2 |
3 | class Trainer(object):
4 | def __init__(self, args, model, criterion, optimizer, device, checkp, scheduler = None):
5 | super(Trainer, self).__init__()
6 | self.args = args
7 | # Training Component
8 | self.model = model
9 | self.criterion = criterion
10 | self.optimizer = optimizer
11 | self.scheduler = scheduler
12 | self.device = device
13 | # Training Params
14 | self.checkpoint = checkp
15 | self.epoch = 0
16 | self.step = 0
17 | self.best = - float('inf')
18 | self.eval_inform = []
19 |
20 | # Train Model
21 | def train(self, Trainset : DataLoader, Validset : DataLoader):
22 | raise NotImplementedError
23 |
24 | # Valid Model
25 | def valid(self, Validset : DataLoader):
26 | raise NotImplementedError
27 |
28 | # Test Model
29 | def test(self, Testset: DataLoader):
30 | raise NotImplementedError
31 |
32 | # Generate Checkpoints
33 | def _generate_checkp(self) -> dict:
34 | checkpoints = {
35 | 'model': self.model.state_dict(),
36 | 'optim': self.optimizer,
37 | }
38 | return checkpoints
--------------------------------------------------------------------------------
/HyCxG/Trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from Trainer.HyCxGTrainerABSA import HyCxGTrainerABSA
2 | from Trainer.HyCxGTrainerGLUE import HyCxGTrainerGLUE
3 |
4 | __all__ = ['HyCxGTrainerABSA', 'HyCxGTrainerGLUE']
--------------------------------------------------------------------------------
/HyCxG/config.py:
--------------------------------------------------------------------------------
1 | from utils.argument import ArgumentGroup
2 | import argparse
3 |
4 | def parse_args():
5 | parser = argparse.ArgumentParser(description='HyCxG Model Parameters Setting')
6 | # Base Params
7 | base_args = ArgumentGroup(parser, 'base', 'Base Settings')
8 | base_args.add_arg('mode', str, 'train', 'Experiment Mode')
9 | base_args.add_arg('cuda', bool, True, 'CUDA device')
10 | base_args.add_arg('gpu_id', int, 0, 'GPU ID, 0 for cuda:0')
11 | base_args.add_arg('seed', int, 0, 'Global Random Seed')
12 | base_args.add_arg('checkpoints', str, 'checkpoints/', 'Checkpoint Path')
13 | base_args.add_arg('checkp', str, 'hyper_cxg_mams/', 'Checkpoint Dir')
14 |
15 | # Dataset Params
16 | data_args = ArgumentGroup(parser, 'dataset', 'Dataset Settings')
17 | data_args.add_arg('data_name', str, 'JSONABSA_MAMS', 'Name of Dataset')
18 | data_args.add_arg('num_workers', int, 64, 'Number of workers to solve coverage problem')
19 | data_args.add_arg('t_minutes', float, 0.05, 'Cost of time to solve the coverage problem per instance (minutes)')
20 | data_args.add_arg('test_outpath', str, 'result_test.csv', 'Output test results for analysis')
21 |
22 | # Model Params
23 | model_args = ArgumentGroup(parser, 'model', 'Model Settings')
24 | model_args.add_arg('num_classes', int, 3, 'Number of classes for each task')
25 | model_args.add_arg('padding_size', int, 150, 'Padding size Of PLM Model')
26 | model_args.add_arg('padding_val', int, 0, 'Padding value of PLM Model')
27 | model_args.add_arg('lm_dropout', float, 0.0, 'Dropout for PLM model')
28 | model_args.add_arg('hg_dropout', float, 0.4, 'Dropout for R-HGAT network')
29 | model_args.add_arg('hg_inter_dim', int, 384, 'Size of the intermediate representations in the R-HGAT network')
30 | model_args.add_arg('hg_layers', int, 1, 'Number of layers for R-HGAT network')
31 | model_args.add_arg('inter_size', int, 3072, 'Size of middle representations in FFN module')
32 | model_args.add_arg('leaky_alpha', float, 0.2, 'Alpha setting for leaky_relu in R-HGAT network')
33 | model_args.add_arg('edge_trans', bool, True, 'Transform hyperedge embedding (construction)')
34 | model_args.add_arg('remove_layernorm', bool, False, 'Remove layernorm for the embedding of cxg')
35 | # If enable multi-head R-HGAT (Not available in the repo)
36 | model_args.add_arg('heads_num', int, 12, 'Head num of hyper graph attention')
37 | # If enable syntactic graph (Not available in the repo)
38 | model_args.add_arg('parse_syntax', bool, False, 'Whether to inject syntactic information')
39 | model_args.add_arg('parse_direct', bool, False, 'Whether to construct a directed graph')
40 |
41 | # Tokenizer and CxG Processor Params
42 | tokenizer_args = ArgumentGroup(parser, 'tokenizer', 'Tokenizer Settings')
43 | tokenizer_args.add_arg('word_vocab_path', str, 'dataset/Vocab/BERT/', 'LM Vocab path')
44 | tokenizer_args.add_arg('cxg_vocab_path', str, 'dataset/Vocab/CxG/', 'CxG vocab path')
45 | tokenizer_args.add_arg('do_lower_case', bool, True, 'Lower case the elements')
46 |
47 | # Pre-trained Model Params
48 | pretrained_args = ArgumentGroup(parser, 'pretrained', 'Pre-trained Model Settings')
49 | pretrained_args.add_arg('lm_group', str, 'BERT', 'Pre-trained language model group, e.g., BERT/RoBERTa')
50 | pretrained_args.add_arg('use_lm', bool, True, 'Whether Model Use pre-trained language models')
51 | pretrained_args.add_arg('lm_path', str, 'bert-base-uncased', 'Pre-trained model path')
52 | pretrained_args.add_arg('lm_hidden_size', int, 768, 'HiddenSize of PLM')
53 | pretrained_args.add_arg('output_hidden_states', bool, True, 'Output PLM hidden states at token level')
54 | pretrained_args.add_arg('finetune', bool, True, 'Finetune Or freeze PLM')
55 |
56 | # Training Params
57 | train_args = ArgumentGroup(parser, 'train', 'Training Settings')
58 | train_args.add_arg('batch_size', int, 32, 'Batch size for training, depending on the memory size of your GPU')
59 | train_args.add_arg('shuffle', bool, True, 'DataLoader shuffle params, should be True when training')
60 | train_args.add_arg('droplast', bool, False, 'Whether to drop rest data for dataloader')
61 | train_args.add_arg('lr', float, 2e-5, 'Learning rate')
62 | train_args.add_arg('wd', float, 1e-2, 'Weight decay')
63 | train_args.add_arg('max_grad_norm', float, 1.0, 'Gradient clipping')
64 | train_args.add_arg('num_epoch', int, 50, 'Epoch param')
65 | train_args.add_arg('warmup_ratio', float, 0.06, 'Warm-up ratio of the total training steps')
66 | train_args.add_arg('print_step', int, 5, 'Training Print Steps')
67 | train_args.add_arg('eval_step', int, 50, 'Evaluating Steps')
68 | train_args.add_arg('scheduler', bool, False, 'Whether to apply scheduler for training')
69 |
70 | args = parser.parse_args()
71 | return args
72 |
73 |
--------------------------------------------------------------------------------
/HyCxG/dataset/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset/README_ZH.md)
17 |
18 | ## Construction grammar list
19 |
20 | In this repository, we provide mirror data and a download script for construction grammar lists in six languages (English, French, German, Spanish, Dutch, and Turkish), based on the `c2xg` package.
21 |
22 | ### Download data
23 | You can use the following command to download the data (note: all parameters have default values, so the script can be run without any arguments):
24 | ```shell
25 | bash download_vocab.sh [--LANGUAGES]
26 | ```
27 | **Parameters:**
28 | + LANGUAGES: The abbreviations of the required languages. To download all languages, pass `all` as the `LANGUAGES` parameter. To download only some of them, list the corresponding language codes from the abbreviations below:
29 | 1. English: `eng`
30 | 2. French: `fra`
31 | 3. German: `deu`
32 | 4. Spanish: `spa`
33 | 5. Dutch: `nld`
34 | 6. Turkish: `tur`
35 |
36 | **Note:** The construction grammar lists for the selected languages will be downloaded to the `CxG` folder in the current directory.
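
For example, to fetch every construction grammar list at once (which is also the default behaviour), run:
```shell
bash download_vocab.sh all
```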
37 |
38 | ### Resource of data
39 | Our mirror data is obtained from the `c2xg` package; the original files are available at [c2xg - data](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/models). If you also use these construction grammar lists, you can cite the corresponding paper:
40 | ```
41 | @article{dunn2017computational,
42 | title={Computational learning of construction grammars},
43 | author={Dunn, Jonathan},
44 | journal={Language and cognition},
45 | volume={9},
46 | number={2},
47 | pages={254--292},
48 | year={2017},
49 | publisher={Cambridge University Press}
50 | }
51 | ```
52 |
--------------------------------------------------------------------------------
/HyCxG/dataset/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset/README_ZH.md)
16 | ## 构式语法表数据
17 |
18 | 在本仓库中,我们基于c2xg包提供了六种语言(英语、法语、德语、西班牙语、荷兰语以及土耳其语)的构式语法列表的镜像数据以及下载脚本。
19 |
20 | ### 数据下载
21 | 您可以用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值):
22 | ```shell
23 | bash download_vocab.sh [--LANGUAGES]
24 | ```
25 | **参数含义:**
26 | + LANGUAGES: 所需语言的简称,如果需要下载全部语言,使用`all`即可。若只希望下载部分语言,请按照以下对应关系在参数中加入所需语言简称:
27 | 1. 英语:`eng`
28 | 2. 法语:`fra`
29 | 3. 德语:`deu`
30 | 4. 西班牙语:`spa`
31 | 5. 荷兰语:`nld`
32 | 6. 土耳其语:`tur`
33 |
34 | **注意:** 不同语言对应的构式语法列表会下载到本目录下的`CxG`文件夹中
35 |
36 | ### 数据来源
37 | 本部分数据来自c2xg,我们的镜像数据来源[c2xg - data](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/models),如果您也使用了该语法表,您可以引用他们的论文:
38 | ```
39 | @article{dunn2017computational,
40 | title={Computational learning of construction grammars},
41 | author={Dunn, Jonathan},
42 | journal={Language and cognition},
43 | volume={9},
44 | number={2},
45 | pages={254--292},
46 | year={2017},
47 | publisher={Cambridge University Press}
48 | }
49 | ```
50 |
--------------------------------------------------------------------------------
/HyCxG/dataset/download_vocab.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DETECT_FLAG=dataset/Vocab
4 | # (English, Turkish, Dutch, Spanish, French, German)
5 | LANGUAGE_1=$1
6 | LANGUAGE_2=$2
7 | LANGUAGE_3=$3
8 | LANGUAGE_4=$4
9 | LANGUAGE_5=$5
10 | LANGUAGE_6=$6
11 |
12 | if [ -d "$DETECT_FLAG" ]; then
13 | OUT_PATH=$DETECT_FLAG
14 | else
15 | OUT_PATH='Vocab'
16 | fi
17 |
18 | if [ -d "$OUT_PATH" ]; then
19 | echo "$OUT_PATH folder exists, pass"
20 | else
21 | echo "$OUT_PATH folder does not exist, try to mkdir"
22 | mkdir "$OUT_PATH"
23 | fi
24 |
25 | VOCAB_DIR=$OUT_PATH/CxG
26 |
27 | if [ -d "$VOCAB_DIR" ]; then
28 | echo "$VOCAB_DIR folder exists, pass"
29 | else
30 | echo "$VOCAB_DIR folder does not exist, try to mkdir"
31 | mkdir "$VOCAB_DIR"
32 | fi
33 |
34 | if test -z "$LANGUAGE_1"
35 | then
36 | LANGUAGE_1=all
37 | fi
38 |
39 | echo "Downloading special_tokens_map.json"
40 | wget -O $OUT_PATH/special_tokens_map.json https://expic.xlxw.org/hycxg/cxgvocab/special_tokens_map.json
41 |
42 | echo "Downloading construction grammar list"
43 | echo "Original data for construction grammar list of languages can be found in (c2xg package) https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/models"
44 |
45 | for LANG in $LANGUAGE_1 $LANGUAGE_2 $LANGUAGE_3 $LANGUAGE_4 $LANGUAGE_5 $LANGUAGE_6
46 | do
47 | # English
48 | if [[ "$LANG" == "eng" ]] || [[ "$LANG" == "all" ]];then
49 | wget -O $VOCAB_DIR/construction.txt https://expic.xlxw.org/hycxg/cxgvocab/construction.txt
50 | wget -O $VOCAB_DIR/construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/construction.pkl
51 | fi
52 | # Turkish
53 | if [[ "$LANG" == "tur" ]] || [[ "$LANG" == "all" ]];then
54 | wget -O $VOCAB_DIR/tur.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/tur.construction.txt
55 | wget -O $VOCAB_DIR/tur.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/tur.construction.pkl
56 | fi
57 | # French
58 | if [[ "$LANG" == "fra" ]] || [[ "$LANG" == "all" ]];then
59 | wget -O $VOCAB_DIR/fra.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/fra.construction.txt
60 | wget -O $VOCAB_DIR/fra.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/fra.construction.pkl
61 | fi
62 | # Spanish
63 | if [[ "$LANG" == "spa" ]] || [[ "$LANG" == "all" ]];then
64 | wget -O $VOCAB_DIR/spa.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/spa.construction.txt
65 | wget -O $VOCAB_DIR/spa.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/spa.construction.pkl
66 | fi
67 | # German
68 | if [[ "$LANG" == "deu" ]] || [[ "$LANG" == "all" ]];then
69 | wget -O $VOCAB_DIR/deu.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/deu.construction.txt
70 | wget -O $VOCAB_DIR/deu.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/deu.construction.pkl
71 | fi
72 | # Dutch
73 | if [[ "$LANG" == "nld" ]] || [[ "$LANG" == "all" ]];then
74 | wget -O $VOCAB_DIR/nld.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/nld.construction.txt
75 | wget -O $VOCAB_DIR/nld.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/nld.construction.pkl
76 | fi
77 | done
78 |
79 | echo "The CxG vocab of languages are stored in $OUT_PATH"
80 |
--------------------------------------------------------------------------------
/HyCxG/run_hycxg.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2023 The ZJU MMF Authors (Lvxiaowei Xu, Jianwang Wu, Jiawei Peng, Zhilin Gong, Ming Cai * and Tianxiang Wang).
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Train HyCxG
17 | # Global Variable (!!! SHOULD ADAPT TO YOUR CONFIGURATION !!!)
18 | PLM_PATH_ABBR=bert-base-uncased
19 | NUM_WORKERS=1 # Recommended: the more workers the better, if resources allow.
20 |
21 | # ABSA
22 | python train_hycxg.py --mode train --checkp hyper_cxg_mams \
23 | --data_name JSONABSA_MAMS --num_classes 3 --num_workers $NUM_WORKERS \
24 | --lm_path $PLM_PATH_ABBR --lm_group BERT
25 |
26 | # GLUE - Base model
27 | # Note: for the STS task, num_classes needs to be set to 1
28 | python train_hycxg.py --mode train --checkp hyper_cxg_mrpc \
29 | --data_name JSONGLUE_MRPC --num_classes 2 --num_workers $NUM_WORKERS \
30 | --lm_path $PLM_PATH_ABBR --lm_group RoBERTa
31 |
32 | # GLUE - Large model
33 | python train_hycxg.py --mode train --checkp hyper_cxg_mrpc \
34 | --data_name JSONGLUE_MRPC --num_classes 2 --num_workers $NUM_WORKERS \
35 | --lm_hidden_size 1024 --inter_size 4096 \
36 | --lm_path $PLM_PATH_ABBR --lm_group RoBERTa
--------------------------------------------------------------------------------
/HyCxG/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from utils.misc import _get_device as get_device, set_seed, print_config as arg_show, cal4scheduler
2 | from utils.data import read_dataset, collate_hypercxg_aspect, collate_hypercxg_glue, output_results, tokenize_aspect, tokenize_glue, construct_dependency_graph, pair_hypocxg, reconstruct_sentence, calculate_cxg_size
3 | from utils.define import *
4 | from utils.coverage import cxg_max_coverage
5 | from utils.hypergraph import construct_graph
6 | from utils.operates import _padding as padding, _save_model as save_model, _attention_mask as attention_mask, _pad_adj as pad_adj
7 | from utils.metric import Metric
8 | from utils.argument import Args_trans
9 | from utils.optimizers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, \
10 | get_cosine_with_hard_restarts_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, \
11 | get_constant_schedule, get_constant_schedule_with_warmup
12 |
13 | __all__ = ['get_device', 'set_seed', 'arg_show', 'cal4scheduler',
14 | 'read_dataset', 'collate_hypercxg_aspect', 'collate_hypercxg_glue', 'output_results', 'tokenize_aspect', 'tokenize_glue', 'pair_hypocxg', 'reconstruct_sentence', 'calculate_cxg_size',
15 | 'DATASET_MAP', 'ABSA_POLARITY_MAP',
16 | 'cxg_max_coverage', 'construct_graph',
17 | 'padding', 'save_model', 'attention_mask', 'pad_adj',
18 | 'Metric', 'Args_trans',
19 |            'get_linear_schedule_with_warmup', 'get_cosine_schedule_with_warmup', 'get_cosine_with_hard_restarts_schedule_with_warmup',
20 |            'get_polynomial_decay_schedule_with_warmup', 'get_constant_schedule', 'get_constant_schedule_with_warmup']
--------------------------------------------------------------------------------
/HyCxG/utils/argument.py:
--------------------------------------------------------------------------------
1 | def str2bool(v):
2 | return v.lower() in ("true", "t", "1")
3 |
4 | class ArgumentGroup(object):
5 | def __init__(self, parser, title, des):
6 | self._group = parser.add_argument_group(title=title, description=des)
7 |
8 | def add_arg(self, name, type, default, help, **kwargs):
9 | type = str2bool if type == bool else type
10 | self._group.add_argument(
11 | "--" + name,
12 | default=default,
13 | type=type,
14 | help=help + ' Default: %(default)s.',
15 | **kwargs)
16 |
17 | class Args_trans(object):
18 | def __init__(self, args):
19 | self.args = args
20 | for k in args:
21 | if type(args[k]) == str:
22 | exec("self." + k + "='%s'" % args[k])
23 | elif type(args[k]) == int:
24 | exec("self." + k + "=%d" % args[k])
25 | elif type(args[k]) == float:
26 | exec("self." + k + "=%f" % args[k])
27 | elif type(args[k]) == bool:
28 | if args[k]:
29 | exec("self." + k + "=True")
30 | else:
31 | exec("self." + k + "=False")
32 |
33 | def __str__(self):
34 | for var in self.__dict__:
35 | print('>> {} : {}'.format(var, self.__dict__[var]))
--------------------------------------------------------------------------------
/HyCxG/utils/coverage.py:
--------------------------------------------------------------------------------
1 | from Simuann import CxGCoverage
2 |
3 | def init_solver_state(cxg_length : int) -> list:
4 | init_state = [0] * cxg_length
5 | return init_state
6 |
7 | def _unpack(cxg_dict : dict) -> dict:
8 | starts, ends, patterns = [], [], []
9 | for cxg in cxg_dict:
10 | starts.append(cxg_dict[cxg][0])
11 | ends.append(cxg_dict[cxg][1])
12 | patterns.append(cxg)
13 | return {'patterns' : patterns, 'starts' : starts, 'ends': ends}
14 |
15 | def pre_detect(cons_pos : list) -> bool:
16 | if len(cons_pos) < 2:
17 | return False
18 | else:
19 | coverage = []
20 | for cons in cons_pos: coverage.extend(list(range(cons[0], cons[1])))
21 | if len(coverage) - len(set(coverage)) == 0: return False
22 | else:
23 | return True
24 |
25 | def construct_dict(cons_pos: list) -> dict:
26 | # To avoid dup in the origin way of "dict([[ele[3], (ele[0], ele[1])] for ele in cons_pos])"
27 | cxg_dict, cxg_counter = {}, {}
28 | for cons in cons_pos:
29 | if cons[3] in cxg_dict:
30 | cxg_dict[cons[3] + '--[{}]'.format(cxg_counter[cons[3]] + 1)] = (cons[0], cons[1])
31 | cxg_counter[cons[3]] += 1
32 | else:
33 | cxg_dict[cons[3]] = (cons[0], cons[1])
34 | cxg_counter[cons[3]] = 1
35 | return cxg_dict
36 |
37 | def cxg_max_coverage(starts, ends, indexs, cxgs, T_minutes = 0.05) -> list:
38 | cons_pos = list(zip(starts, ends, indexs, cxgs))
39 | cons_pos.sort(key=lambda x : x[0])
40 | # PRE-FILTER
41 | flag = pre_detect(cons_pos)
42 | if not flag: return cons_pos
43 | cxg_dict = construct_dict(cons_pos)
44 | init_state = init_solver_state(len(cxgs))
45 | solver = CxGCoverage(init_state, **_unpack(cxg_dict))
46 | solver.set_schedule(solver.auto(minutes=T_minutes))
47 | state, _ = solver.anneal()
48 | results = [cons_pos[ids] for ids in range(len(state)) if state[ids] == 1]
49 | return results
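50 |
51 | # Usage sketch (illustrative values): construction i covers tokens [starts[i], ends[i])
52 | # and carries the vocabulary index indexs[i] and pattern string cxgs[i]. Overlapping
53 | # spans trigger the simulated-annealing solver; otherwise every construction is kept.
54 | #   kept = cxg_max_coverage(starts=[0, 2], ends=[3, 5], indexs=[10, 42],
55 | #                           cxgs=['cxg-a', 'cxg-b'], T_minutes=0.05)
56 | #   # kept is a list of (start, end, index, pattern) tuples selected for coverage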
--------------------------------------------------------------------------------
/HyCxG/utils/hypergraph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.sparse as sp
3 |
4 | def construct_graph(cxg, mask, pad_len : int = 150):
5 | assert len(cxg) == len(mask)
6 | HT, edges = [], []
7 | max_n_edge = max([len(item) + 1 for item in cxg])
8 | for idx in range(len(cxg)):
9 | bs_cxg = cxg[idx]
10 |
11 | rows = []
12 | cols = []
13 | vals = []
14 |
15 | edge_labels = []
16 | for edge in range(len(bs_cxg)):
17 | start = bs_cxg[edge][0] + 1 # 1 - CLS
18 | end = bs_cxg[edge][1] + 1 # 1 - CLS
19 | edge_label = bs_cxg[edge][2] # CxG Index
20 | edge_labels.append(edge_label)
21 | for node in range(start, end):
22 | rows.append(node)
23 | cols.append(edge)
24 | vals.append(1.0)
25 |
26 | # FULLY RELATION
27 | # Not available in this repo
28 | try:
29 | u_H = sp.coo_matrix((vals, (rows, cols)), shape=(pad_len, max_n_edge))
30 | HT.append(np.asarray(u_H.T.todense()))
31 | except:
32 |             u_H = np.zeros((pad_len, max_n_edge), dtype=float)
33 | HT.append(u_H.T)
34 | edges.append(edge_labels + [0] * (max_n_edge - len(edge_labels)))
35 | return HT, edges
36 |
37 |
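38 | # Shape sketch (illustrative): `cxg` holds one list of (start, end, cxg_index) spans per
39 | # sample; `mask` only needs to have the same batch size here.
40 | #   HT, edges = construct_graph([[(0, 3, 12), (2, 5, 7)]], mask=[None], pad_len=150)
41 | #   # HT[0] has shape (max_n_edge, pad_len) = (3, 150); edges[0] == [12, 7, 0]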
--------------------------------------------------------------------------------
/HyCxG/utils/metric.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, matthews_corrcoef
2 | import numpy as np
3 | from scipy.stats import pearsonr
4 |
5 | class Metric(object):
6 | def __init__(self, args):
7 | self.args = args
8 |
9 | def __call__(self, preds : np.ndarray, truth : np.ndarray, sets_name = 'ABSA'):
10 | if sets_name not in ['STS']:
11 | accuracy = accuracy_score(truth, preds)
12 | precision = precision_score(truth, preds, average='macro')
13 | recall = recall_score(truth, preds, average='macro')
14 | f1score = f1_score(truth, preds, average='macro')
15 | return accuracy, precision, recall, f1score
16 | else:
17 | pearson = pearsonr(truth, preds)[0]
18 | return pearson
19 |
20 | def report(self, preds : np.ndarray, truth : np.ndarray, digit : int = 5):
21 | print(classification_report(truth, preds, digits=digit))
22 |
23 | def print_matthew(self, preds : np.ndarray, truth : np.ndarray):
24 | matt = matthews_corrcoef(truth, preds) * 100
25 | print('CoLA Matthews_coef = {:.3f}'.format(matt))
26 | return matt
--------------------------------------------------------------------------------
/HyCxG/utils/misc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import random
4 | from argparse import Namespace
5 |
6 | # Device
7 | def _get_device(cuda : bool, gpu_id : int = 0) -> torch.device:
8 | gpu_count = torch.cuda.device_count()
9 | if torch.cuda.is_available() and gpu_id < gpu_count:
10 | device = torch.device("cuda:" + str(gpu_id) if cuda else "cpu")
11 | else:
12 | device = torch.device("cpu")
13 | return device
14 |
15 | # Seed
16 | def set_seed(args):
17 | args.seed = int(args.seed)
18 | torch.manual_seed(args.seed)
19 | np.random.seed(args.seed)
20 | random.seed(args.seed)
21 | torch.cuda.manual_seed(args.seed)
22 |
23 | def print_config(args : Namespace):
24 | print(args)
25 |
26 | def cal4scheduler(args, epoch_nums : int, train_num : int, batch_size : int, warm_up : float):
27 | import math
28 | train_steps = math.ceil(epoch_nums * train_num / batch_size)
29 | warm_up_steps = math.floor(train_steps * warm_up)
30 | args.train_steps = train_steps
31 | args.warmup_steps = warm_up_steps
32 | return args
--------------------------------------------------------------------------------
/HyCxG/utils/operates.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from scipy.special import logsumexp
4 | from utils.define import LM_PAD
5 |
6 | def _padding(inputs : list, paddings : int, pad_val : int, lm_group : str = 'BERT') -> np.ndarray:
7 | if lm_group in LM_PAD.keys():
8 | pad_val = LM_PAD[lm_group]
9 | doc = np.array([
10 | np.pad(x[0:paddings], ( 0, paddings - len(x[0:paddings])),
11 | 'constant', constant_values=pad_val)
12 | for x in inputs
13 | ]).astype('int64')
14 | return doc
15 |
16 | def _pad_adj(inputs : list, paddings : int, pad_val : int) -> np.ndarray:
17 | batch = len(inputs)
18 | adjs = np.zeros((batch, paddings, paddings)) # Not available in this repo
19 | return adjs
20 |
21 | def _attention_mask(padded : np.ndarray, pad_val : int, lm_group : str = 'BERT') -> torch.Tensor:
22 | if lm_group in LM_PAD.keys():
23 | pad_val = LM_PAD[lm_group]
24 | np_mask = (padded != pad_val).astype('int32')
25 | return torch.from_numpy(np_mask)
26 |
27 | def _save_model(path : str, checkp : dict) -> None:
28 | torch.save(checkp, path)
29 |
30 | def _normalize_logits(logits):
31 | numerator = logits
32 | denominator = logsumexp(logits)
33 | return numerator - denominator
34 |
35 | def _softmax_logits(logits :torch.Tensor, dim : int = 1):
36 | return torch.softmax(logits, dim=dim)
--------------------------------------------------------------------------------
/HyCxG/utils/optimizers.py:
--------------------------------------------------------------------------------
1 | from torch.optim import Optimizer
2 | from torch.optim.lr_scheduler import LambdaLR
3 | import math
4 |
5 | def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
6 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
7 |
8 |
9 | def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
10 |
11 | def lr_lambda(current_step: int):
12 | if current_step < num_warmup_steps:
13 | return float(current_step) / float(max(1.0, num_warmup_steps))
14 | return 1.0
15 |
16 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
17 |
18 |
19 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
20 | def lr_lambda(current_step: int):
21 | if current_step < num_warmup_steps:
22 | return float(current_step) / float(max(1, num_warmup_steps))
23 | return max(
24 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
25 | )
26 |
27 | return LambdaLR(optimizer, lr_lambda, last_epoch)
28 |
29 |
30 | def get_cosine_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1):
31 | def lr_lambda(current_step):
32 | if current_step < num_warmup_steps:
33 | return float(current_step) / float(max(1, num_warmup_steps))
34 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
35 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
36 |
37 | return LambdaLR(optimizer, lr_lambda, last_epoch)
38 |
39 |
40 | def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1):
41 | def lr_lambda(current_step):
42 | if current_step < num_warmup_steps:
43 | return float(current_step) / float(max(1, num_warmup_steps))
44 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
45 | if progress >= 1.0:
46 | return 0.0
47 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
48 |
49 | return LambdaLR(optimizer, lr_lambda, last_epoch)
50 |
51 |
52 | def get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1):
53 | lr_init = optimizer.defaults["lr"]
54 |     assert lr_init > lr_end, f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})"
55 |
56 | def lr_lambda(current_step: int):
57 | if current_step < num_warmup_steps:
58 | return float(current_step) / float(max(1, num_warmup_steps))
59 | elif current_step > num_training_steps:
60 | return lr_end / lr_init # as LambdaLR multiplies by lr_init
61 | else:
62 | lr_range = lr_init - lr_end
63 | decay_steps = num_training_steps - num_warmup_steps
64 | pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
65 | decay = lr_range * pct_remaining ** power + lr_end
66 | return decay / lr_init # as LambdaLR multiplies by lr_init
67 |
68 | return LambdaLR(optimizer, lr_lambda, last_epoch)
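69 |
70 | # Usage sketch (illustrative): the step counts are typically prepared by
71 | # utils.misc.cal4scheduler, which fills args.train_steps and args.warmup_steps:
72 | #   scheduler = get_linear_schedule_with_warmup(optimizer,
73 | #                                               num_warmup_steps=args.warmup_steps,
74 | #                                               num_training_steps=args.train_steps)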
--------------------------------------------------------------------------------
/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | # HyCxG
10 | 论文"**Enhancing Language Representation with Constructional Information for Natural Language Understanding**"的代码仓库
11 |
12 |
13 |
14 |
15 | [**English**](https://github.com/xlxwalex/HyCxG/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/README_ZH.md)
16 |
17 |
18 | 🔗 [数据集](https://github.com/xlxwalex/HyCxG/tree/main/data) • [教程](https://github.com/xlxwalex/HyCxG/tree/main/tutorials) • [指南](https://github.com/xlxwalex/HyCxG/tree/main/guidelines) • [快速开始](#-快速开始) • [相关工作](https://github.com/xlxwalex/HyCxG/blob/main/tutorials/PaperLists.md) • [FAQ❓](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/faq.md)
19 |
20 | > **注意**
21 | >
22 | > 本仓库还在建设中,需要过一段时间才能完成
23 | >
24 |
25 | ## 🌀 目录
26 | * [📖 HyCxG介绍](#-hycxg介绍)
27 | * [📃 仓库资源](#-仓库资源)
28 | * [🐍 快速开始](#-快速开始)
29 | * [🔗 其他信息](#-使用的项目)
30 |
31 | ## 📖 HyCxG介绍
32 | **构式语法** (Construction Grammar, CxG)是认知语言学的一个分支。它认为语法是词汇、形态和句法的连续统。构式可以被定义为一系列存储不同形式和意义对的语言模式项(Linguistic Pattern)。由于构式的意义被分配给这些模式项而不是其实例化后内部包含的特定词汇,因此通过预训练模型学习构式信息可能更具挑战且需要大量的训练数据,这可能在自然语言理解任务中遇到问题。
33 |
34 | 这促使我们有动机将构式语法与预训练模型结合起来。因此我们提出了一个新的编码框架 - **HyCxG**(基于构式语法的超图网络),其通过三阶段过程来使用构式信息增强语言表示。首先,我们从句子中提取和选择出所需的构式集合。然后使用关系引导的超图注意网络将构式信息附加到词汇表示上。最后我们获取了最终表示就可以在各种下游任务中进行微调了。
35 |
36 | ## 📃 仓库资源
37 | 本代码仓库中各部分包含的内容为:
38 | - [**HyCxG**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG) 包含了HyCxG的完整框架
39 | - [**Data**](https://github.com/xlxwalex/HyCxG/tree/main/data) 包括了该工作中用到的所有数据集以及处理脚本。其中的绝大部分数据会从我们的镜像源中进行下载。同时,该部分也提供了基线模型对一些数据的处理脚本
40 | - [**Tutorial**](https://github.com/xlxwalex/HyCxG/tree/main/tutorials) 包含了一些HyCxG相关的教程以及与我们工作相关的资源
41 | - [**Guideline**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines) (正在建设中) 展示了基线模型的一些信息和问答内容
42 |
43 |
44 | ## 🐍 快速开始
45 | **1 实验环境搭建**
46 |
47 | 我们采用了`Python=3.8.5`作为基础实验环境,您可以用以下代码创建环境并安装依赖的包:
48 | ```shell
49 | conda create -n hycxg_env python=3.8.5
50 | source activate hycxg_env
51 | pip install -r requirements.txt
52 | ```
53 |
54 | **2 准备数据集**
55 | 我们在[`data`](https://github.com/xlxwalex/HyCxG/tree/main/data)文件夹中提供了数据下载脚本。你可以直接用以下代码来获得所有数据集:
56 | ```shell
57 | cd data
58 | bash data_pipeline.sh
59 | ```
60 | 在下载完数据后,请将每个数据文件夹(例如JSONABSA_MAMS)移动到[`HyCxG/dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) 路径下
61 |
62 | **3 准备组件所需数据**
63 | 在运行代码之前,您需要下载组件必须的数据(例如构式表),关于下载步骤请分别见[`HyCxG/dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) 以及 [`HyCxG/Tokenizer`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) 。 你也可以直接通过以下代码来下载这些数据到对应位置:
64 | ```shell
65 | cd HyCxG/dataset
66 | bash download_vocab.sh
67 | cd ../Tokenizer
68 | bash download_cxgdict.sh
69 | ```
70 |
71 | **4 运行HyCxG**
72 |
73 | 我们提供了一些HyCxG的运行样例脚本在[`HyCxG/run_hycxg.sh`](https://github.com/xlxwalex/HyCxG/blob/main/HyCxG/run_hycxg.sh)中,方便您参考
74 |
75 |
76 | ## 🙏 使用的项目
77 | - [c2xg](https://github.com/jonathandunn/c2xg) 被用于从句子中抽取构式
78 | - [simanneal](https://github.com/perrygeo/simanneal)是一个很方便的模拟退火框架被用于解决Cond-MC问题
79 |
80 | ## 👋 引用
81 | 如果您认为我们的工作对您有帮助,您可以引用我们的论文: Enhancing Language Representation with Constructional Information for Natural Language Understanding
82 | ```
83 | @inproceedings{xu-etal-2023-enhancing,
84 | title = "Enhancing Language Representation with Constructional Information for Natural Language Understanding",
85 | author = "Xu, Lvxiaowei and
86 | Wu, Jianwang and
87 | Peng, Jiawei and
88 | Gong, Zhilin and
89 | Cai, Ming and
90 | Wang, Tianxiang",
91 | booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
92 | year = "2023",
93 | publisher = "Association for Computational Linguistics",
94 | url = "https://aclanthology.org/2023.acl-long.258",
95 | pages = "4685--4705",
96 | }
97 | ```
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 | ## 📧 联系我们
107 | 如果您对代码有任何问题,可以提交Issue或联系 [`xlxw@zju.edu.cn`](mailto:xlxw@zju.edu.cn)
108 |
--------------------------------------------------------------------------------
/data/ABSA/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA/README_ZH.md)
17 |
18 | ## Aspect-based Sentiment Analysis Dataset for ABSA
19 |
20 | The Aspect-based Sentiment Analysis (ABSA) dataset is based on SemEval 2014, 2015, 2016, and MAMS. SemEval 2014 contains reviews from two domains: restaurants and laptops, while SemEval 2015 and 2016 include reviews from the restaurant domain only. MAMS is a larger-scale dataset, where each sentence has multiple aspects, making it more challenging.
21 |
22 | ### Download and process the data
23 | Before using the data downloading and processing script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: all parameters have default values, so the script can be run without any arguments):
24 |
25 | Specifically, since the four datasets included in SemEval 2014, 2015 and 2016 only contain train and test sets, we randomly split the training set into new training and validation sets at a `9:1` ratio to better evaluate model performance. Such datasets are identified with a `Split` suffix.
26 |
27 | ```shell
28 | bash download_and_process_absa.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
29 | ```
30 | **Parameters:**
31 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder.
32 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`.
33 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory.
34 |
35 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed.
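
For example, a run that keeps the default raw-data and output folders but reuses a parser shared with the other datasets could look like the following (assuming the three parameters are positional, as in the other download scripts of this repository):
```shell
bash download_and_process_absa.sh . dataset ../stanford-corenlp-3.9.2-minimal
```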
36 |
37 | ### Data processing for baseline models
38 | In Section 4.2 - Results on ABSA Tasks and Appendix K - Detailed Results on ABSA Tasks of our paper, we compared the performance of multiple baseline models. Therefore, we provide more information on reproducing the baseline models in [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
39 |
40 | ### Resource of data
41 | Our mirror data is obtained from SemEval 2014, 2015, 2016 and MAMS, via the data sources [ASGCN-data](https://github.com/GeneZC/ASGCN/tree/master/datasets) and [MAMS](https://github.com/siat-nlp/MAMS-for-ABSA). If you have also used these datasets, you can cite their papers as follows:
42 |
43 | **SemEval 2014**
44 | ```
45 | @inproceedings{pontiki2014semeval,
46 | title = "{S}em{E}val-2014 Task 4: Aspect Based Sentiment Analysis",
47 | author = "Pontiki, Maria and
48 | Galanis, Dimitris and
49 | Pavlopoulos, John and
50 | Papageorgiou, Harris and
51 | Androutsopoulos, Ion and
52 | Manandhar, Suresh",
53 | booktitle = "{S}em{E}val 2014",
54 | year = "2014",
55 | url = "https://aclanthology.org/S14-2004",
56 | pages = "27--35",
57 | }
58 | ```
59 | **SemEval 2015**
60 | ```
61 | @inproceedings{pontiki2015semeval,
62 | title = "{S}em{E}val-2015 Task 12: Aspect Based Sentiment Analysis",
63 | author = "Pontiki, Maria and
64 | Galanis, Dimitris and
65 | Papageorgiou, Haris and
66 | Manandhar, Suresh and
67 | Androutsopoulos, Ion",
68 | booktitle = "{S}em{E}val 2015",
69 | year = "2015",
70 | url = "https://aclanthology.org/S15-2082",
71 | pages = "486--495",
72 | }
73 | ```
74 | **SemEval 2016**
75 | ```
76 | @inproceedings{pontiki2016semeval,
77 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis",
78 | author = {Pontiki, Maria and
79 | Galanis, Dimitris and
80 | Papageorgiou, Haris and
81 | Androutsopoulos, Ion and
82 | Manandhar, Suresh and
83 | AL-Smadi, Mohammad and
84 | Al-Ayyoub, Mahmoud and
85 | Zhao, Yanyan and
86 | Qin, Bing and
87 | De Clercq, Orph{\'e}e and
88 | Hoste, V{\'e}ronique and
89 | Apidianaki, Marianna and
90 | Tannier, Xavier and
91 | Loukachevitch, Natalia and
92 | Kotelnikov, Evgeniy and
93 | Bel, Nuria and
94 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and
95 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en},
96 | booktitle = "{S}em{E}val-2016",
97 | year = "2016",
98 | url = "https://aclanthology.org/S16-1002",
99 | pages = "19--30",
100 | }
101 | ```
102 | **MAMS**
103 | ```
104 | @inproceedings{jiang2019challenge,
105 | title = "A Challenge Dataset and Effective Models for Aspect-Based Sentiment Analysis",
106 | author = "Jiang, Qingnan and
107 | Chen, Lei and
108 | Xu, Ruifeng and
109 | Ao, Xiang and
110 | Yang, Min",
111 | booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
112 | year = "2019",
113 | url = "https://aclanthology.org/D19-1654",
114 | pages = "6280--6285",
115 | }
116 | ```
117 |
--------------------------------------------------------------------------------
/data/ABSA/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA/README_ZH.md)
16 | ## 方面级情感分析数据集
17 |
18 | 方面级情感分析(Aspect-based sentiment analysis, ABSA)数据集基于SemEval 2014/2015/2016以及MAMS任务及数据。其中SemEval 2014中包含了餐厅(Restaurant)和笔记本电脑(laptop)两个领域的评论,而SemEval 2015/2016均为餐厅领域的评论。MAMS是一个更大尺度的数据集,其每个句子都有多个方面,因此更具挑战性。
19 |
20 | 比较特别的是,由于SemEval 2014/2015/2016包含的四个数据集仅包含训练集和测试集,因此为了能更好评估模型性能,我们独立地将训练集按照`9:1`随机划分为了新的训练集和验证集,该类数据集会用`Split`后缀进行标识。
21 |
22 | ### 数据下载及处理
23 | 在使用数据处理及下载脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值):
24 | ```shell
25 | bash download_and_process_absa.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
26 | ```
27 | **参数含义:**
28 | + DATA_DIR: 下载的原始数据所在文件夹, 默认为当前文件夹
29 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`dataset`
30 | + STANFORD_DIR: 斯坦福解析器所在位置,默认为上一级目录的`stanford-corenlp-3.9.2-minimal`
31 |
32 | **注意:** 如果共享的解析器文件夹不存在,那么程序会询问是否从我们的镜像数据源拉取斯坦福解析器(共353MB),选择`Y`即可
33 |
34 | ### 基准模型数据处理
35 | 在论文的Section4.2 - Results on ABSA tasks中以及Appendix K - Detailed Results on ABSA Tasks中我们对比了多个模型在方面级情感数据集上的性能,我们在[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)提供了更多关于基线模型的复现信息。
36 |
37 | ### 数据来源
38 | 本部分数据来自SemEval 2014/2015/2016,我们的镜像数据来源[ASGCN-data](https://github.com/GeneZC/ASGCN/tree/master/datasets)以及[MAMS](https://github.com/siat-nlp/MAMS-for-ABSA),如果您也使用了该数据集,您可以引用他们的论文,分别为:
39 |
40 | **SemEval 2014**
41 | ```
42 | @inproceedings{pontiki2014semeval,
43 | title = "{S}em{E}val-2014 Task 4: Aspect Based Sentiment Analysis",
44 | author = "Pontiki, Maria and
45 | Galanis, Dimitris and
46 | Pavlopoulos, John and
47 | Papageorgiou, Harris and
48 | Androutsopoulos, Ion and
49 | Manandhar, Suresh",
50 | booktitle = "{S}em{E}val 2014",
51 | year = "2014",
52 | url = "https://aclanthology.org/S14-2004",
53 | pages = "27--35",
54 | }
55 | ```
56 | **SemEval 2015**
57 | ```
58 | @inproceedings{pontiki2015semeval,
59 | title = "{S}em{E}val-2015 Task 12: Aspect Based Sentiment Analysis",
60 | author = "Pontiki, Maria and
61 | Galanis, Dimitris and
62 | Papageorgiou, Haris and
63 | Manandhar, Suresh and
64 | Androutsopoulos, Ion",
65 | booktitle = "{S}em{E}val 2015",
66 | year = "2015",
67 | url = "https://aclanthology.org/S15-2082",
68 | pages = "486--495",
69 | }
70 | ```
71 | **SemEval 2016**
72 | ```
73 | @inproceedings{pontiki2016semeval,
74 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis",
75 | author = {Pontiki, Maria and
76 | Galanis, Dimitris and
77 | Papageorgiou, Haris and
78 | Androutsopoulos, Ion and
79 | Manandhar, Suresh and
80 | AL-Smadi, Mohammad and
81 | Al-Ayyoub, Mahmoud and
82 | Zhao, Yanyan and
83 | Qin, Bing and
84 | De Clercq, Orph{\'e}e and
85 | Hoste, V{\'e}ronique and
86 | Apidianaki, Marianna and
87 | Tannier, Xavier and
88 | Loukachevitch, Natalia and
89 | Kotelnikov, Evgeniy and
90 | Bel, Nuria and
91 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and
92 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en},
93 | booktitle = "{S}em{E}val-2016",
94 | year = "2016",
95 | url = "https://aclanthology.org/S16-1002",
96 | pages = "19--30",
97 | }
98 | ```
99 | **MAMS**
100 | ```
101 | @inproceedings{jiang2019challenge,
102 | title = "A Challenge Dataset and Effective Models for Aspect-Based Sentiment Analysis",
103 | author = "Jiang, Qingnan and
104 | Chen, Lei and
105 | Xu, Ruifeng and
106 | Ao, Xiang and
107 | Yang, Min",
108 | booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
109 | year = "2019",
110 | url = "https://aclanthology.org/D19-1654",
111 | pages = "6280--6285",
112 | }
113 | ```
--------------------------------------------------------------------------------
/data/ABSA/process_absa.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | import sys
4 | import json
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 | import argparse
9 | from tqdm import tqdm
10 | from stanfordcorenlp import StanfordCoreNLP
11 | try:
12 | sys.path.append('.')
13 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK
14 | except:
15 | sys.path.append('..')
16 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK
17 |
18 | MAP_POLARITY = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
19 |
20 | def initialize_stanfordcore(stanford_path: str):
21 | try: nlpmodel = StanfordCoreNLP(stanford_path)
22 | except:
23 |         print('This script requires the Stanford CoreNLP parser (>=3.9.2), but the given path does not exist. Do you want to download it? (Y/N)')
24 | download_flag = input()
25 | download_flag = download_flag.lower()
26 | assert download_flag in ['y'], "Abort"
27 | download_stanfordcore(STANFORD_CORE_LINK, stanford_path+'.zip')
28 | unzip_stanfordcore(stanford_path+'.zip', '../')
29 | nlpmodel = StanfordCoreNLP(stanford_path)
30 | return nlpmodel
31 |
32 | def convert_raw2json(path : str, nlpmodel, desc: str='train', dataset_name: str='Rest14'):
33 | def parse_adj(edge):
34 | e_id, dep_rels, dep_heads = 1, [], []
35 | for eidx in range(len(edge)):
36 | if (eidx + 1) != edge[0][2]:
37 | dep_heads.append(edge[e_id][1])
38 | dep_rels.append(edge[e_id][0])
39 | e_id += 1
40 | else:
41 | dep_heads.append(0)
42 | dep_rels.append(edge[0][0])
43 | return dep_heads, dep_rels
44 |
45 | data = []
46 | with open(path, 'r', encoding='utf-8') as fp:
47 | raw_data = fp.readlines()
48 | fp.close()
49 | for idx in tqdm(range(0, len(raw_data), 3), desc='Process {} file in {}'.format(desc, dataset_name)):
50 | obj = {}
51 | sentence = raw_data[idx].strip()
52 | target = raw_data[idx + 1].strip()
53 | polarity = MAP_POLARITY[eval(raw_data[idx + 2].strip())]
54 | if '$T$' not in sentence:
55 | print('Error sentence : %s' % sentence)
56 | continue
57 | post_sentence = sentence.replace('$T$', target)
58 | obj['token'] = nlpmodel.word_tokenize(post_sentence)
59 | pos_tag = nlpmodel.pos_tag(post_sentence)
60 | dependecy = nlpmodel.dependency_parse(post_sentence)
61 | obj['pos'] = [tag[1] for tag in pos_tag]
62 | heads, rels = parse_adj(dependecy)
63 | obj['head'] = heads
64 | obj['deprel'] = rels
65 | obj['aspects'] = [{
66 | 'term' : nlpmodel.word_tokenize(target),
67 | 'from' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])),
68 | 'to' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])) + len(nlpmodel.word_tokenize(target)),
69 | 'polarity' : polarity
70 | }]
71 | data.append(obj)
72 | return data
73 |
74 | def read_semeval_data(path : str):
75 | data_out = []
76 | data = np.array(pd.read_csv(path))
77 | for dat in data: data_out.append([dat[0], dat[2], dat[1]])
78 | return data_out
79 |
80 | def output_json(data: list, folder_path: str, file_path: str):
81 | if not os.path.exists(folder_path): os.makedirs(folder_path)
82 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp:
83 | fp.write(json.dumps(data, indent=4))
84 | fp.close()
85 |
86 | def process_data(args: argparse.Namespace):
87 |     assert os.path.exists(args.train_file) and os.path.exists(args.test_file), "The data path does not exist, please download the data first."
88 | train_json = convert_raw2json(args.train_file, args.nlpmodel, dataset_name=args.dataset_name)
89 | output_json(train_json, args.out_path, 'train.json')
90 | print('The train file of {} dataset is saved at {}'.format(args.dataset_name, os.path.join(args.out_path, 'train.json')))
91 |
92 | valid_json = convert_raw2json(args.valid_file, args.nlpmodel, desc='valid', dataset_name=args.dataset_name)
93 | output_json(valid_json, args.out_path, 'valid.json')
94 |     print('The valid file of {} dataset is saved at {}'.format(args.dataset_name, os.path.join(args.out_path, 'valid.json')))
95 |
96 | test_json = convert_raw2json(args.test_file, args.nlpmodel, desc='test', dataset_name=args.dataset_name)
97 | output_json(test_json, args.out_path, 'test.json')
98 | print('The test file of {} dataset is saved at {}'.format(args.dataset_name, os.path.join(args.out_path, 'test.json')))
99 |
100 | def main():
101 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
102 | parser.add_argument("--out_path", default='dataset/JSONABSA_Rest14', type=str, help="Output path of ABSA dataset.")
103 | parser.add_argument("--stanford_path", default='stanford-corenlp-3.9.2-minimal', type=str, help="The path for stanfordparser.")
104 | parser.add_argument("--train_file", default='rest14_train.raw', type=str, help="The path of train file.")
105 | parser.add_argument("--valid_file", default='rest14_test.raw', type=str, help="The path of valid file.")
106 | parser.add_argument("--test_file", default='rest14_test.raw', type=str, help="The path of test file.")
107 | args = parser.parse_args()
108 | args.nlpmodel = initialize_stanfordcore(args.stanford_path)
109 | args.dataset_name = args.out_path.split('_')[-1]  # e.g. 'dataset/JSONABSA_Rest14' -> 'Rest14'
110 | process_data(args)
111 | print('ABSA data for {} has been processed.'.format(args.dataset_name))
112 |
113 | if __name__ == '__main__':
114 | main()
115 |
--------------------------------------------------------------------------------
/data/Colloquial/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/README_ZH.md)
17 |
18 | ## Colloquial Dataset for ABSA
19 |
20 | The colloquial ABSA data is composed of two sentiment analysis sources: the Twitter dataset and the GermEval2017 dataset. Both are derived from social media and therefore contain more colloquial language than other datasets, so they can serve as data for evaluating performance across different registers. Since GermEval is much larger than Twitter, we sampled a subset of GermEval for performance testing.
21 |
22 | ### Download and process the data
23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: you can omit all parameters, since each has a default value):
24 | ```shell
25 | bash download_and_process_colloquial.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
26 | ```
27 | **Parameters:**
28 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder.
29 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`.
30 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory.
31 |
32 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed. Additionally, GermEval requires the spaCy package, which automatically downloads the necessary models when initialized.
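
The bracketed names map to the script's positional arguments. For example, the following call (equivalent to the defaults) keeps the raw files in the current folder, writes the processed JSON to `dataset`, and reuses the shared parser from the parent directory:
```shell
bash download_and_process_colloquial.sh . dataset ../stanford-corenlp-3.9.2-minimal
```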
33 |
34 | ### Data processing for baseline models
35 | In Appendix H - Colloquial Expression Results of our paper, we compared the performance of four models, namely RGAT, DualGCN, DGEDT, and KumaGCN, on these colloquial sentiment datasets. Therefore, in the [`baseline`](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/baseline) folder, we provide a conversion script for the GermEval dataset (as Twitter is a commonly used dataset, these baseline models already include the processed data). For more information on reproducing the baseline models, please refer to the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
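
For reference, a minimal sketch of running the GermEval conversion scripts is shown below; it assumes the `germeval_{train,valid,test}.raw` files are placed where each script expects them (e.g. `./datasets/germeval/` relative to the working directory for the DGEDT graph generator) and that the spaCy German model is available as `de`:
```shell
# build DGEDT dependency graphs and the edge vocabulary next to the .raw files
python baseline/DGEDT_germeval_gengraph.py
# convert the .raw files into the JSON formats used by DualGCN and RGAT
python baseline/DualGCN_germeval_txt2json.py
python baseline/RGAT_germeval_txt2json.py
```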
36 |
37 | ### Resource of data
38 | Our mirror data for Twitter and GermEval is obtained from the original data sources [acl-14-short-data](https://github.com/songyouwei/ABSA-PyTorch/tree/master/datasets/acl-14-short-data) and [GermEval 2017](http://ltdata1.informatik.uni-hamburg.de/germeval2017/). If you also use these datasets, you can cite their papers as follows:
39 |
40 | #### Twitter Dataset
41 | ```
42 | @inproceedings{dong2014adaptive,
43 | title = "Adaptive Recursive Neural Network for Target-dependent {T}witter Sentiment Classification",
44 | author = "Dong, Li and
45 | Wei, Furu and
46 | Tan, Chuanqi and
47 | Tang, Duyu and
48 | Zhou, Ming and
49 | Xu, Ke",
50 | booktitle = "Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
51 | year = "2014",
52 | publisher = "Association for Computational Linguistics",
53 | url = "https://aclanthology.org/P14-2009",
54 | pages = "49--54",
55 | }
56 | ```
57 | #### GermEval 2017 Competition
58 | ```
59 | @article{wojatzki2017germeval,
60 | title={Germeval 2017: Shared task on aspect-based sentiment in social media customer feedback},
61 | author={Wojatzki, Michael and Ruppert, Eugen and Holschneider, Sarah and Zesch, Torsten and Biemann, Chris},
62 | journal={GermEval},
63 | pages={1--12},
64 | year={2017}
65 | }
66 | ```
--------------------------------------------------------------------------------
/data/Colloquial/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/README_ZH.md)
16 | ## Colloquial Dataset for ABSA
17 |
18 | The colloquial sentiment data consists of two aspect-based sentiment analysis datasets, Twitter and GermEval2017. Both are collected from social media platforms and therefore contain more colloquial language than other datasets, so they can serve as data for evaluating performance across different registers. Since GermEval is much larger than Twitter, we sampled a subset of GermEval for performance testing.
19 |
20 | ### Download and process the data
21 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (you can omit all parameters, since each has a default value):
22 | ```shell
23 | bash download_and_process_colloquial.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
24 | ```
25 | **Parameters:**
26 | + DATA_DIR: The folder where the downloaded raw data is located. Defaults to the current folder.
27 | + OUTPUT_DIR: The folder where the processed data is stored. Defaults to `dataset`.
28 | + STANFORD_DIR: The location of the Stanford parser. Defaults to `stanford-corenlp-3.9.2-minimal` in the parent directory.
29 |
30 | **Note:** If the shared parser directory does not exist, the program will ask whether to fetch the Stanford parser from our mirror data source (353MB in total); choose `Y` to proceed. Additionally, GermEval requires the spaCy package, which automatically downloads the required models when invoked.
31 |
32 | ### Data processing for baseline models
33 | In Appendix H - Colloquial Expression Results of our paper, we compared the performance of RGAT, DualGCN, DGEDT, and KumaGCN on the colloquial sentiment datasets. The [`baseline`](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/baseline) folder therefore provides conversion scripts for the GermEval data (since Twitter is a commonly used dataset, these baseline models already ship with the processed data). For more information on reproducing the baseline models, please refer to the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
34 |
35 | ### Resource of data
36 | This data comes from Twitter and GermEval; our mirror data sources are [acl-14-short-data](https://github.com/songyouwei/ABSA-PyTorch/tree/master/datasets/acl-14-short-data) and [GermEval 2017](http://ltdata1.informatik.uni-hamburg.de/germeval2017/). If you also use these datasets, you can cite their papers:
37 | #### Twitter Dataset
38 | ```
39 | @inproceedings{dong2014adaptive,
40 | title = "Adaptive Recursive Neural Network for Target-dependent {T}witter Sentiment Classification",
41 | author = "Dong, Li and
42 | Wei, Furu and
43 | Tan, Chuanqi and
44 | Tang, Duyu and
45 | Zhou, Ming and
46 | Xu, Ke",
47 | booktitle = "Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
48 | year = "2014",
49 | publisher = "Association for Computational Linguistics",
50 | url = "https://aclanthology.org/P14-2009",
51 | pages = "49--54",
52 | }
53 | ```
54 | #### GermEval 2017 Competition
55 | ```
56 | @article{wojatzki2017germeval,
57 | title={Germeval 2017: Shared task on aspect-based sentiment in social media customer feedback},
58 | author={Wojatzki, Michael and Ruppert, Eugen and Holschneider, Sarah and Zesch, Torsten and Biemann, Chris},
59 | journal={GermEval},
60 | pages={1--12},
61 | year={2017}
62 | }
63 | ```
--------------------------------------------------------------------------------
/data/Colloquial/baseline/DGEDT_germeval_gengraph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import spacy
3 | import pickle
4 | import tqdm
5 | nlp = spacy.load('de')
6 | import re
7 |
8 | def tokenize(text):
9 | text=text.strip()
10 | text=re.sub(r' {2,}',' ',text)
11 | document = nlp(text)
12 | return [token.text for token in document]
13 |
14 | def update_edge(text,vocab):
15 | # https://spacy.io/docs/usage/processing-text
16 | document = nlp(text)
17 | seq_len = len(text.split())
18 | for token in document:
19 | if token.dep_ not in vocab:
20 | vocab[token.dep_]=len(vocab)
21 | return 0
22 |
23 | def dependency_adj_matrix(text,edge_vocab):
24 | # https://spacy.io/docs/usage/processing-text
25 | document = nlp(text.strip())
26 | seq_len = len(tokenize(text))
27 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
28 | matrix1 = np.zeros((seq_len, seq_len)).astype('float32')
29 | edge = np.zeros((seq_len, seq_len)).astype('int32')
30 | edge1 = np.zeros((seq_len, seq_len)).astype('int32')
31 | assert len(document)==seq_len
32 | for token in document:
33 | if token.i >= seq_len:
34 | print('bug')
35 | print(text)
36 | print(text.split())
37 | print(document)
38 | print([token.i for token in document])
39 | print([token.text for token in document])
40 | a=input('hahha')
41 | if token.i < seq_len:
42 | matrix[token.i][token.i] = 1
43 | matrix1[token.i][token.i] = 1
44 | # https://spacy.io/docs/api/token
45 | for child in token.children:
46 | if child.i < seq_len:
47 | matrix[token.i][child.i] = 1
48 | matrix1[child.i][token.i] = 1
49 | edge[token.i][child.i] = edge_vocab.get(child.dep_,1)
50 | edge1[child.i][token.i] = edge_vocab.get(child.dep_,1)
51 | return matrix,matrix1,edge,edge1
52 |
53 | def concat(texts,aspect):
54 | source=''
55 | splitnum=0
56 | for i,text in enumerate(texts):
57 | source+=text
58 | splitnum+=len(tokenize(text))
59 | if i<len(texts)-1:
60 | # re-insert the aspect term between the left and right context pieces
61 | source+=' '+aspect+' '
62 | splitnum+=len(tokenize(aspect))
63 | # sanity check: the concatenation should tokenize to the same length as its pieces
64 | if splitnum!=len(tokenize(source)):
65 | print('bug: tokenized length mismatch in concat')
66 | return re.sub(r' {2,}',' ',source.strip())
67 |
68 | def process(filename, edge_vocab, savevocab):
69 | # Build the dependency graphs (and optionally the dependency-edge vocabulary) for one
70 | # .raw file containing (sentence with $T$, aspect, polarity) triples on consecutive lines.
71 | # The first call passes edge_vocab=None with savevocab=True so the vocabulary is created
72 | # here; later calls reuse the returned vocabulary and only dump the graphs.
73 | # '<pad>' and '<unk>' occupy the first two indices of the edge vocabulary.
74 | if savevocab:
75 | edge_vocab={'<pad>':0,'<unk>':1}
76 |
77 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
78 | lines = fin.readlines()
79 | fin.close()
80 | idx2graph = {}
81 | fout = open(filename+'.graph', 'wb')
82 | if savevocab:
83 | fout1 = open(filename+'.edgevocab', 'wb')
84 | if savevocab:
85 | for i in tqdm.tqdm(range(0, len(lines), 3)):
86 | text_left = [s.lower().strip() for s in lines[i].split("$T$")]
87 | aspect = lines[i + 1].lower().strip()
88 | update_edge(concat(text_left,aspect),edge_vocab)
89 | for i in tqdm.tqdm(range(0, len(lines), 3)):
90 | text_left = [s.lower().strip() for s in lines[i].split("$T$")]
91 | aspect = lines[i + 1].lower().strip()
92 | adj_matrix = dependency_adj_matrix(concat(text_left,aspect),edge_vocab)
93 | idx2graph[i] = adj_matrix
94 | pickle.dump(idx2graph, fout)
95 | if savevocab:
96 | pickle.dump(edge_vocab, fout1)
97 | fout.close()
98 | if savevocab:
99 | fout1.close()
100 | return edge_vocab
101 |
102 | def processe(filename,filename2):
103 | savevocab=True
104 |
105 | edge_vocab={'<pad>':0,'<unk>':1}
106 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
107 | lines = fin.readlines()
108 | fin.close()
109 | idx2graph = {}
110 | fout = open(filename+'.graph', 'wb')
111 | if savevocab:
112 | fout1 = open(filename+'.edgevocab', 'wb')
113 | if savevocab:
114 | for i in tqdm.tqdm(range(0, len(lines), 1)):
115 | update_edge(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab)
116 | for i in tqdm.tqdm(range(0, len(lines), 1)):
117 | adj_matrix = dependency_adj_matrix(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab)
118 | idx2graph[i] = adj_matrix
119 | pickle.dump(idx2graph, fout)
120 | if savevocab:
121 | pickle.dump(edge_vocab, fout1)
122 | fout.close()
123 | if savevocab:
124 | fout1.close()
125 | return edge_vocab
126 |
127 | if __name__ == '__main__':
128 | edge_vocab = process('./datasets/germeval/germeval_train.raw', None, True)
129 | process('./datasets/germeval/germeval_valid.raw', edge_vocab, False)
130 | process('./datasets/germeval/germeval_test.raw', edge_vocab, False)
131 |
--------------------------------------------------------------------------------
/data/Colloquial/baseline/DualGCN_germeval_txt2json.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import spacy
3 | import pickle
4 | from tqdm import tqdm
5 | import json
6 |
7 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
8 | def read_data(path : str):
9 | with open(path, 'r') as fp:
10 | data = fp.readlines()
11 | fp.close()
12 | data_gp = []
13 | for idx in range(0, len(data), 3):
14 | sentence = data[idx].strip()
15 | term = data[idx+1].strip()
16 | polarity = eval(data[idx+2].strip())
17 | data_gp.append([sentence, term, polarity])
18 | return data_gp
19 |
20 | def construct_data(data : list):
21 | nlp = spacy.load('de')
22 | out_data = []
23 | for text in tqdm(data, desc='Processing'):
24 | sentence, term, polarity = text[0], text[1], text[2]
25 | if len(term) <1:continue
26 | document = nlp(sentence.replace('$T$', term))
27 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document])
28 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])])
29 | term_tok = [tok.text for tok in nlp(term)]
30 | from_to = [[from_idx, from_idx + len(term_tok)]]
31 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document]
32 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document]
33 | obj = {}
34 | obj['token'] = list(tokens)
35 | obj['pos'] = list(pos_tag)
36 | obj['head'] = predicted_heads
37 | obj['deprel'] = predicted_dependencies
38 | obj['aspects'] = [{
39 | 'term': term_tok,
40 | 'from' : from_to[0][0],
41 | 'to' : from_to[0][1],
42 | 'polarity' : MAP_INV[polarity]
43 | }]
44 | out_data.append(obj)
45 | return out_data
46 |
47 | if __name__ == '__main__':
48 | train_data = read_data('germeval_train.raw')
49 | train_data = construct_data(train_data)
50 | with open('train.json', 'w', encoding='utf-8') as tr_fp:
51 | json_str = json.dumps(train_data, indent=4)
52 | tr_fp.write(json_str)
53 | tr_fp.close()
54 |
55 | valid_data = read_data('germeval_valid.raw')
56 | valid_data = construct_data(valid_data)
57 | with open('valid.json', 'w', encoding='utf-8') as va_fp:
58 | json_str = json.dumps(valid_data, indent=4)
59 | va_fp.write(json_str)
60 | va_fp.close()
61 |
62 | test_data = read_data('germeval_test.raw')
63 | test_data = construct_data(test_data)
64 | with open('test.json', 'w', encoding='utf-8') as te_fp:
65 | json_str = json.dumps(test_data, indent=4)
66 | te_fp.write(json_str)
67 | te_fp.close()
68 | print('WELL DONE.')
69 |
--------------------------------------------------------------------------------
/data/Colloquial/baseline/KumaGCN_germeval_gengraph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import spacy
3 | import pickle
4 |
5 | nlp = spacy.load('de')
6 |
7 | def dependency_adj_matrix(text):
8 | print(text)
9 | document = nlp(text)
10 | print("[tlog] document: " + str(document))
11 | # sys.exit(0)
12 | seq_len = len(text.split())
13 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
14 | pos = []
15 | dep_rel = []
16 | i = 0
17 | for sentence in document.sentences:
18 | print("[tlog] sentence: " + str(sentence))
19 | for word in sentence.words:
20 | if word.index + i < seq_len: # there are some bugs for here such as SPACE
21 | pos.append(word.pos)
22 | dep_rel.append(word.dependency_relation)
23 | if word.index + i < seq_len:
24 | index = word.index + i
25 | head_index = word.governor + i
26 | matrix[index][index] = 1
27 |
28 | matrix[head_index][index] = 1
29 | matrix[index][head_index] = 1
30 |
31 | i += len(sentence.words)
32 | return matrix, pos, dep_rel
33 |
34 |
35 | def dependency_adj_matrix2(text):
36 | # https://spacy.io/docs/usage/processing-text
37 | # print("[tlog] text: " + str(text)) # Maybe for parsing, we should not lower case this
38 | document = nlp(text)
39 | # print("[tlog] document: " + str(document))
40 | # sys.exit(0)
41 | seq_len = len(text.split())
42 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
43 | pos = []
44 | dep_rel = []
45 | for token in document:
46 | if token.i < seq_len: # there are some bugs for here such as SPACE
47 | pos.append(token.tag_)
48 | dep_rel.append(token.dep_)
49 |
50 | if token.i < seq_len:
51 | matrix[token.i][token.i] = 1
52 | # https://spacy.io/docs/api/token
53 | for child in token.children: # tzy: do not distinguish the arc types
54 | if child.i < seq_len:
55 | matrix[token.i][child.i] = 1
56 | matrix[child.i][token.i] = 1
57 |
58 | return matrix, pos, dep_rel
59 |
60 |
61 | def process(filename):
62 | print(filename)
63 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
64 | lines = fin.readlines()
65 | fin.close()
66 | idx2graph = {}
67 | fout = open(filename + '.graph', 'wb')
68 | pos_out = open(filename + '.pos', 'w')
69 | rel_out = open(filename + '.rel', 'w')
70 | for i in range(0, len(lines), 3):
71 | text_left, _, text_right = [s.strip() for s in lines[i].partition("$T$")]
72 | aspect = lines[i + 1].strip()
73 | adj_matrix, pos, rel = dependency_adj_matrix2(text_left.strip() + ' ' + aspect + ' ' + text_right.strip())
74 | idx2graph[i] = adj_matrix
75 | pos_out.write(" ".join(pos) + "\n")
76 | rel_out.write(" ".join(rel) + "\n")
77 | pickle.dump(idx2graph, fout)
78 | fout.close()
79 |
80 |
81 | if __name__ == '__main__':
82 | process('./datasets/german/germeval_train.raw')
83 | process('./datasets/german/germeval_valid.raw')
84 | process('./datasets/german/germeval_test.raw')
85 |
86 |
--------------------------------------------------------------------------------
/data/Colloquial/baseline/RGAT_germeval_txt2json.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | from tqdm import tqdm
3 | import json
4 |
5 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
6 | def read_data(path : str):
7 | with open(path, 'r') as fp:
8 | data = fp.readlines()
9 | fp.close()
10 | data_gp = []
11 | for idx in range(0, len(data), 3):
12 | sentence = data[idx].strip()
13 | term = data[idx+1].strip()
14 | polarity = eval(data[idx+2].strip())
15 | data_gp.append([sentence, term, polarity])
16 | return data_gp
17 |
18 | def construct_data(data : list):
19 | nlp = spacy.load('de')
20 | out_data = []
21 | for text in tqdm(data, desc='Processing'):
22 | sentence, term, polarity = text[0], text[1], text[2]
23 | document = nlp(sentence.replace('$T$', term))
24 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document])
25 | sentence_post = sentence.replace('$T$', term)
26 | aspect_sentiment = [[term, MAP_INV[polarity]]]
27 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])])
28 | term_tok = len([tok.text for tok in nlp(term)])
29 | from_to = [[from_idx, from_idx + term_tok]]
30 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document]
31 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document]
32 | dependencies = [list(ele) for ele in list(zip(predicted_dependencies, predicted_heads, list(range(1, len(predicted_heads) + 1))))]
33 | obj = {}
34 | obj['sentence'] = sentence_post
35 | obj['tokens'] = list(tokens)
36 | obj['tags'] = list(pos_tag)
37 | obj['predicted_dependencies'] = predicted_dependencies
38 | obj['predicted_heads'] = predicted_heads
39 | obj['dependencies'] = dependencies
40 | obj['aspect_sentiment'] = aspect_sentiment
41 | obj['from_to'] = from_to
42 | out_data.append(obj)
43 | return out_data
44 |
45 | if __name__ == '__main__':
46 | train_data = read_data('germeval_train.raw')
47 | train_data = construct_data(train_data)
48 | with open('germeval_Train.json', 'w', encoding='utf-8') as tr_fp:
49 | json_str = json.dumps(train_data, indent=4)
50 | tr_fp.write(json_str)
51 | tr_fp.close()
52 |
53 | valid_data = read_data('germeval_valid.raw')
54 | valid_data = construct_data(valid_data)
55 | with open('germeval_Valid.json', 'w', encoding='utf-8') as va_fp:
56 | json_str = json.dumps(valid_data, indent=4)
57 | va_fp.write(json_str)
58 | va_fp.close()
59 |
60 | test_data = read_data('germeval_test.raw')
61 | test_data = construct_data(test_data)
62 | with open('germeval_Test.json', 'w', encoding='utf-8') as te_fp:
63 | json_str = json.dumps(test_data, indent=4)
64 | te_fp.write(json_str)
65 | te_fp.close()
66 | print('WELL DONE.')
67 |
--------------------------------------------------------------------------------
/data/Colloquial/download_and_process_colloquial.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DATA_DIR=$1
4 | OUTPUT_DIR=$2
5 | STANFORD_DIR=$3
6 | EXECUTE_DIR=$4
7 |
8 | if test -z "$DATA_DIR"
9 | then
10 | DATA_DIR='.'
11 | fi
12 |
13 | if test -z "$OUTPUT_DIR"
14 | then
15 | OUTPUT_DIR='dataset'
16 | fi
17 |
18 | if test -z "$STANFORD_DIR"
19 | then
20 | STANFORD_DIR='../stanford-corenlp-3.9.2-minimal'
21 | fi
22 |
23 | if test -z "$EXECUTE_DIR"
24 | then
25 | EXECUTE_DIR='.'
26 | fi
27 |
28 | echo "Download colloquial data (Twitter) in mirror source of ZJU MMF"
29 | echo "Origin data for Twitter can be found in https://github.com/songyouwei/ABSA-PyTorch/tree/master/datasets/acl-14-short-data"
30 |
31 | TWITTER_TRAIN_FILE=${DATA_DIR}/twitter_train.raw
32 | TWITTER_TEST_FILE=${DATA_DIR}/twitter_test.raw
33 | TWITTER_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Twitter
34 |
35 | wget -O $TWITTER_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/Twitter_train.raw
36 | wget -O $TWITTER_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/Twitter_test.raw
37 |
38 | echo "Process raw data of Twitter to HyCxG format"
39 | python $EXECUTE_DIR/process_twitter.py --train_file $TWITTER_TRAIN_FILE --test_file $TWITTER_TEST_FILE \
40 | --out_path $TWITTER_OUT_DIR \
41 | --stanford_path $STANFORD_DIR
42 |
43 | echo "Download colloquial data (GermEval) in mirror source of ZJU MMF"
44 | echo "Origin data for GermEval can be found in http://ltdata1.informatik.uni-hamburg.de/germeval2017/"
45 | echo "Note: the GermEval dataset in our experiment is a subset. if you want to reporduce the experiment, you may download data via this script."
46 |
47 | GERMEVAL_TRAIN_FILE=${DATA_DIR}/germeval_train.raw
48 | GERMEVAL_VALID_FILE=${DATA_DIR}/germeval_valid.raw
49 | GERMEVAL_TEST_FILE=${DATA_DIR}/germeval_test.raw
50 | GERMEVAL_OUT_DIR=${OUTPUT_DIR}/JSONABSA_German
51 |
52 | wget -O $GERMEVAL_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/germeval_train.raw
53 | wget -O $GERMEVAL_VALID_FILE https://expic.xlxw.org/hycxg/datamirror/germeval_valid.raw
54 | wget -O $GERMEVAL_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/germeval_test.raw
55 |
56 | echo "Process raw data of GermEval to HyCxG format"
57 | python $EXECUTE_DIR/process_germeval.py --train_file $GERMEVAL_TRAIN_FILE --valid_file $GERMEVAL_VALID_FILE --test_file $GERMEVAL_TEST_FILE \
58 | --out_path $GERMEVAL_OUT_DIR
--------------------------------------------------------------------------------
/data/Colloquial/process_germeval.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | import sys
4 | sys.path.append('..')
5 | import json
6 | import os
7 | import numpy as np
8 | import pandas as pd
9 | import argparse
10 | from tqdm import tqdm
11 | import spacy
12 |
13 | MAP_POLARITY = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
14 |
15 | def initialize_spacy(lang: str='de'):
16 | try: nlpmodel = spacy.load(lang)
17 | except: raise RuntimeError('The script requires the spacy>=2.3.5 package and its German model; please run `pip install spacy` first.')
18 | return nlpmodel
19 |
20 | def convert_raw2json(path : str, nlpmodel, desc: str='train'):
21 | data = []
22 | with open(path, 'r', encoding='utf-8') as fp:
23 | raw_data = fp.readlines()
24 | fp.close()
25 | for idx in tqdm(range(0, len(raw_data), 3), desc='Process {} file in GermEval'.format(desc)):
26 | obj = {}
27 | sentence = raw_data[idx].strip()
28 | target = raw_data[idx + 1].strip()
29 | polarity = MAP_POLARITY[eval(raw_data[idx + 2].strip())]
30 | if '$T$' not in sentence:
31 | print('Error sentence : %s' % sentence)
32 | continue
33 | post_sentence = sentence.replace('$T$', target)
34 | document = nlpmodel(post_sentence)
35 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document])
36 | obj['token'] = list(tokens)
37 | pos_tag = list(pos_tag)
38 | obj['pos'] = pos_tag
39 | heads = [token.head.i + 1 if token.dep_ != 'ROOT' else 0 for token in document]
40 | rels = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document]
41 | obj['head'] = heads
42 | obj['deprel'] = rels
43 | term, targl = [tok.text for tok in nlpmodel(target)], [tok.text for tok in nlpmodel(sentence.split('$T$')[0])]
44 | obj['aspects'] = [{
45 | 'term' : term,
46 | 'from' : len(targl),
47 | 'to' : len(targl) + len(term),
48 | 'polarity' : polarity
49 | }]
50 | data.append(obj)
51 | return data
52 |
53 | def read_semeval_data(path : str):
54 | data_out = []
55 | data = np.array(pd.read_csv(path))
56 | for dat in data: data_out.append([dat[0], dat[2], dat[1]])
57 | return data_out
58 |
59 | def output_json(data: list, folder_path: str, file_path: str):
60 | if not os.path.exists(folder_path): os.makedirs(folder_path)
61 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp:
62 | fp.write(json.dumps(data, indent=4))
63 | fp.close()
64 |
65 | def process_data(args: argparse.Namespace):
66 | assert os.path.exists(args.train_file) and os.path.exists(args.valid_file) and os.path.exists(args.test_file), "The data path does not exist, please download the data first."
67 | train_json = convert_raw2json(args.train_file, args.nlpmodel)
68 | output_json(train_json, args.out_path, 'train.json')
69 | print('The train file of GermEval dataset is saved at %s' % os.path.join(args.out_path, 'train.json'))
70 |
71 | valid_json = convert_raw2json(args.valid_file, args.nlpmodel, desc='valid')
72 | output_json(valid_json, args.out_path, 'valid.json')
73 | print('The valid file of GermEval dataset is saved at %s' % os.path.join(args.out_path, 'valid.json'))
74 |
75 | test_json = convert_raw2json(args.test_file, args.nlpmodel, desc='test')
76 | output_json(test_json, args.out_path, 'test.json')
77 | print('The test file of GermEval dataset is saved at %s' % os.path.join(args.out_path, 'test.json'))
78 |
79 | def main():
80 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
81 | parser.add_argument("--out_path", default='dataset/JSONABSA_German', type=str, help="Output path of GermEval dataset.")
82 | parser.add_argument("--train_file", default='germeval_train.raw', type=str, help="The path of train file.")
83 | parser.add_argument("--valid_file", default='germeval_valid.raw', type=str, help="The path of valid file.")
84 | parser.add_argument("--test_file", default='germeval_test.raw', type=str, help="The path of test file.")
85 | args = parser.parse_args()
86 | args.nlpmodel = initialize_spacy()
87 | process_data(args)
88 | print('GermEval data has been processed.')
89 |
90 | if __name__ == '__main__':
91 | main()
92 |
--------------------------------------------------------------------------------
/data/Colloquial/process_twitter.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | import sys
4 | import json
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 | import argparse
9 | from tqdm import tqdm
10 | from stanfordcorenlp import StanfordCoreNLP
11 | try:
12 | sys.path.append('.')
13 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK
14 | except:
15 | sys.path.append('..')
16 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK
17 |
18 |
19 | MAP_POLARITY = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
20 |
21 | def initialize_stanfordcore(stanford_path: str):
22 | try: nlpmodel = StanfordCoreNLP(stanford_path)
23 | except:
24 | print('The script requires the Stanford CoreNLP parser (>=3.9.2), but it was not found at the given path. Do you want to download it? (Y/N)')
25 | download_flag = input()
26 | download_flag = download_flag.lower()
27 | assert download_flag in ['y'], "Abort"
28 | download_stanfordcore(STANFORD_CORE_LINK, stanford_path+'.zip')
29 | unzip_stanfordcore(stanford_path+'.zip', '../')
30 | nlpmodel = StanfordCoreNLP(stanford_path)
31 | return nlpmodel
32 |
33 | def convert_raw2json(path : str, nlpmodel, desc: str='train'):
34 | def parse_adj(edge):
35 | e_id, dep_rels, dep_heads = 1, [], []
36 | for eidx in range(len(edge)):
37 | if (eidx + 1) != edge[0][2]:
38 | dep_heads.append(edge[e_id][1])
39 | dep_rels.append(edge[e_id][0])
40 | e_id += 1
41 | else:
42 | dep_heads.append(0)
43 | dep_rels.append(edge[0][0])
44 | return dep_heads, dep_rels
45 |
46 | data = []
47 | with open(path, 'r', encoding='utf-8') as fp:
48 | raw_data = fp.readlines()
49 | fp.close()
50 | for idx in tqdm(range(0, len(raw_data), 3), desc='Process {} file in Twitter'.format(desc)):
51 | obj = {}
52 | sentence = raw_data[idx].strip()
53 | target = raw_data[idx + 1].strip()
54 | polarity = MAP_POLARITY[eval(raw_data[idx + 2].strip())]
55 | if '$T$' not in sentence:
56 | print('Error sentence : %s' % sentence)
57 | continue
58 | post_sentence = sentence.replace('$T$', target)
59 | obj['token'] = nlpmodel.word_tokenize(post_sentence)
60 | pos_tag = nlpmodel.pos_tag(post_sentence)
61 | dependency = nlpmodel.dependency_parse(post_sentence)
62 | obj['pos'] = [tag[1] for tag in pos_tag]
63 | heads, rels = parse_adj(dependency)
64 | obj['head'] = heads
65 | obj['deprel'] = rels
66 | obj['aspects'] = [{
67 | 'term' : nlpmodel.word_tokenize(target),
68 | 'from' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])),
69 | 'to' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])) + len(nlpmodel.word_tokenize(target)),
70 | 'polarity' : polarity
71 | }]
72 | data.append(obj)
73 | return data
74 |
75 | def read_semeval_data(path : str):
76 | data_out = []
77 | data = np.array(pd.read_csv(path))
78 | for dat in data: data_out.append([dat[0], dat[2], dat[1]])
79 | return data_out
80 |
81 | def output_json(data: list, folder_path: str, file_path: str):
82 | if not os.path.exists(folder_path): os.makedirs(folder_path)
83 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp:
84 | fp.write(json.dumps(data, indent=4))
85 | fp.close()
86 |
87 | def process_data(args: argparse.Namespace):
88 | assert os.path.exists(args.train_file) and os.path.exists(args.test_file), "The data path does not exist, please download the data first."
89 | train_json = convert_raw2json(args.train_file, args.nlpmodel)
90 | output_json(train_json, args.out_path, 'train.json')
91 | print('The train file of Twitter dataset is saved at %s' % os.path.join(args.out_path, 'train.json'))
92 |
93 | test_json = convert_raw2json(args.test_file, args.nlpmodel, desc='test')
94 | output_json(test_json, args.out_path, 'test.json')
95 | print('The test file of Twitter dataset is saved at %s' % os.path.join(args.out_path, 'test.json'))
96 |
97 | def main():
98 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
99 | parser.add_argument("--out_path", default='dataset/JSONABSA_Twitter', type=str, help="Output path of twitter dataset.")
100 | parser.add_argument("--stanford_path", default='stanford-corenlp-3.9.2-minimal', type=str, help="The path for stanfordparser.")
101 | parser.add_argument("--train_file", default='twitter_train.raw', type=str, help="The path of train file.")
102 | parser.add_argument("--test_file", default='twitter_test.raw', type=str, help="The path of test file.")
103 | args = parser.parse_args()
104 | args.nlpmodel = initialize_stanfordcore(args.stanford_path)
105 | process_data(args)
106 | print('Twitter data has been processed.')
107 |
108 | if __name__ == '__main__':
109 | main()
110 |
--------------------------------------------------------------------------------
/data/Counterfactual/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual/README_ZH.md)
17 |
18 | ## Counterfactual Detection Dataset
19 |
20 | The Counterfactual Recognition (CR) dataset is derived from Subtask 1 - Recognizing Counterfactual Statements (RCS) of SemEval2020 Task5. The data is collected from the domains of politics, finance, and health, and consists of 13k training instances and 7k test instances.
21 |
22 | ### Download and process the data
23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: you can omit all parameters, since each has a default value):
24 | ```shell
25 | bash download_and_process_counterfactual.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
26 | ```
27 | **Parameters:**
28 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder.
29 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `JSON_Counterfactual`.
30 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory.
31 |
32 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed.
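
The bracketed names map to the script's positional arguments. For example, the following call (equivalent to the defaults) downloads the two CSV files into the current folder, writes the processed JSON to `JSON_Counterfactual`, and reuses the shared parser from the parent directory:
```shell
bash download_and_process_counterfactual.sh . JSON_Counterfactual ../stanford-corenlp-3.9.2-minimal
```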
33 |
34 | ### Resource of data
35 | This dataset was proposed in SemEval2020 Task5, and our mirrored data comes from the official source [SemEval2020_Task5](https://github.com/Jiaqi1008/SemEval2020_Task5). If you also use this dataset, you can cite their paper:
36 | ```
37 | @inproceedings{yang-etal-2020-semeval,
38 | title = "{S}em{E}val-2020 Task 5: Counterfactual Recognition",
39 | author = "Yang, Xiaoyu and
40 | Obadinma, Stephen and
41 | Zhao, Huasha and
42 | Zhang, Qiong and
43 | Matwin, Stan and
44 | Zhu, Xiaodan",
45 | booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
46 | year = "2020",
47 | publisher = "International Committee for Computational Linguistics",
48 | url = "https://aclanthology.org/2020.semeval-1.40",
49 | doi = "10.18653/v1/2020.semeval-1.40",
50 | pages = "322--335",
51 | }
52 | ```
--------------------------------------------------------------------------------
/data/Counterfactual/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual/README_ZH.md)
16 | ## Counterfactual Detection Dataset
17 |
18 |
19 |
20 | The Counterfactual Recognition (CR) dataset comes from Subtask 1 - Recognizing Counterfactual Statements (RCS) of SemEval2020 Task5. The data was collected from the politics, finance, and health domains and consists of 13k training instances and 7k test instances.
21 |
22 | ### Download and process the data
23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (you can omit all parameters, since each has a default value):
24 | ```shell
25 | bash download_and_process_counterfactual.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
26 | ```
27 | **Parameters:**
28 | + DATA_DIR: The folder where the downloaded raw data is located. Defaults to the current folder.
29 | + OUTPUT_DIR: The folder where the processed data is stored. Defaults to `JSON_Counterfactual`.
30 | + STANFORD_DIR: The location of the Stanford parser. Defaults to `stanford-corenlp-3.9.2-minimal` in the parent directory.
31 |
32 | **Note:** If the shared parser directory does not exist, the program will ask whether to fetch the Stanford parser from our mirror data source (353MB in total); choose `Y` to proceed.
33 |
34 | ### Resource of data
35 | This dataset comes from SemEval2020 Task5, and our mirror data comes from the official GitHub repository [SemEval2020_Task5](https://github.com/Jiaqi1008/SemEval2020_Task5). If you also use this dataset, you can cite their paper:
36 | ```
37 | @inproceedings{yang-etal-2020-semeval,
38 | title = "{S}em{E}val-2020 Task 5: Counterfactual Recognition",
39 | author = "Yang, Xiaoyu and
40 | Obadinma, Stephen and
41 | Zhao, Huasha and
42 | Zhang, Qiong and
43 | Matwin, Stan and
44 | Zhu, Xiaodan",
45 | booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
46 | year = "2020",
47 | publisher = "International Committee for Computational Linguistics",
48 | url = "https://aclanthology.org/2020.semeval-1.40",
49 | doi = "10.18653/v1/2020.semeval-1.40",
50 | pages = "322--335",
51 | }
52 | ```
--------------------------------------------------------------------------------
/data/Counterfactual/download_and_process_counterfactual.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DATA_DIR=$1
4 | OUTPUT_DIR=$2
5 | STANFORD_DIR=$3
6 | EXECUTE_DIR=$4
7 |
8 | if test -z "$DATA_DIR"
9 | then
10 | DATA_DIR='.'
11 | fi
12 |
13 | if test -z "$OUTPUT_DIR"
14 | then
15 | OUTPUT_DIR='JSON_Counterfactual'
16 | fi
17 |
18 | if test -z "$STANFORD_DIR"
19 | then
20 | STANFORD_DIR='../stanford-corenlp-3.9.2-minimal'
21 | fi
22 |
23 | if test -z "$EXECUTE_DIR"
24 | then
25 | EXECUTE_DIR='.'
26 | fi
27 |
28 | echo "Download counterfactual data in mirror source of ZJU MMF"
29 | echo "Origin data can be found in https://github.com/Jiaqi1008/SemEval2020_Task5"
30 |
31 | TRAIN_FILE=${DATA_DIR}/counterfactual_train.csv
32 | TEST_FILE=${DATA_DIR}/counterfactual_test.csv
33 | wget -O $TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/subtask1_train.csv
34 | wget -O $TEST_FILE https://expic.xlxw.org/hycxg/datamirror/subtask1_test.csv
35 |
36 | echo "Process csv data to HyCxG format"
37 | python $EXECUTE_DIR/process_counterfactual.py --train_file $TRAIN_FILE --test_file $TEST_FILE \
38 | --out_path $OUTPUT_DIR \
39 | --stanford_path $STANFORD_DIR
40 |
--------------------------------------------------------------------------------
/data/Counterfactual/process_counterfactual.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | import sys
4 | import json
5 | import os
6 | import numpy as np
7 | import pandas as pd
8 | import argparse
9 | from tqdm import tqdm
10 | from stanfordcorenlp import StanfordCoreNLP
11 | try:
12 | sys.path.append('.')
13 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK
14 | except:
15 | sys.path.append('..')
16 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK
17 |
18 | def initialize_stanfordcore(stanford_path: str):
19 | try: nlpmodel = StanfordCoreNLP(stanford_path)
20 | except:
21 | print('The script requires the Stanford CoreNLP parser (>=3.9.2), but it was not found at the given path. Do you want to download it? (Y/N)')
22 | download_flag = input()
23 | download_flag = download_flag.lower()
24 | assert download_flag in ['y'], "Abort"
25 | download_stanfordcore(STANFORD_CORE_LINK, stanford_path+'.zip')
26 | unzip_stanfordcore(stanford_path+'.zip', '../')
27 | nlpmodel = StanfordCoreNLP(stanford_path)
28 | return nlpmodel
29 |
30 | def data2json(data: list, nlpmodel, desc:str='train'):
31 | def parse_adj(edge):
32 | e_id, dep_rels, dep_heads = 1, [], []
33 | for eidx in range(len(edge)):
34 | if (eidx + 1) != edge[0][2]:
35 | dep_heads.append(edge[e_id][1])
36 | dep_rels.append(edge[e_id][0])
37 | e_id += 1
38 | else:
39 | dep_heads.append(0)
40 | dep_rels.append(edge[0][0])
41 | return dep_heads, dep_rels
42 |
43 | data_out = []
44 | for dat in tqdm(data, desc='Process {} file in Counterfactual'.format(desc)):
45 | obj ={}
46 | gid =dat[0]
47 | obj['id'] = gid
48 | sentence = dat[1].strip()
49 | if sentence == '#NAME?': continue
50 | label = dat[2]
51 | pos_tag = nlpmodel.pos_tag(sentence)
52 | dependency = nlpmodel.dependency_parse(sentence)
53 | obj['token'] = nlpmodel.word_tokenize(sentence)
54 | obj['pos'] = [tag[1] for tag in pos_tag]
55 | heads, rels = parse_adj(dependency)
56 | obj['head'] = heads
57 | obj['deprel'] = rels
58 | obj['label'] = label
59 | data_out.append(obj)
60 | return data_out
61 |
62 | def read_semeval_data(path : str):
63 | data_out = []
64 | data = np.array(pd.read_csv(path))
65 | for dat in data: data_out.append([dat[0], dat[2], dat[1]])
66 | return data_out
67 |
68 | def output_json(data: list, folder_path: str, file_path: str):
69 | if not os.path.exists(folder_path): os.makedirs(folder_path)
70 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp:
71 | fp.write(json.dumps(data, indent=4))
72 | fp.close()
73 |
74 | def process_data(args: argparse.Namespace):
75 | assert os.path.exists(args.train_file) and os.path.exists(args.test_file), "The data path does not exist, please download the data first."
76 | train_semeval_data = read_semeval_data(args.train_file)
77 | train_json = data2json(train_semeval_data, args.nlpmodel)
78 | output_json(train_json, args.out_path, 'train.json')
79 | print('The train file of Counterfactual dataset is saved at %s' % os.path.join(args.out_path, 'train.json'))
80 |
81 | test_semeval_data = read_semeval_data(args.test_file)
82 | test_json = data2json(test_semeval_data, args.nlpmodel, desc='test')
83 | output_json(test_json, args.out_path, 'test.json')
84 | print('The test file of Counterfactual dataset is saved at %s' % os.path.join(args.out_path, 'test.json'))
85 |
86 | def main():
87 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
88 | parser.add_argument("--out_path", default='dataset/GLUE_Counterfactual', type=str, help="Output path of counterfactual dataset.")
89 | parser.add_argument("--stanford_path", default='stanford-corenlp-3.9.2-minimal', type=str, help="The path for stanfordparser.")
90 | parser.add_argument("--train_file", default='counterfactual_train.csv', type=str, help="The path of train file.")
91 | parser.add_argument("--test_file", default='counterfactual_test.csv', type=str, help="The path of test file.")
92 | args = parser.parse_args()
93 | args.nlpmodel = initialize_stanfordcore(args.stanford_path)
94 | process_data(args)
95 | print('Counterfactual data has been processed.')
96 |
97 | if __name__ == '__main__':
98 | main()
99 |
--------------------------------------------------------------------------------
/data/GLUE/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE/README_ZH.md)
17 |
18 | ## GLUE Benchmark
19 |
20 | The General Language Understanding Evaluation (GLUE) benchmark is a widely used collection of nine natural language understanding tasks built from existing datasets. The tasks are carefully selected to cover a diverse range of dataset sizes, text types, and difficulty levels. We evaluated performance on eight of these tasks: `CoLA` (linguistic acceptability), `SST-2` (sentiment analysis), `MRPC`/`STS-B`/`QQP` (semantic similarity computation and equivalence matching), and `MNLI`/`QNLI`/`RTE` (natural language inference).
21 |
22 | ### Download and process the data
23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (Note: you can omit all parameters, since each has a default value):
24 | ```shell
25 | bash download_and_process_glue.sh [--OUTPUT_DIR] [--STANFORD_DIR] [--TASK]
26 | ```
27 | **Parameters:**
28 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`.
29 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory.
30 | + TASK: The GLUE task to process; choose from [cola, sst2, mnli, qnli, qqp, rte, mrpc, stsb], or use `all` to download and process every task.
31 |
32 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed. In addition, the MNLI task is relatively special, so there will be two extra files (matched/mismatched) in the output.
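
The arguments are positional. For example, to process only CoLA into `dataset` while reusing the shared parser from the parent directory, or to process every supported task:
```shell
bash download_and_process_glue.sh dataset ../stanford-corenlp-3.9.2-minimal cola
bash download_and_process_glue.sh dataset ../stanford-corenlp-3.9.2-minimal all
```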
33 |
34 | ### Resource of data
35 | This data comes from the GLUE benchmark, and we obtain it through the Hugging Face datasets library [GLUE](https://huggingface.co/datasets?sort=downloads&search=glue). If you also use this benchmark, you can cite their paper:
36 | ```
37 | @inproceedings{wang2018glue,
38 | title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding",
39 | author = "Wang, Alex and
40 | Singh, Amanpreet and
41 | Michael, Julian and
42 | Hill, Felix and
43 | Levy, Omer and
44 | Bowman, Samuel",
45 | booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
46 | month = nov,
47 | year = "2018",
48 | url = "https://aclanthology.org/W18-5446",
49 | }
50 | ```
51 | Or, alternatively, the bib entry below:
52 | ```
53 | @inproceedings{wangglue,
54 | title={GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
55 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
56 | booktitle={International Conference on Learning Representations (ICLR)},
57 | year = "2019"
58 | }
59 | ```
--------------------------------------------------------------------------------
/data/GLUE/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE/README_ZH.md)
16 | ## GLUE Benchmark
17 |
18 | The General Language Understanding Evaluation (GLUE) benchmark is a widely used collection of nine sentence understanding tasks built from existing natural language understanding datasets; the tasks are carefully selected to cover a diverse range of dataset sizes, text types, and difficulty levels. We evaluated performance on eight of these tasks: `CoLA` (linguistic acceptability), `SST-2` (sentiment analysis), `MRPC`/`STS-B`/`QQP` (semantic similarity computation and equivalence matching), and `MNLI`/`QNLI`/`RTE` (natural language inference).
19 | ### Download and process the data
20 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (you can omit all parameters, since each has a default value):
21 | ```shell
22 | bash download_and_process_glue.sh [--OUTPUT_DIR] [--STANFORD_DIR] [--TASK]
23 | ```
24 | **Parameters:**
25 | + OUTPUT_DIR: The folder where the processed data is stored. Defaults to `dataset`.
26 | + STANFORD_DIR: The location of the Stanford parser. Defaults to `stanford-corenlp-3.9.2-minimal` in the parent directory.
27 | + TASK: The GLUE task to process; choose from [cola, sst2, mnli, qnli, qqp, rte, mrpc, stsb], or use `all` to download and process every task.
28 |
29 | **Note:** If the shared parser directory does not exist, the program will ask whether to fetch the Stanford parser from our mirror data source (353MB in total); choose `Y` to proceed. In addition, the `MNLI` task is special, so its output contains two extra files (matched/mismatched).
30 |
31 | ### Resource of data
32 | This data comes from the GLUE benchmark, and we obtain it through the Hugging Face datasets library [GLUE](https://huggingface.co/datasets?sort=downloads&search=glue). If you also use this benchmark, you can cite their paper:
33 | ```
34 | @inproceedings{wang2018glue,
35 | title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding",
36 | author = "Wang, Alex and
37 | Singh, Amanpreet and
38 | Michael, Julian and
39 | Hill, Felix and
40 | Levy, Omer and
41 | Bowman, Samuel",
42 | booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
43 | month = nov,
44 | year = "2018",
45 | url = "https://aclanthology.org/W18-5446",
46 | }
47 | ```
48 | Or, alternatively, the bib entry below:
49 | ```
50 | @inproceedings{wangglue,
51 | title={GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
52 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
53 | booktitle={International Conference on Learning Representations (ICLR)},
54 | year = "2019"
55 | }
56 | ```
--------------------------------------------------------------------------------
/data/GLUE/download_and_process_glue.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | OUTPUT_DIR=$1
4 | STANFORD_DIR=$2
5 | TASK=$3
6 | EXECUTE_DIR=$4
7 |
8 | if test -z "$OUTPUT_DIR"
9 | then
10 | OUTPUT_DIR='dataset'
11 | fi
12 |
13 | if test -z "$STANFORD_DIR"
14 | then
15 | STANFORD_DIR='../stanford-corenlp-3.9.2-minimal'
16 | fi
17 |
18 | if test -z "$TASK"
19 | then
20 | TASK='all'
21 | fi
22 |
23 | if test -z "$EXECUTE_DIR"
24 | then
25 | EXECUTE_DIR='.'
26 | fi
27 |
28 | echo "Original data can be found in https://gluebenchmark.com/"
29 |
30 | echo "Process data to HyCxG format (depend on Hugging Face)"
31 | python $EXECUTE_DIR/download_and_process_glue.py --task $TASK \
32 | --out_path $OUTPUT_DIR \
33 | --stanford_path $STANFORD_DIR
34 |
--------------------------------------------------------------------------------
/data/Multilingual/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/README_ZH.md)
17 |
18 | ## Multilingual Dataset for ABSA
19 |
20 | The Multilingual sentiment analysis dataset is based on Semeval 2016 Task 5, which provides data in 8 different languages from various domains. We selected 4 different languages from the Restaurant domain for the multilingual performance evaluation, including `French`, `Spanish`, `Turkish`, and `Dutch`.
21 |
22 | ### Download and process the data
23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: you can omit all parameters, since each has a default value):
24 | ```shell
25 | bash download_and_process_multilingual.sh [--DATA_DIR] [--OUTPUT_DIR]
26 | ```
27 | **Parameters:**
28 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder.
29 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`.
30 |
31 | **Note:** The `PORT` variable in the script represents the startup port of the Stanza server, which is set to `9000` by default. If this port is already in use on your machine, you need to manually set a different port in the script.
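
Assuming the same positional-argument convention as the other download scripts in this repository, a call equivalent to the defaults would look like the sketch below:
```shell
bash download_and_process_multilingual.sh . dataset
```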
32 |
33 | ### Data processing for baseline models
34 | In Section 4.3 - Multilingual results of our paper, we compared the performance of four models, namely RGAT, DualGCN, DGEDT, and KumaGCN, on these multilingual sentiment datasets. Therefore, we provide conversion scripts for processing the data in the [`baseline`](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/baseline) folder, as sketched below. For more information on reproducing the baseline models, please refer to the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
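
As a rough sketch (assuming the SemEval-2016 `.raw` files have been placed under the `./datasets/<language>/` paths hard-coded in each script, and that the corresponding spaCy or stanfordnlp models are installed), the conversion scripts can be run directly, e.g.:
```shell
# spaCy-based languages: edit the spacy.load(...) line for fr / es / nl first
python baseline/DGEDT_french_dutch_spanish.py
# Turkish uses the stanfordnlp pipeline; its 'tr' models must already be downloaded
python baseline/DGEDT_turkish.py
```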
35 |
36 | ### Resource of data
37 | Our mirror data is obtained from the official SemEval 2016 data source [SemEval-2016 Task 5](https://alt.qcri.org/semeval2016/task5/). If you also use this dataset, you can cite their paper as follows:
38 | ```
39 | @inproceedings{pontiki2016semeval,
40 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis",
41 | author = {Pontiki, Maria and
42 | Galanis, Dimitris and
43 | Papageorgiou, Haris and
44 | Androutsopoulos, Ion and
45 | Manandhar, Suresh and
46 | AL-Smadi, Mohammad and
47 | Al-Ayyoub, Mahmoud and
48 | Zhao, Yanyan and
49 | Qin, Bing and
50 | De Clercq, Orph{\'e}e and
51 | Hoste, V{\'e}ronique and
52 | Apidianaki, Marianna and
53 | Tannier, Xavier and
54 | Loukachevitch, Natalia and
55 | Kotelnikov, Evgeniy and
56 | Bel, Nuria and
57 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and
58 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en},
59 | booktitle = "{S}em{E}val-2016",
60 | year = "2016",
61 | url = "https://aclanthology.org/S16-1002",
62 | pages = "19--30",
63 | }
64 | ```
65 |
--------------------------------------------------------------------------------
/data/Multilingual/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/README_ZH.md)
16 | ## Multilingual Dataset for ABSA
17 |
18 | The multilingual sentiment analysis dataset is based on SemEval 2016 Task 5, which, in addition to English, provides data in 8 languages across several domains. From the Restaurant domain we selected 4 languages as the multilingual evaluation data: `French`, `Spanish`, `Turkish`, and `Dutch`.
19 |
20 | ### Download and process the data
21 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (you can omit all parameters, since each has a default value):
22 | ```shell
23 | bash download_and_process_multilingual.sh [--DATA_DIR] [--OUTPUT_DIR]
24 | ```
25 | **Parameters:**
26 | + DATA_DIR: The folder where the downloaded raw data is located. Defaults to the current folder.
27 | + OUTPUT_DIR: The folder where the processed data is stored. Defaults to `dataset`.
28 |
29 | **Note:** The `PORT` variable in the script is the startup port of the Stanza server, `9000` by default. If this port is already in use on your machine, you need to set a different port in the script manually.
30 |
31 | ### Data processing for baseline models
32 | In Section 4.3 - Multilingual results of our paper, we compared the performance of RGAT, DualGCN, DGEDT, and KumaGCN on the multilingual sentiment datasets, so the [`baseline`](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/baseline) folder provides conversion scripts for the four languages (we used processing toolkits as close to the official code as possible). For more information on reproducing the baseline models, please refer to the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
33 |
34 | ### Resource of data
35 | This data comes from SemEval 2016; our mirror data source is [SemEval-2016 Task 5](https://alt.qcri.org/semeval2016/task5/). If you also use this dataset, you can cite their paper:
36 | ```
37 | @inproceedings{pontiki2016semeval,
38 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis",
39 | author = {Pontiki, Maria and
40 | Galanis, Dimitris and
41 | Papageorgiou, Haris and
42 | Androutsopoulos, Ion and
43 | Manandhar, Suresh and
44 | AL-Smadi, Mohammad and
45 | Al-Ayyoub, Mahmoud and
46 | Zhao, Yanyan and
47 | Qin, Bing and
48 | De Clercq, Orph{\'e}e and
49 | Hoste, V{\'e}ronique and
50 | Apidianaki, Marianna and
51 | Tannier, Xavier and
52 | Loukachevitch, Natalia and
53 | Kotelnikov, Evgeniy and
54 | Bel, Nuria and
55 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and
56 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en},
57 | booktitle = "{S}em{E}val-2016",
58 | year = "2016",
59 | url = "https://aclanthology.org/S16-1002",
60 | pages = "19--30",
61 | }
62 | ```
63 |
--------------------------------------------------------------------------------
/data/Multilingual/baseline/DGEDT_french_dutch_spanish.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import spacy
3 | import pickle
4 | import tqdm
5 | nlp = spacy.load('fr') # fr - French / es - Spanish / nl - Dutch
6 | import re
7 |
8 | def tokenize(text):
9 | text=text.strip()
10 | text=re.sub(r' {2,}',' ',text)
11 | document = nlp(text)
12 | return [token.text for token in document]
13 |
14 | def update_edge(text,vocab):
15 | # https://spacy.io/docs/usage/processing-text
16 | document = nlp(text)
17 | seq_len = len(text.split())
18 | for token in document:
19 | if token.dep_ not in vocab:
20 | vocab[token.dep_]=len(vocab)
21 | return 0
22 | def dependency_adj_matrix(text,edge_vocab):
24 | # https://spacy.io/docs/usage/processing-text
24 | document = nlp(text.strip())
25 | seq_len = len(tokenize(text))
26 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
27 | matrix1 = np.zeros((seq_len, seq_len)).astype('float32')
28 | edge = np.zeros((seq_len, seq_len)).astype('int32')
29 | edge1 = np.zeros((seq_len, seq_len)).astype('int32')
30 | assert len(document)==seq_len
31 | for token in document:
32 | if token.i >= seq_len:
33 | print('bug')
34 | print(text)
35 | print(text.split())
36 | print(document)
37 | print([token.i for token in document])
38 | print([token.text for token in document])
39 | a=input('hahha')
40 | if token.i < seq_len:
41 | matrix[token.i][token.i] = 1
42 | matrix1[token.i][token.i] = 1
43 | # https://spacy.io/docs/api/token
44 | for child in token.children:
45 | if child.i < seq_len:
46 | matrix[token.i][child.i] = 1
47 | matrix1[child.i][token.i] = 1
48 | edge[token.i][child.i] = edge_vocab.get(child.dep_,1)
49 | edge1[child.i][token.i] = edge_vocab.get(child.dep_,1)
50 | return matrix,matrix1,edge,edge1
51 | def concat(texts,aspect):
52 | source=''
53 | splitnum=0
54 | for i,text in enumerate(texts):
55 | source+=text
56 | splitnum+=len(tokenize(text))
57 | if i<len(texts)-1:
58 | # re-insert the aspect term between the left and right context pieces
59 | source+=' '+aspect+' '
60 | splitnum+=len(tokenize(aspect))
61 | # sanity check: the concatenation should tokenize to the same length as its pieces
62 | if splitnum!=len(tokenize(source)):
63 | print('bug: tokenized length mismatch in concat')
64 | return re.sub(r' {2,}',' ',source.strip())
65 | def process(filename, edge_vocab, savevocab):
66 | # Build the dependency graphs (and optionally the dependency-edge vocabulary) for one
67 | # .raw file containing (sentence with $T$, aspect, polarity) triples on consecutive lines.
68 | # The first call passes edge_vocab=None with savevocab=True so the vocabulary is created
69 | # here; later calls reuse the returned vocabulary and only dump the graphs.
70 | # '<pad>' and '<unk>' occupy the first two indices of the edge vocabulary.
71 | if savevocab:
72 | edge_vocab={'<pad>':0,'<unk>':1}
73 |
74 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
75 | lines = fin.readlines()
76 | fin.close()
77 | idx2graph = {}
78 | fout = open(filename+'.graph', 'wb')
79 | if savevocab:
80 | fout1 = open(filename+'.edgevocab', 'wb')
81 | if savevocab:
82 | for i in tqdm.tqdm(range(0, len(lines), 3)):
83 | text_left = [s.lower().strip() for s in lines[i].split("$T$")]
84 | aspect = lines[i + 1].lower().strip()
85 | update_edge(concat(text_left,aspect),edge_vocab)
86 | for i in tqdm.tqdm(range(0, len(lines), 3)):
87 | text_left = [s.lower().strip() for s in lines[i].split("$T$")]
88 | aspect = lines[i + 1].lower().strip()
89 | adj_matrix = dependency_adj_matrix(concat(text_left,aspect),edge_vocab)
90 | idx2graph[i] = adj_matrix
91 | pickle.dump(idx2graph, fout)
92 | if savevocab:
93 | pickle.dump(edge_vocab, fout1)
94 | fout.close()
95 | if savevocab:
96 | fout1.close()
97 | return edge_vocab
98 | def processe(filename,filename2):
99 | savevocab=True
100 |
101 | edge_vocab={'<pad>':0,'<unk>':1}
102 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
103 | lines = fin.readlines()
104 | fin.close()
105 | idx2graph = {}
106 | fout = open(filename+'.graph', 'wb')
107 | if savevocab:
108 | fout1 = open(filename+'.edgevocab', 'wb')
109 | if savevocab:
110 | for i in tqdm.tqdm(range(0, len(lines), 1)):
111 | update_edge(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab)
112 | for i in tqdm.tqdm(range(0, len(lines), 1)):
113 | adj_matrix = dependency_adj_matrix(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab)
114 | idx2graph[i] = adj_matrix
115 | pickle.dump(idx2graph, fout)
116 | if savevocab:
117 | pickle.dump(edge_vocab, fout1)
118 | fout.close()
119 | if savevocab:
120 | fout1.close()
121 | return edge_vocab
122 | if __name__ == '__main__':
123 | # e.g., french
124 | edge_vocab = process('./datasets/french/restaurant_train.raw',None, True)
125 | process('./datasets/french/restaurant_test.raw', edge_vocab, False)
126 |
--------------------------------------------------------------------------------
/data/Multilingual/baseline/DGEDT_turkish.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pickle
3 | import tqdm
4 | import re
5 |
6 | import stanfordnlp
7 | nlpmodel = stanfordnlp.Pipeline(processors='tokenize,pos,depparse', lang="tr")
8 |
9 | def tokenize(text):
10 | text=text.strip()
11 | text=re.sub(r' {2,}',' ',text)
12 | document = nlpmodel(text)
13 | tokens = []
14 | for sentence in document.sentences:
15 | tok= [word.text for word in sentence.words]
16 | tokens.extend(tok)
17 | return tokens
18 |
19 | def update_edge(text,vocab):
20 | # https://spacy.io/docs/usage/processing-text
21 | document = nlpmodel(text)
22 | tokens = []
23 | for sentence in document.sentences:
24 | dep = [word.dependency_relation for word in sentence.words]
25 | tokens.extend(dep)
26 | seq_len = len(text.split())
27 | for token in tokens:
28 | if token not in vocab:
29 | vocab[token]=len(vocab)
30 | return 0
31 |
32 |
33 | def dependency_adj_matrix(text,edge_vocab):
34 | # https://spacy.io/docs/usage/processing-text
35 | document = nlpmodel(text)
36 | deprels = []
37 | for sentence in document.sentences:
38 | dep = [(word.dependency_relation, word.governor, eval(word.index)) for word in sentence.words]
39 | deprels.extend(dep)
40 | seq_len = len(tokenize(text))
41 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
42 | matrix1 = np.zeros((seq_len, seq_len)).astype('float32')
43 | edge = np.zeros((seq_len, seq_len)).astype('int32')
44 | edge1 = np.zeros((seq_len, seq_len)).astype('int32')
45 |
46 | for tokid in range(len(deprels)):
47 | matrix[tokid][tokid] = 1
48 | matrix1[tokid][tokid] = 1
49 |
50 | for link in deprels:
51 | if link[0] == 'root':
52 | continue
53 | matrix[link[1] - 1][link[2] - 1] = 1
54 | matrix1[link[2] - 1][link[1] - 1] = 1
55 | edge[link[1] - 1][link[2] - 1] = edge_vocab.get(link[0], 1)
56 | edge1[link[2] - 1][link[1] - 1] = edge_vocab.get(link[0],1)
57 | return matrix, matrix1, edge, edge1
58 |
59 | def concat(texts,aspect):
60 | source=''
61 | splitnum=0
62 | for i, text in enumerate(texts):
63 | source+=text
64 | if text == '80 tl.': text = '80 tl .'
65 | if len(text) < 1:
66 | splitnum +=0
67 | else:
68 | splitnum+=len(tokenize(text))
69 |         if i < len(texts) - 1:
70 |             source += ' ' + aspect + ' '
71 |             splitnum += len(tokenize(aspect))
72 |     return re.sub(r' {2,}', ' ', source.strip())
85 | def process(filename, edge_vocab, savevocab):
86 |     if savevocab:
87 |         edge_vocab={'<pad>':0,'<unk>':1}
88 | fin = open(filename, 'r', newline='\n', errors='ignore')
89 | lines = fin.readlines()
90 | fin.close()
91 | idx2graph = {}
92 | fout = open(filename+'.graph', 'wb')
93 | if savevocab:
94 | fout1 = open(filename+'.edgevocab', 'wb')
95 | if savevocab:
96 | for i in tqdm.tqdm(range(0, len(lines), 3)):
97 | text_left = [s.lower().strip() for s in lines[i].split("$T$")]
98 | aspect = lines[i + 1].lower().strip()
99 | concater = concat(text_left,aspect)
100 | update_edge(concater, edge_vocab)
101 | for i in tqdm.tqdm(range(0, len(lines), 3)):
102 | text_left = [s.lower().strip() for s in lines[i].split("$T$")]
103 | aspect = lines[i + 1].lower().strip()
104 | adj_matrix = dependency_adj_matrix(concat(text_left,aspect),edge_vocab)
105 | idx2graph[i] = adj_matrix
106 | pickle.dump(idx2graph, fout)
107 | if savevocab:
108 | pickle.dump(edge_vocab, fout1)
109 | fout.close()
110 | if savevocab:
111 | fout1.close()
112 | return edge_vocab
113 | def processe(filename,filename2):
114 | savevocab=True
115 |
116 | edge_vocab={'<pad>':0,'<unk>':1}
117 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
118 | lines = fin.readlines()
119 | fin.close()
120 | idx2graph = {}
121 | fout = open(filename+'.graph', 'wb')
122 | if savevocab:
123 | fout1 = open(filename+'.edgevocab', 'wb')
124 | if savevocab:
125 | for i in tqdm.tqdm(range(0, len(lines), 1)):
126 | update_edge(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab)
127 | for i in tqdm.tqdm(range(0, len(lines), 1)):
128 | adj_matrix = dependency_adj_matrix(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab)
129 | idx2graph[i] = adj_matrix
130 | pickle.dump(idx2graph, fout)
131 | if savevocab:
132 | pickle.dump(edge_vocab, fout1)
133 | fout.close()
134 | if savevocab:
135 | fout1.close()
136 | return edge_vocab
137 | if __name__ == '__main__':
138 | edge_vocab = process('./datasets/turkish/restaurant_train.raw', None, True)
139 | process('./datasets/turkish/restaurant_test.raw', edge_vocab, False)
140 |
--------------------------------------------------------------------------------
/data/Multilingual/baseline/DualGCN_french_dutch_spanish.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import spacy
3 | import pickle
4 | from tqdm import tqdm
5 | import json
6 | nlp = spacy.load('fr') # fr - french / es - spanish / nl - dutch
7 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
8 |
9 | def read_data(path : str):
10 | with open(path, 'r') as fp:
11 | data = fp.readlines()
12 | fp.close()
13 | data_gp = []
14 | for idx in range(0, len(data), 3):
15 | sentence = data[idx].strip()
16 | term = data[idx+1].strip()
17 | polarity = eval(data[idx+2].strip())
18 | data_gp.append([sentence, term, polarity])
19 | return data_gp
20 |
21 | def construct_data(data : list):
22 | out_data = []
23 | for text in tqdm(data, desc='Processing'):
24 | sentence, term, polarity = text[0], text[1], text[2]
25 | document = nlp(sentence.replace('$T$', term))
26 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document])
27 | sentence_post = sentence.replace('$T$', term)
28 | aspect_sentiment = [[term, MAP_INV[polarity]]]
29 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])])
30 | term_tok = [tok.text for tok in nlp(term)]
31 | from_to = [[from_idx, from_idx + len(term_tok)]]
32 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document]
33 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document]
34 | dependencies = [list(ele) for ele in list(zip(predicted_dependencies, predicted_heads, list(range(1, len(predicted_heads) + 1))))]
35 | obj = {}
36 | obj['token'] = list(tokens)
37 | obj['pos'] = list(pos_tag)
38 | obj['head'] = predicted_heads
39 | obj['deprel'] = predicted_dependencies
40 | obj['aspects'] = [{
41 | 'term': term_tok,
42 | 'from' : from_to[0][0],
43 | 'to' : from_to[0][1],
44 | 'polarity' : MAP_INV[polarity]
45 | }]
46 | out_data.append(obj)
47 | return out_data
48 |
49 | if __name__ == '__main__':
50 | train_data = read_data('restaurant_train.raw')
51 | train_data = construct_data(train_data)
52 | with open('train.json', 'w', encoding='utf-8') as tr_fp:
53 | json_str = json.dumps(train_data, indent=4)
54 | tr_fp.write(json_str)
55 | tr_fp.close()
56 |
57 | test_data = read_data('restaurant_test.raw')
58 | test_data = construct_data(test_data)
59 | with open('test.json', 'w', encoding='utf-8') as te_fp:
60 | json_str = json.dumps(test_data, indent=4)
61 | te_fp.write(json_str)
62 | te_fp.close()
63 | print('WELL DONE.')
64 |
--------------------------------------------------------------------------------
/data/Multilingual/baseline/DualGCN_turkish.py:
--------------------------------------------------------------------------------
1 | try:
2 | import xml.etree.cElementTree as ET
3 | except ImportError:
4 | import xml.etree.ElementTree as ET
5 |
6 | from tqdm import tqdm
7 | import json
8 | import stanfordnlp
9 | nlpmodel = stanfordnlp.Pipeline(processors='tokenize,pos,depparse', lang="tr")
10 |
11 | def parse_adj(edge):
12 | e_id, dep_rels, dep_heads = 1, [], []
13 | for eidx in range(len(edge)):
14 | if (eidx + 1) != edge[0][2]:
15 | dep_heads.append(edge[e_id][1])
16 | dep_rels.append(edge[e_id][0])
17 | e_id += 1
18 | else:
19 | dep_heads.append(0)
20 | dep_rels.append(edge[0][0])
21 | return dep_heads, dep_rels
22 |
23 | def obtain_annotate(results: dict, only_tokens: bool=False, encode_eng:bool=False):
24 | tokens, postag, heads, deprels = [], [], [], []
25 | for sentence in results.sentences:
26 | tok, pos = zip(*[[word.text, word.xpos.split('|')[0]] for word in sentence.words])
27 | postag.extend(pos)
28 | tokens.extend(tok)
29 | dep = [(word.dependency_relation if word.dependency_relation != 'root' else 'ROOT', word.governor, eval(word.index)) for word in sentence.words]
30 | head, rel = parse_adj(dep)
31 | heads.extend([he + len(heads) if he != 0 else 0 for he in head])  # keep 0 for sentence roots; only real heads get the cross-sentence offset
32 | deprels.extend(rel)
33 | if not only_tokens:
34 | return tokens, postag, heads, deprels
35 | else:
36 | return tokens
37 |
38 | def parse_xml(path : str):
39 | data = []
40 | tree = ET.parse(path)
41 | root = tree.getroot()
42 | for review in tqdm(root.findall('Review'), 'Process'):
43 | for sentences in review.findall('sentences'):
44 | for sentence in sentences.findall('sentence'):
45 | obj = {}
46 | text = sentence.find('text').text
47 | if text is None or len(text) < 1: continue
48 | results = nlpmodel(text)
49 | tokens, pos_tag, heads, rels = obtain_annotate(results)
50 | obj['token'] = tokens
51 | obj['pos'] = pos_tag
52 | obj['head'] = heads
53 | obj['deprel'] = rels
54 | asp_total = []
55 | for asps in sentence.findall('Opinions'):
56 | for asp in asps.findall('Opinion'):
57 | aspect_dict = {}
58 | from_idx = eval(asp.get('from'))
59 | to_idx = eval(asp.get('to'))
60 | polarity = asp.get('polarity')
61 | term = asp.get('target')
62 | if polarity == 'conflict': continue
63 | if term == 'NULL':continue
64 | context_l = text[:from_idx]
65 | term_lr = text[from_idx:to_idx]
66 | if term_lr.lower() != term:
67 | print(text + ' / ' + term)
68 | if len(context_l) > 0:
69 | token_l = obtain_annotate(nlpmodel(context_l), only_tokens=True)
70 | else:
71 | token_l = []
72 | if len(text[from_idx:to_idx]) < 1: continue
73 | token_term = obtain_annotate(nlpmodel(text[from_idx:to_idx]), only_tokens=True)
74 | aspect_dict['term'] = token_term
75 | aspect_dict['from'] = len(token_l)
76 | aspect_dict['to'] = len(token_l) + len(token_term)
77 | aspect_dict['polarity'] = polarity
78 | asp_total.append(aspect_dict)
79 |
80 | obj['aspects'] = asp_total
81 | if len(asp_total) > 0: data.append(obj)
82 | return data
83 |
84 | if __name__ == '__main__':
85 | # Original Data Source
86 | train_data = parse_xml('ABSA16Tur_Train.xml')
87 | with open('train.json', 'w', encoding='utf-8') as tr_fp:
88 | json_str = json.dumps(train_data, indent=4)
89 | tr_fp.write(json_str)
90 | tr_fp.close()
91 |
92 | test_data = parse_xml('ABSA16Tur_Test.xml')
93 | with open('test.json', 'w', encoding='utf-8') as te_fp:
94 | json_str = json.dumps(test_data, indent=4)
95 | te_fp.write(json_str)
96 | te_fp.close()
97 | print('WELL DONE.')
--------------------------------------------------------------------------------
/data/Multilingual/baseline/KumaGCN_french_dutch_spanish.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import spacy
4 | import pickle
5 | nlp = spacy.load('fr') # fr - french / es - spanish / nl - dutch
6 |
7 | def dependency_adj_matrix(text):
8 | print(text)
9 | document = nlp(text)
10 | print("[tlog] document: " + str(document))
11 | #sys.exit(0)
12 | seq_len = len(text.split())
13 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
14 | pos = []
15 | dep_rel = []
16 | i = 0
17 | for sentence in document.sentences:
18 | print("[tlog] sentence: " + str(sentence))
19 | for word in sentence.words:
20 | #print("[tlog] token: " + str(token.pos_))
21 | if word.index + i < seq_len: #there are some bugs for here such as SPACE
22 | pos.append(word.pos)
23 | dep_rel.append(word.dependency_relation)
24 | #print("[tlog] token: " + str(token.dep_))
25 | #print("[tlog] token.i: " + str(token.i))
26 | #print("[tlog] token.children: " + str([child for child in token.children]))
27 | #print("\n")
28 | #sys.exit(0) governor
29 | if word.index + i < seq_len:
30 | index = word.index + i
31 | head_index = word.governor + i
32 | matrix[index][index] = 1
33 |
34 | matrix[head_index][index] = 1
35 | matrix[index][head_index] = 1
36 |
37 | i += len(sentence.words)
38 | #print("[tlog] matrix: " + str(matrix))
39 | #sys.exit(0)
40 | return matrix, pos, dep_rel
41 |
42 | def dependency_adj_matrix2(text):
43 | # https://spacy.io/docs/usage/processing-text
44 | #print("[tlog] text: " + str(text)) # Maybe for parsing, we should not lower case this
45 | document = nlp(text)
46 | #print("[tlog] document: " + str(document))
47 | #sys.exit(0)
48 | seq_len = len(text.split())
49 | matrix = np.zeros((seq_len, seq_len)).astype('float32')
50 | pos = []
51 | dep_rel = []
52 | for token in document:
53 | #print("[tlog] token: " + str(token))
54 | #print("[tlog] token: " + str(token.pos_))
55 | if token.i < seq_len: #there are some bugs for here such as SPACE
56 | pos.append(token.tag_)
57 | dep_rel.append(token.dep_)
58 | #print("[tlog] token: " + str(token.dep_))
59 | #print("[tlog] token.i: " + str(token.i))
60 | #print("[tlog] token.children: " + str([child for child in token.children]))
61 | #print("\n")
62 | #sys.exit(0)
63 | if token.i < seq_len:
64 | matrix[token.i][token.i] = 1
65 | # https://spacy.io/docs/api/token
66 | for child in token.children: # tzy: do not distinguish the arc types
67 | if child.i < seq_len:
68 | matrix[token.i][child.i] = 1
69 | matrix[child.i][token.i] = 1
70 |
71 | #print("[tlog] matrix: " + str(matrix))
72 | #sys.exit(0)
73 | return matrix, pos, dep_rel
74 |
75 | def process(filename):
76 | print(filename)
77 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
78 | lines = fin.readlines()
79 | fin.close()
80 | idx2graph = {}
81 | fout = open(filename+'.graph', 'wb')
82 | pos_out = open(filename+'.pos', 'w')
83 | rel_out = open(filename+'.rel', 'w')
84 | for i in range(0, len(lines), 3):
85 | #text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
86 | #aspect = lines[i + 1].lower().strip()
87 | text_left, _, text_right = [s.strip() for s in lines[i].partition("$T$")]
88 | aspect = lines[i + 1].strip()
89 | #adj_matrix, pos, rel = dependency_adj_matrix(text_left.strip()+' '+aspect+' '+text_right.strip())
90 | adj_matrix, pos, rel = dependency_adj_matrix2(text_left.strip()+' '+aspect+' '+text_right.strip())
91 | idx2graph[i] = adj_matrix
92 | pos_out.write(" ".join(pos)+"\n")
93 | rel_out.write(" ".join(rel)+"\n")
94 | pickle.dump(idx2graph, fout)
95 | fout.close()
96 |
97 | if __name__ == '__main__':
98 | process('./datasets/french/restaurant_train.raw')
99 | process('./datasets/french/restaurant_test.raw')
100 |
101 | process('./datasets/spanish/restaurant_train.raw')
102 | process('./datasets/spanish/restaurant_test.raw')
103 |
104 | process('./datasets/dutch/restaurant_train.raw')
105 | process('./datasets/dutch/restaurant_test.raw')
106 |
--------------------------------------------------------------------------------
/data/Multilingual/baseline/RGAT_french_dutch_spanish.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import spacy
3 | import pickle
4 | from tqdm import tqdm
5 | import json
6 | nlp = spacy.load('fr') # fr - french / es - spanish / nl - dutch
7 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
8 |
9 | def read_data(path : str):
10 | with open(path, 'r') as fp:
11 | data = fp.readlines()
12 | fp.close()
13 | data_gp = []
14 | for idx in range(0, len(data), 3):
15 | sentence = data[idx].strip()
16 | term = data[idx+1].strip()
17 | polarity = eval(data[idx+2].strip())
18 | data_gp.append([sentence, term, polarity])
19 | return data_gp
20 |
21 | def construct_data(data : list):
22 | out_data = []
23 | for text in tqdm(data, desc='Processing'):
24 | sentence, term, polarity = text[0], text[1], text[2]
25 | document = nlp(sentence.replace('$T$', term))
26 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document])
27 | sentence_post = sentence.replace('$T$', term)
28 | aspect_sentiment = [[term, MAP_INV[polarity]]]
29 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])])
30 | term_tok = len([tok.text for tok in nlp(term)])
31 | from_to = [[from_idx, from_idx + term_tok]]
32 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document]
33 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document]
34 | dependencies = [list(ele) for ele in list(zip(predicted_dependencies, predicted_heads, list(range(1, len(predicted_heads) + 1))))]
35 | obj = {}
36 | obj['sentence'] = sentence_post
37 | obj['tokens'] = list(tokens)
38 | obj['tags'] = list(pos_tag)
39 | obj['predicted_dependencies'] = predicted_dependencies
40 | obj['predicted_heads'] = predicted_heads
41 | obj['dependencies'] = dependencies
42 | obj['aspect_sentiment'] = aspect_sentiment
43 | obj['from_to'] = from_to
44 | out_data.append(obj)
45 | return out_data
46 |
47 | if __name__ == '__main__':
48 | train_data = read_data('restaurant_train.raw')
49 | train_data = construct_data(train_data)
50 | with open('restaurant_Train.json', 'w', encoding='utf-8') as tr_fp:
51 | json_str = json.dumps(train_data, indent=4)
52 | tr_fp.write(json_str)
53 | tr_fp.close()
54 |
55 | test_data = read_data('restaurant_test.raw')
56 | test_data = construct_data(test_data)
57 | with open('restaurant_Test.json', 'w', encoding='utf-8') as te_fp:
58 | json_str = json.dumps(test_data, indent=4)
59 | te_fp.write(json_str)
60 | te_fp.close()
61 | print('WELL DONE.')
62 |
--------------------------------------------------------------------------------
/data/Multilingual/baseline/RGAT_turkish.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pickle
3 | from tqdm import tqdm
4 | import json
5 |
6 | import stanfordnlp
7 | nlpmodel = stanfordnlp.Pipeline(processors='tokenize,pos,depparse', lang="tr")
8 |
9 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'}
10 |
11 | def parse_adj(edge):
12 | e_id, dep_rels, dep_heads = 1, [], []
13 | for eidx in range(len(edge)):
14 | if (eidx + 1) != edge[0][2]:
15 | dep_heads.append(edge[e_id][1])
16 | dep_rels.append(edge[e_id][0])
17 | e_id += 1
18 | else:
19 | dep_heads.append(0)
20 | dep_rels.append(edge[0][0])
21 | return dep_heads, dep_rels
22 |
23 |
24 | def obtain_annotate(results: dict, only_tokens: bool=False, encode_eng:bool=False):
25 | tokens, postag, heads, deprels, deps = [], [], [], [], []
26 | for sentence in results.sentences:
27 | tok, pos = zip(*[[word.text, word.xpos.split('|')[0].upper()] for word in sentence.words])
28 | postag.extend(pos)
29 | tokens.extend(tok)
30 | dep = [(word.dependency_relation, word.governor, eval(word.index)) for word in sentence.words]
31 | head, rel = parse_adj(dep)
32 | heads.extend([he+len(heads) for he in head])
33 | deprels.extend(rel)
34 | dep =[list(gr) for gr in dep]
35 | for idx in range(len(dep)):
36 | if dep[idx][1] !=0:dep[idx][1] += len(deps)
37 | dep[idx][2] += len(deps)
38 | if dep[idx][0] == 'ROOT' : dep[idx][0] = 'root'
39 | deps.extend(dep)
40 | if not only_tokens:
41 | return tokens, postag, deps, heads, deprels
42 | else:
43 | return tokens
44 |
45 |
46 | def read_data(path : str):
47 | with open(path, 'r') as fp:
48 | data = fp.readlines()
49 | fp.close()
50 | data_gp = []
51 | for idx in range(0, len(data), 3):
52 | sentence = data[idx].strip()
53 | term = data[idx+1].strip()
54 | polarity = eval(data[idx+2].strip())
55 | data_gp.append([sentence, term, polarity])
56 | return data_gp
57 |
58 | def construct_data(data : list):
59 | out_data = []
60 | for text in tqdm(data, desc='Processing'):
61 | sentence, term, polarity = text[0], text[1], text[2]
62 | results = nlpmodel(sentence.replace('$T$', term))
63 | tokens, pos_tag, deps, heads, rels = obtain_annotate(results)
64 | predicted_heads = [ele[1] for ele in deps]
65 | sentence_post = sentence.replace('$T$', term)
66 | aspect_sentiment = [[term, MAP_INV[polarity]]]
67 | tok_lr = sentence.split('$T$')[0]
68 | if len(tok_lr) < 1:
69 | from_idx = 0
70 | else:
71 | from_idx = len(obtain_annotate(nlpmodel(sentence.split('$T$')[0]), only_tokens=True))
72 | term_tok = len(obtain_annotate(nlpmodel(term), only_tokens=True))
73 | from_to = [[from_idx, from_idx + term_tok]]
74 | obj = {}
75 | obj['sentence'] = sentence_post
76 | obj['tokens'] = list(tokens)
77 | obj['tags'] = list(pos_tag)
78 | obj['predicted_dependencies'] = rels
79 | obj['predicted_heads'] = predicted_heads
80 | obj['dependencies'] = deps
81 | obj['aspect_sentiment'] = aspect_sentiment
82 | obj['from_to'] = from_to
83 | out_data.append(obj)
84 | return out_data
85 |
86 | if __name__ == '__main__':
87 | train_data = read_data('restaurant_train.raw')
88 | train_data = construct_data(train_data)
89 | with open('restaurant_Train.json', 'w', encoding='utf-8') as tr_fp:
90 | json_str = json.dumps(train_data, indent=4)
91 | tr_fp.write(json_str)
92 | tr_fp.close()
93 |
94 | test_data = read_data('restaurant_test.raw')
95 | test_data = construct_data(test_data)
96 | with open('restaurant_Test.json', 'w', encoding='utf-8') as te_fp:
97 | json_str = json.dumps(test_data, indent=4)
98 | te_fp.write(json_str)
99 | te_fp.close()
100 | print('WELL DONE.')
101 |
--------------------------------------------------------------------------------
/data/Multilingual/download_and_process_multilingual.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DATA_DIR=$1
4 | OUTPUT_DIR=$2
5 | EXECUTE_DIR=$3
6 | PORT=9000
7 |
8 | if test -z "$DATA_DIR"
9 | then
10 | DATA_DIR='.'
11 | fi
12 |
13 | if test -z "$OUTPUT_DIR"
14 | then
15 | OUTPUT_DIR='dataset'
16 | fi
17 |
18 | if test -z "$EXECUTE_DIR"
19 | then
20 | EXECUTE_DIR='.'
21 | fi
22 |
23 | echo "Downloading multilingual data from the ZJU MMF mirror source"
24 | echo "The original data for the multilingual datasets can be found at https://alt.qcri.org/semeval2016/task5/"
25 |
26 | echo ">> 1 FRENCH"
27 | FRENCH_TRAIN_FILE=${DATA_DIR}/french_train.raw
28 | FRENCH_TEST_FILE=${DATA_DIR}/french_test.raw
29 | FRENCH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_French
30 |
31 | wget -O $FRENCH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/french_train.raw
32 | wget -O $FRENCH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/french_test.raw
33 |
34 | echo "Process french raw data to HyCxG format"
35 | python $EXECUTE_DIR/process_multilingual.py --train_file $FRENCH_TRAIN_FILE --test_file $FRENCH_TEST_FILE \
36 | --out_path $FRENCH_OUT_DIR --lang french --port $PORT
37 |
38 | echo ">> 2 SPANISH"
39 | SPANISH_TRAIN_FILE=${DATA_DIR}/spanish_train.raw
40 | SPANISH_TEST_FILE=${DATA_DIR}/spanish_test.raw
41 | SPANISH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Spanish
42 |
43 | wget -O $SPANISH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/spanish_train.raw
44 | wget -O $SPANISH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/spanish_test.raw
45 |
46 | echo "Process spanish raw data to HyCxG format"
47 | python $EXECUTE_DIR/process_multilingual.py --train_file $SPANISH_TRAIN_FILE --test_file $SPANISH_TEST_FILE \
48 | --out_path $SPANISH_OUT_DIR --lang spanish --port $PORT
49 |
50 | echo ">> 3 TURKISH"
51 | TURKISH_TRAIN_FILE=${DATA_DIR}/turkish_train.raw
52 | TURKISH_TEST_FILE=${DATA_DIR}/turkish_test.raw
53 | TURKISH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Turkish
54 |
55 | wget -O $TURKISH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/turkish_train.raw
56 | wget -O $TURKISH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/turkish_test.raw
57 |
58 | echo "Process turkish raw data to HyCxG format"
59 | python $EXECUTE_DIR/process_multilingual.py --train_file $TURKISH_TRAIN_FILE --test_file $TURKISH_TEST_FILE \
60 | --out_path $TURKISH_OUT_DIR --lang turkish --port $PORT
61 |
62 |
63 | echo ">> 4 DUTCH"
64 | DUTCH_TRAIN_FILE=${DATA_DIR}/dutch_train.raw
65 | DUTCH_TEST_FILE=${DATA_DIR}/dutch_test.raw
66 | DUTCH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Dutch
67 |
68 | wget -O $DUTCH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/dutch_train.raw
69 | wget -O $DUTCH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/dutch_test.raw
70 |
71 | echo "Process dutch raw data to HyCxG format"
72 | python $EXECUTE_DIR/process_multilingual.py --train_file $DUTCH_TRAIN_FILE --test_file $DUTCH_TEST_FILE \
73 | --out_path $DUTCH_OUT_DIR --lang dutch --port $PORT
74 |
--------------------------------------------------------------------------------
/data/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/README_ZH.md)
16 |
17 | ## Datasets used by HyCxG
18 |
19 | In this work, we use datasets from the following five groups (the parentheses indicate where each one appears in the paper):
20 | + **Aspect-based sentiment analysis datasets** `[Rest 14/Lap 14/Rest 15/Rest 16/MAMS]` (Results on ABSA tasks, Section 4.2)
21 | + **GLUE benchmark datasets** `[CoLA/SST-2/MNLI/QNLI/RTE/QQP/MRPC/STS]` (Results on GLUE tasks, Section 4.2)
22 | + **Multilingual sentiment analysis datasets** `[French/Spanish/Turkish/Dutch]` (Multilingual results, Section 4.3)
23 | + **Counterfactual detection dataset** (Pattern Recognition Capability of CxG, Appendix F)
24 | + **Colloquial sentiment analysis datasets** `[Twitter/GermEval]` (Colloquial Expression Results, Appendix H)
25 |
26 | ### Aspect-based sentiment analysis datasets
27 | The aspect-based sentiment analysis datasets are in the [`ABSA` folder](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA), which contains four datasets from SemEval 2014/15/16 as well as the MAMS dataset. The folder provides the original links to the datasets, download scripts for the mirrored data, and scripts that convert the data into the format required by HyCxG.
28 |
29 | In addition, to make it easier to evaluate performance on other baseline models, we provide conversion scripts that transform the data into the official formats of the different baselines (see the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines) for more information about the baseline models).
30 |
31 | ### GLUE benchmark datasets
32 | The GLUE benchmark is a widely used benchmark for natural language understanding, consisting of 11 tasks; see the [`GLUE` folder](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE). We evaluate all tasks except `Winograd NLI (WNLI)` and `Diagnostics Main (DM)`. The folder provides the original download links for the GLUE benchmark, a mirror download script (the mirror simply uses the data from Hugging Face datasets), and scripts that convert the data into the format required by HyCxG.
33 |
34 | ### Multilingual sentiment analysis datasets
35 | The multilingual sentiment analysis datasets are based on SemEval 2016; we use French, Spanish, Turkish, and Dutch as the multilingual experimental datasets; see the [`Multilingual` folder](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual). The folder provides the original dataset links, mirror download scripts, and scripts that convert the data into the format required by HyCxG.
36 |
37 | In addition, since the other baseline models were evaluated only on English datasets, we provide data conversion scripts for the different baselines so that their performance can also be compared in the multilingual setting. For details, see the [`Multilingual` folder](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual) and the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines).
38 |
39 | ### Counterfactual detection dataset
40 | The counterfactual detection dataset is based on Task 5 of SemEval 2020; see the [`Counterfactual` folder](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual). The folder likewise provides the original dataset link, a mirror download script, and a script that converts the data into the format required by HyCxG.
41 |
42 | ### Colloquial sentiment analysis datasets
43 | The colloquial sentiment analysis experiments are based on Twitter and GermEval, two datasets annotated on social media data; see the [`Colloquial` folder](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial). The folder provides the original dataset links, mirror download scripts, and scripts that convert the data into the format required by HyCxG.
44 |
45 | ## Quick download and processing
46 | Instead of entering each subdirectory above to download and process the data separately, you can also use [`data_pipeline.sh`](https://github.com/xlxwalex/HyCxG/tree/main/data/data_pipeline.sh) to download and process all of the data in a single step. Before running the script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) are installed. Then obtain and process the data with the following command (all arguments have default values, so none are required):
47 | ```shell
48 | bash data_pipeline.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR]
49 | ```
50 | **Arguments:**
51 | + DATA_DIR: folder for the downloaded raw data; defaults to the current folder
52 | + OUTPUT_DIR: folder where the processed data is stored; defaults to `dataset`
53 | + STANFORD_DIR: location of the Stanford parser; defaults to `stanford-corenlp-3.9.2-minimal` in this directory
54 |
55 | **Note:** If the Stanford parser is already available on your system, set `STANFORD_DIR` to the directory that contains it. If the parser folder does not exist, it is downloaded automatically from the mirror source (353 MB in total). You can also download it manually with [`download_stanfordcore.py`](https://github.com/xlxwalex/HyCxG/tree/main/data/download_stanfordcore.py).
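
For example, a concrete invocation with all three positional arguments spelled out could look like the following (a minimal sketch; the folder names are only placeholders and can be changed freely):

```shell
# Download raw data into the current folder, write processed data to ./dataset,
# and reuse a parser that already sits in ./stanford-corenlp-3.9.2-minimal
bash data_pipeline.sh . dataset stanford-corenlp-3.9.2-minimal
```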
56 |
57 | ## Mirror data sources
58 | Since the datasets and the parser above are all publicly available and can be obtained directly, we provide backup mirrors (except for the GLUE benchmark datasets) so that users can download everything from one place. If you are the copyright holder of a dataset or of the parser and believe that distributing it through this mirror may violate its public license, please contact [`xlxw@zju.edu.cn`](mailto:xlxw@zju.edu.cn) and we will take the dataset or parser down immediately.
59 |
60 | ## Terms of data use
61 | By downloading the data or accessing these datasets in any way, you agree to follow the terms of use of the original datasets; links to the original datasets are given in each dataset folder. Please note that the data must not be used for any `illegal` or `discriminatory` purposes.
--------------------------------------------------------------------------------
/data/data_pipeline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DATA_DIR=$1
4 | OUTPUT_DIR=$2
5 | STANFORD_DIR=$3
6 |
7 | if test -z "$DATA_DIR"
8 | then
9 | DATA_DIR='.'
10 | fi
11 |
12 | if test -z "$OUTPUT_DIR"
13 | then
14 | OUTPUT_DIR='dataset'
15 | fi
16 |
17 | if test -z "$STANFORD_DIR"
18 | then
19 | STANFORD_DIR='stanford-corenlp-3.9.2-minimal'
20 | fi
21 |
22 | if [ -d "$STANFORD_DIR" ]; then
23 | echo "$STANFORD_DIR exists, pass"
24 | else
25 | echo "$STANFORD_DIR does not exist, try to download"
26 | python download_stanfordcore.py
27 | fi
28 |
29 | # Download and process ABSA datasets
30 | bash ABSA/download_and_process_absa.sh $DATA_DIR $OUTPUT_DIR $STANFORD_DIR ABSA
31 |
32 | # Download and process GLUE datasets
33 | bash GLUE/download_and_process_glue.sh $OUTPUT_DIR $STANFORD_DIR all GLUE
34 |
35 | # Download and process Colloquial datasets
36 | bash Colloquial/download_and_process_colloquial.sh $DATA_DIR $OUTPUT_DIR $STANFORD_DIR Colloquial
37 |
38 | # Download and process Counterfactual datasets
39 | bash Counterfactual/download_and_process_counterfactual.sh $DATA_DIR $OUTPUT_DIR $STANFORD_DIR Counterfactual
40 |
41 | # Download and process Multilingual datasets
42 | bash Multilingual/download_and_process_multilingual.sh $DATA_DIR $OUTPUT_DIR Multilingual
--------------------------------------------------------------------------------
/data/download_stanfordcore.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from tqdm import tqdm
3 | import zipfile
4 |
5 | STANFORD_CORE_LINK = 'https://expic.xlxw.org/hycxg/stanfordcore/stanford-corenlp-3.9.2-minimal.zip'
6 |
7 | def download_stanfordcore(url: str, fname: str):
8 | resp = requests.get(url, stream=True)
9 | total = int(resp.headers.get('content-length', 0))
10 |
11 | with open(fname, 'wb') as file, tqdm(
12 | desc=fname,
13 | total=total,
14 | unit='iB',
15 | unit_scale=True,
16 | unit_divisor=1024,
17 | ) as bar:
18 | for data in resp.iter_content(chunk_size=1024):
19 | size = file.write(data)
20 | bar.update(size)
21 |
22 | def unzip_stanfordcore(file_name: str, out_path: str=r'.'):
23 | file_zip = zipfile.ZipFile(file_name, 'r')
24 | for file in file_zip.namelist():
25 | file_zip.extract(file, out_path)
26 | file_zip.close()
27 |
28 | if __name__ == '__main__':
29 | download_stanfordcore(STANFORD_CORE_LINK, 'stanford-corenlp-3.9.2-minimal.zip')
30 | print('>> stanford-corenlp-3.9.2-minimal.zip is downloaded.')
31 | unzip_stanfordcore('stanford-corenlp-3.9.2-minimal.zip')
32 | print('>> Stanford Core files are ready.')
33 |
--------------------------------------------------------------------------------
/figures/hycxg-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/figures/hycxg-logo.png
--------------------------------------------------------------------------------
/figures/main-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/figures/main-logo.png
--------------------------------------------------------------------------------
/figures/sub-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/figures/sub-logo.png
--------------------------------------------------------------------------------
/guidelines/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 |
16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/README_ZH.md)
17 |
18 | ## Guidelines
19 |
20 | Under construction..
--------------------------------------------------------------------------------
/guidelines/README_ZH.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | ---
15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/README_ZH.md)
16 | ## Guidelines
17 |
18 | Under construction..
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.11.0+cu113
2 | transformers==4.30.0
3 | pandas==1.4.2
4 | scipy==1.6.0
5 | six==1.15.0
6 | cytoolz==0.11.0
7 | tqdm==4.64.0
8 | numpy==1.24.1
9 | scikit_learn==1.2.0
10 | nltk==3.5
11 | spacy==2.3.5
12 | allennlp==2.1.0
13 | datasets==2.3.2
14 | stanfordcorenlp==3.9.1.1
15 | stanfordnlp==0.2.0
16 | stanza==1.4.2
17 |
--------------------------------------------------------------------------------
/tutorials/01_cxgtokenizer_tutorial.py:
--------------------------------------------------------------------------------
1 | from Tokenizer.constants import *
2 | from Tokenizer.ModelTokenizer import CxGTokenizer
3 |
4 | class ARG_Test:
5 | cxg_vocab_path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/CxG"))
6 | lm_path: str = 'bert-base-uncased'
7 | do_lower_case: bool = True
8 | lm_group: str = 'BERT'
9 |
10 | # Prepare the args
11 | args = ARG_Test()
12 |
13 | # Initialize CxGTokenizer
14 | cxg_tokenizer = CxGTokenizer(args, lang='eng') # Current language is English
15 |
16 | # acquire constructions
17 | test_sentence = "The restaurants try too hard to make fancy food."
18 | constructions = cxg_tokenizer.tokenize(test_sentence, raw=True)
19 | print(constructions)
20 | # Output:
21 | #{
22 | # 'text': 'The restaurants try too hard to make fancy food.',
23 | # 'token': ['the', 'restaurants', 'try', 'too', 'hard', 'to', 'make', 'fancy', 'food', '.'],
24 | # 'cons_idx': [1501, 10765, 1943], 'cons_start': [3, 3, 4], 'cons_end': [7, 6, 7],
25 | # 'cons_pattern': ['ADV--hard--to--VERB', 'ADV--hard--to', 'hard--PART--VERB']
26 | # }
27 |
--------------------------------------------------------------------------------
/tutorials/02_coverage_solver_tutorial.py:
--------------------------------------------------------------------------------
1 | from Simuann import CxGCoverage
2 | import random
3 |
4 | t_minutes = 0.05
5 | test_cxgs = {
6 | 'the--NOUN--was--ADV' : (1, 5),
7 | 'the--NOUN--was' : (1, 4),
8 | 'NOUN--AUX--ADV' : (2, 5),
9 | 'AUX--so--ADJ' : (3, 6)
10 | }
11 |
12 | cxg_names = list(test_cxgs)
13 |
14 | # Initialize states
15 | init_state = [0] * len(cxg_names)
16 | for _ in range(random.randint(1, len(cxg_names))):
17 | init_state[random.randint(0, len(cxg_names)-1)] = 1
18 |
19 | # Pack inputs
20 | starts, ends, patterns = [], [], []
21 | for cxg in test_cxgs:
22 | starts.append(test_cxgs[cxg][0])
23 | ends.append(test_cxgs[cxg][1])
24 | patterns.append(cxg)
25 |
26 | # Initialize CxGCoverage
27 | cp = CxGCoverage(init_state, patterns, starts, ends, vis=True)
28 | cp.set_schedule(cp.auto(minutes=t_minutes))
29 | state, energy = cp.anneal()
30 | print()
31 | print('>> Results:')
32 | for ids in range(len(state)):
33 | if state[ids] == 1:
34 | cxg = list(test_cxgs)[ids]
35 | print('CXG : {}, ({}, {})'.format(cxg, test_cxgs[cxg][0], test_cxgs[cxg][1]))
36 |
37 | # Output:
38 | # Temperature Energy Accept Improve Elapsed Remaining
39 | # 0.10000 0.66 0.00% 0.00% 0:00:01 0:00:00
40 | # Temperature Energy Accept Improve Elapsed Remaining
41 | # 0.10000 0.66 0.07% 0.04% 0:00:03 0:00:00
42 | # >> Results:
43 | # CXG : the--NOUN--was, (1, 4)
44 | # CXG : AUX--so--ADJ, (3, 6)
--------------------------------------------------------------------------------
/tutorials/03_hypergraph_tutorial.py:
--------------------------------------------------------------------------------
1 | import os
2 | from utils.coverage import cxg_max_coverage
3 | from utils.hypergraph import construct_graph
4 | from Tokenizer import CxGTokenizer
5 | from transformers import AutoTokenizer
6 | import random
7 | random.seed(0)
8 |
9 | class ARG_Test:
10 | cxg_vocab_path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/CxG"))
11 | lm_path: str = 'roberta-base-english'
12 | do_lower_case: bool = True
13 | lm_group: str = 'RoBERTa'
14 | t_minutes: float = 0.05
15 |
16 | # Prepare the args
17 | args = ARG_Test()
18 | cxgprocessor = CxGTokenizer(args, lang='eng')
19 | tokenizer = AutoTokenizer.from_pretrained(args.lm_path)
20 |
21 | # Process the sentence
22 | sentence = 'I can understand the prices if it served better food.'
23 | tokens = tokenizer.tokenize(sentence)
24 | sentence_mask = [0] + [1] * len(tokens) + [0]
25 | tokens = ['<s>'] + tokens + ['</s>']
26 | token_ids = tokenizer.convert_tokens_to_ids(tokens)
27 | cxgs = cxgprocessor.tokenize(sentence, raw=True)
28 | selected = cxg_max_coverage(cxgs['cons_start'], cxgs['cons_end'], cxgs['cons_idx'], cxgs['cons_pattern'], T_minutes=args.t_minutes)
29 | hg, edges = construct_graph([selected], [sentence_mask], pad_len=15)
30 |
31 | print('>> Results')
32 | print('Tokens = {}'.format(tokens))
33 | print('Token ids = {}'.format(token_ids))
34 | print('constructions = {}'.format(cxgs))
35 | print('selected constructions = {}'.format(selected))
36 | print('hypergraph adjs =\n{}'.format(hg))
37 |
38 | # Outputs:
39 | # >> Results
40 | # Tokens = ['<s>', 'I', 'Ġcan', 'Ġunderstand', 'Ġthe', 'Ġprices', 'Ġif', 'Ġit', 'Ġserved', 'Ġbetter', 'Ġfood', '.', '</s>']
41 | # Token ids = [0, 100, 64, 1346, 5, 850, 114, 24, 1665, 357, 689, 4, 2]
42 | # constructions = {
43 | # 'text': 'I can understand the prices if it served better food.',
44 | # 'token': ['i', 'can', 'understand', 'the', 'prices', 'if', 'it', 'served', 'better', 'food', '.'],
45 | # 'cons_idx': [5943, 6071, 16646, 6388, 13591, 11402, 786, 4387, 13648, 5683, 12421, 12967],
46 | # 'cons_start': [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 4, 5], 'cons_end': [4, 5, 3, 5, 4, 5, 7, 6, 8, 8, 7, 8],
47 | # 'cons_pattern': ['i--AUX--VERB--DET', 'i--AUX--VERB--DET--NOUN', 'i--AUX--VERB', 'can--VERB--DET--NOUN', 'can--VERB--DET', 'understand--DET--NOUN', 'the--NOUN--SCONJ--PRON', 'the--NOUN--SCONJ', 'the--NOUN--SCONJ--PRON--VERB', 'NOUN--SCONJ--PRON--VERB', 'NOUN--SCONJ--PRON', 'if--PRON--VERB']
48 | # }
49 | # selected constructions = [(0, 5, 6071, 'i--AUX--VERB--DET--NOUN'), (5, 8, 12967, 'if--PRON--VERB')]
50 | # hypergraph adjs =
51 | # [array([[0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
52 | # [0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
53 | # [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])]
--------------------------------------------------------------------------------