├── HyCxG ├── DataProcessor │ ├── FoldWrapper.py │ ├── HyperDataset.py │ └── __init__.py ├── Model │ ├── HyperCxG.py │ ├── HyperGraphATT.py │ ├── Layer │ │ ├── LayerNorm.py │ │ ├── Linear.py │ │ ├── SPHyperGraphLayer.py │ │ ├── __init__.py │ │ └── activate_fn.py │ ├── __init__.py │ └── lm.py ├── README.md ├── README_ZH.md ├── Simuann │ ├── CxGCoverage.py │ ├── README.md │ ├── README_ZH.md │ ├── SimuAnneal.py │ └── __init__.py ├── Tokenizer │ ├── BaseTokenizer.py │ ├── CxGProcessor │ │ ├── CxGCore.py │ │ ├── Encoder.py │ │ ├── Loader.py │ │ ├── Parser.py │ │ ├── __init__.py │ │ ├── rdrpos_tagger │ │ │ ├── !READ_ME_ANNOTATE.txt │ │ │ ├── InitialTagger │ │ │ │ ├── InitialTagger.py │ │ │ │ ├── InitialTagger4En.py │ │ │ │ └── InitialTagger4Vn.py │ │ │ ├── SCRDRlearner │ │ │ │ ├── Node.py │ │ │ │ ├── Object.py │ │ │ │ ├── SCRDRTree.py │ │ │ │ └── SCRDRTreeLearner.py │ │ │ ├── Utility │ │ │ │ ├── Config.py │ │ │ │ ├── Eval.py │ │ │ │ ├── LexiconCreator.py │ │ │ │ └── Utils.py │ │ │ └── pSCRDRtagger │ │ │ │ ├── ExtRDRPOSTagger.py │ │ │ │ └── RDRPOSTagger.py │ │ └── utils.py │ ├── ModelTokenizer.py │ ├── README.md │ ├── README_ZH.md │ ├── Vocab.py │ ├── __init__.py │ ├── constants.py │ └── download_cxgdict.sh ├── Trainer │ ├── HyCxGTrainerABSA.py │ ├── HyCxGTrainerGLUE.py │ ├── Trainer.py │ └── __init__.py ├── config.py ├── dataset │ ├── README.md │ ├── README_ZH.md │ └── download_vocab.sh ├── run_hycxg.sh ├── train_hycxg.py └── utils │ ├── __init__.py │ ├── argument.py │ ├── coverage.py │ ├── data.py │ ├── define.py │ ├── hypergraph.py │ ├── metric.py │ ├── misc.py │ ├── operates.py │ └── optimizers.py ├── LICENSE ├── README.md ├── README_ZH.md ├── data ├── ABSA │ ├── README.md │ ├── README_ZH.md │ ├── download_and_process_absa.sh │ └── process_absa.py ├── Colloquial │ ├── README.md │ ├── README_ZH.md │ ├── baseline │ │ ├── DGEDT_germeval_gengraph.py │ │ ├── DualGCN_germeval_txt2json.py │ │ ├── KumaGCN_germeval_gengraph.py │ │ └── RGAT_germeval_txt2json.py │ ├── download_and_process_colloquial.sh │ ├── process_germeval.py │ └── process_twitter.py ├── Counterfactual │ ├── README.md │ ├── README_ZH.md │ ├── download_and_process_counterfactual.sh │ └── process_counterfactual.py ├── GLUE │ ├── README.md │ ├── README_ZH.md │ ├── download_and_process_glue.py │ └── download_and_process_glue.sh ├── Multilingual │ ├── README.md │ ├── README_ZH.md │ ├── baseline │ │ ├── DGEDT_french_dutch_spanish.py │ │ ├── DGEDT_turkish.py │ │ ├── DualGCN_french_dutch_spanish.py │ │ ├── DualGCN_turkish.py │ │ ├── KumaGCN_french_dutch_spanish.py │ │ ├── KumaGCN_turkish.py │ │ ├── RGAT_french_dutch_spanish.py │ │ └── RGAT_turkish.py │ ├── download_and_process_multilingual.sh │ └── process_multilingual.py ├── README.md ├── README_ZH.md ├── data_pipeline.sh └── download_stanfordcore.py ├── figures ├── hycxg-logo.png ├── main-logo.png └── sub-logo.png ├── guidelines ├── README.md └── README_ZH.md ├── requirements.txt └── tutorials ├── 01_cxgtokenizer_tutorial.py ├── 02_coverage_solver_tutorial.py ├── 03_hypergraph_tutorial.py ├── PaperLists.md ├── README.md └── README_ZH.md /HyCxG/DataProcessor/FoldWrapper.py: -------------------------------------------------------------------------------- 1 | from argparse import Namespace 2 | from DataProcessor.HyperDataset import HyperDataLM 3 | from copy import deepcopy 4 | from sklearn.model_selection import KFold 5 | 6 | # Note: This Module is only utilized for Countertfactual task 7 | class KFoldWrapper(HyperDataLM): 8 | def __init__(self, args: Namespace, set_name: str, desc: str = 
'train', num_workers: int = 1, debug=False): 9 | super(KFoldWrapper, self).__init__(args, set_name, desc, num_workers, debug) 10 | self.curfold_items, self.curfold_labels = deepcopy(self.items), deepcopy(self.labels) 11 | kfold_splitter = KFold(n_splits=args.kfold) 12 | self.grtrain_ids, self.grvalid_ids = self.calculate_ids(kfold_splitter) 13 | 14 | def calculate_ids(self, splitter): 15 | train_ids, valid_ids = [], [] 16 | for train_index, valid_index in splitter.split(self.curfold_items): 17 | train_ids.append(train_index) 18 | valid_ids.append(valid_index) 19 | return train_ids, valid_ids 20 | 21 | def set_valid(self): 22 | self.desc = 'valid' 23 | 24 | def set_group(self, index : int): 25 | if index >= self.args.kfold: raise Exception('Error in setting `index`, `index` need to be lower than %d' % self.args.kfold) 26 | if self.desc == 'train': inds = self.grtrain_ids[index] 27 | elif self.desc == 'valid': inds = self.grvalid_ids[index] 28 | else: raise Exception('Error in setting `desc` mode, you can only choose [`train`, `valid`]') 29 | self.items, self.labels = [self.curfold_items[idx] for idx in inds], [self.curfold_labels[idx] for idx in inds] 30 | print('>> Kfold set the group to %d for %s set, total %d instances.' % (index, self.desc, len(self))) -------------------------------------------------------------------------------- /HyCxG/DataProcessor/__init__.py: -------------------------------------------------------------------------------- 1 | from DataProcessor.HyperDataset import HyperDataLM 2 | from DataProcessor.FoldWrapper import KFoldWrapper 3 | 4 | __all__ = ['HyperDataLM', 'KFoldWrapper'] -------------------------------------------------------------------------------- /HyCxG/Model/HyperCxG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from argparse import Namespace 4 | from Model.lm import LM 5 | from Model.HyperGraphATT import RHGAT 6 | from Model.Layer.Linear import Linear 7 | 8 | class HyperCxG(nn.Module): 9 | def __init__(self, args : Namespace, device : torch.device): 10 | super(HyperCxG, self).__init__() 11 | self.device = device 12 | # LM model 13 | self.lm = LM(args, device, use_encoder=True, pooler_output=False) 14 | self.lm_dropout = nn.Dropout(args.lm_dropout) 15 | # Edge embedding 16 | self.edgemb = nn.Embedding(args.cxg_vocab_size, args.lm_hidden_size, padding_idx=0) 17 | # Relational hyper-graph attention network 18 | self.hgatt = RHGAT(args, device, args.lm_hidden_size, args.inter_size, args.lm_hidden_size, args.hg_dropout, args.leaky_alpha, args.edge_trans, args.remove_layernorm) 19 | # Classifier 20 | self.classifier = Linear(args.lm_hidden_size, args.num_classes) 21 | self.do_squeeze = args.num_classes == 1 # Combine for this repo 22 | 23 | def forward(self, input: torch.Tensor, attention_mask: torch.Tensor, HT: torch.Tensor, edges: torch.Tensor, adj_matrix: torch.Tensor, node_mask: torch.Tensor, asp_masks: torch.Tensor): 24 | # adj_matrix and asp_masks is not available in this repo 25 | # Encoder 26 | encoded = self.lm(input, attention_mask = attention_mask) 27 | encoded = self.lm_dropout(encoded) 28 | edge_emb = self.edgemb(edges) 29 | # RHGAT 30 | hidden = self.hgatt(encoded, HT, edge_emb) 31 | # Pooling 32 | node_wn = node_mask.sum(dim=1).unsqueeze(-1) 33 | mask = node_mask.unsqueeze(-1).repeat(1, 1, hidden.shape[-1]) 34 | final = (hidden * mask).sum(dim=1) / node_wn 35 | outputs = self.classifier(final) 36 | if self.do_squeeze: outputs = outputs.squeeze(-1) # Combine 
for this repo 37 | return outputs -------------------------------------------------------------------------------- /HyCxG/Model/HyperGraphATT.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from Model.Layer import HGAL 3 | from Model.Layer import LayerNorm 4 | 5 | class RHGAT(nn.Module): 6 | def __init__(self, args, device, input_size, inter_size, output_size, dropout=0.3, alpha=0.2, edge_trans=False, remove_layernorm=False): 7 | super(RHGAT, self).__init__() 8 | self.hgat = HGAL(args, device, input_size, input_size, dropout=dropout, do_scale=True, edge_trans=edge_trans, remove_layernorm=remove_layernorm) 9 | # self.hgat = HGAL_MH(args, device, input_size, input_size, dropout=dropout, do_scale=True, edge_trans=edge_trans, remove_layernorm=remove_layernorm) - Not available in this REPO 10 | self.dropout_1 = nn.Dropout(dropout) 11 | self.dropout_2 = nn.Dropout(dropout) 12 | # FFN + Rs 13 | self.leakyrelu = nn.LeakyReLU(alpha) 14 | self.linear_1 = nn.Linear(input_size, inter_size, bias=True) 15 | self.linear_2 = nn.Linear(inter_size, output_size, bias=True) 16 | self.layer_norm_1 = LayerNorm(args, device, input_size) 17 | self.layer_norm_2 = LayerNorm(args, device, input_size) 18 | 19 | def forward(self, hidden, HT, edge_emb): 20 | inter = self.hgat(hidden, HT, edge_emb) 21 | inter = self.dropout_1(inter) 22 | inter = self.layer_norm_1(inter + hidden) 23 | output = self.dropout_2(self.linear_2(self.leakyrelu(self.linear_1(inter)))) 24 | output = self.layer_norm_2(output + inter) 25 | return output -------------------------------------------------------------------------------- /HyCxG/Model/Layer/LayerNorm.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch 3 | from argparse import Namespace 4 | 5 | class LayerNorm(nn.Module): 6 | def __init__(self, args : Namespace, device : torch.device, hidden_size : int = 768, eps : float =1e-6): 7 | super(LayerNorm, self).__init__() 8 | self.eps = eps 9 | self.args = args 10 | self.device = device 11 | self.gamma = nn.Parameter(torch.ones(hidden_size)) 12 | self.beta = nn.Parameter(torch.zeros(hidden_size)) 13 | 14 | def forward(self, x): 15 | mean = x.mean(-1, keepdim=True) 16 | std = x.std(-1, keepdim=True) 17 | hidden_states = self.gamma * (x-mean) / (std + self.eps) 18 | return hidden_states + self.beta -------------------------------------------------------------------------------- /HyCxG/Model/Layer/Linear.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class Linear(nn.Module): 4 | def __init__(self, Text_InFeature, Text_OutFeature): 5 | super(Linear, self).__init__() 6 | self.linear = nn.Linear(in_features=Text_InFeature, out_features=Text_OutFeature) 7 | self.init_params() 8 | 9 | def init_params(self): 10 | nn.init.kaiming_normal_(self.linear.weight) 11 | nn.init.constant_(self.linear.bias, 0) 12 | 13 | def forward(self, x): 14 | x = self.linear(x) 15 | return x -------------------------------------------------------------------------------- /HyCxG/Model/Layer/SPHyperGraphLayer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn.parameter import Parameter 6 | from Model.Layer.LayerNorm import LayerNorm 7 | 8 | INF_SUB_NUM = -9e15 9 | 10 | # Simplified version 11 | class 
HyperGraphAttentionLayer(nn.Module): 12 | def __init__(self, args, device, input_size, output_size, dropout, do_scale=True, edge_trans=False, remove_layernorm=False): 13 | super(HyperGraphAttentionLayer, self).__init__() 14 | self.input_size = input_size 15 | self.output_size = output_size 16 | self.scale = do_scale 17 | self.layernorm = not remove_layernorm 18 | if self.layernorm: self.lnorm = LayerNorm(args, device, input_size) 19 | self.wnk = Parameter(torch.Tensor(self.input_size, self.output_size)) 20 | self.wek = Parameter(torch.Tensor(self.output_size, self.output_size)) 21 | if edge_trans: self.w_edge = Parameter(torch.Tensor(self.input_size, self.input_size)) 22 | else: self.register_parameter('w_edge', None) 23 | self.dropout_emb = nn.Dropout(dropout) 24 | self.dropout = nn.Dropout(dropout) 25 | self.reset_parameters() 26 | 27 | def reset_parameters(self): 28 | stdv = 1. / math.sqrt(self.output_size) 29 | self.wnk.data.uniform_(-stdv, stdv) 30 | self.wek.data.uniform_(-stdv, stdv) 31 | if self.w_edge is not None: self.w_edge.data.uniform_(-stdv, stdv) 32 | 33 | def forward(self, hidden, ht, edge_emb): 34 | if self.layernorm: edge_emb = self.lnorm(edge_emb) 35 | edge_emb = self.dropout_emb(edge_emb) 36 | node_k = hidden.matmul(self.wnk) 37 | if self.w_edge is not None: edge_q = torch.matmul(edge_emb, self.w_edge) 38 | else: edge_q = edge_emb 39 | edge_attnscores = torch.matmul(edge_q, node_k.permute(0, 2, 1)) 40 | if self.scale: edge_attnscores = edge_attnscores * (1 / (self.input_size ** (1/2))) 41 | zero_vec = INF_SUB_NUM * torch.ones_like(edge_attnscores) 42 | edge_attnscores = torch.where(ht > 0, edge_attnscores, zero_vec) 43 | attention_edge = F.softmax(edge_attnscores, dim=2) 44 | edge_h = torch.matmul(attention_edge, hidden) 45 | edge_h = self.dropout(edge_h) 46 | edge_h = edge_h + edge_emb 47 | edge_k = edge_h.matmul(self.wek) 48 | node_q = node_k 49 | node_attnscores = torch.matmul(node_q, edge_k.permute(0, 2, 1)) 50 | if self.scale: node_attnscores = node_attnscores * (1 / (self.input_size ** (1/2))) 51 | zero_vec = INF_SUB_NUM * torch.ones_like(node_attnscores) 52 | node_attnscores = torch.where(ht.permute(0, 2, 1) > 0, node_attnscores, zero_vec) 53 | attention_node = F.softmax(node_attnscores, dim=1) 54 | node_hidden = torch.matmul(attention_node, edge_h) 55 | return node_hidden -------------------------------------------------------------------------------- /HyCxG/Model/Layer/__init__.py: -------------------------------------------------------------------------------- 1 | from Model.Layer.Linear import Linear 2 | from Model.Layer.LayerNorm import LayerNorm 3 | from Model.Layer.SPHyperGraphLayer import HyperGraphAttentionLayer as HGAL 4 | 5 | __all__ = ['Linear', 'LayerNorm', 'HGAL'] -------------------------------------------------------------------------------- /HyCxG/Model/Layer/activate_fn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | def gelu(x): 5 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) -------------------------------------------------------------------------------- /HyCxG/Model/__init__.py: -------------------------------------------------------------------------------- 1 | from Model.HyperCxG import HyperCxG 2 | 3 | __all__ = ['HyperCxG'] -------------------------------------------------------------------------------- /HyCxG/Model/lm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 
from transformers import BertModel, RobertaModel 4 | from Model.Layer import * 5 | import argparse 6 | 7 | class LM(nn.Module): 8 | def __init__(self, args : argparse.Namespace, device : torch.device, use_encoder : bool = False, pooler_output : bool = True, all_output : bool = False): 9 | super(LM, self).__init__() 10 | self.device = device 11 | self.use_encoder = use_encoder 12 | self.pooler_output = pooler_output 13 | self.all_output = all_output 14 | self._lm = BertModel.from_pretrained(args.lm_path, cache_dir='.cache/') if args.lm_group == 'BERT' else RobertaModel.from_pretrained(args.lm_path, cache_dir='.cache/') 15 | if args.finetune is not True: 16 | for param in self._lm.base_model.parameters(): 17 | param.requires_grad = False 18 | self._lm_output = args.output_hidden_states 19 | self._lm.config.output_hidden_states = self._lm_output 20 | self._fc = Linear(args.lm_hidden_size, args.num_classes) 21 | 22 | def forward(self, inputs : torch.Tensor, attention_mask : torch.Tensor = None) -> tuple: 23 | encode_output = self._lm(inputs, attention_mask=attention_mask) 24 | if self.pooler_output: encoded = encode_output.pooler_output 25 | else: encoded = encode_output.last_hidden_state 26 | if self.use_encoder is not True: output = self._fc(encoded) 27 | else: 28 | if self.all_output: output = (encode_output.pooler_output, encode_output.last_hidden_state) 29 | else: output = encoded 30 | return output 31 | 32 | -------------------------------------------------------------------------------- /HyCxG/README.md: -------------------------------------------------------------------------------- 1 |
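A quick shape check for the simplified `HyperGraphAttentionLayer` defined in `Model/Layer/SPHyperGraphLayer.py` above. This is only an illustrative sketch: the batch, node and hyperedge counts and the hidden size are arbitrary values, and `args` can be an empty namespace because the layer only hands it to `LayerNorm`, which does not read any field from it in the code shown.

```python
import torch
from argparse import Namespace
from Model.Layer.SPHyperGraphLayer import HyperGraphAttentionLayer

args, device = Namespace(), torch.device('cpu')
layer = HyperGraphAttentionLayer(args, device, input_size=768, output_size=768,
                                 dropout=0.1, do_scale=True, edge_trans=False)

hidden   = torch.randn(2, 16, 768)               # token/node states: [batch, nodes, dim]
ht       = (torch.rand(2, 4, 16) > 0.5).float()  # incidence matrix: [batch, hyperedges, nodes]
edge_emb = torch.randn(2, 4, 768)                # hyperedge (construction) embeddings

out = layer(hidden, ht, edge_emb)                # edges attend over their nodes, then nodes over edges
print(out.shape)                                 # torch.Size([2, 16, 768])
```

The orientation of `ht` (hyperedges in rows, nodes in columns) matches how `HT` is consumed by `HyperCxG.forward` above.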

13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/README_ZH.md) 17 | 18 | ## Run HyCxG 19 | 20 | ### Quick start 21 | Before running our HyCxG model, a variety of preparation steps are required. The following are the necessary steps: 22 | 1. (**Data preparation**) Please prepare the data first. We have provided an automatic download and processing script for all data. Please refer to the [`data`](https://github.com/xlxwalex/HyCxG/tree/main/data) folder for details. With the default configuration, the processed data will be saved in the `data/dataset` folder in the form of folders. Please copy all the data folders (e.g. `JSONABSA_MAMS`) to the `dataset` folder in this directory. 23 | 2. (**Preparation for CxG lists**) In the [`dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) folder under this directory, we provide an automatic download script for the list of CxG. Please refer to the README file in the folder for details. Under the default configuration, the required data files will be automatically downloaded to the corresponding location. 24 | 3. (**Preparation for CxG vocabulary**) In the [`Tokenizer`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) folder from this directory, we provide an automatic download script for the construction vocabulary data. Please refer to the README file in the folder for the execution command. Similarly, under the default configuration, the required data files will be automatically downloaded to the corresponding location. 25 | 4. (**Run HyCxG**): In the [`run_hycxg.sh`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/run_hycxg.sh) file, we provide the command to run the HyCxG model, which can be adapted to different datasets by modifying the parameters. 26 | 27 | **Note:** The hyper-parameter settings for each task can be found in the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines). -------------------------------------------------------------------------------- /HyCxG/README_ZH.md: -------------------------------------------------------------------------------- 1 |

13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/README_ZH.md) 16 | ## 运行HyCxG 17 | 18 | ### 快速运行指引 19 | 在运行HyCxG模型前,我们需要先进行多种数据的准备,以下为所需的操作步骤: 20 | 1. (**数据集准备**) 请先准备数据,我们提供了数据的自动下载处理脚本,详情请见[`data`](https://github.com/xlxwalex/HyCxG/tree/main/data)文件夹。在默认配置下,处理完的数据会以文件夹的形式保存在`datas/dataset`文件夹中,请复制所有的数据文件夹(例如:`JSONABSA_MAMS`等)到本目录下的`dataset`文件夹中 21 | 2. (**构式语法表数据准备**) 在本目录下的[`dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset)文件夹中,我们提供了构式语法列表数据的自动下载脚本,详情请见文件夹中的README文件。在默认配置下,所需数据文件会自动下载到对应位置 22 | 3. (**构式词表数据准备**) 在本目录下的[`Tokenizer`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset)文件夹内,我们提供了构式词表数据的自动下载脚本,执行命令请见文件夹中的README文件。同样在默认配置下,所需数据文件会自动下载到对应位置 23 | 4. (**运行HyCxG**) 在[`run_hycxg.sh`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/run_hycxg.sh)中我们给出了运行模型的命令,可以通过修改参数来适应不同的数据集 24 | 25 | **注意**:各个任务的超参数设置请见[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)。 -------------------------------------------------------------------------------- /HyCxG/Simuann/README.md: -------------------------------------------------------------------------------- 1 |

13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/README_ZH.md) 17 | 18 | ## Cond-MC Solver 19 | 20 | The code in this package is utilized to solve the Cond-MC problem in the second section of our paper (Construction Extraction and Selection) via simulated annealing (SA). Please refer to the paper for problem definition and solution steps. Meanwhile, you can browse [`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials) for more detailed information. 21 | 22 | ### Quick Start 23 | We provide multiple instances in the [`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py) file, and you can directly run the following code to browse the results (these instances can also serve as a reference for hyper-parameters adjustment): 24 | 25 | ```shell 26 | python CxGCoverage.py 27 | ``` 28 | **Hyper-parameters:** 29 | + PATTERN_SCORE: The dict represents scores for slots with different levels of abstraction in the constructions, corresponding to ![](http://latex.codecogs.com/svg.latex?s_{syn}), ![](http://latex.codecogs.com/svg.latex?s_{sem}) and ![](http://latex.codecogs.com/svg.latex?s_{lex}) in the paper. 30 | + COVERAGE_SCORE: The weights in the objective function are stored in this dict, corresponding to ![](http://latex.codecogs.com/svg.latex?w_{1}),![](http://latex.codecogs.com/svg.latex?w_{2}),![](http://latex.codecogs.com/svg.latex?w_{3}) in our paper. 31 | 32 | **Note:** These hyper-parameters are hardcoded in the code, please modify the hyper-parameters in the header of [`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py) directly. 33 | 34 | ### Acknowledgement 35 | The code in [`SimuAnneal.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/SimuAnneal.py) is modified from [`simanneal`](https://github.com/perrygeo/simanneal), which provides a convenient framework for solving problems. We are extremely grateful for the efforts and contributions by the owner of this repo! 36 | -------------------------------------------------------------------------------- /HyCxG/Simuann/README_ZH.md: -------------------------------------------------------------------------------- 1 |
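To make the selection step above concrete, here is a self-contained toy sketch of simulated annealing applied to a coverage objective: choose a subset of candidate construction spans that covers as many tokens as possible with high pattern scores and little overlap. Every number in it (the spans, their scores, the term weights and the cooling schedule) is an illustrative assumption, and it does not use the `CxGCoverage.py`/`SimuAnneal.py` API.

```python
import math
import random

# Toy candidates: (start, end, pattern_score) spans over a 10-token sentence.
CANDIDATES = [(0, 4, 3.0), (2, 6, 2.5), (5, 9, 3.5), (7, 10, 1.5)]

def energy(selected):
    """Negative objective (SA minimizes energy): reward covered tokens and
    pattern scores, penalize token overlap between the selected spans."""
    covered, overlap, score = set(), 0, 0.0
    for i in selected:
        start, end, s = CANDIDATES[i]
        span = set(range(start, end))
        overlap += len(covered & span)
        covered |= span
        score += s
    return -(1.0 * len(covered) + 1.0 * score - 2.0 * overlap)

random.seed(0)
state = [i for i in range(len(CANDIDATES)) if random.random() < 0.5]
temperature = 5.0
for _ in range(2000):
    flip = random.randrange(len(CANDIDATES))      # move: toggle one candidate in/out
    proposal = [c for c in state if c != flip] if flip in state else state + [flip]
    delta = energy(proposal) - energy(state)
    if delta < 0 or random.random() < math.exp(-delta / temperature):
        state = proposal                          # always accept improvements, sometimes accept worse
    temperature *= 0.999                          # geometric cooling

print(sorted(state), -energy(state))
```

With these toy numbers the optimum is the two non-overlapping spans (0, 4) and (5, 9); the actual solver instead scores slots by abstraction level (`PATTERN_SCORE`) and weights the objective terms with `COVERAGE_SCORE`, as described above.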

13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/README_ZH.md) 16 | ## Cond-MC求解器 17 | 18 | 该部分代码用于对我们论文第二章节(Construction Extraction and Selection)的Cond-MC问题使用模拟退火(Simulated Annealing, SA)的方式进行求解。问题定义以及求解步骤请见论文。更多详细的细节请参考[`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials)。 19 | 20 | ### 快速上手 21 | 我们在[`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py)中提供了多个例子,您可以直接使用以下代码直接运行来浏览结果(同时,这些例子也可以作为超参调整的参考): 22 | ```shell 23 | python CxGCoverage.py 24 | ``` 25 | **超参数列表:** 26 | + PATTERN_SCORE:该字典中表示的是构式不同抽象等级的槽的分数,对应论文中![](http://latex.codecogs.com/svg.latex?s_{syn}), ![](http://latex.codecogs.com/svg.latex?s_{sem}), ![](http://latex.codecogs.com/svg.latex?s_{lex}) 27 | + COVERAGE_SCORE:该字典中存储的是目标函数中的权重,对应论文中的![](http://latex.codecogs.com/svg.latex?w_{1}),![](http://latex.codecogs.com/svg.latex?w_{2}),![](http://latex.codecogs.com/svg.latex?w_{3}) 28 | 29 | **注意:** 这些超参数被硬编码在了代码文件中,因此请直接修改[`CxGCoverage.py`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/CxGCoverage.py)头部的两个字典。 30 | 31 | ### 致谢 32 | 本部分的[`SimuAnneal`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Simuann/SimuAnneal.py)修改自[`simanneal`](https://github.com/perrygeo/simanneal),其提供了一个很方便的框架来对问题进行求解,我们十分感谢该仓库开发者的贡献! 33 | -------------------------------------------------------------------------------- /HyCxG/Simuann/__init__.py: -------------------------------------------------------------------------------- 1 | from Simuann.CxGCoverage import CxGCoverageProblem as CxGCoverage 2 | 3 | __all__ = ['CxGCoverage'] -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/CxGCore.py: -------------------------------------------------------------------------------- 1 | from Tokenizer.CxGProcessor.Loader import Loader 2 | from Tokenizer.CxGProcessor.Encoder import Encoder 3 | from Tokenizer.CxGProcessor.Parser import Parser 4 | 5 | class CxGCore(object): 6 | def __init__(self, args, workers = None, lang='eng'): 7 | self.args = args 8 | self.Loader = Loader(args, lang=lang) 9 | self.Encoder = Encoder(lang=lang) 10 | self.Parser = Parser(self.Loader, self.Encoder, workers=workers) 11 | 12 | def parse_text(self, text): 13 | if isinstance(text, str): 14 | text = [text] 15 | tokens = self.Loader.tokenize(text) 16 | lines, mapper, tokenizer_tokens = self.Loader.load_text(text) 17 | results = self.Parser.parse_lines(lines) 18 | # return results 19 | results_ = {} 20 | for i, res in enumerate(results): 21 | temp = {} 22 | temp["text"] = text[i] 23 | temp["token"] = tokenizer_tokens[i] 24 | temp["cons_idx"] = [ele + 1 for ele in res[0]] # 0 - 25 | temp["cons_start"] = [mapper[ele][0] for ele in res[1]] 26 | temp["cons_end"] = [mapper[ele-1][-1] + 1 for ele in res[2]] 27 | results_[i] = temp 28 | return results_[0] 29 | 30 | def parse_file(self, file): 31 | lines = self.Loader.load_from_file(file) 32 | results = self.Parser.parse_lines(lines) 33 | return results 34 | 35 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/Encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.utils import murmurhash3_32 5 | from Tokenizer.CxGProcessor.rdrpos_tagger.pSCRDRtagger.RDRPOSTagger import RDRPOSTagger 6 | from 
Tokenizer.CxGProcessor.rdrpos_tagger.Utility.Utils import readDictionary 7 | from Tokenizer.CxGProcessor.Loader import Loader 8 | 9 | class Encoder(object): 10 | def __init__(self, args="", lang='eng'): 11 | 12 | MODEL_STRING = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/{}.RDR".format(lang))) 13 | DICT_STRING = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/{}.DICT".format(lang))) 14 | DICTIONARY_FILE = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/{}.clusters.fastText.v2.gz".format(lang))) 15 | pos_list = Loader.pos_list 16 | seed = Loader.seed 17 | 18 | self.args = args 19 | self.pos_dict = {murmurhash3_32(pos, seed=seed): pos for pos in pos_list} 20 | self.word_dict = pd.read_csv(DICTIONARY_FILE, index_col=0).to_dict()["Cluster"] 21 | self.domain_dict = {murmurhash3_32(str(key), seed=seed): self.word_dict[key] for key in self.word_dict.keys()} 22 | self.word_dict = {murmurhash3_32(str(key), seed=0): key for key in self.word_dict.keys()} 23 | self.build_decoder() 24 | 25 | self.DICT = readDictionary(DICT_STRING) 26 | self.r = RDRPOSTagger(word_dict=self.domain_dict, DICT=self.DICT) 27 | self.r.constructSCRDRtreeFromRDRfile(MODEL_STRING) 28 | 29 | def build_decoder(self): 30 | #LEX = 1, POS = 2, CAT = 3 31 | decoding_dict = {} 32 | decoding_dict[1] = self.word_dict 33 | decoding_dict[2] = self.pos_dict 34 | decoding_dict[3] = {key: "<" + str(key) + ">" for key in list(set(self.domain_dict.values()))} 35 | self.decoding_dict = decoding_dict 36 | 37 | def tagline(self, line): 38 | line = self.r.tagRawSentenceHash(line) 39 | return np.array(line) 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/Loader.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from Tokenizer.BaseTokenizer import BasicTokenizer, WordpieceTokenizer 3 | from transformers import AutoTokenizer 4 | from sklearn.utils import murmurhash3_32 5 | from collections import defaultdict 6 | import Tokenizer.CxGProcessor.utils as utils 7 | from Tokenizer.constants import * 8 | 9 | # Load Construction 10 | class Loader(object): 11 | pos_list = ["PROPN", "SYM", "VERB", "DET", "CCONJ", "AUX", 12 | "ADJ", "INTJ", "SCONJ", "PRON", "NUM", "PUNCT", 13 | "ADV", "ADP", "X", "NOUN", "PART"] 14 | seed = 0 15 | 16 | def __init__(self, args, lang='eng'): 17 | self.args = args 18 | self.cons_path = args.cxg_vocab_path + "/construction.pkl" if lang == 'eng' else args.cxg_vocab_path + "/{}.construction.pkl".format(lang) 19 | self.pos_list = Loader.pos_list 20 | self.seed = Loader.seed 21 | self.lmg = args.lm_group 22 | self.cons = self.load_cons() 23 | self.dict_cons = self.load_dict_cons() 24 | self.basic_tokenizer = BasicTokenizer(do_lower_case=self.args.do_lower_case) 25 | self.auto_tokenizer = AutoTokenizer.from_pretrained(args.lm_path) 26 | 27 | def load_dict_cons(self): 28 | encoded_cons = self.cons 29 | dict_cons = dict() 30 | X = list(set([encoded_cons[i][0][0] for i in range(len(encoded_cons))])) 31 | for x in X: 32 | dict_cons[x] = defaultdict(list) 33 | for i, encoded_con in enumerate(encoded_cons): 34 | if encoded_con[0][0] == x: 35 | dict_cons[x][encoded_con[0][1]].append((encoded_con, i)) 36 | return X, dict_cons 37 | 38 | def load_cons(self): 39 | if self.cons_path.endswith(".pkl"): 40 | with open(self.cons_path, "rb") as f: 41 | res = pickle.load(f) 42 | else: 43 | cons = self.read_cons() 44 | res = self.encode_cons(cons) 45 | return res 
46 | 47 | def load_text(self, text): 48 | tokens = self.tokenize(text) 49 | map_word2_token, tokenizer_tokens = self.map_cxgtoken2plmtoken(tokens) 50 | tokens = self.replace(tokens) 51 | lines = self.tokens2lines(tokens) 52 | return lines, map_word2_token, tokenizer_tokens 53 | 54 | def load_from_file(self, file): 55 | text = [] 56 | with open(file, "r") as f: 57 | for line in f.readlines(): 58 | if line.strip(): 59 | text.append(line.strip()) 60 | lines, _, tokenizer_tokens = self.load_text(text) 61 | return lines 62 | 63 | def tokenize(self, text): 64 | if isinstance(text, str): 65 | text = [text] 66 | tokens = [] 67 | basic_tokenizer = BasicTokenizer(do_lower_case=self.args.do_lower_case) 68 | for ele in text: 69 | tokens.append(basic_tokenizer.tokenize(ele)) 70 | return tokens 71 | 72 | def replace(self, tokens, no_number=True, no_phone=True, no_email=True, no_currency=True): 73 | if no_phone: 74 | tokens = [self.replace_with_phone(token) for token in tokens] 75 | if no_number: 76 | tokens = [self.replace_with_number(token) for token in tokens] 77 | if no_email: 78 | tokens = [self.replace_with_email(token) for token in tokens] 79 | if no_currency: 80 | tokens = [self.replace_with_currency_symbol(token) for token in tokens] 81 | return tokens 82 | 83 | def map_cxgtoken2plmtoken(self, tokens): 84 | accum_idx = 0 85 | mapper, true_tokens = [], [] 86 | for token in tokens[0]: 87 | tok = [] 88 | wp_tokens = self.auto_tokenizer.tokenize(token) if self.lmg == 'BERT' else self.auto_tokenizer.tokenize(' ' + token) 89 | true_tokens.extend(wp_tokens) 90 | tok.extend(wp_tokens) 91 | mapper.append([accum_idx, accum_idx + len(tok) -1]) 92 | accum_idx += len(tok) 93 | return mapper, [true_tokens] 94 | 95 | @staticmethod 96 | def replace_with_number(token, alternative=""): 97 | return [utils.NUMBERS_REGEX.sub(alternative, x) for x in token] 98 | 99 | @staticmethod 100 | def replace_with_currency_symbol(token, alternative=""): 101 | return [utils.CURRENCY_REGEX.sub(alternative, x) for x in token] 102 | 103 | @staticmethod 104 | def replace_with_email(token, alternative=""): 105 | return [utils.CURRENCY_REGEX.sub(alternative, x) for x in token] 106 | 107 | @staticmethod 108 | def replace_with_phone(token, alternative=""): 109 | return [utils.PHONE_REGEX.sub(alternative, x) for x in token] 110 | 111 | def tokens2lines(self, tokens): 112 | lines = [" ".join(token) for token in tokens] 113 | return lines 114 | 115 | def read_cons(self): 116 | cons = [] 117 | with open(self.cons_path, "r") as f: 118 | for line in f.readlines(): 119 | con = line.strip().split("--") 120 | cons.append(con) 121 | return cons 122 | 123 | def write_cons(self, encoded_cons): 124 | path = self.cons_path.replace(".txt", ".pkl") 125 | with open(path, "wb") as f: 126 | pickle.dump(encoded_cons, f) 127 | 128 | def encode_cons(self, cons): 129 | encoded_cons = [] 130 | for con in cons: 131 | encoded_cons.append(self.encode_con(con)) 132 | return encoded_cons 133 | 134 | def encode_con(self, con): 135 | encoded_con = [] 136 | for x in con: 137 | if x.startswith("<"): 138 | encoded_con.append((3, int(x[1:-1]))) 139 | elif x in self.pos_list: 140 | encoded_con.append((2, murmurhash3_32(x, seed=self.seed))) 141 | else: 142 | encoded_con.append((1, murmurhash3_32(x, seed=self.seed))) 143 | return tuple(encoded_con) -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/Parser.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 
as mp 2 | 3 | 4 | class Parser(object): 5 | def __init__(self, Loader, Encoder, workers=10): 6 | self.Loader = Loader 7 | self.Encoder = Encoder 8 | self.workers = workers 9 | 10 | def parse_lines(self, lines): 11 | if self.workers is not None: 12 | chunk_size = 1000 13 | pool_instance = mp.Pool(processes=self.workers, maxtasksperchild=None) 14 | lines = pool_instance.map(self.Encoder.tagline, lines, chunksize=chunk_size) 15 | pool_instance.close() 16 | pool_instance.join() 17 | pool_instance = mp.Pool(processes=self.workers, maxtasksperchild=None) 18 | results = pool_instance.map(self.match_cons, lines, chunksize=chunk_size) 19 | pool_instance.close() 20 | pool_instance.join() 21 | else: 22 | lines = [self.Encoder.tagline(line) for line in lines] 23 | results = [self.match_cons(line) for line in lines] 24 | results = [self.del_duplicate(res) for res in results] 25 | return results 26 | 27 | def del_duplicate(self, result): 28 | if len(result[0]) <= 1: 29 | return result 30 | s = set() 31 | k = 0 32 | for i in range(len(result[0])): 33 | if (result[1][i], result[2][i]) not in s: 34 | result[0][k] = result[0][i] 35 | result[1][k] = result[1][i] 36 | result[2][k] = result[2][i] 37 | s.add((result[1][i], result[2][i])) 38 | k += 1 39 | for i in range(len(result)): 40 | del(result[i][k:]) 41 | return result 42 | 43 | def match_cons(self, line): 44 | cons_idx, cons_start, cons_end = [], [], [] 45 | for i, unit in enumerate(line): 46 | candidates = self.get_candidates(unit) 47 | for con, idx in candidates: 48 | match = True 49 | for j in range(1, len(con)): 50 | if i + j < len(line): 51 | if line[i + j][con[j][0] - 1] != con[j][1]: 52 | match = False 53 | break 54 | else: 55 | match = False 56 | break 57 | if match: 58 | cons_idx.append(idx) 59 | cons_start.append(i) 60 | cons_end.append(i + len(con)) 61 | return cons_idx, cons_start, cons_end 62 | 63 | def get_candidates(self, unit): 64 | candidates = [] 65 | for i in self.Loader.dict_cons[0]: 66 | candidate = self.Loader.dict_cons[1][i][unit[i-1]] 67 | candidates += candidate 68 | return candidates 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/HyCxG/Tokenizer/CxGProcessor/__init__.py -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/!READ_ME_ANNOTATE.txt: -------------------------------------------------------------------------------- 1 | The main part-of-speech tagging tool used is RDRPOSTagger: 2 | 3 | Copyright (C) 2013-2015 by Dat Quoc Nguyen, Dai Quoc Nguyen, Dang Duc Pham and Son Bao Pham 4 | RDRPOSTagger's website: http://rdrpostagger.sourceforge.net/ 5 | 6 | This version of RDRPOSTagger has been updated to work in Python 3, along with other minor changes. 7 | Original models and new models (e.g., RDR and DICT files) can be found in the main data directory. 
-------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/InitialTagger/InitialTagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | def initializeSentence(FREQDICT, sentence): 6 | words = sentence.strip().split() 7 | taggedSen = [] 8 | for word in words: 9 | if word in ["“", "”", "\""]: 10 | #taggedSen.append("''/" + FREQDICT["''"]) 11 | if "''" in FREQDICT: 12 | taggedSen.append("''/" + FREQDICT["''"]) 13 | elif "." in FREQDICT: 14 | taggedSen.append("''/" + FREQDICT["."]) 15 | elif "," in FREQDICT: 16 | taggedSen.append("''/" + FREQDICT[","]) 17 | else: 18 | print("\n'' is not in the dictionary \nManually add '' with a possible POS tag into the .DICT file!") 19 | taggedSen.append("''/" + FREQDICT["''"]) 20 | continue 21 | 22 | tag = '' 23 | decodedW = word 24 | lowerW = decodedW.lower() 25 | if word in FREQDICT: 26 | tag = FREQDICT[word] 27 | elif lowerW in FREQDICT: 28 | tag = FREQDICT[lowerW] 29 | else: 30 | if re.search(r"[0-9]+", word) != None: 31 | tag = FREQDICT["TAG4UNKN-NUM"] 32 | else: 33 | suffixL2 = suffixL3 = suffixL4 = suffixL5 = None 34 | wLength = len(decodedW) 35 | if wLength >= 4: 36 | suffixL3 = ".*" + decodedW[-3:] 37 | suffixL2 = ".*" + decodedW[-2:] 38 | if wLength >= 5: 39 | suffixL4 = ".*" + decodedW[-4:] 40 | if wLength >= 6: 41 | suffixL5 = ".*" + decodedW[-5:] 42 | 43 | if suffixL5 in FREQDICT: 44 | tag = FREQDICT[suffixL5] 45 | elif suffixL4 in FREQDICT: 46 | tag = FREQDICT[suffixL4] 47 | elif suffixL3 in FREQDICT: 48 | tag = FREQDICT[suffixL3] 49 | elif suffixL2 in FREQDICT: 50 | tag = FREQDICT[suffixL2] 51 | elif decodedW[0].isupper(): 52 | tag = FREQDICT["TAG4UNKN-CAPITAL"] 53 | else: 54 | tag = FREQDICT["TAG4UNKN-WORD"] 55 | 56 | taggedSen.append(word + "/" + tag) 57 | 58 | return " ".join(taggedSen) 59 | 60 | def initializeCorpus(FREQDICT, inputFile, outputFile): 61 | lines = open(inputFile, "r").readlines() 62 | fileOut = open(outputFile, "w") 63 | for line in lines: 64 | fileOut.write(initializeSentence(FREQDICT, line) + "\n") 65 | fileOut.close() 66 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/InitialTagger/InitialTagger4En.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | def initializeEnSentence(FREQDICT, sentence): 6 | words = sentence.strip().split() 7 | taggedSen = [] 8 | for word in words: 9 | if word in ["“", "”", "\""]: 10 | taggedSen.append("''/" + FREQDICT["''"]) 11 | continue 12 | 13 | tag = '' 14 | lowerW = word.lower() 15 | if word in FREQDICT: 16 | tag = FREQDICT[word] 17 | elif lowerW in FREQDICT: 18 | tag = FREQDICT[lowerW] 19 | else: 20 | if (re.search(r"([0-9]+-)|(-[0-9]+)", word) != None): 21 | tag = "JJ" 22 | elif (re.search(r"[0-9]+", word) != None): 23 | tag = "CD" 24 | elif (re.search(r'(.*ness$)|(.*ment$)|(.*ship$)|(^[Ee]x-.*)|(^[Ss]elf-.*)', word) != None): 25 | tag = "NN" 26 | elif (re.search(r'.*s$', word) != None and word[0].islower()): 27 | tag = "NNS" 28 | elif (word[0].isupper()): 29 | tag = "NNP" 30 | elif(re.search(r'(^[Ii]nter.*)|(^[nN]on.*)|(^[Dd]is.*)|(^[Aa]nti.*)', word) != None): 31 | tag = "JJ" 32 | elif (re.search(r'.*ing$', word) != None and word.find("-") < 0): 33 | tag = "VBG" 34 | elif (re.search(r'.*ed$', word) != None and word.find("-") < 0): 35 | tag = "VBN" 36 | elif 
(re.search(r'(.*ful$)|(.*ous$)|(.*ble$)|(.*ic$)|(.*ive$)|(.*est$)|(.*able$)|(.*al$)', word) != None 37 | or word.find("-") > -1): 38 | tag = "JJ" 39 | elif(re.search(r'.*ly$', word) != None): 40 | tag = "RB" 41 | else: 42 | tag = "NN" 43 | 44 | taggedSen.append(word + "/" + tag) 45 | 46 | return " ".join(taggedSen) 47 | 48 | def initializeEnCorpus(FREQDICT, inputFile, outputFile): 49 | lines = open(inputFile, "r").readlines() 50 | fileOut = open(outputFile, "w") 51 | for line in lines: 52 | fileOut.write(initializeEnSentence(FREQDICT, line) + "\n") 53 | fileOut.close() 54 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/InitialTagger/InitialTagger4Vn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | def isAbbre(word): 6 | 7 | #word = unicode(word, "utf-8") 8 | for i in range(len(word)): 9 | if isVnLowerChar(word[i]) or word[i] == "_": 10 | return False 11 | return True 12 | 13 | VNUPPERCHARS = [u'Ă', u'Â', u'Đ', u'Ê', u'Ô', u'Ơ', u'Ư'] 14 | VNLOWERCHARS = [u'ă', u'â', u'đ', u'ê', u'ô', u'ơ', u'ư'] 15 | 16 | def isVnLowerChar(char): 17 | if char.islower() or char in VNLOWERCHARS: 18 | return True; 19 | return False; 20 | 21 | def isVnUpperChar(char): 22 | if char.isupper() or char in VNUPPERCHARS: 23 | return True; 24 | return False; 25 | 26 | def isVnProperNoun(word): 27 | #word = unicode(word, "utf-8") 28 | if (isVnUpperChar(word[0])): 29 | if word.count("_") >= 4: 30 | return True 31 | index = word.find("_") 32 | while index > 0 and index < len(word) - 1: 33 | if isVnLowerChar(word[index + 1]): 34 | return False; 35 | index = word.find("_", index + 1) 36 | return True; 37 | else: 38 | return False; 39 | 40 | def initializeVnSentence(FREQDICT, sentence): 41 | words = sentence.strip().split() 42 | taggedSen = [] 43 | for word in words: 44 | if word in ["“", "”", "\""]: 45 | taggedSen.append("''/" + FREQDICT["''"]) 46 | continue 47 | 48 | tag = '' 49 | decodedW = word 50 | lowerW = decodedW.lower() 51 | if word in FREQDICT: 52 | tag = FREQDICT[word] 53 | elif lowerW in FREQDICT: 54 | tag = FREQDICT[lowerW] 55 | else: 56 | if (re.search(r"[0-9]+", word) != None): 57 | tag = FREQDICT["TAG4UNKN-NUM"] 58 | elif(len(word) == 1 and isVnUpperChar(word[0])): 59 | tag = "Y" 60 | elif (isAbbre(word)): 61 | tag = "Ny" 62 | elif (isVnProperNoun(word)): 63 | tag = "Np" 64 | else: 65 | suffixL2 = suffixL3 = suffixL4 = suffixL5 = None 66 | wLength = len(decodedW) 67 | if wLength >= 4: 68 | suffixL3 = ".*" + decodedW[-3:] 69 | suffixL2 = ".*" + decodedW[-2:] 70 | if wLength >= 5: 71 | suffixL4 = ".*" + decodedW[-4:] 72 | if wLength >= 6: 73 | suffixL5 = ".*" + decodedW[-5:] 74 | 75 | if suffixL5 in FREQDICT: 76 | tag = FREQDICT[suffixL5] 77 | elif suffixL4 in FREQDICT: 78 | tag = FREQDICT[suffixL4] 79 | elif suffixL3 in FREQDICT: 80 | tag = FREQDICT[suffixL3] 81 | elif suffixL2 in FREQDICT: 82 | tag = FREQDICT[suffixL2] 83 | else: 84 | tag = FREQDICT["TAG4UNKN-WORD"] 85 | 86 | taggedSen.append(word + "/" + tag) 87 | 88 | return " ".join(taggedSen) 89 | 90 | def initializeVnCorpus(FREQDICT, inputFile, outputFile): 91 | lines = open(inputFile, "r").readlines() 92 | fileOut = open(outputFile, "w") 93 | for line in lines: 94 | fileOut.write(initializeVnSentence(FREQDICT, line) + "\n") 95 | fileOut.close() 96 | 97 | -------------------------------------------------------------------------------- 
/HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/SCRDRlearner/Node.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class Node: 4 | """ 5 | A class to represent the nodes in SCRDR tree 6 | """ 7 | 8 | def __init__(self, condition, conclusion, father = None, exceptChild = None, elseChild = None, cornerstoneCases = [], depth = 0): 9 | self.condition = condition 10 | self.conclusion = conclusion 11 | self.exceptChild = exceptChild 12 | self.elseChild = elseChild 13 | self.cornerstoneCases = cornerstoneCases 14 | self.father = father 15 | self.depth = depth 16 | 17 | def satisfied(self, object): 18 | return eval(self.condition) 19 | 20 | def executeConclusion(self, object): 21 | exec(self.conclusion) 22 | 23 | def appendCornerstoneCase(self, object): 24 | self.cornerstoneCases.append(object) 25 | 26 | def check(self, object): 27 | if self.satisfied(object): 28 | self.executeConclusion(object) 29 | if self.exceptChild != None: 30 | self.exceptChild.check(object) 31 | else: 32 | if self.elseChild != None: 33 | self.elseChild.check(object) 34 | 35 | def checkDepth(self, object, length): 36 | if self.depth <= length: 37 | if self.satisfied(object): 38 | self.executeConclusion(object) 39 | if self.exceptChild != None: 40 | self.exceptChild.checkDepth(object, length) 41 | else: 42 | if self.elseChild != None: 43 | self.elseChild.checkDepth(object, length) 44 | 45 | def findRealFather(self): 46 | node = self 47 | fatherNode = node.father 48 | while True and fatherNode != None: 49 | if fatherNode.exceptChild == node: 50 | break 51 | node = fatherNode 52 | fatherNode = node.father 53 | return fatherNode 54 | 55 | def addElseChild(self, node): 56 | fatherNode = self.findRealFather() 57 | for object in fatherNode.cornerstoneCases: 58 | if node.satisfied(object): 59 | print("The new rule fires the cornerstone cases of its father node!!!") 60 | self.findRealFather().cornerstoneCases.remove(object) 61 | self.elseChild = node 62 | return True 63 | 64 | def addExceptChild(self, node): 65 | for object in self.cornerstoneCases: 66 | if node.satisfied(object): 67 | print("The new rule fires the cornerstone cases of its father node!!!") 68 | self.cornerstoneCases.remove(object) 69 | self.exceptChild = node 70 | return True 71 | 72 | def writeToFileWithSeenCases(self, out, depth): 73 | space = tabStr(depth) 74 | out.write(space + self.condition + " : " + self.conclusion + "\n") 75 | for case in self.cornerstoneCases: 76 | out.write(" " + space + "cc: " + case.toStr() + "\n") 77 | if self.exceptChild != None: 78 | self.exceptChild.writeToFile(out, depth + 1) 79 | if self.elseChild != None: 80 | self.elseChild.writeToFile(out, depth) 81 | 82 | def writeToFile(self, out, depth): 83 | space = tabStr(depth) 84 | out.write(space + self.condition + " : " + self.conclusion + "\n") 85 | if self.exceptChild != None: 86 | self.exceptChild.writeToFile(out, depth + 1) 87 | if self.elseChild != None: 88 | self.elseChild.writeToFile(out, depth) 89 | 90 | def tabStr(length): 91 | return "".join(["\t"] * length) 92 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/Utility/Config.py: -------------------------------------------------------------------------------- 1 | #Change the value of NUMBER_OF_PROCESSES to obtain faster tagging process! 
2 | NUMBER_OF_PROCESSES = 2 3 | 4 | THRESHOLD = (3, 2) -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/Utility/Eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | os.chdir("../") 6 | sys.setrecursionlimit(100000) 7 | sys.path.append(os.path.abspath("")) 8 | os.chdir("./Utility") 9 | 10 | from Utility.Utils import getWordTag, readDictionary 11 | 12 | def computeAccuracy(goldStandardCorpus, taggedCorpus): 13 | tagged = open(taggedCorpus, "r").read().split() 14 | goldStandard = open(goldStandardCorpus, "r").read().split() 15 | if len(tagged) != len(goldStandard): 16 | print("The numbers of word tokens in %s and %s are not equal!" % (goldStandardCorpus, taggedCorpus)) 17 | return 0 18 | numwords = 0 19 | count = 0 20 | for i in range(len(tagged)): 21 | numwords += 1 22 | word1, tag1 = getWordTag(tagged[i]) 23 | word2, tag2 = getWordTag(goldStandard[i]) 24 | if word1 != word2 and word1 != "''" and word2 != "''": 25 | print("Words are not the same in gold standard and tagged corpora, at the index", i) 26 | return 0 27 | 28 | if tag1.lower() == tag2.lower(): 29 | count += 1 30 | 31 | return count * 100.0 / numwords 32 | 33 | def computeAccuracies(fullDictFile, goldStandardCorpus, taggedCorpus): 34 | """ 35 | Return known-word accuracy, unknown-word accuracy and the overall accuracy 36 | """ 37 | tagged = open(taggedCorpus, "r").read().split() 38 | goldStandard = open(goldStandardCorpus, "r").read().split() 39 | if len(tagged) != len(goldStandard): 40 | print("The numbers of word tokens in %s and %s are not equal!" % (goldStandardCorpus, taggedCorpus)) 41 | return 0 42 | 43 | fullDICT = readDictionary(fullDictFile) 44 | 45 | numwords = count = 0 46 | countKN = countUNKN = 0 47 | countCorrectKN = countCorrectUNKN = 0 48 | 49 | for i in range(len(tagged)): 50 | numwords += 1 51 | word1, tag1 = getWordTag(tagged[i]) 52 | word2, tag2 = getWordTag(goldStandard[i]) 53 | if word1 != word2 and word1 != "''" and word2 != "''": 54 | print("Words are not the same in gold standard and tagged corpora, at the index " + str(i)) 55 | return 0 56 | 57 | if tag1.lower() == tag2.lower(): 58 | count += 1 59 | 60 | if word1 in fullDICT: 61 | countKN += 1 62 | if tag1.lower() == tag2.lower(): 63 | countCorrectKN += 1 64 | else: 65 | countUNKN += 1 66 | if tag1.lower() == tag2.lower(): 67 | countCorrectUNKN += 1 68 | 69 | if countUNKN == 0: 70 | return countCorrectKN * 100.0 / countKN, 0.0, count * 100.0 / numwords 71 | else: 72 | return countCorrectKN * 100.0 / countKN, countCorrectUNKN * 100.0 / countUNKN, count * 100.0 / numwords 73 | 74 | if __name__ == "__main__": 75 | print(str(computeAccuracy(sys.argv[1], sys.argv[2])) + "%") 76 | pass 77 | 78 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/Utility/Utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | 4 | def getWordTag(wordTag): 5 | if wordTag == "///": 6 | return "/", "/" 7 | index = wordTag.rfind("/") 8 | if index == -1: 9 | return wordTag, None 10 | word = wordTag[:index].strip() 11 | tag = wordTag[index + 1:].strip() 12 | return word, tag 13 | 14 | def getRawText(inputFile, outFile): 15 | out = open(outFile, "w") 16 | sents = open(inputFile, "r").readlines() 17 | for sent in sents: 18 | wordTags = sent.strip().split() 
19 | for wordTag in wordTags: 20 | word, tag = getWordTag(wordTag) 21 | out.write(word + " ") 22 | out.write("\n") 23 | out.close() 24 | 25 | def readDictionary(inputFile): 26 | dictionary = {} 27 | lines = codecs.open(inputFile, "r", encoding = "utf-8", errors = "replace").readlines() 28 | for line in lines: 29 | wordtag = line.strip().split() 30 | dictionary[wordtag[0]] = wordtag[1] 31 | return dictionary 32 | 33 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/rdrpos_tagger/pSCRDRtagger/ExtRDRPOSTagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | os.chdir("../") 6 | sys.setrecursionlimit(100000) 7 | sys.path.append(os.path.abspath("")) 8 | os.chdir("./pSCRDRtagger") 9 | 10 | from multiprocessing import Pool 11 | from SCRDRlearner.Object import FWObject, getWordTag 12 | from SCRDRlearner.SCRDRTree import SCRDRTree 13 | from SCRDRlearner.SCRDRTreeLearner import SCRDRTreeLearner 14 | from Utility.Config import NUMBER_OF_PROCESSES, THRESHOLD 15 | 16 | def unwrap_self_ExtRDRPOSTagger(arg, **kwarg): 17 | return ExtRDRPOSTagger.tagInitializedSentence(*arg, **kwarg) 18 | 19 | class ExtRDRPOSTagger(SCRDRTree): 20 | def __init__(self): 21 | self.root = None 22 | 23 | def tagInitializedSentence(self, initSen): 24 | wordTags = initSen.replace("“", "''").replace("”", "''").replace("\"", "''").split() 25 | sen = [] 26 | for i in range(len(wordTags)): 27 | fwObject = FWObject.getFWObject(wordTags, i) 28 | word, tag = getWordTag(wordTags[i]) 29 | node = self.findFiredNode(fwObject) 30 | if node.depth > 0: 31 | sen.append(word + "/" + node.conclusion) 32 | else:# Fired at root, return initialized tag 33 | sen.append(word + "/" + tag) 34 | return " ".join(sen) 35 | 36 | def tagInitializedCorpus(self, inputFile): 37 | lines = open(inputFile, "r").readlines() 38 | #Change the value of NUMBER_OF_PROCESSES to obtain faster tagging process! 
39 | pool = Pool(processes = NUMBER_OF_PROCESSES) 40 | taggedLines = pool.map(unwrap_self_ExtRDRPOSTagger, zip([self] * len(lines), lines)) 41 | out = open(inputFile + ".TAGGED", "w") 42 | for line in taggedLines: 43 | out.write(line + "\n") 44 | out.close() 45 | print("\nOutput file: " + inputFile + ".TAGGED") 46 | 47 | def printHelp(): 48 | print("\n===== Usage =====") 49 | print('\n#1: To train RDRPOSTagger in case of using output from an external initial POS tagger:') 50 | print('\npython ExtRDRPOSTagger.py train PATH-TO-GOLD-STANDARD-TRAINING-CORPUS PATH-TO-TRAINING-CORPUS-INITIALIZED-BY-EXTERNAL-TAGGER') 51 | print('\nExample: python ExtRDRPOSTagger.py train ../data/goldTrain ../data/initTrain') 52 | print('\n#2: To use the trained model for POS tagging on a test corpus where words already are initially tagged by the external initial tagger:') 53 | print('\npython ExtRDRPOSTagger.py tag PATH-TO-TRAINED-MODEL PATH-TO-TEST-CORPUS-INITIALIZED-BY-EXTERNAL-TAGGER') 54 | print('\nExample: python ExtRDRPOSTagger.py tag ../data/initTrain.RDR ../data/initTest') 55 | print('\n#3: Find the full usage at http://rdrpostagger.sourceforge.net !') 56 | 57 | def run(args = sys.argv[1:]): 58 | if (len(args) == 0): 59 | printHelp() 60 | elif args[0].lower() == "train": 61 | try: 62 | print("\n===== Start =====") 63 | print('\nLearn a tree model of rules for POS tagging from %s and %s ' % (args[1], args[2])) 64 | rdrTree = SCRDRTreeLearner(THRESHOLD[0], THRESHOLD[1]) 65 | rdrTree.learnRDRTree(args[2], args[1]) 66 | print("\nWrite the learned tree model to file " + args[2] + ".RDR") 67 | rdrTree.writeToFile(args[2] + ".RDR") 68 | print('\nDone!') 69 | except Exception as e: 70 | print("\nERROR ==> ", e) 71 | printHelp() 72 | elif args[0].lower() == "tag": 73 | try: 74 | r = ExtRDRPOSTagger() 75 | print("\n=> Read a POS tagging model from " + args[1]) 76 | r.constructSCRDRtreeFromRDRfile(args[1]) 77 | print("\n=> Perform POS tagging on " + args[2]) 78 | r.tagInitializedCorpus(args[2]) 79 | except Exception as e: 80 | print("\nERROR ==> ", e) 81 | printHelp() 82 | else: 83 | printHelp() 84 | if __name__ == "__main__": 85 | run() 86 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/CxGProcessor/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | EMAIL_REGEX = re.compile( 3 | r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-](@|[(<{\[]at[)>}\]])(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))", 4 | flags=re.IGNORECASE | re.UNICODE, 5 | ) 6 | NUMBERS_REGEX = re.compile( 7 | r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))" 8 | ) 9 | CURRENCIES = { 10 | "$": "USD", 11 | "zł": "PLN", 12 | "£": "GBP", 13 | "¥": "JPY", 14 | "฿": "THB", 15 | "₡": "CRC", 16 | "₦": "NGN", 17 | "₩": "KRW", 18 | "₪": "ILS", 19 | "₫": "VND", 20 | "€": "EUR", 21 | "₱": "PHP", 22 | "₲": "PYG", 23 | "₴": "UAH", 24 | "₹": "INR", 25 | } 26 | CURRENCY_REGEX = re.compile( 27 | "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys())) 28 | ) 29 | PHONE_REGEX = re.compile( 30 | r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}" 31 | ) 32 | 33 | # text = "10086 yes" 34 | # res = PHONE_REGEX.sub("", text) 35 | # res1 = NUMBERS_REGEX.sub("", text) 
-------------------------------------------------------------------------------- /HyCxG/Tokenizer/ModelTokenizer.py: -------------------------------------------------------------------------------- 1 | from Tokenizer.BaseTokenizer import BasicTokenizer, WordpieceTokenizer, Tokenizer 2 | from Tokenizer.constants import * 3 | from Tokenizer.CxGProcessor.CxGCore import CxGCore 4 | from Tokenizer.Vocab import CxGVocab 5 | from transformers import AutoTokenizer 6 | 7 | # Not available in this repo 8 | class BertTokenizer(Tokenizer): 9 | def __init__(self, args): 10 | super().__init__(args, token_mode=CONST_TOKEN_MODE_WORD) 11 | self.basic_tokenizer = BasicTokenizer(do_lower_case=args.do_lower_case) 12 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=UNK_TOKEN) 13 | 14 | def tokenize(self, text) -> list: 15 | tokens, split_tokens = [], [] 16 | if isinstance(text, str): 17 | text = [text] 18 | for ele in text: 19 | split_tokens = [] 20 | for token in self.basic_tokenizer.tokenize(ele): 21 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 22 | split_tokens.append(sub_token) 23 | tokens.append(split_tokens) 24 | if len(text) == 1: 25 | return split_tokens 26 | return tokens 27 | 28 | 29 | class CxGTokenizer(object): 30 | def __init__(self, args, visible=True, workers=None, lang='eng'): 31 | self.cxg = CxGCore(args, workers=workers, lang=lang) 32 | self.bert = AutoTokenizer.from_pretrained(args.lm_path) 33 | self.cons_vocab = CxGVocab(args.cxg_vocab_path, lang=lang) 34 | self.visible = visible 35 | 36 | def tokenize(self, text, raw=True) -> dict: 37 | results = self.cxg.parse_text(text) 38 | # return results 39 | if raw: 40 | cons_pattern = [self.cons_vocab.cxg_i2c[ele] for ele in results['cons_idx']] 41 | results['cons_pattern'] = cons_pattern 42 | return results 43 | # else branch is not avalible in Github Repo -------------------------------------------------------------------------------- /HyCxG/Tokenizer/README.md: -------------------------------------------------------------------------------- 1 |
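A usage sketch for the `CxGTokenizer` defined in `ModelTokenizer.py` above. It only runs after the construction dictionaries and vocabulary described in the README below have been downloaded, and the concrete values (`bert-base-uncased`, the vocabulary path and the sample sentence) are assumptions for illustration; the `Namespace` fields are the ones actually read by `Loader` and `CxGVocab`.

```python
from argparse import Namespace
from Tokenizer.ModelTokenizer import CxGTokenizer

# Example values only; point lm_path / cxg_vocab_path at your own resources.
args = Namespace(lm_path='bert-base-uncased', lm_group='BERT',
                 do_lower_case=True, cxg_vocab_path='dataset/CxGVocab')

cxg_tok = CxGTokenizer(args, workers=None, lang='eng')
result = cxg_tok.tokenize('I would rather stay at home than go out tonight .')

# parse_text returns the wordpiece tokens plus, for every matched construction,
# its vocabulary index and wordpiece-level start/end offsets; with raw=True the
# decoded construction patterns are attached as 'cons_pattern'.
print(result['token'])
print(list(zip(result['cons_pattern'], result['cons_start'], result['cons_end'])))
```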


13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer/README_ZH.md) 17 | 18 | ## Construction dictionaries for CxGTokenizer 19 | 20 | In this repository, we provide mirror data and a download script for the CxGTokenizer dictionaries of six languages (English, French, German, Spanish, Dutch, and Turkish), based on the `c2xg` package. For specific usage guidance on CxGTokenizer, please refer to [`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials). 21 | 22 | ### Download data 23 | You can use the following command to download the data (Note: all parameters have default values, so you may omit them): 24 | ```shell 25 | bash download_cxgdict.sh [--LANGUAGES] 26 | ``` 27 | **Parameters:** 28 | + LANGUAGES: The abbreviation of the required languages. If you want to download all languages, use `all` for the `LANGUAGES` parameter. If you only want to download some of the languages, include their abbreviations in the parameter according to the following mapping: 29 | 1. English: `eng` 30 | 2. French: `fra` 31 | 3. German: `deu` 32 | 4. Spanish: `spa` 33 | 5. Dutch: `nld` 34 | 6. Turkish: `tur` 35 | 36 | **Note:** The CxG dictionaries for the different languages will be downloaded to the `CxGProcessor/data` folder. 37 | 38 | ### Resource of data 39 | Our mirror data is obtained from the `c2xg` package; the original data source is [c2xg - dictionaries](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/dictionaries). If you have also used these construction grammar dictionaries, you can cite their paper as follows: 40 | ``` 41 | @article{dunn2017computational, 42 | title={Computational learning of construction grammars}, 43 | author={Dunn, Jonathan}, 44 | journal={Language and cognition}, 45 | volume={9}, 46 | number={2}, 47 | pages={254--292}, 48 | year={2017}, 49 | publisher={Cambridge University Press} 50 | } 51 | ``` 52 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/README_ZH.md: -------------------------------------------------------------------------------- 1 |
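As a quick sanity check after running `download_cxgdict.sh`, the expected files for a language can be verified with a short, illustrative script (not part of the repository); file names follow the download script, and the snippet assumes it is run from the `HyCxG/Tokenizer` directory.

```python
# Illustrative sketch, not part of the repository: verify that the dictionaries
# for one language were downloaded by download_cxgdict.sh.
import os

lang = 'eng'  # or 'fra', 'deu', 'spa', 'nld', 'tur'
data_dir = 'CxGProcessor/data'
for suffix in ('.DICT', '.RDR', '.clusters.fastText.v2.gz'):
    path = os.path.join(data_dir, lang + suffix)
    print(path, 'OK' if os.path.exists(path) else 'missing')
```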


13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer/README_ZH.md) 16 | ## CxGTokenizer所需的构式语法词表数据 17 | 18 | 在本仓库中,我们基于c2xg包提供了六种语言(英语、法语、德语、西班牙语、荷兰语以及土耳其语)的构式语法词表的镜像数据以及下载脚本。关于具体CxGTokenizer的使用指导,请参考[`tutorials`](https://github.com/xlxwalex/HyCxG/tree/main/tutorials)。 19 | 20 | ### 数据下载 21 | 您可以用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 22 | ```shell 23 | bash download_cxgdict.sh [--LANGUAGES] 24 | ``` 25 | **参数含义:** 26 | + LANGUAGES: 所需语言的简称,如果需要下载全部语言,使用`all`即可。若只希望下载部分语言,请按照以下对应关系在参数中加入所需语言简称: 27 | 1. 英语:`eng` 28 | 2. 法语:`fra` 29 | 3. 德语:`deu` 30 | 4. 西班牙语:`spa` 31 | 5. 荷兰语:`nld` 32 | 6. 土耳其语:`tur` 33 | 34 | **注意:** 不同语言对应的构式语法词表会被下载到本目录下的`CxGProcessor/data`文件夹中 35 | 36 | ### 数据来源 37 | 本部分数据来自c2xg,我们的镜像数据来源[c2xg - dictionaries](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/dictionaries),如果您也使用了该语法表,您可以引用他们的论文: 38 | ``` 39 | @article{dunn2017computational, 40 | title={Computational learning of construction grammars}, 41 | author={Dunn, Jonathan}, 42 | journal={Language and cognition}, 43 | volume={9}, 44 | number={2}, 45 | pages={254--292}, 46 | year={2017}, 47 | publisher={Cambridge University Press} 48 | } 49 | ``` 50 | -------------------------------------------------------------------------------- /HyCxG/Tokenizer/Vocab.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class Vocab(object): 4 | def __init__(self): 5 | # Reserved Vocabulary 6 | self.reserved_vocab_path = os.path.abspath( 7 | os.path.join(os.path.dirname(__file__), "../../models/reserved_vocab.txt")) 8 | 9 | def load(self, vocab_path): 10 | raise NotImplementedError 11 | 12 | def save(self, save_path): 13 | raise NotImplementedError 14 | 15 | class BERTVocab(Vocab): 16 | """ 17 | Vocabulary of BERT Tokenizer 18 | """ 19 | 20 | def __init__(self, vocab_path='dataset/Vocab/BERT/'): 21 | super().__init__() 22 | self.word_w2i = {} 23 | self.word_i2w = [] 24 | self.word_w2c = {} 25 | self.load(vocab_path) 26 | 27 | def load(self, vocab_path): 28 | with open(os.path.join(vocab_path, 'vocab.txt'), mode="r", encoding="utf-8") as reader: 29 | for index, line in enumerate(reader): 30 | w = line.strip("\n").split()[0] if line.strip() else line.strip("\n") 31 | self.word_w2i[w] = index 32 | self.word_i2w.append(w) 33 | 34 | def save(self, save_path): 35 | print("Word Vocabulary size: ", len(self)) 36 | with open(save_path, mode="w", encoding="utf-8") as f: 37 | for w in self.word_i2w: 38 | f.write(w + "\n") 39 | print("Word Vocabulary saving done.") 40 | 41 | def get(self, w): 42 | return self.word_w2i[w] 43 | 44 | def __len__(self): 45 | return len(self.word_i2w) 46 | 47 | 48 | class CxGVocab(Vocab): 49 | """ 50 | Vocabulary of CxGBERT Tokenizer 51 | """ 52 | def __init__(self, vocab_path = "../dataset/CxGBERT/", lang='eng'): 53 | super().__init__() 54 | # Externel Construction Vocabulary 55 | self.cxg_c2i = {} 56 | self.cxg_i2c = [] 57 | self.cxg_c2c = {} 58 | self.lang = lang 59 | self.load(vocab_path) 60 | 61 | def __len__(self): 62 | return len(self.cxg_i2c) 63 | 64 | def load(self, vocab_path): 65 | with open(os.path.join(vocab_path, 'construction.txt' if self.lang == 'eng' else '{}.construction.txt'.format(self.lang)), mode="r", encoding="utf-8") as reader: 66 | self.cxg_c2i[''] = 0 67 | self.cxg_i2c.append('') 68 | for index, line in enumerate(reader): 69 | w = line.strip("\n").replace(' ', '').split()[0] if line.strip() else line.strip("\n") 70 | 
self.cxg_c2i[w] = index + 1 71 | self.cxg_i2c.append(w) 72 | 73 | def get(self, w): 74 | return self.cxg_c2i[w] -------------------------------------------------------------------------------- /HyCxG/Tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .ModelTokenizer import CxGTokenizer, BertTokenizer 2 | 3 | __all__ = ['CxGTokenizer', 'BertTokenizer'] -------------------------------------------------------------------------------- /HyCxG/Tokenizer/constants.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/special_tokens_map.json")), mode="r", encoding="utf-8") as f: 5 | special_tokens_map = json.load(f) 6 | 7 | UNK_TOKEN = special_tokens_map["unk_token"] 8 | CLS_TOKEN = special_tokens_map["cls_token"] 9 | SEP_TOKEN = special_tokens_map["sep_token"] 10 | PAD_TOKEN = special_tokens_map["pad_token"] 11 | 12 | CONST_TOKEN_MODE_WORD = 'WORD' 13 | CONST_TOKEN_MODE_CXG = 'CXG' -------------------------------------------------------------------------------- /HyCxG/Tokenizer/download_cxgdict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUT_PATH=CxGProcessor/data 4 | # (English, Turkish, Dutch, Spanish, French, German) 5 | LANGUAGE_1=$1 6 | LANGUAGE_2=$2 7 | LANGUAGE_3=$3 8 | LANGUAGE_4=$4 9 | LANGUAGE_5=$5 10 | LANGUAGE_6=$6 11 | 12 | if [ -d "$OUT_PATH" ]; then 13 | echo "$OUT_PATH folder exists, pass" 14 | else 15 | echo "$OUT_PATH folder does not exist, please check" 16 | set -e 17 | fi 18 | 19 | if test -z "$LANGUAGE_1" 20 | then 21 | LANGUAGE_1=all 22 | fi 23 | 24 | echo "Downloading construction grammar dictionaries" 25 | echo "Original data for construction grammar dictionaries of languages can be found in (c2xg package) https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/dictionaries" 26 | 27 | for LANG in $LANGUAGE_1 $LANGUAGE_2 $LANGUAGE_3 $LANGUAGE_4 $LANGUAGE_5 $LANGUAGE_6 28 | do 29 | # English 30 | if [[ "$LANG" == "eng" ]] || [[ "$LANG" == "all" ]];then 31 | wget -O $OUT_PATH/eng.DICT https://expic.xlxw.org/hycxg/cxgdict/eng.DICT 32 | wget -O $OUT_PATH/eng.RDR https://expic.xlxw.org/hycxg/cxgdict/eng.RDR 33 | wget -O $OUT_PATH/eng.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/eng.clusters.fastText.v2.gz 34 | fi 35 | # Tukrish 36 | if [[ "$LANG" == "tur" ]] || [[ "$LANG" == "all" ]];then 37 | wget -O $OUT_PATH/tur.DICT https://expic.xlxw.org/hycxg/cxgdict/tur.DICT 38 | wget -O $OUT_PATH/tur.RDR https://expic.xlxw.org/hycxg/cxgdict/tur.RDR 39 | wget -O $OUT_PATH/tur.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/tur.clusters.fastText.v2.gz 40 | fi 41 | # French 42 | if [[ "$LANG" == "fra" ]] || [[ "$LANG" == "all" ]];then 43 | wget -O $OUT_PATH/fra.DICT https://expic.xlxw.org/hycxg/cxgdict/fra.DICT 44 | wget -O $OUT_PATH/fra.RDR https://expic.xlxw.org/hycxg/cxgdict/fra.RDR 45 | wget -O $OUT_PATH/fra.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/fra.clusters.fastText.v2.gz 46 | fi 47 | # Spanish 48 | if [[ "$LANG" == "spa" ]] || [[ "$LANG" == "all" ]];then 49 | wget -O $OUT_PATH/spa.DICT https://expic.xlxw.org/hycxg/cxgdict/spa.DICT 50 | wget -O $OUT_PATH/spa.RDR https://expic.xlxw.org/hycxg/cxgdict/spa.RDR 51 | wget -O $OUT_PATH/spa.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/spa.clusters.fastText.v2.gz 52 | fi 53 | # German 54 | if [[ 
"$LANG" == "deu" ]] || [[ "$LANG" == "all" ]];then 55 | wget -O $OUT_PATH/deu.DICT https://expic.xlxw.org/hycxg/cxgdict/deu.DICT 56 | wget -O $OUT_PATH/deu.RDR https://expic.xlxw.org/hycxg/cxgdict/deu.RDR 57 | wget -O $OUT_PATH/deu.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/deu.clusters.fastText.v2.gz 58 | fi 59 | # Dutch 60 | if [[ "$LANG" == "nld" ]] || [[ "$LANG" == "all" ]];then 61 | wget -O $OUT_PATH/nld.DICT https://expic.xlxw.org/hycxg/cxgdict/nld.DICT 62 | wget -O $OUT_PATH/nld.RDR https://expic.xlxw.org/hycxg/cxgdict/nld.RDR 63 | wget -O $OUT_PATH/nld.clusters.fastText.v2.gz https://expic.xlxw.org/hycxg/cxgdict/nld.clusters.fastText.v2.gz 64 | fi 65 | done 66 | 67 | echo "The CxG vocab of languages are stored in $OUT_PATH" 68 | -------------------------------------------------------------------------------- /HyCxG/Trainer/Trainer.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | 3 | class Trainer(object): 4 | def __init__(self, args, model, criterion, optimizer, device, checkp, scheduler = None): 5 | super(Trainer, self).__init__() 6 | self.args = args 7 | # Training Component 8 | self.model = model 9 | self.criterion = criterion 10 | self.optimizer = optimizer 11 | self.scheduler = scheduler 12 | self.device = device 13 | # Training Params 14 | self.checkpoint = checkp 15 | self.epoch = 0 16 | self.step = 0 17 | self.best = - float('inf') 18 | self.eval_inform = [] 19 | 20 | # Train Model 21 | def train(self, Trainset : DataLoader, Validset : DataLoader): 22 | raise NotImplementedError 23 | 24 | # Valid Model 25 | def valid(self, Validset : DataLoader): 26 | raise NotImplementedError 27 | 28 | # Test Model 29 | def test(self, Testset: DataLoader): 30 | raise NotImplementedError 31 | 32 | # Generate Checkpoints 33 | def _generate_checkp(self) -> dict: 34 | checkpoints = { 35 | 'model': self.model.state_dict(), 36 | 'optim': self.optimizer, 37 | } 38 | return checkpoints -------------------------------------------------------------------------------- /HyCxG/Trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from Trainer.HyCxGTrainerABSA import HyCxGTrainerABSA 2 | from Trainer.HyCxGTrainerGLUE import HyCxGTrainerGLUE 3 | 4 | __all__ = ['HyCxGTrainerABSA', 'HyCxGTrainerGLUE'] -------------------------------------------------------------------------------- /HyCxG/config.py: -------------------------------------------------------------------------------- 1 | from utils.argument import ArgumentGroup 2 | import argparse 3 | 4 | def parse_args(): 5 | parser = argparse.ArgumentParser(description='HyCxG Model Parameters Setting') 6 | # Base Params 7 | base_args = ArgumentGroup(parser, 'base', 'Base Settings') 8 | base_args.add_arg('mode', str, 'train', 'Experiment Mode') 9 | base_args.add_arg('cuda', bool, True, 'CUDA device') 10 | base_args.add_arg('gpu_id', int, 0, 'GPU ID, 0 for cuda:0') 11 | base_args.add_arg('seed', int, 0, 'Global Random Seed') 12 | base_args.add_arg('checkpoints', str, 'checkpoints/', 'Checkpoint Path') 13 | base_args.add_arg('checkp', str, 'hyper_cxg_mams/', 'Checkpoint Dir') 14 | 15 | # Dataset Params 16 | data_args = ArgumentGroup(parser, 'dataset', 'Dataset Settings') 17 | data_args.add_arg('data_name', str, 'JSONABSA_MAMS', 'Name of Dataset') 18 | data_args.add_arg('num_workers', int, 64, 'Number of workers to solve coverage problem') 19 | data_args.add_arg('t_minutes', float, 0.05, 'Cost of time to solve 
the coverage problem per instance (minutes)') 20 | data_args.add_arg('test_outpath', str, 'result_test.csv', 'Output test results for analysis') 21 | 22 | # Model Params 23 | model_args = ArgumentGroup(parser, 'model', 'Model Settings') 24 | model_args.add_arg('num_classes', int, 3, 'Number of classes or each task') 25 | model_args.add_arg('padding_size', int, 150, 'Padding size Of PLM Model') 26 | model_args.add_arg('padding_val', int, 0, 'Padding value of PLM Model') 27 | model_args.add_arg('lm_dropout', float, 0.0, 'Dropout for PLM model') 28 | model_args.add_arg('hg_dropout', float, 0.4, 'Dropout for R-HGAT network') 29 | model_args.add_arg('hg_inter_dim', int, 384, 'Size of representations for transform hgatt network') 30 | model_args.add_arg('hg_layers', int, 1, 'Number of layers for R-HGAT network') 31 | model_args.add_arg('inter_size', int, 3072, 'Size of middle representations in FFN module') 32 | model_args.add_arg('leaky_alpha', float, 0.2, 'Alpha setting for leaky_relu in R-HGAT network') 33 | model_args.add_arg('edge_trans', bool, True, 'Transform hyperedge embedding (construction)') 34 | model_args.add_arg('remove_layernorm', bool, False, 'Remove layernorm for the embedding of cxg') 35 | # If enable multi-head R-HGAT (Not available in the repo) 36 | model_args.add_arg('heads_num', int, 12, 'Head num of hyper graph attention') 37 | # If enable syntactic graph (Not available in the repo) 38 | model_args.add_arg('parse_syntax', bool, False, 'Whether to inject syntax inform') 39 | model_args.add_arg('parse_direct', bool, False, 'Whether to construct direct graph') 40 | 41 | # Tokenizer and CxG Processor Params 42 | tokenizer_args = ArgumentGroup(parser, 'tokenizer', 'Tokenizer Settings') 43 | tokenizer_args.add_arg('word_vocab_path', str, 'dataset/Vocab/BERT/', 'LM Vocab path') 44 | tokenizer_args.add_arg('cxg_vocab_path', str, 'dataset/Vocab/CxG/', 'LM Vocab path') 45 | tokenizer_args.add_arg('do_lower_case', bool, True, 'Lower case the elememts') 46 | 47 | # Pre-trained Model Params 48 | pretrained_args = ArgumentGroup(parser, 'pretrained', 'Pre-trained Model Settings') 49 | pretrained_args.add_arg('lm_group', str, 'BERT', 'Pre-trained language model group, e.g., BERT/RoBERTa') 50 | pretrained_args.add_arg('use_lm', bool, True, 'Whether Model Use pre-trained language models') 51 | pretrained_args.add_arg('lm_path', str, 'bert-base-uncased', 'Pre-trained model path') 52 | pretrained_args.add_arg('lm_hidden_size', int, 768, 'HiddenSize of PLM') 53 | pretrained_args.add_arg('output_hidden_states', bool, True, 'Output PLM hidden states at token level') 54 | pretrained_args.add_arg('finetune', bool, True, 'Finetune Or freeze PLM') 55 | 56 | # Training Params 57 | train_args = ArgumentGroup(parser, 'train', 'Training Settings') 58 | train_args.add_arg('batch_size', int, 32, 'Batch size for training, depending on the memory size of your GPU') 59 | train_args.add_arg('shuffle', bool, True, 'DataLoader shuffle params, should be True when training') 60 | train_args.add_arg('droplast', bool, False, 'Whether to drop rest data for dataloader') 61 | train_args.add_arg('lr', float, 2e-5, 'Learning rate') 62 | train_args.add_arg('wd', float, 1e-2, 'Weight decay') 63 | train_args.add_arg('max_grad_norm', float, 1.0, 'Gradient clipping') 64 | train_args.add_arg('num_epoch', int, 50, 'Epoch param') 65 | train_args.add_arg('warmup_ratio', int, 0.06, 'Warm Up Steps Phase') 66 | train_args.add_arg('print_step', int, 5, 'Training Print Steps') 67 | train_args.add_arg('eval_step', int, 50, 'Evaluating 
Steps') 68 | train_args.add_arg('scheduler', bool, False, 'Whether to apply scheduler for training') 69 | 70 | args = parser.parse_args() 71 | return args 72 | 73 | -------------------------------------------------------------------------------- /HyCxG/dataset/README.md: -------------------------------------------------------------------------------- 1 |
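Note that `ArgumentGroup` (defined in `utils/argument.py`) routes `bool` options through `str2bool`, so the boolean flags declared in `config.py` above are passed as explicit string values rather than `store_true` switches. A small, hypothetical sketch of the resulting behaviour (not part of the repository), run from the `HyCxG/` directory:

```python
# Hypothetical sketch; not part of the repository.
import sys
from config import parse_args

# Boolean options take explicit values ("true"/"false"), e.g. to disable CUDA:
sys.argv = ['train_hycxg.py', '--mode', 'train', '--cuda', 'false', '--num_classes', '3']
args = parse_args()
print(args.cuda, args.num_classes, args.lr)   # -> False 3 2e-05
```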


13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset/README_ZH.md) 17 | 18 | ## Construction grammar list 19 | 20 | In this repository, we provide mirror data and a download script for construction grammar lists in six languages (English, French, German, Spanish, Dutch, and Turkish), based on the `c2xg` package. 21 | 22 | ### Download data 23 | You can use the following command to download the data (Note: all parameters have default values, so you may omit them): 24 | ```shell 25 | bash download_vocab.sh [--LANGUAGES] 26 | ``` 27 | **Parameters:** 28 | + LANGUAGES: The abbreviation of the required languages. If you want to download all languages, use `all` for the `LANGUAGES` parameter. If you only want to download some of the languages, include their abbreviations in the parameter according to the following mapping: 29 | 1. English: `eng` 30 | 2. French: `fra` 31 | 3. German: `deu` 32 | 4. Spanish: `spa` 33 | 5. Dutch: `nld` 34 | 6. Turkish: `tur` 35 | 36 | **Note:** The construction grammar lists for the different languages will be downloaded to the `CxG` folder in the current directory. 37 | 38 | ### Resource of data 39 | Our mirror data is obtained from the `c2xg` package; the original data source is [c2xg - data](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/models). If you have also used these construction grammar lists, you can cite their paper as follows: 40 | ``` 41 | @article{dunn2017computational, 42 | title={Computational learning of construction grammars}, 43 | author={Dunn, Jonathan}, 44 | journal={Language and cognition}, 45 | volume={9}, 46 | number={2}, 47 | pages={254--292}, 48 | year={2017}, 49 | publisher={Cambridge University Press} 50 | } 51 | ``` 52 | -------------------------------------------------------------------------------- /HyCxG/dataset/README_ZH.md: -------------------------------------------------------------------------------- 1 |
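Once downloaded, the construction list can be loaded with `CxGVocab` from `Tokenizer/Vocab.py`. The following is an illustrative sketch (not part of the repository), assuming `HyCxG/` is on `PYTHONPATH` and `download_vocab.sh` has been run:

```python
# Illustrative sketch; not part of the repository.
from Tokenizer.Vocab import CxGVocab

vocab = CxGVocab('dataset/Vocab/CxG/', lang='eng')  # reads construction.txt for English
print(len(vocab))                   # number of constructions (index 0 is a reserved special token)
print(vocab.cxg_i2c[1:4])           # a few construction patterns
print(vocab.get(vocab.cxg_i2c[1]))  # pattern -> index lookup, prints 1
```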


13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset/README_ZH.md) 16 | ## 构式语法表数据 17 | 18 | 在本仓库中,我们基于c2xg包提供了六种语言(英语、法语、德语、西班牙语、荷兰语以及土耳其语)的构式语法列表的镜像数据以及下载脚本。 19 | 20 | ### 数据下载 21 | 您可以用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 22 | ```shell 23 | bash download_vocab.sh [--LANGUAGES] 24 | ``` 25 | **参数含义:** 26 | + LANGUAGES: 所需语言的简称,如果需要下载全部语言,使用`all`即可。若只希望下载部分语言,请按照以下对应关系在参数中加入所需语言简称: 27 | 1. 英语:`eng` 28 | 2. 法语:`fra` 29 | 3. 德语:`deu` 30 | 4. 西班牙语:`spa` 31 | 5. 荷兰语:`nld` 32 | 6. 土耳其语:`tur` 33 | 34 | **注意:** 不同语言对应的构式语法列表会下载到本目录下的`CxG`文件夹中 35 | 36 | ### 数据来源 37 | 本部分数据来自c2xg,我们的镜像数据来源[c2xg - data](https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/models),如果您也使用了该语法表,您可以引用他们的论文: 38 | ``` 39 | @article{dunn2017computational, 40 | title={Computational learning of construction grammars}, 41 | author={Dunn, Jonathan}, 42 | journal={Language and cognition}, 43 | volume={9}, 44 | number={2}, 45 | pages={254--292}, 46 | year={2017}, 47 | publisher={Cambridge University Press} 48 | } 49 | ``` 50 | -------------------------------------------------------------------------------- /HyCxG/dataset/download_vocab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DETECT_FLAG=dataset/Vocab 4 | # (English, Turkish, Dutch, Spanish, French, German) 5 | LANGUAGE_1=$1 6 | LANGUAGE_2=$2 7 | LANGUAGE_3=$3 8 | LANGUAGE_4=$4 9 | LANGUAGE_5=$5 10 | LANGUAGE_6=$6 11 | 12 | if [ -d "$DETECT_FLAG" ]; then 13 | OUT_PATH=$DETECT_FLAG 14 | else 15 | OUT_PATH='Vocab' 16 | fi 17 | 18 | if [ -d "$OUT_PATH" ]; then 19 | echo "$OUT_PATH folder exists, pass" 20 | else 21 | echo "$OUT_PATH folder does not exist, try to mkdir" 22 | mkdir "$OUT_PATH" 23 | fi 24 | 25 | VOCAB_DIR=$OUT_PATH/CxG 26 | 27 | if [ -d "$VOCAB_DIR" ]; then 28 | echo "$VOCAB_DIR folder exists, pass" 29 | else 30 | echo "$VOCAB_DIR folder does not exist, try to mkdir" 31 | mkdir "$VOCAB_DIR" 32 | fi 33 | 34 | if test -z "$LANGUAGE_1" 35 | then 36 | LANGUAGE_1=all 37 | fi 38 | 39 | echo "Downloading special_tokens_map.json" 40 | wget -O $OUT_PATH/special_tokens_map.json https://expic.xlxw.org/hycxg/cxgvocab/special_tokens_map.json 41 | 42 | echo "Downloading construction grammar list" 43 | echo "Original data for construction grammar list of languages can be found in (c2xg package) https://github.com/jonathandunn/c2xg/tree/master/c2xg/data/models" 44 | 45 | for LANG in $LANGUAGE_1 $LANGUAGE_2 $LANGUAGE_3 $LANGUAGE_4 $LANGUAGE_5 $LANGUAGE_6 46 | do 47 | # English 48 | if [[ "$LANG" == "eng" ]] || [[ "$LANG" == "all" ]];then 49 | wget -O $VOCAB_DIR/construction.txt https://expic.xlxw.org/hycxg/cxgvocab/construction.txt 50 | wget -O $VOCAB_DIR/construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/construction.pkl 51 | fi 52 | # Tukrish 53 | if [[ "$LANG" == "tur" ]] || [[ "$LANG" == "all" ]];then 54 | wget -O $VOCAB_DIR/tur.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/tur.construction.txt 55 | wget -O $VOCAB_DIR/tur.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/tur.construction.pkl 56 | fi 57 | # French 58 | if [[ "$LANG" == "fra" ]] || [[ "$LANG" == "all" ]];then 59 | wget -O $VOCAB_DIR/fra.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/fra.construction.txt 60 | wget -O $VOCAB_DIR/fra.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/fra.construction.pkl 61 | fi 62 | # Spanish 63 | if [[ "$LANG" == "spa" ]] || [[ "$LANG" == "all" ]];then 64 | wget -O 
$VOCAB_DIR/spa.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/spa.construction.txt 65 | wget -O $VOCAB_DIR/spa.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/spa.construction.pkl 66 | fi 67 | # German 68 | if [[ "$LANG" == "deu" ]] || [[ "$LANG" == "all" ]];then 69 | wget -O $VOCAB_DIR/deu.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/deu.construction.txt 70 | wget -O $VOCAB_DIR/deu.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/deu.construction.pkl 71 | fi 72 | # Dutch 73 | if [[ "$LANG" == "nld" ]] || [[ "$LANG" == "all" ]];then 74 | wget -O $VOCAB_DIR/nld.construction.txt https://expic.xlxw.org/hycxg/cxgvocab/nld.construction.txt 75 | wget -O $VOCAB_DIR/nld.construction.pkl https://expic.xlxw.org/hycxg/cxgvocab/nld.construction.pkl 76 | fi 77 | done 78 | 79 | echo "The CxG vocab of languages are stored in $OUT_PATH" 80 | -------------------------------------------------------------------------------- /HyCxG/run_hycxg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2023 The ZJU MMF Authors (Lvxiaowei Xu, Jianwang Wu, Jiawei Peng, Zhilin Gong, Ming Cai * and Tianxiang Wang). 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Train HyCxG 17 | # Global Variable (!!! SHOULD ADAPT TO YOUR CONFIGURATION !!!) 18 | PLM_PATH_ABBR=bert-base-uncased 19 | NUM_WORKERS=1 # Recommand: The bigger the better, if possible. 
20 | 21 | # ABSA 22 | python train_hycxg.py --mode train --checkp hyper_cxg_mams \ 23 | --data_name JSONABSA_MAMS --num_classes 3 --num_workers $NUM_WORKERS \ 24 | --lm_path $PLM_PATH_ABBR --lm_group BERT 25 | 26 | # GLUE - Base model 27 | # Note: for STS task, the num_classes need to be set to 1 28 | python train_hycxg.py --mode train --checkp hyper_cxg_mrpc \ 29 | --data_name JSONGLUE_MRPC --num_classes 2 --num_workers $NUM_WORKERS \ 30 | --lm_path $PLM_PATH_ABBR --lm_group RoBERTa 31 | 32 | # GLUE - Large model 33 | python train_hycxg.py --mode train --checkp hyper_cxg_mrpc \ 34 | --data_name JSONGLUE_MRPC --num_classes 2 --num_workers $NUM_WORKERS \ 35 | --lm_hidden_size 1024 --inter_size 4096 \ 36 | --lm_path $PLM_PATH_ABBR --lm_group RoBERTa -------------------------------------------------------------------------------- /HyCxG/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from utils.misc import _get_device as get_device, set_seed, print_config as arg_show, cal4scheduler 2 | from utils.data import read_dataset, collate_hypercxg_aspect, collate_hypercxg_glue, output_results, tokenize_aspect, tokenize_glue, construct_dependency_graph, pair_hypocxg, reconstruct_sentence, calculate_cxg_size 3 | from utils.define import * 4 | from utils.coverage import cxg_max_coverage 5 | from utils.hypergraph import construct_graph 6 | from utils.operates import _padding as padding, _save_model as save_model, _attention_mask as attention_mask, _pad_adj as pad_adj 7 | from utils.metric import Metric 8 | from utils.argument import Args_trans 9 | from utils.optimizers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, \ 10 | get_cosine_with_hard_restarts_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, \ 11 | get_constant_schedule, get_constant_schedule_with_warmup 12 | 13 | __all__ = ['get_device', 'set_seed', 'arg_show', 'cal4scheduler', 14 | 'read_dataset', 'collate_hypercxg_aspect', 'collate_hypercxg_glue', 'output_results', 'tokenize_aspect', 'tokenize_glue', 'pair_hypocxg', 'reconstruct_sentence', 'calculate_cxg_size', 15 | 'DATASET_MAP', 'ABSA_POLARITY_MAP', 16 | 'cxg_max_coverage', 'construct_graph', 17 | 'padding', 'save_model', 'attention_mask', 'pad_adj', 18 | 'Metric', 'Args_trans', 19 | 'get_linear_schedule_with_warmup', 'get_cosine_schedule_with_warmup', 'get_cosine_with_hard_restarts_schedule_with_warmup', 20 | 'get_cosine_with_hard_restarts_schedule_with_warmup', 'get_polynomial_decay_schedule_with_warmup', 'get_constant_schedule', 'get_constant_schedule_with_warmup'] -------------------------------------------------------------------------------- /HyCxG/utils/argument.py: -------------------------------------------------------------------------------- 1 | def str2bool(v): 2 | return v.lower() in ("true", "t", "1") 3 | 4 | class ArgumentGroup(object): 5 | def __init__(self, parser, title, des): 6 | self._group = parser.add_argument_group(title=title, description=des) 7 | 8 | def add_arg(self, name, type, default, help, **kwargs): 9 | type = str2bool if type == bool else type 10 | self._group.add_argument( 11 | "--" + name, 12 | default=default, 13 | type=type, 14 | help=help + ' Default: %(default)s.', 15 | **kwargs) 16 | 17 | class Args_trans(object): 18 | def __init__(self, args): 19 | self.args = args 20 | for k in args: 21 | if type(args[k]) == str: 22 | exec("self." + k + "='%s'" % args[k]) 23 | elif type(args[k]) == int: 24 | exec("self." 
+ k + "=%d" % args[k]) 25 | elif type(args[k]) == float: 26 | exec("self." + k + "=%f" % args[k]) 27 | elif type(args[k]) == bool: 28 | if args[k]: 29 | exec("self." + k + "=True") 30 | else: 31 | exec("self." + k + "=False") 32 | 33 | def __str__(self): 34 | for var in self.__dict__: 35 | print('>> {} : {}'.format(var, self.__dict__[var])) -------------------------------------------------------------------------------- /HyCxG/utils/coverage.py: -------------------------------------------------------------------------------- 1 | from Simuann import CxGCoverage 2 | 3 | def init_solver_state(cxg_length : int) -> list: 4 | init_state = [0] * cxg_length 5 | return init_state 6 | 7 | def _unpack(cxg_dict : dict) -> dict: 8 | starts, ends, patterns = [], [], [] 9 | for cxg in cxg_dict: 10 | starts.append(cxg_dict[cxg][0]) 11 | ends.append(cxg_dict[cxg][1]) 12 | patterns.append(cxg) 13 | return {'patterns' : patterns, 'starts' : starts, 'ends': ends} 14 | 15 | def pre_detect(cons_pos : list) -> bool: 16 | if len(cons_pos) < 2: 17 | return False 18 | else: 19 | coverage = [] 20 | for cons in cons_pos: coverage.extend(list(range(cons[0], cons[1]))) 21 | if len(coverage) - len(set(coverage)) == 0: return False 22 | else: 23 | return True 24 | 25 | def construct_dict(cons_pos: list) -> dict: 26 | # To avoid dup in the origin way of "dict([[ele[3], (ele[0], ele[1])] for ele in cons_pos])" 27 | cxg_dict, cxg_counter = {}, {} 28 | for cons in cons_pos: 29 | if cons[3] in cxg_dict: 30 | cxg_dict[cons[3] + '--[{}]'.format(cxg_counter[cons[3]] + 1)] = (cons[0], cons[1]) 31 | cxg_counter[cons[3]] += 1 32 | else: 33 | cxg_dict[cons[3]] = (cons[0], cons[1]) 34 | cxg_counter[cons[3]] = 1 35 | return cxg_dict 36 | 37 | def cxg_max_coverage(starts, ends, indexs, cxgs, T_minutes = 0.05) -> list: 38 | cons_pos = list(zip(starts, ends, indexs, cxgs)) 39 | cons_pos.sort(key=lambda x : x[0]) 40 | # PRE-FILTER 41 | flag = pre_detect(cons_pos) 42 | if not flag: return cons_pos 43 | cxg_dict = construct_dict(cons_pos) 44 | init_state = init_solver_state(len(cxgs)) 45 | solver = CxGCoverage(init_state, **_unpack(cxg_dict)) 46 | solver.set_schedule(solver.auto(minutes=T_minutes)) 47 | state, _ = solver.anneal() 48 | results = [cons_pos[ids] for ids in range(len(state)) if state[ids] == 1] 49 | return results -------------------------------------------------------------------------------- /HyCxG/utils/hypergraph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse as sp 3 | 4 | def construct_graph(cxg, mask, pad_len : int = 150): 5 | assert len(cxg) == len(mask) 6 | HT, edges = [], [] 7 | max_n_edge = max([len(item) + 1 for item in cxg]) 8 | for idx in range(len(cxg)): 9 | bs_cxg = cxg[idx] 10 | 11 | rows = [] 12 | cols = [] 13 | vals = [] 14 | 15 | edge_labels = [] 16 | for edge in range(len(bs_cxg)): 17 | start = bs_cxg[edge][0] + 1 # 1 - CLS 18 | end = bs_cxg[edge][1] + 1 # 1 - CLS 19 | edge_label = bs_cxg[edge][2] # CxG Index 20 | edge_labels.append(edge_label) 21 | for node in range(start, end): 22 | rows.append(node) 23 | cols.append(edge) 24 | vals.append(1.0) 25 | 26 | # FULLY RELATION 27 | # Not available in this repo 28 | try: 29 | u_H = sp.coo_matrix((vals, (rows, cols)), shape=(pad_len, max_n_edge)) 30 | HT.append(np.asarray(u_H.T.todense())) 31 | except: 32 | u_H = np.zeros((pad_len, max_n_edge), dtype=np.float) 33 | HT.append(u_H.T) 34 | edges.append(edge_labels + [0] * (max_n_edge - len(edge_labels))) 35 | return HT, edges 36 | 37 
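To make the incidence construction in `construct_graph` above concrete, here is a small, hypothetical example (not part of the repository). Spans are `(token_start, token_end, cxg_index)` tuples and are shifted by one inside the function to account for the `[CLS]` token; the snippet assumes `HyCxG/` is on `PYTHONPATH`.

```python
# Hypothetical example; not part of the repository.
from utils.hypergraph import construct_graph

cxg  = [[(0, 3, 17), (2, 5, 42)]]   # one instance, two hyperedges (constructions)
mask = [[1, 1, 1, 1, 1]]            # only the per-instance length is checked
HT, edges = construct_graph(cxg, mask, pad_len=10)
print(HT[0].shape)   # transposed incidence matrix of shape (max_n_edge, pad_len), here (3, 10)
print(edges[0])      # construction ids padded with 0 up to max_n_edge: [17, 42, 0]
```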
| -------------------------------------------------------------------------------- /HyCxG/utils/metric.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, matthews_corrcoef 2 | import numpy as np 3 | from scipy.stats import pearsonr 4 | 5 | class Metric(object): 6 | def __init__(self, args): 7 | self.args = args 8 | 9 | def __call__(self, preds : np.ndarray, truth : np.ndarray, sets_name = 'ABSA'): 10 | if sets_name not in ['STS']: 11 | accuracy = accuracy_score(truth, preds) 12 | precision = precision_score(truth, preds, average='macro') 13 | recall = recall_score(truth, preds, average='macro') 14 | f1score = f1_score(truth, preds, average='macro') 15 | return accuracy, precision, recall, f1score 16 | else: 17 | pearson = pearsonr(truth, preds)[0] 18 | return pearson 19 | 20 | def report(self, preds : np.ndarray, truth : np.ndarray, digit : int = 5): 21 | print(classification_report(truth, preds, digits=digit)) 22 | 23 | def print_matthew(self, preds : np.ndarray, truth : np.ndarray): 24 | matt = matthews_corrcoef(truth, preds) * 100 25 | print('CoLA Matthews_coef = {:.3f}'.format(matt)) 26 | return matt -------------------------------------------------------------------------------- /HyCxG/utils/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import random 4 | from argparse import Namespace 5 | 6 | # Device 7 | def _get_device(cuda : bool, gpu_id : int = 0) -> torch.device: 8 | gpu_count = torch.cuda.device_count() 9 | if torch.cuda.is_available() and gpu_id < gpu_count: 10 | device = torch.device("cuda:" + str(gpu_id) if cuda else "cpu") 11 | else: 12 | device = torch.device("cpu") 13 | return device 14 | 15 | # Seed 16 | def set_seed(args): 17 | args.seed = int(args.seed) 18 | torch.manual_seed(args.seed) 19 | np.random.seed(args.seed) 20 | random.seed(args.seed) 21 | torch.cuda.manual_seed(args.seed) 22 | 23 | def print_config(args : Namespace): 24 | print(args) 25 | 26 | def cal4scheduler(args, epoch_nums : int, train_num : int, batch_size : int, warm_up : float): 27 | import math 28 | train_steps = math.ceil(epoch_nums * train_num / batch_size) 29 | warm_up_steps = math.floor(train_steps * warm_up) 30 | args.train_steps = train_steps 31 | args.warmup_steps = warm_up_steps 32 | return args -------------------------------------------------------------------------------- /HyCxG/utils/operates.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from scipy.special import logsumexp 4 | from utils.define import LM_PAD 5 | 6 | def _padding(inputs : list, paddings : int, pad_val : int, lm_group : str = 'BERT') -> np.ndarray: 7 | if lm_group in LM_PAD.keys(): 8 | pad_val = LM_PAD[lm_group] 9 | doc = np.array([ 10 | np.pad(x[0:paddings], ( 0, paddings - len(x[0:paddings])), 11 | 'constant', constant_values=pad_val) 12 | for x in inputs 13 | ]).astype('int64') 14 | return doc 15 | 16 | def _pad_adj(inputs : list, paddings : int, pad_val : int) -> np.ndarray: 17 | batch = len(inputs) 18 | adjs = np.zeros((batch, paddings, paddings)) # Not available in this repo 19 | return adjs 20 | 21 | def _attention_mask(padded : np.ndarray, pad_val : int, lm_group : str = 'BERT') -> torch.Tensor: 22 | if lm_group in LM_PAD.keys(): 23 | pad_val = LM_PAD[lm_group] 24 | np_mask = (padded != 
pad_val).astype('int32') 25 | return torch.from_numpy(np_mask) 26 | 27 | def _save_model(path : str, checkp : dict) -> None: 28 | torch.save(checkp, path) 29 | 30 | def _normalize_logits(logits): 31 | numerator = logits 32 | denominator = logsumexp(logits) 33 | return numerator - denominator 34 | 35 | def _softmax_logits(logits :torch.Tensor, dim : int = 1): 36 | return torch.softmax(logits, dim=dim) -------------------------------------------------------------------------------- /HyCxG/utils/optimizers.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Optimizer 2 | from torch.optim.lr_scheduler import LambdaLR 3 | import math 4 | 5 | def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1): 6 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) 7 | 8 | 9 | def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1): 10 | 11 | def lr_lambda(current_step: int): 12 | if current_step < num_warmup_steps: 13 | return float(current_step) / float(max(1.0, num_warmup_steps)) 14 | return 1.0 15 | 16 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) 17 | 18 | 19 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 20 | def lr_lambda(current_step: int): 21 | if current_step < num_warmup_steps: 22 | return float(current_step) / float(max(1, num_warmup_steps)) 23 | return max( 24 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) 25 | ) 26 | 27 | return LambdaLR(optimizer, lr_lambda, last_epoch) 28 | 29 | 30 | def get_cosine_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1): 31 | def lr_lambda(current_step): 32 | if current_step < num_warmup_steps: 33 | return float(current_step) / float(max(1, num_warmup_steps)) 34 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 35 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 36 | 37 | return LambdaLR(optimizer, lr_lambda, last_epoch) 38 | 39 | 40 | def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1): 41 | def lr_lambda(current_step): 42 | if current_step < num_warmup_steps: 43 | return float(current_step) / float(max(1, num_warmup_steps)) 44 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 45 | if progress >= 1.0: 46 | return 0.0 47 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) 48 | 49 | return LambdaLR(optimizer, lr_lambda, last_epoch) 50 | 51 | 52 | def get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1): 53 | lr_init = optimizer.defaults["lr"] 54 | assert lr_init > lr_end, f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})" 55 | 56 | def lr_lambda(current_step: int): 57 | if current_step < num_warmup_steps: 58 | return float(current_step) / float(max(1, num_warmup_steps)) 59 | elif current_step > num_training_steps: 60 | return lr_end / lr_init # as LambdaLR multiplies by lr_init 61 | else: 62 | lr_range = lr_init - lr_end 63 | decay_steps = num_training_steps - num_warmup_steps 64 | pct_remaining = 1 - (current_step - 
num_warmup_steps) / decay_steps 65 | decay = lr_range * pct_remaining ** power + lr_end 66 | return decay / lr_init # as LambdaLR multiplies by lr_init 67 | 68 | return LambdaLR(optimizer, lr_lambda, last_epoch) -------------------------------------------------------------------------------- /README_ZH.md: -------------------------------------------------------------------------------- 1 |
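The schedulers above follow the familiar warm-up interface. A minimal, hypothetical usage sketch (not part of the repository), assuming `HyCxG/` is on `PYTHONPATH`:

```python
# Hypothetical sketch; not part of the repository.
import torch
from utils.optimizers import get_linear_schedule_with_warmup

model = torch.nn.Linear(768, 3)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=60, num_training_steps=1000)

for step in range(1000):                      # one optimizer/scheduler step per batch
    loss = model(torch.randn(8, 768)).sum()   # dummy forward pass for illustration
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
```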


8 | 9 | # HyCxG 10 | 论文"**Enhancing Language Representation with Constructional Information for Natural Language Understanding**"的代码仓库 11 | 12 | 13 | GitHub 14 | 15 | [**English**](https://github.com/xlxwalex/HyCxG/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/README_ZH.md) 16 | 17 | 18 | 🔗 [数据集](https://github.com/xlxwalex/HyCxG/tree/main/data) • [教程](https://github.com/xlxwalex/HyCxG/tree/main/tutorials) • [指南](https://github.com/xlxwalex/HyCxG/tree/main/guidelines) • [快速开始](#-快速开始) • [相关工作](https://github.com/xlxwalex/HyCxG/blob/main/tutorials/PaperLists.md) • [FAQ❓](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/faq.md) 19 | 20 | > **注意** 21 | > 22 | > 本仓库还在建设中,需要过一段时间才能完成 23 | > 24 | 25 | ## 🌀 目录 26 | * [📖 HyCxG介绍](#-hycxg介绍) 27 | * [📃 仓库资源](#-仓库资源) 28 | * [🐍 快速开始](#-快速开始) 29 | * [🔗 其他信息](#-使用的项目) 30 | 31 | ## 📖 HyCxG介绍 32 | **构式语法** (Construction Grammar, CxG)是认知语言学的一个分支。它认为语法是词汇、形态和句法的连续统。构式可以被定义为一系列存储不同形式和意义对的语言模式项(Linguistic Pattern)。由于构式的意义被分配给这些模式项而不是其实例化后内部包含的特定词汇,因此通过预训练模型学习构式信息可能更具挑战且需要大量的训练数据,这可能在自然语言理解任务中遇到问题。 33 | 34 | 这促使我们有动机将构式语法与预训练模型结合起来。因此我们提出了一个新的编码框架 - **HyCxG**(基于构式语法的超图网络),其通过三阶段过程来使用构式信息增强语言表示。首先,我们从句子中提取和选择出所需的构式集合。然后使用关系引导的超图注意网络将构式信息附加到词汇表示上。最后我们获取了最终表示就可以在各种下游任务中进行微调了。 35 | 36 | ## 📃 仓库资源 37 | 本代码仓库中各部分包含的内容为: 38 | - [**HyCxG**](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG) 包含了HyCxG的完整框架 39 | - [**Data**](https://github.com/xlxwalex/HyCxG/tree/main/data) 包括了该工作中用到的所有数据集以及处理脚本。其中的绝大部分数据会从我们的镜像源中进行下载。同时,该部分也提供了基线模型对一些数据的处理脚本 40 | - [**Tutorial**](https://github.com/xlxwalex/HyCxG/tree/main/tutorials) 包含了一些HyCxG相关的教程以及与我们工作相关的资源 41 | - [**Guideline**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines) (正在建设中) 展示了基线模型的一些信息和问答内容 42 | 43 | 44 | ## 🐍 快速开始 45 | **1 实验环境搭建** 46 | 47 | 我们采用了`Python=3.8.5`作为基础实验环境,您可以用以下代码创建环境并安装依赖的包: 48 | ```shell 49 | conda create -n hycxg_env python=3.8.5 50 | source activate hycxg_env 51 | pip install -r requirements.txt 52 | ``` 53 | 54 | **2 准备数据集** 55 | 我们在[`data`](https://github.com/xlxwalex/HyCxG/tree/main/data)文件夹中提供了数据下载脚本。你可以直接用以下代码来获得所有数据集: 56 | ```shell 57 | cd data 58 | bash data_pipeline.sh 59 | ``` 60 | 在下载完数据后,请将每个数据文件夹(例如JSONABSA_MAMS)移动到[`HyCxG/dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) 路径下 61 | 62 | **3 准备组件所需数据** 63 | 在运行代码之前,您需要下载组件必须的数据(例如构式表),关于下载步骤请分别见[`HyCxG/dataset`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/dataset) 以及 [`HyCxG/Tokenizer`](https://github.com/xlxwalex/HyCxG/tree/main/HyCxG/Tokenizer) 。 你也可以直接通过以下代码来下载这些数据到对应位置: 64 | ```shell 65 | cd HyCxG/dataset 66 | bash download_vocab.sh 67 | cd ../Tokenizer 68 | bash download_cxgdict.sh 69 | ``` 70 | 71 | **4 运行HyCxG** 72 | 73 | 我们提供了一些HyCxG的运行样例脚本在[`HyCxG/run_hycxg.sh`](https://github.com/xlxwalex/HyCxG/blob/main/HyCxG/run_hycxg.sh)中,方便您参考 74 | 75 | 76 | ## 🙏 使用的项目 77 | - [c2xg](https://github.com/jonathandunn/c2xg) 被用于从句子中抽取构式 78 | - [simanneal](https://github.com/perrygeo/simanneal)是一个很方便的模拟退火框架被用于解决Cond-MC问题 79 | 80 | ## 👋 引用 81 | 如果您认为我们的工作对您有帮助,您可以引用我们的论文: Enhancing Language Representation with Constructional Information for Natural Language Understanding 82 | ``` 83 | @inproceedings{xu-etal-2023-enhancing, 84 | title = "Enhancing Language Representation with Constructional Information for Natural Language Understanding", 85 | author = "Xu, Lvxiaowei and 86 | Wu, Jianwang and 87 | Peng, Jiawei and 88 | Gong, Zhilin and 89 | Cai, Ming and 90 | Wang, Tianxiang", 91 | booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long 
Papers)", 92 | year = "2023", 93 | publisher = "Association for Computational Linguistics", 94 | url = "https://aclanthology.org/2023.acl-long.258", 95 | pages = "4685--4705", 96 | } 97 | ``` 98 | 99 |


104 | 105 | 106 | ## 📧 联系我们 107 | 如果您对代码有任何问题,可以提交Issue或联系 [`xlxw@zju.edu.cn`](mailto:xlxw@zju.edu.cn) 108 | -------------------------------------------------------------------------------- /data/ABSA/README.md: -------------------------------------------------------------------------------- 1 |


13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA/README_ZH.md) 17 | 18 | ## Aspect-based Sentiment Analysis Dataset for ABSA 19 | 20 | The Aspect-based Sentiment Analysis (ABSA) dataset is based on SemEval 2014, 2015, 2016, and MAMS. SemEval 2014 contains reviews from two domains: restaurants and laptops. SemEval 2015 and 2016 include reviews from the restaurant domain only. MAMS is a larger-scale dataset, where each sentence has multiple aspects, making it more challenging. 21 | 22 | ### Download and process the data 23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: all parameters have default values, so you may omit them): 24 | 25 | Specifically, since the four datasets included in SemEval 2014, 2015 and 2016 only contain train and test sets, we split each training set into new training and validation sets at a random `9:1` ratio to better evaluate model performance. Such datasets are identified with a `Split` suffix. 26 | 27 | ```shell 28 | bash download_and_process_absa.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 29 | ``` 30 | **Parameters:** 31 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder. 32 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`. 33 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory. 34 | 35 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed. 36 | 37 | ### Data processing for baseline models 38 | In Section 4.2 - Results on ABSA tasks and Appendix K - Detailed Results on ABSA Tasks of our paper, we compared the performance of multiple baseline models. Therefore, we provide more information on reproducing the baseline models in [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines). 39 | 40 | ### Resource of data 41 | Our mirror data is obtained from SemEval 2014, 2015, 2016 and MAMS, as well as the data sources [ASGCN-data](https://github.com/GeneZC/ASGCN/tree/master/datasets) and [MAMS](https://github.com/siat-nlp/MAMS-for-ABSA). 
If you have also used this dataset, you can cite their papers as follows: 42 | 43 | **SemEval 2014** 44 | ``` 45 | @inproceedings{pontiki2014semeval, 46 | title = "{S}em{E}val-2014 Task 4: Aspect Based Sentiment Analysis", 47 | author = "Pontiki, Maria and 48 | Galanis, Dimitris and 49 | Pavlopoulos, John and 50 | Papageorgiou, Harris and 51 | Androutsopoulos, Ion and 52 | Manandhar, Suresh", 53 | booktitle = "{S}em{E}val 2014", 54 | year = "2014", 55 | url = "https://aclanthology.org/S14-2004", 56 | pages = "27--35", 57 | } 58 | ``` 59 | **SemEval 2015** 60 | ``` 61 | @inproceedings{pontiki2015semeval, 62 | title = "{S}em{E}val-2015 Task 12: Aspect Based Sentiment Analysis", 63 | author = "Pontiki, Maria and 64 | Galanis, Dimitris and 65 | Papageorgiou, Haris and 66 | Manandhar, Suresh and 67 | Androutsopoulos, Ion", 68 | booktitle = "{S}em{E}val 2015", 69 | year = "2015", 70 | url = "https://aclanthology.org/S15-2082", 71 | pages = "486--495", 72 | } 73 | ``` 74 | **SemEval 2016** 75 | ``` 76 | @inproceedings{pontiki2016semeval, 77 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis", 78 | author = {Pontiki, Maria and 79 | Galanis, Dimitris and 80 | Papageorgiou, Haris and 81 | Androutsopoulos, Ion and 82 | Manandhar, Suresh and 83 | AL-Smadi, Mohammad and 84 | Al-Ayyoub, Mahmoud and 85 | Zhao, Yanyan and 86 | Qin, Bing and 87 | De Clercq, Orph{\'e}e and 88 | Hoste, V{\'e}ronique and 89 | Apidianaki, Marianna and 90 | Tannier, Xavier and 91 | Loukachevitch, Natalia and 92 | Kotelnikov, Evgeniy and 93 | Bel, Nuria and 94 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and 95 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en}, 96 | booktitle = "{S}em{E}val-2016", 97 | year = "2016", 98 | url = "https://aclanthology.org/S16-1002", 99 | pages = "19--30", 100 | } 101 | ``` 102 | **MAMS** 103 | ``` 104 | @inproceedings{jiang2019challenge, 105 | title = "A Challenge Dataset and Effective Models for Aspect-Based Sentiment Analysis", 106 | author = "Jiang, Qingnan and 107 | Chen, Lei and 108 | Xu, Ruifeng and 109 | Ao, Xiang and 110 | Yang, Min", 111 | booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", 112 | year = "2019", 113 | url = "https://aclanthology.org/D19-1654", 114 | pages = "6280--6285", 115 | } 116 | ``` 117 | -------------------------------------------------------------------------------- /data/ABSA/README_ZH.md: -------------------------------------------------------------------------------- 1 |


13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA/README_ZH.md) 16 | ## 方面级情感分析数据集 17 | 18 | 方面级情感分析(Aspect-based sentiment analysis, ABSA)数据集基于SemEval 2014/2015/2016以及MAMS任务及数据。其中SemEval 2014中包含了餐厅(Restaurant)和笔记本电脑(laptop)两个领域的评论,而SemEval 2014/2015均为餐厅领域的评论。MAMS是一个更大尺度的数据集,其每个句子都有多个方面,因此更具挑战性。 19 | 20 | 比较特别的是,由于SemEval 2014/2015/2016包含的四个数据集仅包含训练集和测试集,因此为了能更好评估模型性能,我们独立地将训练集按照`9:1`随机划分为了新的训练集和验证集,该类数据集会用`Split`后缀进行标识。 21 | 22 | ### 数据下载及处理 23 | 在使用数据处理及下载脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 24 | ```shell 25 | bash download_and_process_absa.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 26 | ``` 27 | **参数含义:** 28 | + DATA_DIR: 下载的原始数据所在文件夹, 默认为当前文件夹 29 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`dataset` 30 | + STANFORD_DIR: 斯坦福解析器所在位置,默认为上一级目录的`stanford-corenlp-3.9.2-minimal` 31 | 32 | **注意:** 如果共享的解析器文件夹不存在,那么程序会询问是否从我们的镜像数据源拉取斯坦福解析器(共353MB),选择`Y`即可 33 | 34 | ### 基准模型数据处理 35 | 在论文的Section4.2 - Results on ABSA tasks中以及Appendix K - Detailed Results on ABSA Tasks中我们对比了多个模型在方面级情感数据集上的性能,我们在[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)提供了更多关于基线模型的复现信息。 36 | 37 | ### 数据来源 38 | 本部分数据来自SemEval 2014/2015/2016,我们的镜像数据来源[ASGCN-data](https://github.com/GeneZC/ASGCN/tree/master/datasets)以及[MAMS](https://github.com/siat-nlp/MAMS-for-ABSA),如果您也使用了该数据集,您可以引用他们的论文,分别为: 39 | 40 | **SemEval 2014** 41 | ``` 42 | @inproceedings{pontiki2014semeval, 43 | title = "{S}em{E}val-2014 Task 4: Aspect Based Sentiment Analysis", 44 | author = "Pontiki, Maria and 45 | Galanis, Dimitris and 46 | Pavlopoulos, John and 47 | Papageorgiou, Harris and 48 | Androutsopoulos, Ion and 49 | Manandhar, Suresh", 50 | booktitle = "{S}em{E}val 2014", 51 | year = "2014", 52 | url = "https://aclanthology.org/S14-2004", 53 | pages = "27--35", 54 | } 55 | ``` 56 | **SemEval 2015** 57 | ``` 58 | @inproceedings{pontiki2015semeval, 59 | title = "{S}em{E}val-2015 Task 12: Aspect Based Sentiment Analysis", 60 | author = "Pontiki, Maria and 61 | Galanis, Dimitris and 62 | Papageorgiou, Haris and 63 | Manandhar, Suresh and 64 | Androutsopoulos, Ion", 65 | booktitle = "{S}em{E}val 2015", 66 | year = "2015", 67 | url = "https://aclanthology.org/S15-2082", 68 | pages = "486--495", 69 | } 70 | ``` 71 | **SemEval 2016** 72 | ``` 73 | @inproceedings{pontiki2016semeval, 74 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis", 75 | author = {Pontiki, Maria and 76 | Galanis, Dimitris and 77 | Papageorgiou, Haris and 78 | Androutsopoulos, Ion and 79 | Manandhar, Suresh and 80 | AL-Smadi, Mohammad and 81 | Al-Ayyoub, Mahmoud and 82 | Zhao, Yanyan and 83 | Qin, Bing and 84 | De Clercq, Orph{\'e}e and 85 | Hoste, V{\'e}ronique and 86 | Apidianaki, Marianna and 87 | Tannier, Xavier and 88 | Loukachevitch, Natalia and 89 | Kotelnikov, Evgeniy and 90 | Bel, Nuria and 91 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and 92 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en}, 93 | booktitle = "{S}em{E}val-2016", 94 | year = "2016", 95 | url = "https://aclanthology.org/S16-1002", 96 | pages = "19--30", 97 | } 98 | ``` 99 | **MAMS** 100 | ``` 101 | @inproceedings{jiang2019challenge, 102 | title = "A Challenge Dataset and Effective Models for Aspect-Based Sentiment Analysis", 103 | author = "Jiang, Qingnan and 104 | Chen, Lei and 105 | Xu, Ruifeng and 106 | Ao, Xiang and 107 | Yang, Min", 108 | booktitle = "Proceedings of the 2019 Conference on 
Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)", 109 | year = "2019", 110 | url = "https://aclanthology.org/D19-1654", 111 | pages = "6280--6285", 112 | } 113 | ``` -------------------------------------------------------------------------------- /data/ABSA/process_absa.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | import sys 4 | import json 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | import argparse 9 | from tqdm import tqdm 10 | from stanfordcorenlp import StanfordCoreNLP 11 | try: 12 | sys.path.append('.') 13 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK 14 | except: 15 | sys.path.append('..') 16 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK 17 | 18 | MAP_POLARITY = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 19 | 20 | def initialize_stanfordcore(stanford_path: str): 21 | try: nlpmodel = StanfordCoreNLP(stanford_path) 22 | except: 23 | print('The script need stanfordparser>=3.9.2 package, while the path is not exist for the package, do you want to download it? (Y/N)') 24 | download_flag = input() 25 | download_flag = download_flag.lower() 26 | assert download_flag in ['y'], "Abort" 27 | download_stanfordcore(STANFORD_CORE_LINK, stanford_path+'.zip') 28 | unzip_stanfordcore(stanford_path+'.zip', '../') 29 | nlpmodel = StanfordCoreNLP(stanford_path) 30 | return nlpmodel 31 | 32 | def convert_raw2json(path : str, nlpmodel, desc: str='train', dataset_name: str='Rest14'): 33 | def parse_adj(edge): 34 | e_id, dep_rels, dep_heads = 1, [], [] 35 | for eidx in range(len(edge)): 36 | if (eidx + 1) != edge[0][2]: 37 | dep_heads.append(edge[e_id][1]) 38 | dep_rels.append(edge[e_id][0]) 39 | e_id += 1 40 | else: 41 | dep_heads.append(0) 42 | dep_rels.append(edge[0][0]) 43 | return dep_heads, dep_rels 44 | 45 | data = [] 46 | with open(path, 'r', encoding='utf-8') as fp: 47 | raw_data = fp.readlines() 48 | fp.close() 49 | for idx in tqdm(range(0, len(raw_data), 3), desc='Process {} file in {}'.format(desc, dataset_name)): 50 | obj = {} 51 | sentence = raw_data[idx].strip() 52 | target = raw_data[idx + 1].strip() 53 | polarity = MAP_POLARITY[eval(raw_data[idx + 2].strip())] 54 | if '$T$' not in sentence: 55 | print('Error sentence : %s' % sentence) 56 | continue 57 | post_sentence = sentence.replace('$T$', target) 58 | obj['token'] = nlpmodel.word_tokenize(post_sentence) 59 | pos_tag = nlpmodel.pos_tag(post_sentence) 60 | dependecy = nlpmodel.dependency_parse(post_sentence) 61 | obj['pos'] = [tag[1] for tag in pos_tag] 62 | heads, rels = parse_adj(dependecy) 63 | obj['head'] = heads 64 | obj['deprel'] = rels 65 | obj['aspects'] = [{ 66 | 'term' : nlpmodel.word_tokenize(target), 67 | 'from' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])), 68 | 'to' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])) + len(nlpmodel.word_tokenize(target)), 69 | 'polarity' : polarity 70 | }] 71 | data.append(obj) 72 | return data 73 | 74 | def read_semeval_data(path : str): 75 | data_out = [] 76 | data = np.array(pd.read_csv(path)) 77 | for dat in data: data_out.append([dat[0], dat[2], dat[1]]) 78 | return data_out 79 | 80 | def output_json(data: list, folder_path: str, file_path: str): 81 | if not os.path.exists(folder_path): os.makedirs(folder_path) 82 | with 
open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp: 83 | fp.write(json.dumps(data, indent=4)) 84 | fp.close() 85 | 86 | def process_data(args: argparse.Namespace): 87 | assert os.path.exists(args.train_file) and os.path.exists(args.test_file), "The path of data is not exist, please download first." 88 | train_json = convert_raw2json(args.train_file, args.nlpmodel, dataset_name=args.dataset_name) 89 | output_json(train_json, args.out_path, 'train.json') 90 | print('The train file of {} dataset is saved at {}'.format(args.dataset_name, os.path.join(args.out_path, 'train.json'))) 91 | 92 | valid_json = convert_raw2json(args.valid_file, args.nlpmodel, desc='valid', dataset_name=args.dataset_name) 93 | output_json(valid_json, args.out_path, 'valid.json') 94 | print('The test file of {} dataset is saved at {}'.format(args.dataset_name, os.path.join(args.out_path, 'valid.json'))) 95 | 96 | test_json = convert_raw2json(args.test_file, args.nlpmodel, desc='test', dataset_name=args.dataset_name) 97 | output_json(test_json, args.out_path, 'test.json') 98 | print('The test file of {} dataset is saved at {}'.format(args.dataset_name, os.path.join(args.out_path, 'test.json'))) 99 | 100 | def main(): 101 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 102 | parser.add_argument("--out_path", default='dataset/JSONABSA_Rest14', type=str, help="Output path of ABSA dataset.") 103 | parser.add_argument("--stanford_path", default='stanford-corenlp-3.9.2-minimal', type=str, help="The path for stanfordparser.") 104 | parser.add_argument("--train_file", default='rest14_train.raw', type=str, help="The path of train file.") 105 | parser.add_argument("--valid_file", default='rest14_test.raw', type=str, help="The path of valid file.") 106 | parser.add_argument("--test_file", default='rest14_test.raw', type=str, help="The path of test file.") 107 | args = parser.parse_args() 108 | args.nlpmodel = initialize_stanfordcore(args.stanford_path) 109 | args.dataset_name = args.out_path.split('_')[0] 110 | process_data(args) 111 | print('ABSA data for {} has been processed.'.format(args.dataset_name)) 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /data/Colloquial/README.md: -------------------------------------------------------------------------------- 1 |
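For reference, a single record produced by `convert_raw2json` in `process_absa.py` above has the following shape; the values below are invented for illustration only and are not taken from any dataset.

```python
# Illustrative (hypothetical) record; field names follow convert_raw2json above.
example_record = {
    "token":  ["The", "pizza", "was", "great", "."],
    "pos":    ["DT", "NN", "VBD", "JJ", "."],
    "head":   [2, 4, 4, 0, 4],              # dependency heads, 0 marks the root
    "deprel": ["det", "nsubj", "cop", "root", "punct"],
    "aspects": [{"term": ["pizza"], "from": 1, "to": 2, "polarity": "positive"}],
}
```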

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/README_ZH.md) 17 | 18 | ## Colloquial Dataset for ABSA 19 | 20 | The colloquial ABSA datasets are drawn from two sentiment analysis sources: the Twitter dataset and the GermEval2017 dataset. They can therefore serve as data for performance evaluation across different registers. Both datasets are derived from social media, so they contain more colloquial language compared to other datasets. Since GermEval is much larger than Twitter, we sampled a subset of GermEval for performance testing. 21 | 22 | ### Download and process the data 23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: you do not need to attach any parameters, as all parameters have default values): 24 | ```shell 25 | bash download_and_process_colloquial.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 26 | ``` 27 | **Parameters:** 28 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder. 29 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`. 30 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory. 31 | 32 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed. Additionally, GermEval requires the spaCy package, which automatically downloads the necessary models when initialized. 33 | 34 | ### Data processing for baseline models 35 | In Appendix H - Colloquial Expression Results of our paper, we compared the performance of four models, namely RGAT, DualGCN, DGEDT, and KumaGCN, on these colloquial sentiment datasets. Therefore, in the [`baseline`](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/baseline) folder, we provide a conversion script for the GermEval dataset (as Twitter is a commonly used dataset, these baseline models already include the processed data). For more information on reproducing the baseline models, please refer to the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines). 36 | 37 | ### Resource of data 38 | Our mirror data is obtained from Twitter and GermEval, as well as the data sources [acl-14-short-data](https://github.com/songyouwei/ABSA-PyTorch/tree/master/datasets/acl-14-short-data) and [GermEval 2017](http://ltdata1.informatik.uni-hamburg.de/germeval2017/). 
If you have also used this dataset, you can cite their papers as follows: 39 | 40 | #### Twitter Dataset 41 | ``` 42 | @inproceedings{dong2014adaptive, 43 | title = "Adaptive Recursive Neural Network for Target-dependent {T}witter Sentiment Classification", 44 | author = "Dong, Li and 45 | Wei, Furu and 46 | Tan, Chuanqi and 47 | Tang, Duyu and 48 | Zhou, Ming and 49 | Xu, Ke", 50 | booktitle = "Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", 51 | year = "2014", 52 | publisher = "Association for Computational Linguistics", 53 | url = "https://aclanthology.org/P14-2009", 54 | pages = "49--54", 55 | } 56 | ``` 57 | #### GermEval 2017 Competition 58 | ``` 59 | @article{wojatzki2017germeval, 60 | title={Germeval 2017: Shared task on aspect-based sentiment in social media customer feedback}, 61 | author={Wojatzki, Michael and Ruppert, Eugen and Holschneider, Sarah and Zesch, Torsten and Biemann, Chris}, 62 | journal={GermEval}, 63 | pages={1--12}, 64 | year={2017} 65 | } 66 | ``` -------------------------------------------------------------------------------- /data/Colloquial/README_ZH.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/README_ZH.md) 16 | ## 口语化情感数据集 17 | 18 | 口语化情感数据集由Twitter数据集以及GermEval2017两部分方面级情感分析数据集组成,这两个数据集的数据均来自于社交媒体平台,因此相较于其他数据集包含更多的口语化语料,因此可以作为不同语域(Register)性能评测的数据。由于GermEval远大于Twitter,我们对GermEval采样为了一个子集进行性能测试。 19 | 20 | ### 数据下载及处理 21 | 在使用数据处理及下载脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 22 | ```shell 23 | bash download_and_process_counterfactual.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 24 | ``` 25 | **参数含义:** 26 | + DATA_DIR: 下载的原始数据所在文件夹, 默认为当前文件夹 27 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`dataset` 28 | + STANFORD_DIR: 斯坦福解析器所在位置,默认为上一级目录的`stanford-corenlp-3.9.2-minimal` 29 | 30 | **注意:** 如果共享的解析器文件夹不存在,那么程序会询问是否从我们的镜像数据源拉取斯坦福解析器(共353MB),选择`Y`即可。另外GermEval需要用到SpaCy,在调用时其会自动下载所需模型。 31 | 32 | ### 基准模型数据处理 33 | 在论文的Appendix H - Colloquial Expression Results中,我们对比了RGAT、DualGCN、DGEDT以及KumaGCN这四个模型在口语化情感数据集上的性能,因此在[`baseline文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial/baseline)中我们提供了GermEval数据的转换脚本(由于Twitter是常用数据集,这些基线模型都包含了处理好的数据)。更多关于基线模型的复现信息请见[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)。 34 | 35 | ### 数据来源 36 | 本部分数据来自Twitter和GermEval,我们的镜像数据来源[acl-14-short-data](https://github.com/songyouwei/ABSA-PyTorch/tree/master/datasets/acl-14-short-data)以及[GermEval 2017](http://ltdata1.informatik.uni-hamburg.de/germeval2017/),如果您也使用了该数据集,您可以引用他们的论文,分别为: 37 | #### Twitter数据集: 38 | ``` 39 | @inproceedings{dong2014adaptive, 40 | title = "Adaptive Recursive Neural Network for Target-dependent {T}witter Sentiment Classification", 41 | author = "Dong, Li and 42 | Wei, Furu and 43 | Tan, Chuanqi and 44 | Tang, Duyu and 45 | Zhou, Ming and 46 | Xu, Ke", 47 | booktitle = "Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", 48 | year = "2014", 49 | publisher = "Association for Computational Linguistics", 50 | url = "https://aclanthology.org/P14-2009", 51 | pages = "49--54", 52 | } 53 | ``` 54 | #### GermEval 2017评测比赛: 55 | ``` 56 | @article{wojatzki2017germeval, 57 | title={Germeval 2017: Shared task on aspect-based sentiment in social media customer feedback}, 58 | author={Wojatzki, Michael and Ruppert, Eugen and Holschneider, Sarah and Zesch, Torsten and Biemann, Chris}, 59 | journal={GermEval}, 60 | pages={1--12}, 61 | year={2017} 62 | } 63 | ``` -------------------------------------------------------------------------------- /data/Colloquial/baseline/DGEDT_germeval_gengraph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import spacy 3 | import pickle 4 | import tqdm 5 | nlp = spacy.load('de') 6 | import re 7 | 8 | def tokenize(text): 9 | text=text.strip() 10 | text=re.sub(r' {2,}',' ',text) 11 | document = nlp(text) 12 | return [token.text for token in document] 13 | 14 | def update_edge(text,vocab): 15 | # https://spacy.io/docs/usage/processing-text 16 | document = nlp(text) 17 | seq_len = len(text.split()) 18 | for token in document: 19 | if token.dep_ not in vocab: 20 | vocab[token.dep_]=len(vocab) 21 | return 0 22 | 23 | def dependency_adj_matrix(text,edge_vocab): 24 | # https://spacy.io/docs/usage/proclessing-text 25 | document = nlp(text.strip()) 26 | seq_len = len(tokenize(text)) 27 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 28 | matrix1 = 
np.zeros((seq_len, seq_len)).astype('float32') 29 | edge = np.zeros((seq_len, seq_len)).astype('int32') 30 | edge1 = np.zeros((seq_len, seq_len)).astype('int32') 31 | assert len(document)==seq_len 32 | for token in document: 33 | if token.i >= seq_len: 34 | print('bug') 35 | print(text) 36 | print(text.split()) 37 | print(document) 38 | print([token.i for token in document]) 39 | print([token.text for token in document]) 40 | a=input('hahha') 41 | if token.i < seq_len: 42 | matrix[token.i][token.i] = 1 43 | matrix1[token.i][token.i] = 1 44 | # https://spacy.io/docs/api/token 45 | for child in token.children: 46 | if child.i < seq_len: 47 | matrix[token.i][child.i] = 1 48 | matrix1[child.i][token.i] = 1 49 | edge[token.i][child.i] = edge_vocab.get(child.dep_,1) 50 | edge1[child.i][token.i] = edge_vocab.get(child.dep_,1) 51 | return matrix,matrix1,edge,edge1 52 | 53 | def concat(texts,aspect): 54 | source='' 55 | splitnum=0 56 | for i,text in enumerate(texts): 57 | source+=text 58 | splitnum+=len(tokenize(text)) 59 | if i ':0,'':1} 77 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 78 | lines = fin.readlines() 79 | fin.close() 80 | idx2graph = {} 81 | fout = open(filename+'.graph', 'wb') 82 | if savevocab: 83 | fout1 = open(filename+'.edgevocab', 'wb') 84 | if savevocab: 85 | for i in tqdm.tqdm(range(0, len(lines), 3)): 86 | text_left = [s.lower().strip() for s in lines[i].split("$T$")] 87 | aspect = lines[i + 1].lower().strip() 88 | update_edge(concat(text_left,aspect),edge_vocab) 89 | for i in tqdm.tqdm(range(0, len(lines), 3)): 90 | text_left = [s.lower().strip() for s in lines[i].split("$T$")] 91 | aspect = lines[i + 1].lower().strip() 92 | adj_matrix = dependency_adj_matrix(concat(text_left,aspect),edge_vocab) 93 | idx2graph[i] = adj_matrix 94 | pickle.dump(idx2graph, fout) 95 | if savevocab: 96 | pickle.dump(edge_vocab, fout1) 97 | fout.close() 98 | if savevocab: 99 | fout1.close() 100 | return edge_vocab 101 | 102 | def processe(filename,filename2): 103 | savevocab=True 104 | 105 | edge_vocab={'':0,'':1} 106 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 107 | lines = fin.readlines() 108 | fin.close() 109 | idx2graph = {} 110 | fout = open(filename+'.graph', 'wb') 111 | if savevocab: 112 | fout1 = open(filename+'.edgevocab', 'wb') 113 | if savevocab: 114 | for i in tqdm.tqdm(range(0, len(lines), 1)): 115 | update_edge(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab) 116 | for i in tqdm.tqdm(range(0, len(lines), 1)): 117 | adj_matrix = dependency_adj_matrix(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab) 118 | idx2graph[i] = adj_matrix 119 | pickle.dump(idx2graph, fout) 120 | if savevocab: 121 | pickle.dump(edge_vocab, fout1) 122 | fout.close() 123 | if savevocab: 124 | fout1.close() 125 | return edge_vocab 126 | 127 | if __name__ == '__main__': 128 | edge_vocab = process('./datasets/germeval/germeval_train.raw', None, True) 129 | process('./datasets/germeval/germeval_valid.raw', edge_vocab, False) 130 | process('./datasets/germeval/germeval_test.raw', edge_vocab, False) 131 | -------------------------------------------------------------------------------- /data/Colloquial/baseline/DualGCN_germeval_txt2json.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import spacy 3 | import pickle 4 | from tqdm import tqdm 5 | import json 6 | 7 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 8 | def read_data(path : str): 9 | with open(path, 'r') as fp: 10 | 
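        # Raw file layout: three lines per example (sentence with a '$T$' placeholder,
        # the aspect term, and a polarity label in {-1, 0, 1}).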
data = fp.readlines() 11 | fp.close() 12 | data_gp = [] 13 | for idx in range(0, len(data), 3): 14 | sentence = data[idx].strip() 15 | term = data[idx+1].strip() 16 | polarity = eval(data[idx+2].strip()) 17 | data_gp.append([sentence, term, polarity]) 18 | return data_gp 19 | 20 | def construct_data(data : list): 21 | nlp = spacy.load('de') 22 | out_data = [] 23 | for text in tqdm(data, desc='Processing'): 24 | sentence, term, polarity = text[0], text[1], text[2] 25 | if len(term) <1:continue 26 | document = nlp(sentence.replace('$T$', term)) 27 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document]) 28 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])]) 29 | term_tok = [tok.text for tok in nlp(term)] 30 | from_to = [[from_idx, from_idx + len(term_tok)]] 31 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document] 32 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document] 33 | obj = {} 34 | obj['token'] = list(tokens) 35 | obj['pos'] = list(pos_tag) 36 | obj['head'] = predicted_heads 37 | obj['deprel'] = predicted_dependencies 38 | obj['aspects'] = [{ 39 | 'term': term_tok, 40 | 'from' : from_to[0][0], 41 | 'to' : from_to[0][1], 42 | 'polarity' : MAP_INV[polarity] 43 | }] 44 | out_data.append(obj) 45 | return out_data 46 | 47 | if __name__ == '__main__': 48 | train_data = read_data('germeval_train.raw') 49 | train_data = construct_data(train_data) 50 | with open('train.json', 'w', encoding='utf-8') as tr_fp: 51 | json_str = json.dumps(train_data, indent=4) 52 | tr_fp.write(json_str) 53 | tr_fp.close() 54 | 55 | valid_data = read_data('germeval_valid.raw') 56 | valid_data = construct_data(valid_data) 57 | with open('valid.json', 'w', encoding='utf-8') as va_fp: 58 | json_str = json.dumps(valid_data, indent=4) 59 | va_fp.write(json_str) 60 | va_fp.close() 61 | 62 | test_data = read_data('germeval_test.raw') 63 | test_data = construct_data(test_data) 64 | with open('test.json', 'w', encoding='utf-8') as te_fp: 65 | json_str = json.dumps(test_data, indent=4) 66 | te_fp.write(json_str) 67 | te_fp.close() 68 | print('WELL DONE.') 69 | -------------------------------------------------------------------------------- /data/Colloquial/baseline/KumaGCN_germeval_gengraph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import spacy 3 | import pickle 4 | 5 | nlp = spacy.load('de') 6 | 7 | def dependency_adj_matrix(text): 8 | print(text) 9 | document = nlp(text) 10 | print("[tlog] document: " + str(document)) 11 | # sys.exit(0) 12 | seq_len = len(text.split()) 13 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 14 | pos = [] 15 | dep_rel = [] 16 | i = 0 17 | for sentence in document.sentences: 18 | print("[tlog] sentence: " + str(sentence)) 19 | for word in sentence.words: 20 | if word.index + i < seq_len: # there are some bugs for here such as SPACE 21 | pos.append(word.pos) 22 | dep_rel.append(word.dependency_relation) 23 | if word.index + i < seq_len: 24 | index = word.index + i 25 | head_index = word.governor + i 26 | matrix[index][index] = 1 27 | 28 | matrix[head_index][index] = 1 29 | matrix[index][head_index] = 1 30 | 31 | i += len(sentence.words) 32 | return matrix, pos, dep_rel 33 | 34 | 35 | def dependency_adj_matrix2(text): 36 | # https://spacy.io/docs/usage/processing-text 37 | # print("[tlog] text: " + str(text)) # Maybe for parsing, we should not lower case this 38 | document = 
nlp(text) 39 | # print("[tlog] document: " + str(document)) 40 | # sys.exit(0) 41 | seq_len = len(text.split()) 42 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 43 | pos = [] 44 | dep_rel = [] 45 | for token in document: 46 | if token.i < seq_len: # there are some bugs for here such as SPACE 47 | pos.append(token.tag_) 48 | dep_rel.append(token.dep_) 49 | 50 | if token.i < seq_len: 51 | matrix[token.i][token.i] = 1 52 | # https://spacy.io/docs/api/token 53 | for child in token.children: # tzy: do not distinguish the arc types 54 | if child.i < seq_len: 55 | matrix[token.i][child.i] = 1 56 | matrix[child.i][token.i] = 1 57 | 58 | return matrix, pos, dep_rel 59 | 60 | 61 | def process(filename): 62 | print(filename) 63 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 64 | lines = fin.readlines() 65 | fin.close() 66 | idx2graph = {} 67 | fout = open(filename + '.graph', 'wb') 68 | pos_out = open(filename + '.pos', 'w') 69 | rel_out = open(filename + '.rel', 'w') 70 | for i in range(0, len(lines), 3): 71 | text_left, _, text_right = [s.strip() for s in lines[i].partition("$T$")] 72 | aspect = lines[i + 1].strip() 73 | adj_matrix, pos, rel = dependency_adj_matrix2(text_left.strip() + ' ' + aspect + ' ' + text_right.strip()) 74 | idx2graph[i] = adj_matrix 75 | pos_out.write(" ".join(pos) + "\n") 76 | rel_out.write(" ".join(rel) + "\n") 77 | pickle.dump(idx2graph, fout) 78 | fout.close() 79 | 80 | 81 | if __name__ == '__main__': 82 | process('./datasets/german/germeval_train.raw') 83 | process('./datasets/german/germeval_valid.raw') 84 | process('./datasets/german/germeval_test.raw') 85 | 86 | -------------------------------------------------------------------------------- /data/Colloquial/baseline/RGAT_germeval_txt2json.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from tqdm import tqdm 3 | import json 4 | 5 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 6 | def read_data(path : str): 7 | with open(path, 'r') as fp: 8 | data = fp.readlines() 9 | fp.close() 10 | data_gp = [] 11 | for idx in range(0, len(data), 3): 12 | sentence = data[idx].strip() 13 | term = data[idx+1].strip() 14 | polarity = eval(data[idx+2].strip()) 15 | data_gp.append([sentence, term, polarity]) 16 | return data_gp 17 | 18 | def construct_data(data : list): 19 | nlp = spacy.load('de') 20 | out_data = [] 21 | for text in tqdm(data, desc='Processing'): 22 | sentence, term, polarity = text[0], text[1], text[2] 23 | document = nlp(sentence.replace('$T$', term)) 24 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document]) 25 | sentence_post = sentence.replace('$T$', term) 26 | aspect_sentiment = [[term, MAP_INV[polarity]]] 27 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])]) 28 | term_tok = len([tok.text for tok in nlp(term)]) 29 | from_to = [[from_idx, from_idx + term_tok]] 30 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document] 31 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document] 32 | dependencies = [list(ele) for ele in list(zip(predicted_dependencies, predicted_heads, list(range(1, len(predicted_heads) + 1))))] 33 | obj = {} 34 | obj['sentence'] = sentence_post 35 | obj['tokens'] = list(tokens) 36 | obj['tags'] = list(pos_tag) 37 | obj['predicted_dependencies'] = predicted_dependencies 38 | obj['predicted_heads'] = predicted_heads 39 | obj['dependencies'] = 
dependencies 40 | obj['aspect_sentiment'] = aspect_sentiment 41 | obj['from_to'] = from_to 42 | out_data.append(obj) 43 | return out_data 44 | 45 | if __name__ == '__main__': 46 | train_data = read_data('germeval_train.raw') 47 | train_data = construct_data(train_data) 48 | with open('germeval_Train.json', 'w', encoding='utf-8') as tr_fp: 49 | json_str = json.dumps(train_data, indent=4) 50 | tr_fp.write(json_str) 51 | tr_fp.close() 52 | 53 | valid_data = read_data('germeval_valid.raw') 54 | valid_data = construct_data(valid_data) 55 | with open('germeval_Valid.json', 'w', encoding='utf-8') as va_fp: 56 | json_str = json.dumps(valid_data, indent=4) 57 | va_fp.write(json_str) 58 | va_fp.close() 59 | 60 | test_data = read_data('germeval_test.raw') 61 | test_data = construct_data(test_data) 62 | with open('germeval_Test.json', 'w', encoding='utf-8') as te_fp: 63 | json_str = json.dumps(test_data, indent=4) 64 | te_fp.write(json_str) 65 | te_fp.close() 66 | print('WELL DONE.') 67 | -------------------------------------------------------------------------------- /data/Colloquial/download_and_process_colloquial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATA_DIR=$1 4 | OUTPUT_DIR=$2 5 | STANFORD_DIR=$3 6 | EXECUTE_DIR=$4 7 | 8 | if test -z "$DATA_DIR" 9 | then 10 | DATA_DIR='.' 11 | fi 12 | 13 | if test -z "$OUTPUT_DIR" 14 | then 15 | OUTPUT_DIR='dataset' 16 | fi 17 | 18 | if test -z "$STANFORD_DIR" 19 | then 20 | STANFORD_DIR='../stanford-corenlp-3.9.2-minimal' 21 | fi 22 | 23 | if test -z "$EXECUTE_DIR" 24 | then 25 | EXECUTE_DIR='.' 26 | fi 27 | 28 | echo "Download colloquial data (Twitter) in mirror source of ZJU MMF" 29 | echo "Origin data for Twitter can be found in https://github.com/songyouwei/ABSA-PyTorch/tree/master/datasets/acl-14-short-data" 30 | 31 | TWITTER_TRAIN_FILE=${DATA_DIR}/twitter_train.raw 32 | TWITTER_TEST_FILE=${DATA_DIR}/twitter_test.raw 33 | TWITTER_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Twitter 34 | 35 | wget -O $TWITTER_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/Twitter_train.raw 36 | wget -O $TWITTER_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/Twitter_test.raw 37 | 38 | echo "Process raw data of Twitter to HyCxG format" 39 | python $EXECUTE_DIR/process_twitter.py --train_file $TWITTER_TRAIN_FILE --test_file $TWITTER_TEST_FILE \ 40 | --out_path $TWITTER_OUT_DIR \ 41 | --stanford_path $STANFORD_DIR 42 | 43 | echo "Download colloquial data (GermEval) in mirror source of ZJU MMF" 44 | echo "Origin data for GermEval can be found in http://ltdata1.informatik.uni-hamburg.de/germeval2017/" 45 | echo "Note: the GermEval dataset in our experiment is a subset. If you want to reproduce the experiment, you may download data via this script." 
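# The commands below fetch the GermEval train/valid/test splits from the mirror
# and convert them to HyCxG-format JSON via process_germeval.py (which uses the spaCy 'de' model).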
46 | 47 | GERMEVAL_TRAIN_FILE=${DATA_DIR}/germeval_train.raw 48 | GERMEVAL_VALID_FILE=${DATA_DIR}/germeval_valid.raw 49 | GERMEVAL_TEST_FILE=${DATA_DIR}/germeval_test.raw 50 | GERMEVAL_OUT_DIR=${OUTPUT_DIR}/JSONABSA_German 51 | 52 | wget -O $GERMEVAL_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/germeval_train.raw 53 | wget -O $GERMEVAL_VALID_FILE https://expic.xlxw.org/hycxg/datamirror/germeval_valid.raw 54 | wget -O $GERMEVAL_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/germeval_test.raw 55 | 56 | echo "Process raw data of GermEval to HyCxG format" 57 | python $EXECUTE_DIR/process_germeval.py --train_file $GERMEVAL_TRAIN_FILE --valid_file $GERMEVAL_VALID_FILE --test_file $GERMEVAL_TEST_FILE \ 58 | --out_path $GERMEVAL_OUT_DIR -------------------------------------------------------------------------------- /data/Colloquial/process_germeval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | import sys 4 | sys.path.append('..') 5 | import json 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import argparse 10 | from tqdm import tqdm 11 | import spacy 12 | 13 | MAP_POLARITY = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 14 | 15 | def initialize_spacy(lang: str='de'): 16 | try: nlpmodel = spacy.load(lang) 17 | except: raise 'The script need spacy>=2.3.5 package, you need to proceed `pip install spacy` first.' 18 | return nlpmodel 19 | 20 | def convert_raw2json(path : str, nlpmodel, desc: str='train'): 21 | data = [] 22 | with open(path, 'r', encoding='utf-8') as fp: 23 | raw_data = fp.readlines() 24 | fp.close() 25 | for idx in tqdm(range(0, len(raw_data), 3), desc='Process {} file in GermEval'.format(desc)): 26 | obj = {} 27 | sentence = raw_data[idx].strip() 28 | target = raw_data[idx + 1].strip() 29 | polarity = MAP_POLARITY[eval(raw_data[idx + 2].strip())] 30 | if '$T$' not in sentence: 31 | print('Error sentence : %s' % sentence) 32 | continue 33 | post_sentence = sentence.replace('$T$', target) 34 | document = nlpmodel(post_sentence) 35 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document]) 36 | obj['token'] = list(tokens) 37 | pos_tag = list(pos_tag) 38 | obj['pos'] = pos_tag 39 | heads = [token.head.i + 1 if token.dep_ != 'ROOT' else 0 for token in document] 40 | rels = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document] 41 | obj['head'] = heads 42 | obj['deprel'] = rels 43 | term, targl = [tok.text for tok in nlpmodel(target)], [tok.text for tok in nlpmodel(sentence.split('$T$')[0])] 44 | obj['aspects'] = [{ 45 | 'term' : term, 46 | 'from' : len(targl), 47 | 'to' : len(targl) + len(term), 48 | 'polarity' : polarity 49 | }] 50 | data.append(obj) 51 | return data 52 | 53 | def read_semeval_data(path : str): 54 | data_out = [] 55 | data = np.array(pd.read_csv(path)) 56 | for dat in data: data_out.append([dat[0], dat[2], dat[1]]) 57 | return data_out 58 | 59 | def output_json(data: list, folder_path: str, file_path: str): 60 | if not os.path.exists(folder_path): os.makedirs(folder_path) 61 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp: 62 | fp.write(json.dumps(data, indent=4)) 63 | fp.close() 64 | 65 | def process_data(args: argparse.Namespace): 66 | assert os.path.exists(args.train_file) and os.path.exists(args.test_file) and os.path.exists(args.test_file), "The path of data is not exist, please download first." 
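    # Each raw example spans three lines: a sentence with the aspect replaced by '$T$',
    # the aspect term itself, and a polarity label in {-1, 0, 1}. convert_raw2json()
    # restores the sentence, parses it with spaCy, and emits one JSON object per example
    # with 'token', 'pos', 'head', 'deprel' and 'aspects' fields.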
67 | train_json = convert_raw2json(args.train_file, args.nlpmodel) 68 | output_json(train_json, args.out_path, 'train.json') 69 | print('The train file of GermEval dataset is saved at %s' % os.path.join(args.out_path, 'train.json')) 70 | 71 | valid_json = convert_raw2json(args.valid_file, args.nlpmodel, desc='valid') 72 | output_json(valid_json, args.out_path, 'valid.json') 73 | print('The valid file of GermEval dataset is saved at %s' % os.path.join(args.out_path, 'valid.json')) 74 | 75 | test_json = convert_raw2json(args.test_file, args.nlpmodel, desc='test') 76 | output_json(test_json, args.out_path, 'test.json') 77 | print('The test file of GermEval dataset is saved at %s' % os.path.join(args.out_path, 'test.json')) 78 | 79 | def main(): 80 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 81 | parser.add_argument("--out_path", default='dataset/JSONABSA_German', type=str, help="Output path of GermEval dataset.") 82 | parser.add_argument("--train_file", default='germeval_train.raw', type=str, help="The path of train file.") 83 | parser.add_argument("--valid_file", default='germeval_valid.raw', type=str, help="The path of valid file.") 84 | parser.add_argument("--test_file", default='germeval_test.raw', type=str, help="The path of test file.") 85 | args = parser.parse_args() 86 | args.nlpmodel = initialize_spacy() 87 | process_data(args) 88 | print('GermEval data has been processed.') 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /data/Colloquial/process_twitter.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | import sys 4 | import json 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | import argparse 9 | from tqdm import tqdm 10 | from stanfordcorenlp import StanfordCoreNLP 11 | try: 12 | sys.path.append('.') 13 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK 14 | except: 15 | sys.path.append('..') 16 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK 17 | 18 | 19 | MAP_POLARITY = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 20 | 21 | def initialize_stanfordcore(stanford_path: str): 22 | try: nlpmodel = StanfordCoreNLP(stanford_path) 23 | except: 24 | print('The script need stanfordparser>=3.9.2 package, while the path is not exist for the package, do you want to download it? 
(Y/N)') 25 | download_flag = input() 26 | download_flag = download_flag.lower() 27 | assert download_flag in ['y'], "Abort" 28 | download_stanfordcore(STANFORD_CORE_LINK, stanford_path+'.zip') 29 | unzip_stanfordcore(stanford_path+'.zip', '../') 30 | nlpmodel = StanfordCoreNLP(stanford_path) 31 | return nlpmodel 32 | 33 | def convert_raw2json(path : str, nlpmodel, desc: str='train'): 34 | def parse_adj(edge): 35 | e_id, dep_rels, dep_heads = 1, [], [] 36 | for eidx in range(len(edge)): 37 | if (eidx + 1) != edge[0][2]: 38 | dep_heads.append(edge[e_id][1]) 39 | dep_rels.append(edge[e_id][0]) 40 | e_id += 1 41 | else: 42 | dep_heads.append(0) 43 | dep_rels.append(edge[0][0]) 44 | return dep_heads, dep_rels 45 | 46 | data = [] 47 | with open(path, 'r', encoding='utf-8') as fp: 48 | raw_data = fp.readlines() 49 | fp.close() 50 | for idx in tqdm(range(0, len(raw_data), 3), desc='Process {} file in Twitter'.format(desc)): 51 | obj = {} 52 | sentence = raw_data[idx].strip() 53 | target = raw_data[idx + 1].strip() 54 | polarity = MAP_POLARITY[eval(raw_data[idx + 2].strip())] 55 | if '$T$' not in sentence: 56 | print('Error sentence : %s' % sentence) 57 | continue 58 | post_sentence = sentence.replace('$T$', target) 59 | obj['token'] = nlpmodel.word_tokenize(post_sentence) 60 | pos_tag = nlpmodel.pos_tag(post_sentence) 61 | dependecy = nlpmodel.dependency_parse(post_sentence) 62 | obj['pos'] = [tag[1] for tag in pos_tag] 63 | heads, rels = parse_adj(dependecy) 64 | obj['head'] = heads 65 | obj['deprel'] = rels 66 | obj['aspects'] = [{ 67 | 'term' : nlpmodel.word_tokenize(target), 68 | 'from' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])), 69 | 'to' : len(nlpmodel.word_tokenize(sentence.split('$T$')[0])) + len(nlpmodel.word_tokenize(target)), 70 | 'polarity' : polarity 71 | }] 72 | data.append(obj) 73 | return data 74 | 75 | def read_semeval_data(path : str): 76 | data_out = [] 77 | data = np.array(pd.read_csv(path)) 78 | for dat in data: data_out.append([dat[0], dat[2], dat[1]]) 79 | return data_out 80 | 81 | def output_json(data: list, folder_path: str, file_path: str): 82 | if not os.path.exists(folder_path): os.makedirs(folder_path) 83 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp: 84 | fp.write(json.dumps(data, indent=4)) 85 | fp.close() 86 | 87 | def process_data(args: argparse.Namespace): 88 | assert os.path.exists(args.train_file) and os.path.exists(args.test_file), "The path of data is not exist, please download first." 
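    # The Twitter raw files use the same three-line layout (sentence with '$T$', aspect
    # term, polarity in {-1, 0, 1}); here Stanford CoreNLP provides the tokenization,
    # POS tags and dependency parses used to build the JSON objects.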
89 | train_json = convert_raw2json(args.train_file, args.nlpmodel) 90 | output_json(train_json, args.out_path, 'train.json') 91 | print('The train file of Twitter dataset is saved at %s' % os.path.join(args.out_path, 'train.json')) 92 | 93 | test_json = convert_raw2json(args.test_file, args.nlpmodel, desc='test') 94 | output_json(test_json, args.out_path, 'test.json') 95 | print('The test file of Twitter dataset is saved at %s' % os.path.join(args.out_path, 'test.json')) 96 | 97 | def main(): 98 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 99 | parser.add_argument("--out_path", default='dataset/JSONABSA_Twitter', type=str, help="Output path of twitter dataset.") 100 | parser.add_argument("--stanford_path", default='stanford-corenlp-3.9.2-minimal', type=str, help="The path for stanfordparser.") 101 | parser.add_argument("--train_file", default='twitter_train.raw', type=str, help="The path of train file.") 102 | parser.add_argument("--test_file", default='twitter_test.raw', type=str, help="The path of test file.") 103 | args = parser.parse_args() 104 | args.nlpmodel = initialize_stanfordcore(args.stanford_path) 105 | process_data(args) 106 | print('Twitter data has been processed.') 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /data/Counterfactual/README.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual/README_ZH.md) 17 | 18 | ## Counterfactual Detection Dataset 19 | 20 | The Counterfactual Recognition (CR) dataset is derived from Subtask 1 - Recognizing Counterfactual Statements (RCS) of SemEval2020 Task5. The data is collected from the domains of politics, finance, and health, and consists of 13k training data with 7k test data. 21 | 22 | ### Download and process the data 23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: you may not attach any parameters, all parameters have default values): 24 | ```shell 25 | bash download_and_process_counterfactual.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 26 | ``` 27 | **Parameters:** 28 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder. 29 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `JSON_Counterfactual`. 30 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory. 31 | 32 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed. 33 | 34 | ### Resource of data 35 | This data is proposed in SemEval2020 Task5, and our mirrored data comes from official source [SemEval2020_Task5](https://github.com/Jiaqi1008/SemEval2020_Task5). If you also want to use this dataset, you can cite their paper: 36 | ``` 37 | @inproceedings{yang-etal-2020-semeval, 38 | title = "{S}em{E}val-2020 Task 5: Counterfactual Recognition", 39 | author = "Yang, Xiaoyu and 40 | Obadinma, Stephen and 41 | Zhao, Huasha and 42 | Zhang, Qiong and 43 | Matwin, Stan and 44 | Zhu, Xiaodan", 45 | booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation", 46 | year = "2020", 47 | publisher = "International Committee for Computational Linguistics", 48 | url = "https://aclanthology.org/2020.semeval-1.40", 49 | doi = "10.18653/v1/2020.semeval-1.40", 50 | pages = "322--335", 51 | } 52 | ``` -------------------------------------------------------------------------------- /data/Counterfactual/README_ZH.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual/README_ZH.md) 16 | ## 反事实检测数据集 17 | 18 | 19 | 20 | 反事实检测(Counterfactual recognition, CR)数据集来自SemEval2020 Task5中的子任务1-反事实陈述检测(Recognizing Counterfactual Statements, RCS),数据采集自政治、金融和健康领域,共有13k训练数据以及7k测试数据。 21 | 22 | ### 数据下载及处理 23 | 在使用数据处理及下载脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 24 | ```shell 25 | bash download_and_process_counterfactual.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 26 | ``` 27 | **参数含义:** 28 | + DATA_DIR: 下载的原始数据所在文件夹, 默认为当前文件夹 29 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`JSON_Counterfactual` 30 | + STANFORD_DIR: 斯坦福解析器所在位置,默认为上一级目录的`stanford-corenlp-3.9.2-minimal` 31 | 32 | **注意:** 如果共享的解析器文件夹不存在,那么程序会询问是否从我们的镜像数据源拉取斯坦福解析器(共353MB),选择`Y`即可 33 | 34 | ### 数据来源 35 | 本部分数据来自SemEval2020 Task5,我们的镜像数据来源官方Github[SemEval2020_Task5](https://github.com/Jiaqi1008/SemEval2020_Task5),如果您也使用了该数据集,您可以引用他们的论文: 36 | ``` 37 | @inproceedings{yang-etal-2020-semeval, 38 | title = "{S}em{E}val-2020 Task 5: Counterfactual Recognition", 39 | author = "Yang, Xiaoyu and 40 | Obadinma, Stephen and 41 | Zhao, Huasha and 42 | Zhang, Qiong and 43 | Matwin, Stan and 44 | Zhu, Xiaodan", 45 | booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation", 46 | year = "2020", 47 | publisher = "International Committee for Computational Linguistics", 48 | url = "https://aclanthology.org/2020.semeval-1.40", 49 | doi = "10.18653/v1/2020.semeval-1.40", 50 | pages = "322--335", 51 | } 52 | ``` -------------------------------------------------------------------------------- /data/Counterfactual/download_and_process_counterfactual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATA_DIR=$1 4 | OUTPUT_DIR=$2 5 | STANFORD_DIR=$3 6 | EXECUTE_DIR=$4 7 | 8 | if test -z "$DATA_DIR" 9 | then 10 | DATA_DIR='.' 11 | fi 12 | 13 | if test -z "$OUTPUT_DIR" 14 | then 15 | OUTPUT_DIR='JSON_Counterfactual' 16 | fi 17 | 18 | if test -z "$STANFORD_DIR" 19 | then 20 | STANFORD_DIR='../stanford-corenlp-3.9.2-minimal' 21 | fi 22 | 23 | if test -z "$EXECUTE_DIR" 24 | then 25 | EXECUTE_DIR='.' 
26 | fi 27 | 28 | echo "Download counterfactual data in mirror source of ZJU MMF" 29 | echo "Origin data can be found in https://github.com/Jiaqi1008/SemEval2020_Task5" 30 | 31 | TRAIN_FILE=${DATA_DIR}/counterfactual_train.csv 32 | TEST_FILE=${DATA_DIR}/counterfactual_test.csv 33 | wget -O $TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/subtask1_train.csv 34 | wget -O $TEST_FILE https://expic.xlxw.org/hycxg/datamirror/subtask1_test.csv 35 | 36 | echo "Process csv data to HyCxG format" 37 | python $EXECUTE_DIR/process_counterfactual.py --train_file $TRAIN_FILE --test_file $TEST_FILE \ 38 | --out_path $OUTPUT_DIR \ 39 | --stanford_path $STANFORD_DIR 40 | -------------------------------------------------------------------------------- /data/Counterfactual/process_counterfactual.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | import sys 4 | import json 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | import argparse 9 | from tqdm import tqdm 10 | from stanfordcorenlp import StanfordCoreNLP 11 | try: 12 | sys.path.append('.') 13 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK 14 | except: 15 | sys.path.append('..') 16 | from download_stanfordcore import download_stanfordcore, unzip_stanfordcore, STANFORD_CORE_LINK 17 | 18 | def initialize_stanfordcore(stanford_path: str): 19 | try: nlpmodel = StanfordCoreNLP(stanford_path) 20 | except: 21 | print('The script need stanfordparser>=3.9.2 package, while the path is not exist for the package, do you want to download it? (Y/N)') 22 | download_flag = input() 23 | download_flag = download_flag.lower() 24 | assert download_flag in ['y'], "Abort" 25 | download_stanfordcore(STANFORD_CORE_LINK, stanford_path+'.zip') 26 | unzip_stanfordcore(stanford_path+'.zip', '../') 27 | nlpmodel = StanfordCoreNLP(stanford_path) 28 | return nlpmodel 29 | 30 | def data2json(data: list, nlpmodel, desc:str='train'): 31 | def parse_adj(edge): 32 | e_id, dep_rels, dep_heads = 1, [], [] 33 | for eidx in range(len(edge)): 34 | if (eidx + 1) != edge[0][2]: 35 | dep_heads.append(edge[e_id][1]) 36 | dep_rels.append(edge[e_id][0]) 37 | e_id += 1 38 | else: 39 | dep_heads.append(0) 40 | dep_rels.append(edge[0][0]) 41 | return dep_heads, dep_rels 42 | 43 | data_out = [] 44 | for dat in tqdm(data, desc='Process {} file in Counterfactual'.format(desc)): 45 | obj ={} 46 | gid =dat[0] 47 | obj['id'] = gid 48 | sentence = dat[1].strip() 49 | if sentence == '#NAME?': continue 50 | label = dat[2] 51 | pos_tag = nlpmodel.pos_tag(sentence) 52 | dependecy = nlpmodel.dependency_parse(sentence) 53 | obj['token'] = nlpmodel.word_tokenize(sentence) 54 | obj['pos'] = [tag[1] for tag in pos_tag] 55 | heads, rels = parse_adj(dependecy) 56 | obj['head'] = heads 57 | obj['deprel'] = rels 58 | obj['label'] = label 59 | data_out.append(obj) 60 | return data_out 61 | 62 | def read_semeval_data(path : str): 63 | data_out = [] 64 | data = np.array(pd.read_csv(path)) 65 | for dat in data: data_out.append([dat[0], dat[2], dat[1]]) 66 | return data_out 67 | 68 | def output_json(data: list, folder_path: str, file_path: str): 69 | if not os.path.exists(folder_path): os.makedirs(folder_path) 70 | with open(os.path.join(folder_path, file_path), 'w', encoding='utf-8') as fp: 71 | fp.write(json.dumps(data, indent=4)) 72 | fp.close() 73 | 74 | def process_data(args: argparse.Namespace): 75 | assert os.path.exists(args.train_file) 
and os.path.exists(args.test_file), "The data files do not exist, please download them first." 76 | train_semeval_data = read_semeval_data(args.train_file) 77 | train_json = data2json(train_semeval_data, args.nlpmodel) 78 | output_json(train_json, args.out_path, 'train.json') 79 | print('The train file of Counterfactual dataset is saved at %s' % os.path.join(args.out_path, 'train.json')) 80 | 81 | test_semeval_data = read_semeval_data(args.test_file) 82 | test_json = data2json(test_semeval_data, args.nlpmodel, desc='test') 83 | output_json(test_json, args.out_path, 'test.json') 84 | print('The test file of Counterfactual dataset is saved at %s' % os.path.join(args.out_path, 'test.json')) 85 | 86 | def main(): 87 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 88 | parser.add_argument("--out_path", default='dataset/GLUE_Counterfactual', type=str, help="Output path of Counterfactual dataset.") 89 | parser.add_argument("--stanford_path", default='stanford-corenlp-3.9.2-minimal', type=str, help="The path for stanfordparser.") 90 | parser.add_argument("--train_file", default='counterfactual_train.csv', type=str, help="The path of train file.") 91 | parser.add_argument("--test_file", default='counterfactual_test.csv', type=str, help="The path of test file.") 92 | args = parser.parse_args() 93 | args.nlpmodel = initialize_stanfordcore(args.stanford_path) 94 | process_data(args) 95 | print('Counterfactual data has been processed.') 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /data/GLUE/README.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE/README_ZH.md) 17 | 18 | ## GLUE Benchmark 19 | 20 | The General Language Understanding Evaluation (GLUE) benchmark is a common collection of natural language understanding systems based on a set of nine understanding tasks constructed from various existing natural language understanding datasets. These tasks are carefully selected to include a diverse range of dataset sizes, types, and difficulty levels. We evaluated the performance of this benchmark dataset on eight tasks, including `CoLA` (linguistic acceptability), `SST-2` (sentiment analysis), `MRPC`/`STS-B`/`QQP` (semantic similarity computation and equivalence matching), `MNLI`/`QNLI`/`RTE` (natural language inference). 21 | 22 | ### Download and process the data 23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (Note: you may not attach any parameters, all parameters have default values): 24 | ```shell 25 | bash download_and_process_glue.sh [--OUTPUT_DIR] [--STANFORD_DIR] [--TASK] 26 | ``` 27 | **Parameters:** 28 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`. 29 | + STANFORD_DIR: The location of the Stanford parser. The default parameter is `stanford-corenlp-3.9.2-minimal` in the parent directory. 30 | + TASK: The GLUE tasks that need to be handled can be selected from [cola, sst2, mnli, qnli, qqp, rte, mrpc, stsb]. If you want to download all of them, you can use `all` directly. 31 | 32 | **Note:** If the shared parser directory does not exist, the program will ask if you want to fetch the Stanford parser from our mirror data source, which is 353MB in size. Choose `Y` to proceed. In addition, the MNLI task is relatively special, so there will be two extra files (matched/mismatched) in the output. 33 | 34 | ### Resource of data 35 | This data is proposed in GLUE benchmark, and our data comes from datasets library of Hugging Face [GLUE](https://huggingface.co/datasets?sort=downloads&search=glue). If you also want to use this dataset, you can cite their paper: 36 | ``` 37 | @inproceedings{wang2018glue, 38 | title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", 39 | author = "Wang, Alex and 40 | Singh, Amanpreet and 41 | Michael, Julian and 42 | Hill, Felix and 43 | Levy, Omer and 44 | Bowman, Samuel", 45 | booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}", 46 | month = nov, 47 | year = "2018", 48 | url = "https://aclanthology.org/W18-5446", 49 | } 50 | ``` 51 | Or the other bib below: 52 | ``` 53 | @inproceedings{wangglue, 54 | title={GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, 55 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R}, 56 | booktitle={International Conference on Learning Representations (ICLR)}, 57 | year = "2019" 58 | } 59 | ``` -------------------------------------------------------------------------------- /data/GLUE/README_ZH.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE/README_ZH.md) 16 | ## GLUE基准数据集 17 | 18 | 通用自然语言理解评估(General Language Understanding Evaluation, GLUE)基准数据集是一个常用的自然语言理解系统的集合,基于已有的各方面自然语言理解数据集构建的九项句子语义理解任务的基准测试,这些任务被精选出来包括多样化的数据集大小、文本类型和难度程度。我们针对它的8个任务进行了性能评估,分别是`CoLA`(语言可接受度)、`SST-2`(文本情感分析)、`MRPC`/`STS-B`/`QQP`(语义相似度计算以及等价性匹配)、`MNLI`/`QNLI`/`RTE`(自然语言推理)。 19 | ### 数据下载及处理 20 | 在使用数据处理及下载脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 21 | ```shell 22 | bash download_and_process_glue.sh [--OUTPUT_DIR] [--STANFORD_DIR] [--TASK] 23 | ``` 24 | **参数含义:** 25 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`dataset` 26 | + STANFORD_DIR: 斯坦福解析器所在位置,默认为上一级目录的`stanford-corenlp-3.9.2-minimal` 27 | + TASK: 需要处理的GLUE任务,可以选择[cola, sst2, mnli, qnli, qqp, rte, mrpc, stsb],如果都要下载可以直接使用`all`替代 28 | 29 | **注意:** 如果共享的解析器文件夹不存在,那么程序会询问是否从我们的镜像数据源拉取斯坦福解析器(共353MB),选择`Y`即可。除此之外,`MNLI`任务较为特别因此输出的文件会多两个文件(matched/mismatched) 30 | 31 | ### 数据来源 32 | 本部分数据来自GLUE基准,我们的数据来源Hugging Face的datasets库[GLUE](https://huggingface.co/datasets?sort=downloads&search=glue),如果您也使用了该数据集,您可以引用他们的论文: 33 | ``` 34 | @inproceedings{wang2018glue, 35 | title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", 36 | author = "Wang, Alex and 37 | Singh, Amanpreet and 38 | Michael, Julian and 39 | Hill, Felix and 40 | Levy, Omer and 41 | Bowman, Samuel", 42 | booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}", 43 | month = nov, 44 | year = "2018", 45 | url = "https://aclanthology.org/W18-5446", 46 | } 47 | ``` 48 | 或者 49 | ``` 50 | @inproceedings{wangglue, 51 | title={GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, 52 | author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R}, 53 | booktitle={International Conference on Learning Representations (ICLR)}, 54 | year = "2019" 55 | } 56 | ``` -------------------------------------------------------------------------------- /data/GLUE/download_and_process_glue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | OUTPUT_DIR=$1 4 | STANFORD_DIR=$2 5 | TASK=$3 6 | EXECUTE_DIR=$4 7 | 8 | if test -z "$OUTPUT_DIR" 9 | then 10 | OUTPUT_DIR='dataset' 11 | fi 12 | 13 | if test -z "$STANFORD_DIR" 14 | then 15 | STANFORD_DIR='../stanford-corenlp-3.9.2-minimal' 16 | fi 17 | 18 | if test -z "$TASK" 19 | then 20 | TASK='all' 21 | fi 22 | 23 | if test -z "$EXECUTE_DIR" 24 | then 25 | EXECUTE_DIR='.' 26 | fi 27 | 28 | echo "Original data can be found in https://gluebenchmark.com/" 29 | 30 | echo "Process data to HyCxG format (depend on Hugging Face)" 31 | python $EXECUTE_DIR/download_and_process_glue.py --task $TASK \ 32 | --out_path $OUTPUT_DIR \ 33 | --stanford_path $STANFORD_DIR 34 | -------------------------------------------------------------------------------- /data/Multilingual/README.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/README_ZH.md) 17 | 18 | ## Multilingual Dataset for ABSA 19 | 20 | The Multilingual sentiment analysis dataset is based on Semeval 2016 Task 5, which provides data in 8 different languages from various domains. We selected 4 different languages from the Restaurant domain for the multilingual performance evaluation, including `French`, `Spanish`, `Turkish`, and `Dutch`. 21 | 22 | ### Download and process the data 23 | Before using the data processing and downloading script, please make sure that the dependencies in [`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt) have already been installed. After installing the dependencies, use the following command to download and process the data (note: you may not attach any parameters, all parameters have default values): 24 | ```shell 25 | bash download_and_process_multilingual.sh [--DATA_DIR] [--OUTPUT_DIR] 26 | ``` 27 | **Parameters:** 28 | + DATA_DIR: The folder where the downloaded raw data is located. The default parameter is the current folder. 29 | + OUTPUT_DIR: The folder where the processed data is stored. The default parameter is `dataset`. 30 | 31 | **Note:** The `PORT` variable in the script represents the startup port of the Stanza server, which is set to `9000` by default. If this port is already in use on your machine, you need to manually set a different port in the script. 32 | 33 | ### Data processing for baseline models 34 | In Section4.3 - Multilingual results of our paper, we compared the performance of four models, namely RGAT, DualGCN, DGEDT, and KumaGCN on these multilingual sentiment datasets. Therefore, we provide a conversion script for processing the data in the [`baseline`](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/baseline) folder. For more information on reproducing the baseline models, please refer to the [`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines). 35 | 36 | ### Resource of data 37 | Our mirror data is obtained from SemEval 2016, as well as the data sources [SemEval-2016 Task 5](https://alt.qcri.org/semeval2016/task5/). If you have also used this dataset, you can cite their papers as follows: 38 | ``` 39 | @inproceedings{pontiki2016semeval, 40 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis", 41 | author = {Pontiki, Maria and 42 | Galanis, Dimitris and 43 | Papageorgiou, Haris and 44 | Androutsopoulos, Ion and 45 | Manandhar, Suresh and 46 | AL-Smadi, Mohammad and 47 | Al-Ayyoub, Mahmoud and 48 | Zhao, Yanyan and 49 | Qin, Bing and 50 | De Clercq, Orph{\'e}e and 51 | Hoste, V{\'e}ronique and 52 | Apidianaki, Marianna and 53 | Tannier, Xavier and 54 | Loukachevitch, Natalia and 55 | Kotelnikov, Evgeniy and 56 | Bel, Nuria and 57 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and 58 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en}, 59 | booktitle = "{S}em{E}val-2016", 60 | year = "2016", 61 | url = "https://aclanthology.org/S16-1002", 62 | pages = "19--30", 63 | } 64 | ``` 65 | -------------------------------------------------------------------------------- /data/Multilingual/README_ZH.md: -------------------------------------------------------------------------------- 1 |

2 | [README header: logo images and "GitHub" badge] ... 12 |
13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/README_ZH.md) 16 | ## 多语情感数据集 17 | 18 | 多语(Multilingual)情感数据集基于Semeval 2016的Task5,其除了常规英语外还提供了不同领域的8种语言的数据。我们从其中的餐厅领域(Resturant)选择了4个不同的语言作为多语性能评估数据集,它们分别是`法语`、`西班牙语`、`土耳其语`以及`荷兰语`。 19 | 20 | ### 数据下载及处理 21 | 在使用数据处理及下载脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 22 | ```shell 23 | bash download_and_process_multilingual.sh [--DATA_DIR] [--OUTPUT_DIR] 24 | ``` 25 | **参数含义:** 26 | + DATA_DIR: 下载的原始数据所在文件夹, 默认为当前文件夹 27 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`dataset` 28 | 29 | **注意:** 在脚本内的PORT变量表示stanza服务器启动端口,默认为`9000`,如果您的机器上该端口被占用,您需要手动在脚本中进行设置 30 | 31 | ### 基准模型数据处理 32 | 在论文的Section4.3 - Multilingual results中,我们对比了RGAT、DualGCN、DGEDT以及KumaGCN这四个模型在多语情感数据集上的性能,因此在[`baseline文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual/baseline)中我们提供了四种语言数据转换脚本(我们尽可能用了和官方代码一致的处理工具包)。更多关于基线模型的复现信息请见[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)。 33 | 34 | ### 数据来源 35 | 本部分数据来自SemEval 2016,我们的镜像数据来源[SemEval-2016 Task 5](https://alt.qcri.org/semeval2016/task5/),如果您也使用了该数据集,您可以引用他们的论文: 36 | ``` 37 | @inproceedings{pontiki2016semeval, 38 | title = "{S}em{E}val-2016 Task 5: Aspect Based Sentiment Analysis", 39 | author = {Pontiki, Maria and 40 | Galanis, Dimitris and 41 | Papageorgiou, Haris and 42 | Androutsopoulos, Ion and 43 | Manandhar, Suresh and 44 | AL-Smadi, Mohammad and 45 | Al-Ayyoub, Mahmoud and 46 | Zhao, Yanyan and 47 | Qin, Bing and 48 | De Clercq, Orph{\'e}e and 49 | Hoste, V{\'e}ronique and 50 | Apidianaki, Marianna and 51 | Tannier, Xavier and 52 | Loukachevitch, Natalia and 53 | Kotelnikov, Evgeniy and 54 | Bel, Nuria and 55 | Jim{\'e}nez-Zafra, Salud Mar{\'\i}a and 56 | Eryi{\u{g}}it, G{\"u}l{\c{s}}en}, 57 | booktitle = "{S}em{E}val-2016", 58 | year = "2016", 59 | url = "https://aclanthology.org/S16-1002", 60 | pages = "19--30", 61 | } 62 | ``` 63 | -------------------------------------------------------------------------------- /data/Multilingual/baseline/DGEDT_french_dutch_spanish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import spacy 3 | import pickle 4 | import tqdm 5 | nlp = spacy.load('fr') # fr - franch / es - spanish / nl -dutch 6 | import re 7 | 8 | def tokenize(text): 9 | text=text.strip() 10 | text=re.sub(r' {2,}',' ',text) 11 | document = nlp(text) 12 | return [token.text for token in document] 13 | 14 | def update_edge(text,vocab): 15 | # https://spacy.io/docs/usage/processing-text 16 | document = nlp(text) 17 | seq_len = len(text.split()) 18 | for token in document: 19 | if token.dep_ not in vocab: 20 | vocab[token.dep_]=len(vocab) 21 | return 0 22 | def dependency_adj_matrix(text,edge_vocab): 23 | # https://spacy.io/docs/usage/proclessing-text 24 | document = nlp(text.strip()) 25 | seq_len = len(tokenize(text)) 26 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 27 | matrix1 = np.zeros((seq_len, seq_len)).astype('float32') 28 | edge = np.zeros((seq_len, seq_len)).astype('int32') 29 | edge1 = np.zeros((seq_len, seq_len)).astype('int32') 30 | assert len(document)==seq_len 31 | for token in document: 32 | if token.i >= seq_len: 33 | print('bug') 34 | print(text) 35 | print(text.split()) 36 | print(document) 37 | print([token.i for token in document]) 38 | print([token.text for token in document]) 
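            # Debug branch: the prints above dump the case where spaCy's token index
            # exceeds the expected sequence length; input() below pauses for inspection.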
39 | a=input('hahha') 40 | if token.i < seq_len: 41 | matrix[token.i][token.i] = 1 42 | matrix1[token.i][token.i] = 1 43 | # https://spacy.io/docs/api/token 44 | for child in token.children: 45 | if child.i < seq_len: 46 | matrix[token.i][child.i] = 1 47 | matrix1[child.i][token.i] = 1 48 | edge[token.i][child.i] = edge_vocab.get(child.dep_,1) 49 | edge1[child.i][token.i] = edge_vocab.get(child.dep_,1) 50 | return matrix,matrix1,edge,edge1 51 | def concat(texts,aspect): 52 | source='' 53 | splitnum=0 54 | for i,text in enumerate(texts): 55 | source+=text 56 | splitnum+=len(tokenize(text)) 57 | if i ':0,'':1} 74 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 75 | lines = fin.readlines() 76 | fin.close() 77 | idx2graph = {} 78 | fout = open(filename+'.graph', 'wb') 79 | if savevocab: 80 | fout1 = open(filename+'.edgevocab', 'wb') 81 | if savevocab: 82 | for i in tqdm.tqdm(range(0, len(lines), 3)): 83 | text_left = [s.lower().strip() for s in lines[i].split("$T$")] 84 | aspect = lines[i + 1].lower().strip() 85 | update_edge(concat(text_left,aspect),edge_vocab) 86 | for i in tqdm.tqdm(range(0, len(lines), 3)): 87 | text_left = [s.lower().strip() for s in lines[i].split("$T$")] 88 | aspect = lines[i + 1].lower().strip() 89 | adj_matrix = dependency_adj_matrix(concat(text_left,aspect),edge_vocab) 90 | idx2graph[i] = adj_matrix 91 | pickle.dump(idx2graph, fout) 92 | if savevocab: 93 | pickle.dump(edge_vocab, fout1) 94 | fout.close() 95 | if savevocab: 96 | fout1.close() 97 | return edge_vocab 98 | def processe(filename,filename2): 99 | savevocab=True 100 | 101 | edge_vocab={'':0,'':1} 102 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 103 | lines = fin.readlines() 104 | fin.close() 105 | idx2graph = {} 106 | fout = open(filename+'.graph', 'wb') 107 | if savevocab: 108 | fout1 = open(filename+'.edgevocab', 'wb') 109 | if savevocab: 110 | for i in tqdm.tqdm(range(0, len(lines), 1)): 111 | update_edge(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab) 112 | for i in tqdm.tqdm(range(0, len(lines), 1)): 113 | adj_matrix = dependency_adj_matrix(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab) 114 | idx2graph[i] = adj_matrix 115 | pickle.dump(idx2graph, fout) 116 | if savevocab: 117 | pickle.dump(edge_vocab, fout1) 118 | fout.close() 119 | if savevocab: 120 | fout1.close() 121 | return edge_vocab 122 | if __name__ == '__main__': 123 | # e.g., french 124 | edge_vocab = process('./datasets/french/restaurant_train.raw',None, True) 125 | process('./datasets/french/restaurant_test.raw', edge_vocab, False) 126 | -------------------------------------------------------------------------------- /data/Multilingual/baseline/DGEDT_turkish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import tqdm 4 | import re 5 | 6 | import stanfordnlp 7 | nlpmodel = stanfordnlp.Pipeline(processors='tokenize,pos,depparse', lang="tr") 8 | 9 | def tokenize(text): 10 | text=text.strip() 11 | text=re.sub(r' {2,}',' ',text) 12 | document = nlpmodel(text) 13 | tokens = [] 14 | for sentence in document.sentences: 15 | tok= [word.text for word in sentence.words] 16 | tokens.extend(tok) 17 | return tokens 18 | 19 | def update_edge(text,vocab): 20 | # https://spacy.io/docs/usage/processing-text 21 | document = nlpmodel(text) 22 | tokens = [] 23 | for sentence in document.sentences: 24 | dep = [word.dependency_relation for word in sentence.words] 25 | tokens.extend(dep) 26 | seq_len = 
len(text.split()) 27 | for token in tokens: 28 | if token not in vocab: 29 | vocab[token]=len(vocab) 30 | return 0 31 | 32 | 33 | def dependency_adj_matrix(text,edge_vocab): 34 | # https://spacy.io/docs/usage/processing-text 35 | document = nlpmodel(text) 36 | deprels = [] 37 | for sentence in document.sentences: 38 | dep = [(word.dependency_relation, word.governor, eval(word.index)) for word in sentence.words] 39 | deprels.extend(dep) 40 | seq_len = len(tokenize(text)) 41 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 42 | matrix1 = np.zeros((seq_len, seq_len)).astype('float32') 43 | edge = np.zeros((seq_len, seq_len)).astype('int32') 44 | edge1 = np.zeros((seq_len, seq_len)).astype('int32') 45 | 46 | for tokid in range(len(deprels)): 47 | matrix[tokid][tokid] = 1 48 | matrix1[tokid][tokid] = 1 49 | 50 | for link in deprels: 51 | if link[0] == 'root': 52 | continue 53 | matrix[link[1] - 1][link[2] - 1] = 1 54 | matrix1[link[2] - 1][link[1] - 1] = 1 55 | edge[link[1] - 1][link[2] - 1] = edge_vocab.get(link[0], 1) 56 | edge1[link[2] - 1][link[1] - 1] = edge_vocab.get(link[0],1) 57 | return matrix, matrix1, edge, edge1 58 | 59 | def concat(texts,aspect): 60 | source='' 61 | splitnum=0 62 | for i, text in enumerate(texts): 63 | source+=text 64 | if text == '80 tl.': text = '80 tl .' 65 | if len(text) < 1: 66 | splitnum +=0 67 | else: 68 | splitnum+=len(tokenize(text)) 69 | if i ':0,'':1} 88 | fin = open(filename, 'r', newline='\n', errors='ignore') 89 | lines = fin.readlines() 90 | fin.close() 91 | idx2graph = {} 92 | fout = open(filename+'.graph', 'wb') 93 | if savevocab: 94 | fout1 = open(filename+'.edgevocab', 'wb') 95 | if savevocab: 96 | for i in tqdm.tqdm(range(0, len(lines), 3)): 97 | text_left = [s.lower().strip() for s in lines[i].split("$T$")] 98 | aspect = lines[i + 1].lower().strip() 99 | concater = concat(text_left,aspect) 100 | update_edge(concater, edge_vocab) 101 | for i in tqdm.tqdm(range(0, len(lines), 3)): 102 | text_left = [s.lower().strip() for s in lines[i].split("$T$")] 103 | aspect = lines[i + 1].lower().strip() 104 | adj_matrix = dependency_adj_matrix(concat(text_left,aspect),edge_vocab) 105 | idx2graph[i] = adj_matrix 106 | pickle.dump(idx2graph, fout) 107 | if savevocab: 108 | pickle.dump(edge_vocab, fout1) 109 | fout.close() 110 | if savevocab: 111 | fout1.close() 112 | return edge_vocab 113 | def processe(filename,filename2): 114 | savevocab=True 115 | 116 | edge_vocab={'':0,'':1} 117 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 118 | lines = fin.readlines() 119 | fin.close() 120 | idx2graph = {} 121 | fout = open(filename+'.graph', 'wb') 122 | if savevocab: 123 | fout1 = open(filename+'.edgevocab', 'wb') 124 | if savevocab: 125 | for i in tqdm.tqdm(range(0, len(lines), 1)): 126 | update_edge(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab) 127 | for i in tqdm.tqdm(range(0, len(lines), 1)): 128 | adj_matrix = dependency_adj_matrix(re.sub(r' {2,}',' ',lines[i].strip()),edge_vocab) 129 | idx2graph[i] = adj_matrix 130 | pickle.dump(idx2graph, fout) 131 | if savevocab: 132 | pickle.dump(edge_vocab, fout1) 133 | fout.close() 134 | if savevocab: 135 | fout1.close() 136 | return edge_vocab 137 | if __name__ == '__main__': 138 | edge_vocab = process('./datasets/turkish/restaurant_train.raw', None, True) 139 | process('./datasets/turkish/restaurant_test.raw', edge_vocab, False) 140 | -------------------------------------------------------------------------------- /data/Multilingual/baseline/DualGCN_french_dutch_spanish.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import spacy 3 | import pickle 4 | from tqdm import tqdm 5 | import json 6 | nlp = spacy.load('fr') # fr - franch / es - spanish / nl -dutch 7 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 8 | 9 | def read_data(path : str): 10 | with open(path, 'r') as fp: 11 | data = fp.readlines() 12 | fp.close() 13 | data_gp = [] 14 | for idx in range(0, len(data), 3): 15 | sentence = data[idx].strip() 16 | term = data[idx+1].strip() 17 | polarity = eval(data[idx+2].strip()) 18 | data_gp.append([sentence, term, polarity]) 19 | return data_gp 20 | 21 | def construct_data(data : list): 22 | out_data = [] 23 | for text in tqdm(data, desc='Processing'): 24 | sentence, term, polarity = text[0], text[1], text[2] 25 | document = nlp(sentence.replace('$T$', term)) 26 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document]) 27 | sentence_post = sentence.replace('$T$', term) 28 | aspect_sentiment = [[term, MAP_INV[polarity]]] 29 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])]) 30 | term_tok = [tok.text for tok in nlp(term)] 31 | from_to = [[from_idx, from_idx + len(term_tok)]] 32 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document] 33 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document] 34 | dependencies = [list(ele) for ele in list(zip(predicted_dependencies, predicted_heads, list(range(1, len(predicted_heads) + 1))))] 35 | obj = {} 36 | obj['token'] = list(tokens) 37 | obj['pos'] = list(pos_tag) 38 | obj['head'] = predicted_heads 39 | obj['deprel'] = predicted_dependencies 40 | obj['aspects'] = [{ 41 | 'term': term_tok, 42 | 'from' : from_to[0][0], 43 | 'to' : from_to[0][1], 44 | 'polarity' : MAP_INV[polarity] 45 | }] 46 | out_data.append(obj) 47 | return out_data 48 | 49 | if __name__ == '__main__': 50 | train_data = read_data('restaurant_train.raw') 51 | train_data = construct_data(train_data) 52 | with open('train.json', 'w', encoding='utf-8') as tr_fp: 53 | json_str = json.dumps(train_data, indent=4) 54 | tr_fp.write(json_str) 55 | tr_fp.close() 56 | 57 | test_data = read_data('restaurant_test.raw') 58 | test_data = construct_data(test_data) 59 | with open('test.json', 'w', encoding='utf-8') as te_fp: 60 | json_str = json.dumps(test_data, indent=4) 61 | te_fp.write(json_str) 62 | te_fp.close() 63 | print('WELL DONE.') 64 | -------------------------------------------------------------------------------- /data/Multilingual/baseline/DualGCN_turkish.py: -------------------------------------------------------------------------------- 1 | try: 2 | import xml.etree.cElementTree as ET 3 | except ImportError: 4 | import xml.etree.ElementTree as ET 5 | 6 | from tqdm import tqdm 7 | import json 8 | import stanfordnlp 9 | nlpmodel = stanfordnlp.Pipeline(processors='tokenize,pos,depparse', lang="tr") 10 | 11 | def parse_adj(edge): 12 | e_id, dep_rels, dep_heads = 1, [], [] 13 | for eidx in range(len(edge)): 14 | if (eidx + 1) != edge[0][2]: 15 | dep_heads.append(edge[e_id][1]) 16 | dep_rels.append(edge[e_id][0]) 17 | e_id += 1 18 | else: 19 | dep_heads.append(0) 20 | dep_rels.append(edge[0][0]) 21 | return dep_heads, dep_rels 22 | 23 | def obtain_annotate(results: dict, only_tokens: bool=False, encode_eng:bool=False): 24 | tokens, postag, heads, deprels = [], [], [], [] 25 | for sentence in results.sentences: 26 | tok, pos = zip(*[[word.text, 
word.xpos.split('|')[0]] for word in sentence.words]) 27 | postag.extend(pos) 28 | tokens.extend(tok) 29 | dep = [(word.dependency_relation if word.dependency_relation != 'root' else 'ROOT', word.governor, eval(word.index)) for word in sentence.words] 30 | head, rel = parse_adj(dep) 31 | heads.extend([he+len(heads) for he in head]) 32 | deprels.extend(rel) 33 | if not only_tokens: 34 | return tokens, postag, heads, deprels 35 | else: 36 | return tokens 37 | 38 | def parse_xml(path : str): 39 | data = [] 40 | tree = ET.parse(path) 41 | root = tree.getroot() 42 | for review in tqdm(root.findall('Review'), 'Process'): 43 | for sentences in review.findall('sentences'): 44 | for sentence in sentences.findall('sentence'): 45 | obj = {} 46 | text = sentence.find('text').text 47 | if text is None or len(text) < 1: continue 48 | results = nlpmodel(text) 49 | tokens, pos_tag, heads, rels = obtain_annotate(results) 50 | obj['token'] = tokens 51 | obj['pos'] = pos_tag 52 | obj['head'] = heads 53 | obj['deprel'] = rels 54 | asp_total = [] 55 | for asps in sentence.findall('Opinions'): 56 | for asp in asps.findall('Opinion'): 57 | aspect_dict = {} 58 | from_idx = eval(asp.get('from')) 59 | to_idx = eval(asp.get('to')) 60 | polarity = asp.get('polarity') 61 | term = asp.get('target') 62 | if polarity == 'conflict': continue 63 | if term == 'NULL':continue 64 | context_l = text[:from_idx] 65 | term_lr = text[from_idx:to_idx] 66 | if term_lr.lower() != term: 67 | print(text + ' / ' + term) 68 | if len(context_l) > 0: 69 | token_l = obtain_annotate(nlpmodel(context_l), only_tokens=True) 70 | else: 71 | token_l = [] 72 | if len(text[from_idx:to_idx]) < 1: continue 73 | token_term = obtain_annotate(nlpmodel(text[from_idx:to_idx]), only_tokens=True) 74 | aspect_dict['term'] = token_term 75 | aspect_dict['from'] = len(token_l) 76 | aspect_dict['to'] = len(token_l) + len(token_term) 77 | aspect_dict['polarity'] = polarity 78 | asp_total.append(aspect_dict) 79 | 80 | obj['aspects'] = asp_total 81 | if len(asp_total) > 0: data.append(obj) 82 | return data 83 | 84 | if __name__ == '__main__': 85 | # Original Data Source 86 | train_data = parse_xml('ABSA16Tur_Train.xml') 87 | with open('train.json', 'w', encoding='utf-8') as tr_fp: 88 | json_str = json.dumps(train_data, indent=4) 89 | tr_fp.write(json_str) 90 | tr_fp.close() 91 | 92 | test_data = parse_xml('ABSA16Tur_Test.xml') 93 | with open('test.json', 'w', encoding='utf-8') as te_fp: 94 | json_str = json.dumps(test_data, indent=4) 95 | te_fp.write(json_str) 96 | te_fp.close() 97 | print('WELL DONE.') -------------------------------------------------------------------------------- /data/Multilingual/baseline/KumaGCN_french_dutch_spanish.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import spacy 4 | import pickle 5 | nlp = spacy.load('de') 6 | 7 | def dependency_adj_matrix(text): 8 | print(text) 9 | document = nlp(text) 10 | print("[tlog] document: " + str(document)) 11 | #sys.exit(0) 12 | seq_len = len(text.split()) 13 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 14 | pos = [] 15 | dep_rel = [] 16 | i = 0 17 | for sentence in document.sentences: 18 | print("[tlog] sentence: " + str(sentence)) 19 | for word in sentence.words: 20 | #print("[tlog] token: " + str(token.pos_)) 21 | if word.index + i < seq_len: #there are some bugs for here such as SPACE 22 | pos.append(word.pos) 23 | dep_rel.append(word.dependency_relation) 24 | #print("[tlog] token: " + 
str(token.dep_)) 25 | #print("[tlog] token.i: " + str(token.i)) 26 | #print("[tlog] token.children: " + str([child for child in token.children])) 27 | #print("\n") 28 | #sys.exit(0) governor 29 | if word.index + i < seq_len: 30 | index = word.index + i 31 | head_index = word.governor + i 32 | matrix[index][index] = 1 33 | 34 | matrix[head_index][index] = 1 35 | matrix[index][head_index] = 1 36 | 37 | i += len(sentence.words) 38 | #print("[tlog] matrix: " + str(matrix)) 39 | #sys.exit(0) 40 | return matrix, pos, dep_rel 41 | 42 | def dependency_adj_matrix2(text): 43 | # https://spacy.io/docs/usage/processing-text 44 | #print("[tlog] text: " + str(text)) # Maybe for parsing, we should not lower case this 45 | document = nlp(text) 46 | #print("[tlog] document: " + str(document)) 47 | #sys.exit(0) 48 | seq_len = len(text.split()) 49 | matrix = np.zeros((seq_len, seq_len)).astype('float32') 50 | pos = [] 51 | dep_rel = [] 52 | for token in document: 53 | #print("[tlog] token: " + str(token)) 54 | #print("[tlog] token: " + str(token.pos_)) 55 | if token.i < seq_len: #there are some bugs for here such as SPACE 56 | pos.append(token.tag_) 57 | dep_rel.append(token.dep_) 58 | #print("[tlog] token: " + str(token.dep_)) 59 | #print("[tlog] token.i: " + str(token.i)) 60 | #print("[tlog] token.children: " + str([child for child in token.children])) 61 | #print("\n") 62 | #sys.exit(0) 63 | if token.i < seq_len: 64 | matrix[token.i][token.i] = 1 65 | # https://spacy.io/docs/api/token 66 | for child in token.children: # tzy: do not distinguish the arc types 67 | if child.i < seq_len: 68 | matrix[token.i][child.i] = 1 69 | matrix[child.i][token.i] = 1 70 | 71 | #print("[tlog] matrix: " + str(matrix)) 72 | #sys.exit(0) 73 | return matrix, pos, dep_rel 74 | 75 | def process(filename): 76 | print(filename) 77 | fin = open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore') 78 | lines = fin.readlines() 79 | fin.close() 80 | idx2graph = {} 81 | fout = open(filename+'.graph', 'wb') 82 | pos_out = open(filename+'.pos', 'w') 83 | rel_out = open(filename+'.rel', 'w') 84 | for i in range(0, len(lines), 3): 85 | #text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")] 86 | #aspect = lines[i + 1].lower().strip() 87 | text_left, _, text_right = [s.strip() for s in lines[i].partition("$T$")] 88 | aspect = lines[i + 1].strip() 89 | #adj_matrix, pos, rel = dependency_adj_matrix(text_left.strip()+' '+aspect+' '+text_right.strip()) 90 | adj_matrix, pos, rel = dependency_adj_matrix2(text_left.strip()+' '+aspect+' '+text_right.strip()) 91 | idx2graph[i] = adj_matrix 92 | pos_out.write(" ".join(pos)+"\n") 93 | rel_out.write(" ".join(rel)+"\n") 94 | pickle.dump(idx2graph, fout) 95 | fout.close() 96 | 97 | if __name__ == '__main__': 98 | process('./datasets/french/restaurant_train.raw') 99 | process('./datasets/french/restaurant_test.raw') 100 | 101 | process('./datasets/spanish/restaurant_train.raw') 102 | process('./datasets/spanish/restaurant_test.raw') 103 | 104 | process('./datasets/dutch/restaurant_train.raw') 105 | process('./datasets/dutch/restaurant_test.raw') 106 | -------------------------------------------------------------------------------- /data/Multilingual/baseline/RGAT_french_dutch_spanish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import spacy 3 | import pickle 4 | from tqdm import tqdm 5 | import json 6 | nlp = spacy.load('fr') # fr - franch / es - spanish / nl -dutch 7 | MAP_INV = {0 : 'neutral', 1 : 
'positive', -1 : 'negative'} 8 | 9 | def read_data(path : str): 10 | with open(path, 'r') as fp: 11 | data = fp.readlines() 12 | fp.close() 13 | data_gp = [] 14 | for idx in range(0, len(data), 3): 15 | sentence = data[idx].strip() 16 | term = data[idx+1].strip() 17 | polarity = eval(data[idx+2].strip()) 18 | data_gp.append([sentence, term, polarity]) 19 | return data_gp 20 | 21 | def construct_data(data : list): 22 | out_data = [] 23 | for text in tqdm(data, desc='Processing'): 24 | sentence, term, polarity = text[0], text[1], text[2] 25 | document = nlp(sentence.replace('$T$', term)) 26 | tokens, pos_tag = zip(*[[token.text, token.tag_.split('_')[0]] for token in document]) 27 | sentence_post = sentence.replace('$T$', term) 28 | aspect_sentiment = [[term, MAP_INV[polarity]]] 29 | from_idx = len([tok.text for tok in nlp(sentence.split('$T$')[0])]) 30 | term_tok = len([tok.text for tok in nlp(term)]) 31 | from_to = [[from_idx, from_idx + term_tok]] 32 | predicted_dependencies = [token.dep_.split(':')[0] if token.dep_ != 'ROOT' else 'root' for token in document] 33 | predicted_heads = [token.head.i+1 if token.dep_ != 'ROOT' else 0 for token in document] 34 | dependencies = [list(ele) for ele in list(zip(predicted_dependencies, predicted_heads, list(range(1, len(predicted_heads) + 1))))] 35 | obj = {} 36 | obj['sentence'] = sentence_post 37 | obj['tokens'] = list(tokens) 38 | obj['tags'] = list(pos_tag) 39 | obj['predicted_dependencies'] = predicted_dependencies 40 | obj['predicted_heads'] = predicted_heads 41 | obj['dependencies'] = dependencies 42 | obj['aspect_sentiment'] = aspect_sentiment 43 | obj['from_to'] = from_to 44 | out_data.append(obj) 45 | return out_data 46 | 47 | if __name__ == '__main__': 48 | train_data = read_data('restaurant_train.raw') 49 | train_data = construct_data(train_data) 50 | with open('restaurant_Train.json', 'w', encoding='utf-8') as tr_fp: 51 | json_str = json.dumps(train_data, indent=4) 52 | tr_fp.write(json_str) 53 | tr_fp.close() 54 | 55 | test_data = read_data('restaurant_test.raw') 56 | test_data = construct_data(test_data) 57 | with open('restaurant_Test.json', 'w', encoding='utf-8') as te_fp: 58 | json_str = json.dumps(test_data, indent=4) 59 | te_fp.write(json_str) 60 | te_fp.close() 61 | print('WELL DONE.') 62 | -------------------------------------------------------------------------------- /data/Multilingual/baseline/RGAT_turkish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from tqdm import tqdm 4 | import json 5 | 6 | import stanfordnlp 7 | nlpmodel = stanfordnlp.Pipeline(processors='tokenize,pos,depparse', lang="tr") 8 | 9 | MAP_INV = {0 : 'neutral', 1 : 'positive', -1 : 'negative'} 10 | 11 | def parse_adj(edge): 12 | e_id, dep_rels, dep_heads = 1, [], [] 13 | for eidx in range(len(edge)): 14 | if (eidx + 1) != edge[0][2]: 15 | dep_heads.append(edge[e_id][1]) 16 | dep_rels.append(edge[e_id][0]) 17 | e_id += 1 18 | else: 19 | dep_heads.append(0) 20 | dep_rels.append(edge[0][0]) 21 | return dep_heads, dep_rels 22 | 23 | 24 | def obtain_annotate(results: dict, only_tokens: bool=False, encode_eng:bool=False): 25 | tokens, postag, heads, deprels, deps = [], [], [], [], [] 26 | for sentence in results.sentences: 27 | tok, pos = zip(*[[word.text, word.xpos.split('|')[0].upper()] for word in sentence.words]) 28 | postag.extend(pos) 29 | tokens.extend(tok) 30 | dep = [(word.dependency_relation, word.governor, eval(word.index)) for word in sentence.words] 31 | head, rel = 
parse_adj(dep) 32 | heads.extend([he+len(heads) for he in head]) 33 | deprels.extend(rel) 34 | dep =[list(gr) for gr in dep] 35 | for idx in range(len(dep)): 36 | if dep[idx][1] !=0:dep[idx][1] += len(deps) 37 | dep[idx][2] += len(deps) 38 | if dep[idx][0] == 'ROOT' : dep[idx][0] = 'root' 39 | deps.extend(dep) 40 | if not only_tokens: 41 | return tokens, postag, deps, heads, deprels 42 | else: 43 | return tokens 44 | 45 | 46 | def read_data(path : str): 47 | with open(path, 'r') as fp: 48 | data = fp.readlines() 49 | fp.close() 50 | data_gp = [] 51 | for idx in range(0, len(data), 3): 52 | sentence = data[idx].strip() 53 | term = data[idx+1].strip() 54 | polarity = eval(data[idx+2].strip()) 55 | data_gp.append([sentence, term, polarity]) 56 | return data_gp 57 | 58 | def construct_data(data : list): 59 | out_data = [] 60 | for text in tqdm(data, desc='Processing'): 61 | sentence, term, polarity = text[0], text[1], text[2] 62 | results = nlpmodel(sentence.replace('$T$', term)) 63 | tokens, pos_tag, deps, heads, rels = obtain_annotate(results) 64 | predicted_heads = [ele[1] for ele in deps] 65 | sentence_post = sentence.replace('$T$', term) 66 | aspect_sentiment = [[term, MAP_INV[polarity]]] 67 | tok_lr = sentence.split('$T$')[0] 68 | if len(tok_lr) < 1: 69 | from_idx = 0 70 | else: 71 | from_idx = len(obtain_annotate(nlpmodel(sentence.split('$T$')[0]), only_tokens=True)) 72 | term_tok = len(obtain_annotate(nlpmodel(term), only_tokens=True)) 73 | from_to = [[from_idx, from_idx + term_tok]] 74 | obj = {} 75 | obj['sentence'] = sentence_post 76 | obj['tokens'] = list(tokens) 77 | obj['tags'] = list(pos_tag) 78 | obj['predicted_dependencies'] = rels 79 | obj['predicted_heads'] = predicted_heads 80 | obj['dependencies'] = deps 81 | obj['aspect_sentiment'] = aspect_sentiment 82 | obj['from_to'] = from_to 83 | out_data.append(obj) 84 | return out_data 85 | 86 | if __name__ == '__main__': 87 | train_data = read_data('restaurant_train.raw') 88 | train_data = construct_data(train_data) 89 | with open('restaurant_Train.json', 'w', encoding='utf-8') as tr_fp: 90 | json_str = json.dumps(train_data, indent=4) 91 | tr_fp.write(json_str) 92 | tr_fp.close() 93 | 94 | test_data = read_data('restaurant_test.raw') 95 | test_data = construct_data(test_data) 96 | with open('restaurant_Test.json', 'w', encoding='utf-8') as te_fp: 97 | json_str = json.dumps(test_data, indent=4) 98 | te_fp.write(json_str) 99 | te_fp.close() 100 | print('WELL DONE.') 101 | -------------------------------------------------------------------------------- /data/Multilingual/download_and_process_multilingual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DATA_DIR=$1 4 | OUTPUT_DIR=$2 5 | EXECUTE_DIR=$3 6 | PORT=9000 7 | 8 | if test -z "$DATA_DIR" 9 | then 10 | DATA_DIR='.' 11 | fi 12 | 13 | if test -z "$OUTPUT_DIR" 14 | then 15 | OUTPUT_DIR='dataset' 16 | fi 17 | 18 | if test -z "$EXECUTE_DIR" 19 | then 20 | EXECUTE_DIR='.' 
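  # EXECUTE_DIR is the directory that holds process_multilingual.py; it defaults to the current
  # directory, and data/data_pipeline.sh passes "Multilingual" here when run from the parent data/ folder.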
21 | fi 22 | 23 | echo "Download multilingual data in mirror source of ZJU MMF" 24 | echo "Origin data for multilingual dataset can be found in https://alt.qcri.org/semeval2016/task5/" 25 | 26 | echo ">> 1 FRENCH" 27 | FRENCH_TRAIN_FILE=${DATA_DIR}/french_train.raw 28 | FRENCH_TEST_FILE=${DATA_DIR}/french_test.raw 29 | FRENCH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_French 30 | 31 | wget -O $FRENCH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/french_train.raw 32 | wget -O $FRENCH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/french_test.raw 33 | 34 | echo "Process french raw data to HyCxG format" 35 | python $EXECUTE_DIR/process_multilingual.py --train_file $FRENCH_TRAIN_FILE --test_file $FRENCH_TEST_FILE \ 36 | --out_path $FRENCH_OUT_DIR --lang french --port $PORT 37 | 38 | echo ">> 2 SPANISH" 39 | SPANISH_TRAIN_FILE=${DATA_DIR}/spanish_train.raw 40 | SPANISH_TEST_FILE=${DATA_DIR}/spanish_test.raw 41 | SPANISH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Spanish 42 | 43 | wget -O $SPANISH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/spanish_train.raw 44 | wget -O $SPANISH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/spanish_test.raw 45 | 46 | echo "Process spanish raw data to HyCxG format" 47 | python $EXECUTE_DIR/process_multilingual.py --train_file $SPANISH_TRAIN_FILE --test_file $SPANISH_TEST_FILE \ 48 | --out_path $SPANISH_OUT_DIR --lang spanish --port $PORT 49 | 50 | echo ">> 3 TURKISH" 51 | TURKISH_TRAIN_FILE=${DATA_DIR}/turkish_train.raw 52 | TURKISH_TEST_FILE=${DATA_DIR}/turkish_test.raw 53 | TURKISH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Turkish 54 | 55 | wget -O $TURKISH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/turkish_train.raw 56 | wget -O $TURKISH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/turkish_test.raw 57 | 58 | echo "Process turkish raw data to HyCxG format" 59 | python $EXECUTE_DIR/process_multilingual.py --train_file $TURKISH_TRAIN_FILE --test_file $TURKISH_TEST_FILE \ 60 | --out_path $TURKISH_OUT_DIR --lang turkish --port $PORT 61 | 62 | 63 | echo ">> 4 DUTCH" 64 | DUTCH_TRAIN_FILE=${DATA_DIR}/dutch_train.raw 65 | DUTCH_TEST_FILE=${DATA_DIR}/dutch_test.raw 66 | DUTCH_OUT_DIR=${OUTPUT_DIR}/JSONABSA_Dutch 67 | 68 | wget -O $DUTCH_TRAIN_FILE https://expic.xlxw.org/hycxg/datamirror/dutch_train.raw 69 | wget -O $DUTCH_TEST_FILE https://expic.xlxw.org/hycxg/datamirror/dutch_test.raw 70 | 71 | echo "Process dutch raw data to HyCxG format" 72 | python $EXECUTE_DIR/process_multilingual.py --train_file $DUTCH_TRAIN_FILE --test_file $DUTCH_TEST_FILE \ 73 | --out_path $DUTCH_OUT_DIR --lang dutch --port $PORT 74 | -------------------------------------------------------------------------------- /data/README_ZH.md: -------------------------------------------------------------------------------- 1 |

2 | <!-- lines 2-12: centered project logo images and a GitHub badge (HTML markup stripped in this dump) -->

13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/data) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/data/README_ZH.md) 16 | 17 | ## HyCxG使用的数据集 18 | 19 | 在本工作中,我们用到了以下五个方面的数据集(括号中为在论文中提到的位置): 20 | + **方面级情感分析数据集** `[Rest 14/Lap 14/Rest 15/Rest 16/MAMS]` (4.2节的Results on ABSA tasks) 21 | + **GLUE基准数据集** `[CoLA/SST-2/MNLI/QNLI/RTE/QQP/MRPC/STS]` (4.2节的Results on GLUE tasks) 22 | + **多语言情感分析数据集** `[French/Spanish/Turkish/Dutch]` (4.3节的Multilingual results) 23 | + **反事实检测数据集** (附录F的Pattern Recognition Capability of CxG) 24 | + **口语化情感分析数据集** `[Twitter/GermEval]` (附录H的Colloquial Expression Results) 25 | 26 | ### 方面级情感分析数据集 27 | 方面级情感分析(Aspect-based sentiment analysis)数据集请见[`ABSA文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/ABSA),其包含来自SemEval 2014/15/16的4个数据集以及MAMS数据集。我们在文件夹中给出了数据集的原始链接以及镜像数据下载脚本,并提供了转换为HyCxG所需数据格式的脚本。 28 | 29 | 除此之外,为了方便使用者在其他基线模型上评估性能,我们给不同的基线模型提供了转换脚本可以转换为它们官方数据格式的脚本(更多关于基线模型的信息请见[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)) 30 | 31 | ### GLUE基准数据集 32 | GLUE基准(GLUE benchmark)是常用的自然语言理解任务评估基准,共由11个任务组成,请见[`GLUE文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/GLUE)。我们测试了除`Winograd NLI(WNLI)`以及`Diagnostics Main(DM)`外的所有任务。我们在文件夹中给出了GLUE基准的原始下载链接以及镜像下载脚本(该镜像我们直接使用了Hugging Face datasets中的数据),并提供了转换为HyCxG所需数据格式的脚本。 33 | 34 | ### 多语言情感分析数据集 35 | 多语言情感分析数据集基于SemEval 2016,我们选择了法语、西班牙语、土耳其语以及荷兰语作为多语言实验数据集,请见[`Multilingual文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual)。我们在文件夹中给出了数据集的原始链接以及镜像数据下载脚本,并提供了转换为HyCxG所需数据格式的脚本。 36 | 37 | 另外由于其他基线模型只在英语数据集上进行了性能评估,为了方便使用者能够对比基线模型在多语言实验下的性能,我们提供了不同的基线模型的数据转换脚本以将它们的数据格式进行转换,具体细节请见[`Multilingual文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/Multilingual)以及[`guidelines`](https://github.com/xlxwalex/HyCxG/tree/main/guidelines)。 38 | 39 | ### 反事实检测数据集 40 | 反事实检测(Counterfactual detection)数据集基于SemEval2020的Task5,请见[`Counterfactual文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/Counterfactual)。我们同样在文件夹中给出了数据集的原始链接以及镜像数据下载脚本,并提供了转换为HyCxG所需数据格式的脚本。 41 | 42 | ### 口语化情感分析数据集 43 | 口语化情感分析实验基于Twitter以及GermEval这两个在社交媒体数据上标注的数据集,请见[`Colloquial文件夹`](https://github.com/xlxwalex/HyCxG/tree/main/data/Colloquial)。我们在该文件夹中给出了数据集的原始链接以及镜像数据下载脚本,并提供了转换为HyCxG所需数据格式的脚本。 44 | 45 | ## 快速下载并处理 46 | 除了进入以上各个子目录分别下载数据文件并处理的方式之外,我们也提供了只需一步即可直接下载并处理所有数据的脚本[`data_pipeline.sh`](https://github.com/xlxwalex/HyCxG/tree/main/data/data_pipeline.sh)。在使用脚本前,请您确认已经安装了[`requirements.txt`](https://github.com/xlxwalex/HyCxG/blob/main/requirements.txt)中的依赖包。在安装完依赖包后,用以下命令来获得并处理数据(可以不附加任何参数,所有参数均有默认值): 47 | ```shell 48 | bash data_pipeline.sh [--DATA_DIR] [--OUTPUT_DIR] [--STANFORD_DIR] 49 | ``` 50 | **参数含义:** 51 | + DATA_DIR: 下载的原始数据所在文件夹, 默认为当前文件夹 52 | + OUTPUT_DIR: 处理好后的数据存储的文件夹, 默认为`dataset` 53 | + STANFORD_DIR: 斯坦福解析器所在位置,默认为本目录的`stanford-corenlp-3.9.2-minimal` 54 | 55 | **注意:** 如果您的系统中存有斯坦福解析器,请将`STANFORD_DIR`设置为解析器所在目录。若解析器文件夹不存在,那么程序会自动从镜像源中进行下载(共353MB)。您也可以使用[`download_stanfordcore.py`](https://github.com/xlxwalex/HyCxG/tree/main/data/download_stanfordcore.py)手动进行下载。 56 | 57 | ## 镜像数据源 58 | 由于以上数据集以及解析器均为公开数据集并且能够直接得到,因此为了方便使用者进行统一下载,我们提供了以上数据的备份镜像源(除了GLUE基准数据集)。如果您是数据集以及解析器的版权所有者,并认为在该数据源分发可能违反您的公开许可,请联系[`xlxw@zju.edu.cn`](mailto:xlxw@zju.edu.cn),我们会立刻撤下您的数据集或解析器。 59 | 60 | ## 数据使用规范 61 | 通过下载数据或以任何方式访问这些数据集,请遵守原始数据集的使用条款,具体原始数据集的链接请见各数据集文件夹。请注意这些数据不得用于任何`非法`或`歧视性`的目的。 -------------------------------------------------------------------------------- /data/data_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 
| 3 | DATA_DIR=$1 4 | OUTPUT_DIR=$2 5 | STANFORD_DIR=$3 6 | 7 | if test -z "$DATA_DIR" 8 | then 9 | DATA_DIR='.' 10 | fi 11 | 12 | if test -z "$OUTPUT_DIR" 13 | then 14 | OUTPUT_DIR='dataset' 15 | fi 16 | 17 | if test -z "$STANFORD_DIR" 18 | then 19 | STANFORD_DIR='stanford-corenlp-3.9.2-minimal' 20 | fi 21 | 22 | if [ -d "$STANFORD_DIR" ]; then 23 | echo "$STANFORD_DIR exists, pass" 24 | else 25 | echo "$STANFORD_DIR does not exist, try to download" 26 | python download_stanfordcore.py 27 | fi 28 | 29 | # Download and process ABSA datasets 30 | bash ABSA/download_and_process_absa.sh $DATA_DIR $OUTPUT_DIR $STANFORD_DIR ABSA 31 | 32 | # Download and process GLUE datasets 33 | bash GLUE/download_and_process_glue.sh $OUTPUT_DIR $STANFORD_DIR all GLUE 34 | 35 | # Download and process Colloquial datasets 36 | bash Colloquial/download_and_process_colloquial.sh $DATA_DIR $OUTPUT_DIR $STANFORD_DIR Colloquial 37 | 38 | # Download and process Counterfactual datasets 39 | bash Counterfactual/download_and_process_counterfactual.sh $DATA_DIR $OUTPUT_DIR $STANFORD_DIR Counterfactual 40 | 41 | # Download and process Multilingual datasets 42 | bash Multilingual/download_and_process_multilingual.sh $DATA_DIR $OUTPUT_DIR Multilingual -------------------------------------------------------------------------------- /data/download_stanfordcore.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from tqdm import tqdm 3 | import zipfile 4 | 5 | STANFORD_CORE_LINK = 'https://expic.xlxw.org/hycxg/stanfordcore/stanford-corenlp-3.9.2-minimal.zip' 6 | 7 | def download_stanfordcore(url: str, fname: str): 8 | resp = requests.get(url, stream=True) 9 | total = int(resp.headers.get('content-length', 0)) 10 | 11 | with open(fname, 'wb') as file, tqdm( 12 | desc=fname, 13 | total=total, 14 | unit='iB', 15 | unit_scale=True, 16 | unit_divisor=1024, 17 | ) as bar: 18 | for data in resp.iter_content(chunk_size=1024): 19 | size = file.write(data) 20 | bar.update(size) 21 | 22 | def unzip_stanfordcore(file_name: str, out_path: str=r'.'): 23 | file_zip = zipfile.ZipFile(file_name, 'r') 24 | for file in file_zip.namelist(): 25 | file_zip.extract(file, out_path) 26 | file_zip.close() 27 | 28 | if __name__ == '__main__': 29 | download_stanfordcore(STANFORD_CORE_LINK, 'stanford-corenlp-3.9.2-minimal.zip') 30 | print('>> stanford-corenlp-3.9.2-minimal.zip is downloaded.') 31 | unzip_stanfordcore('stanford-corenlp-3.9.2-minimal.zip') 32 | print('>> Stanford Core files are ready.') 33 | -------------------------------------------------------------------------------- /figures/hycxg-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/figures/hycxg-logo.png -------------------------------------------------------------------------------- /figures/main-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/figures/main-logo.png -------------------------------------------------------------------------------- /figures/sub-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xlxwalex/HyCxG/bc55ea31f339ec36710b7e9d6a22d4fb1577fa20/figures/sub-logo.png -------------------------------------------------------------------------------- /guidelines/README.md: 
-------------------------------------------------------------------------------- 1 |

2 | <!-- lines 2-12: centered project logo images and a GitHub badge (HTML markup stripped in this dump) -->

13 | 14 | --- 15 | 16 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/README_ZH.md) 17 | 18 | ## Guidelines 19 | 20 | Under construction.. -------------------------------------------------------------------------------- /guidelines/README_ZH.md: -------------------------------------------------------------------------------- 1 |

2 | <!-- lines 2-12: centered project logo images and a GitHub badge (HTML markup stripped in this dump) -->

13 | 14 | --- 15 | [**English**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/) | [**简体中文**](https://github.com/xlxwalex/HyCxG/tree/main/guidelines/README_ZH.md) 16 | ## 指南 17 | 18 | 正在建设中.. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.11.0+cu113 2 | transformers==4.30.0 3 | pandas==1.4.2 4 | scipy==1.6.0 5 | six==1.15.0 6 | cytoolz==0.11.0 7 | tqdm==4.64.0 8 | numpy==1.24.1 9 | scikit_learn==1.2.0 10 | nltk==3.5 11 | spacy==2.3.5 12 | allennlp==2.1.0 13 | datasets==2.3.2 14 | stanfordcorenlp==3.9.1.1 15 | stanfordnlp==0.2.0 16 | stanza==1.4.2 17 | -------------------------------------------------------------------------------- /tutorials/01_cxgtokenizer_tutorial.py: -------------------------------------------------------------------------------- 1 | from Tokenizer.constants import * 2 | from Tokenizer.ModelTokenizer import CxGTokenizer 3 | 4 | class ARG_Test: 5 | cxg_vocab_path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/CxG")) 6 | lm_path: str = 'bert-base-uncased' 7 | do_lower_case: bool = True 8 | lm_group: str = 'BERT' 9 | 10 | # Prepare the args 11 | args = ARG_Test() 12 | 13 | # Initialize CxGTokenizer 14 | cxg_tokenizer = CxGTokenizer(args, lang='eng') # Current language is English 15 | 16 | # acquire constructions 17 | test_sentence = "The restaurants try too hard to make fancy food." 18 | constructions = cxg_tokenizer.tokenize(test_sentence, raw=True) 19 | print(constructions) 20 | # Output: 21 | #{ 22 | # 'text': 'The restaurants try too hard to make fancy food.', 23 | # 'token': ['the', 'restaurants', 'try', 'too', 'hard', 'to', 'make', 'fancy', 'food', '.'], 24 | # 'cons_idx': [1501, 10765, 1943], 'cons_start': [3, 3, 4], 'cons_end': [7, 6, 7], 25 | # 'cons_pattern': ['ADV--hard--to--VERB', 'ADV--hard--to', 'hard--PART--VERB'] 26 | # } 27 | -------------------------------------------------------------------------------- /tutorials/02_coverage_solver_tutorial.py: -------------------------------------------------------------------------------- 1 | from Simuann import CxGCoverage 2 | import random 3 | 4 | t_minutes = 0.05 5 | test_cxgs = { 6 | 'the--NOUN--was--ADV' : (1, 5), 7 | 'the--NOUN--was' : (1, 4), 8 | 'NOUN--AUX--ADV' : (2, 5), 9 | 'AUX--so--ADJ' : (3, 6) 10 | } 11 | 12 | cxg_names = list(test_cxgs) 13 | 14 | # Initialize states 15 | init_state = [0] * len(cxg_names) 16 | for _ in range(random.randint(1, len(cxg_names))): 17 | init_state[random.randint(0, len(cxg_names)-1)] = 1 18 | 19 | # Pack inputs 20 | starts, ends, patterns = [], [], [] 21 | for cxg in test_cxgs: 22 | starts.append(test_cxgs[cxg][0]) 23 | ends.append(test_cxgs[cxg][1]) 24 | patterns.append(cxg) 25 | 26 | # Initialize CxGCoverage 27 | cp = CxGCoverage(init_state, patterns, starts, ends, vis=True) 28 | cp.set_schedule(cp.auto(minutes=t_minutes)) 29 | state, energy = cp.anneal() 30 | print() 31 | print('>> Results:') 32 | for ids in range(len(state)): 33 | if state[ids] == 1: 34 | cxg = list(test_cxgs)[ids] 35 | print('CXG : {}, ({}, {})'.format(cxg, test_cxgs[cxg][0], test_cxgs[cxg][1])) 36 | 37 | # Output: 38 | # Temperature Energy Accept Improve Elapsed Remaining 39 | # 0.10000 0.66 0.00% 0.00% 0:00:01 0:00:00 40 | # Temperature Energy Accept Improve Elapsed Remaining 41 | # 0.10000 0.66 0.07% 0.04% 0:00:03 0:00:00 42 | # >> Results: 43 | # CXG : the--NOUN--was, (1, 4) 44 | # CXG : AUX--so--ADJ, (3, 6) 
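The two tutorials above chain together: the construction spans returned by `CxGTokenizer.tokenize` (01) are exactly the `patterns`/`starts`/`ends` lists that `CxGCoverage` consumes (02). Below is a minimal bridging sketch under that assumption; the vocab path, LM name and annealing schedule simply mirror the placeholder values used in the tutorials.

```python
# Bridging sketch (assumes the CxGTokenizer / CxGCoverage interfaces shown in tutorials 01 and 02):
# run the coverage solver on spans produced by the tokenizer instead of a hand-written dictionary.
import os
import random
from Tokenizer.ModelTokenizer import CxGTokenizer
from Simuann import CxGCoverage

class ARG_Test:
    cxg_vocab_path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/CxG"))
    lm_path: str = 'bert-base-uncased'
    do_lower_case: bool = True
    lm_group: str = 'BERT'

random.seed(0)  # fix the seed so the random initial state is reproducible
cxg_tokenizer = CxGTokenizer(ARG_Test(), lang='eng')
cxgs = cxg_tokenizer.tokenize("The restaurants try too hard to make fancy food.", raw=True)

patterns, starts, ends = cxgs['cons_pattern'], cxgs['cons_start'], cxgs['cons_end']
init_state = [0] * len(patterns)
init_state[random.randint(0, len(patterns) - 1)] = 1    # start from a random single-construction state

cp = CxGCoverage(init_state, patterns, starts, ends, vis=False)
cp.set_schedule(cp.auto(minutes=0.05))                  # short schedule, as in tutorial 02
state, energy = cp.anneal()

selected = [(starts[i], ends[i], patterns[i]) for i, keep in enumerate(state) if keep == 1]
print(selected)                                         # constructions kept by the solver
```

The `cxg_max_coverage` helper used in tutorial 03 appears to wrap this same tokenize-then-anneal procedure behind a single call.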
-------------------------------------------------------------------------------- /tutorials/03_hypergraph_tutorial.py: -------------------------------------------------------------------------------- 1 | import os 2 | from utils.coverage import cxg_max_coverage 3 | from utils.hypergraph import construct_graph 4 | from Tokenizer import CxGTokenizer 5 | from transformers import AutoTokenizer 6 | import random 7 | random.seed(0) 8 | 9 | class ARG_Test: 10 | cxg_vocab_path: str = os.path.abspath(os.path.join(os.path.dirname(__file__), "../dataset/Vocab/CxG")) 11 | lm_path: str = 'roberta-base-english' 12 | do_lower_case: bool = True 13 | lm_group: str = 'RoBERTa' 14 | t_minutes: float = 0.05 15 | 16 | # Prepare the args 17 | args = ARG_Test() 18 | cxgprocessor = CxGTokenizer(args, lang='eng') 19 | tokenizer = AutoTokenizer.from_pretrained(args.lm_path) 20 | 21 | # Process the sentence 22 | sentence = 'I can understand the prices if it served better food.' 23 | tokens = tokenizer.tokenize(sentence) 24 | sentence_mask = [0] + [1] * len(tokens) + [0] 25 | tokens = [''] + tokens + [''] 26 | token_ids = tokenizer.convert_tokens_to_ids(tokens) 27 | cxgs = cxgprocessor.tokenize(sentence, raw=True) 28 | selected = cxg_max_coverage(cxgs['cons_start'], cxgs['cons_end'], cxgs['cons_idx'], cxgs['cons_pattern'], T_minutes=args.t_minutes) 29 | hg, edges = construct_graph([selected], [sentence_mask], pad_len=15) 30 | 31 | print('>> Results') 32 | print('Tokens = {}'.format(tokens)) 33 | print('Token ids = {}'.format(token_ids)) 34 | print('constructions = {}'.format(cxgs)) 35 | print('selected constructions = {}'.format(selected)) 36 | print('hypergraph adjs =\n{}'.format(hg)) 37 | 38 | # Outputs: 39 | # >> Results 40 | # Tokens = ['', 'I', 'Ġcan', 'Ġunderstand', 'Ġthe', 'Ġprices', 'Ġif', 'Ġit', 'Ġserved', 'Ġbetter', 'Ġfood', '.', ''] 41 | # Token ids = [0, 100, 64, 1346, 5, 850, 114, 24, 1665, 357, 689, 4, 2] 42 | # constructions = { 43 | # 'text': 'I can understand the prices if it served better food.', 44 | # 'token': ['i', 'can', 'understand', 'the', 'prices', 'if', 'it', 'served', 'better', 'food', '.'], 45 | # 'cons_idx': [5943, 6071, 16646, 6388, 13591, 11402, 786, 4387, 13648, 5683, 12421, 12967], 46 | # 'cons_start': [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 4, 5], 'cons_end': [4, 5, 3, 5, 4, 5, 7, 6, 8, 8, 7, 8], 47 | # 'cons_pattern': ['i--AUX--VERB--DET', 'i--AUX--VERB--DET--NOUN', 'i--AUX--VERB', 'can--VERB--DET--NOUN', 'can--VERB--DET', 'understand--DET--NOUN', 'the--NOUN--SCONJ--PRON', 'the--NOUN--SCONJ', 'the--NOUN--SCONJ--PRON--VERB', 'NOUN--SCONJ--PRON--VERB', 'NOUN--SCONJ--PRON', 'if--PRON--VERB'] 48 | # } 49 | # selected constructions = [(0, 5, 6071, 'i--AUX--VERB--DET--NOUN'), (5, 8, 12967, 'if--PRON--VERB')] 50 | # hypergraph adjs = 51 | # [array([[0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], 52 | # [0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.], 53 | # [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])] --------------------------------------------------------------------------------
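As a reading aid for the printed incidence matrix: each row of `hg[0]` is one selected construction (hyperedge) and each nonzero column is a token position (shifted by one because the sequence starts with the tokenizer's BOS token). A short follow-up sketch, assuming the `hg` and `tokens` variables produced by the tutorial above:

```python
# Follow-up to 03_hypergraph_tutorial.py (assumes `hg` and `tokens` from the script above):
# list the tokens that each hyperedge (selected construction) connects.
import numpy as np

incidence = np.asarray(hg[0])                  # shape: (num_hyperedges_padded, pad_len)
for row_id, row in enumerate(incidence):
    positions = np.nonzero(row)[0]
    if positions.size == 0:                    # rows left all-zero are padding hyperedges
        continue
    print('hyperedge %d covers: %s' % (row_id, [tokens[p] for p in positions]))

# With the outputs shown above this prints:
# hyperedge 0 covers: ['I', 'Ġcan', 'Ġunderstand', 'Ġthe', 'Ġprices']
# hyperedge 1 covers: ['Ġif', 'Ġit', 'Ġserved']
```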