├── .gitignore ├── .gitmodules ├── README.en.md ├── README.md ├── bash ├── finetune.sh └── generate.sh ├── data ├── .DS_Store ├── NaSGEC-Exam │ └── .gitkeep ├── NaSGEC-Media │ ├── .DS_Store │ └── .gitkeep ├── NaSGEC-Thesis │ └── .gitkeep ├── README.md └── dict │ ├── dict.label0.txt │ ├── dict.src.txt │ └── dict.tgt.txt ├── demo.py ├── guidelines └── .gitkeep ├── models └── .gitkeep ├── pics └── data_statistic.png ├── preprocessed └── .gitkeep ├── requirements.txt ├── results └── .gitkeep └── utils ├── convert_from_fairseq_to_huggingface.py ├── segment_bert.py ├── tokenization.py └── vocab.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "SynGEC"]
2 | path = SynGEC
3 | url = https://github.com/HillZhang1999/SynGEC.git
4 | 
--------------------------------------------------------------------------------
/README.en.md:
--------------------------------------------------------------------------------
1 | # NaSGEC: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts
2 | 
3 | [中文](./README.md)|[English](./README.en.md)
4 | 
5 | ## Introduction
6 | We present the first multi-domain Chinese grammatical error correction dataset from native speaker texts, NaSGEC, which includes real erroneous sentences from three domains: social media (Media), academic writing (Thesis), and Chinese exams (Exam). The aim is to promote cross-domain research in Chinese grammatical error correction (CGEC). Each erroneous sentence is annotated by two annotators and reviewed by one expert, thus providing multiple high-quality reference corrections.
7 | 
8 | In addition, we trained a series of high-quality benchmark CGEC models based on Chinese BART, using two kinds of training data: 1) high-quality human-annotated data (Lang8+HSK); 2) data automatically constructed from a large-scale (>100 million) corpus of native texts.
9 | 
10 | Furthermore, we also fine-tuned the above models on the manually annotated NaSGEC dataset to build advanced CGEC models for specific domains.
11 | 
12 | ## NaSGEC Dataset
13 | The NaSGEC dataset includes 12,500 sentences and their corresponding reference corrections from three native Chinese domains, namely:
14 | 
15 | + **Social Media (NaSGEC-Media)**: 4,000 sentences from articles posted on the WeChat official account platform;
16 | + **Academic Writing (NaSGEC-Thesis)**: 1,500 sentences from undergraduate theses in computer science;
17 | + **Chinese Examination (NaSGEC-Exam)**: 7,000 sentences from Chinese exam papers.
18 | 
19 | 
20 | The main data statistics are shown in the table below:
21 | ![Data statistics](./pics/data_statistic.png)
22 | 
23 | For a more detailed description of the data and a cross-domain analysis, please refer to our paper.
24 | 
25 | *Note: the full dataset will be released as soon as possible.*
26 | 
27 | 
28 | ## Benchmark CGEC Models
29 | ### Experimental Environment
30 | Our models are developed on top of the `SynGEC` codebase. The experimental environment can be set up as follows:
31 | 
32 | ```
33 | git clone git@github.com:HillZhang1999/NaSGEC.git
34 | git submodule init && git submodule update --recursive --remote --force
35 | conda create -n nasgec python==3.8
36 | conda activate nasgec
37 | pip install -r requirements.txt
38 | python -m spacy download en
39 | cd ./SynGEC/src/src_syngec/fairseq-0.10.2
40 | pip install --editable ./
41 | ```
42 | 
43 | ### Model Usage
44 | We have released the following 5 CGEC models:
45 | | Model | Link |
46 | | :------- | :---------: |
47 | | **real_learner_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1AamhBi6vJ8RVzzHtr43Uaoqrm7_vPpuB/view?usp=share_link) |
48 | | **pseudo_native_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1dKbrej1Eh_M1DFqtCvvSqso0QUUn9EvC/view?usp=share_link) |
49 | | **pseudo_native_bart_CGEC_media** | [Google Drive](https://drive.google.com/file/d/17dSnSEPq-eyWZ-Uck4G6fO8XwjNfxmDi/view?usp=share_link) |
50 | | **pseudo_native_bart_CGEC_thesis** | [Google Drive](https://drive.google.com/file/d/1J-BFDSxV4eQ2JvFEXdvI2AktZOxNd8rq/view?usp=share_link) |
51 | | **real_learner_bart_CGEC_exam** | [Google Drive](https://drive.google.com/file/d/1iQ0i7JMNXyoKjd5BdAfIPGg3QBLr9Lr3/view?usp=share_link) |
52 | 
53 | In addition to the Fairseq checkpoints above, our models can also be used with `HuggingFace transformers`:
54 | 
55 | ```
56 | from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
57 | tokenizer = BertTokenizer.from_pretrained("HillZhang/real_learner_bart_CGEC")
58 | model = BartForConditionalGeneration.from_pretrained("HillZhang/real_learner_bart_CGEC")
59 | encoded_input = tokenizer(["北京是中国的都。", "他说:”我最爱的运动是打蓝球“", "我每天大约喝5次水左右。", "今天,我非常开开心。"], return_tensors="pt", padding=True, truncation=True)
60 | if "token_type_ids" in encoded_input:
61 |     del encoded_input["token_type_ids"]
62 | output = model.generate(**encoded_input)
63 | print(tokenizer.batch_decode(output, skip_special_tokens=True))
64 | ```
65 | 
66 | Hugging Face Models:
67 | | Model | Link |
68 | | :------- | :---------: |
69 | | **HillZhang/real_learner_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC)|
70 | | **HillZhang/pseudo_native_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC)|
71 | | **HillZhang/pseudo_native_bart_CGEC_media** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_media) |
72 | | **HillZhang/pseudo_native_bart_CGEC_thesis** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_thesis) |
73 | | **HillZhang/real_learner_bart_CGEC_exam** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC_exam) |
74 | 
75 | 
76 | ### Performance Evaluation
77 | The metric used in our paper is based on [MuCGEC](https://github.com/HillZhang1999/MuCGEC). The **ChERRANT** tool proposed in that work computes character-level Precision/Recall/F_0.5 [[Link]](https://github.com/HillZhang1999/MuCGEC/tree/main/scorers/ChERRANT). We will provide an online evaluation website in the future.
78 | 
79 | In addition, our models also achieve SOTA performance on previous benchmarks such as NLPCC18 and MuCGEC.
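To make the metric concrete: F_0.5 weighs precision more heavily than recall, reflecting that spurious corrections are considered more harmful than missed ones. The snippet below is a minimal sketch of how edit-level Precision/Recall/F_0.5 are combined; the counts are made-up placeholders for illustration, not output of ChERRANT.

```
# Minimal sketch of edit-level P/R/F_0.5 (counts are illustrative, not real results).
def prf(tp, fp, fn, beta=0.5):
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = (1 + beta**2) * p * r / (beta**2 * p + r) if p + r else 0.0
    return p, r, f

# e.g. 40 correct edits, 10 spurious edits, 30 missed edits
print(prf(tp=40, fp=10, fn=30))  # -> (0.80, ~0.57, ~0.74)
```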
80 | 
81 | ## Citation
82 | 
83 | If you find our work helpful, please cite our paper:
84 | NaSGEC: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts (Accepted by ACL2023 Findings) [PDF]()
85 | 
86 | ```
87 | @inproceedings{zhang-etal-2023-nasgec,
88 |     title = "{Na}{SGEC}: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts",
89 |     author = "Zhang, Yue and
90 |       Zhang, Bo and
91 |       Jiang, Haochen and
92 |       Li, Zhenghua and
93 |       Li, Chen and
94 |       Huang, Fei and
95 |       Zhang, Min",
96 |     booktitle = "Findings of ACL",
97 |     year = "2023"
98 | }
99 | ```
100 | 
101 | ## Contact
102 | 
103 | If you encounter any issues when using our dataset and code, you can contact hillzhang1999@qq.com.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NaSGEC: Multi-Domain Chinese Grammatical Error Correction for Native Speaker Texts
2 | 
3 | [中文](./README.md)|[English](./README.en.md)
4 | ## 简介
5 | 本文提出了首个多领域中文母语纠错数据集NaSGEC,包含3个领域的真实病句:社交媒体(Media),学术写作(Thesis)和语文考试(Exam),旨在推动中文语法纠错(CGEC)的跨领域(Cross-domain)研究。每个病句由双人独立标注+专家审查,从而提供多个高质量参考修改。
6 | 
7 | 同时,我们基于中文BART训练了一系列高质量基准CGEC模型,主要包括:1) 基于高质量人工标注训练数据(Lang8+HSK);2) 基于超大规模(>1亿)母语文本自动构造的训练数据。
8 | 
9 | 此外,我们也使用人工标注的NaSGEC数据集对上述模型进行领域内微调,从而构建针对特定领域的先进中文语法纠错模型。
10 | 
11 | ## NaSGEC数据集
12 | NaSGEC数据集主要包含3个中文母语领域的12,500个句子及其对应的修改结果,这三个领域分别为:
13 | 
14 | + **社交媒体(NaSGEC-Media)**:从微信公众号文章中获取的4000句;
15 | + **学术写作(NaSGEC-Thesis)**:从计算机专业本科生毕业论文中获取的1500句;
16 | + **语文考试(NaSGEC-Exam)**:从语文考试试卷中获取的7000句;
17 | 
18 | 主要的数据统计指标如下表所示:
19 | ![Data statistics](./pics/data_statistic.png)
20 | 
21 | 更多详细数据介绍和跨领域分析,请参考我们的论文。
22 | 
23 | 
24 | ## 如何获取数据
25 | 请参考“语析LAGroup”的官方仓库:https://github.com/SUDA-LA/CGECData
26 | 
27 | ## Benchmark纠错模型
28 | ### 实验环境
29 | 本文所提出的模型基于`SynGEC`代码库开发,实验环境安装如下所示:
30 | 
31 | ```
32 | git clone git@github.com:HillZhang1999/NaSGEC.git
33 | git submodule init && git submodule update --recursive --remote --force
34 | conda create -n nasgec python==3.8
35 | conda activate nasgec
36 | pip install -r requirements.txt
37 | python -m spacy download en
38 | cd ./SynGEC/src/src_syngec/fairseq-0.10.2
39 | pip install --editable ./
40 | ```
41 | 
42 | ### 模型使用
43 | 我们开源了如下5个中文纠错模型,分别适用于不同的领域:
44 | | 模型 | 链接 | 描述 |
45 | | :------- | :---------: | :---------: |
46 | | **real_learner_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1AamhBi6vJ8RVzzHtr43Uaoqrm7_vPpuB/view?usp=share_link) | 伪母语数据预训练,真实二语者数据微调,适合二语者文本和病句题 |
47 | | **pseudo_native_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1dKbrej1Eh_M1DFqtCvvSqso0QUUn9EvC/view?usp=share_link) | 使用伪母语数据训练的模型,适合通用母语写作场景 |
48 | | **pseudo_native_bart_CGEC_media** | [Google Drive](https://drive.google.com/file/d/17dSnSEPq-eyWZ-Uck4G6fO8XwjNfxmDi/view?usp=share_link) | 伪母语数据预训练,NaSGEC-Media微调,适合日常写作 |
49 | | **pseudo_native_bart_CGEC_thesis** | [Google Drive](https://drive.google.com/file/d/1J-BFDSxV4eQ2JvFEXdvI2AktZOxNd8rq/view?usp=share_link) | 伪母语数据预训练,NaSGEC-Thesis微调,适合学术写作 |
50 | | **real_learner_bart_CGEC_exam** | [Google Drive](https://drive.google.com/file/d/1iQ0i7JMNXyoKjd5BdAfIPGg3QBLr9Lr3/view?usp=share_link) | 真实二语者数据预训练,NaSGEC-Exam微调,适合纠正语病错误 |
51 | 
52 | 模型下载后放入`./models`目录,具体的推理方法可以参考`./bash/generate.sh`。
53 | 同时,用户也可以继续对上述模型进行微调训练,方法可以参考`./bash/finetune.sh`。
54 | 
55 | 除了上述Fairseq版本,我们也支持`HuggingFace transformers`一键调用我们的模型进行推理,使用方式如下所示:
56 | 
57 | ```
58 | from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
59 | tokenizer = BertTokenizer.from_pretrained("HillZhang/real_learner_bart_CGEC")
60 | model = BartForConditionalGeneration.from_pretrained("HillZhang/real_learner_bart_CGEC")
61 | encoded_input = tokenizer(["北京是中国的都。", "他说:”我最爱的运动是打蓝球“", "我每天大约喝5次水左右。", "今天,我非常开开心。"], return_tensors="pt", padding=True, truncation=True)
62 | if "token_type_ids" in encoded_input:
63 |     del encoded_input["token_type_ids"]
64 | output = model.generate(**encoded_input)
65 | print(tokenizer.batch_decode(output, skip_special_tokens=True))
66 | ```
67 | 
68 | Hugging Face模型链接为:
69 | | 模型 | 链接 |
70 | | :------- | :---------: |
71 | | **HillZhang/real_learner_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC)|
72 | | **HillZhang/pseudo_native_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC)|
73 | | **HillZhang/pseudo_native_bart_CGEC_media** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_media) |
74 | | **HillZhang/pseudo_native_bart_CGEC_thesis** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_thesis) |
75 | | **HillZhang/real_learner_bart_CGEC_exam** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC_exam) |
76 | 
77 | **Hugging Face版本是由Fairseq训练的权重通过脚本转换而来,所以性能可能存在一些差异。**
78 | 
79 | ### 效果评估
80 | 本文使用的Metric基于[MuCGEC](https://github.com/HillZhang1999/MuCGEC)工作提出的**ChERRANT**工具,主要计算字级别的Precision/Recall/F_0.5指标,具体使用方法可以参考[[Link]](https://github.com/HillZhang1999/MuCGEC/tree/main/scorers/ChERRANT)。后续我们会提供在线评测网站。
81 | 
82 | 此外,我们的模型在NLPCC18/MuCGEC等前人数据集上也能取得SOTA的性能。
83 | 
84 | ## 引用
85 | 
86 | 如果您认为我们的工作对您的工作有帮助,请引用我们的论文:
87 | NaSGEC: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts (Accepted by ACL2023 Findings) [PDF]()
88 | 
89 | ```
90 | @inproceedings{zhang-etal-2023-nasgec,
91 |     title = "{Na}{SGEC}: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts",
92 |     author = "Zhang, Yue and
93 |       Zhang, Bo and
94 |       Jiang, Haochen and
95 |       Li, Zhenghua and
96 |       Li, Chen and
97 |       Huang, Fei and
98 |       Zhang, Min",
99 |     booktitle = "Findings of ACL",
100 |     year = "2023"
101 | }
102 | ```
103 | 
104 | ## 联系
105 | 如果您在使用我们的数据集及代码的过程中遇到了任何问题,可联系 hillzhang1999@qq.com。
106 | 
--------------------------------------------------------------------------------
/bash/finetune.sh:
--------------------------------------------------------------------------------
1 | # Preprocess data for fine-tuning
2 | 
3 | FAIRSEQ_DIR=../SynGEC/src/src_syngec/fairseq-0.10.2/fairseq_cli
4 | DATA_DIR=../data/NaSGEC-Thesis
5 | PROCESSED_DIR=../preprocessed/NaSGEC-Thesis
6 | mkdir -p $PROCESSED_DIR
7 | WORKER_NUM=64
8 | SYNTAX_DICT=../data/dict/dict.label0.txt
9 | 
10 | TRAIN_SRC_FILE=$DATA_DIR/train/src.txt
11 | TRAIN_TGT_FILE=$DATA_DIR/train/tgt.txt
12 | VALID_SRC_FILE=$DATA_DIR/dev/src.txt
13 | VALID_TGT_FILE=$DATA_DIR/dev/tgt.txt
14 | 
15 | # tokenizing
16 | if [ ! -f $TRAIN_SRC_FILE".char" ]; then
17 |     echo "Tokenizing..."
18 |     cd ../utils
19 |     python segment_bert.py <$TRAIN_SRC_FILE >$TRAIN_SRC_FILE".char"
20 |     python segment_bert.py <$TRAIN_TGT_FILE >$TRAIN_TGT_FILE".char"
21 |     python segment_bert.py <$VALID_SRC_FILE >$VALID_SRC_FILE".char"
22 |     python segment_bert.py <$VALID_TGT_FILE >$VALID_TGT_FILE".char"
23 |     cd -
24 | fi
25 | 
26 | cp $TRAIN_SRC_FILE".char" $PROCESSED_DIR/train.char.src
27 | cp $TRAIN_TGT_FILE".char" $PROCESSED_DIR/train.char.tgt
28 | cp $VALID_SRC_FILE".char" $PROCESSED_DIR/valid.char.src
29 | cp $VALID_TGT_FILE".char" $PROCESSED_DIR/valid.char.tgt
30 | mkdir -p $PROCESSED_DIR/bin
31 | 
32 | echo "Preprocessing..."
33 | 
34 | python $FAIRSEQ_DIR/preprocess.py --source-lang src --target-lang tgt \
35 | --user-dir ../SynGEC/src/src_syngec/syngec_model \
36 | --task syntax-enhanced-translation \
37 | --trainpref $PROCESSED_DIR/train.char \
38 | --validpref $PROCESSED_DIR/valid.char \
39 | --destdir $PROCESSED_DIR/bin \
40 | --workers $WORKER_NUM \
41 | --labeldict $SYNTAX_DICT \
42 | --srcdict ../data/dict/dict.src.txt \
43 | --tgtdict ../data/dict/dict.tgt.txt
44 | 
45 | echo "Finished!"
46 | 
47 | ######## Finetuning ########
48 | SEED=42
49 | PRETRAIN_MODEL_PATH=../models/pseudo_native_bart_zh.pt
50 | MODEL_DIR=../models/pseudo_native_bart_zh_finetuned_with_NaSGEC_Thesis/$SEED
51 | 
52 | mkdir -p $MODEL_DIR/src
53 | cp ./finetune.sh $MODEL_DIR
54 | 
55 | CUDA_VISIBLE_DEVICES=0 NCCL_DEBUG=INFO nohup python -u $FAIRSEQ_DIR/train.py $PROCESSED_DIR/bin \
56 | --save-dir $MODEL_DIR \
57 | --user-dir ../SynGEC/src/src_syngec/syngec_model \
58 | --task syntax-enhanced-translation \
59 | --arch syntax_enhanced_bart_large \
60 | --finetune-from-model $PRETRAIN_MODEL_PATH \
61 | --skip-invalid-size-inputs-valid-test \
62 | --max-tokens 1024 \
63 | --update-freq 1 \
64 | --optimizer adam \
65 | --lr 1e-05 \
66 | --max-source-positions 512 \
67 | --max-target-positions 512 \
68 | --warmup-updates 0 \
69 | -s src \
70 | -t tgt \
71 | --lr-scheduler polynomial_decay \
72 | --clip-norm 1.0 \
73 | --criterion label_smoothed_cross_entropy \
74 | --label-smoothing 0.1 \
75 | --dropout 0.3 \
76 | --share-all-embeddings \
77 | --adam-betas '(0.9,0.999)' \
78 | --log-format tqdm \
79 | --find-unused-parameters \
80 | --fp16 \
81 | --max-epoch 100 \
82 | --patience 10 \
83 | --seed $SEED >$MODEL_DIR/nohup.log 2>&1 &
84 | 
85 | wait
86 | 
--------------------------------------------------------------------------------
/bash/generate.sh:
--------------------------------------------------------------------------------
1 | CUDA_DEVICE=0
2 | BEAM=12
3 | N_BEST=1
4 | SEED=2022
5 | FAIRSEQ_DIR=../SynGEC/src/src_syngec/fairseq-0.10.2/fairseq_cli
6 | 
7 | TEST_DIR=/mnt/nas_alinlp/zuyi.bzy/zhangyue/NaSGEC/data/test # 测试集路径
8 | INPUT_FILE=$TEST_DIR/input.txt
9 | MODEL_PATH=../models/real_learner_bart_CGEC.pt # 模型路径
10 | PROCESSED_DIR=../data/dict
11 | OUTPUT_DIR=../results/test
12 | 
13 | # tokenizing
14 | if [ ! -f $INPUT_FILE".char" ]; then
15 |     echo "Tokenizing..."
16 |     cd ../utils
17 |     python segment_bert.py <$INPUT_FILE >$INPUT_FILE".char"
18 |     cd -
19 | fi
20 | 
21 | mkdir -p $OUTPUT_DIR
22 | cp $INPUT_FILE $OUTPUT_DIR/input.txt
23 | INPUT_FILE=$INPUT_FILE".char"
24 | cp $INPUT_FILE $OUTPUT_DIR/input.char
25 | 
26 | echo "Generating..."
27 | SECONDS=0 28 | 29 | CUDA_VISIBLE_DEVICES=$CUDA_DEVICE python -u ${FAIRSEQ_DIR}/interactive.py $PROCESSED_DIR \ 30 | --user-dir ../SynGEC/src/src_syngec/syngec_model \ 31 | --task syntax-enhanced-translation \ 32 | --path ${MODEL_PATH} \ 33 | --beam ${BEAM} \ 34 | --nbest ${N_BEST} \ 35 | -s src \ 36 | -t tgt \ 37 | --buffer-size 10000 \ 38 | --batch-size 32 \ 39 | --num-workers 12 \ 40 | --log-format tqdm \ 41 | --remove-bpe \ 42 | --fp16 \ 43 | --output_file $OUTPUT_DIR/output.nbest \ 44 | <$OUTPUT_DIR/input.char 45 | 46 | echo "Generating Finish!" 47 | duration=$SECONDS 48 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 49 | 50 | cat $OUTPUT_DIR/output.nbest | grep "^D-" | python -c "import sys; x = sys.stdin.readlines(); x = ''.join([ x[i] for i in range(len(x)) if (i % ${N_BEST} == 0) ]); print(x)" | cut -f 3 >$OUTPUT_DIR/output.char 51 | sed -i '$d' $OUTPUT_DIR/output.char 52 | cat $OUTPUT_DIR/output.char | python -c "import sys; x = sys.stdin.readlines(); x = '\n'.join([''.join([tok[2:] if len(tok) > 2 and tok[:2] == '##' else tok for tok in sent.split()]) for sent in x]); print(x)" >$OUTPUT_DIR/output.txt # 最终预测结果 53 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/.DS_Store -------------------------------------------------------------------------------- /data/NaSGEC-Exam/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Exam/.gitkeep -------------------------------------------------------------------------------- /data/NaSGEC-Media/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Media/.DS_Store -------------------------------------------------------------------------------- /data/NaSGEC-Media/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Media/.gitkeep -------------------------------------------------------------------------------- /data/NaSGEC-Thesis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Thesis/.gitkeep -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # NaSGEC 2 | 3 | ## 如何获取数据 4 | 请参考“语析LAGroup”的官方仓库https://github.com/SUDA-LA/CGECData 5 | -------------------------------------------------------------------------------- /data/dict/dict.label0.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | M 5 | R 6 | S 7 | acomp 8 | advcl 9 | advmod 10 | amod 11 | appos 12 | aux 13 | auxpass 14 | cc 15 | ccomp 16 | conj 17 | cop 18 | csubj 19 | csubjpass 20 | dep 21 | det 22 | discourse 23 | dobj 24 | expl 25 | infmod 26 | iobj 27 | mark 28 | mwe 29 | neg 30 | nn 31 | npadvmod 32 | nsubj 33 | nsubjpass 34 | num 35 | number 36 | parataxis 37 | 
partmod
38 | pcomp
39 | pobj
40 | poss
41 | possessive
42 | preconj
43 | predet
44 | prep
45 | prt
46 | punct
47 | quantmod
48 | rcmod
49 | root
50 | tmod
51 | xcomp
52 | 
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
2 | 
3 | valid_models = ["HillZhang/real_learner_bart_CGEC", "HillZhang/pseudo_native_bart_CGEC", "HillZhang/pseudo_native_bart_CGEC_media", "HillZhang/pseudo_native_bart_CGEC_thesis", "HillZhang/real_learner_bart_CGEC_exam"]
4 | 
5 | tokenizer = BertTokenizer.from_pretrained(valid_models[0])
6 | model = BartForConditionalGeneration.from_pretrained(valid_models[0])
7 | encoded_input = tokenizer(["北京是中国的都。", "他说:”我最爱的运动是打蓝球“", "我每天大约喝5次水左右。", "今天,我非常开开心。"], return_tensors="pt", padding=True, truncation=True)
8 | if "token_type_ids" in encoded_input:
9 |     del encoded_input["token_type_ids"]
10 | output = model.generate(**encoded_input)
11 | print(tokenizer.batch_decode(output, skip_special_tokens=True))
12 | 
13 | 
--------------------------------------------------------------------------------
/guidelines/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/guidelines/.gitkeep
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/models/.gitkeep
--------------------------------------------------------------------------------
/pics/data_statistic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/pics/data_statistic.png
--------------------------------------------------------------------------------
/preprocessed/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/preprocessed/.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | antlr4-python3-runtime==4.9.3
2 | astroid==2.11.7
3 | blessings==1.7
4 | charset-normalizer==2.1.0
5 | click==8.1.3
6 | dill==0.3.5.1
7 | emoji==2.0.0
8 | filelock==3.7.1
9 | gpustat==0.6.0
10 | huggingface-hub==0.8.1
11 | idna==3.3
12 | isort==5.10.1
13 | joblib==1.1.0
14 | lazy-object-proxy==1.7.1
15 | mccabe==0.7.0
16 | multiprocess==0.70.13
17 | nltk==3.7
18 | numpy==1.23.1
19 | nvidia-ml-py3==7.352.0
20 | omegaconf==2.2.2
21 | packaging==21.3
22 | pathos==0.2.9
23 | platformdirs==2.5.2
24 | pox==0.3.1
25 | ppft==1.7.6.5
26 | protobuf==4.21.2
27 | psutil==5.9.1
28 | pylint==2.14.5
29 | pyparsing==3.0.9
30 | PyYAML==6.0
31 | regex==2022.7.9
32 | requests==2.28.1
33 | six==1.16.0
34 | stanza==1.4.0
35 | tokenizers==0.12.1
36 | tomli==2.0.1
37 | tomlkit==0.11.1
38 | torch==1.12.0
39 | transformers==4.20.1
40 | typing_extensions==4.3.0
41 | urllib3==1.26.10
42 | wrapt==1.14.1
43 | xlrd==2.0.1
44 | xlwt==1.3.0
45 | supar==1.1.4
46 | errant
47 | subword-nmt
-------------------------------------------------------------------------------- /results/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/results/.gitkeep -------------------------------------------------------------------------------- /utils/convert_from_fairseq_to_huggingface.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | BartConfig, 3 | BartForConditionalGeneration, 4 | BartForSequenceClassification, 5 | BartModel, 6 | BartTokenizer, 7 | ) 8 | from fairseq.checkpoint_utils import load_model_ensemble_and_task 9 | import os 10 | import logging 11 | import sys 12 | from pathlib import Path 13 | import argparse 14 | import torch 15 | 16 | logging.basicConfig( 17 | format="%(asctime)s | %(levelname)s | %(name)s | [%(filename)s:%(lineno)d] %(message)s", 18 | datefmt="%Y-%m-%d %H:%M:%S", 19 | level=os.environ.get("LOGLEVEL", "INFO").upper(), 20 | stream=sys.stdout, 21 | ) 22 | logger = logging.getLogger("convert") 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--checkpoint_path", type=str, default="~/NaSGEC/models/pseudo_native_bart_zh.twisted.pt") 26 | parser.add_argument("--data_dir", type=str, default="~/NaSGEC/data/dict") 27 | parser.add_argument("--save_dir", type=str, default="~/NaSGEC/models/test") 28 | main_args = parser.parse_args() 29 | 30 | logger.info('Load fairseq checkpoint...') 31 | models, args, task = load_model_ensemble_and_task(filenames=[os.path.expanduser(main_args.checkpoint_path)], 32 | arg_overrides={'data': os.path.expanduser(main_args.data_dir)}) 33 | 34 | fairseq_transformer = models[0].eval() 35 | 36 | logger.info('Huggingface config...') 37 | huggingface_config = BartConfig.from_pretrained('fnlp/bart-large-chinese', 38 | activation_function=args.activation_fn, 39 | d_model=args.encoder_embed_dim, 40 | encoder_attention_heads=args.encoder_attention_heads, 41 | encoder_ffn_dim=args.encoder_ffn_embed_dim, 42 | encoder_layers=args.encoder_layers, 43 | decoder_attention_heads=args.decoder_attention_heads, 44 | decoder_ffn_dim=args.decoder_ffn_embed_dim, 45 | decoder_layers=args.decoder_layers, 46 | normalize_embedding=args.layernorm_embedding, 47 | scale_embedding=(not args.no_scale_embedding), 48 | static_position_embeddings=(not args.encoder_learned_pos), 49 | vocab_size=len(task.source_dictionary), 50 | revision="v1.0" 51 | ) 52 | logger.info('Init huggingface model...') 53 | huggingface_model = BartForConditionalGeneration(huggingface_config).eval() 54 | 55 | logger.info('Convert...') 56 | def remove_ignore_keys_(state_dict): 57 | ignore_keys = [ 58 | "encoder.version", 59 | "decoder.version", 60 | "model.encoder.version", 61 | "model.decoder.version", 62 | "_float_tensor", 63 | "decoder.output_projection.weight", 64 | "encoder.embed_positions._float_tensor", 65 | "decoder.embed_positions._float_tensor" 66 | ] 67 | for k in ignore_keys: 68 | state_dict.pop(k, None) 69 | 70 | def rename_key(dct, old, new): 71 | val = dct.pop(old) 72 | dct[new] = val 73 | 74 | state_dict = fairseq_transformer.state_dict() 75 | remove_ignore_keys_(state_dict) 76 | huggingface_model.model.load_state_dict(state_dict, strict=False) 77 | 78 | logger.info('Success!') 79 | Path(main_args.save_dir).mkdir(exist_ok=True) 80 | huggingface_model.save_pretrained(main_args.save_dir) -------------------------------------------------------------------------------- 
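The conversion script above only ports the encoder/decoder weights into a `BartForConditionalGeneration` checkpoint; it does not export a tokenizer. A quick way to sanity-check a converted model is to load it with `transformers` and decode a few sentences, reusing the tokenizer of the released Hugging Face checkpoints. The sketch below assumes the fairseq dictionary used for conversion matches that tokenizer's vocabulary, and the local `--save_dir` path is only an example, not a path shipped with the repository.

```
# Hypothetical sanity check after running, e.g.:
#   python convert_from_fairseq_to_huggingface.py \
#       --checkpoint_path ~/NaSGEC/models/pseudo_native_bart_zh.twisted.pt \
#       --data_dir ~/NaSGEC/data/dict \
#       --save_dir ~/NaSGEC/models/test
import os
from transformers import BertTokenizer, BartForConditionalGeneration

save_dir = os.path.expanduser("~/NaSGEC/models/test")  # example --save_dir from above
# Assumption: the fairseq dict matches the vocab of the released HF checkpoints.
tokenizer = BertTokenizer.from_pretrained("HillZhang/pseudo_native_bart_CGEC")
model = BartForConditionalGeneration.from_pretrained(save_dir).eval()

encoded = tokenizer(["北京是中国的都。"], return_tensors="pt")
if "token_type_ids" in encoded:  # BART does not use token type ids
    del encoded["token_type_ids"]
print(tokenizer.batch_decode(model.generate(**encoded), skip_special_tokens=True))
```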
/utils/segment_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tokenization 3 | from tqdm import tqdm 4 | from multiprocessing import Pool 5 | 6 | tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=False) 7 | 8 | def split(line): 9 | line = line.strip() 10 | origin_line = line 11 | line = line.replace(" ", "") 12 | line = tokenization.convert_to_unicode(line) 13 | if not line: 14 | return '' 15 | tokens = tokenizer.tokenize(line) 16 | return ' '.join(tokens) 17 | 18 | with Pool(64) as pool: 19 | for ret in pool.imap(split, tqdm(sys.stdin), chunksize=1024): 20 | print(ret) 21 | 22 | -------------------------------------------------------------------------------- /utils/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | 25 | 26 | def convert_to_unicode(text): 27 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 28 | if six.PY3: 29 | if isinstance(text, str): 30 | return text 31 | elif isinstance(text, bytes): 32 | return text.decode("utf-8", "ignore") 33 | else: 34 | raise ValueError("Unsupported string type: %s" % (type(text))) 35 | elif six.PY2: 36 | if isinstance(text, str): 37 | return text.decode("utf-8", "ignore") 38 | elif isinstance(text, unicode): 39 | return text 40 | else: 41 | raise ValueError("Unsupported string type: %s" % (type(text))) 42 | else: 43 | raise ValueError("Not running on Python2 or Python 3?") 44 | 45 | 46 | def printable_text(text): 47 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 48 | 49 | # These functions want `str` for both Python2 and Python3, but in one case 50 | # it's a Unicode string and in the other it's a byte string. 
51 | if six.PY3: 52 | if isinstance(text, str): 53 | return text 54 | elif isinstance(text, bytes): 55 | return text.decode("utf-8", "ignore") 56 | else: 57 | raise ValueError("Unsupported string type: %s" % (type(text))) 58 | elif six.PY2: 59 | if isinstance(text, str): 60 | return text 61 | elif isinstance(text, unicode): 62 | return text.encode("utf-8") 63 | else: 64 | raise ValueError("Unsupported string type: %s" % (type(text))) 65 | else: 66 | raise ValueError("Not running on Python2 or Python 3?") 67 | 68 | 69 | def load_vocab(vocab_file): 70 | """Loads a vocabulary file into a dictionary.""" 71 | vocab = collections.OrderedDict() 72 | index = 0 73 | with open(vocab_file, "r") as reader: 74 | while True: 75 | token = convert_to_unicode(reader.readline()) 76 | if not token: 77 | break 78 | token = token.strip() 79 | vocab[token] = index 80 | index += 1 81 | return vocab 82 | 83 | 84 | def convert_by_vocab(vocab, items): 85 | """Converts a sequence of [tokens|ids] using the vocab.""" 86 | output = [] 87 | for item in items: 88 | if item not in vocab: 89 | print("warning: %s not in vocab" % item) 90 | item = "[UNK]" 91 | output.append(vocab[item]) 92 | return output 93 | 94 | 95 | def convert_tokens_to_ids(vocab, tokens): 96 | return convert_by_vocab(vocab, tokens) 97 | 98 | 99 | def convert_ids_to_tokens(inv_vocab, ids): 100 | return convert_by_vocab(inv_vocab, ids) 101 | 102 | 103 | def whitespace_tokenize(text): 104 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 105 | text = text.strip() 106 | if not text: 107 | return [] 108 | tokens = text.split() 109 | return tokens 110 | 111 | 112 | class FullTokenizer(object): 113 | """Runs end-to-end tokenziation.""" 114 | 115 | def __init__(self, vocab_file, do_lower_case=True): 116 | self.vocab = load_vocab(vocab_file) 117 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 118 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 119 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 120 | 121 | def tokenize(self, text): 122 | split_tokens = [] 123 | for token in self.basic_tokenizer.tokenize(text): 124 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 125 | split_tokens.append(sub_token) 126 | 127 | return split_tokens 128 | 129 | def convert_tokens_to_ids(self, tokens): 130 | return convert_by_vocab(self.vocab, tokens) 131 | 132 | def convert_ids_to_tokens(self, ids): 133 | return convert_by_vocab(self.inv_vocab, ids) 134 | 135 | 136 | class BasicTokenizer(object): 137 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 138 | 139 | def __init__(self, do_lower_case=True): 140 | """Constructs a BasicTokenizer. 141 | Args: 142 | do_lower_case: Whether to lower case the input. 143 | """ 144 | self.do_lower_case = do_lower_case 145 | 146 | def tokenize(self, text): 147 | """Tokenizes a piece of text.""" 148 | text = convert_to_unicode(text) 149 | text = self._clean_text(text) 150 | 151 | # This was added on November 1st, 2018 for the multilingual and Chinese 152 | # models. This is also applied to the English models now, but it doesn't 153 | # matter since the English models were not trained on any Chinese data 154 | # and generally don't have any Chinese data in them (there are Chinese 155 | # characters in the vocabulary because Wikipedia does have some Chinese 156 | # words in the English Wikipedia.). 
157 | text = self._tokenize_chinese_chars(text) 158 | 159 | orig_tokens = whitespace_tokenize(text) 160 | split_tokens = [] 161 | for token in orig_tokens: 162 | if self.do_lower_case: 163 | token = token.lower() 164 | token = self._run_strip_accents(token) 165 | split_tokens.extend(self._run_split_on_punc(token)) 166 | 167 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 168 | return output_tokens 169 | 170 | def _run_strip_accents(self, text): 171 | """Strips accents from a piece of text.""" 172 | text = unicodedata.normalize("NFD", text) 173 | output = [] 174 | for char in text: 175 | cat = unicodedata.category(char) 176 | if cat == "Mn": 177 | continue 178 | output.append(char) 179 | return "".join(output) 180 | 181 | def _run_split_on_punc(self, text): 182 | """Splits punctuation on a piece of text.""" 183 | chars = list(text) 184 | i = 0 185 | start_new_word = True 186 | output = [] 187 | while i < len(chars): 188 | char = chars[i] 189 | if _is_punctuation(char): 190 | output.append([char]) 191 | start_new_word = True 192 | else: 193 | if start_new_word: 194 | output.append([]) 195 | start_new_word = False 196 | output[-1].append(char) 197 | i += 1 198 | 199 | return ["".join(x) for x in output] 200 | 201 | def _tokenize_chinese_chars(self, text): 202 | """Adds whitespace around any CJK character.""" 203 | output = [] 204 | for char in text: 205 | cp = ord(char) 206 | if self._is_chinese_char(cp): 207 | output.append(" ") 208 | output.append(char) 209 | output.append(" ") 210 | else: 211 | output.append(char) 212 | return "".join(output) 213 | 214 | def _is_chinese_char(self, cp): 215 | """Checks whether CP is the codepoint of a CJK character.""" 216 | # This defines a "chinese character" as anything in the CJK Unicode block: 217 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 218 | # 219 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 220 | # despite its name. The modern Korean Hangul alphabet is a different block, 221 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 222 | # space-separated words, so they are not treated specially and handled 223 | # like the all of the other languages. 224 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 225 | (cp >= 0x3400 and cp <= 0x4DBF) or # 226 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 227 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 228 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 229 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 230 | (cp >= 0xF900 and cp <= 0xFAFF) or # 231 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 232 | return True 233 | 234 | return False 235 | 236 | def _clean_text(self, text): 237 | """Performs invalid character removal and whitespace cleanup on text.""" 238 | output = [] 239 | for char in text: 240 | cp = ord(char) 241 | if cp == 0 or cp == 0xfffd or _is_control(char): 242 | continue 243 | if _is_whitespace(char): 244 | output.append(" ") 245 | else: 246 | output.append(char) 247 | return "".join(output) 248 | 249 | 250 | class WordpieceTokenizer(object): 251 | """Runs WordPiece tokenziation.""" 252 | 253 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 254 | self.vocab = vocab 255 | self.unk_token = unk_token 256 | self.max_input_chars_per_word = max_input_chars_per_word 257 | 258 | def tokenize(self, text): 259 | """Tokenizes a piece of text into its word pieces. 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 
262 | For example: 263 | input = "unaffable" 264 | output = ["un", "##aff", "##able"] 265 | Args: 266 | text: A single token or whitespace separated tokens. This should have 267 | already been passed through `BasicTokenizer. 268 | Returns: 269 | A list of wordpiece tokens. 270 | """ 271 | 272 | text = convert_to_unicode(text) 273 | 274 | output_tokens = [] 275 | for token in whitespace_tokenize(text): 276 | chars = list(token) 277 | if len(chars) > self.max_input_chars_per_word: 278 | output_tokens.append(self.unk_token) 279 | continue 280 | 281 | is_bad = False 282 | start = 0 283 | sub_tokens = [] 284 | while start < len(chars): 285 | end = len(chars) 286 | cur_substr = None 287 | while start < end: 288 | substr = "".join(chars[start:end]) 289 | if start > 0: 290 | substr = "##" + substr 291 | if substr in self.vocab: 292 | cur_substr = substr 293 | break 294 | end -= 1 295 | if cur_substr is None: 296 | is_bad = True 297 | break 298 | sub_tokens.append(cur_substr) 299 | start = end 300 | 301 | if is_bad: 302 | # output_tokens.append(self.unk_token) 303 | output_tokens.append(token) # keep the UNK token 304 | else: 305 | output_tokens.extend(sub_tokens) 306 | return output_tokens 307 | 308 | 309 | def _is_whitespace(char): 310 | """Checks whether `chars` is a whitespace character.""" 311 | # \t, \n, and \r are technically contorl characters but we treat them 312 | # as whitespace since they are generally considered as such. 313 | if char == " " or char == "\t" or char == "\n" or char == "\r": 314 | return True 315 | cat = unicodedata.category(char) 316 | if cat == "Zs": 317 | return True 318 | return False 319 | 320 | 321 | def _is_control(char): 322 | """Checks whether `chars` is a control character.""" 323 | # These are technically control characters but we count them as whitespace 324 | # characters. 325 | if char == "\t" or char == "\n" or char == "\r": 326 | return False 327 | cat = unicodedata.category(char) 328 | if cat.startswith("C"): 329 | return True 330 | return False 331 | 332 | 333 | def _is_punctuation(char): 334 | """Checks whether `chars` is a punctuation character.""" 335 | cp = ord(char) 336 | # We treat all non-letter/number ASCII as punctuation. 337 | # Characters such as "^", "$", and "`" are not in the Unicode 338 | # Punctuation class but we treat them as punctuation anyways, for 339 | # consistency. 340 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 341 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 342 | return True 343 | cat = unicodedata.category(char) 344 | if cat.startswith("P"): 345 | return True 346 | return False --------------------------------------------------------------------------------
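For reference, this is how the tokenizer above is typically driven in this repository: `segment_bert.py` feeds raw sentences through `FullTokenizer` to produce the space-separated `.char` files, and the last step of `generate.sh` strips the `##` wordpiece markers again. The sketch below mirrors that round trip; the example sentence is illustrative, and the relative `./vocab.txt` path assumes it is run from the `utils/` directory.

```
# Minimal usage sketch of FullTokenizer (mirrors segment_bert.py and the
# detokenization step at the end of generate.sh). Run from utils/ so that
# ./vocab.txt resolves.
import tokenization

tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=False)

sentence = "北京是中国的都。"  # illustrative input
tokens = tokenizer.tokenize(sentence)  # CJK characters come out one per token
print(" ".join(tokens))                # the ".char" format consumed by fairseq

# Undo wordpiece segmentation ("##xx" pieces), as generate.sh does for model outputs:
restored = "".join(tok[2:] if tok.startswith("##") else tok for tok in tokens)
print(restored)
```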