├── .gitignore ├── .gitmodules ├── README.en.md ├── README.md ├── bash ├── finetune.sh └── generate.sh ├── data ├── .DS_Store ├── NaSGEC-Exam │ └── .gitkeep ├── NaSGEC-Media │ ├── .DS_Store │ └── .gitkeep ├── NaSGEC-Thesis │ └── .gitkeep ├── README.md └── dict │ ├── dict.label0.txt │ ├── dict.src.txt │ └── dict.tgt.txt ├── demo.py ├── guidelines └── .gitkeep ├── models └── .gitkeep ├── pics └── data_statistic.png ├── preprocessed └── .gitkeep ├── requirements.txt ├── results └── .gitkeep └── utils ├── convert_from_fairseq_to_huggingface.py ├── segment_bert.py ├── tokenization.py └── vocab.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "SynGEC"]
2 | path = SynGEC
3 | url = https://github.com/HillZhang1999/SynGEC.git
4 | 
--------------------------------------------------------------------------------
/README.en.md:
--------------------------------------------------------------------------------
1 | # NaSGEC: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts
2 | 
3 | [中文](./README.md)|[English](./README.en.md)
4 | 
5 | ## Introduction
6 | We present the first multi-domain Chinese grammatical error correction dataset from native speaker texts, NaSGEC, which includes real erroneous sentences from three domains: social media (Media), academic writing (Thesis), and Chinese exams (Exam). The aim is to promote cross-domain research in Chinese grammatical error correction (CGEC). Each erroneous sentence is annotated by two annotators and reviewed by one expert, thus providing multiple high-quality reference corrections.
7 | 
8 | In addition, we trained a series of high-quality benchmark CGEC models based on Chinese BART, using two kinds of training data: 1) high-quality human-annotated data (Lang8+HSK); 2) data automatically constructed from a large-scale (>100 million) corpus of native texts.
9 | 
10 | Furthermore, we also fine-tuned the above models on the manually annotated NaSGEC dataset to build advanced CGEC models for specific domains.
11 | 
12 | ## NaSGEC Dataset
13 | The NaSGEC dataset includes 12,500 sentences and their corresponding reference corrections from three native Chinese domains, namely:
14 | 
15 | + **Social Media (NaSGEC-Media)**: 4,000 sentences from articles posted on the WeChat official account platform;
16 | + **Academic Writing (NaSGEC-Thesis)**: 1,500 sentences from undergraduate theses in computer science;
17 | + **Chinese Examination (NaSGEC-Exam)**: 7,000 sentences from Chinese exam papers.
18 | 
19 | 
20 | The main data statistics are shown in the table below:
21 | ![Data statistics](./pics/data_statistic.png)
22 | 
23 | For a more detailed description of the data and a cross-domain analysis, please refer to our paper.
24 | 
25 | *Note: the full dataset will be released as soon as possible.*
26 | 
27 | 
28 | ## Benchmark CGEC Models
29 | ### Experimental Environment
30 | Our models are developed on top of the `SynGEC` codebase. The experimental environment can be set up as follows:
31 | 
32 | ```
33 | git clone git@github.com:HillZhang1999/NaSGEC.git
34 | git submodule init && git submodule update --recursive --remote --force
35 | conda create -n nasgec python==3.8
36 | conda activate nasgec
37 | pip install -r requirements.txt
38 | python -m spacy download en
39 | cd ./SynGEC/src/src_syngec/fairseq-0.10.2
40 | pip install --editable ./
41 | ```
42 | 
43 | ### Model Usage
44 | We have released the following 5 CGEC models:
45 | | Model | Link |
46 | | :------- | :---------: |
47 | | **real_learner_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1AamhBi6vJ8RVzzHtr43Uaoqrm7_vPpuB/view?usp=share_link) |
48 | | **pseudo_native_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1dKbrej1Eh_M1DFqtCvvSqso0QUUn9EvC/view?usp=share_link) |
49 | | **pseudo_native_bart_CGEC_media** | [Google Drive](https://drive.google.com/file/d/17dSnSEPq-eyWZ-Uck4G6fO8XwjNfxmDi/view?usp=share_link) |
50 | | **pseudo_native_bart_CGEC_thesis** | [Google Drive](https://drive.google.com/file/d/1J-BFDSxV4eQ2JvFEXdvI2AktZOxNd8rq/view?usp=share_link) |
51 | | **real_learner_bart_CGEC_exam** | [Google Drive](https://drive.google.com/file/d/1iQ0i7JMNXyoKjd5BdAfIPGg3QBLr9Lr3/view?usp=share_link) |
52 | 
53 | In addition to the Fairseq checkpoints above, our models can also be used with `HuggingFace transformers`:
54 | 
55 | ```
56 | from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
57 | tokenizer = BertTokenizer.from_pretrained("HillZhang/real_learner_bart_CGEC")
58 | model = BartForConditionalGeneration.from_pretrained("HillZhang/real_learner_bart_CGEC")
59 | encoded_input = tokenizer(["北京是中国的都。", "他说:”我最爱的运动是打蓝球“", "我每天大约喝5次水左右。", "今天,我非常开开心。"], return_tensors="pt", padding=True, truncation=True)
60 | if "token_type_ids" in encoded_input:
61 |     del encoded_input["token_type_ids"]
62 | output = model.generate(**encoded_input)
63 | print(tokenizer.batch_decode(output, skip_special_tokens=True))
64 | ```
65 | 
66 | Hugging Face Models:
67 | | Model | Link |
68 | | :------- | :---------: |
69 | | **HillZhang/real_learner_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC)|
70 | | **HillZhang/pseudo_native_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC)|
71 | | **HillZhang/pseudo_native_bart_CGEC_media** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_media) |
72 | | **HillZhang/pseudo_native_bart_CGEC_thesis** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_thesis) |
73 | | **HillZhang/real_learner_bart_CGEC_exam** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC_exam) |
74 | 
75 | 
76 | ### Performance Evaluation
77 | The metric used in our paper is based on [MuCGEC](https://github.com/HillZhang1999/MuCGEC). The **ChERRANT** tool proposed in that work computes character-level Precision/Recall/F_0.5 [[Link]](https://github.com/HillZhang1999/MuCGEC/tree/main/scorers/ChERRANT). We will provide an online evaluation website in the future.
78 | 
79 | In addition, our models also achieve SOTA performance on previous benchmarks such as NLPCC18 and MuCGEC.
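To make the metric concrete: F_0.5 weighs precision more heavily than recall, reflecting that spurious corrections are considered more harmful than missed ones. The snippet below is a minimal sketch of how edit-level Precision/Recall/F_0.5 are combined; the counts are made-up placeholders for illustration, not output of ChERRANT.

```
# Minimal sketch of edit-level P/R/F_0.5 (counts are illustrative, not real results).
def prf(tp, fp, fn, beta=0.5):
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = (1 + beta**2) * p * r / (beta**2 * p + r) if p + r else 0.0
    return p, r, f

# e.g. 40 correct edits, 10 spurious edits, 30 missed edits
print(prf(tp=40, fp=10, fn=30))  # -> (0.80, ~0.57, ~0.74)
```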
80 | 
81 | ## Citation
82 | 
83 | If you find our work helpful, please cite our paper:
84 | NaSGEC: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts (Accepted by ACL2023 Findings) [PDF]()
85 | 
86 | ```
87 | @inproceedings{zhang-etal-2023-nasgec,
88 |     title = "{Na}{SGEC}: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts",
89 |     author = "Zhang, Yue and
90 |       Zhang, Bo and
91 |       Jiang, Haochen and
92 |       Li, Zhenghua and
93 |       Li, Chen and
94 |       Huang, Fei and
95 |       Zhang, Min",
96 |     booktitle = "Findings of ACL",
97 |     year = "2023"
98 | }
99 | ```
100 | 
101 | ## Contact
102 | 
103 | If you encounter any issues when using our dataset and code, you can contact hillzhang1999@qq.com.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NaSGEC: Multi-Domain Chinese Grammatical Error Correction for Native Speaker Texts
2 | 
3 | [中文](./README.md)|[English](./README.en.md)
4 | ## 简介
5 | 本文提出了首个多领域中文母语纠错数据集NaSGEC,包含3个领域的真实病句:社交媒体(Media),学术写作(Thesis)和语文考试(Exam),旨在推动中文语法纠错(CGEC)的跨领域(Cross-domain)研究。每个病句由双人独立标注+专家审查,从而提供多个高质量参考修改。
6 | 
7 | 同时,我们基于中文BART训练了一系列高质量基准CGEC模型,主要包括:1) 基于高质量人工标注训练数据(Lang8+HSK);2) 基于超大规模(>1亿)母语文本自动构造的训练数据。
8 | 
9 | 此外,我们也使用人工标注的NaSGEC数据集对上述模型进行领域内微调,从而构建针对特定领域的先进中文语法纠错模型。
10 | 
11 | ## NaSGEC数据集
12 | NaSGEC数据集主要包含3个中文母语领域的12,500个句子及其对应的修改结果,这三个领域分别为:
13 | 
14 | + **社交媒体(NaSGEC-Media)**:从微信公众号文章中获取的4000句;
15 | + **学术写作(NaSGEC-Thesis)**:从计算机专业本科生毕业论文中获取的1500句;
16 | + **语文考试(NaSGEC-Exam)**:从语文考试试卷中获取的7000句;
17 | 
18 | 主要的数据统计指标如下表所示:
19 | ![Data statistics](./pics/data_statistic.png)
20 | 
21 | 更多详细数据介绍和跨领域分析,请参考我们的论文。
22 | 
23 | 
24 | ## 如何获取数据
25 | 请参考“语析LAGroup”的官方仓库:https://github.com/SUDA-LA/CGECData
26 | 
27 | ## Benchmark纠错模型
28 | ### 实验环境
29 | 本文所提出的模型基于`SynGEC`代码库开发,实验环境安装如下所示:
30 | 
31 | ```
32 | git clone git@github.com:HillZhang1999/NaSGEC.git
33 | git submodule init && git submodule update --recursive --remote --force
34 | conda create -n nasgec python==3.8
35 | conda activate nasgec
36 | pip install -r requirements.txt
37 | python -m spacy download en
38 | cd ./SynGEC/src/src_syngec/fairseq-0.10.2
39 | pip install --editable ./
40 | ```
41 | 
42 | ### 模型使用
43 | 我们开源了如下5个中文纠错模型,分别适用于不同的领域:
44 | | 模型 | 链接 | 描述 |
45 | | :------- | :---------: | :---------: |
46 | | **real_learner_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1AamhBi6vJ8RVzzHtr43Uaoqrm7_vPpuB/view?usp=share_link) | 伪母语数据预训练,真实二语者数据微调,适合二语者文本和病句题 |
47 | | **pseudo_native_bart_CGEC** | [Google Drive](https://drive.google.com/file/d/1dKbrej1Eh_M1DFqtCvvSqso0QUUn9EvC/view?usp=share_link) | 使用伪母语数据训练的模型,适合通用母语写作场景 |
48 | | **pseudo_native_bart_CGEC_media** | [Google Drive](https://drive.google.com/file/d/17dSnSEPq-eyWZ-Uck4G6fO8XwjNfxmDi/view?usp=share_link) | 伪母语数据预训练,NaSGEC-Media微调,适合日常写作 |
49 | | **pseudo_native_bart_CGEC_thesis** | [Google Drive](https://drive.google.com/file/d/1J-BFDSxV4eQ2JvFEXdvI2AktZOxNd8rq/view?usp=share_link) | 伪母语数据预训练,NaSGEC-Thesis微调,适合学术写作 |
50 | | **real_learner_bart_CGEC_exam** | [Google Drive](https://drive.google.com/file/d/1iQ0i7JMNXyoKjd5BdAfIPGg3QBLr9Lr3/view?usp=share_link) | 真实二语者数据预训练,NaSGEC-Exam微调,适合纠正语病错误 |
51 | 
52 | 模型下载后放入`./models`目录,具体的推理方法可以参考`./bash/generate.sh`。
53 | 同时,用户也可以继续对上述模型进行微调训练,方法可以参考`./bash/finetune.sh`。
54 | 
55 | 除了上述Fairseq版本,我们也支持`HuggingFace transformers`一键调用我们的模型进行推理,使用方式如下所示:
56 | 
57 | ```
58 | from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
59 | tokenizer = BertTokenizer.from_pretrained("HillZhang/real_learner_bart_CGEC")
60 | model = BartForConditionalGeneration.from_pretrained("HillZhang/real_learner_bart_CGEC")
61 | encoded_input = tokenizer(["北京是中国的都。", "他说:”我最爱的运动是打蓝球“", "我每天大约喝5次水左右。", "今天,我非常开开心。"], return_tensors="pt", padding=True, truncation=True)
62 | if "token_type_ids" in encoded_input:
63 |     del encoded_input["token_type_ids"]
64 | output = model.generate(**encoded_input)
65 | print(tokenizer.batch_decode(output, skip_special_tokens=True))
66 | ```
67 | 
68 | Hugging Face模型链接为:
69 | | 模型 | 链接 |
70 | | :------- | :---------: |
71 | | **HillZhang/real_learner_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC)|
72 | | **HillZhang/pseudo_native_bart_CGEC** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC)|
73 | | **HillZhang/pseudo_native_bart_CGEC_media** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_media) |
74 | | **HillZhang/pseudo_native_bart_CGEC_thesis** | [HuggingFace](https://huggingface.co/HillZhang/pseudo_native_bart_CGEC_thesis) |
75 | | **HillZhang/real_learner_bart_CGEC_exam** | [HuggingFace](https://huggingface.co/HillZhang/real_learner_bart_CGEC_exam) |
76 | 
77 | **Hugging Face版本是由Fairseq训练的权重通过脚本转换而来,所以性能可能存在一些差异。**
78 | 
79 | ### 效果评估
80 | 本文使用的Metric基于[MuCGEC](https://github.com/HillZhang1999/MuCGEC)工作提出的**ChERRANT**工具,主要计算字级别的Precision/Recall/F_0.5指标,具体使用方法可以参考[[Link]](https://github.com/HillZhang1999/MuCGEC/tree/main/scorers/ChERRANT)。后续我们会提供在线评测网站。
81 | 
82 | 此外,我们的模型在NLPCC18/MuCGEC等前人数据集上也能取得SOTA的性能。
83 | 
84 | ## 引用
85 | 
86 | 如果您认为我们的工作对您的工作有帮助,请引用我们的论文:
87 | NaSGEC: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts (Accepted by ACL2023 Findings) [PDF]()
88 | 
89 | ```
90 | @inproceedings{zhang-etal-2023-nasgec,
91 |     title = "{Na}{SGEC}: a Multi-Domain Chinese Grammatical Error Correction Dataset from Native Speaker Texts",
92 |     author = "Zhang, Yue and
93 |       Zhang, Bo and
94 |       Jiang, Haochen and
95 |       Li, Zhenghua and
96 |       Li, Chen and
97 |       Huang, Fei and
98 |       Zhang, Min",
99 |     booktitle = "Findings of ACL",
100 |     year = "2023"
101 | }
102 | ```
103 | 
104 | ## 联系
105 | 如果您在使用我们的数据集及代码的过程中遇到了任何问题,可联系 hillzhang1999@qq.com。
106 | 
--------------------------------------------------------------------------------
/bash/finetune.sh:
--------------------------------------------------------------------------------
1 | # Preprocess data for fine-tuning
2 | 
3 | FAIRSEQ_DIR=../SynGEC/src/src_syngec/fairseq-0.10.2/fairseq_cli
4 | DATA_DIR=../data/NaSGEC-Thesis
5 | PROCESSED_DIR=../preprocessed/NaSGEC-Thesis
6 | mkdir -p $PROCESSED_DIR
7 | WORKER_NUM=64
8 | SYNTAX_DICT=../data/dict/dict.label0.txt
9 | 
10 | TRAIN_SRC_FILE=$DATA_DIR/train/src.txt
11 | TRAIN_TGT_FILE=$DATA_DIR/train/tgt.txt
12 | VALID_SRC_FILE=$DATA_DIR/dev/src.txt
13 | VALID_TGT_FILE=$DATA_DIR/dev/tgt.txt
14 | 
15 | # tokenizing
16 | if [ ! -f $TRAIN_SRC_FILE".char" ]; then
17 |     echo "Tokenizing..."
18 |     cd ../utils
19 |     python segment_bert.py <$TRAIN_SRC_FILE >$TRAIN_SRC_FILE".char"
20 |     python segment_bert.py <$TRAIN_TGT_FILE >$TRAIN_TGT_FILE".char"
21 |     python segment_bert.py <$VALID_SRC_FILE >$VALID_SRC_FILE".char"
22 |     python segment_bert.py <$VALID_TGT_FILE >$VALID_TGT_FILE".char"
23 |     cd -
24 | fi
25 | 
26 | cp $TRAIN_SRC_FILE".char" $PROCESSED_DIR/train.char.src
27 | cp $TRAIN_TGT_FILE".char" $PROCESSED_DIR/train.char.tgt
28 | cp $VALID_SRC_FILE".char" $PROCESSED_DIR/valid.char.src
29 | cp $VALID_TGT_FILE".char" $PROCESSED_DIR/valid.char.tgt
30 | mkdir -p $PROCESSED_DIR/bin
31 | 
32 | echo "Preprocessing..."
33 | 
34 | python $FAIRSEQ_DIR/preprocess.py --source-lang src --target-lang tgt \
35 | --user-dir ../SynGEC/src/src_syngec/syngec_model \
36 | --task syntax-enhanced-translation \
37 | --trainpref $PROCESSED_DIR/train.char \
38 | --validpref $PROCESSED_DIR/valid.char \
39 | --destdir $PROCESSED_DIR/bin \
40 | --workers $WORKER_NUM \
41 | --labeldict $SYNTAX_DICT \
42 | --srcdict ../data/dict/dict.src.txt \
43 | --tgtdict ../data/dict/dict.tgt.txt
44 | 
45 | echo "Finished!"
46 | 
47 | ######## Finetuning ########
48 | SEED=42
49 | PRETRAIN_MODEL_PATH=../models/pseudo_native_bart_zh.pt
50 | MODEL_DIR=../models/pseudo_native_bart_zh_finetuned_with_NaSGEC_Thesis/$SEED
51 | 
52 | mkdir -p $MODEL_DIR/src
53 | cp ./finetune.sh $MODEL_DIR
54 | 
55 | CUDA_VISIBLE_DEVICES=0 NCCL_DEBUG=INFO nohup python -u $FAIRSEQ_DIR/train.py $PROCESSED_DIR/bin \
56 | --save-dir $MODEL_DIR \
57 | --user-dir ../SynGEC/src/src_syngec/syngec_model \
58 | --task syntax-enhanced-translation \
59 | --arch syntax_enhanced_bart_large \
60 | --finetune-from-model $PRETRAIN_MODEL_PATH \
61 | --skip-invalid-size-inputs-valid-test \
62 | --max-tokens 1024 \
63 | --update-freq 1 \
64 | --optimizer adam \
65 | --lr 1e-05 \
66 | --max-source-positions 512 \
67 | --max-target-positions 512 \
68 | --warmup-updates 0 \
69 | -s src \
70 | -t tgt \
71 | --lr-scheduler polynomial_decay \
72 | --clip-norm 1.0 \
73 | --criterion label_smoothed_cross_entropy \
74 | --label-smoothing 0.1 \
75 | --dropout 0.3 \
76 | --share-all-embeddings \
77 | --adam-betas '(0.9,0.999)' \
78 | --log-format tqdm \
79 | --find-unused-parameters \
80 | --fp16 \
81 | --max-epoch 100 \
82 | --patience 10 \
83 | --seed $SEED >$MODEL_DIR/nohup.log 2>&1 &
84 | 
85 | wait
86 | 
--------------------------------------------------------------------------------
/bash/generate.sh:
--------------------------------------------------------------------------------
1 | CUDA_DEVICE=0
2 | BEAM=12
3 | N_BEST=1
4 | SEED=2022
5 | FAIRSEQ_DIR=../SynGEC/src/src_syngec/fairseq-0.10.2/fairseq_cli
6 | 
7 | TEST_DIR=/mnt/nas_alinlp/zuyi.bzy/zhangyue/NaSGEC/data/test # 测试集路径
8 | INPUT_FILE=$TEST_DIR/input.txt
9 | MODEL_PATH=../models/real_learner_bart_CGEC.pt # 模型路径
10 | PROCESSED_DIR=../data/dict
11 | OUTPUT_DIR=../results/test
12 | 
13 | # tokenizing
14 | if [ ! -f $INPUT_FILE".char" ]; then
15 |     echo "Tokenizing..."
16 |     cd ../utils
17 |     python segment_bert.py <$INPUT_FILE >$INPUT_FILE".char"
18 |     cd -
19 | fi
20 | 
21 | mkdir -p $OUTPUT_DIR
22 | cp $INPUT_FILE $OUTPUT_DIR/input.txt
23 | INPUT_FILE=$INPUT_FILE".char"
24 | cp $INPUT_FILE $OUTPUT_DIR/input.char
25 | 
26 | echo "Generating..."
27 | SECONDS=0 28 | 29 | CUDA_VISIBLE_DEVICES=$CUDA_DEVICE python -u ${FAIRSEQ_DIR}/interactive.py $PROCESSED_DIR \ 30 | --user-dir ../SynGEC/src/src_syngec/syngec_model \ 31 | --task syntax-enhanced-translation \ 32 | --path ${MODEL_PATH} \ 33 | --beam ${BEAM} \ 34 | --nbest ${N_BEST} \ 35 | -s src \ 36 | -t tgt \ 37 | --buffer-size 10000 \ 38 | --batch-size 32 \ 39 | --num-workers 12 \ 40 | --log-format tqdm \ 41 | --remove-bpe \ 42 | --fp16 \ 43 | --output_file $OUTPUT_DIR/output.nbest \ 44 | <$OUTPUT_DIR/input.char 45 | 46 | echo "Generating Finish!" 47 | duration=$SECONDS 48 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 49 | 50 | cat $OUTPUT_DIR/output.nbest | grep "^D-" | python -c "import sys; x = sys.stdin.readlines(); x = ''.join([ x[i] for i in range(len(x)) if (i % ${N_BEST} == 0) ]); print(x)" | cut -f 3 >$OUTPUT_DIR/output.char 51 | sed -i '$d' $OUTPUT_DIR/output.char 52 | cat $OUTPUT_DIR/output.char | python -c "import sys; x = sys.stdin.readlines(); x = '\n'.join([''.join([tok[2:] if len(tok) > 2 and tok[:2] == '##' else tok for tok in sent.split()]) for sent in x]); print(x)" >$OUTPUT_DIR/output.txt # 最终预测结果 53 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/.DS_Store -------------------------------------------------------------------------------- /data/NaSGEC-Exam/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Exam/.gitkeep -------------------------------------------------------------------------------- /data/NaSGEC-Media/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Media/.DS_Store -------------------------------------------------------------------------------- /data/NaSGEC-Media/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Media/.gitkeep -------------------------------------------------------------------------------- /data/NaSGEC-Thesis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/data/NaSGEC-Thesis/.gitkeep -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # NaSGEC 2 | 3 | ## 如何获取数据 4 | 请参考“语析LAGroup”的官方仓库https://github.com/SUDA-LA/CGECData 5 | -------------------------------------------------------------------------------- /data/dict/dict.label0.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | M 5 | R 6 | S 7 | acomp 8 | advcl 9 | advmod 10 | amod 11 | appos 12 | aux 13 | auxpass 14 | cc 15 | ccomp 16 | conj 17 | cop 18 | csubj 19 | csubjpass 20 | dep 21 | det 22 | discourse 23 | dobj 24 | expl 25 | infmod 26 | iobj 27 | mark 28 | mwe 29 | neg 30 | nn 31 | npadvmod 32 | nsubj 33 | nsubjpass 34 | num 35 | number 36 | parataxis 37 | 
partmod
38 | pcomp
39 | pobj
40 | poss
41 | possessive
42 | preconj
43 | predet
44 | prep
45 | prt
46 | punct
47 | quantmod
48 | rcmod
49 | root
50 | tmod
51 | xcomp
52 | 
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
2 | 
3 | valid_models = ["HillZhang/real_learner_bart_CGEC", "HillZhang/pseudo_native_bart_CGEC", "HillZhang/pseudo_native_bart_CGEC_media", "HillZhang/pseudo_native_bart_CGEC_thesis", "HillZhang/real_learner_bart_CGEC_exam"]
4 | 
5 | tokenizer = BertTokenizer.from_pretrained(valid_models[0])
6 | model = BartForConditionalGeneration.from_pretrained(valid_models[0])
7 | encoded_input = tokenizer(["北京是中国的都。", "他说:”我最爱的运动是打蓝球“", "我每天大约喝5次水左右。", "今天,我非常开开心。"], return_tensors="pt", padding=True, truncation=True)
8 | if "token_type_ids" in encoded_input:
9 |     del encoded_input["token_type_ids"]
10 | output = model.generate(**encoded_input)
11 | print(tokenizer.batch_decode(output, skip_special_tokens=True))
12 | 
13 | 
--------------------------------------------------------------------------------
/guidelines/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/guidelines/.gitkeep
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/models/.gitkeep
--------------------------------------------------------------------------------
/pics/data_statistic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/pics/data_statistic.png
--------------------------------------------------------------------------------
/preprocessed/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/preprocessed/.gitkeep
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | antlr4-python3-runtime==4.9.3
2 | astroid==2.11.7
3 | blessings==1.7
4 | charset-normalizer==2.1.0
5 | click==8.1.3
6 | dill==0.3.5.1
7 | emoji==2.0.0
8 | filelock==3.7.1
9 | gpustat==0.6.0
10 | huggingface-hub==0.8.1
11 | idna==3.3
12 | isort==5.10.1
13 | joblib==1.1.0
14 | lazy-object-proxy==1.7.1
15 | mccabe==0.7.0
16 | multiprocess==0.70.13
17 | nltk==3.7
18 | numpy==1.23.1
19 | nvidia-ml-py3==7.352.0
20 | omegaconf==2.2.2
21 | packaging==21.3
22 | pathos==0.2.9
23 | platformdirs==2.5.2
24 | pox==0.3.1
25 | ppft==1.7.6.5
26 | protobuf==4.21.2
27 | psutil==5.9.1
28 | pylint==2.14.5
29 | pyparsing==3.0.9
30 | PyYAML==6.0
31 | regex==2022.7.9
32 | requests==2.28.1
33 | six==1.16.0
34 | stanza==1.4.0
35 | tokenizers==0.12.1
36 | tomli==2.0.1
37 | tomlkit==0.11.1
38 | torch==1.12.0
39 | transformers==4.20.1
40 | typing_extensions==4.3.0
41 | urllib3==1.26.10
42 | wrapt==1.14.1
43 | xlrd==2.0.1
44 | xlwt==1.3.0
45 | supar==1.1.4
46 | errant
47 | subword-nmt
-------------------------------------------------------------------------------- /results/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HillZhang1999/NaSGEC/13044f16a68b07ce57a3fa423199e046a97e6244/results/.gitkeep -------------------------------------------------------------------------------- /utils/convert_from_fairseq_to_huggingface.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | BartConfig, 3 | BartForConditionalGeneration, 4 | BartForSequenceClassification, 5 | BartModel, 6 | BartTokenizer, 7 | ) 8 | from fairseq.checkpoint_utils import load_model_ensemble_and_task 9 | import os 10 | import logging 11 | import sys 12 | from pathlib import Path 13 | import argparse 14 | import torch 15 | 16 | logging.basicConfig( 17 | format="%(asctime)s | %(levelname)s | %(name)s | [%(filename)s:%(lineno)d] %(message)s", 18 | datefmt="%Y-%m-%d %H:%M:%S", 19 | level=os.environ.get("LOGLEVEL", "INFO").upper(), 20 | stream=sys.stdout, 21 | ) 22 | logger = logging.getLogger("convert") 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--checkpoint_path", type=str, default="~/NaSGEC/models/pseudo_native_bart_zh.twisted.pt") 26 | parser.add_argument("--data_dir", type=str, default="~/NaSGEC/data/dict") 27 | parser.add_argument("--save_dir", type=str, default="~/NaSGEC/models/test") 28 | main_args = parser.parse_args() 29 | 30 | logger.info('Load fairseq checkpoint...') 31 | models, args, task = load_model_ensemble_and_task(filenames=[os.path.expanduser(main_args.checkpoint_path)], 32 | arg_overrides={'data': os.path.expanduser(main_args.data_dir)}) 33 | 34 | fairseq_transformer = models[0].eval() 35 | 36 | logger.info('Huggingface config...') 37 | huggingface_config = BartConfig.from_pretrained('fnlp/bart-large-chinese', 38 | activation_function=args.activation_fn, 39 | d_model=args.encoder_embed_dim, 40 | encoder_attention_heads=args.encoder_attention_heads, 41 | encoder_ffn_dim=args.encoder_ffn_embed_dim, 42 | encoder_layers=args.encoder_layers, 43 | decoder_attention_heads=args.decoder_attention_heads, 44 | decoder_ffn_dim=args.decoder_ffn_embed_dim, 45 | decoder_layers=args.decoder_layers, 46 | normalize_embedding=args.layernorm_embedding, 47 | scale_embedding=(not args.no_scale_embedding), 48 | static_position_embeddings=(not args.encoder_learned_pos), 49 | vocab_size=len(task.source_dictionary), 50 | revision="v1.0" 51 | ) 52 | logger.info('Init huggingface model...') 53 | huggingface_model = BartForConditionalGeneration(huggingface_config).eval() 54 | 55 | logger.info('Convert...') 56 | def remove_ignore_keys_(state_dict): 57 | ignore_keys = [ 58 | "encoder.version", 59 | "decoder.version", 60 | "model.encoder.version", 61 | "model.decoder.version", 62 | "_float_tensor", 63 | "decoder.output_projection.weight", 64 | "encoder.embed_positions._float_tensor", 65 | "decoder.embed_positions._float_tensor" 66 | ] 67 | for k in ignore_keys: 68 | state_dict.pop(k, None) 69 | 70 | def rename_key(dct, old, new): 71 | val = dct.pop(old) 72 | dct[new] = val 73 | 74 | state_dict = fairseq_transformer.state_dict() 75 | remove_ignore_keys_(state_dict) 76 | huggingface_model.model.load_state_dict(state_dict, strict=False) 77 | 78 | logger.info('Success!') 79 | Path(main_args.save_dir).mkdir(exist_ok=True) 80 | huggingface_model.save_pretrained(main_args.save_dir) -------------------------------------------------------------------------------- 
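The conversion script above only ports the encoder/decoder weights into a `BartForConditionalGeneration` checkpoint; it does not export a tokenizer. A quick way to sanity-check a converted model is to load it with `transformers` and decode a few sentences, reusing the tokenizer of the released Hugging Face checkpoints. The sketch below assumes the fairseq dictionary used for conversion matches that tokenizer's vocabulary, and the local `--save_dir` path is only an example, not a path shipped with the repository.

```
# Hypothetical sanity check after running, e.g.:
#   python convert_from_fairseq_to_huggingface.py \
#       --checkpoint_path ~/NaSGEC/models/pseudo_native_bart_zh.twisted.pt \
#       --data_dir ~/NaSGEC/data/dict \
#       --save_dir ~/NaSGEC/models/test
import os
from transformers import BertTokenizer, BartForConditionalGeneration

save_dir = os.path.expanduser("~/NaSGEC/models/test")  # example --save_dir from above
# Assumption: the fairseq dict matches the vocab of the released HF checkpoints.
tokenizer = BertTokenizer.from_pretrained("HillZhang/pseudo_native_bart_CGEC")
model = BartForConditionalGeneration.from_pretrained(save_dir).eval()

encoded = tokenizer(["北京是中国的都。"], return_tensors="pt")
if "token_type_ids" in encoded:  # BART does not use token type ids
    del encoded["token_type_ids"]
print(tokenizer.batch_decode(model.generate(**encoded), skip_special_tokens=True))
```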
/utils/segment_bert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tokenization 3 | from tqdm import tqdm 4 | from multiprocessing import Pool 5 | 6 | tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=False) 7 | 8 | def split(line): 9 | line = line.strip() 10 | origin_line = line 11 | line = line.replace(" ", "") 12 | line = tokenization.convert_to_unicode(line) 13 | if not line: 14 | return '' 15 | tokens = tokenizer.tokenize(line) 16 | return ' '.join(tokens) 17 | 18 | with Pool(64) as pool: 19 | for ret in pool.imap(split, tqdm(sys.stdin), chunksize=1024): 20 | print(ret) 21 | 22 | -------------------------------------------------------------------------------- /utils/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | 25 | 26 | def convert_to_unicode(text): 27 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 28 | if six.PY3: 29 | if isinstance(text, str): 30 | return text 31 | elif isinstance(text, bytes): 32 | return text.decode("utf-8", "ignore") 33 | else: 34 | raise ValueError("Unsupported string type: %s" % (type(text))) 35 | elif six.PY2: 36 | if isinstance(text, str): 37 | return text.decode("utf-8", "ignore") 38 | elif isinstance(text, unicode): 39 | return text 40 | else: 41 | raise ValueError("Unsupported string type: %s" % (type(text))) 42 | else: 43 | raise ValueError("Not running on Python2 or Python 3?") 44 | 45 | 46 | def printable_text(text): 47 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 48 | 49 | # These functions want `str` for both Python2 and Python3, but in one case 50 | # it's a Unicode string and in the other it's a byte string. 
51 | if six.PY3: 52 | if isinstance(text, str): 53 | return text 54 | elif isinstance(text, bytes): 55 | return text.decode("utf-8", "ignore") 56 | else: 57 | raise ValueError("Unsupported string type: %s" % (type(text))) 58 | elif six.PY2: 59 | if isinstance(text, str): 60 | return text 61 | elif isinstance(text, unicode): 62 | return text.encode("utf-8") 63 | else: 64 | raise ValueError("Unsupported string type: %s" % (type(text))) 65 | else: 66 | raise ValueError("Not running on Python2 or Python 3?") 67 | 68 | 69 | def load_vocab(vocab_file): 70 | """Loads a vocabulary file into a dictionary.""" 71 | vocab = collections.OrderedDict() 72 | index = 0 73 | with open(vocab_file, "r") as reader: 74 | while True: 75 | token = convert_to_unicode(reader.readline()) 76 | if not token: 77 | break 78 | token = token.strip() 79 | vocab[token] = index 80 | index += 1 81 | return vocab 82 | 83 | 84 | def convert_by_vocab(vocab, items): 85 | """Converts a sequence of [tokens|ids] using the vocab.""" 86 | output = [] 87 | for item in items: 88 | if item not in vocab: 89 | print("warning: %s not in vocab" % item) 90 | item = "[UNK]" 91 | output.append(vocab[item]) 92 | return output 93 | 94 | 95 | def convert_tokens_to_ids(vocab, tokens): 96 | return convert_by_vocab(vocab, tokens) 97 | 98 | 99 | def convert_ids_to_tokens(inv_vocab, ids): 100 | return convert_by_vocab(inv_vocab, ids) 101 | 102 | 103 | def whitespace_tokenize(text): 104 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 105 | text = text.strip() 106 | if not text: 107 | return [] 108 | tokens = text.split() 109 | return tokens 110 | 111 | 112 | class FullTokenizer(object): 113 | """Runs end-to-end tokenziation.""" 114 | 115 | def __init__(self, vocab_file, do_lower_case=True): 116 | self.vocab = load_vocab(vocab_file) 117 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 118 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 119 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 120 | 121 | def tokenize(self, text): 122 | split_tokens = [] 123 | for token in self.basic_tokenizer.tokenize(text): 124 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 125 | split_tokens.append(sub_token) 126 | 127 | return split_tokens 128 | 129 | def convert_tokens_to_ids(self, tokens): 130 | return convert_by_vocab(self.vocab, tokens) 131 | 132 | def convert_ids_to_tokens(self, ids): 133 | return convert_by_vocab(self.inv_vocab, ids) 134 | 135 | 136 | class BasicTokenizer(object): 137 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 138 | 139 | def __init__(self, do_lower_case=True): 140 | """Constructs a BasicTokenizer. 141 | Args: 142 | do_lower_case: Whether to lower case the input. 143 | """ 144 | self.do_lower_case = do_lower_case 145 | 146 | def tokenize(self, text): 147 | """Tokenizes a piece of text.""" 148 | text = convert_to_unicode(text) 149 | text = self._clean_text(text) 150 | 151 | # This was added on November 1st, 2018 for the multilingual and Chinese 152 | # models. This is also applied to the English models now, but it doesn't 153 | # matter since the English models were not trained on any Chinese data 154 | # and generally don't have any Chinese data in them (there are Chinese 155 | # characters in the vocabulary because Wikipedia does have some Chinese 156 | # words in the English Wikipedia.). 
157 | text = self._tokenize_chinese_chars(text) 158 | 159 | orig_tokens = whitespace_tokenize(text) 160 | split_tokens = [] 161 | for token in orig_tokens: 162 | if self.do_lower_case: 163 | token = token.lower() 164 | token = self._run_strip_accents(token) 165 | split_tokens.extend(self._run_split_on_punc(token)) 166 | 167 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 168 | return output_tokens 169 | 170 | def _run_strip_accents(self, text): 171 | """Strips accents from a piece of text.""" 172 | text = unicodedata.normalize("NFD", text) 173 | output = [] 174 | for char in text: 175 | cat = unicodedata.category(char) 176 | if cat == "Mn": 177 | continue 178 | output.append(char) 179 | return "".join(output) 180 | 181 | def _run_split_on_punc(self, text): 182 | """Splits punctuation on a piece of text.""" 183 | chars = list(text) 184 | i = 0 185 | start_new_word = True 186 | output = [] 187 | while i < len(chars): 188 | char = chars[i] 189 | if _is_punctuation(char): 190 | output.append([char]) 191 | start_new_word = True 192 | else: 193 | if start_new_word: 194 | output.append([]) 195 | start_new_word = False 196 | output[-1].append(char) 197 | i += 1 198 | 199 | return ["".join(x) for x in output] 200 | 201 | def _tokenize_chinese_chars(self, text): 202 | """Adds whitespace around any CJK character.""" 203 | output = [] 204 | for char in text: 205 | cp = ord(char) 206 | if self._is_chinese_char(cp): 207 | output.append(" ") 208 | output.append(char) 209 | output.append(" ") 210 | else: 211 | output.append(char) 212 | return "".join(output) 213 | 214 | def _is_chinese_char(self, cp): 215 | """Checks whether CP is the codepoint of a CJK character.""" 216 | # This defines a "chinese character" as anything in the CJK Unicode block: 217 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 218 | # 219 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 220 | # despite its name. The modern Korean Hangul alphabet is a different block, 221 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 222 | # space-separated words, so they are not treated specially and handled 223 | # like the all of the other languages. 224 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 225 | (cp >= 0x3400 and cp <= 0x4DBF) or # 226 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 227 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 228 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 229 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 230 | (cp >= 0xF900 and cp <= 0xFAFF) or # 231 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 232 | return True 233 | 234 | return False 235 | 236 | def _clean_text(self, text): 237 | """Performs invalid character removal and whitespace cleanup on text.""" 238 | output = [] 239 | for char in text: 240 | cp = ord(char) 241 | if cp == 0 or cp == 0xfffd or _is_control(char): 242 | continue 243 | if _is_whitespace(char): 244 | output.append(" ") 245 | else: 246 | output.append(char) 247 | return "".join(output) 248 | 249 | 250 | class WordpieceTokenizer(object): 251 | """Runs WordPiece tokenziation.""" 252 | 253 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 254 | self.vocab = vocab 255 | self.unk_token = unk_token 256 | self.max_input_chars_per_word = max_input_chars_per_word 257 | 258 | def tokenize(self, text): 259 | """Tokenizes a piece of text into its word pieces. 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 
262 | For example: 263 | input = "unaffable" 264 | output = ["un", "##aff", "##able"] 265 | Args: 266 | text: A single token or whitespace separated tokens. This should have 267 | already been passed through `BasicTokenizer. 268 | Returns: 269 | A list of wordpiece tokens. 270 | """ 271 | 272 | text = convert_to_unicode(text) 273 | 274 | output_tokens = [] 275 | for token in whitespace_tokenize(text): 276 | chars = list(token) 277 | if len(chars) > self.max_input_chars_per_word: 278 | output_tokens.append(self.unk_token) 279 | continue 280 | 281 | is_bad = False 282 | start = 0 283 | sub_tokens = [] 284 | while start < len(chars): 285 | end = len(chars) 286 | cur_substr = None 287 | while start < end: 288 | substr = "".join(chars[start:end]) 289 | if start > 0: 290 | substr = "##" + substr 291 | if substr in self.vocab: 292 | cur_substr = substr 293 | break 294 | end -= 1 295 | if cur_substr is None: 296 | is_bad = True 297 | break 298 | sub_tokens.append(cur_substr) 299 | start = end 300 | 301 | if is_bad: 302 | # output_tokens.append(self.unk_token) 303 | output_tokens.append(token) # keep the UNK token 304 | else: 305 | output_tokens.extend(sub_tokens) 306 | return output_tokens 307 | 308 | 309 | def _is_whitespace(char): 310 | """Checks whether `chars` is a whitespace character.""" 311 | # \t, \n, and \r are technically contorl characters but we treat them 312 | # as whitespace since they are generally considered as such. 313 | if char == " " or char == "\t" or char == "\n" or char == "\r": 314 | return True 315 | cat = unicodedata.category(char) 316 | if cat == "Zs": 317 | return True 318 | return False 319 | 320 | 321 | def _is_control(char): 322 | """Checks whether `chars` is a control character.""" 323 | # These are technically control characters but we count them as whitespace 324 | # characters. 325 | if char == "\t" or char == "\n" or char == "\r": 326 | return False 327 | cat = unicodedata.category(char) 328 | if cat.startswith("C"): 329 | return True 330 | return False 331 | 332 | 333 | def _is_punctuation(char): 334 | """Checks whether `chars` is a punctuation character.""" 335 | cp = ord(char) 336 | # We treat all non-letter/number ASCII as punctuation. 337 | # Characters such as "^", "$", and "`" are not in the Unicode 338 | # Punctuation class but we treat them as punctuation anyways, for 339 | # consistency. 340 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 341 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 342 | return True 343 | cat = unicodedata.category(char) 344 | if cat.startswith("P"): 345 | return True 346 | return False --------------------------------------------------------------------------------
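For reference, this is how the tokenizer above is typically driven in this repository: `segment_bert.py` feeds raw sentences through `FullTokenizer` to produce the space-separated `.char` files, and the last step of `generate.sh` strips the `##` wordpiece markers again. The sketch below mirrors that round trip; the example sentence is illustrative, and the relative `./vocab.txt` path assumes it is run from the `utils/` directory.

```
# Minimal usage sketch of FullTokenizer (mirrors segment_bert.py and the
# detokenization step at the end of generate.sh). Run from utils/ so that
# ./vocab.txt resolves.
import tokenization

tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=False)

sentence = "北京是中国的都。"  # illustrative input
tokens = tokenizer.tokenize(sentence)  # CJK characters come out one per token
print(" ".join(tokens))                # the ".char" format consumed by fairseq

# Undo wordpiece segmentation ("##xx" pieces), as generate.sh does for model outputs:
restored = "".join(tok[2:] if tok.startswith("##") else tok for tok in tokens)
print(restored)
```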