├── .github ├── release.yml └── workflows │ ├── ci.yaml │ ├── push_to_hub.yaml │ └── release.yaml ├── .gitignore ├── .tagpr ├── CHANGELOG.md ├── JGLUE.py ├── Makefile ├── README.md ├── pyproject.toml ├── tests ├── JGLUE_test.py └── __init__.py └── uv.lock /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | labels: 4 | - tagpr 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | paths-ignore: 9 | - 'README.md' 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ['3.8', '3.9', '3.10'] 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v4 21 | 22 | - name: Setup Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | make setup 30 | make install 31 | 32 | - name: Format 33 | run: | 34 | make format 35 | 36 | - name: Lint 37 | run: | 38 | make lint 39 | 40 | - name: Type check 41 | run: | 42 | make typecheck 43 | 44 | - name: Run tests 45 | run: | 46 | make test 47 | -------------------------------------------------------------------------------- /.github/workflows/push_to_hub.yaml: -------------------------------------------------------------------------------- 1 | name: Sync to Hugging Face Hub 2 | 3 | on: 4 | workflow_run: 5 | workflows: 6 | - CI 7 | branches: 8 | - main 9 | types: 10 | - completed 11 | 12 | jobs: 13 | push_to_hub: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v2 19 | 20 | - name: Push to Huggingface hub 21 | env: 22 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 23 | HF_USERNAME: ${{ secrets.HF_USERNAME }} 24 | run: | 25 | git fetch --unshallow 26 | git push --force https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/datasets/${HF_USERNAME}/JGLUE main 27 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | push: 4 | branches: ["main"] 5 | jobs: 6 | tagpr: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - uses: Songmu/tagpr@v1 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before 
PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # End of https://www.toptal.com/developers/gitignore/api/python 174 | -------------------------------------------------------------------------------- /.tagpr: -------------------------------------------------------------------------------- 1 | # config file for the tagpr in git config format 2 | # The tagpr generates the initial configuration, which you can rewrite to suit your environment. 3 | # CONFIGURATIONS: 4 | # tagpr.releaseBranch 5 | # Generally, it is "main." It is the branch for releases. The tagpr tracks this branch, 6 | # creates or updates a pull request as a release candidate, or tags when they are merged. 7 | # 8 | # tagpr.versionFile 9 | # Versioning file containing the semantic version needed to be updated at release. 10 | # It will be synchronized with the "git tag". 11 | # Often this is a meta-information file such as gemspec, setup.cfg, package.json, etc. 12 | # Sometimes the source code file, such as version.go or Bar.pm, is used. 13 | # If you do not want to use versioning files but only git tags, specify the "-" string here. 14 | # You can specify multiple version files by comma separated strings. 15 | # 16 | # tagpr.vPrefix 17 | # Flag whether or not v-prefix is added to semver when git tagging. (e.g. v1.2.3 if true) 18 | # This is only a tagging convention, not how it is described in the version file. 19 | # 20 | # tagpr.changelog (Optional) 21 | # Flag whether or not changelog is added or changed during the release. 22 | # 23 | # tagpr.command (Optional) 24 | # Command to change files just before release. 25 | # 26 | # tagpr.template (Optional) 27 | # Pull request template file in go template format 28 | # 29 | # tagpr.templateText (Optional) 30 | # Pull request template text in go template format 31 | # 32 | # tagpr.release (Optional) 33 | # GitHub Release creation behavior after tagging [true, draft, false] 34 | # If this value is not set, the release is to be created. 35 | # 36 | # tagpr.majorLabels (Optional) 37 | # Label of major update targets. Default is [major] 38 | # 39 | # tagpr.minorLabels (Optional) 40 | # Label of minor update targets. Default is [minor] 41 | # 42 | # tagpr.commitPrefix (Optional) 43 | # Prefix of commit message. 
Default is "[tagpr]" 44 | # 45 | [tagpr] 46 | vPrefix = true 47 | releaseBranch = main 48 | versionFile = pyproject.toml 49 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [v1.2.0](https://github.com/shunk031/huggingface-datasets_JGLUE/compare/v1.1.1...v1.2.0) - 2025-03-31 4 | - Update dataset URLs and version to 1.2.0 by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/20 5 | 6 | ## [v1.1.1](https://github.com/shunk031/huggingface-datasets_JGLUE/compare/v1.1.0...v1.1.1) - 2025-03-31 7 | - Use uv by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/27 8 | - Bump JGLUE version to 1.1.0 by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/29 9 | 10 | ## [v1.1.0](https://github.com/shunk031/huggingface-datasets_JGLUE/compare/v1.0.0...v1.1.0) - 2025-03-31 11 | - Update dataset URLs to use versioned tags and bump version to 1.1.0 by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/22 12 | 13 | ## [v0.0.1](https://github.com/shunk031/huggingface-datasets_JGLUE/commits/v0.0.1) - 2025-03-31 14 | - Initialize by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/1 15 | - Fix for the huggingface dataset viewer by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/2 16 | - Follow HF SQuAD dataset by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/4 17 | - refactor JGLUE.py by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/5 18 | - Specify open encoding by @polm in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/6 19 | - download parquet from hf datasets in `MARC-ja` by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/10 20 | - open file in utf-8 by @kishida in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/11 21 | - update README by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/12 22 | - Update for CI by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/14 23 | - Add JCoLA dataset by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/13 24 | - update README.md by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/16 25 | - Remove datasets.tasks and add trust_remote_code by @mjun0812 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/17 26 | - Set package-mode to false in pyproject.toml by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/21 27 | - Update dataset URLs and version to 1.0.0 in JGLUE.py and pyproject.toml by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/23 28 | - Add GitHub Actions workflow for automated release tagging by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/24 29 | -------------------------------------------------------------------------------- /JGLUE.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import random 4 | import string 5 | import warnings 6 | from dataclasses import dataclass 7 | from typing import Dict, List, Literal, Optional 8 | 9 | import datasets as ds 10 | import pandas as pd 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | _JGLUE_CITATION = """\ 15 | @inproceedings{kurihara-lrec-2022-jglue, 16 | title={JGLUE: 
Japanese general language understanding evaluation}, 17 | author={Kurihara, Kentaro and Kawahara, Daisuke and Shibata, Tomohide}, 18 | booktitle={Proceedings of the Thirteenth Language Resources and Evaluation Conference}, 19 | pages={2957--2966}, 20 | year={2022}, 21 | url={https://aclanthology.org/2022.lrec-1.317/} 22 | } 23 | @inproceedings{kurihara-nlp-2022-jglue, 24 | title={JGLUE: 日本語言語理解ベンチマーク}, 25 | author={栗原健太郎 and 河原大輔 and 柴田知秀}, 26 | booktitle={言語処理学会第28回年次大会}, 27 | pages={2023--2028}, 28 | year={2022}, 29 | url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E8-4.pdf}, 30 | note={in Japanese} 31 | } 32 | """ 33 | 34 | _JCOLA_CITATION = """\ 35 | @article{someya2023jcola, 36 | title={JCoLA: Japanese Corpus of Linguistic Acceptability}, 37 | author={Taiga Someya and Yushi Sugimoto and Yohei Oseki}, 38 | year={2023}, 39 | eprint={2309.12676}, 40 | archivePrefix={arXiv}, 41 | primaryClass={cs.CL} 42 | } 43 | @inproceedings{someya-nlp-2022-jcola, 44 | title={日本語版 CoLA の構築}, 45 | author={染谷 大河 and 大関 洋平}, 46 | booktitle={言語処理学会第28回年次大会}, 47 | pages={1872--1877}, 48 | year={2022}, 49 | url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E7-1.pdf}, 50 | note={in Japanese} 51 | } 52 | """ 53 | 54 | _MARC_JA_CITATION = """\ 55 | @inproceedings{marc_reviews, 56 | title={The Multilingual Amazon Reviews Corpus}, 57 | author={Keung, Phillip and Lu, Yichao and Szarvas, György and Smith, Noah A.}, 58 | booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing}, 59 | pages={4563--4568}, 60 | year={2020} 61 | } 62 | """ 63 | 64 | _JSTS_JNLI_CITATION = """\ 65 | @inproceedings{miyazaki2016cross, 66 | title={Cross-lingual image caption generation}, 67 | author={Miyazaki, Takashi and Shimizu, Nobuyuki}, 68 | booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 69 | pages={1780--1790}, 70 | year={2016} 71 | } 72 | """ 73 | 74 | _DESCRIPTION = """\ 75 | JGLUE, Japanese General Language Understanding Evaluation, \ 76 | is built to measure the general NLU ability in Japanese. JGLUE has been constructed \ 77 | from scratch without translation. We hope that JGLUE will facilitate NLU research in Japanese.\ 78 | """ 79 | 80 | _JGLUE_HOMEPAGE = "https://github.com/yahoojapan/JGLUE" 81 | _JCOLA_HOMEPAGE = "https://github.com/osekilab/JCoLA" 82 | _MARC_JA_HOMEPAGE = "https://registry.opendata.aws/amazon-reviews-ml/" 83 | 84 | _JGLUE_LICENSE = """\ 85 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.\ 86 | """ 87 | 88 | _DESCRIPTION_CONFIGS = { 89 | "MARC-ja": "MARC-ja is a dataset of the text classification task. This dataset is based on the Japanese portion of Multilingual Amazon Reviews Corpus (MARC) (Keung+, 2020).", 90 | "JCoLA": "JCoLA (Japanese Corpus of Linguistic Accept010 ability) is a novel dataset for targeted syntactic evaluations of language models in Japanese, which consists of 10,020 sentences with acceptability judgments by linguists.", 91 | "JSTS": "JSTS is a Japanese version of the STS (Semantic Textual Similarity) dataset. STS is a task to estimate the semantic similarity of a sentence pair.", 92 | "JNLI": "JNLI is a Japanese version of the NLI (Natural Language Inference) dataset. 
NLI is a task to recognize the inference relation that a premise sentence has to a hypothesis sentence.", 93 | "JSQuAD": "JSQuAD is a Japanese version of SQuAD (Rajpurkar+, 2016), one of the datasets of reading comprehension.", 94 | "JCommonsenseQA": "JCommonsenseQA is a Japanese version of CommonsenseQA (Talmor+, 2019), which is a multiple-choice question answering dataset that requires commonsense reasoning ability.", 95 | } 96 | 97 | _URLS = { 98 | "MARC-ja": { 99 | "data": "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz", 100 | "filter_review_id_list": { 101 | "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/preprocess/marc-ja/data/filter_review_id_list/valid.txt", 102 | }, 103 | "label_conv_review_id_list": { 104 | "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/preprocess/marc-ja/data/label_conv_review_id_list/valid.txt", 105 | }, 106 | }, 107 | "JCoLA": { 108 | "train": { 109 | "in_domain": { 110 | "json": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/in_domain_train-v1.0.json", 111 | } 112 | }, 113 | "valid": { 114 | "in_domain": { 115 | "json": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/in_domain_valid-v1.0.json", 116 | }, 117 | "out_of_domain": { 118 | "json": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/out_of_domain_valid-v1.0.json", 119 | "json_annotated": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/out_of_domain_valid_annotated-v1.0.json", 120 | }, 121 | }, 122 | }, 123 | "JSTS": { 124 | "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsts-v1.2/train-v1.2.json", 125 | "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsts-v1.2/valid-v1.2.json", 126 | }, 127 | "JNLI": { 128 | "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jnli-v1.2/train-v1.2.json", 129 | "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jnli-v1.2/valid-v1.2.json", 130 | }, 131 | "JSQuAD": { 132 | "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsquad-v1.2/train-v1.2.json", 133 | "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsquad-v1.2/valid-v1.2.json", 134 | }, 135 | "JCommonsenseQA": { 136 | "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jcommonsenseqa-v1.2/train-v1.2.json", 137 | "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jcommonsenseqa-v1.2/valid-v1.2.json", 138 | }, 139 | } 140 | 141 | 142 | def dataset_info_jsts() -> ds.DatasetInfo: 143 | features = ds.Features( 144 | { 145 | "sentence_pair_id": ds.Value("string"), 146 | "yjcaptions_id": ds.Value("string"), 147 | "sentence1": ds.Value("string"), 148 | "sentence2": ds.Value("string"), 149 | "label": ds.Value("float"), 150 | } 151 | ) 152 | return ds.DatasetInfo( 153 | description=_DESCRIPTION, 154 | citation=_JGLUE_CITATION, 155 | homepage=f"{_JSTS_JNLI_CITATION}\n{_JGLUE_HOMEPAGE}", 156 | license=_JGLUE_LICENSE, 157 | features=features, 158 | ) 159 | 160 | 161 | def dataset_info_jnli() -> ds.DatasetInfo: 162 | features = ds.Features( 163 | { 164 | "sentence_pair_id": ds.Value("string"), 165 | "yjcaptions_id": ds.Value("string"), 166 | "sentence1": ds.Value("string"), 167 | "sentence2": ds.Value("string"), 168 | "label": ds.ClassLabel( 169 | num_classes=3, 
names=["entailment", "contradiction", "neutral"] 170 | ), 171 | } 172 | ) 173 | return ds.DatasetInfo( 174 | description=_DESCRIPTION, 175 | citation=_JGLUE_CITATION, 176 | homepage=f"{_JSTS_JNLI_CITATION}\n{_JGLUE_HOMEPAGE}", 177 | license=_JGLUE_LICENSE, 178 | features=features, 179 | supervised_keys=None, 180 | ) 181 | 182 | 183 | def dataset_info_jsquad() -> ds.DatasetInfo: 184 | features = ds.Features( 185 | { 186 | "id": ds.Value("string"), 187 | "title": ds.Value("string"), 188 | "context": ds.Value("string"), 189 | "question": ds.Value("string"), 190 | "answers": ds.Sequence( 191 | {"text": ds.Value("string"), "answer_start": ds.Value("int32")} 192 | ), 193 | "is_impossible": ds.Value("bool"), 194 | } 195 | ) 196 | return ds.DatasetInfo( 197 | description=_DESCRIPTION, 198 | citation=_JGLUE_CITATION, 199 | homepage=_JGLUE_HOMEPAGE, 200 | license=_JGLUE_LICENSE, 201 | features=features, 202 | supervised_keys=None, 203 | ) 204 | 205 | 206 | def dataset_info_jcommonsenseqa() -> ds.DatasetInfo: 207 | features = ds.Features( 208 | { 209 | "q_id": ds.Value("int64"), 210 | "question": ds.Value("string"), 211 | "choice0": ds.Value("string"), 212 | "choice1": ds.Value("string"), 213 | "choice2": ds.Value("string"), 214 | "choice3": ds.Value("string"), 215 | "choice4": ds.Value("string"), 216 | "label": ds.ClassLabel( 217 | num_classes=5, 218 | names=["choice0", "choice1", "choice2", "choice3", "choice4"], 219 | ), 220 | } 221 | ) 222 | return ds.DatasetInfo( 223 | description=_DESCRIPTION, 224 | citation=_JGLUE_CITATION, 225 | homepage=_JGLUE_HOMEPAGE, 226 | license=_JGLUE_LICENSE, 227 | features=features, 228 | ) 229 | 230 | 231 | def dataset_info_jcola() -> ds.DatasetInfo: 232 | features = ds.Features( 233 | { 234 | "uid": ds.Value("int64"), 235 | "source": ds.Value("string"), 236 | "label": ds.ClassLabel( 237 | num_classes=2, 238 | names=["unacceptable", "acceptable"], 239 | ), 240 | "diacritic": ds.Value("string"), 241 | "sentence": ds.Value("string"), 242 | "original": ds.Value("string"), 243 | "translation": ds.Value("string"), 244 | "gloss": ds.Value("bool"), 245 | "linguistic_phenomenon": { 246 | "argument_structure": ds.Value("bool"), 247 | "binding": ds.Value("bool"), 248 | "control_raising": ds.Value("bool"), 249 | "ellipsis": ds.Value("bool"), 250 | "filler_gap": ds.Value("bool"), 251 | "island_effects": ds.Value("bool"), 252 | "morphology": ds.Value("bool"), 253 | "nominal_structure": ds.Value("bool"), 254 | "negative_polarity_concord_items": ds.Value("bool"), 255 | "quantifier": ds.Value("bool"), 256 | "verbal_agreement": ds.Value("bool"), 257 | "simple": ds.Value("bool"), 258 | }, 259 | } 260 | ) 261 | return ds.DatasetInfo( 262 | description=_DESCRIPTION, 263 | citation=f"{_JCOLA_CITATION}\n{_JGLUE_CITATION}", 264 | homepage=_JCOLA_HOMEPAGE, 265 | features=features, 266 | ) 267 | 268 | 269 | def dataset_info_marc_ja() -> ds.DatasetInfo: 270 | features = ds.Features( 271 | { 272 | "sentence": ds.Value("string"), 273 | "label": ds.ClassLabel( 274 | num_classes=3, names=["positive", "negative", "neutral"] 275 | ), 276 | "review_id": ds.Value("string"), 277 | } 278 | ) 279 | return ds.DatasetInfo( 280 | description=_DESCRIPTION, 281 | citation=f"{_MARC_JA_CITATION}\n{_JGLUE_CITATION}", 282 | homepage=_MARC_JA_HOMEPAGE, 283 | license=_JGLUE_LICENSE, 284 | features=features, 285 | ) 286 | 287 | 288 | @dataclass 289 | class JGLUEConfig(ds.BuilderConfig): 290 | """Class for JGLUE benchmark configuration""" 291 | 292 | 293 | @dataclass 294 | class MarcJaConfig(JGLUEConfig): 295 | 
name: str = "MARC-ja" 296 | is_han_to_zen: bool = False 297 | max_instance_num: Optional[int] = None 298 | max_char_length: int = 500 299 | is_pos_neg: bool = True 300 | train_ratio: float = 0.94 301 | val_ratio: float = 0.03 302 | test_ratio: float = 0.03 303 | output_testset: bool = False 304 | filter_review_id_list_valid: bool = True 305 | label_conv_review_id_list_valid: bool = True 306 | 307 | def __post_init__(self) -> None: 308 | assert self.train_ratio + self.val_ratio + self.test_ratio == 1.0 309 | 310 | 311 | JcolaDomain = Literal["in_domain", "out_of_domain"] 312 | 313 | 314 | @dataclass 315 | class JcolaConfig(JGLUEConfig): 316 | name: str = "JCoLA" 317 | domain: JcolaDomain = "in_domain" 318 | 319 | 320 | def get_label(rating: int, is_pos_neg: bool = False) -> Optional[str]: 321 | if rating >= 4: 322 | return "positive" 323 | elif rating <= 2: 324 | return "negative" 325 | else: 326 | if is_pos_neg: 327 | return None 328 | else: 329 | return "neutral" 330 | 331 | 332 | def is_filtered_by_ascii_rate(text: str, threshold: float = 0.9) -> bool: 333 | ascii_letters = set(string.printable) 334 | rate = sum(c in ascii_letters for c in text) / len(text) 335 | return rate >= threshold 336 | 337 | 338 | def shuffle_dataframe(df: pd.DataFrame) -> pd.DataFrame: 339 | instances = df.to_dict(orient="records") 340 | random.seed(1) 341 | random.shuffle(instances) 342 | return pd.DataFrame(instances) 343 | 344 | 345 | def get_filter_review_id_list( 346 | filter_review_id_list_paths: Dict[str, str], 347 | ) -> Dict[str, List[str]]: 348 | filter_review_id_list_valid = filter_review_id_list_paths.get("valid") 349 | filter_review_id_list_test = filter_review_id_list_paths.get("test") 350 | 351 | filter_review_id_list = {} 352 | 353 | if filter_review_id_list_valid is not None: 354 | with open(filter_review_id_list_valid, "r", encoding="utf-8") as rf: 355 | filter_review_id_list["valid"] = [line.rstrip() for line in rf] 356 | 357 | if filter_review_id_list_test is not None: 358 | with open(filter_review_id_list_test, "r", encoding="utf-8") as rf: 359 | filter_review_id_list["test"] = [line.rstrip() for line in rf] 360 | 361 | return filter_review_id_list 362 | 363 | 364 | def get_label_conv_review_id_list( 365 | label_conv_review_id_list_paths: Dict[str, str], 366 | ) -> Dict[str, Dict[str, str]]: 367 | import csv 368 | 369 | label_conv_review_id_list_valid = label_conv_review_id_list_paths.get("valid") 370 | label_conv_review_id_list_test = label_conv_review_id_list_paths.get("test") 371 | 372 | label_conv_review_id_list: Dict[str, Dict[str, str]] = {} 373 | 374 | if label_conv_review_id_list_valid is not None: 375 | with open(label_conv_review_id_list_valid, "r", encoding="utf-8") as rf: 376 | label_conv_review_id_list["valid"] = { 377 | row[0]: row[1] for row in csv.reader(rf) 378 | } 379 | 380 | if label_conv_review_id_list_test is not None: 381 | with open(label_conv_review_id_list_test, "r", encoding="utf-8") as rf: 382 | label_conv_review_id_list["test"] = { 383 | row[0]: row[1] for row in csv.reader(rf) 384 | } 385 | 386 | return label_conv_review_id_list 387 | 388 | 389 | def output_data( 390 | df: pd.DataFrame, 391 | train_ratio: float, 392 | val_ratio: float, 393 | test_ratio: float, 394 | output_testset: bool, 395 | filter_review_id_list_paths: Dict[str, str], 396 | label_conv_review_id_list_paths: Dict[str, str], 397 | ) -> Dict[str, pd.DataFrame]: 398 | instance_num = len(df) 399 | split_dfs: Dict[str, pd.DataFrame] = {} 400 | length1 = int(instance_num * train_ratio) 401 | 
split_dfs["train"] = df.iloc[:length1] 402 | 403 | length2 = int(instance_num * (train_ratio + val_ratio)) 404 | split_dfs["valid"] = df.iloc[length1:length2] 405 | split_dfs["test"] = df.iloc[length2:] 406 | 407 | filter_review_id_list = get_filter_review_id_list( 408 | filter_review_id_list_paths=filter_review_id_list_paths, 409 | ) 410 | label_conv_review_id_list = get_label_conv_review_id_list( 411 | label_conv_review_id_list_paths=label_conv_review_id_list_paths, 412 | ) 413 | 414 | for eval_type in ("valid", "test"): 415 | if filter_review_id_list.get(eval_type): 416 | df = split_dfs[eval_type] 417 | df = df[~df["review_id"].isin(filter_review_id_list[eval_type])] 418 | split_dfs[eval_type] = df 419 | 420 | for eval_type in ("valid", "test"): 421 | if label_conv_review_id_list.get(eval_type): 422 | df = split_dfs[eval_type] 423 | df = df.assign( 424 | converted_label=df["review_id"].map(label_conv_review_id_list["valid"]) 425 | ) 426 | df = df.assign( 427 | label=df[["label", "converted_label"]].apply( 428 | lambda xs: xs["label"] 429 | if pd.isnull(xs["converted_label"]) 430 | else xs["converted_label"], 431 | axis=1, 432 | ) 433 | ) 434 | df = df.drop(columns=["converted_label"]) 435 | split_dfs[eval_type] = df 436 | 437 | return { 438 | "train": split_dfs["train"], 439 | "valid": split_dfs["valid"], 440 | } 441 | 442 | 443 | def preprocess_for_marc_ja( 444 | config: MarcJaConfig, 445 | data_file_path: str, 446 | filter_review_id_list_paths: Dict[str, str], 447 | label_conv_review_id_list_paths: Dict[str, str], 448 | ) -> Dict[str, pd.DataFrame]: 449 | try: 450 | import mojimoji 451 | 452 | def han_to_zen(text: str) -> str: 453 | return mojimoji.han_to_zen(text) 454 | 455 | except ImportError: 456 | warnings.warn( 457 | "can't import `mojimoji`, failing back to method that do nothing. " 458 | "We recommend running `pip install mojimoji` to reproduce the original preprocessing.", 459 | UserWarning, 460 | ) 461 | 462 | def han_to_zen(text: str) -> str: 463 | return text 464 | 465 | try: 466 | from bs4 import BeautifulSoup 467 | 468 | def cleanup_text(text: str) -> str: 469 | return BeautifulSoup(text, "html.parser").get_text() 470 | 471 | except ImportError: 472 | warnings.warn( 473 | "can't import `beautifulsoup4`, failing back to method that do nothing." 
474 | "We recommend running `pip install beautifulsoup4` to reproduce the original preprocessing.", 475 | UserWarning, 476 | ) 477 | 478 | def cleanup_text(text: str) -> str: 479 | return text 480 | 481 | from tqdm import tqdm 482 | 483 | df = pd.read_csv(data_file_path, delimiter="\t") 484 | df = df[["review_body", "star_rating", "review_id"]] 485 | 486 | # rename columns 487 | df = df.rename(columns={"review_body": "text", "star_rating": "rating"}) 488 | 489 | # convert the rating to label 490 | tqdm.pandas(dynamic_ncols=True, desc="Convert the rating to the label") 491 | df = df.assign( 492 | label=df["rating"].progress_apply( 493 | lambda rating: get_label(rating, config.is_pos_neg) 494 | ) 495 | ) 496 | 497 | # remove rows where the label is None 498 | df = df[~df["label"].isnull()] 499 | 500 | # remove html tags from the text 501 | tqdm.pandas(dynamic_ncols=True, desc="Remove html tags from the text") 502 | df = df.assign(text=df["text"].progress_apply(cleanup_text)) 503 | 504 | # filter by ascii rate 505 | tqdm.pandas(dynamic_ncols=True, desc="Filter by ascii rate") 506 | df = df[~df["text"].progress_apply(is_filtered_by_ascii_rate)] 507 | 508 | if config.max_char_length is not None: 509 | df = df[df["text"].str.len() <= config.max_char_length] 510 | 511 | if config.is_han_to_zen: 512 | df = df.assign(text=df["text"].apply(han_to_zen)) 513 | 514 | df = df[["text", "label", "review_id"]] 515 | df = df.rename(columns={"text": "sentence"}) 516 | 517 | # shuffle dataset 518 | df = shuffle_dataframe(df) 519 | 520 | split_dfs = output_data( 521 | df=df, 522 | train_ratio=config.train_ratio, 523 | val_ratio=config.val_ratio, 524 | test_ratio=config.test_ratio, 525 | output_testset=config.output_testset, 526 | filter_review_id_list_paths=filter_review_id_list_paths, 527 | label_conv_review_id_list_paths=label_conv_review_id_list_paths, 528 | ) 529 | return split_dfs 530 | 531 | 532 | class JGLUE(ds.GeneratorBasedBuilder): 533 | JGLUE_VERSION = ds.Version("1.2.0") 534 | JCOLA_VERSION = ds.Version("1.0.0") 535 | 536 | BUILDER_CONFIG_CLASS = JGLUEConfig 537 | BUILDER_CONFIGS = [ 538 | MarcJaConfig( 539 | name="MARC-ja", 540 | version=JGLUE_VERSION, 541 | description=_DESCRIPTION_CONFIGS["MARC-ja"], 542 | ), 543 | JcolaConfig( 544 | name="JCoLA", 545 | version=JCOLA_VERSION, 546 | description=_DESCRIPTION_CONFIGS["JCoLA"], 547 | ), 548 | JGLUEConfig( 549 | name="JSTS", 550 | version=JGLUE_VERSION, 551 | description=_DESCRIPTION_CONFIGS["JSTS"], 552 | ), 553 | JGLUEConfig( 554 | name="JNLI", 555 | version=JGLUE_VERSION, 556 | description=_DESCRIPTION_CONFIGS["JNLI"], 557 | ), 558 | JGLUEConfig( 559 | name="JSQuAD", 560 | version=JGLUE_VERSION, 561 | description=_DESCRIPTION_CONFIGS["JSQuAD"], 562 | ), 563 | JGLUEConfig( 564 | name="JCommonsenseQA", 565 | version=JGLUE_VERSION, 566 | description=_DESCRIPTION_CONFIGS["JCommonsenseQA"], 567 | ), 568 | ] 569 | 570 | def _info(self) -> ds.DatasetInfo: 571 | if self.config.name == "JSTS": 572 | return dataset_info_jsts() 573 | elif self.config.name == "JNLI": 574 | return dataset_info_jnli() 575 | elif self.config.name == "JSQuAD": 576 | return dataset_info_jsquad() 577 | elif self.config.name == "JCommonsenseQA": 578 | return dataset_info_jcommonsenseqa() 579 | elif self.config.name == "JCoLA": 580 | return dataset_info_jcola() 581 | elif self.config.name == "MARC-ja": 582 | return dataset_info_marc_ja() 583 | else: 584 | raise ValueError(f"Invalid config name: {self.config.name}") 585 | 586 | def __split_generators_marc_ja(self, dl_manager: 
ds.DownloadManager): 587 | try: 588 | file_paths = dl_manager.download_and_extract(_URLS[self.config.name]) 589 | except FileNotFoundError as err: 590 | logger.warning(err) 591 | # An error occurs because the file cannot be downloaded from _URLS[MARC-ja]['data']. 592 | # So, remove the 'data' key and try to download again. 593 | urls = _URLS[self.config.name] 594 | urls.pop("data") # type: ignore[attr-defined] 595 | file_paths = dl_manager.download_and_extract(urls) 596 | 597 | filter_review_id_list = file_paths["filter_review_id_list"] 598 | label_conv_review_id_list = file_paths["label_conv_review_id_list"] 599 | 600 | try: 601 | split_dfs = preprocess_for_marc_ja( 602 | config=self.config, 603 | data_file_path=file_paths["data"], 604 | filter_review_id_list_paths=filter_review_id_list, 605 | label_conv_review_id_list_paths=label_conv_review_id_list, 606 | ) 607 | except KeyError as err: 608 | from urllib.parse import urljoin 609 | 610 | logger.warning(err) 611 | 612 | base_url = "https://huggingface.co/datasets/shunk031/JGLUE/resolve/refs%2Fconvert%2Fparquet/MARC-ja/" 613 | marcja_parquet_urls = { 614 | "train": urljoin(base_url, "jglue-train.parquet"), 615 | "valid": urljoin(base_url, "jglue-validation.parquet"), 616 | } 617 | file_paths = dl_manager.download_and_extract(marcja_parquet_urls) 618 | split_dfs = {k: pd.read_parquet(v) for k, v in file_paths.items()} 619 | 620 | return [ 621 | ds.SplitGenerator( 622 | name=ds.Split.TRAIN, 623 | gen_kwargs={"split_df": split_dfs["train"]}, 624 | ), 625 | ds.SplitGenerator( 626 | name=ds.Split.VALIDATION, 627 | gen_kwargs={"split_df": split_dfs["valid"]}, 628 | ), 629 | ] 630 | 631 | def __split_generators_jcola(self, dl_manager: ds.DownloadManager): 632 | file_paths = dl_manager.download_and_extract(_URLS[self.config.name]) 633 | 634 | return [ 635 | ds.SplitGenerator( 636 | name=ds.Split.TRAIN, 637 | gen_kwargs={"file_path": file_paths["train"]["in_domain"]["json"]}, 638 | ), 639 | ds.SplitGenerator( 640 | name=ds.Split.VALIDATION, 641 | gen_kwargs={"file_path": file_paths["valid"]["in_domain"]["json"]}, 642 | ), 643 | ds.SplitGenerator( 644 | name=ds.NamedSplit("validation_out_of_domain"), 645 | gen_kwargs={"file_path": file_paths["valid"]["out_of_domain"]["json"]}, 646 | ), 647 | ds.SplitGenerator( 648 | name=ds.NamedSplit("validation_out_of_domain_annotated"), 649 | gen_kwargs={ 650 | "file_path": file_paths["valid"]["out_of_domain"]["json_annotated"] 651 | }, 652 | ), 653 | ] 654 | 655 | def __split_generators(self, dl_manager: ds.DownloadManager): 656 | file_paths = dl_manager.download_and_extract(_URLS[self.config.name]) 657 | 658 | return [ 659 | ds.SplitGenerator( 660 | name=ds.Split.TRAIN, 661 | gen_kwargs={"file_path": file_paths["train"]}, 662 | ), 663 | ds.SplitGenerator( 664 | name=ds.Split.VALIDATION, 665 | gen_kwargs={"file_path": file_paths["valid"]}, 666 | ), 667 | ] 668 | 669 | def _split_generators(self, dl_manager: ds.DownloadManager): 670 | if self.config.name == "MARC-ja": 671 | return self.__split_generators_marc_ja(dl_manager) 672 | elif self.config.name == "JCoLA": 673 | return self.__split_generators_jcola(dl_manager) 674 | else: 675 | return self.__split_generators(dl_manager) 676 | 677 | def __generate_examples_marc_ja(self, split_df: Optional[pd.DataFrame] = None): 678 | if split_df is None: 679 | raise ValueError(f"Invalid preprocessing for {self.config.name}") 680 | 681 | instances = split_df.to_dict(orient="records") 682 | for i, data_dict in enumerate(instances): 683 | yield i, data_dict 684 | 685 | def 
__generate_examples_jcola(self, file_path: Optional[str] = None): 686 | if file_path is None: 687 | raise ValueError(f"Invalid argument for {self.config.name}") 688 | 689 | def convert_label(json_dict): 690 | label_int = json_dict["label"] 691 | label_str = "unacceptable" if label_int == 0 else "acceptable" 692 | json_dict["label"] = label_str 693 | return json_dict 694 | 695 | def convert_addntional_info(json_dict): 696 | json_dict["translation"] = json_dict.get("translation") 697 | json_dict["gloss"] = json_dict.get("gloss") 698 | return json_dict 699 | 700 | def convert_phenomenon(json_dict): 701 | argument_structure = json_dict.get("Arg. Str.") 702 | 703 | def json_pop(key): 704 | return json_dict.pop(key) if argument_structure is not None else None 705 | 706 | json_dict["linguistic_phenomenon"] = { 707 | "argument_structure": json_pop("Arg. Str."), 708 | "binding": json_pop("binding"), 709 | "control_raising": json_pop("control/raising"), 710 | "ellipsis": json_pop("ellipsis"), 711 | "filler_gap": json_pop("filler-gap"), 712 | "island_effects": json_pop("island effects"), 713 | "morphology": json_pop("morphology"), 714 | "nominal_structure": json_pop("nominal structure"), 715 | "negative_polarity_concord_items": json_pop("NPI/NCI"), 716 | "quantifier": json_pop("quantifier"), 717 | "verbal_agreement": json_pop("verbal agr."), 718 | "simple": json_pop("simple"), 719 | } 720 | return json_dict 721 | 722 | with open(file_path, "r", encoding="utf-8") as rf: 723 | for i, line in enumerate(rf): 724 | json_dict = json.loads(line) 725 | 726 | example = convert_label(json_dict) 727 | example = convert_addntional_info(example) 728 | example = convert_phenomenon(example) 729 | 730 | yield i, example 731 | 732 | def __generate_examples_jsquad(self, file_path: Optional[str] = None): 733 | if file_path is None: 734 | raise ValueError(f"Invalid argument for {self.config.name}") 735 | 736 | with open(file_path, "r", encoding="utf-8") as rf: 737 | json_data = json.load(rf) 738 | 739 | for json_dict in json_data["data"]: 740 | title = json_dict["title"] 741 | paragraphs = json_dict["paragraphs"] 742 | 743 | for paragraph in paragraphs: 744 | context = paragraph["context"] 745 | questions = paragraph["qas"] 746 | 747 | for question_dict in questions: 748 | q_id = question_dict["id"] 749 | question = question_dict["question"] 750 | answers = question_dict["answers"] 751 | is_impossible = question_dict["is_impossible"] 752 | 753 | example_dict = { 754 | "id": q_id, 755 | "title": title, 756 | "context": context, 757 | "question": question, 758 | "answers": answers, 759 | "is_impossible": is_impossible, 760 | } 761 | 762 | yield q_id, example_dict 763 | 764 | def __generate_examples_jcommonsenseqa(self, file_path: Optional[str] = None): 765 | if file_path is None: 766 | raise ValueError(f"Invalid argument for {self.config.name}") 767 | 768 | with open(file_path, "r", encoding="utf-8") as rf: 769 | for i, line in enumerate(rf): 770 | json_dict = json.loads(line) 771 | json_dict["label"] = f"choice{json_dict['label']}" 772 | yield i, json_dict 773 | 774 | def __generate_examples(self, file_path: Optional[str] = None): 775 | if file_path is None: 776 | raise ValueError(f"Invalid argument for {self.config.name}") 777 | 778 | with open(file_path, "r", encoding="utf-8") as rf: 779 | for i, line in enumerate(rf): 780 | json_dict = json.loads(line) 781 | yield i, json_dict 782 | 783 | def _generate_examples( 784 | self, 785 | file_path: Optional[str] = None, 786 | split_df: Optional[pd.DataFrame] = None, 787 | ): 
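# Dispatch to the config-specific generator: MARC-ja yields rows from the
# preprocessed DataFrame, while every other configuration reads the downloaded
# JSON / JSON Lines file referenced by file_path.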
788 | if self.config.name == "MARC-ja": 789 | yield from self.__generate_examples_marc_ja(split_df) 790 | 791 | elif self.config.name == "JCoLA": 792 | yield from self.__generate_examples_jcola(file_path) 793 | 794 | elif self.config.name == "JSQuAD": 795 | yield from self.__generate_examples_jsquad(file_path) 796 | 797 | elif self.config.name == "JCommonsenseQA": 798 | yield from self.__generate_examples_jcommonsenseqa(file_path) 799 | 800 | else: 801 | yield from self.__generate_examples(file_path) 802 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Installation 3 | # 4 | 5 | .PHONY: setup 6 | setup: 7 | pip install -U uv 8 | 9 | .PHONY: install 10 | install: 11 | uv sync 12 | 13 | # 14 | # linter/formatter/typecheck 15 | # 16 | 17 | .PHONY: lint 18 | lint: install 19 | uv run ruff check --output-format=github . 20 | 21 | .PHONY: format 22 | format: install 23 | uv run ruff format --check --diff . 24 | 25 | .PHONY: typecheck 26 | typecheck: install 27 | uv run mypy --cache-dir=/dev/null . 28 | 29 | .PHONY: test 30 | test: install 31 | uv run pytest -vsx --log-cli-level=INFO 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | annotations_creators: 3 | - crowdsourced 4 | language: 5 | - ja 6 | language_creators: 7 | - crowdsourced 8 | - found 9 | license: 10 | - cc-by-4.0 11 | multilinguality: 12 | - monolingual 13 | pretty_name: JGLUE 14 | size_categories: [] 15 | source_datasets: 16 | - original 17 | tags: 18 | - MARC 19 | - CoLA 20 | - STS 21 | - NLI 22 | - SQuAD 23 | - CommonsenseQA 24 | task_categories: 25 | - multiple-choice 26 | - question-answering 27 | - sentence-similarity 28 | - text-classification 29 | task_ids: 30 | - multiple-choice-qa 31 | - open-domain-qa 32 | - multi-class-classification 33 | - sentiment-classification 34 | --- 35 | 36 | # Dataset Card for JGLUE 37 | 38 |
39 | Badges: CI · Sync to Hugging Face Hub · [LREC 2022 (2022.lrec-1.317)](https://aclanthology.org/2022.lrec-1.317/) · [Hugging Face Datasets Hub](https://huggingface.co/datasets/shunk031/JGLUE)
52 | 53 | This dataset loading script is developed on [GitHub](https://github.com/shunk031/huggingface-datasets_JGLUE). 54 | Please feel free to open an [issue](https://github.com/shunk031/huggingface-datasets_JGLUE/issues/new/choose) or [pull request](https://github.com/shunk031/huggingface-datasets_JGLUE/pulls). 55 | 56 | > [!IMPORTANT] 57 | > The version of this loading script has been updated to correspond to the version of JGLUE. 58 | > Please check the release history at [yahoojapan/JGLUE/releases](https://github.com/yahoojapan/JGLUE/releases) and [shunk031/huggingface-datasets_JGLUE/releases](https://github.com/shunk031/huggingface-datasets_JGLUE/releases). 59 | 60 | ## Table of Contents 61 | - [Table of Contents](#table-of-contents) 62 | - [Dataset Description](#dataset-description) 63 | - [Dataset Summary](#dataset-summary) 64 | - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards) 65 | - [Languages](#languages) 66 | - [Dataset Structure](#dataset-structure) 67 | - [Data Instances](#data-instances) 68 | - [Data Fields](#data-fields) 69 | - [Data Splits](#data-splits) 70 | - [Dataset Creation](#dataset-creation) 71 | - [Curation Rationale](#curation-rationale) 72 | - [Source Data](#source-data) 73 | - [Annotations](#annotations) 74 | - [Personal and Sensitive Information](#personal-and-sensitive-information) 75 | - [Considerations for Using the Data](#considerations-for-using-the-data) 76 | - [Social Impact of Dataset](#social-impact-of-dataset) 77 | - [Discussion of Biases](#discussion-of-biases) 78 | - [Other Known Limitations](#other-known-limitations) 79 | - [Additional Information](#additional-information) 80 | - [Dataset Curators](#dataset-curators) 81 | - [Licensing Information](#licensing-information) 82 | - [Citation Information](#citation-information) 83 | - [Contributions](#contributions) 84 | 85 | ## Dataset Description 86 | 87 | - **Homepage:** https://github.com/yahoojapan/JGLUE 88 | - **Repository:** https://github.com/shunk031/huggingface-datasets_JGLUE 89 | 90 | ### Dataset Summary 91 | 92 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jglue-japanese-general-language-understanding-evaluation): 93 | 94 | > JGLUE, Japanese General Language Understanding Evaluation, is built to measure the general NLU ability in Japanese. JGLUE has been constructed from scratch without translation. We hope that JGLUE will facilitate NLU research in Japanese. 95 | 96 | > JGLUE has been constructed by a joint research project of Yahoo Japan Corporation and Kawahara Lab at Waseda University. 97 | 98 | ### Supported Tasks and Leaderboards 99 | 100 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#tasksdatasets): 101 | 102 | > JGLUE consists of the tasks of text classification, sentence pair classification, and QA. Each task consists of multiple datasets. 103 | 104 | #### Supported Tasks 105 | 106 | ##### MARC-ja 107 | 108 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#marc-ja): 109 | 110 | > MARC-ja is a dataset of the text classification task. This dataset is based on the Japanese portion of [Multilingual Amazon Reviews Corpus (MARC)](https://docs.opendata.aws/amazon-reviews-ml/readme.html) ([Keung+, 2020](https://aclanthology.org/2020.emnlp-main.369/)). 
111 | 112 | ##### JCoLA 113 | 114 | From [JCoLA's README.md](https://github.com/osekilab/JCoLA#jcola-japanese-corpus-of-linguistic-acceptability) 115 | 116 | > JCoLA (Japanese Corpus of Linguistic Accept010 ability) is a novel dataset for targeted syntactic evaluations of language models in Japanese, which consists of 10,020 sentences with acceptability judgments by linguists. The sentences are manually extracted from linguistics journals, handbooks and textbooks. JCoLA is included in [JGLUE benchmark](https://github.com/yahoojapan/JGLUE) (Kurihara et al., 2022). 117 | 118 | ##### JSTS 119 | 120 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jsts): 121 | 122 | > JSTS is a Japanese version of the STS (Semantic Textual Similarity) dataset. STS is a task to estimate the semantic similarity of a sentence pair. The sentences in JSTS and JNLI (described below) are extracted from the Japanese version of the MS COCO Caption Dataset, [the YJ Captions Dataset](https://github.com/yahoojapan/YJCaptions) ([Miyazaki and Shimizu, 2016](https://aclanthology.org/P16-1168/)). 123 | 124 | ##### JNLI 125 | 126 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jnli): 127 | 128 | > JNLI is a Japanese version of the NLI (Natural Language Inference) dataset. NLI is a task to recognize the inference relation that a premise sentence has to a hypothesis sentence. The inference relations are entailment, contradiction, and neutral. 129 | 130 | ##### JSQuAD 131 | 132 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jsquad): 133 | 134 | > JSQuAD is a Japanese version of [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) ([Rajpurkar+, 2018](https://aclanthology.org/P18-2124/)), one of the datasets of reading comprehension. Each instance in the dataset consists of a question regarding a given context (Wikipedia article) and its answer. JSQuAD is based on SQuAD 1.1 (there are no unanswerable questions). We used [the Japanese Wikipedia dump](https://dumps.wikimedia.org/jawiki/) as of 20211101. 135 | 136 | ##### JCommonsenseQA 137 | 138 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jcommonsenseqa): 139 | 140 | > JCommonsenseQA is a Japanese version of [CommonsenseQA](https://www.tau-nlp.org/commonsenseqa) ([Talmor+, 2019](https://aclanthology.org/N19-1421/)), which is a multiple-choice question answering dataset that requires commonsense reasoning ability. It is built using crowdsourcing with seeds extracted from the knowledge base [ConceptNet](https://conceptnet.io/). 141 | 142 | #### Leaderboard 143 | 144 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#leaderboard): 145 | 146 | > A leaderboard will be made public soon. The test set will be released at that time. 147 | 148 | ### Languages 149 | 150 | The language data in JGLUE is in Japanese ([BCP-47 ja-JP](https://www.rfc-editor.org/info/bcp47)). 
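Each configuration below is loaded through the `JGLUE.py` script in this repository. Recent versions of the `datasets` library require an explicit opt-in for script-based datasets, so the examples that follow may additionally need `trust_remote_code=True` (a minimal sketch of the general pattern):

```python
from datasets import load_dataset

# Script-based datasets such as this one need an explicit opt-in on recent
# versions of the `datasets` library.
dataset = load_dataset("shunk031/JGLUE", name="JSTS", trust_remote_code=True)
print(dataset)
```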
151 | 152 | ## Dataset Structure 153 | 154 | ### Data Instances 155 | 156 | When loading a specific configuration, users has to append a version dependent suffix: 157 | 158 | #### MARC-ja 159 | 160 | ```python 161 | from datasets import load_dataset 162 | 163 | dataset = load_dataset("shunk031/JGLUE", name="MARC-ja") 164 | 165 | print(dataset) 166 | # DatasetDict({ 167 | # train: Dataset({ 168 | # features: ['sentence', 'label', 'review_id'], 169 | # num_rows: 187528 170 | # }) 171 | # validation: Dataset({ 172 | # features: ['sentence', 'label', 'review_id'], 173 | # num_rows: 5654 174 | # }) 175 | # }) 176 | ``` 177 | 178 | #### JCoLA 179 | 180 | ```python 181 | from datasets import load_dataset 182 | 183 | dataset = load_dataset("shunk031/JGLUE", name="JCoLA") 184 | 185 | print(dataset) 186 | # DatasetDict({ 187 | # train: Dataset({ 188 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'], 189 | # num_rows: 6919 190 | # }) 191 | # validation: Dataset({ 192 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'], 193 | # num_rows: 865 194 | # }) 195 | # validation_out_of_domain: Dataset({ 196 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'], 197 | # num_rows: 685 198 | # }) 199 | # validation_out_of_domain_annotated: Dataset({ 200 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'], 201 | # num_rows: 685 202 | # }) 203 | # }) 204 | ``` 205 | 206 | An example of the JCoLA dataset (validation - out of domain annotated) looks as follows: 207 | 208 | ```json 209 | { 210 | "uid": 9109, 211 | "source": "Asano_and_Ura_2010", 212 | "label": 1, 213 | "diacritic": "g", 214 | "sentence": "太郎のゴミの捨て方について話した。", 215 | "original": "太郎のゴミの捨て方", 216 | "translation": "‘The way (for Taro) to throw out garbage’", 217 | "gloss": true, 218 | "linguistic_phenomenon": { 219 | "argument_structure": true, 220 | "binding": false, 221 | "control_raising": false, 222 | "ellipsis": false, 223 | "filler_gap": false, 224 | "island_effects": false, 225 | "morphology": false, 226 | "nominal_structure": false, 227 | "negative_polarity_concord_items": false, 228 | "quantifier": false, 229 | "verbal_agreement": false, 230 | "simple": false 231 | } 232 | } 233 | ``` 234 | 235 | #### JSTS 236 | 237 | ```python 238 | from datasets import load_dataset 239 | 240 | dataset = load_dataset("shunk031/JGLUE", name="JSTS") 241 | 242 | print(dataset) 243 | # DatasetDict({ 244 | # train: Dataset({ 245 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'], 246 | # num_rows: 12451 247 | # }) 248 | # validation: Dataset({ 249 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'], 250 | # num_rows: 1457 251 | # }) 252 | # }) 253 | ``` 254 | 255 | An example of the JSTS dataset looks as follows: 256 | 257 | ```json 258 | { 259 | "sentence_pair_id": "691", 260 | "yjcaptions_id": "127202-129817-129818", 261 | "sentence1": "街中の道路を大きなバスが走っています。 (A big bus is running on the road in the city.)", 262 | "sentence2": "道路を大きなバスが走っています。 (There is a big bus running on the road.)", 263 | "label": 4.4 264 | } 265 | ``` 266 | 267 | #### JNLI 268 | 269 | ```python 270 | from datasets import load_dataset 271 | 272 | dataset = load_dataset("shunk031/JGLUE", 
name="JNLI") 273 | 274 | print(dataset) 275 | # DatasetDict({ 276 | # train: Dataset({ 277 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'], 278 | # num_rows: 20073 279 | # }) 280 | # validation: Dataset({ 281 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'], 282 | # num_rows: 2434 283 | # }) 284 | # }) 285 | ``` 286 | 287 | An example of the JNLI dataset looks as follows: 288 | 289 | ```json 290 | { 291 | "sentence_pair_id": "1157", 292 | "yjcaptions_id": "127202-129817-129818", 293 | "sentence1": "街中の道路を大きなバスが走っています。 (A big bus is running on the road in the city.)", 294 | "sentence2": "道路を大きなバスが走っています。 (There is a big bus running on the road.)", 295 | "label": "entailment" 296 | } 297 | ``` 298 | 299 | #### JSQuAD 300 | 301 | ```python 302 | from datasets import load_dataset 303 | 304 | dataset = load_dataset("shunk031/JGLUE", name="JSQuAD") 305 | 306 | print(dataset) 307 | # DatasetDict({ 308 | # train: Dataset({ 309 | # features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible'], 310 | # num_rows: 62859 311 | # }) 312 | # validation: Dataset({ 313 | # features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible'], 314 | # num_rows: 4442 315 | # }) 316 | # }) 317 | ``` 318 | 319 | An example of the JSQuAD looks as follows: 320 | 321 | ```json 322 | { 323 | "id": "a1531320p0q0", 324 | "title": "東海道新幹線", 325 | "context": "東海道新幹線 [SEP] 1987 年(昭和 62 年)4 月 1 日の国鉄分割民営化により、JR 東海が運営を継承した。西日本旅客鉄道(JR 西日本)が継承した山陽新幹線とは相互乗り入れが行われており、東海道新幹線区間のみで運転される列車にも JR 西日本所有の車両が使用されることがある。2020 年(令和 2 年)3 月現在、東京駅 - 新大阪駅間の所要時間は最速 2 時間 21 分、最高速度 285 km/h で運行されている。", 326 | "question": "2020 年(令和 2 年)3 月現在、東京駅 - 新大阪駅間の最高速度はどのくらいか。", 327 | "answers": { 328 | "text": ["285 km/h"], 329 | "answer_start": [182] 330 | }, 331 | "is_impossible": false 332 | } 333 | ``` 334 | 335 | #### JCommonsenseQA 336 | 337 | ```python 338 | from datasets import load_dataset 339 | 340 | dataset = load_dataset("shunk031/JGLUE", name="JCommonsenseQA") 341 | 342 | print(dataset) 343 | # DatasetDict({ 344 | # train: Dataset({ 345 | # features: ['q_id', 'question', 'choice0', 'choice1', 'choice2', 'choice3', 'choice4', 'label'], 346 | # num_rows: 8939 347 | # }) 348 | # validation: Dataset({ 349 | # features: ['q_id', 'question', 'choice0', 'choice1', 'choice2', 'choice3', 'choice4', 'label'], 350 | # num_rows: 1119 351 | # }) 352 | # }) 353 | ``` 354 | 355 | An example of the JCommonsenseQA looks as follows: 356 | 357 | ```json 358 | { 359 | "q_id": 3016, 360 | "question": "会社の最高責任者を何というか? (What do you call the chief executive officer of a company?)", 361 | "choice0": "社長 (president)", 362 | "choice1": "教師 (teacher)", 363 | "choice2": "部長 (manager)", 364 | "choice3": "バイト (part-time worker)", 365 | "choice4": "部下 (subordinate)", 366 | "label": 0 367 | } 368 | ``` 369 | 370 | ### Data Fields 371 | 372 | #### MARC-ja 373 | 374 | - `sentence_pair_id`: ID of the sentence pair 375 | - `yjcaptions_id`: sentence ids in yjcaptions (explained below) 376 | - `sentence1`: first sentence 377 | - `sentence2`: second sentence 378 | - `label`: sentence similarity: 5 (equivalent meaning) - 0 (completely different meaning) 379 | 380 | ##### Explanation for `yjcaptions_id` 381 | 382 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#explanation-for-yjcaptions_id), there are the following two cases: 383 | 384 | 1. 
sentence pairs in one image: `(image id)-(sentence1 id)-(sentence2 id)` 385 | - e.g., 723-844-847 386 | - a sentence id starting with "g" means a sentence generated by a crowdworker (e.g., 69501-75698-g103): only for JNLI 387 | 2. sentence pairs in two images: `(image id of sentence1)_(image id of sentence2)-(sentence1 id)-(sentence2 id)` 388 | - e.g., 91337_217583-96105-91680 389 | 390 | #### JCoLA 391 | 392 | From [JCoLA's README.md](https://github.com/osekilab/JCoLA#data-description) and [JCoLA's paper](https://arxiv.org/abs/2309.12676) 393 | 394 | - `uid`: unique id of the sentence 395 | - `source`: author and the year of publication of the source article 396 | - `label`: acceptability judgement label (0 for unacceptable, 1 for acceptable) 397 | - `diacritic`: acceptability judgement as originally notated in the source article 398 | - `sentence`: sentence (modified by the author if needed) 399 | - `original`: original sentence as presented in the source article 400 | - `translation`: English translation of the sentence as presentend in the source article (if any) 401 | - `gloss`: gloss of the sentence as presented in the source article (if any) 402 | - `linguistic_phenomenon` 403 | - `argument_structure`: acceptability judgements based on the order of arguments and case marking 404 | - `binding`: acceptability judgements based on the binding of noun phrases 405 | - `control_raising`: acceptability judgements based on predicates that are categorized as control or raising 406 | - `ellipsis`: acceptability judgements based on the possibility of omitting elements in the sentences 407 | - `filler_gap`: acceptability judgements based on the dependency between the moved element and the gap 408 | - `island effects`: acceptability judgements based on the restrictions on filler-gap dependencies such as wh-movements 409 | - `morphology`: acceptability judgements based on the morphology 410 | - `nominal_structure`: acceptability judgements based on the internal structure of noun phrases 411 | - `negative_polarity_concord_items`: acceptability judgements based on the restrictions on where negative polarity/concord items (NPIs/NCIs) can appear 412 | - `quantifiers`: acceptability judgements based on the distribution of quantifiers such as floating quantifiers 413 | - `verbal_agreement`: acceptability judgements based on the dependency between subjects and verbs 414 | - `simple`: acceptability judgements that do not have marked syntactic structures 415 | 416 | #### JNLI 417 | 418 | - `sentence_pair_id`: ID of the sentence pair 419 | - `yjcaptions_id`: sentence ids in the yjcaptions 420 | - `sentence1`: premise sentence 421 | - `sentence2`: hypothesis sentence 422 | - `label`: inference relation 423 | 424 | #### JSQuAD 425 | 426 | - `title`: title of a Wikipedia article 427 | - `paragraphs`: a set of paragraphs 428 | - `qas`: a set of pairs of a question and its answer 429 | - `question`: question 430 | - `id`: id of a question 431 | - `answers`: a set of answers 432 | - `text`: answer text 433 | - `answer_start`: start position (character index) 434 | - `is_impossible`: all the values are false 435 | - `context`: a concatenation of the title and paragraph 436 | 437 | #### JCommonsenseQA 438 | 439 | - `q_id`: ID of the question 440 | - `question`: question 441 | - `choice{0..4}`: choice 442 | - `label`: correct choice id 443 | 444 | ### Data Splits 445 | 446 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE/blob/main/README.md#tasksdatasets): 447 | 448 | > Only train/dev sets are 
available now, and the test set will be available after the leaderboard is made public. 449 | 450 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676): 451 | 452 | > The in-domain data is split into training data (6,919 instances), development data (865 instances), and test data (865 instances). On the other hand, the out-of-domain data is only used for evaluation, and divided into development data (685 instances) and test data (686 instances). 453 | 454 | | Task | Dataset | Train | Dev | Test | 455 | |------------------------------|----------------|--------:|------:|------:| 456 | | Text Classification | MARC-ja | 187,528 | 5,654 | 5,639 | 457 | | | JCoLA | 6,919 | 865† / 685‡ | 865† / 685‡ | 458 | | Sentence Pair Classification | JSTS | 12,451 | 1,457 | 1,589 | 459 | | | JNLI | 20,073 | 2,434 | 2,508 | 460 | | Question Answering | JSQuAD | 62,859 | 4,442 | 4,420 | 461 | | | JCommonsenseQA | 8,939 | 1,119 | 1,118 | 462 | 463 | > JCoLA: † in domain. ‡ out of domain. 464 | 465 | ## Dataset Creation 466 | 467 | ### Curation Rationale 468 | 469 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/): 470 | 471 | > JGLUE is designed to cover a wide range of GLUE and SuperGLUE tasks and consists of three kinds of tasks: text classification, sentence pair classification, and question answering. 472 | 473 | ### Source Data 474 | 475 | #### Initial Data Collection and Normalization 476 | 477 | [More Information Needed] 478 | 479 | #### Who are the source language producers? 480 | 481 | - The source language producers are users of Amazon (MARC-ja), crowd-workers of [Yahoo! Crowdsourcing](https://crowdsourcing.yahoo.co.jp/) (JSTS, JNLI and JCommonsenseQA), writers of the Japanese Wikipedia (JSQuAD), crowd-workers of [Lancers](https://www.lancers.jp/). 482 | 483 | ### Annotations 484 | 485 | #### Annotation process 486 | 487 | ##### MARC-ja 488 | 489 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/): 490 | 491 | > As one of the text classification datasets, we build a dataset based on the Multilingual Amazon Reviews Corpus (MARC) (Keung et al., 2020). MARC is a multilingual corpus of product reviews with 5-level star ratings (1-5) on the Amazon shopping site. This corpus covers six languages, including English and Japanese. For JGLUE, we use the Japanese part of MARC and to make it easy for both humans and computers to judge a class label, we cast the text classification task as a binary classification task, where 1- and 2-star ratings are converted to “negative”, and 4 and 5 are converted to “positive”. We do not use reviews with a 3-star rating. 492 | 493 | > One of the problems with MARC is that it sometimes contains data where the rating diverges from the review text. This happens, for example, when a review with positive content is given a rating of 1 or 2. These data degrade the quality of our dataset. To improve the quality of the dev/test instances used for evaluation, we crowdsource a positive/negative judgment task for approximately 12,000 reviews. We adopt only reviews with the same votes from 7 or more out of 10 workers and assign a label of the maximum votes to these reviews. We divide the resulting reviews into dev/test data. 494 | 495 | > We obtained 5,654 and 5,639 instances for the dev and test data, respectively, through the above procedure. For the training data, we extracted 187,528 instances directly from MARC without performing the cleaning procedure because of the large number of training instances. The statistics of MARC-ja are listed in Table 2. 
For the evaluation metric for MARC-ja, we use accuracy because it is a binary classification task of texts.
496 | 
497 | ##### JCoLA
498 | 
499 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
500 | 
501 | > ### 3 JCoLA
502 | > In this study, we introduce JCoLA (Japanese Corpus of Linguistic Acceptability), which will be the first large-scale acceptability judgment task dataset focusing on Japanese. JCoLA consists of sentences from textbooks and handbooks on Japanese syntax, as well as from journal articles on Japanese syntax that are published in JEAL (Journal of East Asian Linguistics), one of the prestigious journals in theoretical linguistics.
503 | 
504 | > #### 3.1 Data Collection
505 | > Sentences in JCoLA were collected from prominent textbooks and handbooks focusing on Japanese syntax. In addition to the main text, example sentences included in the footnotes were also considered for collection. We also collected acceptability judgments from journal articles on Japanese syntax published in JEAL (Journal of East Asian Linguistics): one of the prestigious journals in theoretical linguistics. Specifically, we examined all the articles published in JEAL between 2006 and 2015 (133 papers in total), and extracted 2,252 acceptability judgments from 26 papers on Japanese syntax (Table 2). Acceptability judgments include sentences in appendices and footnotes, but not sentences presented for analyses of syntactic structures (e.g. sentences with brackets to show their syntactic structures). As a result, a total of 11,984 example sentences were collected. Using this as a basis, JCoLA was constructed through the methodology explained in the following sections.
506 | 
507 | ##### JSTS and JNLI
508 | 
509 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
510 | 
511 | > For the sentence pair classification datasets, we construct a semantic textual similarity (STS) dataset, JSTS, and a natural language inference (NLI) dataset, JNLI.
512 | 
513 | > ### Overview
514 | > STS is a task of estimating the semantic similarity of a sentence pair. Gold similarity is usually assigned as an average of the integer values 0 (completely different meaning) to 5 (equivalent meaning) assigned by multiple workers through crowdsourcing.
515 | 
516 | > NLI is a task of recognizing the inference relation that a premise sentence has to a hypothesis sentence. Inference relations are generally defined by three labels: “entailment”, “contradiction”, and “neutral”. Gold inference relations are often assigned by majority voting after collecting answers from multiple workers through crowdsourcing.
517 | 
518 | > For the STS and NLI tasks, STS-B (Cer et al., 2017) and MultiNLI (Williams et al., 2018) are included in GLUE, respectively. As Japanese datasets, JSNLI (Yoshikoshi et al., 2020) is a machine translated dataset of the NLI dataset SNLI (Stanford NLI), and JSICK (Yanaka and Mineshima, 2021) is a human translated dataset of the STS/NLI dataset SICK (Marelli et al., 2014). As mentioned in Section 1, these have problems originating from automatic/manual translations. To solve this problem, we construct STS/NLI datasets in Japanese from scratch. We basically extract sentence pairs in JSTS and JNLI from the Japanese version of the MS COCO Caption Dataset (Chen et al., 2015), the YJ Captions Dataset (Miyazaki and Shimizu, 2016). 
Most of the sentence pairs in JSTS and JNLI overlap, allowing us to analyze the relationship between similarities and inference relations for the same sentence pairs like SICK and JSICK. 519 | 520 | > The similarity value in JSTS is assigned a real number from 0 to 5 as in STS-B. The inference relation in JNLI is assigned from the above three labels as in SNLI and MultiNLI. The definitions of the inference relations are also based on SNLI. 521 | 522 | > ### Method of Construction 523 | > Our construction flow for JSTS and JNLI is shown in Figure 1. Basically, two captions for the same image of YJ Captions are used as sentence pairs. For these sentence pairs, similarities and NLI relations of entailment and neutral are obtained by crowdsourcing. However, it is difficult to collect sentence pairs with low similarity and contradiction relations from captions for the same image. To solve this problem, we collect sentence pairs with low similarity from captions for different images. We collect contradiction relations by asking workers to write contradictory sentences for a given caption. 524 | 525 | > The detailed construction procedure for JSTS and JNLI is described below. 526 | > 1. We crowdsource an STS task using two captions for the same image from YJ Captions. We ask five workers to answer the similarity between two captions and take the mean value as the gold similarity. We delete sentence pairs with a large variance in the answers because such pairs have poor answer quality. We performed this task on 16,000 sentence pairs and deleted sentence pairs with a similarity variance of 1.0 or higher, resulting in the collection of 10,236 sentence pairs with gold similarity. We refer to this collected data as JSTS-A. 527 | > 2. To collect sentence pairs with low similarity, we crowdsource the same STS task as Step 1 using sentence pairs of captions for different images. We conducted this task on 4,000 sentence pairs and collected 2,970 sentence pairs with gold similarity. We refer to this collected data as JSTS-B. 528 | > 3. For JSTS-A, we crowdsource an NLI task. Since inference relations are directional, we obtain inference relations in both directions for sentence pairs. As mentioned earlier,it is difficult to collect instances of contradiction from JSTS-A, which was collected from the captions of the same images,and thus we collect instances of entailment and neutral in this step. We collect inference relation answers from 10 workers. If six or more people give the same answer, we adopt it as the gold label if it is entailment or neutral. To obtain inference relations in both directions for JSTS-A, we performed this task on 20,472 sentence pairs, twice as many as JSTS-A. As a result, we collected inference relations for 17,501 sentence pairs. We refer to this collected data as JNLI-A. We do not use JSTS-B for the NLI task because it is difficult to define and determine the inference relations between captions of different images. 529 | > 4. To collect NLI instances of contradiction, we crowdsource a task of writing four contradictory sentences for each caption in YJCaptions. From the written sentences, we remove sentence pairs with an edit distance of 0.75 or higher to remove low-quality sentences, such as short sentences and sentences with low relevance to the original sentence. Furthermore, we perform a one-way NLI task with 10 workers to verify whether the created sentence pairs are contradictory. Only the sentence pairs answered as contradiction by at least six workers are adopted. 
Finally, since the contradiction relation has no direction, we automatically assign contradiction in the opposite direction of the adopted sentence pairs. Using 1,800 captions, we acquired 7,200 sentence pairs, from which we collected 3,779 sentence pairs to which we assigned the one-way contradiction relation. By automatically assigning the contradiction relation in the opposite direction, we doubled the number of instances to 7,558. We refer to this collected data as JNLI-C.
530 | > 5. For the 3,779 sentence pairs collected in Step 4, we crowdsource an STS task, assigning similarity and filtering in the same way as in Steps 1 and 2. In this way, we collected 2,303 sentence pairs with gold similarity from 3,779 pairs. We refer to this collected data as JSTS-C.
531 | 
532 | ##### JSQuAD
533 | 
534 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
535 | 
536 | > As QA datasets, we build a Japanese version of SQuAD (Rajpurkar et al., 2016), one of the datasets of reading comprehension, and a Japanese version of CommonsenseQA, which is explained in the next section.
537 | 
538 | > Reading comprehension is the task of reading a document and answering questions about it. Many reading comprehension evaluation sets have been built in English, followed by those in other languages or multilingual ones.
539 | 
540 | > In Japanese, reading comprehension datasets for quizzes (Suzuki et al., 2018) and those in the driving domain (Takahashi et al., 2019) have been built, but none are in the general domain. We use Wikipedia to build a dataset for the general domain. The construction process is basically based on SQuAD 1.1 (Rajpurkar et al., 2016).
541 | 
542 | > First, to extract high-quality articles from Wikipedia, we use Nayuki, which estimates the quality of articles on the basis of hyperlinks in Wikipedia. We randomly chose 822 articles from the top-ranked 10,000 articles. For example, the articles include “熊本県 (Kumamoto Prefecture)” and “フランス料理 (French cuisine)”. Next, we divide an article into paragraphs, present each paragraph to crowdworkers, and ask them to write questions and answers that can be answered if one understands the paragraph. Figure 2 shows an example of JSQuAD. We ask workers to write two additional answers for the dev and test sets to make the system evaluation robust.
543 | 
544 | ##### JCommonsenseQA
545 | 
546 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
547 | 
548 | > ### Overview
549 | > JCommonsenseQA is a Japanese version of CommonsenseQA (Talmor et al., 2019), which consists of five choice QA to evaluate commonsense reasoning ability. Figure 3 shows examples of JCommonsenseQA. In the same way as CommonsenseQA, JCommonsenseQA is built using crowdsourcing with seeds extracted from the knowledge base ConceptNet (Speer et al., 2017). ConceptNet is a multilingual knowledge base that consists of triplets of two concepts and their relation. The triplets are directional and represented as (source concept, relation, target concept), for example (bullet train, AtLocation, station).
550 | 
551 | > ### Method of Construction
552 | > The construction flow for JCommonsenseQA is shown in Figure 4. First, we collect question sets (QSs) from ConceptNet, each of which consists of a source concept and three target concepts that have the same relation to the source concept. Next, for each QS, we crowdsource a task of writing a question with only one target concept as the answer and a task of adding two distractors. 
We describe the detailed construction procedure for JCommonsenseQA below, showing how it differs from CommonsenseQA.
553 | 
554 | > 1. We collect Japanese QSs from ConceptNet. CommonsenseQA uses only forward relations (source concept, relation, target concept) excluding general ones such as “RelatedTo” and “IsA”. JCommonsenseQA similarly uses a set of 22 relations, excluding general ones, but the direction of the relations is bidirectional to make the questions more diverse. In other words, we also use relations in the opposite direction (source concept, relation⁻¹, target concept). With this setup, we extracted 43,566 QSs with Japanese source/target concepts and randomly selected 7,500 from them.
555 | > 2. Some low-quality questions in CommonsenseQA contain distractors that can be considered to be an answer. To improve the quality of distractors, we add the following two processes that are not adopted in CommonsenseQA. First, if three target concepts of a QS include a spelling variation or a synonym of one another, this QS is removed. To identify spelling variations, we use the word ID of the morphological dictionary Juman Dic. Second, we crowdsource a task of judging whether target concepts contain a synonym. As a result, we adopted 5,920 QSs from 7,500.
556 | > 3. For each QS, we crowdsource a task of writing a question sentence in which only one from the three target concepts is an answer. In the example shown in Figure 4, “駅 (station)” is an answer, and the others are distractors. To remove low quality question sentences, we remove the following question sentences.
557 | > - Question sentences that contain a choice word (this is because such a question is easily solved).
558 | > - Question sentences that contain the expression “XX characters” (XX is a number).
559 | > - Improperly formatted question sentences that do not end with “?”.
560 | > - As a result, 5,920 × 3 = 17,760 question sentences were created, from which we adopted 15,310 by removing inappropriate question sentences.
561 | > 4. In CommonsenseQA, when adding distractors, one is selected from ConceptNet, and the other is created by crowdsourcing. In JCommonsenseQA, to have a wider variety of distractors, two distractors are created by crowdsourcing instead of selecting from ConceptNet. To improve the quality of the questions, we remove questions whose added distractors fall into one of the following categories:
562 | > - Distractors are included in a question sentence.
563 | > - Distractors overlap with one of existing choices.
564 | > - As a result, distractors were added to the 15,310 questions, of which we adopted 13,906.
565 | > 5. We asked three crowdworkers to answer each question and adopt only those answered correctly by at least two workers. As a result, we adopted 11,263 out of the 13,906 questions.
566 | 
567 | #### Who are the annotators?
568 | 
569 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE/blob/main/README.md#tasksdatasets):
570 | 
571 | > We use Yahoo! Crowdsourcing for all crowdsourcing tasks in constructing the datasets.
572 | 
573 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
574 | 
575 | > As a reference for the upper limit of accuracy in JCoLA, human acceptability judgment experiments were conducted on Lancers with a subset of the JCoLA data. 
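
Across the annotation procedures quoted above, a gold label is adopted only when enough workers agree: 7 or more of 10 workers for MARC-ja dev/test reviews, 6 or more of 10 workers for JNLI inference relations, and at least 2 of 3 workers for JCommonsenseQA answer verification. The snippet below is a minimal illustrative sketch of that agreement-based adoption rule, not code taken from JGLUE or from this repository; the function name `adopt_gold_label` and its interface are assumptions made purely for illustration.

```python
from collections import Counter
from typing import Hashable, Optional, Sequence


def adopt_gold_label(votes: Sequence[Hashable], min_agreement: int) -> Optional[Hashable]:
    """Hypothetical helper: return the most frequent vote if at least
    `min_agreement` workers gave it; otherwise return None (instance discarded)."""
    if not votes:
        return None
    label, count = Counter(votes).most_common(1)[0]
    return label if count >= min_agreement else None


# MARC-ja dev/test: adopt a review only when 7+ of 10 workers agree on its polarity.
print(adopt_gold_label(["positive"] * 8 + ["negative"] * 2, min_agreement=7))  # positive

# JNLI: adopt entailment/neutral labels only when 6+ of 10 workers agree.
print(adopt_gold_label(["entailment"] * 5 + ["neutral"] * 5, min_agreement=6))  # None (discarded)

# JCommonsenseQA: keep a question only if at least 2 of 3 workers picked the correct choice (here choice 0).
print(adopt_gold_label([0, 0, 2], min_agreement=2))  # 0
```

Instances that fail the threshold are simply not adopted, which is why the counts reported in the quoted construction steps shrink after each filtering stage.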
576 | 
577 | ### Personal and Sensitive Information
578 | 
579 | [More Information Needed]
580 | 
581 | ## Considerations for Using the Data
582 | 
583 | ### Social Impact of Dataset
584 | 
585 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
586 | 
587 | > We build a Japanese NLU benchmark, JGLUE, from scratch without translation to measure the general NLU ability in Japanese. We hope that JGLUE will facilitate NLU research in Japanese.
588 | 
589 | ### Discussion of Biases
590 | 
591 | [More Information Needed]
592 | 
593 | ### Other Known Limitations
594 | 
595 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
596 | 
597 | > All the sentences included in JCoLA have been extracted from textbooks, handbooks and journal articles on theoretical syntax. Therefore, those sentences are guaranteed to be theoretically meaningful, making JCoLA a challenging dataset. However, the distribution of linguistic phenomena directly reflects that of the source literature and thus turns out to be extremely skewed. Indeed, as can be seen in Table 3, while the number of sentences exceeds 100 for most linguistic phenomena, there are several linguistic phenomena for which there are only about 10 sentences. In addition, since it is difficult to force language models to interpret sentences given specific contexts, those sentences whose unacceptability depends on contexts were inevitably removed from JCoLA. This removal process resulted in the deletion of unacceptable sentences from some linguistic phenomena (such as ellipsis), consequently skewing the balance between acceptable and unacceptable sentences (with a higher proportion of acceptable sentences).
598 | 
599 | ## Additional Information
600 | 
601 | - [日本語言語理解ベンチマーク JGLUE の構築 〜 自然言語処理モデルの評価用データセットを公開しました (Building the Japanese language understanding benchmark JGLUE: evaluation datasets for NLP models are now publicly available) - Yahoo! JAPAN Tech Blog](https://techblog.yahoo.co.jp/entry/2022122030379907/)
602 | 
603 | ### Dataset Curators
604 | 
605 | #### MARC-ja
606 | 
607 | - Keung, Phillip, et al. "The Multilingual Amazon Reviews Corpus." Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2020.
608 | 
609 | #### JCoLA
610 | 
611 | - Someya, Sugimoto, and Oseki. "JCoLA: Japanese Corpus of Linguistic Acceptability." arXiv preprint arXiv:2309.12676 (2023).
612 | 
613 | #### JSTS and JNLI
614 | 
615 | - Miyazaki, Takashi, and Nobuyuki Shimizu. "Cross-lingual image caption generation." Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2016.
616 | 
617 | #### JSQuAD
618 | 
619 | The JGLUE authors curated the original data for JSQuAD from the Japanese Wikipedia dump.
620 | 
621 | #### JCommonsenseQA
622 | 
623 | In the same way as CommonsenseQA, JCommonsenseQA is built using crowdsourcing with seeds extracted from the knowledge base ConceptNet.
624 | 
625 | ### Licensing Information
626 | 
627 | #### JGLUE
628 | 
629 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#license):
630 | 
631 | > This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
632 | 
633 | #### JCoLA
634 | 
635 | From [JCoLA's README.md](https://github.com/osekilab/JCoLA#license):
636 | 
637 | > The text in this corpus is excerpted from the published works, and copyright (where applicable) remains with the original authors or publishers. We expect that research use within Japan is legal under fair use, but make no guarantee of this. 
638 | 639 | ### Citation Information 640 | 641 | #### JGLUE 642 | 643 | ```bibtex 644 | @inproceedings{kurihara-lrec-2022-jglue, 645 | title={JGLUE: Japanese general language understanding evaluation}, 646 | author={Kurihara, Kentaro and Kawahara, Daisuke and Shibata, Tomohide}, 647 | booktitle={Proceedings of the Thirteenth Language Resources and Evaluation Conference}, 648 | pages={2957--2966}, 649 | year={2022}, 650 | url={https://aclanthology.org/2022.lrec-1.317/} 651 | } 652 | ``` 653 | 654 | ```bibtex 655 | @inproceedings{kurihara-nlp-2022-jglue, 656 | title={JGLUE: 日本語言語理解ベンチマーク}, 657 | author={栗原健太郎 and 河原大輔 and 柴田知秀}, 658 | booktitle={言語処理学会第 28 回年次大会}, 659 | pages={2023--2028}, 660 | year={2022}, 661 | url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E8-4.pdf}, 662 | note={in Japanese} 663 | } 664 | ``` 665 | 666 | #### MARC-ja 667 | 668 | ```bibtex 669 | @inproceedings{marc_reviews, 670 | title={The Multilingual Amazon Reviews Corpus}, 671 | author={Keung, Phillip and Lu, Yichao and Szarvas, György and Smith, Noah A.}, 672 | booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing}, 673 | year={2020} 674 | } 675 | ``` 676 | 677 | #### JCoLA 678 | 679 | ```bibtex 680 | @article{someya-arxiv-2023-jcola, 681 | title={JCoLA: Japanese Corpus of Linguistic Acceptability}, 682 | author={Taiga Someya and Yushi Sugimoto and Yohei Oseki}, 683 | year={2023}, 684 | eprint={2309.12676}, 685 | archivePrefix={arXiv}, 686 | primaryClass={cs.CL} 687 | } 688 | ``` 689 | 690 | ```bibtex 691 | @inproceedings{someya-nlp-2022-jcola, 692 | title={日本語版 CoLA の構築}, 693 | author={染谷 大河 and 大関 洋平}, 694 | booktitle={言語処理学会第 28 回年次大会}, 695 | pages={1872--1877}, 696 | year={2022}, 697 | url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E7-1.pdf}, 698 | note={in Japanese} 699 | } 700 | ``` 701 | 702 | #### JSTS and JNLI 703 | 704 | ```bibtex 705 | @inproceedings{miyazaki2016cross, 706 | title={Cross-lingual image caption generation}, 707 | author={Miyazaki, Takashi and Shimizu, Nobuyuki}, 708 | booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 709 | pages={1780--1790}, 710 | year={2016} 711 | } 712 | ``` 713 | 714 | ### Contributions 715 | 716 | Thanks to [Kentaro Kurihara](https://twitter.com/kkurihara_cs), [Daisuke Kawahara](https://twitter.com/daisukekawahar1), and [Tomohide Shibata](https://twitter.com/stomohide) for creating JGLUE dataset. 717 | Thanks to [Taiga Someya](https://twitter.com/T0a8i0g9a) for creating JCoLA dataset. 
718 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "huggingface-datasets-jglue" 3 | version = "1.2.0" 4 | description = "Dataset loading script for JGLUE: Japanese General Language Understanding Evaluation" 5 | authors = [ 6 | { name = "Shunsuke Kitada", email = "shunsuke.kitada.0831@gmail.com" }, 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.8" 10 | dependencies = [ 11 | "beautifulsoup4>=4.11.2", 12 | "datasets>=3.0.0", 13 | "mecab-python3>=1.0.6", 14 | "mojimoji>=0.0.12", 15 | "pyknp>=0.6.1", 16 | ] 17 | 18 | [dependency-groups] 19 | dev = ["mypy>=1.0.0", "pytest>=6.0.0", "ruff>=0.1.5"] 20 | 21 | [tool.mypy] 22 | python_version = "3.10" 23 | ignore_missing_imports = true 24 | -------------------------------------------------------------------------------- /tests/JGLUE_test.py: -------------------------------------------------------------------------------- 1 | import datasets as ds 2 | import pytest 3 | 4 | # In datasets>=3.0.0, HF_DATASETS_TRUST_REMOTE_CODE defaults to False, 5 | # which triggers confirmation dialogs when loading datasets and interrupts testing. 6 | # Therefore, HF_DATASETS_TRUST_REMOTE_CODE is set to True. 7 | ds.config.HF_DATASETS_TRUST_REMOTE_CODE = True 8 | 9 | 10 | @pytest.fixture 11 | def dataset_path() -> str: 12 | return "JGLUE.py" 13 | 14 | 15 | @pytest.mark.parametrize( 16 | argnames="dataset_name, expected_num_train, expected_num_valid,", 17 | argvalues=( 18 | ("JSTS", 12451, 1457), 19 | ("JNLI", 20073, 2434), 20 | ("JSQuAD", 62859, 4442), 21 | ("JCommonsenseQA", 8939, 1119), 22 | ), 23 | ) 24 | def test_load_dataset( 25 | dataset_path: str, 26 | dataset_name: str, 27 | expected_num_train: int, 28 | expected_num_valid: int, 29 | ): 30 | dataset = ds.load_dataset(path=dataset_path, name=dataset_name) 31 | assert isinstance(dataset, ds.DatasetDict) 32 | 33 | assert dataset["train"].num_rows == expected_num_train 34 | assert dataset["validation"].num_rows == expected_num_valid 35 | 36 | 37 | def test_load_marc_ja( 38 | dataset_path: str, 39 | dataset_name: str = "MARC-ja", 40 | expected_num_train: int = 187528, 41 | expected_num_valid: int = 5654, 42 | ): 43 | dataset = ds.load_dataset( 44 | path=dataset_path, 45 | name=dataset_name, 46 | is_pos_neg=True, 47 | max_char_length=500, 48 | filter_review_id_list_valid=True, 49 | label_conv_review_id_list_valid=True, 50 | ) 51 | assert isinstance(dataset, ds.DatasetDict) 52 | 53 | assert dataset["train"].num_rows == expected_num_train 54 | assert dataset["validation"].num_rows == expected_num_valid 55 | 56 | 57 | def test_load_jcola( 58 | dataset_path: str, 59 | dataset_name: str = "JCoLA", 60 | expected_num_train: int = 6919, 61 | expected_num_valid: int = 865, 62 | expected_num_valid_ood: int = 685, 63 | ): 64 | dataset = ds.load_dataset(path=dataset_path, name=dataset_name) 65 | assert isinstance(dataset, ds.DatasetDict) 66 | 67 | assert dataset["train"].num_rows == expected_num_train 68 | assert dataset["validation"].num_rows == expected_num_valid 69 | assert dataset["validation_out_of_domain"].num_rows == expected_num_valid_ood 70 | assert ( 71 | dataset["validation_out_of_domain_annotated"].num_rows == expected_num_valid_ood 72 | ) 73 | 74 | 75 | def test_jglue_version(): 76 | import tomli 77 | 78 | from JGLUE import JGLUE 79 | 80 | jglue_version = JGLUE.JGLUE_VERSION 81 | jglue_major, jglue_minor, _ = jglue_version.tuple 82 | 83 | with 
open("pyproject.toml", "rb") as rf: 84 | pyproject_toml = tomli.load(rf) 85 | 86 | project_version = ds.Version(pyproject_toml["project"]["version"]) 87 | proj_major, proj_minor, _ = project_version.tuple 88 | 89 | assert jglue_major == proj_major and jglue_minor == proj_minor, ( 90 | f"JGLUE and project version mismatch: {jglue_version=} != {project_version=}" 91 | ) 92 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shunk031/huggingface-datasets_JGLUE/41cc99f1c01d41b1ab13435ae97b7076c2199f4c/tests/__init__.py --------------------------------------------------------------------------------