├── .github
│   ├── release.yml
│   └── workflows
│       ├── ci.yaml
│       ├── push_to_hub.yaml
│       └── release.yaml
├── .gitignore
├── .tagpr
├── CHANGELOG.md
├── JGLUE.py
├── Makefile
├── README.md
├── pyproject.toml
├── tests
│   ├── JGLUE_test.py
│   └── __init__.py
└── uv.lock
/.github/release.yml:
--------------------------------------------------------------------------------
1 | changelog:
2 | exclude:
3 | labels:
4 | - tagpr
5 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 | paths-ignore:
9 | - 'README.md'
10 |
11 | jobs:
12 | test:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | matrix:
16 | python-version: ['3.8', '3.9', '3.10']
17 |
18 | steps:
19 | - name: Checkout
20 | uses: actions/checkout@v4
21 |
22 | - name: Setup Python ${{ matrix.python-version }}
23 | uses: actions/setup-python@v4
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 |
27 | - name: Install dependencies
28 | run: |
29 | make setup
30 | make install
31 |
32 | - name: Format
33 | run: |
34 | make format
35 |
36 | - name: Lint
37 | run: |
38 | make lint
39 |
40 | - name: Type check
41 | run: |
42 | make typecheck
43 |
44 | - name: Run tests
45 | run: |
46 | make test
47 |
--------------------------------------------------------------------------------
/.github/workflows/push_to_hub.yaml:
--------------------------------------------------------------------------------
name: Sync to Hugging Face Hub

on:
  workflow_run:
    workflows:
      - CI
    branches:
      - main
    types:
      - completed

jobs:
  push_to_hub:
    runs-on: ubuntu-latest
    # `workflow_run` with `types: completed` fires for every finished CI run,
    # including failed and cancelled ones. Only mirror to the Hub when CI
    # actually succeeded, otherwise a broken main would be force-pushed.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}

    steps:
      - name: Checkout repository
        # Same major version as the checkout step in the CI and Release workflows.
        uses: actions/checkout@v4

      - name: Push to Huggingface hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_USERNAME: ${{ secrets.HF_USERNAME }}
        run: |
          git fetch --unshallow
          git push --force https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/datasets/${HF_USERNAME}/JGLUE main
27 |
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: Release
2 | on:
3 | push:
4 | branches: ["main"]
5 | jobs:
6 | tagpr:
7 | runs-on: ubuntu-latest
8 | permissions:
9 | contents: write
10 | pull-requests: write
11 |
12 | steps:
13 | - uses: actions/checkout@v4
14 |
15 | - uses: Songmu/tagpr@v1
16 | env:
17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
3 |
4 | ### Python ###
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # poetry
102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more
104 | # commonly ignored for libraries.
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 |
108 | # pdm
109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | # in version control.
113 | # https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
166 | ### Python Patch ###
167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168 | poetry.toml
169 |
170 | # ruff
171 | .ruff_cache/
172 |
173 | # End of https://www.toptal.com/developers/gitignore/api/python
174 |
--------------------------------------------------------------------------------
/.tagpr:
--------------------------------------------------------------------------------
1 | # config file for the tagpr in git config format
2 | # The tagpr generates the initial configuration, which you can rewrite to suit your environment.
3 | # CONFIGURATIONS:
4 | # tagpr.releaseBranch
5 | # Generally, it is "main." It is the branch for releases. The tagpr tracks this branch,
6 | # creates or updates a pull request as a release candidate, or tags when they are merged.
7 | #
8 | # tagpr.versionFile
9 | # Versioning file containing the semantic version needed to be updated at release.
10 | # It will be synchronized with the "git tag".
11 | # Often this is a meta-information file such as gemspec, setup.cfg, package.json, etc.
12 | # Sometimes the source code file, such as version.go or Bar.pm, is used.
13 | # If you do not want to use versioning files but only git tags, specify the "-" string here.
14 | # You can specify multiple version files by comma separated strings.
15 | #
16 | # tagpr.vPrefix
17 | # Flag whether or not v-prefix is added to semver when git tagging. (e.g. v1.2.3 if true)
18 | # This is only a tagging convention, not how it is described in the version file.
19 | #
20 | # tagpr.changelog (Optional)
21 | # Flag whether or not changelog is added or changed during the release.
22 | #
23 | # tagpr.command (Optional)
24 | # Command to change files just before release.
25 | #
26 | # tagpr.template (Optional)
27 | # Pull request template file in go template format
28 | #
29 | # tagpr.templateText (Optional)
30 | # Pull request template text in go template format
31 | #
32 | # tagpr.release (Optional)
33 | # GitHub Release creation behavior after tagging [true, draft, false]
34 | # If this value is not set, the release is to be created.
35 | #
36 | # tagpr.majorLabels (Optional)
37 | # Label of major update targets. Default is [major]
38 | #
39 | # tagpr.minorLabels (Optional)
40 | # Label of minor update targets. Default is [minor]
41 | #
42 | # tagpr.commitPrefix (Optional)
43 | # Prefix of commit message. Default is "[tagpr]"
44 | #
45 | [tagpr]
46 | vPrefix = true
47 | releaseBranch = main
48 | versionFile = pyproject.toml
49 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## [v1.2.0](https://github.com/shunk031/huggingface-datasets_JGLUE/compare/v1.1.1...v1.2.0) - 2025-03-31
4 | - Update dataset URLs and version to 1.2.0 by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/20
5 |
6 | ## [v1.1.1](https://github.com/shunk031/huggingface-datasets_JGLUE/compare/v1.1.0...v1.1.1) - 2025-03-31
7 | - Use uv by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/27
8 | - Bump JGLUE version to 1.1.0 by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/29
9 |
10 | ## [v1.1.0](https://github.com/shunk031/huggingface-datasets_JGLUE/compare/v1.0.0...v1.1.0) - 2025-03-31
11 | - Update dataset URLs to use versioned tags and bump version to 1.1.0 by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/22
12 |
13 | ## [v0.0.1](https://github.com/shunk031/huggingface-datasets_JGLUE/commits/v0.0.1) - 2025-03-31
14 | - Initialize by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/1
15 | - Fix for the huggingface dataset viewer by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/2
16 | - Follow HF SQuAD dataset by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/4
17 | - refactor JGLUE.py by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/5
18 | - Specify open encoding by @polm in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/6
19 | - download parquet from hf datasets in `MARC-ja` by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/10
20 | - open file in utf-8 by @kishida in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/11
21 | - update README by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/12
22 | - Update for CI by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/14
23 | - Add JCoLA dataset by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/13
24 | - update README.md by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/16
25 | - Remove datasets.tasks and add trust_remote_code by @mjun0812 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/17
26 | - Set package-mode to false in pyproject.toml by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/21
27 | - Update dataset URLs and version to 1.0.0 in JGLUE.py and pyproject.toml by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/23
28 | - Add GitHub Actions workflow for automated release tagging by @shunk031 in https://github.com/shunk031/huggingface-datasets_JGLUE/pull/24
29 |
--------------------------------------------------------------------------------
/JGLUE.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import random
4 | import string
5 | import warnings
6 | from dataclasses import dataclass
7 | from typing import Dict, List, Literal, Optional
8 |
9 | import datasets as ds
10 | import pandas as pd
11 |
12 | logger = logging.getLogger(__name__)
13 |
# BibTeX entries for the JGLUE benchmark itself: the English conference paper
# followed by the Japanese-language paper.
_JGLUE_CITATION = """\
@inproceedings{kurihara-lrec-2022-jglue,
title={JGLUE: Japanese general language understanding evaluation},
author={Kurihara, Kentaro and Kawahara, Daisuke and Shibata, Tomohide},
booktitle={Proceedings of the Thirteenth Language Resources and Evaluation Conference},
pages={2957--2966},
year={2022},
url={https://aclanthology.org/2022.lrec-1.317/}
}
@inproceedings{kurihara-nlp-2022-jglue,
title={JGLUE: 日本語言語理解ベンチマーク},
author={栗原健太郎 and 河原大輔 and 柴田知秀},
booktitle={言語処理学会第28回年次大会},
pages={2023--2028},
year={2022},
url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E8-4.pdf},
note={in Japanese}
}
"""

# BibTeX entries for JCoLA (arXiv article and the Japanese-language paper).
_JCOLA_CITATION = """\
@article{someya2023jcola,
title={JCoLA: Japanese Corpus of Linguistic Acceptability},
author={Taiga Someya and Yushi Sugimoto and Yohei Oseki},
year={2023},
eprint={2309.12676},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{someya-nlp-2022-jcola,
title={日本語版 CoLA の構築},
author={染谷 大河 and 大関 洋平},
booktitle={言語処理学会第28回年次大会},
pages={1872--1877},
year={2022},
url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E7-1.pdf},
note={in Japanese}
}
"""

# BibTeX entry for the Multilingual Amazon Reviews Corpus that MARC-ja is
# based on (see the MARC-ja entry of `_DESCRIPTION_CONFIGS`).
_MARC_JA_CITATION = """\
@inproceedings{marc_reviews,
title={The Multilingual Amazon Reviews Corpus},
author={Keung, Phillip and Lu, Yichao and Szarvas, György and Smith, Noah A.},
booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
pages={4563--4568},
year={2020}
}
"""

# BibTeX entry referenced by the JSTS and JNLI dataset infos.
_JSTS_JNLI_CITATION = """\
@inproceedings{miyazaki2016cross,
title={Cross-lingual image caption generation},
author={Miyazaki, Takashi and Shimizu, Nobuyuki},
booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={1780--1790},
year={2016}
}
"""

# Benchmark-level description used by every `dataset_info_*` function.
# The trailing backslashes join the source lines into one line of text.
_DESCRIPTION = """\
JGLUE, Japanese General Language Understanding Evaluation, \
is built to measure the general NLU ability in Japanese. JGLUE has been constructed \
from scratch without translation. We hope that JGLUE will facilitate NLU research in Japanese.\
"""

# Homepage URLs reported in the per-task `DatasetInfo` objects.
_JGLUE_HOMEPAGE = "https://github.com/yahoojapan/JGLUE"
_JCOLA_HOMEPAGE = "https://github.com/osekilab/JCoLA"
_MARC_JA_HOMEPAGE = "https://registry.opendata.aws/amazon-reviews-ml/"

# License text reported in the `DatasetInfo` objects that pass `license=`.
_JGLUE_LICENSE = """\
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.\
"""
87 |
# Per-configuration descriptions, keyed by builder config name.
# The JCoLA entry previously read "Linguistic Accept010 ability" (a stray
# token inside the word); it now spells out "Acceptability".
_DESCRIPTION_CONFIGS = {
    "MARC-ja": "MARC-ja is a dataset of the text classification task. This dataset is based on the Japanese portion of Multilingual Amazon Reviews Corpus (MARC) (Keung+, 2020).",
    "JCoLA": "JCoLA (Japanese Corpus of Linguistic Acceptability) is a novel dataset for targeted syntactic evaluations of language models in Japanese, which consists of 10,020 sentences with acceptability judgments by linguists.",
    "JSTS": "JSTS is a Japanese version of the STS (Semantic Textual Similarity) dataset. STS is a task to estimate the semantic similarity of a sentence pair.",
    "JNLI": "JNLI is a Japanese version of the NLI (Natural Language Inference) dataset. NLI is a task to recognize the inference relation that a premise sentence has to a hypothesis sentence.",
    "JSQuAD": "JSQuAD is a Japanese version of SQuAD (Rajpurkar+, 2016), one of the datasets of reading comprehension.",
    "JCommonsenseQA": "JCommonsenseQA is a Japanese version of CommonsenseQA (Talmor+, 2019), which is a multiple-choice question answering dataset that requires commonsense reasoning ability.",
}
96 |
# Download locations, keyed by builder config name.
# NOTE(review): the MARC-ja helper lists and the JCoLA files are fetched from
# the `main` branch of their repositories, whereas JSTS / JNLI / JSQuAD /
# JCommonsenseQA are pinned to the `v1.2.0` tag.
_URLS = {
    "MARC-ja": {
        # Raw review dump plus the review-id lists consumed by the MARC-ja preprocessing.
        "data": "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz",
        "filter_review_id_list": {
            "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/preprocess/marc-ja/data/filter_review_id_list/valid.txt",
        },
        "label_conv_review_id_list": {
            "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/main/preprocess/marc-ja/data/label_conv_review_id_list/valid.txt",
        },
    },
    "JCoLA": {
        # Nested as split -> domain -> file kind.
        "train": {
            "in_domain": {
                "json": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/in_domain_train-v1.0.json",
            }
        },
        "valid": {
            "in_domain": {
                "json": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/in_domain_valid-v1.0.json",
            },
            "out_of_domain": {
                "json": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/out_of_domain_valid-v1.0.json",
                "json_annotated": "https://raw.githubusercontent.com/osekilab/JCoLA/main/data/jcola-v1.0/out_of_domain_valid_annotated-v1.0.json",
            },
        },
    },
    "JSTS": {
        "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsts-v1.2/train-v1.2.json",
        "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsts-v1.2/valid-v1.2.json",
    },
    "JNLI": {
        "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jnli-v1.2/train-v1.2.json",
        "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jnli-v1.2/valid-v1.2.json",
    },
    "JSQuAD": {
        "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsquad-v1.2/train-v1.2.json",
        "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jsquad-v1.2/valid-v1.2.json",
    },
    "JCommonsenseQA": {
        "train": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jcommonsenseqa-v1.2/train-v1.2.json",
        "valid": "https://raw.githubusercontent.com/yahoojapan/JGLUE/refs/tags/v1.2.0/datasets/jcommonsenseqa-v1.2/valid-v1.2.json",
    },
}
140 |
141 |
def dataset_info_jsts() -> ds.DatasetInfo:
    """Build the `DatasetInfo` for the JSTS configuration.

    Returns:
        A `ds.DatasetInfo` whose features are the sentence-pair identifiers,
        the two sentences, and a float `label`.
    """
    features = ds.Features(
        {
            "sentence_pair_id": ds.Value("string"),
            "yjcaptions_id": ds.Value("string"),
            "sentence1": ds.Value("string"),
            "sentence2": ds.Value("string"),
            "label": ds.Value("float"),
        }
    )
    return ds.DatasetInfo(
        description=_DESCRIPTION,
        # The source-corpus BibTeX belongs in `citation`, combined with the
        # JGLUE entries the same way the JCoLA and MARC-ja infos combine
        # theirs. It used to be interpolated into `homepage`, which must be a
        # plain URL.
        citation=f"{_JSTS_JNLI_CITATION}\n{_JGLUE_CITATION}",
        homepage=_JGLUE_HOMEPAGE,
        license=_JGLUE_LICENSE,
        features=features,
    )
159 |
160 |
def dataset_info_jnli() -> ds.DatasetInfo:
    """Build the `DatasetInfo` for the JNLI configuration.

    Returns:
        A `ds.DatasetInfo` whose features are the sentence-pair identifiers,
        the two sentences, and a three-way class label
        (entailment / contradiction / neutral).
    """
    features = ds.Features(
        {
            "sentence_pair_id": ds.Value("string"),
            "yjcaptions_id": ds.Value("string"),
            "sentence1": ds.Value("string"),
            "sentence2": ds.Value("string"),
            "label": ds.ClassLabel(
                num_classes=3, names=["entailment", "contradiction", "neutral"]
            ),
        }
    )
    return ds.DatasetInfo(
        description=_DESCRIPTION,
        # The source-corpus BibTeX belongs in `citation`, combined with the
        # JGLUE entries the same way the JCoLA and MARC-ja infos combine
        # theirs. It used to be interpolated into `homepage`, which must be a
        # plain URL.
        citation=f"{_JSTS_JNLI_CITATION}\n{_JGLUE_CITATION}",
        homepage=_JGLUE_HOMEPAGE,
        license=_JGLUE_LICENSE,
        features=features,
        supervised_keys=None,
    )
181 |
182 |
def dataset_info_jsquad() -> ds.DatasetInfo:
    """Describe the JSQuAD configuration.

    Returns:
        A `ds.DatasetInfo` with the question/context fields, a sequence of
        answers (text plus start offset), and an `is_impossible` flag.
    """
    # Each answer is a span: its text plus the character offset it starts at.
    answer_schema = {
        "text": ds.Value("string"),
        "answer_start": ds.Value("int32"),
    }
    schema = {
        "id": ds.Value("string"),
        "title": ds.Value("string"),
        "context": ds.Value("string"),
        "question": ds.Value("string"),
        "answers": ds.Sequence(answer_schema),
        "is_impossible": ds.Value("bool"),
    }
    return ds.DatasetInfo(
        description=_DESCRIPTION,
        citation=_JGLUE_CITATION,
        homepage=_JGLUE_HOMEPAGE,
        license=_JGLUE_LICENSE,
        features=ds.Features(schema),
        supervised_keys=None,
    )
204 |
205 |
def dataset_info_jcommonsenseqa() -> ds.DatasetInfo:
    """Describe the JCommonsenseQA configuration.

    Returns:
        A `ds.DatasetInfo` with the question id, the question, five string
        choices (`choice0` .. `choice4`), and a class label naming the
        correct choice.
    """
    # The five choice columns double as the class-label names.
    choice_columns = [f"choice{idx}" for idx in range(5)]

    schema = {
        "q_id": ds.Value("int64"),
        "question": ds.Value("string"),
    }
    for column in choice_columns:
        schema[column] = ds.Value("string")
    schema["label"] = ds.ClassLabel(num_classes=5, names=choice_columns)

    return ds.DatasetInfo(
        description=_DESCRIPTION,
        citation=_JGLUE_CITATION,
        homepage=_JGLUE_HOMEPAGE,
        license=_JGLUE_LICENSE,
        features=ds.Features(schema),
    )
229 |
230 |
def dataset_info_jcola() -> ds.DatasetInfo:
    """Describe the JCoLA configuration.

    Returns:
        A `ds.DatasetInfo` with the sentence-level fields, a binary
        acceptability label, and a nested `linguistic_phenomenon` record of
        boolean flags.
    """
    # One boolean flag per phenomenon, in schema order.
    phenomenon_names = (
        "argument_structure",
        "binding",
        "control_raising",
        "ellipsis",
        "filler_gap",
        "island_effects",
        "morphology",
        "nominal_structure",
        "negative_polarity_concord_items",
        "quantifier",
        "verbal_agreement",
        "simple",
    )
    acceptability = ds.ClassLabel(
        num_classes=2,
        names=["unacceptable", "acceptable"],
    )
    schema = {
        "uid": ds.Value("int64"),
        "source": ds.Value("string"),
        "label": acceptability,
        "diacritic": ds.Value("string"),
        "sentence": ds.Value("string"),
        "original": ds.Value("string"),
        "translation": ds.Value("string"),
        "gloss": ds.Value("bool"),
        "linguistic_phenomenon": {
            phenomenon: ds.Value("bool") for phenomenon in phenomenon_names
        },
    }
    # JCoLA cites its own papers first, followed by the JGLUE entries.
    combined_citation = f"{_JCOLA_CITATION}\n{_JGLUE_CITATION}"
    return ds.DatasetInfo(
        description=_DESCRIPTION,
        citation=combined_citation,
        homepage=_JCOLA_HOMEPAGE,
        features=ds.Features(schema),
    )
267 |
268 |
def dataset_info_marc_ja() -> ds.DatasetInfo:
    """Describe the MARC-ja configuration.

    Returns:
        A `ds.DatasetInfo` with the review sentence, a three-name sentiment
        class label, and the review id.
    """
    sentiment = ds.ClassLabel(
        num_classes=3, names=["positive", "negative", "neutral"]
    )
    schema = {
        "sentence": ds.Value("string"),
        "label": sentiment,
        "review_id": ds.Value("string"),
    }
    # The MARC citation comes first, followed by the JGLUE entries.
    combined_citation = f"{_MARC_JA_CITATION}\n{_JGLUE_CITATION}"
    return ds.DatasetInfo(
        description=_DESCRIPTION,
        citation=combined_citation,
        homepage=_MARC_JA_HOMEPAGE,
        license=_JGLUE_LICENSE,
        features=ds.Features(schema),
    )
286 |
287 |
@dataclass
class JGLUEConfig(ds.BuilderConfig):
    """Base builder configuration for the JGLUE benchmark.

    Adds no fields of its own on top of `ds.BuilderConfig`. It is used
    directly for the JSTS, JNLI, JSQuAD and JCommonsenseQA configurations and
    is subclassed by `MarcJaConfig` and `JcolaConfig`.
    """
291 |
292 |
@dataclass
class MarcJaConfig(JGLUEConfig):
    """Builder configuration for MARC-ja, including its preprocessing knobs.

    Raises:
        AssertionError: if the three split ratios do not sum to 1.0
            (compared with a floating-point tolerance).
    """

    name: str = "MARC-ja"
    # Convert the review text with `mojimoji.han_to_zen` during preprocessing.
    is_han_to_zen: bool = False
    # NOTE(review): not read by the preprocessing code visible in this file.
    max_instance_num: Optional[int] = None
    # Reviews longer than this many characters are dropped; None disables the filter.
    max_char_length: int = 500
    # When True, mid-range ratings get no label and those rows are dropped
    # (see `get_label`).
    is_pos_neg: bool = True
    # Fractions of the shuffled data assigned to each split.
    train_ratio: float = 0.94
    val_ratio: float = 0.03
    test_ratio: float = 0.03
    # Forwarded to `output_data`.
    output_testset: bool = False
    # NOTE(review): not read by the code visible in this file.
    filter_review_id_list_valid: bool = True
    label_conv_review_id_list_valid: bool = True

    def __post_init__(self) -> None:
        """Validate that the split ratios add up to one."""
        import math

        total = self.train_ratio + self.val_ratio + self.test_ratio
        # Exact `==` on floats rejects legitimate splits whose sum is off by
        # a rounding error, so compare with a tolerance instead.
        assert math.isclose(total, 1.0), (
            f"train_ratio + val_ratio + test_ratio must sum to 1.0, got {total}"
        )
309 |
310 |
# The two values accepted for `JcolaConfig.domain`.
JcolaDomain = Literal["in_domain", "out_of_domain"]


@dataclass
class JcolaConfig(JGLUEConfig):
    """Builder configuration for the JCoLA task."""

    name: str = "JCoLA"
    # Which JCoLA domain this configuration selects; defaults to "in_domain".
    domain: JcolaDomain = "in_domain"
318 |
319 |
def get_label(rating: int, is_pos_neg: bool = False) -> Optional[str]:
    """Map a star rating to a sentiment label.

    Args:
        rating: Star rating of the review.
        is_pos_neg: When True, only positive/negative labels are produced
            and mid-range ratings yield None.

    Returns:
        "positive" for ratings of 4 or more, "negative" for ratings of 2 or
        less; otherwise None if `is_pos_neg` is set, else "neutral".
    """
    if rating >= 4:
        return "positive"
    if rating <= 2:
        return "negative"
    # Mid-range rating: either unlabelled or neutral, depending on the mode.
    return None if is_pos_neg else "neutral"
330 |
331 |
def is_filtered_by_ascii_rate(text: str, threshold: float = 0.9) -> bool:
    """Tell whether a text consists mostly of printable ASCII characters.

    Args:
        text: Text to inspect.
        threshold: Minimum fraction of characters found in
            `string.printable` for the text to count as filtered.

    Returns:
        True when the fraction of printable-ASCII characters is at least
        `threshold`. An empty text contains no ASCII characters and is
        reported as not filtered.
    """
    # An empty text would otherwise divide by zero below.
    if not text:
        return False

    # `string.printable` covers digits, letters, punctuation and whitespace.
    printable_chars = set(string.printable)
    ascii_count = sum(char in printable_chars for char in text)
    return ascii_count / len(text) >= threshold
336 |
337 |
def shuffle_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `df` in a fixed pseudo-random order.

    Args:
        df: Frame whose rows should be shuffled.

    Returns:
        A new frame with the same columns, the rows permuted
        deterministically (seed 1), and a fresh default index.
    """
    instances = df.to_dict(orient="records")
    # A private generator seeded with 1 yields the same permutation as
    # seeding the module-level generator, without resetting the
    # process-global RNG state for other code.
    random.Random(1).shuffle(instances)
    # Pass the columns explicitly so an empty input keeps its schema.
    return pd.DataFrame(instances, columns=df.columns)
343 |
344 |
def get_filter_review_id_list(
    filter_review_id_list_paths: Dict[str, str],
) -> Dict[str, List[str]]:
    """Load the review-id lists for the evaluation splits.

    Args:
        filter_review_id_list_paths: Mapping from split name ("valid" and/or
            "test") to the path of a text file with one review id per line.

    Returns:
        Mapping from each split that has a path to its list of review ids,
        with trailing whitespace stripped from every line.
    """
    ids_by_split: Dict[str, List[str]] = {}

    for split_name in ("valid", "test"):
        list_path = filter_review_id_list_paths.get(split_name)
        if list_path is None:
            # No list was supplied for this split; leave the key out.
            continue
        with open(list_path, "r", encoding="utf-8") as rf:
            ids_by_split[split_name] = [line.rstrip() for line in rf]

    return ids_by_split
362 |
363 |
def get_label_conv_review_id_list(
    label_conv_review_id_list_paths: Dict[str, str],
) -> Dict[str, Dict[str, str]]:
    """Load the review-id to label tables for the evaluation splits.

    Args:
        label_conv_review_id_list_paths: Mapping from split name ("valid"
            and/or "test") to the path of a CSV file whose first column is a
            review id and whose second column is the label to use for it.

    Returns:
        Mapping from each split that has a path to a dict of
        review id -> label.
    """
    import csv

    labels_by_split: Dict[str, Dict[str, str]] = {}

    for split_name in ("valid", "test"):
        table_path = label_conv_review_id_list_paths.get(split_name)
        if table_path is None:
            # No table was supplied for this split; leave the key out.
            continue
        with open(table_path, "r", encoding="utf-8") as rf:
            table: Dict[str, str] = {}
            for row in csv.reader(rf):
                table[row[0]] = row[1]
        labels_by_split[split_name] = table

    return labels_by_split
387 |
388 |
def output_data(
    df: pd.DataFrame,
    train_ratio: float,
    val_ratio: float,
    test_ratio: float,
    output_testset: bool,
    filter_review_id_list_paths: Dict[str, str],
    label_conv_review_id_list_paths: Dict[str, str],
) -> Dict[str, pd.DataFrame]:
    """Split the preprocessed frame and post-process the evaluation splits.

    The rows are cut positionally into train / valid / test. For each
    evaluation split that has a filter list, rows whose `review_id` is
    listed are removed. For each evaluation split that has a label
    conversion table, the labels of listed reviews are overwritten.

    Args:
        df: Frame with at least `review_id` and `label` columns.
        train_ratio: Fraction of rows assigned to the train split.
        val_ratio: Fraction of rows assigned to the valid split.
        test_ratio: Accepted for signature compatibility; the test split is
            simply the remainder after train and valid.
        output_testset: Accepted for signature compatibility; the returned
            mapping never contains the test split.
        filter_review_id_list_paths: Per-split paths of review-id filter lists.
        label_conv_review_id_list_paths: Per-split paths of review-id to
            label CSV tables.

    Returns:
        Mapping with the "train" and "valid" frames.
    """
    instance_num = len(df)
    train_end = int(instance_num * train_ratio)
    valid_end = int(instance_num * (train_ratio + val_ratio))

    split_dfs: Dict[str, pd.DataFrame] = {
        "train": df.iloc[:train_end],
        "valid": df.iloc[train_end:valid_end],
        "test": df.iloc[valid_end:],
    }

    filter_review_id_list = get_filter_review_id_list(
        filter_review_id_list_paths=filter_review_id_list_paths,
    )
    label_conv_review_id_list = get_label_conv_review_id_list(
        label_conv_review_id_list_paths=label_conv_review_id_list_paths,
    )

    # Drop the reviews that are listed in the split's filter list.
    for eval_type in ("valid", "test"):
        review_ids_to_drop = filter_review_id_list.get(eval_type)
        if review_ids_to_drop:
            split_df = split_dfs[eval_type]
            split_dfs[eval_type] = split_df[
                ~split_df["review_id"].isin(review_ids_to_drop)
            ]

    # Overwrite the labels of the reviews listed in the split's own table.
    for eval_type in ("valid", "test"):
        label_by_review_id = label_conv_review_id_list.get(eval_type)
        if label_by_review_id:
            split_df = split_dfs[eval_type]
            # Use the table of the split being processed. This used to be
            # hard-coded to "valid", which applied the wrong table to the
            # test split or raised KeyError when only a test table existed.
            converted_label = split_df["review_id"].map(label_by_review_id)
            # Reviews missing from the table map to NaN and keep their label.
            split_dfs[eval_type] = split_df.assign(
                label=converted_label.fillna(split_df["label"])
            )

    return {
        "train": split_dfs["train"],
        "valid": split_dfs["valid"],
    }
441 |
442 |
def preprocess_for_marc_ja(
    config: MarcJaConfig,
    data_file_path: str,
    filter_review_id_list_paths: Dict[str, str],
    label_conv_review_id_list_paths: Dict[str, str],
) -> Dict[str, pd.DataFrame]:
    """Turn the raw review TSV into the MARC-ja train/valid frames.

    Steps, in order: read the TSV, derive labels from the star rating, drop
    unlabelled rows, strip HTML, drop mostly-ASCII reviews, apply the
    optional length limit and the optional `han_to_zen` conversion, shuffle,
    then split via `output_data`.

    `mojimoji` and `beautifulsoup4` are optional. When either is missing, a
    `UserWarning` is emitted and the corresponding step leaves the text
    unchanged.

    Args:
        config: MARC-ja configuration supplying the preprocessing options.
        data_file_path: Path of the tab-separated review file.
        filter_review_id_list_paths: Per-split paths of review-id filter lists.
        label_conv_review_id_list_paths: Per-split paths of review-id to
            label CSV tables.

    Returns:
        The mapping produced by `output_data` for the shuffled frame.
    """
    try:
        import mojimoji

        def han_to_zen(text: str) -> str:
            return mojimoji.han_to_zen(text)

    except ImportError:
        warnings.warn(
            "can't import `mojimoji`, failing back to method that do nothing. "
            "We recommend running `pip install mojimoji` to reproduce the original preprocessing.",
            UserWarning,
        )

        def han_to_zen(text: str) -> str:
            return text

    try:
        from bs4 import BeautifulSoup

        def cleanup_text(text: str) -> str:
            return BeautifulSoup(text, "html.parser").get_text()

    except ImportError:
        # The first literal now ends with a space; without it the two
        # sentences were glued together ("nothing.We recommend").
        warnings.warn(
            "can't import `beautifulsoup4`, failing back to method that do nothing. "
            "We recommend running `pip install beautifulsoup4` to reproduce the original preprocessing.",
            UserWarning,
        )

        def cleanup_text(text: str) -> str:
            return text

    from tqdm import tqdm

    df = pd.read_csv(data_file_path, delimiter="\t")
    df = df[["review_body", "star_rating", "review_id"]]

    # rename columns
    df = df.rename(columns={"review_body": "text", "star_rating": "rating"})

    # convert the rating to label
    tqdm.pandas(dynamic_ncols=True, desc="Convert the rating to the label")
    df = df.assign(
        label=df["rating"].progress_apply(
            lambda rating: get_label(rating, config.is_pos_neg)
        )
    )

    # remove rows where the label is None
    df = df[~df["label"].isnull()]

    # remove html tags from the text
    tqdm.pandas(dynamic_ncols=True, desc="Remove html tags from the text")
    df = df.assign(text=df["text"].progress_apply(cleanup_text))

    # filter by ascii rate: drop reviews that are mostly printable ASCII
    tqdm.pandas(dynamic_ncols=True, desc="Filter by ascii rate")
    df = df[~df["text"].progress_apply(is_filtered_by_ascii_rate)]

    # keep only reviews within the configured character limit, if any
    if config.max_char_length is not None:
        df = df[df["text"].str.len() <= config.max_char_length]

    if config.is_han_to_zen:
        df = df.assign(text=df["text"].apply(han_to_zen))

    df = df[["text", "label", "review_id"]]
    df = df.rename(columns={"text": "sentence"})

    # shuffle dataset
    df = shuffle_dataframe(df)

    split_dfs = output_data(
        df=df,
        train_ratio=config.train_ratio,
        val_ratio=config.val_ratio,
        test_ratio=config.test_ratio,
        output_testset=config.output_testset,
        filter_review_id_list_paths=filter_review_id_list_paths,
        label_conv_review_id_list_paths=label_conv_review_id_list_paths,
    )
    return split_dfs
530 |
531 |
class JGLUE(ds.GeneratorBasedBuilder):
    # Versions attached to the builder configs below: JCoLA carries its own
    # version, every other config uses the JGLUE version.
    JGLUE_VERSION = ds.Version("1.2.0")
    JCOLA_VERSION = ds.Version("1.0.0")

    BUILDER_CONFIG_CLASS = JGLUEConfig
    # One config per task. MARC-ja and JCoLA use their dedicated config
    # classes, the remaining tasks use the plain `JGLUEConfig`.
    BUILDER_CONFIGS = [
        MarcJaConfig(
            name="MARC-ja",
            version=JGLUE_VERSION,
            description=_DESCRIPTION_CONFIGS["MARC-ja"],
        ),
        JcolaConfig(
            name="JCoLA",
            version=JCOLA_VERSION,
            description=_DESCRIPTION_CONFIGS["JCoLA"],
        ),
        JGLUEConfig(
            name="JSTS",
            version=JGLUE_VERSION,
            description=_DESCRIPTION_CONFIGS["JSTS"],
        ),
        JGLUEConfig(
            name="JNLI",
            version=JGLUE_VERSION,
            description=_DESCRIPTION_CONFIGS["JNLI"],
        ),
        JGLUEConfig(
            name="JSQuAD",
            version=JGLUE_VERSION,
            description=_DESCRIPTION_CONFIGS["JSQuAD"],
        ),
        JGLUEConfig(
            name="JCommonsenseQA",
            version=JGLUE_VERSION,
            description=_DESCRIPTION_CONFIGS["JCommonsenseQA"],
        ),
    ]
569 |
570 | def _info(self) -> ds.DatasetInfo:
571 | if self.config.name == "JSTS":
572 | return dataset_info_jsts()
573 | elif self.config.name == "JNLI":
574 | return dataset_info_jnli()
575 | elif self.config.name == "JSQuAD":
576 | return dataset_info_jsquad()
577 | elif self.config.name == "JCommonsenseQA":
578 | return dataset_info_jcommonsenseqa()
579 | elif self.config.name == "JCoLA":
580 | return dataset_info_jcola()
581 | elif self.config.name == "MARC-ja":
582 | return dataset_info_marc_ja()
583 | else:
584 | raise ValueError(f"Invalid config name: {self.config.name}")
585 |
586 | def __split_generators_marc_ja(self, dl_manager: ds.DownloadManager):
587 | try:
588 | file_paths = dl_manager.download_and_extract(_URLS[self.config.name])
589 | except FileNotFoundError as err:
590 | logger.warning(err)
591 | # An error occurs because the file cannot be downloaded from _URLS[MARC-ja]['data'].
592 | # So, remove the 'data' key and try to download again.
593 | urls = _URLS[self.config.name]
594 | urls.pop("data") # type: ignore[attr-defined]
595 | file_paths = dl_manager.download_and_extract(urls)
596 |
597 | filter_review_id_list = file_paths["filter_review_id_list"]
598 | label_conv_review_id_list = file_paths["label_conv_review_id_list"]
599 |
600 | try:
601 | split_dfs = preprocess_for_marc_ja(
602 | config=self.config,
603 | data_file_path=file_paths["data"],
604 | filter_review_id_list_paths=filter_review_id_list,
605 | label_conv_review_id_list_paths=label_conv_review_id_list,
606 | )
607 | except KeyError as err:
608 | from urllib.parse import urljoin
609 |
610 | logger.warning(err)
611 |
612 | base_url = "https://huggingface.co/datasets/shunk031/JGLUE/resolve/refs%2Fconvert%2Fparquet/MARC-ja/"
613 | marcja_parquet_urls = {
614 | "train": urljoin(base_url, "jglue-train.parquet"),
615 | "valid": urljoin(base_url, "jglue-validation.parquet"),
616 | }
617 | file_paths = dl_manager.download_and_extract(marcja_parquet_urls)
618 | split_dfs = {k: pd.read_parquet(v) for k, v in file_paths.items()}
619 |
620 | return [
621 | ds.SplitGenerator(
622 | name=ds.Split.TRAIN,
623 | gen_kwargs={"split_df": split_dfs["train"]},
624 | ),
625 | ds.SplitGenerator(
626 | name=ds.Split.VALIDATION,
627 | gen_kwargs={"split_df": split_dfs["valid"]},
628 | ),
629 | ]
630 |
631 | def __split_generators_jcola(self, dl_manager: ds.DownloadManager):
632 | file_paths = dl_manager.download_and_extract(_URLS[self.config.name])
633 |
634 | return [
635 | ds.SplitGenerator(
636 | name=ds.Split.TRAIN,
637 | gen_kwargs={"file_path": file_paths["train"]["in_domain"]["json"]},
638 | ),
639 | ds.SplitGenerator(
640 | name=ds.Split.VALIDATION,
641 | gen_kwargs={"file_path": file_paths["valid"]["in_domain"]["json"]},
642 | ),
643 | ds.SplitGenerator(
644 | name=ds.NamedSplit("validation_out_of_domain"),
645 | gen_kwargs={"file_path": file_paths["valid"]["out_of_domain"]["json"]},
646 | ),
647 | ds.SplitGenerator(
648 | name=ds.NamedSplit("validation_out_of_domain_annotated"),
649 | gen_kwargs={
650 | "file_path": file_paths["valid"]["out_of_domain"]["json_annotated"]
651 | },
652 | ),
653 | ]
654 |
655 | def __split_generators(self, dl_manager: ds.DownloadManager):
656 | file_paths = dl_manager.download_and_extract(_URLS[self.config.name])
657 |
658 | return [
659 | ds.SplitGenerator(
660 | name=ds.Split.TRAIN,
661 | gen_kwargs={"file_path": file_paths["train"]},
662 | ),
663 | ds.SplitGenerator(
664 | name=ds.Split.VALIDATION,
665 | gen_kwargs={"file_path": file_paths["valid"]},
666 | ),
667 | ]
668 |
669 | def _split_generators(self, dl_manager: ds.DownloadManager):
670 | if self.config.name == "MARC-ja":
671 | return self.__split_generators_marc_ja(dl_manager)
672 | elif self.config.name == "JCoLA":
673 | return self.__split_generators_jcola(dl_manager)
674 | else:
675 | return self.__split_generators(dl_manager)
676 |
677 | def __generate_examples_marc_ja(self, split_df: Optional[pd.DataFrame] = None):
678 | if split_df is None:
679 | raise ValueError(f"Invalid preprocessing for {self.config.name}")
680 |
681 | instances = split_df.to_dict(orient="records")
682 | for i, data_dict in enumerate(instances):
683 | yield i, data_dict
684 |
685 | def __generate_examples_jcola(self, file_path: Optional[str] = None):
686 | if file_path is None:
687 | raise ValueError(f"Invalid argument for {self.config.name}")
688 |
689 | def convert_label(json_dict):
690 | label_int = json_dict["label"]
691 | label_str = "unacceptable" if label_int == 0 else "acceptable"
692 | json_dict["label"] = label_str
693 | return json_dict
694 |
695 | def convert_addntional_info(json_dict):
696 | json_dict["translation"] = json_dict.get("translation")
697 | json_dict["gloss"] = json_dict.get("gloss")
698 | return json_dict
699 |
700 | def convert_phenomenon(json_dict):
701 | argument_structure = json_dict.get("Arg. Str.")
702 |
703 | def json_pop(key):
704 | return json_dict.pop(key) if argument_structure is not None else None
705 |
706 | json_dict["linguistic_phenomenon"] = {
707 | "argument_structure": json_pop("Arg. Str."),
708 | "binding": json_pop("binding"),
709 | "control_raising": json_pop("control/raising"),
710 | "ellipsis": json_pop("ellipsis"),
711 | "filler_gap": json_pop("filler-gap"),
712 | "island_effects": json_pop("island effects"),
713 | "morphology": json_pop("morphology"),
714 | "nominal_structure": json_pop("nominal structure"),
715 | "negative_polarity_concord_items": json_pop("NPI/NCI"),
716 | "quantifier": json_pop("quantifier"),
717 | "verbal_agreement": json_pop("verbal agr."),
718 | "simple": json_pop("simple"),
719 | }
720 | return json_dict
721 |
722 | with open(file_path, "r", encoding="utf-8") as rf:
723 | for i, line in enumerate(rf):
724 | json_dict = json.loads(line)
725 |
726 | example = convert_label(json_dict)
727 | example = convert_addntional_info(example)
728 | example = convert_phenomenon(example)
729 |
730 | yield i, example
731 |
732 | def __generate_examples_jsquad(self, file_path: Optional[str] = None):
733 | if file_path is None:
734 | raise ValueError(f"Invalid argument for {self.config.name}")
735 |
736 | with open(file_path, "r", encoding="utf-8") as rf:
737 | json_data = json.load(rf)
738 |
739 | for json_dict in json_data["data"]:
740 | title = json_dict["title"]
741 | paragraphs = json_dict["paragraphs"]
742 |
743 | for paragraph in paragraphs:
744 | context = paragraph["context"]
745 | questions = paragraph["qas"]
746 |
747 | for question_dict in questions:
748 | q_id = question_dict["id"]
749 | question = question_dict["question"]
750 | answers = question_dict["answers"]
751 | is_impossible = question_dict["is_impossible"]
752 |
753 | example_dict = {
754 | "id": q_id,
755 | "title": title,
756 | "context": context,
757 | "question": question,
758 | "answers": answers,
759 | "is_impossible": is_impossible,
760 | }
761 |
762 | yield q_id, example_dict
763 |
764 | def __generate_examples_jcommonsenseqa(self, file_path: Optional[str] = None):
765 | if file_path is None:
766 | raise ValueError(f"Invalid argument for {self.config.name}")
767 |
768 | with open(file_path, "r", encoding="utf-8") as rf:
769 | for i, line in enumerate(rf):
770 | json_dict = json.loads(line)
771 | json_dict["label"] = f"choice{json_dict['label']}"
772 | yield i, json_dict
773 |
774 | def __generate_examples(self, file_path: Optional[str] = None):
775 | if file_path is None:
776 | raise ValueError(f"Invalid argument for {self.config.name}")
777 |
778 | with open(file_path, "r", encoding="utf-8") as rf:
779 | for i, line in enumerate(rf):
780 | json_dict = json.loads(line)
781 | yield i, json_dict
782 |
783 | def _generate_examples(
784 | self,
785 | file_path: Optional[str] = None,
786 | split_df: Optional[pd.DataFrame] = None,
787 | ):
788 | if self.config.name == "MARC-ja":
789 | yield from self.__generate_examples_marc_ja(split_df)
790 |
791 | elif self.config.name == "JCoLA":
792 | yield from self.__generate_examples_jcola(file_path)
793 |
794 | elif self.config.name == "JSQuAD":
795 | yield from self.__generate_examples_jsquad(file_path)
796 |
797 | elif self.config.name == "JCommonsenseQA":
798 | yield from self.__generate_examples_jcommonsenseqa(file_path)
799 |
800 | else:
801 | yield from self.__generate_examples(file_path)
802 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Installation
2 | # `setup` installs/upgrades uv through pip;
3 | # `install` runs `uv sync`.
4 |
5 | .PHONY: setup
6 | setup:
7 | pip install -U uv
8 |
9 | .PHONY: install
10 | install:
11 | uv sync
12 |
13 | # linter/formatter/typecheck/test
14 | # Each target below lists `install` as a prerequisite and runs its tool
15 | # through `uv run` (ruff check, ruff format --check, mypy, pytest).
16 |
17 | .PHONY: lint
18 | lint: install
19 | uv run ruff check --output-format=github .
20 |
21 | .PHONY: format
22 | format: install
23 | uv run ruff format --check --diff .
24 |
25 | .PHONY: typecheck
26 | typecheck: install
27 | uv run mypy --cache-dir=/dev/null .
28 |
29 | .PHONY: test
30 | test: install
31 | uv run pytest -vsx --log-cli-level=INFO
32 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | annotations_creators:
3 | - crowdsourced
4 | language:
5 | - ja
6 | language_creators:
7 | - crowdsourced
8 | - found
9 | license:
10 | - cc-by-4.0
11 | multilinguality:
12 | - monolingual
13 | pretty_name: JGLUE
14 | size_categories: []
15 | source_datasets:
16 | - original
17 | tags:
18 | - MARC
19 | - CoLA
20 | - STS
21 | - NLI
22 | - SQuAD
23 | - CommonsenseQA
24 | task_categories:
25 | - multiple-choice
26 | - question-answering
27 | - sentence-similarity
28 | - text-classification
29 | task_ids:
30 | - multiple-choice-qa
31 | - open-domain-qa
32 | - multi-class-classification
33 | - sentiment-classification
34 | ---
35 |
36 | # Dataset Card for JGLUE
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | This dataset loading script is developed on [GitHub](https://github.com/shunk031/huggingface-datasets_JGLUE).
54 | Please feel free to open an [issue](https://github.com/shunk031/huggingface-datasets_JGLUE/issues/new/choose) or [pull request](https://github.com/shunk031/huggingface-datasets_JGLUE/pulls).
55 |
56 | > [!IMPORTANT]
57 | > The version of this loading script has been updated to correspond to the version of JGLUE.
58 | > Please check the release history at [yahoojapan/JGLUE/releases](https://github.com/yahoojapan/JGLUE/releases) and [shunk031/huggingface-datasets_JGLUE/releases](https://github.com/shunk031/huggingface-datasets_JGLUE/releases).
59 |
60 | ## Table of Contents
61 | - [Table of Contents](#table-of-contents)
62 | - [Dataset Description](#dataset-description)
63 | - [Dataset Summary](#dataset-summary)
64 | - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
65 | - [Languages](#languages)
66 | - [Dataset Structure](#dataset-structure)
67 | - [Data Instances](#data-instances)
68 | - [Data Fields](#data-fields)
69 | - [Data Splits](#data-splits)
70 | - [Dataset Creation](#dataset-creation)
71 | - [Curation Rationale](#curation-rationale)
72 | - [Source Data](#source-data)
73 | - [Annotations](#annotations)
74 | - [Personal and Sensitive Information](#personal-and-sensitive-information)
75 | - [Considerations for Using the Data](#considerations-for-using-the-data)
76 | - [Social Impact of Dataset](#social-impact-of-dataset)
77 | - [Discussion of Biases](#discussion-of-biases)
78 | - [Other Known Limitations](#other-known-limitations)
79 | - [Additional Information](#additional-information)
80 | - [Dataset Curators](#dataset-curators)
81 | - [Licensing Information](#licensing-information)
82 | - [Citation Information](#citation-information)
83 | - [Contributions](#contributions)
84 |
85 | ## Dataset Description
86 |
87 | - **Homepage:** https://github.com/yahoojapan/JGLUE
88 | - **Repository:** https://github.com/shunk031/huggingface-datasets_JGLUE
89 |
90 | ### Dataset Summary
91 |
92 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jglue-japanese-general-language-understanding-evaluation):
93 |
94 | > JGLUE, Japanese General Language Understanding Evaluation, is built to measure the general NLU ability in Japanese. JGLUE has been constructed from scratch without translation. We hope that JGLUE will facilitate NLU research in Japanese.
95 |
96 | > JGLUE has been constructed by a joint research project of Yahoo Japan Corporation and Kawahara Lab at Waseda University.
97 |
98 | ### Supported Tasks and Leaderboards
99 |
100 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#tasksdatasets):
101 |
102 | > JGLUE consists of the tasks of text classification, sentence pair classification, and QA. Each task consists of multiple datasets.
103 |
104 | #### Supported Tasks
105 |
106 | ##### MARC-ja
107 |
108 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#marc-ja):
109 |
110 | > MARC-ja is a dataset of the text classification task. This dataset is based on the Japanese portion of [Multilingual Amazon Reviews Corpus (MARC)](https://docs.opendata.aws/amazon-reviews-ml/readme.html) ([Keung+, 2020](https://aclanthology.org/2020.emnlp-main.369/)).
111 |
112 | ##### JCoLA
113 |
114 | From [JCoLA's README.md](https://github.com/osekilab/JCoLA#jcola-japanese-corpus-of-linguistic-acceptability)
115 |
116 | > JCoLA (Japanese Corpus of Linguistic Acceptability) is a novel dataset for targeted syntactic evaluations of language models in Japanese, which consists of 10,020 sentences with acceptability judgments by linguists. The sentences are manually extracted from linguistics journals, handbooks and textbooks. JCoLA is included in [JGLUE benchmark](https://github.com/yahoojapan/JGLUE) (Kurihara et al., 2022).
117 |
118 | ##### JSTS
119 |
120 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jsts):
121 |
122 | > JSTS is a Japanese version of the STS (Semantic Textual Similarity) dataset. STS is a task to estimate the semantic similarity of a sentence pair. The sentences in JSTS and JNLI (described below) are extracted from the Japanese version of the MS COCO Caption Dataset, [the YJ Captions Dataset](https://github.com/yahoojapan/YJCaptions) ([Miyazaki and Shimizu, 2016](https://aclanthology.org/P16-1168/)).
123 |
124 | ##### JNLI
125 |
126 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jnli):
127 |
128 | > JNLI is a Japanese version of the NLI (Natural Language Inference) dataset. NLI is a task to recognize the inference relation that a premise sentence has to a hypothesis sentence. The inference relations are entailment, contradiction, and neutral.
129 |
130 | ##### JSQuAD
131 |
132 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jsquad):
133 |
134 | > JSQuAD is a Japanese version of [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) ([Rajpurkar+, 2018](https://aclanthology.org/P18-2124/)), one of the datasets of reading comprehension. Each instance in the dataset consists of a question regarding a given context (Wikipedia article) and its answer. JSQuAD is based on SQuAD 1.1 (there are no unanswerable questions). We used [the Japanese Wikipedia dump](https://dumps.wikimedia.org/jawiki/) as of 20211101.
135 |
136 | ##### JCommonsenseQA
137 |
138 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#jcommonsenseqa):
139 |
140 | > JCommonsenseQA is a Japanese version of [CommonsenseQA](https://www.tau-nlp.org/commonsenseqa) ([Talmor+, 2019](https://aclanthology.org/N19-1421/)), which is a multiple-choice question answering dataset that requires commonsense reasoning ability. It is built using crowdsourcing with seeds extracted from the knowledge base [ConceptNet](https://conceptnet.io/).
141 |
142 | #### Leaderboard
143 |
144 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#leaderboard):
145 |
146 | > A leaderboard will be made public soon. The test set will be released at that time.
147 |
148 | ### Languages
149 |
150 | The language data in JGLUE is in Japanese ([BCP-47 ja-JP](https://www.rfc-editor.org/info/bcp47)).
151 |
152 | ## Dataset Structure
153 |
154 | ### Data Instances
155 |
156 | When loading a specific configuration, users have to append a version-dependent suffix:
157 |
158 | #### MARC-ja
159 |
160 | ```python
161 | from datasets import load_dataset
162 |
163 | dataset = load_dataset("shunk031/JGLUE", name="MARC-ja")
164 |
165 | print(dataset)
166 | # DatasetDict({
167 | # train: Dataset({
168 | # features: ['sentence', 'label', 'review_id'],
169 | # num_rows: 187528
170 | # })
171 | # validation: Dataset({
172 | # features: ['sentence', 'label', 'review_id'],
173 | # num_rows: 5654
174 | # })
175 | # })
176 | ```
177 |
178 | #### JCoLA
179 |
180 | ```python
181 | from datasets import load_dataset
182 |
183 | dataset = load_dataset("shunk031/JGLUE", name="JCoLA")
184 |
185 | print(dataset)
186 | # DatasetDict({
187 | # train: Dataset({
188 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'],
189 | # num_rows: 6919
190 | # })
191 | # validation: Dataset({
192 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'],
193 | # num_rows: 865
194 | # })
195 | # validation_out_of_domain: Dataset({
196 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'],
197 | # num_rows: 685
198 | # })
199 | # validation_out_of_domain_annotated: Dataset({
200 | # features: ['uid', 'source', 'label', 'diacritic', 'sentence', 'original', 'translation', 'gloss', 'simple', 'linguistic_phenomenon'],
201 | # num_rows: 685
202 | # })
203 | # })
204 | ```
205 |
206 | An example of the JCoLA dataset (validation - out of domain annotated) looks as follows:
207 |
208 | ```json
209 | {
210 | "uid": 9109,
211 | "source": "Asano_and_Ura_2010",
212 | "label": 1,
213 | "diacritic": "g",
214 | "sentence": "太郎のゴミの捨て方について話した。",
215 | "original": "太郎のゴミの捨て方",
216 | "translation": "‘The way (for Taro) to throw out garbage’",
217 | "gloss": true,
218 | "linguistic_phenomenon": {
219 | "argument_structure": true,
220 | "binding": false,
221 | "control_raising": false,
222 | "ellipsis": false,
223 | "filler_gap": false,
224 | "island_effects": false,
225 | "morphology": false,
226 | "nominal_structure": false,
227 | "negative_polarity_concord_items": false,
228 | "quantifier": false,
229 | "verbal_agreement": false,
230 | "simple": false
231 | }
232 | }
233 | ```
234 |
235 | #### JSTS
236 |
237 | ```python
238 | from datasets import load_dataset
239 |
240 | dataset = load_dataset("shunk031/JGLUE", name="JSTS")
241 |
242 | print(dataset)
243 | # DatasetDict({
244 | # train: Dataset({
245 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
246 | # num_rows: 12451
247 | # })
248 | # validation: Dataset({
249 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
250 | # num_rows: 1457
251 | # })
252 | # })
253 | ```
254 |
255 | An example of the JSTS dataset looks as follows:
256 |
257 | ```json
258 | {
259 | "sentence_pair_id": "691",
260 | "yjcaptions_id": "127202-129817-129818",
261 | "sentence1": "街中の道路を大きなバスが走っています。 (A big bus is running on the road in the city.)",
262 | "sentence2": "道路を大きなバスが走っています。 (There is a big bus running on the road.)",
263 | "label": 4.4
264 | }
265 | ```
266 |
267 | #### JNLI
268 |
269 | ```python
270 | from datasets import load_dataset
271 |
272 | dataset = load_dataset("shunk031/JGLUE", name="JNLI")
273 |
274 | print(dataset)
275 | # DatasetDict({
276 | # train: Dataset({
277 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
278 | # num_rows: 20073
279 | # })
280 | # validation: Dataset({
281 | # features: ['sentence_pair_id', 'yjcaptions_id', 'sentence1', 'sentence2', 'label'],
282 | # num_rows: 2434
283 | # })
284 | # })
285 | ```
286 |
287 | An example of the JNLI dataset looks as follows:
288 |
289 | ```json
290 | {
291 | "sentence_pair_id": "1157",
292 | "yjcaptions_id": "127202-129817-129818",
293 | "sentence1": "街中の道路を大きなバスが走っています。 (A big bus is running on the road in the city.)",
294 | "sentence2": "道路を大きなバスが走っています。 (There is a big bus running on the road.)",
295 | "label": "entailment"
296 | }
297 | ```
298 |
299 | #### JSQuAD
300 |
301 | ```python
302 | from datasets import load_dataset
303 |
304 | dataset = load_dataset("shunk031/JGLUE", name="JSQuAD")
305 |
306 | print(dataset)
307 | # DatasetDict({
308 | # train: Dataset({
309 | # features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible'],
310 | # num_rows: 62859
311 | # })
312 | # validation: Dataset({
313 | # features: ['id', 'title', 'context', 'question', 'answers', 'is_impossible'],
314 | # num_rows: 4442
315 | # })
316 | # })
317 | ```
318 |
319 | An example of the JSQuAD looks as follows:
320 |
321 | ```json
322 | {
323 | "id": "a1531320p0q0",
324 | "title": "東海道新幹線",
325 | "context": "東海道新幹線 [SEP] 1987 年(昭和 62 年)4 月 1 日の国鉄分割民営化により、JR 東海が運営を継承した。西日本旅客鉄道(JR 西日本)が継承した山陽新幹線とは相互乗り入れが行われており、東海道新幹線区間のみで運転される列車にも JR 西日本所有の車両が使用されることがある。2020 年(令和 2 年)3 月現在、東京駅 - 新大阪駅間の所要時間は最速 2 時間 21 分、最高速度 285 km/h で運行されている。",
326 | "question": "2020 年(令和 2 年)3 月現在、東京駅 - 新大阪駅間の最高速度はどのくらいか。",
327 | "answers": {
328 | "text": ["285 km/h"],
329 | "answer_start": [182]
330 | },
331 | "is_impossible": false
332 | }
333 | ```
334 |
335 | #### JCommonsenseQA
336 |
337 | ```python
338 | from datasets import load_dataset
339 |
340 | dataset = load_dataset("shunk031/JGLUE", name="JCommonsenseQA")
341 |
342 | print(dataset)
343 | # DatasetDict({
344 | # train: Dataset({
345 | # features: ['q_id', 'question', 'choice0', 'choice1', 'choice2', 'choice3', 'choice4', 'label'],
346 | # num_rows: 8939
347 | # })
348 | # validation: Dataset({
349 | # features: ['q_id', 'question', 'choice0', 'choice1', 'choice2', 'choice3', 'choice4', 'label'],
350 | # num_rows: 1119
351 | # })
352 | # })
353 | ```
354 |
355 | An example of the JCommonsenseQA looks as follows:
356 |
357 | ```json
358 | {
359 | "q_id": 3016,
360 | "question": "会社の最高責任者を何というか? (What do you call the chief executive officer of a company?)",
361 | "choice0": "社長 (president)",
362 | "choice1": "教師 (teacher)",
363 | "choice2": "部長 (manager)",
364 | "choice3": "バイト (part-time worker)",
365 | "choice4": "部下 (subordinate)",
366 | "label": 0
367 | }
368 | ```
369 |
370 | ### Data Fields
371 |
372 | #### JSTS
373 |
374 | - `sentence_pair_id`: ID of the sentence pair
375 | - `yjcaptions_id`: sentence ids in yjcaptions (explained below)
376 | - `sentence1`: first sentence
377 | - `sentence2`: second sentence
378 | - `label`: sentence similarity: 5 (equivalent meaning) - 0 (completely different meaning)
379 |
380 | ##### Explanation for `yjcaptions_id`
381 |
382 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#explanation-for-yjcaptions_id), there are the following two cases:
383 |
384 | 1. sentence pairs in one image: `(image id)-(sentence1 id)-(sentence2 id)`
385 | - e.g., 723-844-847
386 | - a sentence id starting with "g" means a sentence generated by a crowdworker (e.g., 69501-75698-g103): only for JNLI
387 | 2. sentence pairs in two images: `(image id of sentence1)_(image id of sentence2)-(sentence1 id)-(sentence2 id)`
388 | - e.g., 91337_217583-96105-91680
389 |
390 | #### JCoLA
391 |
392 | From [JCoLA's README.md](https://github.com/osekilab/JCoLA#data-description) and [JCoLA's paper](https://arxiv.org/abs/2309.12676)
393 |
394 | - `uid`: unique id of the sentence
395 | - `source`: author and the year of publication of the source article
396 | - `label`: acceptability judgement label (0 for unacceptable, 1 for acceptable)
397 | - `diacritic`: acceptability judgement as originally notated in the source article
398 | - `sentence`: sentence (modified by the author if needed)
399 | - `original`: original sentence as presented in the source article
400 | - `translation`: English translation of the sentence as presented in the source article (if any)
401 | - `gloss`: gloss of the sentence as presented in the source article (if any)
402 | - `linguistic_phenomenon`
403 | - `argument_structure`: acceptability judgements based on the order of arguments and case marking
404 | - `binding`: acceptability judgements based on the binding of noun phrases
405 | - `control_raising`: acceptability judgements based on predicates that are categorized as control or raising
406 | - `ellipsis`: acceptability judgements based on the possibility of omitting elements in the sentences
407 | - `filler_gap`: acceptability judgements based on the dependency between the moved element and the gap
408 | - `island_effects`: acceptability judgements based on the restrictions on filler-gap dependencies such as wh-movements
409 | - `morphology`: acceptability judgements based on the morphology
410 | - `nominal_structure`: acceptability judgements based on the internal structure of noun phrases
411 | - `negative_polarity_concord_items`: acceptability judgements based on the restrictions on where negative polarity/concord items (NPIs/NCIs) can appear
412 | - `quantifier`: acceptability judgements based on the distribution of quantifiers such as floating quantifiers
413 | - `verbal_agreement`: acceptability judgements based on the dependency between subjects and verbs
414 | - `simple`: acceptability judgements that do not have marked syntactic structures
415 |
416 | #### JNLI
417 |
418 | - `sentence_pair_id`: ID of the sentence pair
419 | - `yjcaptions_id`: sentence ids in the yjcaptions
420 | - `sentence1`: premise sentence
421 | - `sentence2`: hypothesis sentence
422 | - `label`: inference relation
423 |
424 | #### JSQuAD
425 |
426 | - `title`: title of a Wikipedia article
427 | - `paragraphs`: a set of paragraphs
428 | - `qas`: a set of pairs of a question and its answer
429 | - `question`: question
430 | - `id`: id of a question
431 | - `answers`: a set of answers
432 | - `text`: answer text
433 | - `answer_start`: start position (character index)
434 | - `is_impossible`: all the values are false
435 | - `context`: a concatenation of the title and paragraph
436 |
437 | #### JCommonsenseQA
438 |
439 | - `q_id`: ID of the question
440 | - `question`: question
441 | - `choice{0..4}`: choice
442 | - `label`: correct choice id
443 |
444 | ### Data Splits
445 |
446 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE/blob/main/README.md#tasksdatasets):
447 |
448 | > Only train/dev sets are available now, and the test set will be available after the leaderboard is made public.
449 |
450 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
451 |
452 | > The in-domain data is split into training data (6,919 instances), development data (865 instances), and test data (865 instances). On the other hand, the out-of-domain data is only used for evaluation, and divided into development data (685 instances) and test data (686 instances).
453 |
454 | | Task | Dataset | Train | Dev | Test |
455 | |------------------------------|----------------|--------:|------:|------:|
456 | | Text Classification | MARC-ja | 187,528 | 5,654 | 5,639 |
457 | | | JCoLA | 6,919 | 865† / 685‡ | 865† / 685‡ |
458 | | Sentence Pair Classification | JSTS | 12,451 | 1,457 | 1,589 |
459 | | | JNLI | 20,073 | 2,434 | 2,508 |
460 | | Question Answering | JSQuAD | 62,859 | 4,442 | 4,420 |
461 | | | JCommonsenseQA | 8,939 | 1,119 | 1,118 |
462 |
463 | > JCoLA: † in domain. ‡ out of domain.
464 |
465 | ## Dataset Creation
466 |
467 | ### Curation Rationale
468 |
469 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
470 |
471 | > JGLUE is designed to cover a wide range of GLUE and SuperGLUE tasks and consists of three kinds of tasks: text classification, sentence pair classification, and question answering.
472 |
473 | ### Source Data
474 |
475 | #### Initial Data Collection and Normalization
476 |
477 | [More Information Needed]
478 |
479 | #### Who are the source language producers?
480 |
481 | - The source language producers are users of Amazon (MARC-ja), crowd-workers of [Yahoo! Crowdsourcing](https://crowdsourcing.yahoo.co.jp/) (JSTS, JNLI and JCommonsenseQA), writers of the Japanese Wikipedia (JSQuAD), crowd-workers of [Lancers](https://www.lancers.jp/).
482 |
483 | ### Annotations
484 |
485 | #### Annotation process
486 |
487 | ##### MARC-ja
488 |
489 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
490 |
491 | > As one of the text classification datasets, we build a dataset based on the Multilingual Amazon Reviews Corpus (MARC) (Keung et al., 2020). MARC is a multilingual corpus of product reviews with 5-level star ratings (1-5) on the Amazon shopping site. This corpus covers six languages, including English and Japanese. For JGLUE, we use the Japanese part of MARC and to make it easy for both humans and computers to judge a class label, we cast the text classification task as a binary classification task, where 1- and 2-star ratings are converted to “negative”, and 4 and 5 are converted to “positive”. We do not use reviews with a 3-star rating.
492 |
493 | > One of the problems with MARC is that it sometimes contains data where the rating diverges from the review text. This happens, for example, when a review with positive content is given a rating of 1 or 2. These data degrade the quality of our dataset. To improve the quality of the dev/test instances used for evaluation, we crowdsource a positive/negative judgment task for approximately 12,000 reviews. We adopt only reviews with the same votes from 7 or more out of 10 workers and assign a label of the maximum votes to these reviews. We divide the resulting reviews into dev/test data.
494 |
495 | > We obtained 5,654 and 5,639 instances for the dev and test data, respectively, through the above procedure. For the training data, we extracted 187,528 instances directly from MARC without performing the cleaning procedure because of the large number of training instances. The statistics of MARC-ja are listed in Table 2. For the evaluation metric for MARC-ja, we use accuracy because it is a binary classification task of texts.
496 |
497 | ##### JCoLA
498 |
499 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
500 |
501 | > ### 3 JCoLA
502 | > In this study, we introduce JCoLA (Japanese Corpus of Linguistic Acceptability), which will be the first large-scale acceptability judgment task dataset focusing on Japanese. JCoLA consists of sentences from textbooks and handbooks on Japanese syntax, as well as from journal articles on Japanese syntax that are published in JEAL (Journal of East Asian Linguistics), one of the prestigious journals in theoretical linguistics.
503 |
504 | > #### 3.1 Data Collection
505 | > Sentences in JCoLA were collected from prominent textbooks and handbooks focusing on Japanese syntax. In addition to the main text, example sentences included in the footnotes were also considered for collection. We also collected acceptability judgments from journal articles on Japanese syntax published in JEAL (Journal of East Asian Linguistics): one of the prestigious journals in theoretical linguistics. Specifically, we examined all the articles published in JEAL between 2006 and 2015 (133 papers in total), and extracted 2,252 acceptability judgments from 26 papers on Japanese syntax (Table 2). Acceptability judgments include sentences in appendices and footnotes, but not sentences presented for analyses of syntactic structures (e.g. sentences with brackets to show their syntactic structures). As a result, a total of 11,984 example sentences were collected. Using this as a basis, JCoLA was constructed through the methodology explained in the following sections.
506 |
507 | ##### JSTS and JNLI
508 |
509 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
510 |
511 | > For the sentence pair classification datasets, we construct a semantic textual similarity (STS) dataset, JSTS, and a natural language inference (NLI) dataset, JNLI.
512 |
513 | > ### Overview
514 | > STS is a task of estimating the semantic similarity of a sentence pair. Gold similarity is usually assigned as an average of the integer values 0 (completely different meaning) to 5 (equivalent meaning) assigned by multiple workers through crowdsourcing.
515 |
516 | > NLI is a task of recognizing the inference relation that a premise sentence has to a hypothesis sentence. Inference relations are generally defined by three labels: “entailment”, “contradiction”, and “neutral”. Gold inference relations are often assigned by majority voting after collecting answers from multiple workers through crowdsourcing.
517 |
518 | > For the STS and NLI tasks, STS-B (Cer et al., 2017) and MultiNLI (Williams et al., 2018) are included in GLUE, respectively. As Japanese datasets, JSNLI (Yoshikoshi et al., 2020) is a machine translated dataset of the NLI dataset SNLI (Stanford NLI), and JSICK (Yanaka and Mineshima, 2021) is a human translated dataset of the STS/NLI dataset SICK (Marelli et al., 2014). As mentioned in Section 1, these have problems originating from automatic/manual translations. To solve this problem, we construct STS/NLI datasets in Japanese from scratch. We basically extract sentence pairs in JSTS and JNLI from the Japanese version of the MS COCO Caption Dataset (Chen et al., 2015), the YJ Captions Dataset (Miyazaki and Shimizu, 2016). Most of the sentence pairs in JSTS and JNLI overlap, allowing us to analyze the relationship between similarities and inference relations for the same sentence pairs like SICK and JSICK.
519 |
520 | > The similarity value in JSTS is assigned a real number from 0 to 5 as in STS-B. The inference relation in JNLI is assigned from the above three labels as in SNLI and MultiNLI. The definitions of the inference relations are also based on SNLI.
521 |
522 | > ### Method of Construction
523 | > Our construction flow for JSTS and JNLI is shown in Figure 1. Basically, two captions for the same image of YJ Captions are used as sentence pairs. For these sentence pairs, similarities and NLI relations of entailment and neutral are obtained by crowdsourcing. However, it is difficult to collect sentence pairs with low similarity and contradiction relations from captions for the same image. To solve this problem, we collect sentence pairs with low similarity from captions for different images. We collect contradiction relations by asking workers to write contradictory sentences for a given caption.
524 |
525 | > The detailed construction procedure for JSTS and JNLI is described below.
526 | > 1. We crowdsource an STS task using two captions for the same image from YJ Captions. We ask five workers to answer the similarity between two captions and take the mean value as the gold similarity. We delete sentence pairs with a large variance in the answers because such pairs have poor answer quality. We performed this task on 16,000 sentence pairs and deleted sentence pairs with a similarity variance of 1.0 or higher, resulting in the collection of 10,236 sentence pairs with gold similarity. We refer to this collected data as JSTS-A.
527 | > 2. To collect sentence pairs with low similarity, we crowdsource the same STS task as Step 1 using sentence pairs of captions for different images. We conducted this task on 4,000 sentence pairs and collected 2,970 sentence pairs with gold similarity. We refer to this collected data as JSTS-B.
528 | > 3. For JSTS-A, we crowdsource an NLI task. Since inference relations are directional, we obtain inference relations in both directions for sentence pairs. As mentioned earlier, it is difficult to collect instances of contradiction from JSTS-A, which was collected from the captions of the same images, and thus we collect instances of entailment and neutral in this step. We collect inference relation answers from 10 workers. If six or more people give the same answer, we adopt it as the gold label if it is entailment or neutral. To obtain inference relations in both directions for JSTS-A, we performed this task on 20,472 sentence pairs, twice as many as JSTS-A. As a result, we collected inference relations for 17,501 sentence pairs. We refer to this collected data as JNLI-A. We do not use JSTS-B for the NLI task because it is difficult to define and determine the inference relations between captions of different images.
529 | > 4. To collect NLI instances of contradiction, we crowdsource a task of writing four contradictory sentences for each caption in YJ Captions. From the written sentences, we remove sentence pairs with an edit distance of 0.75 or higher to remove low-quality sentences, such as short sentences and sentences with low relevance to the original sentence. Furthermore, we perform a one-way NLI task with 10 workers to verify whether the created sentence pairs are contradictory. Only the sentence pairs answered as contradiction by at least six workers are adopted. Finally, since the contradiction relation has no direction, we automatically assign contradiction in the opposite direction of the adopted sentence pairs. Using 1,800 captions, we acquired 7,200 sentence pairs, from which we collected 3,779 sentence pairs to which we assigned the one-way contradiction relation. By automatically assigning the contradiction relation in the opposite direction, we doubled the number of instances to 7,558. We refer to this collected data as JNLI-C.
530 | > 5. For the 3,779 sentence pairs collected in Step 4, we crowdsource an STS task, assigning similarity and filtering in the same way as in Steps 1 and 2. In this way, we collected 2,303 sentence pairs with gold similarity from 3,779 pairs. We refer to this collected data as JSTS-C.
531 |
532 | ##### JSQuAD
533 |
534 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
535 |
536 | > As QA datasets, we build a Japanese version of SQuAD (Rajpurkar et al., 2016), one of the datasets of reading comprehension, and a Japanese version of CommonsenseQA, which is explained in the next section.
537 |
538 | > Reading comprehension is the task of reading a document and answering questions about it. Many reading comprehension evaluation sets have been built in English, followed by those in other languages or multilingual ones.
539 |
540 | > In Japanese, reading comprehension datasets for quizzes (Suzuki et al., 2018) and those in the driving domain (Takahashi et al., 2019) have been built, but none are in the general domain. We use Wikipedia to build a dataset for the general domain. The construction process is basically based on SQuAD 1.1 (Rajpurkar et al., 2016).
541 |
542 | > First, to extract high-quality articles from Wikipedia, we use Nayuki, which estimates the quality of articles on the basis of hyperlinks in Wikipedia. We randomly chose 822 articles from the top-ranked 10,000 articles. For example, the articles include “熊本県 (Kumamoto Prefecture)” and “フランス料理 (French cuisine)”. Next, we divide an article into paragraphs, present each paragraph to crowdworkers, and ask them to write questions and answers that can be answered if one understands the paragraph. Figure 2 shows an example of JSQuAD. We ask workers to write two additional answers for the dev and test sets to make the system evaluation robust.
543 |
544 | ##### JCommonsenseQA
545 |
546 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
547 |
548 | > ### Overview
549 | > JCommonsenseQA is a Japanese version of CommonsenseQA (Talmor et al., 2019), which consists of five choice QA to evaluate commonsense reasoning ability. Figure 3 shows examples of JCommonsenseQA. In the same way as CommonsenseQA, JCommonsenseQA is built using crowdsourcing with seeds extracted from the knowledge base ConceptNet (Speer et al., 2017). ConceptNet is a multilingual knowledge base that consists of triplets of two concepts and their relation. The triplets are directional and represented as (source concept, relation, target concept), for example (bullet train, AtLocation, station).
550 |
551 | > ### Method of Construction
552 | > The construction flow for JCommonsenseQA is shown in Figure 4. First, we collect question sets (QSs) from ConceptNet, each of which consists of a source concept and three target concepts that have the same relation to the source concept. Next, for each QS, we crowdsource a task of writing a question with only one target concept as the answer and a task of adding two distractors. We describe the detailed construction procedure for JCommonsenseQA below, showing how it differs from CommonsenseQA.
553 |
554 | > 1. We collect Japanese QSs from ConceptNet. CommonsenseQA uses only forward relations (source concept, relation, target concept) excluding general ones such as “RelatedTo” and “IsA”. JCommonsenseQA similarly uses a set of 22 relations<sup>5</sup>, excluding general ones, but the direction of the relations is bidirectional to make the questions more diverse. In other words, we also use relations in the opposite direction (source concept, relation<sup>−1</sup>, target concept).<sup>6</sup> With this setup, we extracted 43,566 QSs with Japanese source/target concepts and randomly selected 7,500 from them.
555 | > 2. Some low-quality questions in CommonsenseQA contain distractors that can be considered to be an answer. To improve the quality of distractors, we add the following two processes that are not adopted in CommonsenseQA. First, if three target concepts of a QS include a spelling variation or a synonym of one another, this QS is removed. To identify spelling variations, we use the word ID of the morphological dictionary Juman Dic<sup>7</sup>. Second, we crowdsource a task of judging whether target concepts contain a synonym. As a result, we adopted 5,920 QSs from 7,500.
556 | > 3. For each QS, we crowdsource a task of writing a question sentence in which only one from the three target concepts is an answer. In the example shown in Figure 4, “駅 (station)” is an answer, and the others are distractors. To remove low quality question sentences, we remove the following question sentences.
557 | > - Question sentences that contain a choice word (this is because such a question is easily solved).
558 | > - Question sentences that contain the expression “XX characters”<sup>8</sup> (XX is a number).
559 | > - Improperly formatted question sentences that do not end with “?”.
560 | > - As a result, 5,920 × 3 = 17,760 question sentences were created, from which we adopted 15,310 by removing inappropriate question sentences.
561 | > 4. In CommonsenseQA, when adding distractors, one is selected from ConceptNet, and the other is created by crowdsourcing. In JCommonsenseQA, to have a wider variety of distractors, two distractors are created by crowdsourcing instead of selecting from ConceptNet. To improve the quality of the questions<sup>9</sup>, we remove questions whose added distractors fall into one of the following categories:
562 | > - Distractors are included in a question sentence.
563 | > - Distractors overlap with one of existing choices.
564 | > - As a result, distractors were added to the 15,310 questions, of which we adopted 13,906.
565 | > 5. We asked three crowdworkers to answer each question and adopt only those answered correctly by at least two workers. As a result, we adopted 11,263 out of the 13,906 questions.
566 |
567 | #### Who are the annotators?
568 |
569 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE/blob/main/README.md#tasksdatasets):
570 |
571 | > We use Yahoo! Crowdsourcing for all crowdsourcing tasks in constructing the datasets.
572 |
573 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
574 |
575 | > As a reference for the upper limit of accuracy in JCoLA, human acceptability judgment experiments were conducted on Lancers<sup>2</sup> with a subset of the JCoLA data.
576 |
577 | ### Personal and Sensitive Information
578 |
579 | [More Information Needed]
580 |
581 | ## Considerations for Using the Data
582 |
583 | ### Social Impact of Dataset
584 |
585 | From [JGLUE's paper](https://aclanthology.org/2022.lrec-1.317/):
586 |
587 | > We build a Japanese NLU benchmark, JGLUE, from scratch without translation to measure the general NLU ability in Japanese. We hope that JGLUE will facilitate NLU research in Japanese.
588 |
589 | ### Discussion of Biases
590 |
591 | [More Information Needed]
592 |
593 | ### Other Known Limitations
594 |
595 | From [JCoLA's paper](https://arxiv.org/abs/2309.12676):
596 |
597 | > All the sentences included in JCoLA have been extracted from textbooks, handbooks and journal articles on theoretical syntax. Therefore, those sentences are guaranteed to be theoretically meaningful, making JCoLA a challenging dataset. However, the distribution of linguistic phenomena directly reflects that of the source literature and thus turns out to be extremely skewed. Indeed, as can be seen in Table 3, while the number of sentences exceeds 100 for most linguistic phenomena, there are several linguistic phenomena for which there are only about 10 sentences. In addition, since it is difficult to force language models to interpret sentences given specific contexts, those sentences whose unacceptability depends on contexts were inevitably removed from JCoLA. This removal process resulted in the deletion of unacceptable sentences from some linguistic phenomena (such as ellipsis), consequently skewing the balance between acceptable and unacceptable sentences (with a higher proportion of acceptable sentences).
598 |
599 | ## Additional Information
600 |
601 | - 日本語言語理解ベンチマーク JGLUE の構築 〜 自然言語処理モデルの評価用データセットを公開しました - Yahoo! JAPAN Tech Blog https://techblog.yahoo.co.jp/entry/2022122030379907/
602 |
603 | ### Dataset Curators
604 |
605 | #### MARC-ja
606 |
607 | - Keung, Phillip, et al. "The Multilingual Amazon Reviews Corpus." Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2020.
608 |
609 | #### JCoLA
610 |
611 | - Someya, Sugimoto, and Oseki. "JCoLA: Japanese Corpus of Linguistic Acceptability." arXiv preprint arXiv:2309.12676 (2023).
612 |
613 | #### JSTS and JNLI
614 |
615 | - Miyazaki, Takashi, and Nobuyuki Shimizu. "Cross-lingual image caption generation." Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2016.
616 |
617 | #### JSQuAD
618 |
619 | The JGLUE authors curated the original data for JSQuAD from the Japanese Wikipedia dump.
620 |
621 | #### JCommonsenseQA
622 |
623 | In the same way as CommonsenseQA, JCommonsenseQA is built using crowdsourcing with seeds extracted from the knowledge base ConceptNet.
624 |
625 | ### Licensing Information
626 |
627 | #### JGLUE
628 |
629 | From [JGLUE's README.md](https://github.com/yahoojapan/JGLUE#license):
630 |
631 | > This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
632 |
633 | #### JCoLA
634 |
635 | From [JCoLA's README.md](https://github.com/osekilab/JCoLA#license):
636 |
637 | > The text in this corpus is excerpted from the published works, and copyright (where applicable) remains with the original authors or publishers. We expect that research use within Japan is legal under fair use, but make no guarantee of this.
638 |
639 | ### Citation Information
640 |
641 | #### JGLUE
642 |
643 | ```bibtex
644 | @inproceedings{kurihara-lrec-2022-jglue,
645 | title={JGLUE: Japanese general language understanding evaluation},
646 | author={Kurihara, Kentaro and Kawahara, Daisuke and Shibata, Tomohide},
647 | booktitle={Proceedings of the Thirteenth Language Resources and Evaluation Conference},
648 | pages={2957--2966},
649 | year={2022},
650 | url={https://aclanthology.org/2022.lrec-1.317/}
651 | }
652 | ```
653 |
654 | ```bibtex
655 | @inproceedings{kurihara-nlp-2022-jglue,
656 | title={JGLUE: 日本語言語理解ベンチマーク},
657 | author={栗原健太郎 and 河原大輔 and 柴田知秀},
658 | booktitle={言語処理学会第 28 回年次大会},
659 | pages={2023--2028},
660 | year={2022},
661 | url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E8-4.pdf},
662 | note={in Japanese}
663 | }
664 | ```
665 |
666 | #### MARC-ja
667 |
668 | ```bibtex
669 | @inproceedings{marc_reviews,
670 | title={The Multilingual Amazon Reviews Corpus},
671 | author={Keung, Phillip and Lu, Yichao and Szarvas, György and Smith, Noah A.},
672 | booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
673 | year={2020}
674 | }
675 | ```
676 |
677 | #### JCoLA
678 |
679 | ```bibtex
680 | @article{someya-arxiv-2023-jcola,
681 | title={JCoLA: Japanese Corpus of Linguistic Acceptability},
682 | author={Taiga Someya and Yushi Sugimoto and Yohei Oseki},
683 | year={2023},
684 | eprint={2309.12676},
685 | archivePrefix={arXiv},
686 | primaryClass={cs.CL}
687 | }
688 | ```
689 |
690 | ```bibtex
691 | @inproceedings{someya-nlp-2022-jcola,
692 | title={日本語版 CoLA の構築},
693 | author={染谷 大河 and 大関 洋平},
694 | booktitle={言語処理学会第 28 回年次大会},
695 | pages={1872--1877},
696 | year={2022},
697 | url={https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E7-1.pdf},
698 | note={in Japanese}
699 | }
700 | ```
701 |
702 | #### JSTS and JNLI
703 |
704 | ```bibtex
705 | @inproceedings{miyazaki2016cross,
706 | title={Cross-lingual image caption generation},
707 | author={Miyazaki, Takashi and Shimizu, Nobuyuki},
708 | booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
709 | pages={1780--1790},
710 | year={2016}
711 | }
712 | ```
713 |
714 | ### Contributions
715 |
716 | Thanks to [Kentaro Kurihara](https://twitter.com/kkurihara_cs), [Daisuke Kawahara](https://twitter.com/daisukekawahar1), and [Tomohide Shibata](https://twitter.com/stomohide) for creating the JGLUE dataset.
717 | Thanks to [Taiga Someya](https://twitter.com/T0a8i0g9a) for creating the JCoLA dataset.
718 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "huggingface-datasets-jglue"
3 | version = "1.2.0"
4 | description = "Dataset loading script for JGLUE: Japanese General Language Understanding Evaluation"
5 | authors = [
6 | { name = "Shunsuke Kitada", email = "shunsuke.kitada.0831@gmail.com" },
7 | ]
8 | readme = "README.md"
9 | requires-python = ">=3.8"
10 | dependencies = [
11 | "beautifulsoup4>=4.11.2",
12 | "datasets>=3.0.0",
13 | "mecab-python3>=1.0.6",
14 | "mojimoji>=0.0.12",
15 | "pyknp>=0.6.1",
16 | ]
17 |
18 | [dependency-groups]
19 | dev = ["mypy>=1.0.0", "pytest>=6.0.0", "ruff>=0.1.5"]
20 |
21 | [tool.mypy]
22 | python_version = "3.10"
23 | ignore_missing_imports = true
24 |
--------------------------------------------------------------------------------
/tests/JGLUE_test.py:
--------------------------------------------------------------------------------
1 | import datasets as ds
2 | import pytest
3 |
4 | # In datasets>=3.0.0, HF_DATASETS_TRUST_REMOTE_CODE defaults to False,
5 | # which triggers confirmation dialogs when loading datasets and interrupts testing.
6 | # Therefore, HF_DATASETS_TRUST_REMOTE_CODE is set to True.
7 | ds.config.HF_DATASETS_TRUST_REMOTE_CODE = True
8 |
9 |
@pytest.fixture
def dataset_path() -> str:
    """Provide the path of the dataset loading script given to ``ds.load_dataset``.

    The path is relative, so it is resolved against the directory the tests
    are launched from.
    """
    loading_script = "JGLUE.py"
    return loading_script
13 |
14 |
@pytest.mark.parametrize(
    # An explicit tuple of names avoids the stray trailing comma that the
    # comma-separated string form carried.
    argnames=("dataset_name", "expected_num_train", "expected_num_valid"),
    argvalues=(
        ("JSTS", 12451, 1457),
        ("JNLI", 20073, 2434),
        ("JSQuAD", 62859, 4442),
        ("JCommonsenseQA", 8939, 1119),
    ),
)
def test_load_dataset(
    dataset_path: str,
    dataset_name: str,
    expected_num_train: int,
    expected_num_valid: int,
):
    """Load one JGLUE configuration and check the size of its splits.

    Args:
        dataset_path: Path of the loading script (``dataset_path`` fixture).
        dataset_name: Name of the configuration to load.
        expected_num_train: Expected number of rows in the ``train`` split.
        expected_num_valid: Expected number of rows in the ``validation`` split.
    """
    dataset = ds.load_dataset(path=dataset_path, name=dataset_name)
    assert isinstance(dataset, ds.DatasetDict)

    assert dataset["train"].num_rows == expected_num_train
    assert dataset["validation"].num_rows == expected_num_valid
35 |
36 |
def test_load_marc_ja(
    dataset_path: str,
    dataset_name: str = "MARC-ja",
    expected_num_train: int = 187528,
    expected_num_valid: int = 5654,
):
    """Load MARC-ja with its extra loader options and check the split sizes.

    ``dataset_path`` comes from the fixture of the same name; the remaining
    parameters have defaults that serve as this test's expected values.
    """
    # Extra keyword arguments passed through to ``ds.load_dataset`` for MARC-ja.
    marc_ja_options = {
        "is_pos_neg": True,
        "max_char_length": 500,
        "filter_review_id_list_valid": True,
        "label_conv_review_id_list_valid": True,
    }
    loaded = ds.load_dataset(path=dataset_path, name=dataset_name, **marc_ja_options)
    assert isinstance(loaded, ds.DatasetDict)

    expected_rows_by_split = (
        ("train", expected_num_train),
        ("validation", expected_num_valid),
    )
    for split_name, expected_rows in expected_rows_by_split:
        assert loaded[split_name].num_rows == expected_rows
55 |
56 |
def test_load_jcola(
    dataset_path: str,
    dataset_name: str = "JCoLA",
    expected_num_train: int = 6919,
    expected_num_valid: int = 865,
    expected_num_valid_ood: int = 685,
):
    """Load JCoLA and check the row count of each of its four splits.

    Both out-of-domain validation splits are checked against the same
    expected size, ``expected_num_valid_ood``.
    """
    loaded = ds.load_dataset(path=dataset_path, name=dataset_name)
    assert isinstance(loaded, ds.DatasetDict)

    expected_rows_by_split = (
        ("train", expected_num_train),
        ("validation", expected_num_valid),
        ("validation_out_of_domain", expected_num_valid_ood),
        ("validation_out_of_domain_annotated", expected_num_valid_ood),
    )
    for split_name, expected_rows in expected_rows_by_split:
        assert loaded[split_name].num_rows == expected_rows
73 |
74 |
def test_jglue_version():
    """Check that the loader's JGLUE version agrees with the project version.

    Only the major and minor components of ``JGLUE.JGLUE_VERSION`` and of
    ``project.version`` in ``pyproject.toml`` are compared; the third
    component of each version is ignored.
    """
    import sys

    # ``tomli`` is not a declared dependency of this project, so prefer the
    # standard-library ``tomllib`` (Python 3.11+) and fall back to the
    # ``tomli`` backport only on older interpreters.
    if sys.version_info >= (3, 11):
        import tomllib
    else:
        import tomli as tomllib

    from JGLUE import JGLUE

    jglue_version = JGLUE.JGLUE_VERSION
    jglue_major, jglue_minor, _ = jglue_version.tuple

    # The TOML parser requires a file object opened in binary mode.
    with open("pyproject.toml", "rb") as rf:
        pyproject_toml = tomllib.load(rf)

    project_version = ds.Version(pyproject_toml["project"]["version"])
    proj_major, proj_minor, _ = project_version.tuple

    assert jglue_major == proj_major and jglue_minor == proj_minor, (
        f"JGLUE and project version mismatch: {jglue_version=} != {project_version=}"
    )
92 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shunk031/huggingface-datasets_JGLUE/41cc99f1c01d41b1ab13435ae97b7076c2199f4c/tests/__init__.py
--------------------------------------------------------------------------------