├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── ci.yml
├── competitions
│   ├── mrc_baseline.ipynb
│   ├── mrc_utils.py
│   ├── sentiment_analysis_baseline.ipynb
│   ├── textual_similarity_baseline.ipynb
│   └── utils.py
├── examples
│   ├── assets
│   │   └── utils.py
│   ├── evaluation
│   │   ├── zh-pretrain-ernie-1.0-evaluation.ipynb
│   │   └── zh-sentiment-analysis-evaluation.ipynb
│   └── interpretation
│       ├── example_level
│       │   ├── en-similarity_for_reppoint.ipynb
│       │   ├── zh-sentiment-analysis_for_feature_sim.ipynb
│       │   ├── zh-sentiment-analysis_for_gradient_sim.ipynb
│       │   └── zh-sentiment-analysis_for_reppoint.ipynb
│       └── token_level
│           ├── zh-sentiment-analysis.ipynb
│           ├── zh-sentiment-analysis_for_normlime.ipynb
│           └── zh-similarity.ipynb
├── imgs
│   ├── bias.png
│   ├── bias_correction.png
│   ├── data_map.png
│   ├── data_map_criterion.png
│   ├── data_map_lt.png
│   ├── data_map_main.png
│   ├── data_map_normal.png
│   ├── dirty-accuracy.png
│   ├── dirty.png
│   ├── dirty_analysis.png
│   ├── equation1.png
│   ├── equation2.png
│   ├── equation3.png
│   ├── equation4.png
│   ├── equation5.png
│   ├── example.gif
│   ├── example.png
│   ├── example_case.png
│   ├── framework.png
│   ├── overview.png
│   ├── paddlenlp脏数据识别及修正.png
│   ├── paddlenlp覆盖不足识别及有效增强.png
│   ├── pretrained_labeled_case.png
│   ├── process-for-dirty.png
│   ├── process-for-sparse.png
│   ├── rationale.png
│   ├── rationale_example.png
│   ├── redundancy_removal.png
│   ├── saliency_map.png
│   ├── sentiment_labeled_case.png
│   ├── sparse.png
│   ├── sparse_analysis.png
│   ├── target-performance.png
│   ├── token.gif
│   ├── token.png
│   ├── trustai.png
│   ├── visual.png
│   ├── visual2.png
│   ├── visual3.png
│   ├── why_trustai.png
│   └── 覆盖不足识别.png
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   └── interpretation
│       ├── __init__.py
│       ├── assets
│       │   └── utils.py
│       └── example_level
│           ├── __init__.py
│           ├── test_feature_similarity.py
│           ├── test_gradient_similarity.py
│           └── test_reppoint.py
├── trustai
│   ├── .gitignore
│   ├── __init__.py
│   ├── demo
│   │   ├── __init__.py
│   │   ├── demo.py
│   │   └── utils.py
│   ├── evaluation
│   │   ├── README.md
│   │   ├── __init__.py
│   │   └── evaluator.py
│   └── interpretation
│       ├── __init__.py
│       ├── base_interpret.py
│       ├── example_level
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── common
│       │   │   ├── __init__.py
│       │   │   ├── data_class.py
│       │   │   └── utils.py
│       │   └── method
│       │       ├── __init__.py
│       │       ├── example_base_interpreter.py
│       │       ├── feature_similarity.py
│       │       ├── gradient_similarity.py
│       │       └── representer_point.py
│       ├── python_utils.py
│       └── token_level
│           ├── README.md
│           ├── __init__.py
│           ├── common
│           │   ├── __init__.py
│           │   ├── postprocess_attribution.py
│           │   └── predict_functions.py
│           ├── data_processor
│           │   ├── __init__.py
│           │   ├── data_class.py
│           │   └── visualizer.py
│           └── method
│               ├── __init__.py
│               ├── attention.py
│               ├── base_interpret.py
│               ├── gradient_shap.py
│               ├── integrated_gradients.py
│               ├── lime.py
│               └── norm_lime.py
└── tutorials
    ├── README.md
    ├── data_bias_identification
    │   ├── data_distribution_correction
    │   │   ├── README.md
    │   │   ├── balance_train_data.py
    │   │   ├── get_rationale_importance.py
    │   │   ├── train.py
    │   │   └── utils.py
    │   └── less_learn_shortcut
    │       ├── README.md
    │       ├── find_bias_word.py
    │       ├── lls.py
    │       ├── train.py
    │       └── utils.py
    ├── data_map
    │   ├── README.md
    │   ├── data.py
    │   ├── plot_map.py
    │   ├── run_train_pointwise.sh
    │   ├── sample_100.tsv
    │   ├── sample_stat_summary.py
    │   └── train_pointwise.py
    ├── dirty_data_identification
    │   ├── README.md
    │   ├── find_dirty_data.py
    │   ├── train.py
    │   └── utils.py
    ├── enhanced_by_rationale
    │   ├── README.md
    │   ├── train.py
    │   └── utils.py
    ├── map_analysis
    │   ├── utils.py
    │   └── zh-similarity-application.ipynb
    ├── redundancy_removal
    │   ├── README.md
    │   ├── args.py
    │   ├── download.sh
    │   ├── predictor
    │   │   ├── dataloader_factory.py
    │   │   ├── model.py
    │   │   └── model_manager.py
    │   ├── requirements.txt
    │   ├── run_predict.py
    │   ├── run_select.py
    │   ├── selector
    │   │   ├── dataloader_factory.py
    │   │   ├── model.py
    │   │   └── model_manager.py
    │   ├── test.sh
    │   ├── train.sh
    │   ├── train_predictor.sh
    │   ├── train_select_data.sh
    │   ├── train_selector.sh
    │   └── utils
    │       ├── checklist_process.py
    │       ├── dureader_robust.py
    │       ├── logger.py
    │       ├── predict.py
    │       └── tools.py
    └── sparse_data_identification
        ├── README.md
        ├── find_sparse_data.py
        ├── find_valid_data.py
        ├── train.py
        └── utils.py
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ main, pattern1 ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ main ]
20 | schedule:
21 | - cron: '19 6 * * 5'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ python ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v3
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v2
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v2
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v2
71 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Virtualenv
2 | /.venv/
3 | /venv/
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | /bin/
14 | /build/
15 | /develop-eggs/
16 | /dist/
17 | /eggs/
18 | /lib/
19 | /lib64/
20 | /output/
21 | /parts/
22 | /sdist/
23 | /var/
24 | /*.egg-info/
25 | /.installed.cfg
26 | /*.egg
27 | /.eggs
28 |
29 | # AUTHORS and ChangeLog will be generated while packaging
30 | /AUTHORS
31 | /ChangeLog
32 |
33 | # BCloud / BuildSubmitter
34 | /build_submitter.*
35 | /logger_client_log
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | .tox/
43 | .coverage
44 | .cache
45 | .pytest_cache
46 | nosetests.xml
47 | coverage.xml
48 | .idea
49 | # Translations
50 | *.mo
51 |
52 | # Sphinx documentation
53 | /docs/_build/
54 |
55 |
56 | *bak
57 | *copy*
58 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/mirrors-yapf
3 | rev: v0.32.0
4 | hooks:
5 | - id: yapf
6 | files: \.py$
7 | args: ["--style={column_limit: 120}"]
8 | - repo: https://github.com/pre-commit/pre-commit-hooks
9 | rev: a11d9314b22d8f8c7556443875b731ef05965464
10 | hooks:
11 | - id: check-merge-conflict
12 | - id: check-symlinks
13 | - id: detect-private-key
14 | - id: end-of-file-fixer
15 | files: \.md$
16 | - id: trailing-whitespace
17 | files: \.md$
18 | - repo: https://github.com/Lucas-C/pre-commit-hooks
19 | rev: v1.0.1
20 | hooks:
21 | - id: forbid-crlf
22 | files: \.md$
23 | - id: remove-crlf
24 | files: \.md$
25 | - id: forbid-tabs
26 | files: \.md$
27 | - id: remove-tabs
28 | exclude: (\.tsv)$
--------------------------------------------------------------------------------
/ci.yml:
--------------------------------------------------------------------------------
1 | Global:
2 | tool : build_submitter
3 |
4 | Default:
5 | profile : [publish]
6 |
7 | Profiles:
8 | - profile:
9 | name : dev
10 | env: DECK_CENTOS6U3_K3
11 | command : python setup.py bdist_wheel
12 | release : true
13 |
14 | - profile:
15 | name : publish
16 | env: DECK_CENTOS6U3_K3
17 | command : python setup.py bdist_wheel
18 | release : true
19 |
--------------------------------------------------------------------------------
/imgs/bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/bias.png
--------------------------------------------------------------------------------
/imgs/bias_correction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/bias_correction.png
--------------------------------------------------------------------------------
/imgs/data_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map.png
--------------------------------------------------------------------------------
/imgs/data_map_criterion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_criterion.png
--------------------------------------------------------------------------------
/imgs/data_map_lt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_lt.png
--------------------------------------------------------------------------------
/imgs/data_map_main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_main.png
--------------------------------------------------------------------------------
/imgs/data_map_normal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_normal.png
--------------------------------------------------------------------------------
/imgs/dirty-accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/dirty-accuracy.png
--------------------------------------------------------------------------------
/imgs/dirty.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/dirty.png
--------------------------------------------------------------------------------
/imgs/dirty_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/dirty_analysis.png
--------------------------------------------------------------------------------
/imgs/equation1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation1.png
--------------------------------------------------------------------------------
/imgs/equation2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation2.png
--------------------------------------------------------------------------------
/imgs/equation3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation3.png
--------------------------------------------------------------------------------
/imgs/equation4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation4.png
--------------------------------------------------------------------------------
/imgs/equation5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation5.png
--------------------------------------------------------------------------------
/imgs/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/example.gif
--------------------------------------------------------------------------------
/imgs/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/example.png
--------------------------------------------------------------------------------
/imgs/example_case.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/example_case.png
--------------------------------------------------------------------------------
/imgs/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/framework.png
--------------------------------------------------------------------------------
/imgs/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/overview.png
--------------------------------------------------------------------------------
/imgs/paddlenlp脏数据识别及修正.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/paddlenlp脏数据识别及修正.png
--------------------------------------------------------------------------------
/imgs/paddlenlp覆盖不足识别及有效增强.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/paddlenlp覆盖不足识别及有效增强.png
--------------------------------------------------------------------------------
/imgs/pretrained_labeled_case.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/pretrained_labeled_case.png
--------------------------------------------------------------------------------
/imgs/process-for-dirty.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/process-for-dirty.png
--------------------------------------------------------------------------------
/imgs/process-for-sparse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/process-for-sparse.png
--------------------------------------------------------------------------------
/imgs/rationale.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/rationale.png
--------------------------------------------------------------------------------
/imgs/rationale_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/rationale_example.png
--------------------------------------------------------------------------------
/imgs/redundancy_removal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/redundancy_removal.png
--------------------------------------------------------------------------------
/imgs/saliency_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/saliency_map.png
--------------------------------------------------------------------------------
/imgs/sentiment_labeled_case.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/sentiment_labeled_case.png
--------------------------------------------------------------------------------
/imgs/sparse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/sparse.png
--------------------------------------------------------------------------------
/imgs/sparse_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/sparse_analysis.png
--------------------------------------------------------------------------------
/imgs/target-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/target-performance.png
--------------------------------------------------------------------------------
/imgs/token.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/token.gif
--------------------------------------------------------------------------------
/imgs/token.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/token.png
--------------------------------------------------------------------------------
/imgs/trustai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/trustai.png
--------------------------------------------------------------------------------
/imgs/visual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/visual.png
--------------------------------------------------------------------------------
/imgs/visual2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/visual2.png
--------------------------------------------------------------------------------
/imgs/visual3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/visual3.png
--------------------------------------------------------------------------------
/imgs/why_trustai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/why_trustai.png
--------------------------------------------------------------------------------
/imgs/覆盖不足识别.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/覆盖不足识别.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | paddlenlp
3 | paddlepaddle-gpu >= 2.0.0
4 | scikit-learn
5 | tqdm
6 | matplotlib
7 | IPython
8 | pre-commit
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # Here's a link about setup.cfg
2 | # https://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files
3 | [metadata]
4 | # Project name, the project name will be used while publishing and installing
5 | name = trustai
6 | # Author's name and email address
7 | author = Baidu NLP
8 | author_email = nlp-parser@baidu.com
9 | # Project version. Only versions above 1.0 will be treated as released versions.
10 | # When bumping the project version above 1.0, the following rules should be observed.
11 | # http://wiki.baidu.com/pages/viewpage.action?pageId=469686381
12 | version = 0.1.12
13 | # A brief introduction to the project. NON-ENGLISH CHARACTERS ARE NOT SUPPORTED!
14 | description = baidu TrustAI
15 | # A longer introduction to the project; you can also include the readme, change log, etc. A .md or .rst file is recommended.
16 | long_description = file: README.md
17 | long_description_content_type = text/markdown
18 | # Main page of the project, usually the project's icode page; you can point it to a wiki or other documentation URL instead.
19 | home_page = https://github.com/PaddlePaddle/TrustAI
20 | # License; you can ignore this if the project is not going to be open sourced to the public.
21 | license = Apache License 2.0
22 | # Project classifiers; you can ignore this if the project is not going to be open sourced to the public.
23 | # Choose the right classifiers from PyPI's official list.
24 | # https://pypi.org/pypi?%3Aaction=list_classifiers
25 | classifier =
26 | Programming Language :: Python :: 3
27 | Programming Language :: Python :: 3.6
28 | Programming Language :: Python :: 3.7
29 | Programming Language :: Python :: 3.8
30 | License :: OSI Approved :: Apache Software License
31 | Operating System :: OS Independent
32 | # Keywords, used for indexing, so that users interested in your project can find it more easily.
33 | keywords =
34 | baidu
35 | TrustAI
36 | interpretation
37 |
38 | [options]
39 | # Package discovery. "find:" means packages are discovered automatically; detailed configuration can go under options.packages.find.
40 | packages = find:
41 | # Dependency management; all of the project's dependencies are listed here.
42 | # One dependency per line; only direct dependencies are needed, transitive dependencies are resolved automatically.
43 | # Versions should be as loose as possible, usually just a range with a minimum and a maximum version.
44 | install_requires =
45 | scikit-learn
46 | numpy
47 | tqdm
48 | matplotlib
49 | IPython
50 |
51 | # Test dependencies; all dependencies needed for tests go here. The format is the same as install_requires.
52 | # You can use the built-in unittest, or a simpler framework such as pytest or nose.
53 | # Python 3 ships with a mock library, but Python 2 does not; add it as needed.
54 | #tests_require =
55 | # pytest
56 | # mock
57 |
58 | # Directory for unit tests
59 | test_suite = trustai.tests
60 | # Include all data files tracked by git
61 | include_package_data = True
62 | # A plain Python project can be installed and run from zipped source code
63 | zip_safe = False
64 |
65 | # Set this configuration to let users run the main entry point directly
66 | #[options.entry_points]
67 | #console_scripts =
68 | # TrustAI = trustai.cmdline:main
69 |
70 | # You can add conf/data directories to the package; the following directories will be installed under site-packages.
71 | # Only files are supported, but wildcards can be used.
72 | #[options.package_data]
73 | #trustai =
74 | # conf/*
75 | # data/*
76 |
77 | [sdist]
78 | dist_dir = output/dist
79 |
80 | [bdist_wheel]
81 | # Set universal=1 if this project can run in both Python 2 and Python 3 environments.
82 | #universal=1
83 | dist_dir = output/dist
84 |
85 | [easy_install]
86 | # using baidu's official pip source
87 | index_url = http://pip.baidu.com/root/baidu/+simple/
88 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 | ################################################################################
4 | #
5 | # Copyright (c) 2022 Baidu.com, Inc. All Rights Reserved
6 | #
7 | ################################################################################
8 | """
9 | Setup script.
10 |
11 | Authors: zhangshuai28(zhangshuai28@baidu.com)
12 | Date: 2022/03/14 14:53:37
13 | """
14 |
15 | import setuptools
16 |
17 | setuptools.setup()
18 |
19 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/tests/__init__.py
--------------------------------------------------------------------------------
/tests/interpretation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/tests/interpretation/__init__.py
--------------------------------------------------------------------------------
/tests/interpretation/example_level/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/tests/interpretation/example_level/__init__.py
--------------------------------------------------------------------------------
/tests/interpretation/example_level/test_feature_similarity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 | import unittest
5 | from functools import partial
6 |
7 | import paddle
8 | import numpy as np
9 | from paddlenlp.data import Stack, Tuple, Pad, Vocab, JiebaTokenizer
10 | from paddlenlp.datasets import load_dataset, MapDataset
11 | from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
12 |
13 | sys.path.insert(0, '../')
14 | sys.path.insert(0, '../../../')
15 | from assets.utils import (
16 | create_dataloader,
17 | convert_example,
18 | create_dataloader_from_scratch,
19 | LSTMModel,
20 | preprocess_fn_lstm,
21 | get_sublayer,
22 | )
23 | from trustai.interpretation.example_level.method.feature_similarity import (
24 | FeatureSimilarityModel, )
25 |
26 |
27 | class TestFeatureSimilarity(unittest.TestCase):
28 |
29 | def test_bert_model(self):
30 | MODEL_NAME = "ernie-1.0"
31 | DATASET_NAME = "chnsenticorp"
32 | paddle_model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)
33 | tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)
34 | state_dict = paddle.load(f"../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams")
35 | paddle_model.set_dict(state_dict)
36 |
37 | train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"])
38 |
39 | batch_size = 32
40 | max_seq_length = 128
41 |
42 | trans_func = partial(
43 | convert_example,
44 | tokenizer=tokenizer,
45 | max_seq_length=max_seq_length,
46 | is_test=True,
47 | )
48 | batchify_fn = lambda samples, fn=Tuple(
49 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
50 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
51 | ): [data for data in fn(samples)]
52 |
53 | train_data_loader = create_dataloader(
54 | train_ds,
55 | mode="train",
56 | batch_size=batch_size,
57 | batchify_fn=batchify_fn,
58 | trans_fn=trans_func,
59 | shuffle=False,
60 | )
61 |
62 | feature_sim_model = FeatureSimilarityModel(paddle_model, train_data_loader, classifier_layer_name="classifier")
63 |
64 | def test_predict_fn(self):
65 |
66 | def predict_fn(inputs, paddle_model, classifier_layer_name="classifier"):
67 | """predict_fn"""
68 |
69 | x_feature = []
70 |
71 | def forward_pre_hook(layer, input):
72 | """
73 | Hook for a given layer in model.
74 | """
75 | x_feature.extend(input[0])
76 |
77 | classifier = get_sublayer(paddle_model, classifier_layer_name)
78 |
79 | forward_pre_hook_handle = classifier.register_forward_pre_hook(forward_pre_hook)
80 |
81 | if isinstance(inputs, (tuple, list)):
82 | logits = paddle_model(*inputs) # get logits, [bs, num_c]
83 | else:
84 | logits = paddle_model(inputs) # get logits, [bs, num_c]
85 |
86 | forward_pre_hook_handle.remove()
87 |
88 | probas = paddle.nn.functional.softmax(logits, axis=1) # get probabilities.
89 | preds = paddle.argmax(probas, axis=1) # get predictions.
90 | x_feature = paddle.to_tensor(x_feature)
91 | return x_feature, probas, preds
92 |
93 | MODEL_NAME = "ernie-1.0"
94 | DATASET_NAME = "chnsenticorp"
95 | paddle_model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)
96 | tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)
97 | state_dict = paddle.load(f"../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams")
98 | paddle_model.set_dict(state_dict)
99 |
100 | train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"])
101 |
102 | batch_size = 32
103 | max_seq_length = 128
104 |
105 | trans_func = partial(
106 | convert_example,
107 | tokenizer=tokenizer,
108 | max_seq_length=max_seq_length,
109 | is_test=True,
110 | )
111 | batchify_fn = lambda samples, fn=Tuple(
112 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
113 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
114 | ): [data for data in fn(samples)]
115 |
116 | predict_fn_test = partial(predict_fn, paddle_model=paddle_model)
117 |
118 | train_data_loader = create_dataloader(
119 | train_ds,
120 | mode="train",
121 | batch_size=batch_size,
122 | batchify_fn=batchify_fn,
123 | trans_fn=trans_func,
124 | shuffle=False,
125 | )
126 |
127 | feature_sim_model = FeatureSimilarityModel(
128 | paddle_model,
129 | train_data_loader,
130 | classifier_layer_name="classifier",
131 | predict_fn=predict_fn_test,
132 | )
133 |
134 | def test_lstm_model(self):
135 | PARAMS_PATH = "../assets/chnsenticorp-bilstm/final.pdparams"
136 | VOCAB_PATH = "../assets/chnsenticorp-bilstm/bilstm_word_dict.txt"
137 | vocab = Vocab.from_json(VOCAB_PATH)
138 | tokenizer = JiebaTokenizer(vocab)
139 | label_map = {0: "negative", 1: "positive"}
140 | vocab_size = len(vocab)
141 | num_classes = len(label_map)
142 | pad_token_id = vocab.to_indices("[PAD]")
143 |
144 | DATASET_NAME = "chnsenticorp"
145 | paddle_model = LSTMModel(vocab_size, num_classes, direction="bidirect", padding_idx=pad_token_id)
146 | state_dict = paddle.load(PARAMS_PATH)
147 | paddle_model.set_dict(state_dict)
148 |
149 | train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"])
150 |
151 | # train_ds = [d['text'] for d in list(train_ds)[:1200]]
152 | # train_ds = [d["text"] for d in list(train_ds)]
153 | # train_ds = MapDataset(train_ds)
154 |
155 | batch_size = 32
156 | max_seq_length = 128
157 |
158 | trans_func = partial(preprocess_fn_lstm, tokenizer=tokenizer, is_test=True)
159 | batchify_fn = lambda samples, fn=Tuple(
160 | Pad(axis=0, pad_val=pad_token_id), # input
161 | Pad(axis=0, pad_val=pad_token_id), # sequence_length
162 | ): [data for data in fn(samples)]
163 |
164 | train_data_loader = create_dataloader(
165 | train_ds,
166 | mode="train",
167 | batch_size=batch_size,
168 | batchify_fn=batchify_fn,
169 | trans_fn=trans_func,
170 | shuffle=False,
171 | )
172 |
173 | feature_sim_model = FeatureSimilarityModel(paddle_model,
174 | train_data_loader,
175 | classifier_layer_name="output_layer")
176 |
177 |
178 | if __name__ == "__main__":
179 | unittest.main()
180 |
--------------------------------------------------------------------------------
/trustai/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 |
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 |
111 | # SageMath parsed files
112 | *.sage.py
113 |
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 |
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 |
127 | # Rope project settings
128 | .ropeproject
129 |
130 | # mkdocs documentation
131 | /site
132 |
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 |
138 | # Pyre type checker
139 | .pyre/
140 |
141 | # pytype static type analyzer
142 | .pytype/
143 |
144 | # Cython debug symbols
145 | cython_debug/
146 |
147 | # PyCharm
148 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150 | # and can be added to the global gitignore or merged into this file. For a more nuclear
151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
152 | #.idea/
153 |
--------------------------------------------------------------------------------
/trustai/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TrustAI"""
15 |
16 | __version__ = "0.1.12"
--------------------------------------------------------------------------------
/trustai/demo/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Init file"""
15 |
16 | from .demo import *
--------------------------------------------------------------------------------
/trustai/demo/demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """pretrain for demo"""
15 |
16 | import logging
17 | import logging.handlers
18 | import os
19 | import sys
20 | import re
21 | import requests
22 | import shutil
23 | import tarfile
24 | import warnings
25 | import functools
26 |
27 | from tqdm import tqdm
28 | from paddle.io import DataLoader, BatchSampler
29 | try:
30 | from paddlenlp.transformers import *
31 | from paddlenlp.datasets import load_dataset
32 |
33 | except ImportError as e:
34 | sys.stderr.write(
35 |             '''The demo module depends on paddlenlp; please install paddlenlp first: pip install -U paddlenlp. ''')
36 | exit(-1)
37 |
38 | from .utils import DOWNLOAD_MODEL_PATH_DICT, MODEL_HOME, get_path_from_url
39 | from .utils import LocalDataCollatorWithPadding, preprocess_function, get_path_from_url
40 |
41 |
42 | class DEMO(object):
43 |
44 | def __init__(self, task_name, device: str = None):
45 | self.device = device
46 | assert self.device is None or isinstance(self.device, str) and re.search(
47 | r"^cpu$|^gpu$|^gpu:\d+$", self.device
48 | ) is not None, "The format of the ``devices`` should be like ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc."
49 |
50 | self._paddle_env_set()
51 | self.task_name = task_name
52 | model_path = self.get_model_path(task_name)
53 | self.paddle_model = AutoModelForSequenceClassification.from_pretrained(model_path)
54 | self.tokenizer = AutoTokenizer.from_pretrained(model_path)
55 | self.unk_id = self.tokenizer.unk_token_id
56 | self.pad_id = self.tokenizer.pad_token_type_id
57 | self.cls_id = self.tokenizer.cls_token_id
58 | self.mask_id = self.tokenizer.mask_token_id
59 |
60 | def get_model_path(self, model_name):
61 | try:
62 | model_url, md5sum = DOWNLOAD_MODEL_PATH_DICT[model_name]
63 | except KeyError:
64 | logging.warn(
65 | f"The model_name `{model_name}` is wrong, currently only the following models are supported : {', '.join(DOWNLOAD_MODEL_PATH_DICT.keys())}."
66 | )
67 | exit(-1)
68 | model_path = get_path_from_url(model_url, MODEL_HOME, md5sum=md5sum)
69 | return model_path
70 |
71 | def get_model(self):
72 | return self.paddle_model
73 |
74 | def get_tokenizer(self):
75 | return self.tokenizer
76 |
77 | def get_train_data_and_dataloader(self, batch_size=8, max_seq_length=256):
78 | task_name = self.task_name.split('/')
79 | if len(task_name) == 2:
80 | train_ds = load_dataset(task_name[0], name=task_name[1], splits=["train"])
81 | else:
82 | train_ds = load_dataset(task_name[0], splits=["train"])
83 | trans_func = functools.partial(preprocess_function,
84 | max_seq_length=max_seq_length,
85 | tokenizer=self.tokenizer,
86 | is_test=True)
87 | train_ds = train_ds.map(trans_func)
88 | train_batch_sampler = BatchSampler(train_ds, batch_size=batch_size, shuffle=False)
89 | collate_fn = LocalDataCollatorWithPadding(self.tokenizer)
90 | train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
91 | return train_ds.data, train_data_loader,
92 |
93 | def __call__(self, *args, **kwargs):
94 | return self.process(*args, **kwargs)
95 |
96 | def process(self, text, text_pair=None):
97 | tokenize_result = self.tokenizer(text, text_pair=text_pair, return_tensors='pd', padding=True)
98 | input_ids = tokenize_result['input_ids']
99 | token_type_ids = tokenize_result['token_type_ids']
100 | tokens = [self.tokenizer.convert_ids_to_tokens(_input_ids) for _input_ids in input_ids.tolist()]
101 | return tokens, (input_ids, token_type_ids)
102 |
103 | def _paddle_env_set(self):
104 | import paddle
105 | if self.device is not None:
106 | if not paddle.is_compiled_with_cuda() and self.device[:3] == 'gpu':
107 | warnings.warn("Paddle is not installed with GPU support. Change to CPU version now.")
108 | self.device = 'cpu'
109 |
110 | # globally set device.
111 | paddle.set_device(self.device)
112 |
113 | def __getitem__(self, key):
114 | return getattr(self, key)
--------------------------------------------------------------------------------
/trustai/evaluation/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/trustai/evaluation/README.md
--------------------------------------------------------------------------------
/trustai/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Init file"""
15 |
16 | from .evaluator import *
--------------------------------------------------------------------------------
/trustai/interpretation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """interpreter"""
15 |
16 | from .token_level import *
17 | from .example_level import *
--------------------------------------------------------------------------------
/trustai/interpretation/base_interpret.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """base interpreter"""
15 |
16 | import abc
17 | import sys
18 | import numpy as np
19 | import re
20 | import warnings
21 |
22 | from .python_utils import versiontuple2tuple
23 |
24 |
25 | class Interpreter(abc.ABC):
26 | """Interpreter is the base class for all interpretation algorithms.
27 | Args:
28 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions.
29 | device (str): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. default: None
30 | """
31 |
32 | def __init__(self, paddle_model: callable, device: str = None, **kwargs):
33 | self.device = device
34 | self.paddle_model = paddle_model
35 | self.predict_fn = None
36 |
37 | assert self.device is None or isinstance(self.device, str) and re.search(
38 | r"^cpu$|^gpu$|^gpu:\d+$", self.device
39 | ) is not None, "The format of the ``devices`` should be like ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc."
40 |
41 | self._paddle_env_set()
42 |
43 | def __call__(self, *args, **kwargs):
44 | return self.interpret(*args, **kwargs)
45 |
46 | @abc.abstractmethod
47 | def interpret(self, **kwargs):
48 | """Main function of the interpreter."""
49 | raise NotImplementedError
50 |
51 | @abc.abstractmethod
52 | def _build_predict_fn(self, **kwargs):
53 | """Build self.predict_fn for interpreters."""
54 | raise NotImplementedError
55 |
56 | def _paddle_env_set(self):
57 | import paddle
58 | if self.device is not None:
59 | if not paddle.is_compiled_with_cuda() and self.device[:3] == 'gpu':
60 | warnings.warn("Paddle is not installed with GPU support. Change to CPU version now.")
61 | self.device = 'cpu'
62 |
63 | # globally set device.
64 | paddle.set_device(self.device)
65 | self.paddle_model.to(self.device)
66 |
67 | if versiontuple2tuple(paddle.version.full_version) >= (2, 2, 1):
68 | # From Paddle2.2.1, gradients are supported in eval mode.
69 | self.paddle_model.eval()
70 | else:
71 | # Former versions.
72 | self.paddle_model.train()
73 | for n, v in self.paddle_model.named_sublayers():
74 | if "batchnorm" in v.__class__.__name__.lower():
75 | v._use_global_stats = True
76 | if "dropout" in v.__class__.__name__.lower():
77 | v.p = 0
78 |
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/README.md:
--------------------------------------------------------------------------------
1 | # Example-Level Evidence Analysis
2 |
3 |
4 | ## Overview
5 | Given a model (with its training data) and a test input, example-level evidence analysis assigns each training example a score that measures its influence on the prediction for that input. The training examples with the largest influence on the current prediction are then returned as the evidence the model relies on, explaining the model's prediction.
6 |
7 | TrustAI provides three example-level evidence analysis methods:
8 | * [Gradient-based method](https://proceedings.neurips.cc/paper/2018/file/8a7129b8f3edd95b7d969dfc2c8e9d9d-Paper.pdf): a weight a measuring each training example's influence on the model is computed from its gradient; the influence of training example i on test example t is then computed from a, the representation of i, and the representation of t. The resulting score is dominated by a, i.e. the returned evidence consists of examples that strongly influence the model. Such examples are often hard or dirty data, so this method can be used to identify dirty data.
9 | * [Representation similarity method](https://arxiv.org/pdf/2104.04128.pdf): the similarity between the representations of training example i and test example t is used as the influence of i on t. The similarity can be computed with cosine similarity, KL divergence, Euclidean distance, and so on (see the short sketch after this list).
10 | * [Gradient similarity method](https://arxiv.org/pdf/2102.05262.pdf): the similarity between the gradients of training example i and test example t is used as the influence of i on t. The similarity can be computed with cosine similarity, among others.
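To make "representation similarity" concrete, here is a minimal, self-contained sketch. It is not part of the TrustAI API; random vectors stand in for the representations a trained model would produce, and the function name is purely illustrative.

```python
import numpy as np

def cosine_scores(train_feats, test_feat):
    """Cosine similarity between one test feature vector and every training feature vector."""
    train_norm = train_feats / np.linalg.norm(train_feats, axis=1, keepdims=True)
    test_norm = test_feat / np.linalg.norm(test_feat)
    return train_norm @ test_norm

# Toy feature vectors standing in for the representations taken before the classifier layer.
train_feats = np.random.rand(1000, 768)  # 1000 training examples
test_feat = np.random.rand(768)          # one test example

scores = cosine_scores(train_feats, test_feat)
order = np.argsort(-scores)
print("most supportive training examples:", order[:3], scores[order[:3]])
print("least supportive training examples:", order[-3:], scores[order[-3:]])
```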
11 |
12 |
13 |
14 | ## Usage Examples
15 | TrustAI provides a unified interface for all example-level evidence analysis methods.
16 | * Interface input: the training data, the trained model, and the test data;
17 | * Interface output: example-level evidence for the test data, including evidence that supports the current test example (positive influence) and evidence that does not (negative influence). A short sketch of how to read the returned results follows this list.
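Each element of the returned list is an `ExampleResult` dataclass (defined in `trustai/interpretation/example_level/common/data_class.py`). The sketch below shows how such a result can be read, assuming TrustAI is installed so the dataclass can be imported; the index values, scores, and training texts are made up for illustration, real ones come from the calls shown in the sections below.

```python
from trustai.interpretation.example_level.common.data_class import ExampleResult

# A hand-made result standing in for one element of the returned list `res`.
result = ExampleResult(pred_label=0,
                       pos_indexes=[11, 52, 7],
                       neg_indexes=[3, 98],
                       pos_scores=[0.94, 0.93, 0.93],
                       neg_scores=[-0.12, -0.17])

train_texts = ["..."] * 100  # training texts, in the same order as train_data_loader
print("predicted label:", result.pred_label)
for idx, score in zip(result.pos_indexes, result.pos_scores):
    print("positive influence:", train_texts[idx], "score:", score)
for idx, score in zip(result.neg_indexes, result.neg_scores):
    print("negative influence:", train_texts[idx], "score:", score)
```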
18 |
19 |
20 |
21 | #### Gradient-based method
22 | Example call:
23 | ```python
24 | from trustai.interpretation import RepresenterPointModel
25 | # initialization
26 | # Pass in the model, its training data, and the layer name of the last layer in the model's output head.
27 | representer_model = RepresenterPointModel(model, train_data_loader, classifier_layer_name="classifier")
28 | # res is the structured result returned by the model, a list. Each element corresponds to one test example and contains the predicted label, the indexes of positive-influence examples, the indexes of negative-influence examples, and the scores of the positive- and negative-influence examples.
29 | res = []
30 | for batch in test_dataloader:
31 |     res += representer_model(batch, sample_num=3)
32 | ```
33 | *Note: the number of returned evidence examples is set by sample_num; if sample_num is -1, all training examples are returned, sorted by influence.*
34 |
35 | Output:
36 | ```txt
37 | Test text (from a sentiment analysis task): 本来不想评价了,但为了携程的携粉们,还是说一下,这称不上是九点,细说就真没必要了,就一个字:差
38 | Prediction: 0 (negative sentiment)
39 |
40 | Training examples that support the current test text (positive-influence examples):
41 | text: 感觉非常奇怪,这套书我明明都写了两次评论了,可我的当当始终提醒我对这套书写评论!晕啊!这是套很好的书,也不用我写几次评论吧! gold label: 1 score: 0.03509485349059105
42 | text: 1)背面少个螺丝钉,说是thinkpad都少,靠 2)键盘周围的壳不平整,按下去发现有:“滋啦滋啦”声音,我才意识到,那是个双面胶,按下去就不上来了,过会儿还是回弹上来,很明显仅靠双面胶是 粘不住的,你还不如拿502呢,起码这样粘得严实还能让我心里舒服(但是这样只是弥补质量问题),何必还弄个滋啦兹啦的声音,多闹心啊,(还有一地方用了双面胶,我换内存的时候发现键盘下部盖子左侧打不开,一直不敢用力 gold label: 1 score: 0.03008783608675003
43 | text: 用了6年的THINKPAD,一直认为是笔记本中最好的! 现在这台新的让我......哎!! gold label: 0 score: 0.029884012416005135
44 |
45 | Training examples that do not support the current test text (negative-influence examples):
46 | text: 是LINUX系统 相当及其恶心 不知道这狗 日 的是什么想法 要强行逼我们使用啊 买了两台电脑 一个事VISTA系统 一个 是 LINUX 就没见一个XP的 网上销售这东西 最重要的是打架尽量不要涉及到售后服务这块 尽量是都搞好了相安无事 其实网上的售后服务比没有售后服务还差劲 我的THINKPAD SL400就是因为换货期间以为是键盘小问题就懒得换了 gold label: 1 score: -0.07112707197666168
47 | text: 盼了2周终于拿到本了,一开机就屏不亮,本人自己跑回总部退机,现在还在等着检测,说要等上15个工作日,呵呵,买个电脑容易吗?时间浪费的起吗?请问? gold label: 0 score: -0.07233154773712158
48 | text: 价格确实比较高,而且还没有早餐提供。 携程拿到的价格不好?还是自己保留起来不愿意让利给我们这些客户呢? 到前台搞价格,430就可以了。 gold label: 1 score: -0.08243595063686371
49 | ```
50 |
51 |
52 | #### Representation similarity method
53 | Example call:
54 | ```python
55 | from trustai.interpretation import FeatureSimilarityModel
56 | # initialization
57 | # Pass in the model, its training data, and the layer name of the last layer in the model's output head.
58 | feature_sim_model = FeatureSimilarityModel(model, train_data_loader, classifier_layer_name="classifier")
59 | # The similarity measure can be chosen via the sim_fn argument; cos, dot, and euc (cosine, dot-product, and Euclidean distance, respectively) are currently supported.
60 | # res is the structured result returned by the model, a list. Each element corresponds to one test example and contains the predicted label, the indexes of positive-influence examples, the indexes of negative-influence examples, and the scores of the positive- and negative-influence examples.
61 | res = []
62 | for batch in test_dataloader:
63 |     res += feature_sim_model(batch, sample_num=3, sim_fn='cos')
64 | ```
65 | *Note: the number of returned evidence examples is set by sample_num; if sample_num is -1, all training examples are returned, sorted by influence.*
66 |
67 | Output:
68 | ```txt
69 | Test text (from a sentiment analysis task): 没有光驱,重装Windows需要外接光驱,对于电脑新手会很麻烦(没什么人会用Linux吧)
70 | Prediction: 0 (negative sentiment)
71 |
72 | Training examples that support the current test text (positive-influence examples):
73 | text: Linux系统不太好用,平时习惯用Windows xp 系统,一下子用这个系统感觉很不习惯,建议开发或预装Windows xp系统. gold label: 0 score: 0.9393996000289917
74 | text: 1、机器较沉 2、VISTA用起来不习惯,且占系统较多 3、音频插口、右前侧的二个USB口在用鼠标时感觉手靠得太近了 gold label: 0 score: 0.9354583621025085
75 | text: vista系统下也没有无线网卡驱动,用驱动精灵可解决。 机器稍有点重。 散热有待改进。 gold label: 0 score: 0.9348428249359131
76 |
77 | Training examples that do not support the current test text (negative-influence examples):
78 | text: “任务型教学”在我国外语教学界备受关注。国家教育部新《英语课程标准》将“倡导‘任务型’的教学途径,培养学生综合语言运用能力”写入教学建议。任务型教学被视为改革我国传统外语教学的良方。本书立足我国外语教学现状,全面分析了“任务型教学”的理论和实践基础、以实例说明“任务型教学”的具体操作步骤。为广大一线英语教师提供了教学和研究参考。 gold label: 1 score: -0.12422356754541397
79 | text: 当美国发生次贷危机乃至影响全世界以后,对于应对危机,我们国内的绝大多数专家对此都异口同声,观点基本雷同,而且又莫衷一是,人云亦云,本书的作者以其独特的视觉和丰富的知识,在书中告诉我们这次危机的来龙去脉,我们国家应该以怎样的方式去直面这次危机,如何转危为安,化危为机;作为普通读者也能从书中领会到一些对自己有益的知识。读完这本书以后,你更能体会到一种不一样的思维,非常值得一读。 gold label: 1 score: -0.128561332821846
80 | text: 我从06年开始几乎每月都有一次出差,山西很多酒店都住过了,印象最深的就是三晋国际,其他还有龙城国际,华苑宾馆,黄河京都,还有一个叫什么交通大厦的,下面我对这些酒店做个最真实、准确地点评: 三晋国际——这是我认为最让太原市骄傲的酒店,我们衡量一个酒店的最直接的就是你能够得到什么服务,在这家酒店里,我感觉到了家一般的照顾,第一次来这里,感冒了,嘴里冷不丁说了一句,服务生就听到了,然后熬了一碗姜汤到我房间,当然也是免费的,很感动;洗澡时,一不小心摔倒了,副总经理、总监等等都慰问了我,其实这也不完全是酒店的错,但是从那以后,我发现每个房间浴室都放置了防滑垫和塑料拖鞋;有一次我把袜子之类的放在洗手间了,谁知道我回来后竟然发现服务员帮我免费清洗了,还把我不小心掰断的心爱的梳子还用胶给我粘好了,还留了言给我,真的很让我意外也有点不敢相信!对一个出差特别频繁,时间特别紧张的人来说,办理入住和退房就是一个最让人烦躁的时间,但是我算过了,三晋国际前台办理退房、入住的时间没有超过一分钟!!!在北京都很难有这样的待遇!其他的,比如前台接待、门厅服务之类的就不用说了,真的很好; 当然我也有建议:1、酒店的被子能否换厚一点的,冬天冷啊;2、一些房间的电话没有免提,不是很方便;3、外面的电话打不进来,可能是酒店为了安全考虑吧,但还是希望能够有外线拨入的功能。 龙城国际——不知道五星级是谁给的评价?!酒店一般,还不如华苑宾馆,无法容忍的是,前台接待服务态度太差了!唯一的优点是,早餐挺好吃。 华苑宾馆——06、07年都挺好的,今天偶尔又住了一下,发现时间长了,枕头、被子不是很干净,其他倒是挺好的,服务态度、环境都还不错,早餐有点单一。 黄河京都——地方太偏了!看起来挺好,住进去不咋地,无法容忍的是,也给大家提个醒,我退房的时间整整用了29分钟,快半个钟头了,我差点晕倒!结帐的服务员只顾打电话,不理我。 交通大厦——噩梦般的酒店,我再也不会住了!!隔音效果太差,还不幸地让我听到了隔壁小两口的闺房密语,哈哈,让我坐噩梦的是,半夜不知道什么单位来查房,从好多房间带走了好多女孩子,好怕怕地说……还有就是前台一个戴眼镜的,白白的女孩子,态度可真差啊,郁闷! 太原还有好多酒店,可能我不会一一住到,但还是希望所有的酒店都能够像三晋国际一样,给山西人长脸! gold label: 1 score: -0.17390453815460205
81 | ```
82 |
83 |
84 | #### Gradient similarity method
85 | Example call:
86 |
87 | ```python
88 | from trustai.interpretation import GradientSimilarityModel
89 | # initialization
90 | # Pass in the model, its training data, and the layer name of the last layer in the model's output head.
91 | # Note: because the gradient of every example with respect to the model parameters is needed, the batch_size of train_dataloader must be set to 1, and the batch_size of the test dataloader must also be 1.
92 | grad_sim_model = GradientSimilarityModel(model, train_data_loader, classifier_layer_name="classifier")
93 | # The similarity measure can be chosen via the sim_fn argument; cos and dot (cosine and dot-product distance, respectively) are currently supported.
94 | # res is the structured result returned by the model, a list. Each element corresponds to one test example and contains the predicted label, the indexes of positive-influence examples, the indexes of negative-influence examples, and the scores of the positive- and negative-influence examples.
95 | res = []
96 | for batch in test_dataloader:
97 |     res += grad_sim_model(batch, sample_num=3, sim_fn='cos')
98 | ```
99 | *Note: the number of returned evidence examples is set by sample_num; if sample_num is -1, all training examples are returned, sorted by influence.*
100 |
101 | Output:
102 | ```txt
103 | Test text (from a sentiment analysis task): 没有光驱,重装Windows需要外接光驱,对于电脑新手会很麻烦(没什么人会用Linux吧)
104 | Prediction: 0 (negative sentiment)
105 |
106 | Training examples that support the current test text (positive-influence examples):
107 | text: Linux系统不太好用,平时习惯用Windows xp 系统,一下子用这个系统感觉很不习惯,建议开发或预装Windows xp系统. gold label: 0 score: 0.9395108222961426
108 | text: 1、机器较沉 2、VISTA用起来不习惯,且占系统较多 3、音频插口、右前侧的二个USB口在用鼠标时感觉手靠得太近了 gold label: 0 score: 0.9355786442756653
109 | text: vista系统下也没有无线网卡驱动,用驱动精灵可解决。 机器稍有点重。 散热有待改进。 gold label: 0 score: 0.9349631071090698
110 |
111 | Training examples that do not support the current test text (negative-influence examples):
112 | text: 价格确实比较高,而且还没有早餐提供。 携程拿到的价格不好?还是自己保留起来不愿意让利给我们这些客户呢? 到前台搞价格,430就可以了。 gold label: 1 score: -0.49774348735809326
113 | text: 买机器送的移动硬盘2.5寸250G的,没开封,想卖出,感兴趣短息联系,北京13901019711 gold label: 1 score: -0.5244823694229126
114 | text: 买机器送的移动硬盘2.5寸250G的,没开封,想卖出,感兴趣短息联系,北京13901019711 gold label: 0 score: -0.5244823694229126
115 | ```
116 |
117 |
118 | For detailed examples, see [examples](../../../examples/interpretation/example_level/).
119 |
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """interpreter"""
15 |
16 | from .method import *
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/common/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/common/data_class.py:
--------------------------------------------------------------------------------
1 | """
2 | data class
3 | """
4 |
5 | from dataclasses import dataclass
6 | from typing import Any
7 | from typing import List
8 | from typing import Dict
9 | from typing import Tuple
10 |
11 |
12 | @dataclass
13 | class ExampleResult(object):
14 | pred_label: int
15 | pos_indexes: List[int]
16 | neg_indexes: List[int]
17 | pos_scores: List[float]
18 | neg_scores: List[float]
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/common/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Some useful functions."""
15 | import paddle
16 | import paddle.nn.functional as F
17 | import numpy as np
18 | from .data_class import ExampleResult
19 |
20 |
21 | def get_sublayer(model, sublayer_name='classifier'):
22 | """
23 | Get the sublayer named sublayer_name in model.
24 | Args:
25 | model (obj:`paddle.nn.Layer`): Any paddle model.
26 | sublayer_name (obj:`str`, defaults to classifier): The sublayer name.
27 | Returns:
28 | layer(obj:`paddle.nn.Layer.common.sublayer_name`):The sublayer named sublayer_name in model.
29 | """
30 | for name, layer in model.named_children():
31 | if name == sublayer_name:
32 | return layer
33 |
34 |
35 | def dot_similarity(inputs_a, inputs_b):
36 | """
37 |     calculate dot-product similarity between the two inputs.
38 | """
39 | return paddle.sum(inputs_a * inputs_b, axis=1)
40 |
41 |
42 | def cos_similarity(inputs_a, inputs_b, step=500000):
43 | """
44 |     calculate cosine similarity between the two inputs.
45 | """
46 |     # process in chunks of `step` rows to work around a paddle issue with very large inputs
47 | start, end = 0, step
48 | res = []
49 | while start < inputs_a.shape[0]:
50 | res.append(F.cosine_similarity(inputs_a[start:end], inputs_b.unsqueeze(0)))
51 | start = end
52 | end = end + step
53 | return paddle.concat(res, axis=0)
54 |
55 |
56 | def euc_similarity(inputs_a, inputs_b):
57 | """
58 |     calculate euclidean similarity between the two inputs.
59 | """
60 | return -paddle.linalg.norm(inputs_a - inputs_b.unsqueeze(0), axis=-1).squeeze(-1)
61 |
62 |
63 | def get_top_and_bottom_n_examples(scores, pred_label, sample_num=3):
64 | """
65 |     get the indexes of the n highest and n lowest scores, and return the structured result.
66 | """
67 |
68 | top_score, top_index = paddle.topk(scores, sample_num, axis=0, largest=True, sorted=True)
69 |
70 | bottom_score, bottom_index = paddle.topk(scores, sample_num, axis=0, largest=False, sorted=True)
71 |
72 | res = ExampleResult(pred_label=pred_label,
73 | pos_indexes=top_index.numpy(),
74 | neg_indexes=bottom_index.numpy(),
75 | pos_scores=top_score.numpy(),
76 | neg_scores=bottom_score.numpy())
77 |
78 | return res
79 |
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/method/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """example method"""
15 |
16 | from .representer_point import RepresenterPointModel
17 | from .feature_similarity import FeatureSimilarityModel
18 | from .gradient_similarity import GradientSimilarityModel
19 |
20 | __all__ = ["RepresenterPointModel", "FeatureSimilarityModel", "GradientSimilarityModel"]
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/method/example_base_interpreter.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import functools
3 |
4 | import paddle
5 |
6 | from ...base_interpret import Interpreter
7 | from ..common.utils import get_sublayer
8 |
9 |
10 | class ExampleBaseInterpreter(Interpreter):
11 | """Interpreter is the base class for all interpretation algorithms.
12 | Args:
13 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions.
14 | device (str): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. default: None
15 |         predict_fn(callable: default=None): If the paddle_model prediction requires special processing, the user can customize the prediction function.
16 | classifier_layer_name(str: default=classifier): Name of the classifier layer in paddle_model.
17 | """
18 |
19 | def __init__(self,
20 | paddle_model: callable,
21 | device: str = None,
22 | predict_fn=None,
23 | classifier_layer_name="classifier",
24 | **kwargs):
25 | Interpreter.__init__(self, paddle_model, device)
26 | self.paddle_model = paddle_model
27 | self.classifier_layer_name = classifier_layer_name
28 | self._build_predict_fn(predict_fn=predict_fn)
29 |
30 | def __call__(self, *args, **kwargs):
31 | return self.interpret(*args, **kwargs)
32 |
33 | def _build_predict_fn(self, predict_fn=None):
34 | if predict_fn is not None:
35 | self.predict_fn = functools.partial(predict_fn,
36 | classifier_layer_name=self.classifier_layer_name,
37 | paddle_model=self.paddle_model)
38 | return
39 |
40 | def predict_fn(inputs, classifier_layer_name=None, paddle_model=None):
41 | """predict_fn"""
42 | if paddle_model is None:
43 | paddle_model = self.paddle_model
44 | if classifier_layer_name is None:
45 | classifier_layer_name = self.classifier_layer_name
46 |
47 | cached_features = []
48 |
49 | def forward_pre_hook(layer, input):
50 | """
51 | Pre_hook for a given layer in model.
52 | """
53 | cached_features.extend(input[0])
54 |
55 | cached_logits = []
56 |
57 | def forward_post_hook(layer, input, output):
58 | """
59 | Post_hook for a given layer in model.
60 | """
61 | cached_logits.append(output)
62 |
63 | classifier = get_sublayer(paddle_model, classifier_layer_name)
64 |
65 | forward_pre_hook_handle = classifier.register_forward_pre_hook(forward_pre_hook)
66 | forward_post_hook_handle = classifier.register_forward_post_hook(forward_post_hook)
67 |
68 | if isinstance(inputs, (tuple, list)):
69 | res = paddle_model(*inputs) # get logits, [bs, num_c]
70 | else:
71 | res = paddle_model(inputs) # get logits, [bs, num_c]
72 |
73 | forward_pre_hook_handle.remove()
74 | forward_post_hook_handle.remove()
75 |
76 | logits = cached_logits[-1]
77 | if len(logits.shape) < 2:
78 | logits = logits.unsqueeze(0)
79 |
80 | probas = paddle.nn.functional.softmax(cached_logits[-1], axis=1) # get probabilities.
81 | preds = paddle.argmax(probas, axis=1).tolist() # get predictions.
82 | return paddle.to_tensor(cached_features), probas, preds
83 |
84 | self.predict_fn = predict_fn
85 |
86 | @abc.abstractmethod
87 | def interpret(self, **kwargs):
88 | """Main function of the interpreter."""
89 | raise NotImplementedError
90 |
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/method/feature_similarity.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python3
2 | """
3 | feature-based similarity method.
4 | cosine, dot and euc.
5 | """
6 | import os
7 | import sys
8 | import functools
9 | import warnings
10 |
11 | import paddle
12 | import paddle.nn.functional as F
13 | from tqdm import tqdm
14 |
15 | from ..common.utils import get_sublayer, dot_similarity, cos_similarity, euc_similarity, get_top_and_bottom_n_examples
16 | from .example_base_interpreter import ExampleBaseInterpreter
17 |
18 |
19 | class FeatureSimilarityModel(ExampleBaseInterpreter):
20 | """
21 | Feature-based similarity method for NLP tasks.
22 | """
23 |
24 | def __init__(
25 | self,
26 | paddle_model,
27 | train_dataloader,
28 | device=None,
29 | classifier_layer_name="classifier",
30 | predict_fn=None,
31 | cached_train_feature=None,
32 | ):
33 | """
34 | Initialization.
35 | Args:
36 | paddle_model(callable): A model with ``forward``.
37 | train_dataloader(iterable): Dataloader of model's training data.
38 | device(str: default=None): Device type, and it should be ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc.
39 | classifier_layer_name(str: default=classifier): Name of the classifier layer in paddle_model.
40 |             predict_fn(callable: default=None): If the paddle_model prediction requires special processing, the user can customize the prediction function.
41 | """
42 | ExampleBaseInterpreter.__init__(self, paddle_model, device, predict_fn, classifier_layer_name)
43 | self.paddle_model = paddle_model
44 | self.classifier_layer_name = classifier_layer_name
45 |
46 | if cached_train_feature is not None and os.path.isfile(cached_train_feature):
47 | self.train_feature = paddle.load(cached_train_feature)
48 | else:
49 | self.train_feature, _ = self.extract_feature_from_dataloader(train_dataloader)
50 | if cached_train_feature is not None:
51 | try:
52 | paddle.save(self.train_feature, cached_train_feature)
53 | except IOError:
54 | import sys
55 |                     sys.stderr.write("Failed to save cached_train_feature.\n")
56 |
57 | def interpret(self, data, sample_num=3, sim_fn="cos"):
58 | """
59 |         Select the most similar and dissimilar training examples for the given data using the `sim_fn` metric.
60 | Args:
61 | data(iterable): one batch of data to interpret.
62 |             sample_num(int: default=3): the number of positive examples and negative examples selected for each instance. All training examples, ordered by `influence score`, are returned if this parameter is -1.
63 | sim_fn(str: default=cos): the similarity metric to select examples. It should be ``cos``, ``dot`` or ``euc``.
64 | """
65 | if sample_num == -1:
66 | sample_num = len(self.train_feature)
67 |
68 | val_feature, preds = self.extract_feature(self.paddle_model, data)
69 | if sim_fn == "dot":
70 | similarity_fn = dot_similarity
71 | elif sim_fn == "cos":
72 | similarity_fn = cos_similarity
73 | elif sim_fn == "euc":
74 | similarity_fn = euc_similarity
75 | else:
76 | raise ValueError(f"sim_fn only support ['dot', 'cos', 'euc'] in feature similarity, but gets `{sim_fn}`")
77 | res = []
78 | preds = preds.tolist()
79 | for index in range(len(preds)):
80 | tmp = similarity_fn(self.train_feature, paddle.to_tensor(val_feature[index]))
81 | pred_label = preds[index]
82 | example_result = get_top_and_bottom_n_examples(tmp, pred_label, sample_num=sample_num)
83 | res.append(example_result)
84 |
85 | return res
86 |
87 | @paddle.no_grad()
88 | def extract_feature(self, paddle_model, data):
89 | """
90 | extract feature from one batch of data.
91 | """
92 | if self.paddle_model.training:
93 | self.paddle_model.eval()
94 | feature, _, pred = self.predict_fn(data)
95 | return paddle.to_tensor(feature), paddle.to_tensor(pred)
96 |
97 | def extract_feature_from_dataloader(self, dataloader):
98 | """
99 | extract feature from data_loader.
100 | """
101 | print("Extracting feature from given dataloader, it will take some time...")
102 | features, preds = [], []
103 |
104 | for batch in tqdm(dataloader):
105 | feature, pred = self.extract_feature(self.paddle_model, batch)
106 | features.append(feature)
107 | preds.append(pred)
108 | return paddle.concat(features, axis=0), paddle.concat(preds, axis=0)
--------------------------------------------------------------------------------
/trustai/interpretation/example_level/method/gradient_similarity.py:
--------------------------------------------------------------------------------
1 | # !/usr/bin/env python3
2 | """
3 | gradient-based similarity method.
4 | cosine and dot.
5 | """
6 | import os
7 | import functools
8 | import warnings
9 |
10 | import paddle
11 | import paddle.nn.functional as F
12 | from tqdm import tqdm
13 |
14 | from ..common.utils import get_sublayer, dot_similarity, cos_similarity, euc_similarity, get_top_and_bottom_n_examples
15 | from .example_base_interpreter import ExampleBaseInterpreter
16 |
17 |
18 | class GradientSimilarityModel(ExampleBaseInterpreter):
19 | """
20 | Gradient-based similarity method for NLP tasks.
21 | """
22 |
23 | def __init__(
24 | self,
25 | paddle_model,
26 | train_dataloader,
27 | device=None,
28 | classifier_layer_name="classifier",
29 | predict_fn=None,
30 | criterion=None,
31 | cached_train_grad=None,
32 | ):
33 | """
34 | Initialization.
35 | Args:
36 | paddle_model(callable): A model with ``forward``.
37 | train_dataloader(iterable): Dataloader of model's training data.
38 | device(str: default=None): Device type, and it should be ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc.
39 | classifier_layer_name(str: default=classifier): Name of the classifier layer in paddle_model.
40 |             predict_fn(callable: default=None): If the paddle_model prediction requires special processing, the user can customize the prediction function.
41 | criterion(paddle.nn.layer.loss: default=None): criterion to calculate model loss.
42 |             cached_train_grad(str: default=None): Path of the cached gradients of train_dataloader. Computing and caching the gradients takes some time on the first run.
43 | """
44 | ExampleBaseInterpreter.__init__(self, paddle_model, device, predict_fn, classifier_layer_name)
45 | self.paddle_model = paddle_model
46 | self.classifier_layer_name = classifier_layer_name
47 | self.criterion = (criterion if criterion is not None else paddle.nn.loss.CrossEntropyLoss())
48 | if cached_train_grad is not None and os.path.isfile(cached_train_grad):
49 | self.train_grad = paddle.load(cached_train_grad)
50 | else:
51 | self.train_grad, *_ = self.get_grad_from_dataloader(train_dataloader)
52 | if cached_train_grad is not None:
53 | try:
54 | paddle.save(self.train_grad, cached_train_grad)
55 | except IOError:
56 | import sys
57 |                     sys.stderr.write("Failed to save cached_train_grad.\n")
58 |
59 | def interpret(self, data, sample_num=3, sim_fn="cos"):
60 | """
61 |         Select the most similar and dissimilar training examples for the given data using the `sim_fn` metric.
62 | Args:
63 | data(iterable): one batch of data to interpret.
64 |             sample_num(int: default=3): the number of positive examples and negative examples selected for each instance. All training examples, ordered by `influence score`, are returned if this parameter is -1.
65 | sim_fn(str: default=cos): the similarity metric to select examples. It should be ``cos`` or ``dot``.
66 | """
67 | if sample_num == -1:
68 | sample_num = len(self.train_grad)
69 |
70 | val_feature, _, preds = self.get_grad(self.paddle_model, data)
71 |
72 | if sim_fn == "dot":
73 | similarity_fn = dot_similarity
74 | elif sim_fn == "cos":
75 | similarity_fn = cos_similarity
76 | else:
77 |             raise ValueError(f"sim_fn only support ['dot', 'cos'] in gradient similarity, but gets `{sim_fn}`")
78 | res = []
79 | preds = preds.tolist()
80 | for index in range(len(preds)):
81 | tmp = similarity_fn(self.train_grad, paddle.to_tensor(val_feature[index]))
82 | pred_label = preds[index]
83 | example_result = get_top_and_bottom_n_examples(tmp, pred_label, sample_num=sample_num)
84 | res.append(example_result)
85 |
86 | return res
87 |
88 | def get_grad(self, paddle_model, data):
89 | """
90 | get grad from one batch of data.
91 | """
92 | if paddle_model.training:
93 | paddle_model.eval()
94 | if isinstance(data, (tuple, list)):
95 | assert len(data[0]) == 1, "batch_size must be 1"
96 | else:
97 | assert len(data) == 1, "batch_size must be 1"
98 | _, prob, pred = self.predict_fn(data)
99 | loss = self.criterion(prob, paddle.to_tensor(pred))
100 |
101 | # adapt for paddle 2.4
102 |         if tuple(map(int, filter(str.isdigit, paddle.version.full_version.split(".")))) >= (2, 4, 0):
103 | for n, p in self.paddle_model.named_parameters():
104 | if self.classifier_layer_name in n:
105 | p.retain_grads()
106 |
107 | loss.backward()
108 | grad = self._get_flat_param_grad()
109 | self._clear_all_grad()
110 | return paddle.to_tensor(grad), paddle.to_tensor(prob), paddle.to_tensor(pred)
111 |
112 | def get_grad_from_dataloader(self, data_loader):
113 | """
114 | get grad from data_loader.
115 | """
116 | print("Extracting gradient for given dataloader, it will take some time...")
117 | probas, preds, grads = [], [], []
118 |
119 | for batch in tqdm(data_loader):
120 | grad, prob, pred = self.get_grad(self.paddle_model, batch)
121 | grads.append(grad)
122 | probas.append(prob)
123 | preds.append(pred)
124 |
125 | grads = paddle.concat(grads, axis=0)
126 | probas = paddle.concat(probas, axis=0)
127 | preds = paddle.concat(preds, axis=0)
128 | return grads, probas, preds
129 |
130 | def _get_flat_param_grad(self):
131 | """
132 | get gradient
133 | """
134 | return paddle.concat([
135 | paddle.flatten(p.grad) for n, p in self.paddle_model.named_parameters() if self.classifier_layer_name in n
136 | ]).unsqueeze(axis=0)
137 |
138 | def _clear_all_grad(self):
139 | """
140 | clear gradient
141 | """
142 | for p in self.paddle_model.parameters():
143 | p.clear_gradient()
144 |
--------------------------------------------------------------------------------
/trustai/interpretation/python_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """python utils"""
15 |
16 | def versiontuple2tuple(v):
17 | """ref: https://stackoverflow.com/a/11887825/4834515"""
18 | return tuple(map(int, filter(str.isdigit, v.split("."))))
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/README.md:
--------------------------------------------------------------------------------
1 | # Feature-level Evidence Analysis
2 | 
3 | ## Overview
4 | Given a model and its prediction, feature-level evidence analysis assigns each feature of the test input (for NLP tasks with text input, a feature is a character or a word) a score that measures how much it influences the prediction. The most influential features are then selected as the evidence the model relies on, which explains the prediction.
5 | 
6 | TrustAI provides three mainstream feature-level evidence analysis methods: [Lime](https://arxiv.org/abs/1602.04938), [Attention](https://arxiv.org/pdf/1902.10186.pdf), and [Integrated Gradient](https://arxiv.org/abs/1703.01365).
7 | 
8 | ## Usage Example
9 | Producing the evidence behind a model prediction takes three steps: prepare the model to be interpreted, prepare the test data, and extract the evidence.
10 |
11 | Step 1: Prepare the model to be interpreted
12 | ```python
13 | from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
14 | model = ErnieForSequenceClassification.from_pretrained('ernie-1.0', num_classes=2)
15 | tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
16 | ```
17 |
18 | Step 2: Prepare the test data by converting the input text into model inputs (a minimal sketch of `preprocess_fn` follows this block)
19 | ```python
20 | data = [{ "text": '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'}]
21 |
22 | # preprocess_fn is a user-defined preprocessing function that converts the text into the model's input format
23 | model_inputs = preprocess_fn(data)
24 | ```
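`preprocess_fn` above is user-defined. The snippet below is a minimal sketch of one possible implementation for the ERNIE model loaded in Step 1; the padding scheme and the returned `(input_ids, token_type_ids)` tuple are assumptions of this sketch, not a fixed TrustAI interface.

```python
import paddle

def preprocess_fn(data):
    """Convert a list of {"text": ...} dicts into (input_ids, token_type_ids) tensors."""
    encoded = [tokenizer(example["text"]) for example in data]
    max_len = max(len(e["input_ids"]) for e in encoded)
    # pad every example to the longest sequence in the batch
    input_ids = [e["input_ids"] + [tokenizer.pad_token_id] * (max_len - len(e["input_ids"])) for e in encoded]
    token_type_ids = [e["token_type_ids"] + [0] * (max_len - len(e["token_type_ids"])) for e in encoded]
    return paddle.to_tensor(input_ids), paddle.to_tensor(token_type_ids)
```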
25 |
26 | Step 3: Extract the evidence, taking the Integrated Gradient method as an example
27 | ```python
28 | from trustai.interpretation.token_level import IntGradInterpreter
29 | ig = IntGradInterpreter(model)
30 | result = ig(preprocess_fn(data), steps=100)
31 |
32 | # the length of attributions equals the number of tokens produced by the model's tokenizer
33 | # the magnitude of each value indicates how strongly the corresponding feature supports the prediction
34 | print(result[0].attributions)
35 | # [ 0.02149865 0.13750568 0.03729623 0.20981199 0.11474895 0.00191162
36 | # 0.01166647 0.01939347 0.00499799 -0.01771647 0.05467343 -0.05574901
37 | # 0.0797711 0.02094495 -0.02661019 0.01423277 0.03983632 0.05040766
38 | # 0.03474617 0.10548145 -0.02475511 -0.06759283 -0.07004125 -0.0207927
39 | # 0.03771218 0.01511401 -0.01349011 0.01542336]
40 | ```
41 |
42 |
43 | TrustAI can map the output importance scores onto coarser-grained segments. Below is an example based on jieba word segmentation.
44 |
45 | ```python
46 | import jieba
47 |
48 | from trustai.interpretation import get_word_offset
49 | # the text to analyze
50 | print(data[0]['text'])
51 | # 这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般
52 |
53 | # since this example uses the ernie-1.0 model, [CLS] and [SEP] must be concatenated to the raw text to align with the attributions
54 | context = "[CLS]" + " " + data[0]['text'] + " " + "[SEP]"
55 | # user-defined word segmentation
56 | words = ["[CLS]"] + list(jieba.cut(data[0]['text'])) + ["[SEP]"]
57 | # ['[CLS]', '这个', '宾馆', '比较', '陈旧', '了', ',', '特价', '的', '房间', '也', '很', '一般', '。', '总体', '来说', '一般', '[SEP]']
58 |
59 | # get the word_offset_map of the user-defined segmentation
60 | # word_offset_map gives the character offsets of each user-defined word within context
61 | word_offset_map = get_word_offset(context, words)
62 | # [[0, 5], [6, 8], [8, 10], [10, 12], [12, 14], [14, 15], [15, 16], [16, 18], [18, 19], [19, 21], [21, 22], [22, 23], [23, 25], [25, 26], [26, 28], [28, 30], [30, 32], [33, 38]]
63 |
64 | # compute the offset_map of the model's subword tokenization
65 | subword_offset_map = tokenizer.get_offset_mapping(context)
66 | # [(0, 1), (1, 3), (3, 4), (4, 5), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (33, 34), (34, 37), (37, 38)]
67 |
68 | # align the attributions to words
69 | aligns = ig.alignment(result, [context], [words], [word_offset_map], [subword_offset_map], special_tokens=["[CLS]", "[SEP]"])
70 |
71 | print(aligns[0].words)
72 | # ['[CLS]', '这个', '宾馆', '比较', '陈旧', '了', ',', '特价', '的', '房间', '也', '很', '一般', '。', '总体', '来说', '一般', '[SEP]']
73 | print(aligns[0].word_attributions)
74 | # [0.021498650312423706, 0.17480190843343735, 0.3245609328150749, 0.013578088022768497, 0.02439146302640438, -0.01771647110581398, 0.05467343330383301, 0.024022094905376434, 0.020944949239492416, -0.012377424165606499, 0.03983632102608681, 0.05040765926241875, 0.14022761583328247, -0.024755112826824188, -0.13763408362865448, 0.01691947504878044, 0.001623895950615406, 0.015423357486724854]
75 | print(aligns[0].pred_label)
76 | # 0
77 | print(aligns[0].pred_proba)
78 | # [0.86797816 0.1320218 ]
79 | print(aligns[0].rationale)
80 | # (1, 2, 6, 11, 12)
81 | print(aligns[0].rationale_tokens)
82 | # ('这个', '宾馆', ',', '很', '一般')
83 | print(aligns[0].non_rationale)
84 | # (3, 4, 5, 7, 8, 9, 10, 13, 14, 15, 16)
85 | print(aligns[0].non_rationale_tokens)
86 | # ('比较', '陈旧', '了', '特价', '的', '房间', '也', '。', '总体', '来说', '一般')
87 |
88 | ```
89 |
90 | TrustAI also provides a visualization utility; the calling code is shown below (a sketch of saving the returned HTML follows the snippet):
91 | ```python
92 | # html is HTML-formatted text and can be saved to an .html file
93 | html = visualize_text([VisualizationTextRecord(aligns[0], true_label=0)])
94 | ```
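`VisualizationTextRecord` and `visualize_text` used above are exposed under `trustai.interpretation.token_level`; `visualize_text` expects a list of records and returns the rendered HTML as plain text. A small usage sketch follows (the output filename is illustrative):

```python
from trustai.interpretation.token_level import VisualizationTextRecord, visualize_text

# build one record per test example; true_label is optional and only used for display
records = [VisualizationTextRecord(align, true_label=0) for align in aligns]
html = visualize_text(records)

# html is plain HTML text, so it can be written to a file and opened in a browser
with open("visual.html", "w", encoding="utf-8") as f:
    f.write(html)
```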
95 |
96 | Example of the visualized output:
97 |
98 | 
99 | Figure 1. Visualization of an example from a sentiment analysis task
100 |
101 |
102 | See [examples](../../../examples/interpretation/token_level) for detailed examples.
103 |
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """token-level"""
15 |
16 | from .method import *
17 | from .common import get_word_offset
18 | from .data_processor import *
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/common/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """init"""
15 |
16 | from .postprocess_attribution import *
17 | from .predict_functions import *
18 |
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/common/postprocess_attribution.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """postprocess attribution"""
15 |
16 | import copy
17 | import warnings
18 |
19 |
20 | def get_word_offset(context, words):
21 | """get_word_offset"""
22 | pointer = 0 # point at the context
23 | offset_map = []
24 | for i in range(len(words)):
25 | seg_start_idx = context.find(words[i], pointer)
26 | seg_end_idx = seg_start_idx + len(words[i])
27 | offset_map.append([seg_start_idx, seg_end_idx])
28 | pointer = seg_end_idx
29 | return offset_map
30 |
31 |
32 | def get_word_attributions(words, word_offset_map, subword_offset_map, attributions):
33 | """get_word_attributions"""
34 | result = []
35 |
36 | pointer1 = 0 # point at the context
37 | pointer2 = 0 # point at the sorted_token array
38 |
39 | for i in range(len(word_offset_map)):
40 |         # merge special offset positions in subword_offset_map
41 | seg_start_idx, seg_end_idx = word_offset_map[i]
42 | cur_set = []
43 | while pointer2 < len(subword_offset_map):
44 | while pointer2 < len(subword_offset_map) and subword_offset_map[pointer2][1] <= seg_start_idx:
45 | pointer2 += 1
46 | if subword_offset_map[pointer2][0] >= seg_end_idx:
47 | break
48 | cur_set.append(pointer2)
49 | pointer2 += 1
50 | result.append([cur_set, i, words[i]])
51 | pointer2 -= 1
52 | pointer1 = seg_end_idx
53 | word_attributions = merge_attributions(result, attributions)
54 | return word_attributions
55 |
56 |
57 | def get_rationales_and_non_ratioanles(words, word_attributions, special_tokens=[], rationale_num=5):
58 | """"get_rationales_and_non_ratioanles"""
59 | assert len(words) == len(word_attributions)
60 |
61 | sorted_rationale_ids = list(sorted(range(len(words)), key=lambda i: word_attributions[i], reverse=True))
62 | rationale_tokens = []
63 | rationale_ids = []
64 | non_rationale_tokens = []
65 | non_rationale_ids = []
66 | for idx in sorted_rationale_ids:
67 | if words[idx] in special_tokens:
68 | continue
69 | if len(rationale_ids) < rationale_num:
70 | rationale_ids.append(idx)
71 | rationale_tokens.append(words[idx])
72 | else:
73 | non_rationale_ids.append(idx)
74 | non_rationale_tokens.append(words[idx])
75 | rationale_ids, rationale_tokens = zip(*list(sorted(zip(rationale_ids, rationale_tokens), key=lambda ele: ele[0])))
76 | if len(non_rationale_ids) == 0:
77 | non_rationale_ids = []
78 | non_rationale_tokens = []
79 | else:
80 | non_rationale_ids, non_rationale_tokens = zip(
81 | *list(sorted(zip(non_rationale_ids, non_rationale_tokens), key=lambda ele: ele[0])))
82 | return {
83 | "rationale_ids": rationale_ids,
84 | "rationale_tokens": rationale_tokens,
85 | "non_rationale_ids": non_rationale_ids,
86 | "non_rationale_tokens": non_rationale_tokens
87 | }
88 |
89 |
90 | def merge_subword_special_idx(words, word_offset_map, subword_offset_map, special_tokens):
91 | """merge_subword_special_idx"""
92 | spcial_token_ids = []
93 | for idx, word in enumerate(words):
94 | if word in special_tokens:
95 | spcial_token_ids.append(idx)
96 | special_token_offset = []
97 | special_token_offset = [word_offset_map[idx] for idx in spcial_token_ids]
98 | subword_start_ids, subword_end_ids = list(zip(*subword_offset_map))
99 | merge_idx = []
100 | for token_start, token_end in special_token_offset:
101 | try:
102 | sub_start_id = subword_start_ids.index(token_start)
103 | sub_end_id = subword_end_ids.index(token_end)
104 | merge_idx.append([sub_start_id, sub_end_id])
105 |         except ValueError:
106 | warnings.warn("Error offset mapping! Please check your offset map.")
107 | new_subword_offset_map = copy.deepcopy(subword_offset_map)
108 | for merge_start, merge_end in merge_idx[::-1]:
109 | spceial_toekn_start_id = new_subword_offset_map[merge_start][0]
110 | spceial_toekn_end_id = new_subword_offset_map[merge_end][1]
111 | del new_subword_offset_map[merge_start:merge_end + 1]
112 | new_subword_offset_map.insert(merge_start, [spceial_toekn_start_id, spceial_toekn_end_id])
113 | return new_subword_offset_map
114 |
115 |
116 | def merge_attributions(match_list, attributions):
117 | """merge_attributions"""
118 | over_all = []
119 | miss = 0
120 | for i in match_list:
121 | over_all.extend(i[0])
122 |
123 | attribution_dic = {}
124 | for i in range(len(attributions)):
125 | split_time = over_all.count(i)
126 | if split_time:
127 | attribution_dic[i] = attributions[i] / split_time
128 | else:
129 | attribution_dic[i] = 0.0
130 | if miss != 0:
131 | print(miss)
132 |
133 | attributions = []
134 | for i in range(len(match_list)):
135 | cur_attribution = 0.0
136 | for j in match_list[i][0]:
137 | if j == -1:
138 | continue
139 | cur_attribution += attribution_dic[j]
140 | attributions.append(cur_attribution)
141 | return attributions
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/data_processor/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """init"""
15 |
16 | from .visualizer import *
17 | from .data_class import *
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/data_processor/data_class.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """data class"""
15 |
16 | from dataclasses import dataclass
17 | from typing import Any
18 | from typing import List
19 | from typing import Dict
20 | from typing import Tuple
21 |
22 |
23 | @dataclass
24 | class TokenResult(object):
25 | attributions: List[float]
26 | pred_label: float
27 | pred_proba: List[float]
28 |
29 |
30 | @dataclass
31 | class AttentionResult(TokenResult):
32 | pass
33 |
34 |
35 | @dataclass
36 | class GradShapResult(TokenResult):
37 | pass
38 |
39 |
40 | @dataclass
41 | class IGResult(TokenResult):
42 | error_percent: float
43 |
44 |
45 | @dataclass
46 | class LimeResult(TokenResult):
47 | lime_score: float
48 |
49 |
50 | @dataclass
51 | class NormLIMEResult(object):
52 | # {id : (attribution, word_idx)}
53 | attributions: Dict[int, Tuple[float, int]]
54 |
55 |
56 | @dataclass
57 | class InterpretResult(object):
58 | words: List[str]
59 | word_attributions: List[float]
60 | pred_label: float
61 | pred_proba: List[float]
62 | rationale: List[int]
63 | non_rationale: List[int]
64 | rationale_tokens: List[str]
65 | non_rationale_tokens: List[str]
66 | rationale_pred_proba: float = None
67 | non_rationale_pred_proba: float = None
68 |
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/data_processor/visualizer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """visualization function"""
15 |
16 | from IPython.core.display import display, HTML
17 |
18 | import numpy as np
19 |
20 | from .data_class import TokenResult
21 | from .data_class import InterpretResult
22 |
23 |
24 | class VisualizationTextRecord(object):
25 | """
26 |     A record for text visualization.
27 | Part of the code is modified from https://github.com/pytorch/captum/blob/master/captum/attr/_utils/visualization.py
28 | """
29 |
30 | def __init__(self, interpret_res, true_label=None, words=None):
31 | if words is not None:
32 | self.words = words
33 | else:
34 | self.words = interpret_res.words
35 | self.pred_label = interpret_res.pred_label
36 | if isinstance(self.pred_label, np.ndarray):
37 | self.pred_proba = [
38 | round(proba[label], 2) for proba, label in zip(interpret_res.pred_proba, self.pred_label)
39 | ]
40 | self.pred_label = self.pred_label.tolist()
41 | else:
42 | self.pred_proba = interpret_res.pred_proba[self.pred_label]
43 | self.true_label = true_label if true_label is not None else ''
44 |
45 | # Normalization for attributions
46 | if isinstance(interpret_res, InterpretResult):
47 | word_attributions = interpret_res.word_attributions
48 | else:
49 | word_attributions = interpret_res.attributions
50 | _max = max(word_attributions)
51 | _min = min(word_attributions)
52 | self.word_attributions = [(word_imp - _min) / (_max - _min) for word_imp in word_attributions]
53 |
54 |     def record_html(self):
55 |         """change all information to html"""
56 |         return "".join([
57 |             "<tr>",
58 |             self._format_class(self.true_label),
59 |             self._format_class(self.pred_label, self.pred_proba),
60 |             self._format_word_attributions(),
61 |             "</tr>",
62 |         ])
63 |
64 |     def _format_class(self, label, prob=None):
65 |         if prob is None:
66 |             return '<td align="center"><b>{label}</b></td>'.format(label=label)
67 |         elif isinstance(prob, list):
68 |             return '<td align="center"><b>{label} ({prob})</b></td>'\
69 |                 .format(label=str(label), prob=str(prob))
70 |         else:
71 |             return '<td align="center"><b>{label} ({prob:.2f})</b></td>'\
72 |                 .format(label=label, prob=prob)
73 |
74 |     def _format_word_attributions(self):
75 |         tags = ["<td>"]
76 |         for word, importance in zip(self.words, self.word_attributions[:len(self.words)]):
77 |             color = self._background_color(importance)
78 |             unwrapped_tag = '<mark style="background-color: {color}; opacity:1.0; \
79 |                 line-height:1.75"><font color="black"> {word}\
80 |                 </font></mark>' \
81 |                 .format(color=color, word=word)
82 |             tags.append(unwrapped_tag)
83 |         tags.append("</td>")
84 |         return "".join(tags)
85 |
86 | def _background_color(self, importance):
87 | importance = max(-1, min(1, importance))
88 | if importance > 0:
89 | hue = 120
90 | sat = 75
91 | lig = 100 - int(30 * importance)
92 | else:
93 | hue = 0
94 | sat = 75
95 | lig = 100 - int(-40 * importance)
96 | return "hsl({}, {}%, {}%)".format(hue, sat, lig)
97 |
98 |
99 | def visualize_text(text_records):
100 | """visualize text"""
101 | html = [""]
102 | rows = ["Golden Label | "
103 | "Predicted Label (Prob) | "
104 | "Important scores | "]
105 | for record in text_records:
106 | rows.append(record.record_html())
107 | html.append("".join(rows))
108 | html.append("
---|
")
109 | html = HTML("".join(html))
110 | display(html)
111 | return html.data
112 |
113 |
114 | def visualize(interpret_res, true_labels=None, words=None):
115 | """
116 |     interpret_res: List[TokenResult] or List[InterpretResult], interpretability results
117 |     true_labels: List[int], golden labels of the test examples
118 |     words: List[List[str]], the word segmentation results of the test examples; the length of each entry must equal the length of the corresponding attributions
119 | """
120 | result_num = len(interpret_res)
121 | if true_labels is None:
122 | true_labels = [None] * result_num
123 | if words is None:
124 | words = [None] * result_num
125 | records = []
126 | for i in range(result_num):
127 | records.append(VisualizationTextRecord(interpret_res[i], true_label=true_labels[i], words=words[i]))
128 | html = visualize_text(records)
129 | return html
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/method/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """all method"""
15 |
16 | from .attention import AttentionInterpreter
17 | from .gradient_shap import GradShapInterpreter
18 | from .integrated_gradients import IntGradInterpreter
19 | from .lime import LIMEInterpreter
20 | from .norm_lime import NormLIMEInterpreter
21 |
22 | __all__ = [
23 | "AttentionInterpreter", "GradShapInterpreter", "IntGradInterpreter", "LIMEInterpreter", "NormLIMEInterpreter"
24 | ]
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/method/attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """attention interpreter"""
15 |
16 | import paddle
17 |
18 | from ..data_processor import AttentionResult
19 | from .base_interpret import TokenInterpreter
20 |
21 |
22 | class AttentionInterpreter(TokenInterpreter):
23 | """
24 | Attention Interpreter for NLP tasks.
25 | """
26 |
27 | def __init__(self, paddle_model, device=None, attention_name=None, predict_fn=None) -> None:
28 | """
29 | Args:
30 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions.
31 | device (str, optional): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. Default: None.
32 |             attention_name(str, optional): The name of the attention layer. The correct layer name can be found through ``print(model)``. Default: None.
33 | predict_fn(callable, optional): If the paddle_model prediction has special process, the user can customize the prediction function. Default: None.
34 | """
35 | TokenInterpreter.__init__(self, paddle_model, device)
36 |
37 | # build predict function
38 | self._build_predict_fn(attention_name=attention_name, predict_fn=predict_fn)
39 |
40 | def interpret(self, data):
41 | """Main function of the interpreter.
42 | Args:
43 | data ([type]): The inputs of the paddle_model.
44 |
45 | Returns:
46 | List[AttentionResult]: a list of predicted labels, probabilities, and interpretations.
47 | """
48 |
49 | if isinstance(data, (tuple, list)):
50 | bs = data[0].shape[0]
51 | else:
52 | bs = data.shape[0]
53 |
54 | attributions, pred_label, pred_proba = self._attention_interpret(data)
55 |
56 | # returns
57 | rets = []
58 | for i in range(bs):
59 | attresult = AttentionResult(attributions=attributions[i],
60 | pred_label=pred_label[i],
61 | pred_proba=pred_proba[i])
62 | rets.append(attresult)
63 | return rets
64 |
65 | def _build_predict_fn(self, attention_name=None, predict_fn=None):
66 | assert attention_name is not None or \
67 |             predict_fn is not None, "At least one of attention_name and predict_fn must be provided."
68 |
69 | if attention_name is None:
70 | self.predict_fn = predict_fn
71 | return
72 |
73 | def predict_fn(inputs, paddle_model=None):
74 | if paddle_model is None:
75 | paddle_model = self.paddle_model
76 | target_feature_map = []
77 |
78 | def hook(layer, input, output):
79 | target_feature_map.append(output)
80 | return output
81 |
82 | hooks = []
83 | for name, v in paddle_model.named_sublayers():
84 | if attention_name in name:
85 | h = v.register_forward_post_hook(hook)
86 | hooks.append(h)
87 |
88 | if isinstance(inputs, (tuple, list)):
89 | logits = paddle_model(*inputs) # get logits, [bs, num_c]
90 | else:
91 | logits = paddle_model(inputs) # get logits, [bs, num_c]
92 |
93 | bs = logits.shape[0]
94 | for h in hooks:
95 | h.remove()
96 |
97 | probas = paddle.nn.functional.softmax(logits, axis=1) # get probabilities.
98 | preds = paddle.argmax(probas, axis=1) # get predictions.
99 | # logits or probas
100 | preds = preds.reshape((bs, ))
101 | attention = target_feature_map[0].sum(1)[:, 0]
102 | return attention.numpy(), preds.numpy(), probas.numpy()
103 |
104 | self.predict_fn = predict_fn
105 |
106 | def _attention_interpret(self, data) -> tuple:
107 | attentions, labels, probas = self.predict_fn(data, paddle_model=self.paddle_model)
108 | return attentions, labels, probas
109 |
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/method/base_interpret.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TokenInterpreter"""
15 |
16 | import abc
17 |
18 | from ..data_processor import InterpretResult
19 | from ..common import merge_subword_special_idx
20 | from ..common import get_word_attributions
21 | from ..common import get_rationales_and_non_ratioanles
22 | from ...base_interpret import Interpreter
23 |
24 |
25 | class TokenInterpreter(Interpreter):
26 | """
27 | Interpreter is the base class for all interpretation algorithms.
28 | """
29 |
30 | def __init__(self, *args, **akwargs):
31 | Interpreter.__init__(self, *args, **akwargs)
32 |
33 | def __call__(self, *args, **kwargs):
34 | return self.interpret(*args, **kwargs)
35 |
36 | @abc.abstractmethod
37 | def interpret(self, **kwargs):
38 | """Main function of the interpreter."""
39 | raise NotImplementedError
40 |
41 | @abc.abstractmethod
42 | def _build_predict_fn(self, **kwargs):
43 | """Build self.predict_fn for interpreters."""
44 | raise NotImplementedError
45 |
46 | def alignment(self,
47 | interpret_results,
48 | contexts,
49 | batch_words,
50 | word_offset_maps,
51 | subword_offset_maps,
52 | special_tokens=[],
53 | rationale_num=5):
54 | """Align the subword's attributions to the word. Return top words with the top ``rationale_num`` as rationale and the other words as non-rationale.
55 | Args:
56 |             interpret_results ([data_class]): The outputs of the Interpreter functions, like ``AttentionResult``, ``LIMEResult`` etc.
57 |             contexts ([str]): The input texts with special tokens for the tokenizer, like ``[CLS] How are you? [SEP]``.
58 |             batch_words ([[str]]): The word segmentation results of the contexts.
59 |             word_offset_maps ([(int, int)]): The offset mapping of the word segmentation.
60 |             subword_offset_maps ([(int, int)]): The offset mapping of subwords.
61 |             special_tokens ([str], optional): The special tokens which should not be extracted as rationales.
62 |             rationale_num (int, optional): The number of rationales. Default: 5
63 | Returns:
64 | List[InterpretResult]: a list of predicted labels, probabilities, interpretations, rationales etc.
65 | """
66 |
67 | result = []
68 | assert len(contexts) == len(batch_words) == len(word_offset_maps) == len(subword_offset_maps) == len(
69 | interpret_results
70 | ), f"The lenght of contexts, batch_words, word_offset_maps, subword_offset_maps, interpret_results should be equal."
71 |
72 | for i in range(len(contexts)):
73 | words = batch_words[i]
74 | context = contexts[i]
75 | word_offset_map = word_offset_maps[i]
76 | subword_offset_map = subword_offset_maps[i]
77 | interpret_result = interpret_results[i]
78 | assert subword_offset_map[-1][1] == word_offset_map[-1][
79 | 1], "error offset_map, please check word_offset_maps and subword_offset_maps"
80 |
81 |             # merge special tokens for subword_offset_map
82 | subword_offset_map = merge_subword_special_idx(words, word_offset_map, subword_offset_map, special_tokens)
83 |
84 | attributions = interpret_result.attributions
85 | pred_label = interpret_result.pred_label
86 | pred_proba = interpret_result.pred_proba
87 |
88 | # get word attributions
89 | word_attributions = get_word_attributions(words, word_offset_map, subword_offset_map, attributions)
90 |             # get rationales and non-rationales
91 | ratioanle_result = get_rationales_and_non_ratioanles(words,
92 | word_attributions,
93 | special_tokens=special_tokens,
94 | rationale_num=rationale_num)
95 | interpret_result = InterpretResult(words=words,
96 | word_attributions=word_attributions,
97 | pred_label=pred_label,
98 | pred_proba=pred_proba,
99 | rationale=ratioanle_result['rationale_ids'],
100 | non_rationale=ratioanle_result['non_rationale_ids'],
101 | rationale_tokens=ratioanle_result['rationale_tokens'],
102 | non_rationale_tokens=ratioanle_result['non_rationale_tokens'])
103 | result.append(interpret_result)
104 | return result
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/method/gradient_shap.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """GradShapInterpreter"""
15 |
16 | from ..data_processor import GradShapResult
17 | from .base_interpret import TokenInterpreter
18 |
19 |
20 | class GradShapInterpreter(TokenInterpreter):
21 | """A wrap class of interpretdl.GradShapInterpreter, please refer to ``interpretdl/interpreter/gradient_shap.py`` for details"""
22 |
23 | def __init__(self,
24 | paddle_model,
25 | device='gpu',
26 | n_samples=5,
27 | noise_amount=0.1,
28 | embedding_name="word_embeddings") -> None:
29 | """
30 | Args:
31 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions.
32 | device (str, optional): The device used for running `paddle_model`, options: ``cpu``, ``gpu``. Default: gpu.
33 |             n_samples (int, optional): The number of randomly generated samples used to estimate the expected gradients. Default: 5.
34 | noise_amount (float, optional): Noise level of added noise to the embeddings.
35 |                 The std of Gaussian random noise is ``noise_amount * embedding.mean() * (x_max - x_min)``. Default: 0.1
36 | embedding_name (str, optional): name of the embedding layer at which the noises will be applied.
37 | Defaults to 'word_embeddings'. The correct name of embedding can be found through ``print(model)``.
38 | """
39 | TokenInterpreter.__init__(self, paddle_model, device)
40 |
41 | # build predict function
42 | self.gradshap = self._build_predict_fn(paddle_model, device)
43 |
44 | self.n_samples = n_samples
45 | self.noise_amount = noise_amount
46 | self.embedding_name = embedding_name
47 |
48 | def interpret(self, data):
49 | """Main function of the interpreter.
50 | Args:
51 | data ([type]): The inputs of the paddle_model.
52 | 
53 | Returns:
54 | List[GradShapResult]: a list of predicted labels, probabilities and interpretations.
55 | """
56 |
57 | if isinstance(data, (tuple, list)):
58 | bs = data[0].shape[0]
59 | else:
60 | bs = data.shape[0]
61 |
62 | pred_label, pred_proba, attributions = self.gradshap.interpret(data,
63 | n_samples=self.n_samples,
64 | noise_amount=self.noise_amount,
65 | embedding_name=self.embedding_name,
66 | return_pred=True)
67 | # returns
68 | rets = []
69 | for i in range(bs):
70 | shapresult = GradShapResult(attributions=attributions[i],
71 | pred_label=pred_label[i],
72 | pred_proba=pred_proba[i])
73 | rets.append(shapresult)
74 | return rets
75 |
76 | def _build_predict_fn(self, paddle_model, device='gpu'):
77 | try:
78 | from interpretdl import GradShapNLPInterpreter
79 | except ImportError as e:
80 | import sys
81 | sys.stderr.write(
82 | '''Warning with import interpretdl: please install interpretdl firstly. cmd: pip install -U interpretdl'''
83 | )
84 | raise e
85 |
86 | return GradShapNLPInterpreter(paddle_model, device)
87 |
--------------------------------------------------------------------------------
/trustai/interpretation/token_level/method/norm_lime.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """norm lime"""
15 |
16 | from ..data_processor import NormLIMEResult
17 | from .base_interpret import TokenInterpreter
18 |
19 |
20 | class NormLIMEInterpreter(TokenInterpreter):
21 | """A wrap class of interpretdl.NormLIMENLPInterpreter, please refer to ``interpretdl/interpreter/_normlime_base.py`` for details"""
22 |
23 | def __init__(self, paddle_model, preprocess_fn, unk_id, pad_id=None, device=None, batch_size=50) -> None:
24 | """
25 | Args:
26 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions.
27 |             preprocess_fn (Callable): A user-defined function that takes a raw string as input and outputs a tuple of inputs to feed into the NLP model.
28 |             unk_id (int): The word id used to replace occluded words. Typical choices include "", <unk>, and <pad>.
29 | pad_id (int or None): The word id used to pad the sequences. If None, it means there is no padding. Default: None.
30 | device (str, optional): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. Default: None.
31 | batch_size (int, optional): Number of samples to forward each time. Default: 50
32 | """
33 | TokenInterpreter.__init__(self, paddle_model, device)
34 |
35 | # build predict function
36 | self.normlime = self._build_predict_fn(paddle_model, device)
37 |
38 | self.batch_size = batch_size
39 | self.preprocess_fn = preprocess_fn
40 | self.unk_id = unk_id
41 | self.pad_id = pad_id
42 |
43 | def interpret(self, data, num_samples=500, temp_data_file='all_lime_weights.npz', save_path='normlime_weights.npy'):
44 | """Main function of the interpreter.
45 | Args:
46 | data ([type]): The inputs of the paddle_model.
47 | 
48 |             num_samples (int, optional): LIME sampling number. A larger number of samples usually gives a more accurate interpretation. Default: 500
49 |             temp_data_file (str, optional): The .npz file to save/load the dictionary where key is word ids joined by '-' and value is another dictionary with lime weights. Default: 'all_lime_weights.npz'
50 | save_path (str, optional): The .npy path to save the normlime weights. It is a dictionary where the key is label and value is segmentation ids with their importance. Default: 'normlime_weights.npy'
51 |
52 | Returns:
53 | [NormLIMEResult] NormLIME weights: {label_i: weights on features}
54 |
55 | """
56 |
57 | normlime_weights = self.normlime.interpret(data,
58 | self.preprocess_fn,
59 | unk_id=self.unk_id,
60 | pad_id=self.pad_id,
61 | num_samples=num_samples,
62 | batch_size=self.batch_size,
63 | temp_data_file=temp_data_file,
64 | save_path=save_path)
65 |
66 | normresult = NormLIMEResult(attributions=normlime_weights)
67 | return normresult
68 |
69 | def _build_predict_fn(self, paddle_model, device='gpu'):
70 | try:
71 | from interpretdl import NormLIMENLPInterpreter
72 | except ImportError as e:
73 | import sys
74 | sys.stderr.write(
75 | '''Warning: interpretdl is required by this interpreter, please install it first: pip install -U interpretdl'''
76 | )
77 | raise e
78 | return NormLIMENLPInterpreter(paddle_model, device)
79 |
--------------------------------------------------------------------------------
/tutorials/README.md:
--------------------------------------------------------------------------------
1 | # Application Examples
2 | Building on the analysis of the evidence a model relies on for its predictions, TrustAI provides methods for identifying model defects together with corresponding optimization strategies. In this directory we introduce several of them through concrete examples, hoping both that developers can apply these strategies directly to optimize their task models and that researchers are inspired to explore further uses of evidence analysis.
3 | * Identifying training-data defects and targeted optimization strategies
4 |   * [Identifying insufficient training-data coverage and effective data augmentation](#identifying-insufficient-training-data-coverage-and-effective-data-augmentation)
5 |   * [Identifying dirty data in the training set and correcting labels](#identifying-dirty-data-in-the-training-set-and-correcting-labels)
6 |   * [Identifying bias in the training data and removing it](#identifying-bias-in-the-training-data-and-removing-it)
7 |     * [Sample-weight correction](#results-sample-weight-correction)
8 |     * [Data-distribution correction](#results-data-distribution-correction)
9 | * Optimizing the prediction mechanism with evidence
10 |   * [Rationale extraction and rationale-based prediction](#rationale-extraction-and-rationale-based-prediction)
11 |
12 | We have also explored other uses of evidence analysis, such as:
13 |   * [Evidence-guided model enhancement](#evidence-guided-model-enhancement)
14 |   * [Evidence-guided identification of mispredicted data](#evidence-guided-identification-of-mispredicted-data)
15 |
16 |
17 | ## Identifying insufficient training-data coverage and effective data augmentation
18 | ### Method
19 | Insufficient training-data coverage causes a model to perform poorly on the corresponding test data. Adding training data is the first choice for improving a model, but annotation is time-consuming and labor-intensive, so labeling as little data as possible while gaining as much improvement as possible is a challenge most NLP developers face.
20 |
21 | TrustAI can identify the test examples whose poor predictions are caused by insufficient training-data coverage (the set of such examples is called the target set), and it helps developers select effective examples from unlabeled data for annotation, improving the training data's coverage of the target set and, in turn, the model's performance.
22 |
23 | ### Results
24 |
25 | Because annotation is expensive, the table below reports a simulated experiment on the LCQMC similarity task. The model is fine-tuned from ERNIE-3.0-base-zh on LCQMC training data and evaluated on the LCQMC test set and the DuQM robustness set, with accuracy as the metric.
26 |
27 |
28 | | Dataset | #Train | LCQMCdev | LCQMCtest | DuQM | Target set |
29 | | :-------: | :-------: | :-----: | :-----: |:-----: |:-----: |
30 | | Baseline | 5000 | 86.42% | 84.49% | 69.17% | 55.19% |
31 | | Baseline + 1000 random | 6000 | 86.76% | 85.05% | 69.23% | 55.20% |
32 | | Baseline + 1000 selected | 6000 | 87.04% | 85.58% | 70.20% | 69.60% |
33 |
34 | Conclusion: adding 20% effective training data improves accuracy on the target set by 14.40% (randomly adding the same amount of data improves it by only 0.01%), and improves accuracy on the full test set by 1.03% (vs. only 0.06% for random selection).
35 |
36 | The strategy has also been integrated into PaddleNLP's text-classification system and validated on multi-class, multi-label, and hierarchical classification tasks. As Figure 1 shows, with TrustAI's effective-data-selection strategy, adding 10% more training data brings a larger improvement than randomly adding 20%, i.e., the strategy **halves the annotation cost**.
37 |
38 | 
39 | Figure 1: Effect of the "insufficient-coverage identification and effective data augmentation" strategy on three common classification tasks
40 |
41 |
42 |
43 | See the example [identifying insufficient training-data coverage and effective data augmentation](./sparse_data_identification) for the detailed method and experiments.
44 |
45 |
46 |
47 | ## Identifying dirty data in the training set and correcting labels
48 | ### Method
49 | The annotation quality of the training data has a large impact on model performance and often becomes the bottleneck for further improvement. When the labeled dataset is large, however, checking the annotations becomes difficult.
50 |
51 | TrustAI automatically identifies dirty data (i.e., poorly labeled data), reducing the cost of manual inspection. As shown in Figure 2, on three public datasets the proportion of dirty data found by TrustAI's identification strategy is far higher than that found by random selection.
52 |
53 |
54 |
55 | 
56 | Figure 2: Dirty data identified by different strategies on three datasets
57 |
58 |
59 | ### Results
60 |
61 | The table below reports an experiment on the LCQMC similarity task. The model is fine-tuned from ERNIE-3.0-base-zh on LCQMC training data and evaluated on the LCQMC test set and the DuQM robustness set, with accuracy as the metric.
62 |
63 |
64 | | Model | LCQMCdev | LCQMCtest | DuQM |
65 | | :-------: | :-----: | :-----: |:-----: |
66 | | Baseline | 86.42% | 84.49% | 69.17% |
67 | | Label correction | 87.76% | 86.62% | 73.18% |
68 |
69 | Result: after manually correcting the labels of the candidate dirty data (10% of the original training set) and retraining the model, accuracy improves by 2.13% on the LCQMC test set and by 4.01% on DuQM.
70 |
71 | The strategy has also been integrated into PaddleNLP's text-classification system and validated on multi-class, multi-label, and hierarchical classification tasks, as shown in Figure 3.
72 |
73 | 
74 | Figure 3: Effect of the "dirty-data identification and label correction" strategy on three common classification tasks
75 |
76 |
77 | See the example [identifying dirty data in the training set](./dirty_data_identification) for the detailed method and experiments.
78 |
79 |
80 |
81 |
82 | ## Identifying bias in the training data and removing it
83 | ### Method
84 | Research shows that neural network models exploit biases in the dataset as prediction shortcuts; in sentiment analysis, for example, a model tends to predict "negative" whenever it sees a negation word. Such biases mean the model never truly understands the language and make it less robust.
85 |
86 | TrustAI provides two optimization strategies, sample-weight correction and data-distribution correction, which mitigate the influence of training-data bias on model training without any human intervention, improving the model's semantic understanding and, in turn, its robustness.
87 | * Sample-weight correction: reduce the influence of biased examples on the training loss, i.e., let the model learn less from them. See [Du, Yanrui, et al. 2022](https://arxiv.org/abs/2205.12593) for details; it provides two strategies for computing a sample's bias degree, `lls_d` and `lls_d_f`, where the former considers only how biased each word is and the latter considers both the bias and the frequency of each word.
88 | * Data-distribution correction: repeatedly sample the unbiased examples so that the training-data distribution becomes as balanced as possible.
89 |
90 | ### Results: sample-weight correction
91 |
92 | The baseline model is fine-tuned from ERNIE-3.0-base-zh on the LCQMC similarity dataset and evaluated on the LCQMC test set and the DuQM robustness set, with accuracy as the metric.
93 |
94 | As the table below shows, compared with the baseline, sample-weight correction improves accuracy on the DuQM robustness set by 0.94%.
95 |
96 | | Model | LCQMCdev | LCQMCtest | DuQM |
97 | | :-------: | :-------: | :-------: | :-------: |
98 | | Baseline | 90.93% | 87.06% | 73.82% |
99 | | lls_d | 90.76% | 87.58% | 74.76% |
100 | | lls_d_f | 90.80% | 87.22% | 74.44% |
101 |
102 | See the example [sample-weight correction](./data_bias_identification/less_learn_shortcut) for details.
103 |
104 |
105 |
106 | ### Results: data-distribution correction
107 |
108 | The baseline model is fine-tuned from ERNIE-3.0-base-zh on the ChnSentiCorp sentiment-analysis dataset and evaluated on a sentiment-analysis robustness set, with accuracy as the metric.
109 |
110 | As the table below shows, compared with the baseline, data-distribution correction improves accuracy on the robustness set by 1.41%.
111 | | Model | Robustness set |
112 | | :-------: | :-------: |
113 | | Baseline | 69.97 |
114 | | Distribution correction | 71.38 |
115 |
116 | See the example [data-distribution correction](./data_bias_identification/data_distribution_correction) for details.
117 |
118 |
119 |
120 | ## Rationale extraction and rationale-based prediction
121 |
122 | ### Method
123 |
124 | On long-text understanding tasks, redundant information in the input often interferes with the model's prediction and hurts robustness. In machine reading comprehension (MRC), for example, the model is easily disturbed by perturbations in the input: adding information irrelevant to the answer can change the answer the model generates.
125 |
126 | To reduce the influence of irrelevant information, TrustAI builds a two-stage "rationale extraction, then rationale-based prediction" pipeline: it first extracts the useful information from the input and discards the redundant parts, then generates the final answer from the extracted information, which improves robustness.
127 |
128 | ### Results
129 | We validated the approach on MRC: the baseline model is fine-tuned from ERNIE-3.0-base-zh on the DuReader-robust training data and evaluated on the DuReader-robust dev and test sets and the challenge set (DuReader-checklist), with answer EM (exact match) as the metric.
130 |
131 |
132 | | Model | DuReader-robust dev EM | DuReader-robust Test EM | **DuReader-checklist dev EM** |
133 | | :----------------: | ---------------------- | ----------------------- | :---------------------------: |
134 | | roberta-base | 73.18 | 45.97 | 27.56 |
135 | | Selector-Predictor | 74.31 | 50.91 | 31.04 |
136 |
137 | Conclusion: the two-stage "rationale extraction, then rationale-based prediction" scheme improves EM on the test set by 4.94%; evaluating the trained model directly on the DuReader-checklist dataset yields a 3.48% EM improvement over the officially reported baseline.
138 |
139 | See the example [solving the accuracy drop caused by text redundancy](./redundancy_removal) for details.
140 |
141 |
142 | ## Evidence-guided model enhancement
143 | ### Method
144 | Evaluating the evidence that several models rely on for their predictions shows that the rationales provided by neural network models are only weakly plausible. To further improve plausibility, TrustAI provides an evidence-guided model-enhancement scheme ([Jayaram et al. 2021](https://aclanthology.org/2021.emnlp-main.450/)): annotate a small amount of rationale data and jointly learn the original task and a rationale-learning task, so that the rationale-learning objective guides the model to base its predictions on plausible evidence, improving interpretability.
145 |
146 |
147 | 
148 |
149 |
150 | ### Results
151 |
152 | The baseline model is fine-tuned from ERNIE-2.0-EN-Base on the English SST sentiment-analysis dataset; 1,000 training examples are then annotated with rationales and used for rationale learning. Evaluation is carried out on 500 validation examples annotated with rationales. Besides prediction accuracy, the interpretability metrics plausibility, sufficiency, and comprehensiveness of the rationales are reported.
153 |
154 | As the table below shows, with rationale guidance the prediction quality improves slightly (accuracy +0.5%), while interpretability improves markedly: plausibility +5.0%, sufficiency -0.185 (lower is better), comprehensiveness +0.044.
155 |
156 | | Model | Accuracy | Plausibility | Sufficiency | Comprehensiveness |
157 | | :-------: | :-----: | :-----: | :-----: | :-----: |
158 | | base | 93.5% | 26.1% | 0.367 | 0.118 |
159 | | base + maw loss | 94.0% | 31.1% | 0.182 | 0.162 |
160 |
161 | See the example [evidence-guided model enhancement](./enhanced_by_rationale) for details.
162 |
163 |
164 |
165 | ## Evidence-guided identification of mispredicted data
166 | ### Method
167 | TrustAI's analysis methods expose the evidence a model relies on, explaining why the model made a given prediction. If that evidence is only weakly plausible, does it mean the prediction itself is untrustworthy? Following this hypothesis, we explored the idea and found that analyzing the evidence behind predictions can identify data that is likely to be mispredicted.
168 |
169 |
170 | ### Results
171 | We fine-tuned ERNIE-1.0-base on the LCQMC similarity dataset to obtain a similarity model, ran prediction on the LCQMC dev set, and used TrustAI's token-level analysis methods to extract the evidence behind each prediction.
172 |
173 | In our experiment, we selected the examples that the model judged semantically similar but whose MAP score (which measures the consistency of the evidence across the two input texts) fell below a threshold of 0.3; they account for 3.4% of the test data. Compared with the full test set, accuracy on these examples drops by 9.67%.
174 |
175 | | Data | Acc |
176 | | :-------: | :-----: |
177 | | All data | 89.53% |
178 | | Candidate mispredicted data | 79.86% |
179 |
180 | Using automatic data augmentation to generate related examples from the candidate mispredicted data and adding them to the original training data as hard positive and negative examples, the retrained model improves accuracy on this subset by 7%.
181 |
182 | See the example [identifying mispredicted data for the similarity task](./map_analysis/zh-similarity-application.ipynb) for details.
183 |
184 |
185 |
186 |
187 |
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/data_distribution_correction/README.md:
--------------------------------------------------------------------------------
1 | # Identifying bias in training data and removing it: data-distribution correction
2 |
3 | ## Method
4 | Constrained by how datasets are collected and by the experience of annotators, training datasets often contain biases. Models exploit these biases as prediction shortcuts; in sentiment analysis, for example, a model may predict "negative" as soon as it sees a negation word or description. As a result the model never learns real understanding and reasoning: it performs well on test data that follows the training distribution but tends to perform poorly on test data that does not.
5 |
6 | TrustAI provides dataset-bias identification and a bias-mitigation strategy based on distribution correction.
7 | * Bias identification: count the joint distribution of words and labels in the training data; words with an unbalanced distribution are candidate bias words, which are then filtered with task-related knowledge to obtain the real bias words. Examples containing a bias word are biased examples.
8 | * Distribution correction: repeatedly sample the unbiased examples to rebalance the data (a condensed sketch of both steps follows below).
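
The two steps above are implemented by `get_rationale_importance.py` and `balance_train_data.py`. As a condensed sketch of the identification-and-resampling logic (simplified from `balance_train_data.py`; rationale-frequency filtering and de-duplication of repeated indices are omitted here):

```python
import collections
import random

from LAC import LAC


def find_biased_tokens(examples, rationale_tokens):
    """examples: list of (label, segmented_text); returns rationale tokens whose
    label distribution is strongly skewed and whose POS tag is content-bearing."""
    lac = LAC(mode='lac')
    pos_idx, neg_idx = collections.defaultdict(list), collections.defaultdict(list)
    for i, (label, segs) in enumerate(examples):
        for token in rationale_tokens:
            if token in segs:
                (pos_idx if label == 1 else neg_idx)[token].append(i)
    biased = {}
    for token in set(pos_idx) & set(neg_idx):
        ratio = len(pos_idx[token]) / (len(pos_idx[token]) + len(neg_idx[token]))
        postags = lac.run(token)[1]
        # keep tokens with a very skewed label ratio that are not
        # conjunctions/pronouns/punctuation/numerals
        if (ratio <= 0.15 or ratio >= 0.85) and not set(['c', 'r', 'w', 'm']) & set(postags):
            biased[token] = (pos_idx[token], neg_idx[token])
    return biased


def oversample_indices(biased):
    """For each biased token, oversample examples on the minority-label side."""
    extra = []
    for pos, neg in biased.values():
        minority, majority = (neg, pos) if len(pos) > len(neg) else (pos, neg)
        k = min(len(majority) - len(minority), 2 * len(minority))
        extra += random.choices(minority, k=k)
    return extra  # indices of examples to append again to the training file
```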
9 |
10 |
11 | Note: you can visit the [AI Studio example](https://aistudio.baidu.com/aistudio/projectdetail/4434652) to try this case quickly.
12 |
13 | ## Experiment steps
14 | The baseline model is fine-tuned from ERNIE-3.0-base-zh on the ChnSentiCorp sentiment-analysis dataset and evaluated on a sentiment-analysis robustness set, with accuracy as the metric.
15 |
16 |
17 | **Step 1**: Identify bias words. Use the token-level interpretation method (`IntGradInterpreter`) to obtain the evidence the model relies on for the training data, then count the frequency of each rationale token.
18 | ```shell
19 | # download the data
20 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/distribution_data.tar && tar xf distribution_data.tar && rm distribution_data.tar
21 | # train the baseline model
22 | python -u train.py --dataset_dir ./data --train_file train.tsv --dev_file robust.tsv --num_classes 2 --save_dir ./checkpoint
23 |
24 | # collect important rationale tokens and their frequencies
25 | python -u get_rationale_importance.py --dataset_dir ./data --input_file train.tsv --num_classes 2 --rationale_path ./data/rationale_importance.txt --init_from_ckpt ./checkpoint/model_state.pdparams
26 | # rationale_path is the file where the rationale tokens and their frequencies are saved
27 | ```
28 |
29 | **Step 2**: Identify the biased examples and resample to rebalance the training data.
30 |
31 | ```shell
32 | # generate the balanced training data
33 | python -u balance_train_data.py --input_path ./data/train.tsv --rationale_path ./data/rationale_importance.txt --output_path ./data/balanced_train.tsv
34 | ```
35 |
36 | Train the model on the generated balanced data `balanced_train.tsv`.
37 |
38 | ```shell
39 | python -u train.py --dataset_dir ./data --train_file balanced_train.tsv --dev_file robust.tsv --num_classes 2 --save_dir ./checkpoint
40 | ```
41 | The experimental results are shown below:
42 | | Model | Robustness set |
43 | | :-------: | :-------: |
44 | | Baseline | 69.97 |
45 | | Distribution correction | 71.38 |
46 |
47 | Note: the results above are averages over 10 runs.
48 |
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/data_distribution_correction/balance_train_data.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import collections
4 | import random
5 | import time
6 | import os
7 | import argparse
8 | from collections import defaultdict
9 |
10 | import numpy as np
11 | import paddle
12 | import jieba
13 | from paddle.io import DataLoader, BatchSampler
14 | from paddlenlp.data import DataCollatorWithPadding
15 | from paddlenlp.datasets import load_dataset
16 | from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
17 | from paddlenlp.utils.log import logger
18 | from trustai.interpretation import get_word_offset
19 | from trustai.interpretation import IntGradInterpreter
20 | from LAC import LAC
21 | from tqdm import tqdm
22 |
23 | from utils import evaluate, preprocess_function
24 |
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument("--input_path", type=str, default=None, help="file path of input data.")
27 | parser.add_argument("--output_path", type=str, default=None, help="file path of output data.")
28 |
29 | parser.add_argument("--seed", type=int, default=3, help="random seed for initialization")
30 | parser.add_argument("--rationale_path",
31 | type=str,
32 | default="./data/rationale_importance.txt",
33 | help="Path to the rationale importance file produced by get_rationale_importance.py.")
34 |
35 | args = parser.parse_args()
36 |
37 |
38 | def set_seed(seed):
39 | """
40 | Sets random seed
41 | """
42 | random.seed(seed)
43 | np.random.seed(seed)
44 | paddle.seed(seed)
45 | os.environ['PYTHONHASHSEED'] = str(seed)
46 |
47 |
48 | def run():
49 | """
50 | Get rationale importance
51 | """
52 | set_seed(args.seed)
53 |
54 | # initialize the Chinese lexical analyzer (LAC)
55 | lac = LAC(mode='lac')
56 |
57 | # load rationale importance data
58 | with open(args.rationale_path, 'r') as f:
59 | tokens = []
60 | for line in f:
61 | if line.strip():
62 | token, frequency = line.split('\t')
63 | frequency = int(frequency)
64 | if frequency > 2:
65 | tokens.append(token)
66 | # load ChnSentiCorp train data
67 | with open(args.input_path, 'r') as f:
68 | examples = []
69 | for i, line in enumerate(tqdm(list(f))):
70 | label, text = line.strip().split('\t')
71 | examples.append((i, int(label), text, list(jieba.cut(text))))
72 |
73 | # collect the indices of positive and negative examples that contain each rationale token
74 | pos_dict = collections.defaultdict(list)
75 | neg_dict = collections.defaultdict(list)
76 | rate_dict = {}
77 | for i, token in enumerate(tqdm(tokens[::-1])):
78 | for example in examples:
79 | if token in example[3]:
80 | if example[1] == 1:
81 | pos_dict[token].append(example[0])
82 | else:
83 | neg_dict[token].append(example[0])
84 |
85 | # filter rationale tokens by POS tag and by positive/negative ratio
86 | for token in sorted(list(set(pos_dict.keys()) & set(neg_dict.keys()))):
87 | pos_list = pos_dict[token]
88 | neg_list = neg_dict[token]
89 | pos_ratio = len(pos_list) / (len(pos_list) + len(neg_list))
90 | postags = lac.run(token)[1]
91 | if (pos_ratio <= 0.15 or pos_ratio >= 0.85) and not (set(['c', 'r', 'w', 'm']) & set(postags)):
92 | rate_dict[token] = [pos_ratio if pos_ratio < 0.5 else 1 - pos_ratio, len(pos_list), len(neg_list), postags]
93 | for k, v in rate_dict.items():
94 | print(k, v, len(pos_dict[k]), len(neg_dict[k]))
95 | # sampling the data that will be added to the training set
96 | add_dict = defaultdict(int)
97 | add_list = []
98 | for token in rate_dict:
99 | pos_num = len(pos_dict[token])
100 | neg_num = len(neg_dict[token])
101 | tmp_dict = defaultdict(int)
102 | if pos_num > neg_num:
103 | for idx in random.choices(neg_dict[token], k=min(pos_num - neg_num, neg_num * 2)):
104 | tmp_dict[idx] += 1
105 | else:
106 | for idx in random.choices(pos_dict[token], k=min(neg_num - pos_num, pos_num * 2)):
107 | tmp_dict[idx] += 1
108 | for idx, count in tmp_dict.items():
109 | add_dict[idx] = max(add_dict[idx], count)
110 | for idx, count in add_dict.items():
111 | add_list.extend([idx] * count)
112 | print(add_dict)
113 | random.shuffle(add_list)
114 | # write data to train data
115 | logger.info(f"add number: {len(add_list)}")
116 | with open(args.output_path, 'w') as f:
117 | for example in examples:
118 | f.write(str(example[1]) + '\t' + example[2] + '\n')
119 | for idx in add_list:
120 | example = examples[idx]
121 | f.write(str(example[1]) + '\t' + example[2] + '\n')
122 |
123 |
124 | if __name__ == "__main__":
125 | run()
126 |
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/data_distribution_correction/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 |
17 | import paddle
18 | import paddle.nn.functional as F
19 | from paddlenlp.utils.log import logger
20 |
21 |
22 | @paddle.no_grad()
23 | def evaluate(model, criterion, metric, data_loader, name=''):
24 | """
25 | Given a dataset, it evaluates model and computes the metric.
26 | Args:
27 | model(obj:`paddle.nn.Layer`): A model to classify texts.
28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss.
29 | metric(obj:`paddle.metric.Metric`): The evaluation metric.
30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
31 | """
32 |
33 | model.eval()
34 | metric.reset()
35 | losses = []
36 | for batch in data_loader:
37 | input_ids, token_type_ids, labels = batch['input_ids'], batch[
38 | 'token_type_ids'], batch['labels']
39 | logits = model(input_ids, token_type_ids)
40 | loss = criterion(logits, labels)
41 | losses.append(loss.numpy())
42 | correct = metric.compute(logits, labels)
43 | metric.update(correct)
44 |
45 | acc = metric.accumulate()
46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc))
47 | model.train()
48 | metric.reset()
49 |
50 | return acc
51 |
52 |
53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False):
54 | """
55 | Builds model inputs from a sequence for sequence classification tasks
56 | by concatenating and adding special tokens.
57 |
58 | Args:
59 |         example(obj:`dict`): input data, containing the text and, if present, the label.
60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods.
62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
63 | Sequences longer than this will be truncated, sequences shorter will be padded.
64 |         is_test(obj:`bool`): Whether the example comes without a label. Default: False.
65 | Returns:
66 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels.
67 | """
68 | if 'text_b' not in example:
69 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length)
70 | else:
71 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length)
72 |
73 | if not is_test:
74 | result["labels"] = np.array([example['label']], dtype='int64')
75 | return result
76 |
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/less_learn_shortcut/README.md:
--------------------------------------------------------------------------------
1 | # Solving distribution bias in training data: the sample-weight correction scheme
2 | ## Method
3 | Constrained by how datasets are collected and by the experience of annotators, training datasets often contain biases. Models exploit these biases as prediction shortcuts; in sentiment analysis, for example, a model may predict "negative" as soon as it sees a negation word or description. As a result the model never learns real understanding and reasoning: it performs well on test data that follows the training distribution but tends to perform poorly on test data that does not.
4 |
5 | TrustAI provides dataset-bias identification and a bias-mitigation strategy based on weight correction.
6 | * Bias identification: count the joint distribution of words and labels in the training data; words with an unbalanced distribution are candidate bias words, and examples containing a bias word are biased examples.
7 | * Weight correction: reduce the influence of biased examples on the training loss, i.e., compute a bias degree for each example and use it to down-weight biased examples when computing the loss; see [Du, Yanrui, et al. 2022](https://arxiv.org/abs/2205.12593) for details.
8 |
9 | Note: you can visit the [AI Studio example](https://aistudio.baidu.com/aistudio/projectdetail/4434616) to try this case quickly.
10 |
11 | ## Experiment steps
12 | The baseline model is fine-tuned from ERNIE-3.0-base-zh on the LCQMC similarity dataset and evaluated on the LCQMC dev/test sets and the DuQM robustness set, with accuracy as the metric.
13 |
14 |
15 | **Step 1**: Identify the bias words in the training data. For each word, count its distribution over the classes; a word whose frequency is greater than `cnt_threshold` and whose proportion in at least one class is greater than `p_threshold` is treated as a bias word.
16 |
17 | ```shell
18 | # download the data
19 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/lls_data.tar && tar xf lls_data.tar && rm lls_data.tar
20 | # collect the bias words
21 | python -u find_bias_word.py --input_path ./data/train.tsv --num_classes 2 --cnt_threshold 3 --p_threshold 0.90 --output_dir output
22 | # cnt_threshold is the minimum frequency a word needs to be considered a bias word
23 | # p_threshold is the bias-ratio threshold: a bias word's proportion must exceed it in at least one class
24 | # output_dir is the directory where the statistics are saved
25 | ```
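
The thresholding rule above is what `find_bias_word.py` implements; a minimal sketch of it (`segments` is the word-segmented training set, `labels` the corresponding label list):

```python
import collections


def find_bias_words(segments, labels, num_classes=2, cnt_threshold=3, p_threshold=0.9):
    word2ids = collections.defaultdict(set)
    for i, segs in enumerate(segments):
        for word in segs:
            word2ids[word].add(i)
    bias_words = {}
    for word, ids in word2ids.items():
        if len(ids) < cnt_threshold:        # too rare to be a reliable statistic
            continue
        counts = [0] * num_classes
        for i in ids:
            counts[labels[i]] += 1
        p = max(counts) / sum(counts)       # how concentrated the word is on one class
        if p >= p_threshold:
            bias_words[word] = p            # word -> bias ratio
    return bias_words
```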
26 |
27 | **Step 2**: Based on the bias-word statistics, compute a bias degree for each training example; it is used as the example's weight on the training loss.
28 |
29 | The scheme currently provides two strategies for computing the sample bias degree, `lls_d` and `lls_d_f`: the former considers only how biased each word is, while the latter considers both the bias and the frequency of each word. A condensed sketch of the computation follows the commands below.
30 |
31 | ```shell
32 | # compute sample bias degrees with the `lls_d` strategy
33 | python -u lls.py --input_path ./data/train.tsv --bias_dir ./output --stopwords_path ./data/stop_words.txt --num_classes 2 --mode lls_d --output_path ./data/train_lls_d.tsv
34 | # compute sample bias degrees with the `lls_d_f` strategy
35 | python -u lls.py --input_path ./data/train.tsv --bias_dir ./output --stopwords_path ./data/stop_words.txt --num_classes 2 --mode lls_d_f --output_path ./data/train_lls_d_f.tsv
36 | # mode selects the strategy used to compute the sample bias degree, currently `lls_d` or `lls_d_f`
37 | # output_path is where the training set augmented with bias degrees is written
38 | ```
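
Concretely, `lls.py` turns the word-level statistics into a per-sample loss weight roughly as follows (a condensed sketch; `bias_words` maps each bias word to its bias ratio and `word_cnt` to its frequency, as produced in Step 1; stop-word filtering is omitted):

```python
def word_scores(bias_words, word_cnt, num_classes=2, alpha=0.01, use_freq=False):
    """lls_d: distance of a word's label distribution from uniform;
    lls_d_f: additionally rewards frequent bias words."""
    return {w: abs(p - 1.0 / num_classes) + (alpha * word_cnt[w] if use_freq else 0.0)
            for w, p in bias_words.items()}


def sample_weights(sample_words, scores, low=0.0, high=0.15):
    """sample_words: {sample_id: words in the sample}; returns {sample_id: loss weight}."""
    bias = {}
    for sid, words in sample_words.items():
        hits = [scores[w] for w in words if w in scores]
        if hits:                              # samples without bias words keep weight 1.0
            bias[sid] = sum(hits) / len(hits)
    mx, mn = max(bias.values()), min(bias.values())
    span = (mx - mn) or 1.0
    # min-max normalize the bias degree into [low, high]; the weight is 1 - normalized bias
    return {sid: 1.0 - (low + (high - low) * (b - mn) / span) for sid, b in bias.items()}
```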
39 |
40 | **Step 3**: Train the model on the training data with bias degrees; the bias degree acts on the loss computation (see the sketch after the commands).
41 | ```shell
42 | # train on the data produced by the `lls_d` strategy
43 | python -u train.py --dataset_dir ./data --train_file train_lls_d.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./lls_d_checkpoint
44 | # train on the data produced by the `lls_d_f` strategy
45 | python -u train.py --dataset_dir ./data --train_file train_lls_d_f.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./lls_d_f_checkpoint
46 | ```
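
`train.py` reads the extra weight column (stored under `weights` by `preprocess_function` in `utils.py`) and uses it to scale the loss. The provided script is not reproduced here; a typical weighted-loss step looks like this sketch:

```python
import paddle.nn.functional as F


def weighted_loss(logits, labels, weights):
    # per-sample cross entropy, scaled by the bias-derived weight, then averaged
    loss = F.cross_entropy(logits, labels, reduction='none')
    return (loss * weights.reshape(loss.shape)).mean()
```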
47 |
48 | The results are shown below: compared with the baseline, weight correction improves accuracy on the DuQM robustness set by 0.94%.
49 |
50 | | Model | LCQMCdev | LCQMCtest | DuQM |
51 | | :-------: | :-------: | :-------: | :-------: |
52 | | Baseline | 90.93 | 87.06 | 73.82 |
53 | | lls_d | 90.76 | 87.58 | 74.76 |
54 | | lls_d_f | 90.80 | 87.22 | 74.44 |
55 |
56 | Note: the results above are averages over 3 runs.
57 |
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/less_learn_shortcut/find_bias_word.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import collections
4 | import argparse
5 |
6 | from LAC import LAC
7 | from tqdm import tqdm
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument("--output_dir",
11 | default="./output",
12 | type=str,
13 | help="The output directory where the result will be written.")
14 | parser.add_argument("--input_path", type=str, default=None, help="train data file path")
15 | parser.add_argument('--num_classes', type=int, default=2, help='Number of classification.')
16 | parser.add_argument('--cnt_threshold', type=int, default=3, help='Count threshold of statistical biased words')
17 | parser.add_argument('--p_threshold', type=float, default=0.85, help='Probability threshold of statistical biased words')
18 |
19 | args = parser.parse_args()
20 |
21 |
22 | class BiasWord(object):
23 | """
24 | Statistic the biased words in the dataset
25 | """
26 |
27 | def __init__(self, segments, labels, num_classes=2, cnt_threshold=3, p_threshold=0.85):
28 | self.cnt_threshold = cnt_threshold
29 | self.p_threshold = p_threshold
30 | self.num_classes = num_classes
31 | self.segments = segments
32 | self.labels = labels
33 |
34 | def process(self):
35 | """
36 | process function
37 | """
38 | self._get_dict()
39 | self._search_bias_word()
40 | print("number of bias_words:", len(self.bias_words))
41 | return self.bias_words, self.bias_word_cnt, self.id2words
42 |
43 | def _get_dict(self):
44 | self.word2ids = collections.defaultdict(set)
45 | self.id2words = collections.defaultdict(set)
46 | for n, segs in enumerate(self.segments):
47 | for seg in segs:
48 | self.word2ids[seg].add(n)
49 | self.id2words[n] = set(segs)
50 |
51 | def _search_bias_word(self):
52 | self.bias_words = {}
53 | self.bias_word_cnt = {}
54 | for word, sentids in self.word2ids.items():
55 | if len(sentids) >= self.cnt_threshold:
56 | cnts = [0] * self.num_classes
57 |
58 | for sentid in sentids:
59 | label = self.labels[sentid]
60 | cnts[label] += 1
61 | assert sum(cnts) != 0
62 | max_cnt = max(cnts)
63 | p = max_cnt / sum(cnts)
64 | if p >= self.p_threshold:
65 | self.bias_words[word] = p
66 | self.bias_word_cnt[word] = len(sentids)
67 |
68 |
69 | if __name__ == "__main__":
70 | # initialize tokenizer
71 | lac = LAC(mode='rank')
72 |
73 | # preprocess data: get segments, labels and lines
74 | segments = []
75 | labels = []
76 | lines = []
77 | with open(args.input_path, 'r') as f:
78 | for line in tqdm(list(f)):
79 | lines.append(line)
80 | query, title, label = line.strip().split('\t')
81 | seg_res = lac.run([query, title])
82 | query_segs = seg_res[0][0]
83 | title_segs = seg_res[1][0]
84 | segments.append(query_segs + title_segs)
85 | labels.append(int(label))
86 |
87 | # get bias_words
88 | biasword = BiasWord(segments, labels, num_classes=args.num_classes, cnt_threshold=args.cnt_threshold, p_threshold=args.p_threshold)
89 | # b_words: biased words, dict
90 | # b_word_cnt: count of biased words, dict
91 | # id2words: sentence index to words, dict
92 | b_words, b_word_cnt, id2words = biasword.process()
93 |
94 | # save result to output_dir
95 | if not os.path.exists(args.output_dir):
96 | os.makedirs(args.output_dir)
97 | with open(os.path.join(args.output_dir, "bias_word.json"), 'w') as f:
98 | json.dump(b_words, f, ensure_ascii=False)
99 | with open(os.path.join(args.output_dir, "bias_word_cnt.json"), 'w') as f:
100 | json.dump(b_word_cnt, f, ensure_ascii=False)
101 | with open(os.path.join(args.output_dir, "id2words.json"), 'w') as f:
102 | for k, v in id2words.items():
103 | id2words[k] = list(v)
104 | json.dump(id2words, f, ensure_ascii=False)
105 |
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/less_learn_shortcut/lls.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--input_path", type=str, default=None, help="input data file path")
7 | parser.add_argument("--output_path", type=str, default=None, help="output data file path")
8 | parser.add_argument("--bias_dir", type=str, default=None, help="bias data directory.")
9 | parser.add_argument("--stopwords_path", type=str, default=None, help="stopwords data file path")
10 | parser.add_argument('--num_classes', type=int, default=2, help='Number of classification.')
11 | parser.add_argument('--alpha',
12 | type=float,
13 | default=0.01,
14 | help='Hyperparameters for frequency of words when mode is lls_d_f.')
15 | parser.add_argument('--mode',
16 | type=str,
17 | default='lls_d',
18 | choices=['lls_d', 'lls_d_f'],
19 | help='Strategy used to compute the sample bias degree.')
20 |
21 | args = parser.parse_args()
22 |
23 |
24 | def filter_stopwords(score_w, stop_words):
25 | for word in list(score_w.keys()):
26 | if word in stop_words:
27 | del score_w[word]
28 | return score_w
29 |
30 |
31 | def word_score(d, num_classes):
32 | score_w = {}
33 | for k in d.keys():
34 | score_w[k] = abs(d[k] - 1 / num_classes)
35 | return score_w
36 |
37 |
38 | def word_score_freq(d, d_cnt, num_classes, alpha):
39 | score_w = {}
40 | for k in d.keys():
41 | score_w[k] = abs(d[k] - 1 / num_classes) + alpha * d_cnt[k]
42 | return score_w
43 |
44 |
45 | def lls_basic(score_w, id2words):
46 | sample_bias = {}
47 | for n in range(len(id2words)):
48 |
49 | sample_score = 0
50 | cnt = 0
51 | for word in id2words[str(n)]:
52 | if word in score_w:
53 | sample_score += score_w[word]
54 | cnt += 1
55 | if cnt != 0:
56 | sample_bias[n] = sample_score / cnt
57 | return sample_bias
58 |
59 |
60 | def softxmax(sample_bias, a=0, b=0.15):
61 | """
62 | Min-max normalize sample bias scores into [a, b] and convert them to loss weights (1 - normalized score)
63 | """
64 | scores = []
65 | for k, v in sample_bias.items():
66 | scores.append(v)
67 | maxn, minn = max(scores), min(scores)
68 | sample_bias_norm = {}
69 | for k, sc in sample_bias.items():
70 | sc_softmax = a + (b - a) / (maxn - minn) * (sc - minn)
71 | sample_bias_norm[k] = (1 - sc_softmax)
72 | return sample_bias_norm
73 |
74 |
75 | if __name__ == "__main__":
76 |
77 | # load data
78 | with open(args.stopwords_path, 'r') as f:
79 | stop_words = []
80 | for line in f.readlines():
81 | stop_words.append(line.strip())
82 | with open(os.path.join(args.bias_dir, 'id2words.json'), 'r') as f:
83 | id2words = json.load(f)
84 | with open(os.path.join(args.bias_dir, 'bias_word.json'), 'r') as f:
85 | d = json.load(f)
86 | with open(os.path.join(args.bias_dir, 'bias_word_cnt.json'), 'r') as f:
87 | d_cnt = json.load(f)
88 | with open(args.input_path, 'r') as f:
89 | lines = list(f)
90 |
91 | # get bias degree for example
92 | mode = args.mode
93 | if mode == 'lls_d':
94 | score_w = word_score(d, num_classes=args.num_classes)
95 | score_w = filter_stopwords(score_w, stop_words)
96 | sample_bias = lls_basic(score_w, id2words)
97 | sample_bias_norm = softxmax(sample_bias)
98 | elif mode == 'lls_d_f':
99 | score_w = word_score_freq(d, d_cnt, num_classes=args.num_classes, alpha=args.alpha)
100 | score_w = filter_stopwords(score_w, stop_words)
101 | sample_bias = lls_basic(score_w, id2words)
102 | sample_bias_norm = softxmax(sample_bias)
103 | else:
104 | raise KeyError(f"Unknown mode: {mode}, mode should be chosen from [lls_d, lls_d_f].")
105 |
106 | # save result
107 | with open(args.output_path, 'w', encoding='utf-8') as f:
108 | for n, line in enumerate(lines):
109 | if n in sample_bias_norm:
110 | f.write(line.strip() + '\t' + str(sample_bias_norm[n]) + '\n')
111 | else:
112 | f.write(line.strip() + '\t' + str(1) + '\n')
--------------------------------------------------------------------------------
/tutorials/data_bias_identification/less_learn_shortcut/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 |
17 | import paddle
18 | import paddle.nn.functional as F
19 | from paddlenlp.utils.log import logger
20 |
21 |
22 | @paddle.no_grad()
23 | def evaluate(model, criterion, metric, data_loader, name=''):
24 | """
25 | Given a dataset, it evaluates model and computes the metric.
26 | Args:
27 | model(obj:`paddle.nn.Layer`): A model to classify texts.
28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss.
29 | metric(obj:`paddle.metric.Metric`): The evaluation metric.
30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
31 | """
32 |
33 | model.eval()
34 | metric.reset()
35 | losses = []
36 | for batch in data_loader:
37 | input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
38 | logits = model(input_ids, token_type_ids)
39 | loss = criterion(logits, labels)
40 | loss = loss.mean()
41 | losses.append(loss.numpy())
42 | correct = metric.compute(logits, labels)
43 | metric.update(correct)
44 |
45 | acc = metric.accumulate()
46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc))
47 | model.train()
48 | metric.reset()
49 |
50 | return acc
51 |
52 |
53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False):
54 | """
55 | Builds model inputs from a sequence for sequence classification tasks
56 | by concatenating and adding special tokens.
57 |
58 | Args:
59 |         example(obj:`dict`): input data, containing the text and, if present, the label.
60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods.
62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
63 | Sequences longer than this will be truncated, sequences shorter will be padded.
64 |         is_test(obj:`bool`): Whether the example comes without a label. Default: False.
65 | Returns:
66 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels.
67 | """
68 | if 'text_b' not in example:
69 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length)
70 | else:
71 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length)
72 |
73 | if not is_test:
74 | result["labels"] = np.array([example['label']], dtype='int64')
75 | result["weights"] = np.array([example['weight']], dtype='float32')
76 | return result
77 |
--------------------------------------------------------------------------------
/tutorials/data_map/README.md:
--------------------------------------------------------------------------------
1 | # Dataset Cartography with Training Dynamics
2 |
3 | ## Method
4 | Existing work shows that the training signals collected during training can be used to draw a data map: partitioning the data by these signals yields regions with different characteristics, such as hard-to-learn or mislabeled examples. Drawing a data map helps developers better understand their training data.
5 |
6 | TrustAI provides a "training-signal collection -> data-map plotting" scheme. First, the training signals of every training example are collected at different steps during training; then, based on the resulting statistics, a data map is drawn along the chosen signal dimensions.
7 |
8 | Note: you can visit the [AI Studio example](https://aistudio.baidu.com/aistudio/projectdetail/5307701) to try this case quickly.
9 |
10 | ## Experiment steps
11 | We use a simulated experiment on the LCQMC similarity dataset to describe the steps and the effect of the scheme.
12 |
13 |
14 | **Step 1**: Randomly sample 100 examples from the LCQMC training set as the example training data. The training file is in .tsv format with the following content:
15 |
16 | ```shell
17 | # tsv data: one training example per line, fields separated by tabs; the fields are:
18 | [
19 | "text_a" : xxx,    // text a of the training example
20 | "text_b" : xxx,    // text b of the training example
21 | "label" : xxx,     // the example's label
22 | "s_label" : xxx,   // a constructed label for the example (optional)
23 | ]
24 | ```
25 | Note: ``s_label`` is optional; it is a user-constructed label, e.g., for injected dirty data.
26 |
27 | Fine-tune ERNIE-3.0-base-zh on the new training set `sample_100.tsv` to obtain the baseline model:
28 |
29 | ```shell
30 | # train the model and collect training signals
31 | sh run_train_pointwise.sh
32 | ```
33 | The training signals of all training examples are saved under `outputs`, organized by training step.
34 |
35 | Note: see Lines 199-218 of `train_pointwise.py` for the signal-collection code and adapt it to your own training code. The collected signals are listed below (you can design and collect more); a minimal collection sketch follows the format description.
36 |
37 | ```shell
38 | # .jsonl data, one record per training example and evaluation step
39 | [
40 | {'id' : xxx,            // id of the training example
41 | 'label' : xxx,          // the example's label
42 | 'pred_label' : xxx,     // the predicted label
43 | 'correct' : xxx,        // whether the example is predicted correctly
44 | 'loss' : xxx,           // the example's current loss
45 | 'probs' : [xxx, xxx],   // the predicted probability of each class
46 | 'label_probs' : xxx     // the predicted probability of the gold label
47 | }
48 | ... ...
49 | ]
50 | ```
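
A minimal sketch of collecting such records at each evaluation step (field names follow the format above; the dataloader is assumed to carry the example ids, and the exact code in `train_pointwise.py` differs in detail):

```python
import json

import paddle
import paddle.nn.functional as F


@paddle.no_grad()
def dump_training_signals(model, data_loader, step, output_dir="outputs"):
    """Write one JSON line per training example with its current training signals."""
    model.eval()
    with open(f"{output_dir}/signals_step_{step}.jsonl", "w") as f:
        for batch in data_loader:
            logits = model(batch["input_ids"], batch["token_type_ids"])
            probs = F.softmax(logits, axis=-1)
            losses = F.cross_entropy(logits, batch["labels"], reduction="none")
            preds = paddle.argmax(probs, axis=-1)
            for i in range(len(preds)):
                label = int(batch["labels"][i])
                record = {
                    "id": int(batch["id"][i]),
                    "label": label,
                    "pred_label": int(preds[i]),
                    "correct": "true" if int(preds[i]) == label else "false",
                    "loss": float(losses[i]),
                    "probs": [float(p) for p in probs[i]],
                    "label_probs": float(probs[i][label]),
                }
                f.write(json.dumps(record) + "\n")
    model.train()
```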
51 |
52 | **Step 2**: Process the training signals, i.e., compute aggregate signals from the signals collected at different training steps, e.g., the overall average prediction probability across steps.
53 |
54 | ```shell
55 | # process the training signals
56 | python -u sample_stat_summary.py
57 | ```
58 | The output is saved under `outputs`.
59 |
60 |
61 | Details of the aggregated training signals (a short aggregation sketch follows the list):
62 |
63 | ```shell
64 | # tsv data: one line per training example containing all of its training signals, separated by tabs; the signals are:
65 | [
66 | "id" : xxx,               // id of the training example
67 | "label" : xxx,            // the example's label
68 | "s_label" : xxx,          // constructed label; the data map allows marking hard examples (s_label = 1) and injected dirty examples (s_label = 2)
69 | "correct_times" : xxx,    // number of times the example was predicted correctly
70 | "correct_ratio" : xxx,    // fraction of predictions that were correct
71 | "avg_probs" : xxx,        // mean of the gold-label probabilities over the collected predictions
72 | "label_var" : xxx,        // variance of the gold-label probabilities over the collected predictions
73 | "max_label_probs" : xxx,  // maximum gold-label probability
74 | "min_label_probs" : xxx,  // minimum gold-label probability
75 | "forgetting_times" : xxx, // how many times the model forgot the example (predicted correctly earlier, then incorrectly)
76 | "learnt_times" : xxx,     // how many times the model learned the example (predicted incorrectly earlier, then correctly)
77 | "first_forget" : xxx,     // the step at which the example was first forgotten
78 | "first_learn" : xxx,      // the step at which the example was first learned
79 | ]
80 | ```
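
`sample_stat_summary.py` computes these aggregates; at its core it is just per-example statistics over the step-wise records. A short sketch (`per_step_label_probs` and `per_step_correct` are assumed to be arrays built from the .jsonl records, one row per example and one column per collected step):

```python
import numpy as np

label_probs = np.asarray(per_step_label_probs)        # shape [num_examples, num_steps]
correct = np.asarray(per_step_correct, dtype=bool)    # same shape

avg_probs = label_probs.mean(axis=1)      # average confidence  -> vertical axis of the map
label_var = label_probs.var(axis=1)       # confidence variance -> horizontal axis of the map
correct_ratio = correct.mean(axis=1)
# a "forgetting" event is a correct -> incorrect transition between consecutive steps
forgetting_times = (correct[:, :-1] & ~correct[:, 1:]).sum(axis=1)
learnt_times = (~correct[:, :-1] & correct[:, 1:]).sum(axis=1)
```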
81 |
82 |
83 | **Step 3**: Based on the produced training signals, pick two of them as the main dimensions of the data map (average confidence and confidence variance by default), and use other signals (correct ratio, correct times, forgetting times, learnt times, etc.) to distinguish points by color or shape.
84 |
85 | ```shell
86 | # plot the data map
87 | python -u plot_map.py
88 |
89 | # parameters
90 | attr1: str, default "avg_probs"; the signal used as the vertical axis of the data map
91 | attr2: str, default "label_var"; the signal used as the horizontal axis of the data map
92 | criterion: str, default empty; a training signal used to filter the data, only examples satisfying the condition are plotted
93 | threshold: float, default 0; used together with criterion, the threshold applied to the selected signal
94 | use_f_times: float, default -1; use forgetting_times and select all examples whose forgetting count is at least use_f_times
95 | use_l_times: float, default -1; use learnt_times and select all examples whose learnt count is at least use_l_times
96 |
97 | # example data maps
98 | python -u plot_map.py # Figure 1, left
99 | python -u plot_map.py --criterion forgetting_times --threshold 1 # Figure 1, middle
100 | python -u plot_map.py --use_l_times 0 # Figure 1, right
101 | ```
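
`plot_map.py` wraps this up with the parameters above. As an illustration only (not the provided script), a stripped-down matplotlib version of the default map, plotting average confidence against confidence variance colored by correct ratio, could look like:

```python
import pandas as pd
import matplotlib.pyplot as plt

# summary file written by sample_stat_summary.py
df = pd.read_csv("outputs/output_data.json.result", sep="\t")

fig, ax = plt.subplots(figsize=(6, 5))
sc = ax.scatter(df["label_var"], df["avg_probs"], c=df["correct_ratio"], cmap="coolwarm", s=18)
ax.set_xlabel("label_var (variability)")
ax.set_ylabel("avg_probs (confidence)")
fig.colorbar(sc, label="correct_ratio")
fig.savefig("data_map_example.png", dpi=200)
```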
102 |
103 |
104 |
105 |
106 | 
107 | Figure 1: Example data maps. Left: default settings. Middle: only examples selected with criterion forgetting_times and threshold 1. Right: learnt_times (use_l_times=0) used to color the points, with correct_times further distinguishing the upper-left and lower-left regions. This data map is drawn from the training signals of the full LCQMC dataset, not from the 100 provided samples.
108 |
109 |
--------------------------------------------------------------------------------
/tutorials/data_map/data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import paddle
16 | import numpy as np
17 |
18 | from paddlenlp.datasets import MapDataset
19 |
20 |
21 | def create_dataloader(dataset,
22 | mode='train',
23 | batch_size=1,
24 | batchify_fn=None,
25 | trans_fn=None):
26 | if trans_fn:
27 | dataset = dataset.map(trans_fn)
28 |
29 | shuffle = True if mode == 'train' else False
30 | if mode == 'train':
31 | batch_sampler = paddle.io.DistributedBatchSampler(
32 | dataset, batch_size=batch_size, shuffle=shuffle)
33 | else:
34 | batch_sampler = paddle.io.BatchSampler(
35 | dataset, batch_size=batch_size, shuffle=shuffle)
36 |
37 | return paddle.io.DataLoader(
38 | dataset=dataset,
39 | batch_sampler=batch_sampler,
40 | collate_fn=batchify_fn,
41 | return_list=True)
42 |
43 |
44 | def read_text_pair(data_path):
45 | """Reads data."""
46 | with open(data_path, 'r', encoding='utf-8') as f:
47 | for line in f:
48 | data = line.rstrip().split("\t")
49 | if len(data) != 2:
50 | continue
51 | yield {'query': data[0], 'title': data[1]}
52 |
53 |
54 | def convert_pointwise_example(example,
55 | tokenizer,
56 | max_seq_length=512,
57 | is_test=False,
58 | language='en'):
59 | if language == 'ch':
60 | q_name = "text_t"
61 | t_name = "text_q"
62 | s_name = "s_label"
63 | #t_name = "title"
64 | l_name = "label"
65 | else:
66 | q_name = "text_t"
67 | t_name = "text_q"
68 | l_name = "label"
69 | s_name = "s_label"
70 | #q_name = "sentence1"
71 | #t_name = "sentence2"
72 | #l_name = "labels"
73 |
74 | query, title = example[q_name], example[t_name]
75 |
76 | encoded_inputs = tokenizer(
77 | text=query, text_pair=title, max_seq_len=max_seq_length)
78 |
79 | input_ids = encoded_inputs["input_ids"]
80 | token_type_ids = encoded_inputs["token_type_ids"]
81 | sep_ids = len(input_ids) - 1
82 |
83 | #print("input_ids:",input_ids)
84 | if not is_test:
85 | label = np.array([example[l_name]], dtype="int64")
86 | s_label = np.array([example[s_name]],dtype="int64")
87 | sep_ids = np.array([sep_ids], dtype="int64")
88 | #print('label', label)
89 | return input_ids, token_type_ids, label, s_label, sep_ids
90 | else:
91 | return input_ids, token_type_ids
92 |
93 |
94 | def convert_pairwise_example(example,
95 | tokenizer,
96 | max_seq_length=512,
97 | phase="train"):
98 |
99 | if phase == "train":
100 | query, pos_title, neg_title = example["query"], example[
101 | "title"], example["neg_title"]
102 |
103 | pos_inputs = tokenizer(
104 | text=query, text_pair=pos_title, max_seq_len=max_seq_length)
105 | neg_inputs = tokenizer(
106 | text=query, text_pair=neg_title, max_seq_len=max_seq_length)
107 |
108 | pos_input_ids = pos_inputs["input_ids"]
109 | pos_token_type_ids = pos_inputs["token_type_ids"]
110 | neg_input_ids = neg_inputs["input_ids"]
111 | neg_token_type_ids = neg_inputs["token_type_ids"]
112 |
113 | return (pos_input_ids, pos_token_type_ids, neg_input_ids,
114 | neg_token_type_ids)
115 |
116 | else:
117 | query, title = example["query"], example["title"]
118 |
119 | inputs = tokenizer(
120 | text=query, text_pair=title, max_seq_len=max_seq_length)
121 |
122 | input_ids = inputs["input_ids"]
123 | token_type_ids = inputs["token_type_ids"]
124 | if phase == "eval":
125 | return input_ids, token_type_ids, example["label"]
126 | elif phase == "predict":
127 | return input_ids, token_type_ids
128 | else:
129 | raise ValueError("not supported phase:{}".format(phase))
130 |
131 |
132 | def gen_pair(dataset, pool_size=100):
133 | """
134 | Generate triplet randomly based on dataset
135 |
136 | Args:
137 | dataset: A `MapDataset` or `IterDataset` or a tuple of those.
138 |         Each example is composed of 2 texts: example["query"], example["title"]
139 | pool_size: the number of example to sample negative example randomly
140 |
141 | Return:
142 | dataset: A `MapDataset` or `IterDataset` or a tuple of those.
143 |         Each example is composed of 3 texts: example["query"], example["pos_title"], example["neg_title"]
144 | """
145 |
146 | if len(dataset) < pool_size:
147 | pool_size = len(dataset)
148 |
149 | new_examples = []
150 | pool = []
151 | tmp_exmaples = []
152 |
153 | for example in dataset:
154 | label = example["label"]
155 |
156 | # Filter negative example
157 | if label == 0:
158 | continue
159 |
160 | tmp_exmaples.append(example)
161 | pool.append(example["title"])
162 |
163 | if len(pool) >= pool_size:
164 | np.random.shuffle(pool)
165 | for idx, example in enumerate(tmp_exmaples):
166 | example["neg_title"] = pool[idx]
167 | new_examples.append(example)
168 | tmp_exmaples = []
169 | pool = []
170 | else:
171 | continue
172 | return MapDataset(new_examples)
173 |
--------------------------------------------------------------------------------
/tutorials/data_map/run_train_pointwise.sh:
--------------------------------------------------------------------------------
1 | ###
2 | # This script is used to finetune pretrained models
3 | ###
4 |
5 | export CUDA_VISIBLE_DEVICES=3
6 | LANGUAGE="ch"
7 | timestamp=`date +"%Y%m%d_%H%M%S"`
8 | data_dir='./'
9 | LEARNING_RATE=3e-5
10 | MAX_SEQ_LENGTH=256
11 |
12 | [ -d "logs" ] || mkdir -p "logs"
13 | [ -d "outputs" ] || mkdir -p "outputs"
14 | set -x
15 |
16 | train_file=sample_100.tsv
17 | dev_file=$train_file
18 | train_size=100
19 |
20 | batch_size=32
21 | epoch=5
22 | save_model_num=5
23 | epoch_steps=$[$train_size/$batch_size]
24 | save_steps=$[$epoch_steps*$epoch/${save_model_num}]
25 |
26 | python3 ./train_pointwise.py \
27 | --learning_rate $LEARNING_RATE \
28 | --max_seq_length $MAX_SEQ_LENGTH \
29 | --batch_size ${batch_size} \
30 | --epochs ${epoch} \
31 | --data_dir $data_dir \
32 | --train_set ${train_file} \
33 | --dev_set ${dev_file} \
34 | --eval_step ${save_steps} \
35 | --warmup_proportion 0.1 \
36 | --save_dir saved_model/${timestamp} >> logs/log_${timestamp}
37 |
38 |
--------------------------------------------------------------------------------
/tutorials/data_map/sample_100.tsv:
--------------------------------------------------------------------------------
1 | text_a text_b label type
2 | 御龙在天小还丹怎么做 御龙在天怎么上不去 0 0
3 | 东风日产轩逸怎么样? 东风日产新轩逸怎么样 1 0
4 | 穿越火线有哪些小说 穿越火线小说。 1 0
5 | 我的爸爸作文 爸爸的手作文 1 0
6 | 小米华为魅族哪个好? 酷派小米华为魅族哪个好 1 0
7 | 为什么苹果信号这么差? 为什么苹果6手机信号这么差 1 0
8 | 微信朋友圈能看到访客吗 微信朋友圈都能看到吗 0 0
9 | 魔兽世界,猎人宏 魔兽世界猎人弓 1 0
10 | 怎么查别人微信的聊天记录 微信聊天记录怎么查 1 0
11 | 列方程解应用题, 列方程解应用题。 1 0
12 | 支付宝里的钱怎么转到银行卡 支付宝的钱怎么转到银行卡 1 0
13 | 北京夜店有哪些 北京有哪些夜店 1 0
14 | 这是什么颜色啊? 这是什么颜色阿! 1 0
15 | 优酷视频为什么不能下载 优酷为什么不能下载视频了 1 0
16 | 怎么样才能做好网店! 怎么做好网店? 1 0
17 | 这是哪个明星小时候 这是哪个明星的小时候 1 0
18 | 天天飞车哪个车手刷高分多 天天飞车高分用哪个车手? 1 0
19 | 不客气的,为防止掉线,请您在3-4分钟内回复我一下就好 为防止掉线,请您在3分钟内回复我一下,谢谢您的配合。您好,在否?谢谢 1 0
20 | 我的特一营全集观看那有了吗 电视剧我的特一营全集观看哪里有 1 0
21 | 如何从网上查询个人征信 在网上如何查个人征信,要明细版的 1 0
22 | 在家赚钱有些什么方法? 在家带孩子有什么赚钱的方法 1 0
23 | 现在有什么好看的连续剧或者电影? 有什么好看的电视剧,或者电影 1 0
24 | 天龙八部问题 新天龙八部问题 0 0
25 | 这个里番是什么类型的? 是什么类型 0 0
26 | 婴儿理发器哪个牌子好 婴儿理发器哪个牌子好? 1 0
27 | 手机版的百度知道可以签到嘛? 百度知道怎么看还有多少升级啊? 0 0
28 | 怎样跟自己喜欢的人表白? 怎么和自己喜欢的人表白? 1 0
29 | 发什么成绩? 发什么成绩 1 0
30 | 几岁才可以办银行卡啊? 办银行卡要几岁才可以办? 1 0
31 | 您的是人工审核的方式吗? 只有人工审核的方式吗 0 0
32 | 怎么让刘海紧贴着额头 怎么使刘海不贴额头 0 0
33 | 无聊的时候你们都在干嘛呢? 你们无聊的时候都干嘛呢? 1 0
34 | 我爱你用韩语怎么写? 我爱你用韩语怎么说? 0 0
35 | 世界上有没有外星人 世界有没有外星人 1 0
36 | 这爱已打烊是什么意思 打烊是什么意思 0 0
37 | 硅是由什么构成的 硅由什么构成 1 0
38 | 二本中电子信息工程专业哪个学校比较好 电子信息工程专业分流哪个方向比较好 0 0
39 | 男孩子名字,哪个字好 带氵,钅字旁的男孩名字有哪些? 0 0
40 | 天天酷跑怎么刷金币和钻石 天天酷跑怎么刷金币钻石 1 0
41 | 入团申请书500字左右 入团申请书600字 0 0
42 | 从马鞍山火车站怎么到博望汽车站? 从马鞍山博望到黄池怎么走 0 0
43 | 大写的我怎么写 给的大写怎么写 0 0
44 | 戴耳机的男生头像 头像耳机男生 1 0
45 | 纪念碑谷这关怎么过? 史上最坑爹的游戏,咋过, 0 0
46 | 这个是您转账的 这应该是您查询错误账户了 0 0
47 | 平板电脑哪个品牌好呢 国产哪个牌子的平板电脑好 1 0
48 | 小和尚念经下一句是什么 小和尚念经,下一句是什么? 1 0
49 | 男票,是什么意思? 男票、女票什么意思? 1 0
50 | 怎样摆脱手机依赖症 如何摆脱手机依赖症 1 0
51 | 世界上最大的岛屿是 世界最大的岛屿? 1 0
52 | 过年手机一般会降价么? 过年手机会降价吗 1 0
53 | 您好,您是本人操作的吗 请您让本人操作 0 0
54 | 想找个情侣头像 谁帮我找个情侣头像吖 1 0
55 | 您好您的情况小二已经帮您反馈您后续关注一下您的手机和邮箱的信息. 您好.您的情况已经帮您反馈您可以后续关注一下您的手机和邮箱信息. 1 0
56 | 西游记是什么小说 西游记是什么体的小说 1 0
57 | 求!这张图片的高清大图! 求这张图片的高清大图。 1 0
58 | 吃榴莲不能吃什么 榴莲吃了不能吃什么 1 0
59 | 苹果手机铃声叫什么啊 苹果手机是什么铃声 1 0
60 | 眼皮老是跳怎么回事 左眼皮老是跳是怎么回事 1 0
61 | 灵魂是什么? 灵魂是什么?由来是什么? 1 0
62 | 请问QQ游戏的欢乐斗地主比斗地主好玩很多吗? 欢乐斗地主里面的斗牛游戏好玩吗?可以下载吗 0 0
63 | 薏米红豆水一天喝多少 红豆薏米一天喝多少 1 0
64 | 都在说大老虎什么意思 老虎油,什么意思? 0 0
65 | 天津艺术职业学院有普通类专业 天津城市职业学院电脑艺术设计专业要考专业吗? 0 0
66 | 传说中的黑洞是怎样形成的? 斑竹是怎么形成的? 0 0
67 | 什么意思?翻译一下。 翻译一下什么意思? 1 0
68 | 想给我儿子起个名字: 我想给儿子起个名字 1 0
69 | 数学几何证明题 几何数学证明题 1 0
70 | 疯狂猜成语的答案是什么? 疯狂猜成语关于嘴的成语有哪些 0 0
71 | 什么办法才能让鼻梁长高 有什么让鼻梁变高的方法 1 0
72 | 室内设计和计算机信息管理哪个专业好 计算机信息管理和电子商务哪个专业更好 0 0
73 | 暗黑魔法师崛起大理石手 暗黑魔法师:崛起缺少文件 0 0
74 | 为什么女人喜欢男人吃下面 为什么男人喜欢亲女人下面? 0 0
75 | 蓝颜红颜是什么 红颜和蓝颜分别指什么? 1 0
76 | 如何坚持马克思主义社会科学方法论的指导地位 如何理解马克思主义社会科学方法论的革命性变革 0 0
77 | 魏晨你喜欢吗 那么爱你为什么的吉他谱简单吗 0 0
78 | 璀璨人生全集在哪里可以看? 哪里有璀璨人生的全集? 1 0
79 | 这是什么动漫?叫什么? 这是什么动漫叫什么名字? 0 0
80 | 我什么时候放假? 你什么时候放假? 0 0
81 | 武林外传好看吗 谁有武林外传体验区激活码啊 0 0
82 | wd做个假体丰胸手术多少钱?最近有人做了吗 怎样才能有效丰胸?假如做隆胸手术得多少钱? 0 0
83 | 这个头像的大图 求这个头像大图! 1 0
84 | 十全十美是什么动物? 十全十美的动物是什么 1 0
85 | 《步步惊情》什么时候上映 步步惊情什么时候上映? 1 0
86 | 什么是婚育证明怎么写 村里开一胎生育证明怎么写 0 0
87 | 和您购买这个手机号的时间,麻烦您了。 手机号的购买的时间麻烦您您提供一下 1 0
88 | 双子座与狮子座配吗 双子座和狮子座和吗 1 0
89 | 该不该请领导吃饭 怎么暗示请领导吃饭 0 0
90 | 请问这是什么字体? 请问大家这是什么字体? 1 0
91 | 为什么天会下雨了 过年了为什么反而不放假了 0 0
92 | 为什么奥比岛打不开? 为什么奥比岛打不开 1 0
93 | 梦见自己生了个男孩子。 我梦见自己生了男孩子 1 0
94 | 正确的反义词是什么? 正确的反义词是什么 1 0
95 | 珠海有什么好玩的地方 珠海有什么好玩的地方? 1 0
96 | 孕妇可以吃黄瓜吗加了醋的 孕妇能吃黄瓜吗 1 0
97 | 儿童音乐乐园雅马哈音乐中心怎么样 儿童钢琴专业课程雅马哈音乐中心怎么样 0 0
98 | 淘宝换货怎么换? 淘宝怎么换货 1 0
99 | 想学摄影应该买什么书比较好? 摄影技术去哪里学比较好。 0 0
100 | 传奇登陆器怎么下载 怎么下载传奇登陆器 1 0
101 | 男的做什么工作挣钱 做什么工作挣钱? 1 0
102 |
--------------------------------------------------------------------------------
/tutorials/data_map/sample_stat_summary.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import sys
4 | import json
5 | import numpy as np
6 |
7 | file_name = "output_data.json"
8 | output_file_names = ["correct_times", "correct_ratio", "avg_probs", "label_var",
9 | "max_label_probs", "min_label_probs", "forgetting_times", "learnt_times", "first_forget", "first_learn", "pred_label", "pred_dist"]
10 | num_samples = 100
11 |
12 | _input_path = "./outputs/"
13 | _output = open(_input_path + file_name + ".result", "w")
14 |
15 | def list_concat(score_dict, input_file, input_path = "./data/", sample_size=1, pred_idx=-2, label_idx=-1, score_idx=-3, get_max_probs=False):
16 | """
17 | add info of each epoch (or given steps) into a dict of lists
18 | """
19 | _input = open(input_path + input_file, "r")
20 | for i, line in enumerate(_input):
21 | info = json.loads(line.strip().replace("\'", "\""))
22 | sid = int(info["id"])
23 |         label = int(info["label"])
24 | if "noisy_label" in info:
25 | s_label = int(info["noisy_label"])
26 | else:
27 | s_label = 0
28 | # score = float(info["probs"])
29 | label_probs = float(info["label_probs"]) # the score under GL
30 | pred_correctness = info["correct"]
31 | if get_max_probs:
32 |             all_probs = [float(j) for j in info["probs"]]
33 | max_probs = np.max(all_probs)
34 | else:
35 | max_probs = 1.0
36 |
37 | score_info = [] # list of scores under different class
38 | for score in info["probs"]: # the number of classes here
39 | score_float = float(score)
40 | score_info.append(score_float)
41 |
42 | if not score_dict["id"][sid]:
43 | score_dict["id"][sid] = sid
44 | score_dict["label"][sid] = label
45 | score_dict["s_label"][sid] = s_label
46 | score_dict["label_probs"][sid].append(label_probs)
47 | score_dict["max_probs"][sid].append(max_probs)
48 | score_dict["pred_info"][sid].append(pred_correctness)
49 | score_dict["pred_label"][sid].append(str(np.argmax(score_info)))
50 |
51 | # add forget info
52 | list_length = len(score_dict["pred_info"][sid])
53 | if list_length > 1:
54 | if score_dict["pred_info"][sid][list_length - 1] == score_dict["pred_info"][sid][list_length - 2]:
55 | score_dict["forget_info"][sid].append("None")
56 | elif score_dict["pred_info"][sid][list_length - 1] == "true":
57 | score_dict["forget_info"][sid].append("Learn")
58 | else:
59 | score_dict["forget_info"][sid].append("Forget")
60 | else:
61 | score_dict["forget_info"][sid].append("None")
62 | #if sid == 1:
63 | # print(score_dict["forget_info"][sid])
64 |
65 | # if i >= sample_size:
66 | # break
67 |
68 | _input.close()
69 |
70 | def check_correct_ratio(correct_lists):
71 | """
72 | ratio that a model predict classes correctly in different epochs
73 | """
74 | if len(correct_lists) == 0 or len(correct_lists[0]) == 0:
75 | return [0], [0]
76 | ratio_list = []
77 | pos_list = []
78 | for c_list in correct_lists:
79 | pos_cnt = 0
80 | for info in c_list:
81 | if info == "true":
82 | pos_cnt += 1
83 | ratio_list.append(float(pos_cnt)/len(c_list) if len(c_list)!=0 else 0)
84 | pos_list.append(pos_cnt)
85 | return pos_list, ratio_list
86 |
87 | def check_forget_time(forget_lists):
88 | if len(forget_lists) == 0 or len(forget_lists[0]) == 0:
89 | return [0], [0], 0, 0
90 | forgetting_list = []
91 | learnt_list = []
92 | first_forgetting_time = []
93 | first_learnt_time = []
94 | for f_list in forget_lists:
95 | forgetting_cnt = 0
96 | learnt_cnt = 0
97 | first_f_time = 0
98 | first_l_time = 0
99 | for i, info in enumerate(f_list):
100 | if info == "Forget":
101 | forgetting_cnt += 1
102 | if first_f_time == 0:
103 | first_f_time = i
104 | elif info == "Learn":
105 | learnt_cnt += 1
106 | if first_l_time == 0:
107 | first_l_time = i
108 | forgetting_list.append(forgetting_cnt)
109 | learnt_list.append(learnt_cnt)
110 | first_forgetting_time.append(first_f_time)
111 | first_learnt_time.append(first_l_time)
112 |
113 | return forgetting_list, learnt_list, first_forgetting_time, first_learnt_time
114 |
115 | def check_pred_distribution(pred_lists):
116 | pred_list = []
117 | for scores in pred_lists:
118 | score_dist_dict = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0}
119 | for score in scores:
120 | score_dist_dict[score] += 1
121 | pred_list.append(score_dist_dict)
122 | return pred_list
123 |
124 |
125 | info_dict = {"id": [[] for i in range(num_samples)], "label": [[] for i in range(num_samples)],
126 | "s_label": [[] for i in range(num_samples)], "label_probs": [[] for i in range(num_samples)],
127 | "max_probs": [[] for i in range(num_samples)], "pred_info": [[] for i in range(num_samples)],
128 | "forget_info": [[] for i in range(num_samples)], "pred_label": [[] for i in range(num_samples)]}
129 | list_concat(info_dict, file_name, _input_path, sample_size=num_samples)
130 |
131 | print(len(info_dict["label_probs"]), len(info_dict["label_probs"][0]))
132 |
133 | info_dict["correct_times"], info_dict["correct_ratio"] = check_correct_ratio(info_dict["pred_info"])
134 | info_dict["label_var"] = np.var(info_dict["label_probs"], axis=1)
135 | info_dict["max_var"] = np.var(info_dict["max_probs"], axis=1)
136 | info_dict["avg_probs"] = np.mean(info_dict["label_probs"], axis=1)
137 | info_dict["max_label_probs"] = np.max(info_dict["label_probs"], axis=1)
138 | info_dict["min_label_probs"] = np.min(info_dict["label_probs"], axis=1)
139 | info_dict["forgetting_times"], info_dict["learnt_times"], info_dict["first_forget"], info_dict["first_learn"] = check_forget_time(info_dict["forget_info"])
140 | info_dict["pred_dist"] = check_pred_distribution(info_dict["pred_label"])
141 | output_file_names = ["id", "label", "s_label"] + output_file_names
142 |
143 | _output.write("\t".join(output_file_names) + "\n")
144 | for i in range(num_samples):
145 | info_list = []
146 | for name in output_file_names:
147 | info_list.append(str(info_dict[name][i]))
148 | _output.write("\t".join(info_list) + "\n")
149 |
150 | _output.close()
--------------------------------------------------------------------------------
/tutorials/dirty_data_identification/README.md:
--------------------------------------------------------------------------------
1 | # Identifying dirty data in the training set
2 |
3 | ### Method
4 | The annotation quality of the training data has a large impact on model performance, but constrained by annotator skill and task difficulty, a training set always contains a certain proportion of mislabeled examples (called **dirty data**). When the labeled dataset is large, checking the annotations becomes difficult.
5 |
6 |
7 | 
8 | Figure 1: Workflow of the dirty-data identification and label-correction strategy
9 |
10 |
11 | TrustAI provides a "dirty-data identification -> correction" scheme, as shown in Figure 1. First, candidate dirty data (the data with the largest influence on model training) is identified with an instance-level interpretation method. Then the candidates are relabeled, and retraining the model on the corrected data significantly improves its performance.
12 |
13 | Note: you can visit the [AI Studio example](https://aistudio.baidu.com/aistudio/projectdetail/4434058) to try this case quickly.
14 |
15 | ## Experiment steps
16 | We use a simulated experiment on the LCQMC similarity dataset to describe the steps and the effect of the scheme.
17 |
18 |
19 | **Step 1**: Randomly sample 5,000 examples from the LCQMC training set as the new training set. Fine-tune ERNIE-3.0-base-zh on the new training set `train_5000.tsv` to obtain the baseline model:
20 |
21 | ```shell
22 | # download the data
23 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/dirty_data.tar && tar xf dirty_data.tar && rm dirty_data.tar
24 | # train the baseline model
25 | python -u train.py --dataset_dir ./data --train_file train_5000.tsv --dev_file dev.tsv --test_files test.tsv --num_classes 2 --save_dir ./checkpoint
26 | ```
27 | The trained baseline model is saved under `checkpoint`.
28 |
29 |
30 | **Step 2**: Identify the dirty data in the training set.
31 | Selection method: using TrustAI's instance-level interpretation method `RepresenterPointModel`, compute for every training example an influence score on the model's loss; this score generally indicates how likely the example is to be dirty, and we use it to identify dirty data.
32 |
33 | ```shell
34 | # identify candidate dirty data in the training set
35 | python -u find_dirty_data.py --dataset_dir ./data --train_file train_5000.tsv --num_classes 2 --rest_path ./data/rest_train.tsv --init_from_ckpt ./checkpoint/model_state.pdparams --dirty_path ./data/dirty_train.tsv --dirty_num 500
36 | # dirty_num is the number of candidate dirty examples to select
37 | # dirty_path is where the candidate dirty examples are stored
38 | ```
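
`find_dirty_data.py` implements this; conceptually, once an influence score has been computed for every training example with the representer-point method, the selection boils down to a top-k split. A minimal sketch of that split (the scoring call itself is left abstract here):

```python
def split_dirty(lines, scores, dirty_num, dirty_path, rest_path):
    """Keep the `dirty_num` highest-influence training lines as dirty-label candidates."""
    order = sorted(range(len(lines)), key=lambda i: scores[i], reverse=True)
    dirty_ids = set(order[:dirty_num])
    with open(dirty_path, "w") as dirty_f, open(rest_path, "w") as rest_f:
        for i, line in enumerate(lines):
            (dirty_f if i in dirty_ids else rest_f).write(line)
```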
39 |
40 |
41 | **Step 3**: Relabel the candidate dirty data (10% of the whole training set in our experiment); the corrected data is saved in `correction_data.tsv` (**38.4%** of the candidates needed correction, versus **5.0%** for randomly selected data).
42 |
43 |
44 | **Step 4**: Retrain the model on the corrected training set `train_5000_correction.tsv` and evaluate it.
45 | ```shell
46 | # download the data (contains the train_5000_correction.tsv file)
47 | python -u train.py --dataset_dir ./data --train_file train_5000_correction.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./new_checkpoint
48 | ```
49 |
50 | As the table below shows, after correcting the candidate dirty data (10% of the original training set), the model improves by 2.13% on the LCQMC test set and by 4.01% on DuQM.
51 |
52 |
53 | | Model | LCQMCdev | LCQMCtest | DuQM |
54 | | :-------: | :-----: | :-----: |:-----: |
55 | | Baseline | 86.42% | 84.87% | 69.51% |
56 | | Label correction | 87.76% | 86.62% | 73.18% |
57 |
58 | Note: the results above are averages over 10 runs.
59 |
--------------------------------------------------------------------------------
/tutorials/dirty_data_identification/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 |
17 | import paddle
18 | import paddle.nn.functional as F
19 | from paddlenlp.utils.log import logger
20 |
21 |
22 | @paddle.no_grad()
23 | def evaluate(model, criterion, metric, data_loader, name=''):
24 | """
25 | Given a dataset, it evaluates model and computes the metric.
26 | Args:
27 | model(obj:`paddle.nn.Layer`): A model to classify texts.
28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss.
29 | metric(obj:`paddle.metric.Metric`): The evaluation metric.
30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
31 | """
32 |
33 | model.eval()
34 | metric.reset()
35 | losses = []
36 | for batch in data_loader:
37 | input_ids, token_type_ids, labels = batch['input_ids'], batch[
38 | 'token_type_ids'], batch['labels']
39 | logits = model(input_ids, token_type_ids)
40 | loss = criterion(logits, labels)
41 | losses.append(loss.numpy())
42 | correct = metric.compute(logits, labels)
43 | metric.update(correct)
44 |
45 | acc = metric.accumulate()
46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc))
47 | model.train()
48 | metric.reset()
49 |
50 | return acc
51 |
52 |
53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False):
54 | """
55 | Builds model inputs from a sequence for sequence classification tasks
56 | by concatenating and adding special tokens.
57 |
58 | Args:
59 |         example(obj:`dict`): input data, containing the text and, if present, the label.
60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods.
62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
63 | Sequences longer than this will be truncated, sequences shorter will be padded.
64 | label_nums(obj:`int`): The number of the labels.
65 | Returns:
66 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels.
67 | """
68 | if 'text_b' not in example:
69 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length)
70 | else:
71 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length)
72 |
73 | if not is_test:
74 | result["labels"] = np.array([example['label']], dtype='int64')
75 | return result
76 |
--------------------------------------------------------------------------------
/tutorials/enhanced_by_rationale/README.md:
--------------------------------------------------------------------------------
1 | # Rationale-Guided Model Enhancement
2 | ## Method
3 |
4 | Evaluating the evidence that several models rely on for their predictions shows that the rationales produced by deep learning models are only weakly plausible. To improve rationale plausibility, TrustAI provides a rationale-guided model enhancement scheme ([Jayaram et al. 2021](https://aclanthology.org/2021.emnlp-main.450/)): annotate a small amount of rationale data and jointly learn the original task and a rationale-learning task, so that the rationale objective guides the model to base its predictions on plausible evidence and thereby improves interpretability. A schematic sketch of such a joint objective is given below.
5 |
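To make the joint objective concrete, here is a minimal sketch of one way it can be written in Paddle. It assumes the model exposes per-token importance scores (taken here to be attention weights over the input) and that the guidance term simply pushes importance mass onto the annotated rationale tokens. The function and argument names (`joint_loss`, `lam`) are illustrative; the exact MAW loss used by `train.py` may differ.

```python
import paddle
import paddle.nn.functional as F


def joint_loss(logits, labels, token_scores, rationales, attention_mask, lam=1.0):
    """Sketch of a joint objective: task loss + rationale-guidance loss.

    token_scores:  per-token importance produced by the model (e.g. attention
                   weights), shape [batch_size, seq_len].
    rationales:    0/1 human rationale labels aligned with the tokens.
    lam:           weight of the guidance term (hypothetical name).
    """
    # Standard classification objective on the original task.
    task_loss = F.cross_entropy(logits, labels)

    # Normalize token importance over real (non-padding) tokens.
    mask = attention_mask.astype('float32')
    scores = token_scores * mask
    scores = scores / (scores.sum(axis=-1, keepdim=True) + 1e-12)

    # Encourage the importance mass to concentrate on annotated rationale
    # tokens; this is one simple surrogate for the MAW-style guidance and may
    # differ from the exact loss implemented in train.py.
    rationale_mass = (scores * rationales.astype('float32')).sum(axis=-1)
    guide_loss = -paddle.log(rationale_mass + 1e-12).mean()

    return task_loss + lam * guide_loss
```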
6 |
7 | ## Experiment Steps
8 |
9 | The baseline model is obtained by fine-tuning ERNIE-2.0-EN-Base on the English sentiment analysis dataset SST. We then select 1,000 training examples, manually annotate their rationales, and perform rationale learning on these 1,000 examples. Evaluation is done on 500 validation examples with annotated rationales. Besides prediction accuracy, the metrics include interpretability measures: rationale plausibility, sufficiency, and completeness.
10 |
11 |
12 | The data used in our experiments can be downloaded with the following commands:
13 | ```shell
14 | # Download the sample data. Each file contains only two examples; developers can annotate their own rationale data following this format.
15 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/rationale_data.tar && tar xf rationale_data.tar && rm rationale_data.tar
16 | ```
17 |
18 | After downloading the data, train the model with `train.py`. The training procedure introduces the rationale-learning objective, which guides the model to rely on plausible evidence for its predictions.
19 | ```shell
20 | python -u train.py --dataset_dir ./data --train_file train.tsv --dev_file dev.tsv --num_classes 2 --save_dir ./maw --use_maw
21 | # --use_maw enables the rationale-guided (MAW) objective
22 | ```
23 |
24 | The results are shown in the table below. With rationale guidance, prediction quality improves slightly (accuracy +0.5%), while interpretability improves markedly: plausibility +5.0%, sufficiency down by 0.185 (lower is better), and completeness +0.044.
25 |
26 | | Model | Accuracy | Plausibility | Sufficiency | Completeness |
27 | | :-------: | :-----: | :-----: | :-----: | :-----: |
28 | | base | 93.5% | 26.1% | 0.367 | 0.118 |
29 | | base + maw loss | 94.0% | 31.1% | 0.182 | 0.162 |
30 |
31 | Note: all results are averaged over 3 runs.
32 |
--------------------------------------------------------------------------------
/tutorials/enhanced_by_rationale/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 | import itertools
17 |
18 | import paddle
19 | import paddle.nn.functional as F
20 | from paddlenlp.utils.log import logger
21 |
22 |
23 | @paddle.no_grad()
24 | def evaluate(model, criterion, metric, data_loader, name=''):
25 | """
26 | Given a dataset, it evaluates model and computes the metric.
27 | Args:
28 | model(obj:`paddle.nn.Layer`): A model to classify texts.
29 | criterion(obj:`paddle.nn.Layer`): It can compute the loss.
30 | metric(obj:`paddle.metric.Metric`): The evaluation metric.
31 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
32 | """
33 |
34 | model.eval()
35 | metric.reset()
36 | losses = []
37 | for batch in data_loader:
38 | input_ids, token_type_ids, labels, _, _ = batch
39 | logits = model(input_ids, token_type_ids)
40 | loss = criterion(logits, labels)
41 | losses.append(loss.numpy())
42 | correct = metric.compute(logits, labels)
43 | metric.update(correct)
44 |
45 | acc = metric.accumulate()
46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc))
47 | model.train()
48 | metric.reset()
49 |
50 | return acc
51 |
52 |
53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False):
54 | """
55 | Builds model inputs from a sequence for sequence classification tasks
56 | by concatenating and adding special tokens.
57 |
58 | Args:
59 |         example(obj:`dict`): Input data. Training examples contain `tokens`, `rationales` and `label`; test examples contain `text_a`.
60 |         tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
61 |             which contains most of the methods. Users should refer to the superclass for more information regarding methods.
62 |         max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
63 |             Sequences longer than this will be truncated, sequences shorter will be padded.
64 |         is_test(obj:`bool`): Whether the example comes from the test set and therefore has no label or rationales.
65 |     Returns:
66 |         result(obj:`tuple`): `(input_ids, token_type_ids, label, rationales, attention_mask)` for training examples, or `(input_ids, token_type_ids)` when `is_test` is True.
67 | """
68 | if is_test:
69 | result = tokenizer(text=example['text_a'], max_seq_len=max_seq_length, return_attention_mask=True)
70 | return result['input_ids'], result['token_type_ids']
71 | else:
72 | tokens = example['tokens']
73 | rationales = example['rationales']
74 | tokens = [tokenizer._tokenize(token) for token in tokens]
75 | assert len(tokens) == len(rationales)
76 | rationales = list(
77 | itertools.chain(*[[rationale] * len(sub_tokens) for sub_tokens, rationale in zip(tokens, rationales)]))
78 | tokens = list(itertools.chain(*tokens))
79 | result = tokenizer(text=tokens,
80 | max_seq_len=max_seq_length,
81 | is_split_into_words=True,
82 | return_attention_mask=True)
83 | input_ids = result["input_ids"]
84 | token_type_ids = result["token_type_ids"]
85 | attention_mask = result["attention_mask"]
86 | seq_len = len(input_ids)
87 | rationales = [0] + rationales[:seq_len - 2] + [0]
88 | assert len(rationales) == seq_len
89 | label = np.array([example['label']], dtype="int64")
90 | return input_ids, token_type_ids, label, rationales, attention_mask
91 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/args.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 |
17 |
18 | def parse_args():
19 | parser = argparse.ArgumentParser(description=__doc__)
20 | parser.add_argument("--model_name", type=str, required=True, help="Name of pre-trained model.")
21 | parser.add_argument("--output_dir",
22 | type=str,
23 | required=True,
24 | help="The output directory where the model predictions and checkpoints will be written.")
25 | parser.add_argument(
26 | "--data_dir",
27 | type=str,
28 | required=True,
29 | help="The data directory should include `train` and `dev` set to train model and `test` set to test model.")
30 | parser.add_argument("--max_seq_length",
31 | default=512,
32 | type=int,
33 | help="The maximum total input sequence length after tokenization.")
34 | parser.add_argument("--batch_size", default=24, type=int, help="Batch size per GPU/CPU for training.")
35 | parser.add_argument("--learning_rate", default=7e-5, type=float, help="The initial learning rate for Adam.")
36 | parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
37 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
38 | parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
39 | parser.add_argument("--num_train_epochs", default=20, type=int, help="Total number of train epochs to perform.")
40 | parser.add_argument("--max_steps",
41 | default=-1,
42 | type=int,
43 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
44 | parser.add_argument("--warmup_proportion",
45 | default=0.0,
46 | type=float,
47 | help="Proportion of training steps to perform linear learning rate warmup for.")
48 | parser.add_argument("--logging_steps", type=int, default=10, help="Log every X updates steps.")
49 | parser.add_argument("--save_steps", type=int, default=200, help="Save checkpoint every X updates steps.")
50 |     parser.add_argument("--load_model_path", type=str, default=None, help="Path of the checkpoint to load model parameters from.")
51 |     parser.add_argument("--get_k_sentences", type=int, default=0, help="Number of top-k sentences to select (0 to disable).")
52 |     parser.add_argument("--set_k_sentences_ground_true", type=int, default=0, help="Mark the top k selected sentences as ground truth.")
53 |     parser.add_argument("--early_stop_nums", type=int, default=5, help="Number of evaluation rounds without improvement before early stopping.")
54 | parser.add_argument("--one_alpha", type=float, default=0.4, help="probability threshold for selecting sentences")
55 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
56 | parser.add_argument('--device',
57 | choices=['cpu', 'gpu'],
58 | default="gpu",
59 | help="Select which device to train model, defaults to gpu.")
60 | parser.add_argument("--doc_stride",
61 | type=int,
62 | default=128,
63 | help="When splitting up a long document into chunks, how much stride to take between chunks.")
64 | parser.add_argument(
65 | "--n_best_size",
66 | type=int,
67 | default=20,
68 | help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
69 | parser.add_argument("--max_query_length", type=int, default=64, help="Max query length.")
70 | parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.")
71 | parser.add_argument("--hidden_size", type=int, default=768, help="hidden size")
72 | parser.add_argument("--verbose", action='store_true', help="Whether to output verbose log.")
73 | parser.add_argument("--do_train", action='store_true', help="Whether to train the model.")
74 | parser.add_argument("--do_predict", action='store_true', help="Whether to predict.")
75 | parser.add_argument("--use_loose_metric", action='store_true', help="whether to use loose metric to choose model.")
76 | parser.add_argument("--use_similarity", action='store_true', help="whether to use similarity to choose sentence.")
77 | parser.add_argument("--early_stop", action='store_true', help="whether to use early stop.")
78 | args = parser.parse_args()
79 | return args
80 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Download dataset and model parameters
3 | set -e
4 |
5 | if [ ! -d "./data" ]; then
6 | mkdir "./data"
7 | fi
8 |
9 | echo "Download DuReader-robust dataset"
10 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/dureader_robust/data/dureader_robust-data.tar.gz
11 | tar -zxvf dureader_robust-data.tar.gz
12 | mv dureader_robust-data data/robust
13 | rm dureader_robust-data.tar.gz
14 |
15 | echo "Download DuReader-checklist dataset"
16 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/lic2021/dureader_checklist.dataset.tar.gz
17 | tar -zxvf dureader_checklist.dataset.tar.gz
18 | mv dataset data/checklist
19 | rm dureader_checklist.dataset.tar.gz
20 | mkdir ./data/checklist_wo_no_answer
21 | python ./utils/checklist_process.py --input_data_dir ./data/checklist --output_data_dir ./data/checklist_wo_no_answer
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/predictor/model.py:
--------------------------------------------------------------------------------
1 | import paddle
2 | from paddlenlp.transformers import AutoModelForQuestionAnswering
3 |
4 |
5 | class Predictor(paddle.nn.Layer):
6 |
7 | def __init__(self, args):
8 | super(Predictor, self).__init__()
9 | self.model = AutoModelForQuestionAnswering.from_pretrained(args.model_name)
10 |
11 | def forward(self, x, mode="train"):
12 | data = {
13 | "input_ids": x[0],
14 | "token_type_ids": x[1],
15 | }
16 | if mode == "train":
17 | data["start_positions"] = x[2]
18 | data["end_positions"] = x[3]
19 |
20 | logits = self.model(input_ids=data["input_ids"], token_type_ids=data["token_type_ids"])
21 |
22 | if mode == "dev" or mode == "test":
23 | return logits
24 |
25 | # Compute loss
26 | start_logits, end_logits = logits
27 | start_position = paddle.unsqueeze(data["start_positions"], axis=-1)
28 | end_position = paddle.unsqueeze(data["end_positions"], axis=-1)
29 | start_loss = paddle.nn.functional.cross_entropy(input=start_logits, label=start_position)
30 | end_loss = paddle.nn.functional.cross_entropy(input=end_logits, label=end_position)
31 | loss = (start_loss + end_loss) / 2
32 | return loss, logits
33 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | paddlenlp
3 | paddlepaddle-gpu >= 2.0.0
4 | scikit-learn
5 | tqdm
6 | matplotlib
7 | IPython
8 | pre-commit
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/run_predict.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import random
16 |
17 | import numpy as np
18 | import paddle
19 |
20 | from args import parse_args
21 | from predictor import model, model_manager
22 |
23 |
24 | def set_seed(args):
25 | random.seed(args.seed)
26 | np.random.seed(args.seed)
27 | paddle.seed(args.seed)
28 |
29 |
30 | def run(args):
31 | paddle.set_device(args.device)
32 | if paddle.distributed.get_world_size() > 1:
33 | paddle.distributed.init_parallel_env()
34 | rank = paddle.distributed.get_rank()
35 |
36 | set_seed(args)
37 |
38 | predictor = model.Predictor(args)
39 | if paddle.distributed.get_world_size() > 1:
40 | predictor = paddle.DataParallel(predictor)
41 |
42 | # Prepare model manager
43 | manager = model_manager.ModelManager(args, predictor)
44 |
45 | if args.do_train:
46 | manager.train(rank)
47 | if args.do_predict and rank == 0:
48 | manager.test()
49 |
50 |
51 | if __name__ == "__main__":
52 | args = parse_args()
53 | run(args)
54 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/run_select.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import random
16 |
17 | import numpy as np
18 | import paddle
19 |
20 | from args import parse_args
21 | from selector import model, model_manager
22 |
23 |
24 | def set_seed(args):
25 | random.seed(args.seed)
26 | np.random.seed(args.seed)
27 | paddle.seed(args.seed)
28 |
29 |
30 | def run(args):
31 | set_seed(args)
32 |
33 | # Prepare device and model
34 | paddle.set_device(args.device)
35 | if paddle.distributed.get_world_size() > 1:
36 | paddle.distributed.init_parallel_env()
37 | rank = paddle.distributed.get_rank()
38 |
39 | selector = model.Selector(args)
40 | if paddle.distributed.get_world_size() > 1:
41 | selector = paddle.DataParallel(selector)
42 |
43 | # Prepare model manager
44 | manager = model_manager.ModelManager(args, selector)
45 |
46 | if args.do_train:
47 | manager.train(rank)
48 |
49 | if args.do_predict and rank == 0:
50 | manager.test()
51 |
52 |
53 | if __name__ == "__main__":
54 | args = parse_args()
55 | run(args)
56 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | data_dir="./data/robust"
3 | output_dir="./output"
4 | load_selector_model_dir="./"
5 | load_predictor_model_dir="./"
6 | split="dev"
7 |
8 | ARGS=`getopt -a -o :d:o:s:p:S:h -l data_dir:,output_dir:,selector_model_dir:,predictor_model_dir:,split:,help -- "$@"`
9 | eval set -- "${ARGS}"
10 | while true
11 | do
12 | case "$1" in
13 | -d|--data_dir)
14 | data_dir="$2"
15 | shift
16 | ;;
17 | -o|--output_dir)
18 | output_dir="$2"
19 | shift
20 | ;;
21 | -s|--selector_model_dir)
22 | load_selector_model_dir="$2"
23 | shift
24 | ;;
25 | -p|--predictor_model_dir)
26 | load_predictor_model_dir="$2"
27 | shift
28 | ;;
29 | -S|--split)
30 | split="$2"
31 | shift
32 | ;;
33 | -h|--help)
34 | echo "help"
35 | ;;
36 | --)
37 | shift
38 | break
39 | ;;
40 | esac
41 | shift
42 | done
43 |
44 | if [ ! -d "./cache" ]; then
45 | mkdir "./cache"
46 | fi
47 | if [ ! -d "${output_dir}" ]; then
48 | mkdir "${output_dir}"
49 | fi
50 | if [ ! -d "${output_dir}/selected-test-data" ]; then
51 | mkdir "${output_dir}/selected-test-data"
52 | fi
53 | if [ $split = "dev" ]; then
54 | if [ ! -d "${output_dir}/tmp" ]; then
55 | mkdir "${output_dir}/tmp"
56 | fi
57 | cp "${data_dir}/dev.json" "${output_dir}/tmp/test.json"
58 | echo "Data path: ${data_dir}/dev.json"
59 | data_dir="${output_dir}/tmp"
60 | else
61 | echo "Data path: ${data_dir}/test.json"
62 | fi
63 |
64 | echo "Output path: ${output_dir}/selected-test-data"
65 |
66 | python -u run_select.py \
67 | --model_name hfl/roberta-wwm-ext \
68 | --do_predict \
69 | --batch_size 24 \
70 | --data_dir $data_dir \
71 | --load_model_path "${load_selector_model_dir}/model_state.pdparams" \
72 | --one_alpha 0.1 \
73 | --output_dir "${output_dir}/selected-test-data/" \
74 | --device gpu
75 |
76 | mv "${output_dir}/selected-test-data/test_prediction.json" "${output_dir}/selected-test-data/test.json"
77 |
78 | python -u run_predict.py \
79 | --model_name hfl/roberta-wwm-ext \
80 | --do_predict \
81 | --batch_size 24 \
82 | --max_seq_length 384 \
83 | --data_dir "${output_dir}/selected-test-data/" \
84 | --load_model_path "${load_predictor_model_dir}/model_state.pdparams" \
85 | --output_dir "${output_dir}/predict-result/" \
86 | --device gpu
87 |
88 |
89 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/train.sh:
--------------------------------------------------------------------------------
1 |
2 | data_dir="./data/robust"
3 | output_dir="./output"
4 |
5 | # Read Parameter
6 | # -d data dir path
7 | # -o output_dir
8 | while getopts ":d:o:" optname
9 | do
10 | case "$optname" in
11 | "d")
12 | data_dir=$OPTARG
13 | ;;
14 | "o")
15 | output_dir=$OPTARG
16 | ;;
17 | ":")
18 | echo "No argument value for option $OPTARG"
19 | ;;
20 | "?")
21 | echo "Unknown option $OPTARG"
22 | ;;
23 | *)
24 | echo "Unknown error while processing options"
25 | ;;
26 | esac
27 | done
28 |
29 | echo "Data path: ${data_dir}"
30 | echo "Selector Output path: ${output_dir}/selector/"
31 |
32 | # prepare dir
33 |
34 | # clean cache
35 | if [ ! -d "./cache" ]; then
36 | mkdir "./cache"
37 | else
38 | rm -rf "./cache"
39 | mkdir "./cache"
40 | fi
41 | if [ ! -d "${output_dir}" ]; then
42 | mkdir "${output_dir}"
43 | fi
44 | if [ ! -d "${output_dir}/selector" ]; then
45 | mkdir "${output_dir}/selector"
46 | fi
47 | if [ ! -d "${output_dir}/selected-data" ]; then
48 | mkdir "${output_dir}/selected-data"
49 | fi
50 | if [ ! -d "${output_dir}/predictor" ]; then
51 | mkdir "${output_dir}/predictor"
52 | fi
53 | if [ ! -d "${output_dir}/tmp" ]; then
54 | mkdir "${output_dir}/tmp"
55 | fi
56 |
57 | echo "########## Selector Training #############"
58 |
59 | # train selector
60 | python -u ./run_select.py \
61 | --model_name hfl/roberta-wwm-ext \
62 | --max_seq_length 512 \
63 | --batch_size 24 \
64 | --learning_rate 8e-5 \
65 | --num_train_epochs 100 \
66 | --logging_steps 10 \
67 | --save_steps 200 \
68 | --warmup_proportion 0.1 \
69 | --weight_decay 0.01 \
70 | --output_dir "${output_dir}/selector/" \
71 | --data_dir ${data_dir} \
72 | --set_k_sentences_ground_true 0 \
73 | --early_stop_nums 5 \
74 | --one_alpha -1 \
75 | --do_train \
76 | --use_loose_metric \
77 | --early_stop \
78 | --device gpu
79 |
80 |
81 | echo "########## Dev Processing #############"
82 | cp "${data_dir}/dev.json" "${output_dir}/tmp/test.json"
83 |
84 | # predict selector
85 | python -u ./run_select.py \
86 | --model_name hfl/roberta-wwm-ext \
87 | --max_seq_length 512 \
88 | --batch_size 24 \
89 | --load_model_path "${output_dir}/selector/best_model/model_state.pdparams" \
90 | --data_dir "${output_dir}/tmp/" \
91 | --output_dir "${output_dir}/selector/" \
92 | --set_k_sentences_ground_true 0 \
93 | --one_alpha 0.1 \
94 | --do_predict \
95 | --use_loose_metric \
96 | --device gpu
97 |
98 | rm -rf "${output_dir}/tmp"
99 | # Postprocess selected data
100 | temp_dir="${data_dir}/*"
101 | cp -f ${temp_dir} "${output_dir}/selected-data"
102 | rm -f "${output_dir}/selected-data/dev.json"
103 | origin_dev_path="${output_dir}/selector/test_prediction.json"
104 | mv ${origin_dev_path} "${output_dir}/selected-data/dev.json"
105 |
106 |
114 |
115 | echo "########## Predictor Training #############"
116 |
117 | # Train predictor
118 | python -u run_predict.py \
119 | --model_name hfl/roberta-wwm-ext \
120 | --max_seq_length 384 \
121 | --batch_size 24 \
122 | --learning_rate 5e-5 \
123 | --num_train_epochs 8 \
124 | --logging_steps 10 \
125 | --save_steps 100 \
126 | --warmup_proportion 0.1 \
127 | --weight_decay 0.01 \
128 | --output_dir "$output_dir/predictor/" \
129 | --data_dir "$output_dir/selected-data/" \
130 | --do_train \
131 | --device gpu
132 |
133 | python -u run_predict.py \
134 | --model_name hfl/roberta-wwm-ext \
135 | --max_seq_length 384 \
136 | --batch_size 24 \
137 | --learning_rate 5e-5 \
138 | --num_train_epochs 8 \
139 | --logging_steps 10 \
140 | --save_steps 200 \
141 | --warmup_proportion 0.1 \
142 | --weight_decay 0.01 \
143 | --output_dir "$output_dir/predictor/" \
144 | --data_dir "$output_dir/selected-data/" \
145 | --do_predict \
146 | --device gpu
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/train_predictor.sh:
--------------------------------------------------------------------------------
1 | data_dir="./data/robust"
2 | output_dir="./output"
3 |
4 | # Read Parameter
5 | # -d data dir path
6 | # -o output_dir
7 | while getopts ":d:o:" optname
8 | do
9 | case "$optname" in
10 | "d")
11 | data_dir=$OPTARG
12 | ;;
13 | "o")
14 | output_dir=$OPTARG
15 | ;;
16 | ":")
17 | echo "No argument value for option $OPTARG"
18 | ;;
19 | "?")
20 | echo "Unknown option $OPTARG"
21 | ;;
22 | *)
23 | echo "Unknown error while processing options"
24 | ;;
25 | esac
26 | done
27 |
28 | echo "Data path: ${data_dir}"
29 |
30 | # prepare dir
31 |
32 | # clean cache
33 | if [ ! -d "./cache" ]; then
34 | mkdir "./cache"
35 | else
36 | rm -rf "./cache"
37 | mkdir "./cache"
38 | fi
39 | if [ ! -d "${output_dir}" ]; then
40 | mkdir "${output_dir}"
41 | fi
42 | if [ ! -d "${output_dir}/selector" ]; then
43 | mkdir "${output_dir}/selector"
44 | fi
45 | if [ ! -d "${output_dir}/selected-data" ]; then
46 | mkdir "${output_dir}/selected-data"
47 | fi
48 | if [ ! -d "${output_dir}/predictor" ]; then
49 | mkdir "${output_dir}/predictor"
50 | fi
51 | if [ ! -d "${output_dir}/tmp" ]; then
52 | mkdir "${output_dir}/tmp"
53 | fi
54 |
55 | echo "########## Predictor Training #############"
56 |
57 | # Train predictor
58 | python -u run_predict.py \
59 | --model_name hfl/roberta-wwm-ext \
60 | --max_seq_length 384 \
61 | --batch_size 24 \
62 | --learning_rate 5e-5 \
63 | --num_train_epochs 8 \
64 | --logging_steps 10 \
65 | --save_steps 100 \
66 | --warmup_proportion 0.1 \
67 | --weight_decay 0.01 \
68 | --output_dir "$output_dir/predictor/" \
69 | --data_dir "$output_dir/selected-data/" \
70 | --do_train \
71 | --device gpu
72 |
73 | python -u run_predict.py \
74 | --model_name hfl/roberta-wwm-ext \
75 | --max_seq_length 384 \
76 | --batch_size 24 \
77 | --learning_rate 5e-5 \
78 | --num_train_epochs 8 \
79 | --logging_steps 10 \
80 | --save_steps 200 \
81 | --warmup_proportion 0.1 \
82 | --weight_decay 0.01 \
83 | --output_dir "$output_dir/predictor/" \
84 | --data_dir "$output_dir/selected-data/" \
85 | --do_predict \
86 | --device gpu
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/train_select_data.sh:
--------------------------------------------------------------------------------
1 |
2 | data_dir="./data/robust"
3 | output_dir="./output"
4 |
5 | # Read Parameter
6 | # -d data dir path
7 | # -o output_dir
8 | while getopts ":d:o:" optname
9 | do
10 | case "$optname" in
11 | "d")
12 | data_dir=$OPTARG
13 | ;;
14 | "o")
15 | output_dir=$OPTARG
16 | ;;
17 | ":")
18 | echo "No argument value for option $OPTARG"
19 | ;;
20 | "?")
21 | echo "Unknown option $OPTARG"
22 | ;;
23 | *)
24 | echo "Unknown error while processing options"
25 | ;;
26 | esac
27 | done
28 |
29 | echo "Data path: ${data_dir}"
30 | echo "Selector Output path: ${output_dir}/selector/"
31 |
32 | # prepare dir
33 |
34 | # clean cache
35 | if [ ! -d "./cache" ]; then
36 | mkdir "./cache"
37 | else
38 | rm -rf "./cache"
39 | mkdir "./cache"
40 | fi
41 | if [ ! -d "${output_dir}" ]; then
42 | mkdir "${output_dir}"
43 | fi
44 | if [ ! -d "${output_dir}/selector" ]; then
45 | mkdir "${output_dir}/selector"
46 | fi
47 | if [ ! -d "${output_dir}/selected-data" ]; then
48 | mkdir "${output_dir}/selected-data"
49 | fi
50 | if [ ! -d "${output_dir}/predictor" ]; then
51 | mkdir "${output_dir}/predictor"
52 | fi
53 | if [ ! -d "${output_dir}/tmp" ]; then
54 | mkdir "${output_dir}/tmp"
55 | fi
56 |
57 |
58 | echo "########## Dev Processing #############"
59 | cp "${data_dir}/dev.json" "${output_dir}/tmp/test.json"
60 |
61 | # predict selector
62 | python -u ./run_select.py \
63 | --model_name hfl/roberta-wwm-ext \
64 | --max_seq_length 512 \
65 | --batch_size 24 \
66 | --load_model_path "${output_dir}/selector/best_model/model_state.pdparams" \
67 | --data_dir "${output_dir}/tmp/" \
68 | --output_dir "${output_dir}/selector/" \
69 | --set_k_sentences_ground_true 0 \
70 | --one_alpha 0.1 \
71 | --do_predict \
72 | --use_loose_metric \
73 | --device gpu
74 |
75 | rm -rf "${output_dir}/tmp"
76 | # Postprocess selected data
77 | temp_dir="${data_dir}/*"
78 | cp -f ${temp_dir} "${output_dir}/selected-data"
79 | rm -f "${output_dir}/selected-data/dev.json"
80 | origin_dev_path="${output_dir}/selector/test_prediction.json"
81 | mv ${origin_dev_path} "${output_dir}/selected-data/dev.json"
82 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/train_selector.sh:
--------------------------------------------------------------------------------
1 |
2 | data_dir="./data/robust"
3 | output_dir="./output"
4 |
5 | # Read Parameter
6 | # -d data dir path
7 | # -o output_dir
8 | while getopts ":d:o:" optname
9 | do
10 | case "$optname" in
11 | "d")
12 | data_dir=$OPTARG
13 | ;;
14 | "o")
15 | output_dir=$OPTARG
16 | ;;
17 | ":")
18 | echo "No argument value for option $OPTARG"
19 | ;;
20 | "?")
21 | echo "Unknown option $OPTARG"
22 | ;;
23 | *)
24 | echo "Unknown error while processing options"
25 | ;;
26 | esac
27 | done
28 |
29 | echo "Data path: ${data_dir}"
30 | echo "Selector Output path: ${output_dir}/selector/"
31 |
32 | # prepare dir
33 |
34 | # clean cache
35 | if [ ! -d "./cache" ]; then
36 | mkdir "./cache"
37 | else
38 | rm -rf "./cache"
39 | mkdir "./cache"
40 | fi
41 | if [ ! -d "${output_dir}" ]; then
42 | mkdir "${output_dir}"
43 | fi
44 | if [ ! -d "${output_dir}/selector" ]; then
45 | mkdir "${output_dir}/selector"
46 | fi
47 | if [ ! -d "${output_dir}/selected-data" ]; then
48 | mkdir "${output_dir}/selected-data"
49 | fi
50 | if [ ! -d "${output_dir}/predictor" ]; then
51 | mkdir "${output_dir}/predictor"
52 | fi
53 | if [ ! -d "${output_dir}/tmp" ]; then
54 | mkdir "${output_dir}/tmp"
55 | fi
56 |
57 | echo "########## Selector Training #############"
58 |
59 | # train selector
60 | python -u ./run_select.py \
61 | --model_name hfl/roberta-wwm-ext \
62 | --max_seq_length 512 \
63 | --batch_size 24 \
64 | --learning_rate 8e-5 \
65 | --num_train_epochs 100 \
66 | --logging_steps 10 \
67 | --save_steps 200 \
68 | --warmup_proportion 0.1 \
69 | --weight_decay 0.01 \
70 | --output_dir "${output_dir}/selector/" \
71 | --data_dir ${data_dir} \
72 | --set_k_sentences_ground_true 0 \
73 | --early_stop_nums 5 \
74 | --one_alpha -1 \
75 | --do_train \
76 | --use_loose_metric \
77 | --early_stop \
78 | --device gpu
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/utils/checklist_process.py:
--------------------------------------------------------------------------------
1 | import json
2 | import tools
3 | import argparse
4 | import os
5 |
6 |
7 | def process(input_data_path, output_data_path):
8 | with open(input_data_path, 'r', encoding='utf-8') as f:
9 | obj = json.load(f)
10 | for i, data in enumerate(obj['data']):
11 | x = []
12 | for j, paragraphs in enumerate(data["paragraphs"]):
13 | for k, ans in enumerate(paragraphs["qas"][0]["answers"]):
14 | answer = ans["text"]
15 | if answer != "" and len(tools.split_sentence(answer)) == 1:
16 | x.append(paragraphs)
17 | else:
18 | break
19 | obj["data"][i]["paragraphs"] = x
20 |
21 | with open(output_data_path, 'w+', encoding="utf8") as outfile:
22 | json.dump(obj, outfile, ensure_ascii=False)
23 |
24 |
25 | if __name__ == "__main__":
26 | parser = argparse.ArgumentParser(description=__doc__)
27 | parser.add_argument("--input_data_dir", type=str, required=True, help="checklist input data dir")
28 | parser.add_argument("--output_data_dir", type=str, required=True, help="checklist output data dir")
29 | args = parser.parse_args()
30 | process(os.path.join(args.input_data_dir, 'train.json'), os.path.join(args.output_data_dir, 'train.json'))
31 | process(os.path.join(args.input_data_dir, 'dev.json'), os.path.join(args.output_data_dir, 'dev.json'))
32 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/utils/dureader_robust.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
3 | # Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Lint as: python3
18 |
19 | import json
20 | import os
21 |
22 | import datasets
23 | from datasets.tasks import QuestionAnsweringExtractive
24 |
25 | logger = datasets.logging.get_logger(__name__)
26 |
27 | _DESCRIPTION = """\
28 | DureaderRobust is a chinese reading comprehension \
29 | dataset, designed to evaluate the MRC models from \
30 | three aspects: over-sensitivity, over-stability \
31 | and generalization.
32 | """
33 |
34 | _URL = "https://bj.bcebos.com/paddlenlp/datasets/dureader_robust-data.tar.gz"
35 |
36 |
37 | class DureaderRobustConfig(datasets.BuilderConfig):
38 | """BuilderConfig for DureaderRobust."""
39 | data_dir: str = None
40 | do_train: bool = True
41 | do_predict: bool = True
42 |
43 | def __init__(self, **kwargs):
44 | """BuilderConfig for DureaderRobust.
45 |
46 | Args:
47 | **kwargs: keyword arguments forwarded to super.
48 | """
49 | super(DureaderRobustConfig, self).__init__(**kwargs)
50 |
51 |
52 | class DureaderRobust(datasets.GeneratorBasedBuilder):
53 | BUILDER_CONFIGS = [
54 | DureaderRobustConfig(
55 | name="plain_text",
56 | version=datasets.Version("1.0.0", ""),
57 | description="Plain text",
58 | ),
59 | ]
60 |
61 | def _info(self):
62 | return datasets.DatasetInfo(
63 | description=_DESCRIPTION,
64 | features=datasets.Features({
65 | "id":
66 | datasets.Value("string"),
67 | "title":
68 | datasets.Value("string"),
69 | "context":
70 | datasets.Value("string"),
71 | "question":
72 | datasets.Value("string"),
73 | "answers":
74 | datasets.features.Sequence({
75 | "text": datasets.Value("string"),
76 | "answer_start": datasets.Value("int32"),
77 | }),
78 | }),
79 | # No default supervised_keys (as we have to pass both question
80 | # and context as input).
81 | supervised_keys=None,
82 | homepage="https://arxiv.org/abs/2004.11142",
83 | task_templates=[
84 | QuestionAnsweringExtractive(question_column="question",
85 | context_column="context",
86 | answers_column="answers")
87 | ],
88 | )
89 |
90 | def _split_generators(self, dl_manager):
91 | # dl_dir = dl_manager.download_and_extract(_URL)
92 | result = []
93 | if self.config.do_train:
94 | result += [
95 | datasets.SplitGenerator(name=datasets.Split.TRAIN,
96 | gen_kwargs={"filepath": os.path.join(self.config.data_dir, "train.json")}),
97 | datasets.SplitGenerator(name=datasets.Split.VALIDATION,
98 | gen_kwargs={"filepath": os.path.join(self.config.data_dir, "dev.json")}),
99 | ]
100 | if self.config.do_predict:
101 | result.append(
102 | datasets.SplitGenerator(name=datasets.Split.TEST,
103 | gen_kwargs={"filepath": os.path.join(self.config.data_dir, 'test.json')}))
104 | return result
105 |
106 | def _generate_examples(self, filepath):
107 | """This function returns the examples in the raw (text) form."""
108 | logger.info("generating examples from = %s", filepath)
109 | key = 0
110 | with open(filepath, encoding="utf-8") as f:
111 | durobust = json.load(f)
112 | for article in durobust["data"]:
113 | title = article.get("title", "")
114 | for paragraph in article["paragraphs"]:
115 | context = paragraph["context"] # do not strip leading blank spaces GH-2585
116 | for qa in paragraph["qas"]:
117 | answer_starts = [answer["answer_start"] for answer in qa.get("answers", '')]
118 | answers = [answer["text"] for answer in qa.get("answers", '')]
119 | # Features currently used are "context", "question", and "answers".
120 | # Others are extracted here for the ease of future expansions.
121 | yield key, {
122 | "title": title,
123 | "context": context,
124 | "question": qa["question"],
125 | "id": qa["id"],
126 | "answers": {
127 | "answer_start": answer_starts,
128 | "text": answers,
129 | },
130 | }
131 | key += 1
132 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/utils/logger.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 | import os
17 | import time
18 |
19 |
20 | class Logger():
21 |
22 | def __init__(self, args):
23 | self.args = args
24 | self.tic_train = time.time()
25 | self.performance_list = []
26 | self.mode_dict = {
27 | "ERROR": "31",
28 | "INFO": "32",
29 | "WARNING": "33",
30 | }
31 |
32 | def __get_time_str(self):
33 | return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "," + str(
34 | int(round(time.time() * 1000)) % 1000).zfill(3)
35 |
36 | def __log(self, log_str, mode="INFO"):
37 | print("\033[" + self.mode_dict[mode] + "m[" + self.__get_time_str() + "] [ " + mode + "]\033[0m - " +
38 | log_str)
39 |
40 | def info(self, info_str: str):
41 | self.__log(info_str, mode="INFO")
42 |
43 | def error(self, info_str: str):
44 | self.__log(info_str, mode="ERROR")
45 |
46 | def warn(self, info_str: str):
47 | self.__log(info_str, mode="WARNING")
48 |
49 | def logging_step(self, global_step, epoch, step, loss):
50 | if global_step % self.args.logging_steps == 0:
51 | self.info("Global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" %
52 | (global_step, epoch + 1, step + 1, loss, self.args.logging_steps /
53 | (time.time() - self.tic_train)))
54 | self.tic_train = time.time()
55 |
56 | def add_performance(self, performance):
57 | self.performance_list.append(performance)
58 |
59 | def save_performance(self):
60 | output_dir = os.path.join(self.args.output_dir, "logging.json")
61 | with open(output_dir, "w", encoding="utf8") as f:
62 | json.dump(self.performance_list, f)
63 |
64 | def logging_result(self, x):
65 | print(x)
66 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/utils/predict.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | from paddlenlp.metrics.squad import squad_evaluate
5 |
6 |
7 | def get_data(filepath):
8 | with open(filepath, encoding="utf-8") as f:
9 | durobust = json.load(f)
10 | data = []
11 | for article in durobust["data"]:
12 | title = article.get("title", "")
13 | for paragraph in article["paragraphs"]:
14 | context = paragraph["context"] # do not strip leading blank spaces GH-2585
15 | for qa in paragraph["qas"]:
16 | answer_starts = [answer["answer_start"] for answer in qa.get("answers", '')]
17 | answers = [answer["text"] for answer in qa.get("answers", '')]
18 | # Features currently used are "context", "question", and "answers".
19 | # Others are extracted here for the ease of future expansions.
20 | data.append({
21 | "title": title,
22 | "context": context,
23 | "question": qa["question"],
24 | "id": qa["id"],
25 | "answers": {
26 | "answer_start": answer_starts,
27 | "text": answers,
28 | },
29 | })
30 | return data
31 |
32 |
33 | if __name__ == "__main__":
34 | parser = argparse.ArgumentParser(description=__doc__)
35 | parser.add_argument("--test_data_dir", type=str, required=True, help="test data dir")
36 | parser.add_argument("--pred_data_dir", type=str, required=True, help="prediction data dir")
37 | args = parser.parse_args()
38 | raw_datasets = get_data(args.test_data_dir)
39 |
40 | with open(args.pred_data_dir, encoding="utf-8") as f:
41 | all_predictions = json.load(f)
42 |
43 | result = squad_evaluate(examples=raw_datasets, preds=all_predictions, is_whitespace_splited=False)
44 |
--------------------------------------------------------------------------------
/tutorials/redundancy_removal/utils/tools.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 |
17 | def remove_None(str_list: list) -> list:
18 | return [t for t in str_list if t is not None]
19 |
20 |
21 | def remove_blank_str(str_list: list) -> list:
22 | return [t for t in str_list if t != ""]
23 |
24 |
25 | def process_single_sign(str_list: list) -> list:
26 | x = []
27 | for i in str_list:
28 | if i in ["|", "!", "。", "!", "?", "\\x0d", ";", ";", "?", "!"]:
29 | continue
30 | x.append(i)
31 | return x
32 |
33 |
34 | def strip_str(s: str) -> str:
35 | return s.strip().strip("\n").strip("\\x0d")
36 |
37 |
38 | def print_red(s: str, flag=True):
39 | if flag:
40 | print('\033[31m' + s + '\033[0m')
41 |
42 |
43 | def split_sentence(s: str, remove_illegal_sign=True) -> list:
44 | # s.replace(" ","。")
45 | c_list = re.split('(。|!|?|\\x0d|;|;|\?|!|\||\.{2,}|[\u4E00-\u9FA5]\.{1,} *|\. )', s)
46 | c_list.append("")
47 | c_list = ["".join(i) for i in zip(c_list[0::2], c_list[1::2])]
48 | if remove_illegal_sign:
49 | c_list = remove_None(c_list)
50 | c_list = process_single_sign(c_list)
51 | c_list = remove_blank_str(c_list)
52 | return [strip_str(c) for c in c_list]
53 | else:
54 | return c_list
55 |
56 |
57 | def batchify(dataset, batch_size):
58 | batch = []
59 | for i, data in enumerate(dataset):
60 | if (i % batch_size == 0):
61 | batch.append([data])
62 | else:
63 | batch[int(i / batch_size)].append(data)
64 | return batch
65 |
66 |
67 | def padding_sentence(data, padding_value=0):
68 | max_x_len = 0
69 | max_y_len = 0
70 | for x in data:
71 | if len(x) > max_x_len:
72 | max_x_len = len(x)
73 | for y in x:
74 | if len(y) > max_y_len:
75 | max_y_len = len(y)
76 | for x in data:
77 |         x += [[padding_value] * max_y_len] * (max_x_len - len(x))
78 |         for y in x:
79 |             y += [padding_value] * (max_y_len - len(y))
80 | return data
81 |
82 |
83 | def padding_batch(data, padding_value=0):
84 | for x in data:
85 | max_y_len = 0
86 | for y in x:
87 | if len(y) > max_y_len:
88 | max_y_len = len(y)
89 | for y in x:
90 |             y += [padding_value] * (max_y_len - len(y))
91 | return data
92 |
--------------------------------------------------------------------------------
/tutorials/sparse_data_identification/README.md:
--------------------------------------------------------------------------------
1 | # Identifying Training-Data Coverage Gaps and Effective Data Augmentation
2 |
3 | ## Method
4 |
5 | Insufficient training-data coverage causes the model to perform poorly on the corresponding test data. Adding training data is the go-to remedy, but annotation is time-consuming and labor-intensive, so getting a larger improvement from fewer newly labeled examples is a challenge most NLP developers face.
6 |
7 |
8 | ![Coverage-gap identification and effective augmentation](../../imgs/覆盖不足识别.png)
9 | Figure 1: Workflow for identifying coverage gaps and performing effective data augmentation
10 |
11 |
12 | TrustAI provides the workflow shown in Figure 1 to improve model performance with as little annotation effort as possible. First, the instance-level evidence analysis methods from trustworthiness analysis are used to identify test samples whose poor predictions stem from insufficient training-data coverage; these form the target set (i.e., evidence-poor test data). Next, unlabeled examples that can serve as prediction evidence for the target set are selected for annotation. Finally, the newly annotated data is added to the training set and the model is retrained.
13 |
14 | Note: developers can try this case quickly via the [AI Studio example](https://aistudio.baidu.com/aistudio/projectdetail/4434403).
15 |
16 | ## Experiment Steps
17 |
18 | We use a simulated experiment on the LCQMC similarity-matching dataset to illustrate the steps and effectiveness of this scheme.
19 |
20 | **Step 1**: Randomly sample 5,000 examples from the LCQMC training data as the new training set and treat the rest as unlabeled data. Fine-tune ERNIE-3.0-base-zh on the new training set `train_5000.tsv` to obtain the similarity baseline model:
21 |
22 | ```shell
23 | # Download the data
24 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/sparse_data.tar && tar xf sparse_data.tar && rm sparse_data.tar
25 | # Train the baseline model
26 | python -u train.py --dataset_dir ./data --train_file train_5000.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./checkpoint
27 | ```
28 | The trained baseline model is saved in the `checkpoint` directory.
29 |
30 |
31 | **Step 2**: Use the baseline model to select target data from the validation set, i.e., the **target set**.
32 |
33 | Target-set selection: for each validation example, TrustAI's instance-level evidence analysis method `FeatureSimilarityModel` is used to score every training example as positive supporting evidence, and the average support score of that validation example over all training data is computed. Examples with low scores lack supporting evidence and are added to the target set (a schematic sketch of this scoring is given after the command below).
34 |
35 | ```shell
36 | # Select the target data
37 | python -u find_sparse_data.py --dataset_dir ./data --train_file train_5000.tsv --dev_file dev.tsv --num_classes 2 --init_from_ckpt ./checkpoint/model_state.pdparams --sparse_num 50 --sparse_path ./data/sparse_data.tsv
38 | # sparse_num is the number of target examples to select
39 | # sparse_path is the path where the target set is saved
40 | ```
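Conceptually, the scoring performed by `find_sparse_data.py` can be illustrated by the sketch below, which approximates "positive supporting evidence" with cosine similarity between sentence-level features extracted by the baseline model. The function name, inputs, and the exact definition of the score are assumptions made for illustration; the real implementation relies on TrustAI's `FeatureSimilarityModel`.

```python
import paddle
import paddle.nn.functional as F


def average_support_scores(dev_feats, train_feats, dev_labels, train_labels):
    """For every dev example, average its cosine similarity to the training
    examples sharing its label (treated here as positive supporting evidence).

    dev_feats / train_feats: [N, hidden_size] features from the baseline model.
    dev_labels / train_labels: [N] int64 label tensors.
    """
    dev_feats = F.normalize(dev_feats, axis=-1)
    train_feats = F.normalize(train_feats, axis=-1)
    sim = paddle.matmul(dev_feats, train_feats, transpose_y=True)  # [N_dev, N_train]

    scores = []
    for i in range(sim.shape[0]):
        positive = (train_labels == dev_labels[i]).astype('float32')
        # Mean similarity to same-label ("positive evidence") training examples.
        score = (sim[i] * positive).sum() / (positive.sum() + 1e-12)
        scores.append(float(score))
    # Low scores indicate evidence-poor examples -> candidates for the target set.
    return scores
```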
41 |
42 | As shown in Figure 2, model performance drops markedly on the target set.
43 |
44 | ![Model performance on the target set](../../imgs/sparse_analysis.png)
45 | Figure 2: Model performance on the full test set and on the target set
46 |
47 |
48 |
49 | **Step 3**: For the data in the target set, apply `FeatureSimilarityModel` again to select, from the unlabeled data `rest_train.tsv`, examples that can support their predictions; these serve as augmentation data to be manually annotated (a selection sketch is given after the command below).
50 |
51 | Note: this is a simulated experiment, so the data in `rest_train.tsv` has already been labeled.
52 |
53 | ```shell
54 | # Select effective data
55 | python -u find_valid_data.py --dataset_dir ./data --unlabeled_file rest_train.tsv --target_file sparse_data.tsv --num_classes 2 --init_from_ckpt ./checkpoint/model_state.pdparams --valid_threshold 0.7 --valid_num 1000 --valid_path ./data/valid_data.tsv
56 | # valid_threshold is the evidence score threshold for the target set; developers can tune it for their own data (default 0.7)
57 | # valid_num is the number of effective examples to extract
58 | # valid_path is the path where the effective data is saved
59 | ```
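In the same spirit, the selection of effective unlabeled examples can be sketched as a threshold-plus-top-k filter on similarity to the target set. `select_valid_candidates`, its defaults, and the scoring are illustrative assumptions only; the actual logic lives in `find_valid_data.py` and uses `FeatureSimilarityModel`.

```python
import paddle
import paddle.nn.functional as F


def select_valid_candidates(unlabeled_feats, target_feats, threshold=0.7, top_k=1000):
    """Keep unlabeled examples whose best similarity to any target-set example
    exceeds `threshold`, then return at most `top_k` of them (highest first)."""
    u = F.normalize(unlabeled_feats, axis=-1)
    t = F.normalize(target_feats, axis=-1)
    best_sim = paddle.matmul(u, t, transpose_y=True).max(axis=-1)  # [N_unlabeled]

    order = paddle.argsort(best_sim, descending=True)
    picked = [int(i) for i in order.numpy() if float(best_sim[int(i)]) >= threshold]
    # These indices point to unlabeled examples worth sending for annotation.
    return picked[:top_k]
```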
60 |
61 | **Step 4**: After annotating the augmentation data, add it to the original training set, retrain the model, and evaluate its performance.
62 |
63 | ```shell
64 | # Add the annotated effective data to the original training data
65 | cat ./data/train_5000.tsv ./data/valid_data.tsv > ./data/merge_valid.tsv
66 | # Train the model on the augmented data
67 | python -u train.py --dataset_dir ./data --train_file merge_valid.tsv --dev_file dev.tsv --test_files test.tsv DuQM sparse_data.tsv --num_classes 2 --save_dir ./valid_checkpoint
68 | ```
69 | The results are shown in the table below (comparison baseline: randomly selecting the same amount of data for annotation as augmentation data):
70 |
71 | | Setting | Size | LCQMC-dev | LCQMC-test | DuQM | Target set |
72 | | :-------: | :-------: | :-----: | :-----: |:-----: |:-----: |
73 | | Baseline | 5000 | 86.31% | 84.49% | 69.17% | 55.20% |
74 | | Baseline + 1,000 random | 6000 | 86.76% | 85.05% | 69.23% | 55.20% |
75 | | Baseline + 1,000 strategically selected | 6000 | 87.04% | 85.58% | 70.20% | 69.60% |
76 |
77 | Note: all results are averaged over 10 runs.
78 |
--------------------------------------------------------------------------------
/tutorials/sparse_data_identification/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 |
17 | import paddle
18 | import paddle.nn.functional as F
19 | from paddlenlp.utils.log import logger
20 |
21 |
22 | @paddle.no_grad()
23 | def evaluate(model, criterion, metric, data_loader, name=''):
24 | """
25 | Given a dataset, it evaluates model and computes the metric.
26 | Args:
27 | model(obj:`paddle.nn.Layer`): A model to classify texts.
28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss.
29 | metric(obj:`paddle.metric.Metric`): The evaluation metric.
30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
31 | """
32 |
33 | model.eval()
34 | metric.reset()
35 | losses = []
36 | for batch in data_loader:
37 | input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
38 | logits = model(input_ids, token_type_ids)
39 | loss = criterion(logits, labels)
40 | losses.append(loss.numpy())
41 | correct = metric.compute(logits, labels)
42 | metric.update(correct)
43 |
44 | acc = metric.accumulate()
45 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc))
46 | model.train()
47 | metric.reset()
48 |
49 | return acc
50 |
51 |
52 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False):
53 | """
54 | Builds model inputs from a sequence for sequence classification tasks
55 | by concatenating and adding special tokens.
56 |
57 | Args:
58 |         example(obj:`dict`): Input data containing `text_a` (and optionally `text_b`) plus the label when available.
59 |         tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
60 |             which contains most of the methods. Users should refer to the superclass for more information regarding methods.
61 |         max_seq_length(obj:`int`): The maximum total input sequence length after tokenization.
62 |             Sequences longer than this will be truncated, sequences shorter will be padded.
63 |         is_test(obj:`bool`): Whether the example comes from the test set and therefore has no label.
64 | Returns:
65 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels.
66 | """
67 | if 'text_b' not in example:
68 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length)
69 | else:
70 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length)
71 |
72 | if not is_test:
73 | result["labels"] = np.array([example['label']], dtype='int64')
74 | return result
75 |
--------------------------------------------------------------------------------