├── .github └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── ci.yml ├── competitions ├── mrc_baseline.ipynb ├── mrc_utils.py ├── sentiment_analysis_baseline.ipynb ├── textual_similarity_baseline.ipynb └── utils.py ├── examples ├── assets │ └── utils.py ├── evaluation │ ├── zh-pretrain-ernie-1.0-evaluation.ipynb │ └── zh-sentiment-analysis-evaluation.ipynb └── interpretation │ ├── example_level │ ├── en-similarity_for_reppoint.ipynb │ ├── zh-sentiment-analysis_for_feature_sim.ipynb │ ├── zh-sentiment-analysis_for_gradient_sim.ipynb │ └── zh-sentiment-analysis_for_reppoint.ipynb │ └── token_level │ ├── zh-sentiment-analysis.ipynb │ ├── zh-sentiment-analysis_for_normlime.ipynb │ └── zh-similarity.ipynb ├── imgs ├── bias.png ├── bias_correction.png ├── data_map.png ├── data_map_criterion.png ├── data_map_lt.png ├── data_map_main.png ├── data_map_normal.png ├── dirty-accuracy.png ├── dirty.png ├── dirty_analysis.png ├── equation1.png ├── equation2.png ├── equation3.png ├── equation4.png ├── equation5.png ├── example.gif ├── example.png ├── example_case.png ├── framework.png ├── overview.png ├── paddlenlp脏数据识别及修正.png ├── paddlenlp覆盖不足识别及有效增强.png ├── pretrained_labeled_case.png ├── process-for-dirty.png ├── process-for-sparse.png ├── rationale.png ├── rationale_example.png ├── redundancy_removal.png ├── saliency_map.png ├── sentiment_labeled_case.png ├── sparse.png ├── sparse_analysis.png ├── target-performance.png ├── token.gif ├── token.png ├── trustai.png ├── visual.png ├── visual2.png ├── visual3.png ├── why_trustai.png └── 覆盖不足识别.png ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py └── interpretation │ ├── __init__.py │ ├── assets │ └── utils.py │ └── example_level │ ├── __init__.py │ ├── test_feature_similarity.py │ ├── test_gradient_similarity.py │ └── test_reppoint.py ├── trustai ├── .gitignore ├── __init__.py ├── demo │ ├── __init__.py │ ├── demo.py │ └── utils.py ├── evaluation │ ├── README.md │ ├── __init__.py │ └── evaluator.py └── interpretation │ ├── __init__.py │ ├── base_interpret.py │ ├── example_level │ ├── README.md │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── data_class.py │ │ └── utils.py │ └── method │ │ ├── __init__.py │ │ ├── example_base_interpreter.py │ │ ├── feature_similarity.py │ │ ├── gradient_similarity.py │ │ └── representer_point.py │ ├── python_utils.py │ └── token_level │ ├── README.md │ ├── __init__.py │ ├── common │ ├── __init__.py │ ├── postprocess_attribution.py │ └── predict_functions.py │ ├── data_processor │ ├── __init__.py │ ├── data_class.py │ └── visualizer.py │ └── method │ ├── __init__.py │ ├── attention.py │ ├── base_interpret.py │ ├── gradient_shap.py │ ├── integrated_gradients.py │ ├── lime.py │ └── norm_lime.py └── tutorials ├── README.md ├── data_bias_identification ├── data_distribution_correction │ ├── README.md │ ├── balance_train_data.py │ ├── get_rationale_importance.py │ ├── train.py │ └── utils.py └── less_learn_shortcut │ ├── README.md │ ├── find_bias_word.py │ ├── lls.py │ ├── train.py │ └── utils.py ├── data_map ├── README.md ├── data.py ├── plot_map.py ├── run_train_pointwise.sh ├── sample_100.tsv ├── sample_stat_summary.py └── train_pointwise.py ├── dirty_data_identification ├── README.md ├── find_dirty_data.py ├── train.py └── utils.py ├── enhanced_by_rationale ├── README.md ├── train.py └── utils.py ├── map_analysis ├── utils.py └── zh-similarity-application.ipynb ├── redundancy_removal ├── README.md ├── args.py ├── download.sh 
├── predictor │ ├── dataloader_factory.py │ ├── model.py │ └── model_manager.py ├── requirements.txt ├── run_predict.py ├── run_select.py ├── selector │ ├── dataloader_factory.py │ ├── model.py │ └── model_manager.py ├── test.sh ├── train.sh ├── train_predictor.sh ├── train_select_data.sh ├── train_selector.sh └── utils │ ├── checklist_process.py │ ├── dureader_robust.py │ ├── logger.py │ ├── predict.py │ └── tools.py └── sparse_data_identification ├── README.md ├── find_sparse_data.py ├── find_valid_data.py ├── train.py └── utils.py /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main, pattern1 ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '19 6 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ python ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v2 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 
59 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v2 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Virtualenv 2 | /.venv/ 3 | /venv/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | /bin/ 14 | /build/ 15 | /develop-eggs/ 16 | /dist/ 17 | /eggs/ 18 | /lib/ 19 | /lib64/ 20 | /output/ 21 | /parts/ 22 | /sdist/ 23 | /var/ 24 | /*.egg-info/ 25 | /.installed.cfg 26 | /*.egg 27 | /.eggs 28 | 29 | # AUTHORS and ChangeLog will be generated while packaging 30 | /AUTHORS 31 | /ChangeLog 32 | 33 | # BCloud / BuildSubmitter 34 | /build_submitter.* 35 | /logger_client_log 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | .tox/ 43 | .coverage 44 | .cache 45 | .pytest_cache 46 | nosetests.xml 47 | coverage.xml 48 | .idea 49 | # Translations 50 | *.mo 51 | 52 | # Sphinx documentation 53 | /docs/_build/ 54 | 55 | 56 | *bak 57 | *copy* 58 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-yapf 3 | rev: v0.32.0 4 | hooks: 5 | - id: yapf 6 | files: \.py$ 7 | args: ["--style={column_limit: 120}"] 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: a11d9314b22d8f8c7556443875b731ef05965464 10 | hooks: 11 | - id: check-merge-conflict 12 | - id: check-symlinks 13 | - id: detect-private-key 14 | - id: end-of-file-fixer 15 | files: \.md$ 16 | - id: trailing-whitespace 17 | files: \.md$ 18 | - repo: https://github.com/Lucas-C/pre-commit-hooks 19 | rev: v1.0.1 20 | hooks: 21 | - id: forbid-crlf 22 | files: \.md$ 23 | - id: remove-crlf 24 | files: \.md$ 25 | - id: forbid-tabs 26 | files: \.md$ 27 | - id: remove-tabs 28 | exclude: (\.tsv)$ -------------------------------------------------------------------------------- /ci.yml: -------------------------------------------------------------------------------- 1 | Global: 2 | tool : build_submitter 3 | 4 | Default: 5 | profile : [publish] 6 | 7 | Profiles: 8 | - profile: 9 | name : dev 10 | env: DECK_CENTOS6U3_K3 11 | command : python setup.py bdist_wheel 12 | release : true 13 | 14 | - profile: 15 | name : publish 16 | env: DECK_CENTOS6U3_K3 17 | command : python setup.py bdist_wheel 18 | release : true 19 | -------------------------------------------------------------------------------- /imgs/bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/bias.png -------------------------------------------------------------------------------- /imgs/bias_correction.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/bias_correction.png -------------------------------------------------------------------------------- /imgs/data_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map.png -------------------------------------------------------------------------------- /imgs/data_map_criterion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_criterion.png -------------------------------------------------------------------------------- /imgs/data_map_lt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_lt.png -------------------------------------------------------------------------------- /imgs/data_map_main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_main.png -------------------------------------------------------------------------------- /imgs/data_map_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/data_map_normal.png -------------------------------------------------------------------------------- /imgs/dirty-accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/dirty-accuracy.png -------------------------------------------------------------------------------- /imgs/dirty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/dirty.png -------------------------------------------------------------------------------- /imgs/dirty_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/dirty_analysis.png -------------------------------------------------------------------------------- /imgs/equation1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation1.png -------------------------------------------------------------------------------- /imgs/equation2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation2.png -------------------------------------------------------------------------------- /imgs/equation3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation3.png -------------------------------------------------------------------------------- /imgs/equation4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation4.png -------------------------------------------------------------------------------- /imgs/equation5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/equation5.png -------------------------------------------------------------------------------- /imgs/example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/example.gif -------------------------------------------------------------------------------- /imgs/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/example.png -------------------------------------------------------------------------------- /imgs/example_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/example_case.png -------------------------------------------------------------------------------- /imgs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/framework.png -------------------------------------------------------------------------------- /imgs/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/overview.png -------------------------------------------------------------------------------- /imgs/paddlenlp脏数据识别及修正.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/paddlenlp脏数据识别及修正.png -------------------------------------------------------------------------------- /imgs/paddlenlp覆盖不足识别及有效增强.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/paddlenlp覆盖不足识别及有效增强.png -------------------------------------------------------------------------------- /imgs/pretrained_labeled_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/pretrained_labeled_case.png -------------------------------------------------------------------------------- /imgs/process-for-dirty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/process-for-dirty.png -------------------------------------------------------------------------------- /imgs/process-for-sparse.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/process-for-sparse.png -------------------------------------------------------------------------------- /imgs/rationale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/rationale.png -------------------------------------------------------------------------------- /imgs/rationale_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/rationale_example.png -------------------------------------------------------------------------------- /imgs/redundancy_removal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/redundancy_removal.png -------------------------------------------------------------------------------- /imgs/saliency_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/saliency_map.png -------------------------------------------------------------------------------- /imgs/sentiment_labeled_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/sentiment_labeled_case.png -------------------------------------------------------------------------------- /imgs/sparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/sparse.png -------------------------------------------------------------------------------- /imgs/sparse_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/sparse_analysis.png -------------------------------------------------------------------------------- /imgs/target-performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/target-performance.png -------------------------------------------------------------------------------- /imgs/token.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/token.gif -------------------------------------------------------------------------------- /imgs/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/token.png -------------------------------------------------------------------------------- /imgs/trustai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/trustai.png -------------------------------------------------------------------------------- 
/imgs/visual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/visual.png -------------------------------------------------------------------------------- /imgs/visual2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/visual2.png -------------------------------------------------------------------------------- /imgs/visual3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/visual3.png -------------------------------------------------------------------------------- /imgs/why_trustai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/why_trustai.png -------------------------------------------------------------------------------- /imgs/覆盖不足识别.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/imgs/覆盖不足识别.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | paddlenlp 3 | paddlepaddle-gpu >= 2.0.0 4 | scikit-learn 5 | tqdm 6 | matplotlib 7 | IPython 8 | pre-commit -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Here's a link about setup.cfg 2 | # https://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files 3 | [metadata] 4 | # Project name, the project name will be used while publishing and installing 5 | name = trustai 6 | # Author's name and email address 7 | author = Baidu NLP 8 | author_email = nlp-parser@baidu.com 9 | # Project version, versions only above than 1.0 will assumed as a released version. 10 | # When modifying project version to above than 1.0, here's the rules should be followed. 11 | # http://wiki.baidu.com/pages/viewpage.action?pageId=469686381 12 | version = 0.1.12 13 | # A brief introduction about the project, ANY NON-ENGLISH CHARACTER IS NOT SUPPORTED! 14 | description = baidu TrustAI 15 | # A longer version of introduction abouth the project, you can also include readme, change log, etc. .md or rst file is recommended. 16 | long_description = file: README.md 17 | long_description_content_type = text/markdown 18 | # Main page of the project, usually the project's icode page, you can set to its wiki or other documents url instead. 19 | home_page = https://github.com/PaddlePaddle/TrustAI 20 | # License, you can ignore this if the project is not going to open source to the public. 21 | license = Apache License 2.0 22 | # Project type, you can ignore this if the project is not going to open source to the public. 23 | # Choose the right field to fulfill from PyPI's official list. 
24 | # https://pypi.org/pypi?%3Aaction=list_classifiers 25 | classifier = 26 | Programming Language :: Python :: 3 27 | Programming Language :: Python :: 3.6 28 | Programming Language :: Python :: 3.7 29 | Programming Language :: Python :: 3.8 30 | License :: OSI Approved :: Apache Software License 31 | Operating System :: OS Independent 32 | # keywords, used for indexing, easier to search for other users if they are interested of your project. 33 | keywords = 34 | baidu 35 | TrustAI 36 | interpretation 37 | 38 | [options] 39 | # Package name. find means search automatically, you also can have detailed configuration in options.packages.find 40 | packages = find: 41 | # Dependency management, all project's dependency is needed here. 42 | # Every single line for a specified dependency, only the dependency is need, you don't have to consider the hierarchy dependency 43 | # Versions here should be as abstract as possible, usually only specific a version range including minimum and maximum version. 44 | install_requires = 45 | scikit-learn 46 | numpy 47 | tqdm 48 | matplotlib 49 | IPython 50 | 51 | # Test dependencies, all dependencies for tests here. The format is align to install_requires. 52 | # You can use the internal unittest, or the simplier framework such as pytest or nose. 53 | # python3 has a mock library with itself, but it's not exist in python 2, add as you need. 54 | #tests_require = 55 | # pytest 56 | # mock 57 | 58 | # directory for unit test 59 | test_suite = trustai.tests 60 | # add all data files controled by git 61 | include_package_data = True 62 | # You can run zip source code for plain python project 63 | zip_safe = False 64 | 65 | # You can set this configuration to let users run directly the main entrance function 66 | #[options.entry_points] 67 | #console_scripts = 68 | # TrustAI = trustai.cmdline:main 69 | 70 | # You can add conf/data directory into package, the following directory will be installed under site-package 71 | # Only file is supported, but you can use wildcard instead. 72 | #[options.package_data] 73 | #trustai = 74 | # conf/* 75 | # data/* 76 | 77 | [sdist] 78 | dist_dir = output/dist 79 | 80 | [bdist_wheel] 81 | # set universal=1 if this project can both run in python 2 or 3 environment. 82 | #universal=1 83 | dist_dir = output/dist 84 | 85 | [easy_install] 86 | # using baidu's official pip source 87 | index_url = http://pip.baidu.com/root/baidu/+simple/ 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | ################################################################################ 4 | # 5 | # Copyright (c) 2022 Baidu.com, Inc. All Rights Reserved 6 | # 7 | ################################################################################ 8 | """ 9 | Setup script. 
10 | 11 | Authors: zhangshuai28(zhangshuai28@baidu.com) 12 | Date: 2022/03/14 14:53:37 13 | """ 14 | 15 | import setuptools 16 | 17 | setuptools.setup() 18 | 19 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/tests/__init__.py -------------------------------------------------------------------------------- /tests/interpretation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/tests/interpretation/__init__.py -------------------------------------------------------------------------------- /tests/interpretation/example_level/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/tests/interpretation/example_level/__init__.py -------------------------------------------------------------------------------- /tests/interpretation/example_level/test_feature_similarity.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | import os 3 | import sys 4 | import unittest 5 | from functools import partial 6 | 7 | import paddle 8 | import numpy as np 9 | from paddlenlp.data import Stack, Tuple, Pad, Vocab, JiebaTokenizer 10 | from paddlenlp.datasets import load_dataset, MapDataset 11 | from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer 12 | 13 | sys.path.insert(0, '../') 14 | sys.path.insert(0, '../../../') 15 | from assets.utils import ( 16 | create_dataloader, 17 | convert_example, 18 | create_dataloader_from_scratch, 19 | LSTMModel, 20 | preprocess_fn_lstm, 21 | get_sublayer, 22 | ) 23 | from trustai.interpretation.example_level.method.feature_similarity import ( 24 | FeatureSimilarityModel, ) 25 | 26 | 27 | class TestFeatureSimilarity(unittest.TestCase): 28 | 29 | def test_bert_model(self): 30 | MODEL_NAME = "ernie-1.0" 31 | DATASET_NAME = "chnsenticorp" 32 | paddle_model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2) 33 | tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME) 34 | state_dict = paddle.load(f"../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams") 35 | paddle_model.set_dict(state_dict) 36 | 37 | train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"]) 38 | 39 | batch_size = 32 40 | max_seq_length = 128 41 | 42 | trans_func = partial( 43 | convert_example, 44 | tokenizer=tokenizer, 45 | max_seq_length=max_seq_length, 46 | is_test=True, 47 | ) 48 | batchify_fn = lambda samples, fn=Tuple( 49 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input 50 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment 51 | ): [data for data in fn(samples)] 52 | 53 | train_data_loader = create_dataloader( 54 | train_ds, 55 | mode="train", 56 | batch_size=batch_size, 57 | batchify_fn=batchify_fn, 58 | trans_fn=trans_func, 59 | shuffle=False, 60 | ) 61 | 62 | feature_sim_model = FeatureSimilarityModel(paddle_model, train_data_loader, classifier_layer_name="classifier") 63 | 64 | def test_predict_fn(self): 65 | 66 | def predict_fn(inputs, paddle_model, classifier_layer_name="classifier"): 67 | """predict_fn""" 68 | 69 | x_feature = [] 70 | 71 | 
def forward_pre_hook(layer, input): 72 | """ 73 | Hook for a given layer in model. 74 | """ 75 | x_feature.extend(input[0]) 76 | 77 | classifier = get_sublayer(paddle_model, classifier_layer_name) 78 | 79 | forward_pre_hook_handle = classifier.register_forward_pre_hook(forward_pre_hook) 80 | 81 | if isinstance(inputs, (tuple, list)): 82 | logits = paddle_model(*inputs) # get logits, [bs, num_c] 83 | else: 84 | logits = paddle_model(inputs) # get logits, [bs, num_c] 85 | 86 | forward_pre_hook_handle.remove() 87 | 88 | probas = paddle.nn.functional.softmax(logits, axis=1) # get probabilities. 89 | preds = paddle.argmax(probas, axis=1) # get predictions. 90 | x_feature = paddle.to_tensor(x_feature) 91 | return x_feature, probas, preds 92 | 93 | MODEL_NAME = "ernie-1.0" 94 | DATASET_NAME = "chnsenticorp" 95 | paddle_model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2) 96 | tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME) 97 | state_dict = paddle.load(f"../assets/{DATASET_NAME}-{MODEL_NAME}/model_state.pdparams") 98 | paddle_model.set_dict(state_dict) 99 | 100 | train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"]) 101 | 102 | batch_size = 32 103 | max_seq_length = 128 104 | 105 | trans_func = partial( 106 | convert_example, 107 | tokenizer=tokenizer, 108 | max_seq_length=max_seq_length, 109 | is_test=True, 110 | ) 111 | batchify_fn = lambda samples, fn=Tuple( 112 | Pad(axis=0, pad_val=tokenizer.pad_token_id), # input 113 | Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment 114 | ): [data for data in fn(samples)] 115 | 116 | predict_fn_test = partial(predict_fn, paddle_model=paddle_model) 117 | 118 | train_data_loader = create_dataloader( 119 | train_ds, 120 | mode="train", 121 | batch_size=batch_size, 122 | batchify_fn=batchify_fn, 123 | trans_fn=trans_func, 124 | shuffle=False, 125 | ) 126 | 127 | feature_sim_model = FeatureSimilarityModel( 128 | paddle_model, 129 | train_data_loader, 130 | classifier_layer_name="classifier", 131 | predict_fn=predict_fn_test, 132 | ) 133 | 134 | def test_lstm_model(self): 135 | PARAMS_PATH = "../assets/chnsenticorp-bilstm/final.pdparams" 136 | VOCAB_PATH = "../assets/chnsenticorp-bilstm/bilstm_word_dict.txt" 137 | vocab = Vocab.from_json(VOCAB_PATH) 138 | tokenizer = JiebaTokenizer(vocab) 139 | label_map = {0: "negative", 1: "positive"} 140 | vocab_size = len(vocab) 141 | num_classes = len(label_map) 142 | pad_token_id = vocab.to_indices("[PAD]") 143 | 144 | DATASET_NAME = "chnsenticorp" 145 | paddle_model = LSTMModel(vocab_size, num_classes, direction="bidirect", padding_idx=pad_token_id) 146 | state_dict = paddle.load(PARAMS_PATH) 147 | paddle_model.set_dict(state_dict) 148 | 149 | train_ds, dev_ds, test_ds = load_dataset(DATASET_NAME, splits=["train", "dev", "test"]) 150 | 151 | # train_ds = [d['text'] for d in list(train_ds)[:1200]] 152 | # train_ds = [d["text"] for d in list(train_ds)] 153 | # train_ds = MapDataset(train_ds) 154 | 155 | batch_size = 32 156 | max_seq_length = 128 157 | 158 | trans_func = partial(preprocess_fn_lstm, tokenizer=tokenizer, is_test=True) 159 | batchify_fn = lambda samples, fn=Tuple( 160 | Pad(axis=0, pad_val=pad_token_id), # input 161 | Pad(axis=0, pad_val=pad_token_id), # sequence_length 162 | ): [data for data in fn(samples)] 163 | 164 | train_data_loader = create_dataloader( 165 | train_ds, 166 | mode="train", 167 | batch_size=batch_size, 168 | batchify_fn=batchify_fn, 169 | trans_fn=trans_func, 170 | shuffle=False, 171 | ) 172 | 173 | 
feature_sim_model = FeatureSimilarityModel(paddle_model, 174 | train_data_loader, 175 | classifier_layer_name="output_layer") 176 | 177 | 178 | if __name__ == "__main__": 179 | unittest.main() 180 | -------------------------------------------------------------------------------- /trustai/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | -------------------------------------------------------------------------------- /trustai/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TrustAI""" 15 | 16 | __version__ = "0.1.12" -------------------------------------------------------------------------------- /trustai/demo/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Init file""" 15 | 16 | from .demo import * -------------------------------------------------------------------------------- /trustai/demo/demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """pretrain for demo""" 15 | 16 | import logging 17 | import logging.handlers 18 | import os 19 | import sys 20 | import re 21 | import requests 22 | import shutil 23 | import tarfile 24 | import warnings 25 | import functools 26 | 27 | from tqdm import tqdm 28 | from paddle.io import DataLoader, BatchSampler 29 | try: 30 | from paddlenlp.transformers import * 31 | from paddlenlp.datasets import load_dataset 32 | 33 | except ImportError as e: 34 | sys.stderr.write( 35 | '''The demo module depends on paddlenlp, please install paddlenlp firstly. cmd: pip install -U paddlenlp. ''') 36 | exit(-1) 37 | 38 | from .utils import DOWNLOAD_MODEL_PATH_DICT, MODEL_HOME, get_path_from_url 39 | from .utils import LocalDataCollatorWithPadding, preprocess_function, get_path_from_url 40 | 41 | 42 | class DEMO(object): 43 | 44 | def __init__(self, task_name, device: str = None): 45 | self.device = device 46 | assert self.device is None or isinstance(self.device, str) and re.search( 47 | r"^cpu$|^gpu$|^gpu:\d+$", self.device 48 | ) is not None, "The format of the ``devices`` should be like ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc." 49 | 50 | self._paddle_env_set() 51 | self.task_name = task_name 52 | model_path = self.get_model_path(task_name) 53 | self.paddle_model = AutoModelForSequenceClassification.from_pretrained(model_path) 54 | self.tokenizer = AutoTokenizer.from_pretrained(model_path) 55 | self.unk_id = self.tokenizer.unk_token_id 56 | self.pad_id = self.tokenizer.pad_token_type_id 57 | self.cls_id = self.tokenizer.cls_token_id 58 | self.mask_id = self.tokenizer.mask_token_id 59 | 60 | def get_model_path(self, model_name): 61 | try: 62 | model_url, md5sum = DOWNLOAD_MODEL_PATH_DICT[model_name] 63 | except KeyError: 64 | logging.warn( 65 | f"The model_name `{model_name}` is wrong, currently only the following models are supported : {', '.join(DOWNLOAD_MODEL_PATH_DICT.keys())}." 
66 | ) 67 | exit(-1) 68 | model_path = get_path_from_url(model_url, MODEL_HOME, md5sum=md5sum) 69 | return model_path 70 | 71 | def get_model(self): 72 | return self.paddle_model 73 | 74 | def get_tokenizer(self): 75 | return self.tokenizer 76 | 77 | def get_train_data_and_dataloader(self, batch_size=8, max_seq_length=256): 78 | task_name = self.task_name.split('/') 79 | if len(task_name) == 2: 80 | train_ds = load_dataset(task_name[0], name=task_name[1], splits=["train"]) 81 | else: 82 | train_ds = load_dataset(task_name[0], splits=["train"]) 83 | trans_func = functools.partial(preprocess_function, 84 | max_seq_length=max_seq_length, 85 | tokenizer=self.tokenizer, 86 | is_test=True) 87 | train_ds = train_ds.map(trans_func) 88 | train_batch_sampler = BatchSampler(train_ds, batch_size=batch_size, shuffle=False) 89 | collate_fn = LocalDataCollatorWithPadding(self.tokenizer) 90 | train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn) 91 | return train_ds.data, train_data_loader, 92 | 93 | def __call__(self, *args, **kwargs): 94 | return self.process(*args, **kwargs) 95 | 96 | def process(self, text, text_pair=None): 97 | tokenize_result = self.tokenizer(text, text_pair=text_pair, return_tensors='pd', padding=True) 98 | input_ids = tokenize_result['input_ids'] 99 | token_type_ids = tokenize_result['token_type_ids'] 100 | tokens = [self.tokenizer.convert_ids_to_tokens(_input_ids) for _input_ids in input_ids.tolist()] 101 | return tokens, (input_ids, token_type_ids) 102 | 103 | def _paddle_env_set(self): 104 | import paddle 105 | if self.device is not None: 106 | if not paddle.is_compiled_with_cuda() and self.device[:3] == 'gpu': 107 | warnings.warn("Paddle is not installed with GPU support. Change to CPU version now.") 108 | self.device = 'cpu' 109 | 110 | # globally set device. 111 | paddle.set_device(self.device) 112 | 113 | def __getitem__(self, key): 114 | return getattr(self, key) -------------------------------------------------------------------------------- /trustai/evaluation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/TrustAI/f0cc490438d4619b97c1bcf26794ffcea4f0ce95/trustai/evaluation/README.md -------------------------------------------------------------------------------- /trustai/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Init file""" 15 | 16 | from .evaluator import * -------------------------------------------------------------------------------- /trustai/interpretation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """interpreter""" 15 | 16 | from .token_level import * 17 | from .example_level import * -------------------------------------------------------------------------------- /trustai/interpretation/base_interpret.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """base interpreter""" 15 | 16 | import abc 17 | import sys 18 | import numpy as np 19 | import re 20 | import warnings 21 | 22 | from .python_utils import versiontuple2tuple 23 | 24 | 25 | class Interpreter(abc.ABC): 26 | """Interpreter is the base class for all interpretation algorithms. 27 | Args: 28 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions. 29 | device (str): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. default: None 30 | """ 31 | 32 | def __init__(self, paddle_model: callable, device: str = None, **kwargs): 33 | self.device = device 34 | self.paddle_model = paddle_model 35 | self.predict_fn = None 36 | 37 | assert self.device is None or isinstance(self.device, str) and re.search( 38 | r"^cpu$|^gpu$|^gpu:\d+$", self.device 39 | ) is not None, "The format of the ``devices`` should be like ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc." 40 | 41 | self._paddle_env_set() 42 | 43 | def __call__(self, *args, **kwargs): 44 | return self.interpret(*args, **kwargs) 45 | 46 | @abc.abstractmethod 47 | def interpret(self, **kwargs): 48 | """Main function of the interpreter.""" 49 | raise NotImplementedError 50 | 51 | @abc.abstractmethod 52 | def _build_predict_fn(self, **kwargs): 53 | """Build self.predict_fn for interpreters.""" 54 | raise NotImplementedError 55 | 56 | def _paddle_env_set(self): 57 | import paddle 58 | if self.device is not None: 59 | if not paddle.is_compiled_with_cuda() and self.device[:3] == 'gpu': 60 | warnings.warn("Paddle is not installed with GPU support. Change to CPU version now.") 61 | self.device = 'cpu' 62 | 63 | # globally set device. 64 | paddle.set_device(self.device) 65 | self.paddle_model.to(self.device) 66 | 67 | if versiontuple2tuple(paddle.version.full_version) >= (2, 2, 1): 68 | # From Paddle2.2.1, gradients are supported in eval mode. 69 | self.paddle_model.eval() 70 | else: 71 | # Former versions. 
72 | self.paddle_model.train() 73 | for n, v in self.paddle_model.named_sublayers(): 74 | if "batchnorm" in v.__class__.__name__.lower(): 75 | v._use_global_stats = True 76 | if "dropout" in v.__class__.__name__.lower(): 77 | v.p = 0 78 | -------------------------------------------------------------------------------- /trustai/interpretation/example_level/README.md: -------------------------------------------------------------------------------- 1 | # 实例级证据分析 2 | 3 | 4 | ## 功能介绍 5 | 针对给定的模型(含训练数据)和测试输入,实例级证据分析方法对每一训练样本赋值一个分数,用其表示该样本对预测文本的影响度。然后,对当前预测文本影响大的若干训练样本作为模型预测依赖证据,解释模型对预测。 6 | 7 | TrustAI提供了3种实例级证据分析方法,分别是: 8 | * [基于梯度的方法](https://proceedings.neurips.cc/paper/2018/file/8a7129b8f3edd95b7d969dfc2c8e9d9d-Paper.pdf):基于训练样本的梯度计算其对模型的影响度a, 通过影响度a、训练样本i的表示、测试样本t的表示来计算训练样本i对测试样本t的影响度。该方法计算得到的影响分数受a影响较大,即给出的证据是对模型影响大的样本,这样的样本可能是困难数据或脏数据,所以使用者可以根据这种方法识别脏数据。 9 | * [基于表示相似度方法](https://arxiv.org/pdf/2104.04128.pdf):计算训练样本i和测试样本t的表示的相似度,该相似度作为训练样本i对测试样本t的影响度。该相似度可通过cosine、KL散度、欧氏距离等多种方法实现。 10 | * [基于梯度相似度方法](https://arxiv.org/pdf/2102.05262.pdf):计算训练样本i和测试样本t的梯度的相似度,该相似度作为训练样本i对测试样本t的影响度。该相似度可通过cosine等方法实现。 11 | 12 | 13 | 14 | ## 使用示例 15 | TrustAI为所有实例级证据分析方法提供统一的使用接口。 16 | * 接口输入:训练数据、训练好的模型、和测试数据; 17 | * 接口输出:测试数据的实例级证据,含支持当前测试数据的证据(正影响)和不支持测试数据的证据(负影响)。 18 | 19 | 20 | 21 | #### 基于梯度的方法 22 | 调用代码: 23 | ```python 24 | from trustai.interpretation import RepresenterPointModel 25 | # initialization 26 | # 开发者需要传入模型及对应的训练数据,以及模型输出层中最后一层的layer name 27 | representer_model = RepresenterPointModel(model, train_data_loader, classifier_layer_name="classifier") 28 | # res为模型返回的结构化的结果,类型为list。list中的每一个元素对应一个测试实例的返回结果,该结果包括预测标签,正影响数据的索引,负影响数据的索引,正影响数据的分值和负影响数据的分值。 29 | res = [] 30 | for batch in test_dataloader: 31 | res += representer_model(batch, sample_num=3) 32 | ``` 33 | *注:返回证据数量由sample_num指定,若sample_num为-1则返回按影响度排序的所有训练数据。* 34 | 35 | 输出结果: 36 | ```txt 37 | 测试文本(来自情感分析任务): 本来不想评价了,但为了携程的携粉们,还是说一下,这称不上是九点,细说就真没必要了,就一个字:差 38 | 预测结果: 0(表示负向情感) 39 | 40 | 支持当前测试文本的训练样本(正影响样本): 41 | text: 感觉非常奇怪,这套书我明明都写了两次评论了,可我的当当始终提醒我对这套书写评论!晕啊!这是套很好的书,也不用我写几次评论吧! gold label: 1 score: 0.03509485349059105 42 | text: 1)背面少个螺丝钉,说是thinkpad都少,靠 2)键盘周围的壳不平整,按下去发现有:“滋啦滋啦”声音,我才意识到,那是个双面胶,按下去就不上来了,过会儿还是回弹上来,很明显仅靠双面胶是 粘不住的,你还不如拿502呢,起码这样粘得严实还能让我心里舒服(但是这样只是弥补质量问题),何必还弄个滋啦兹啦的声音,多闹心啊,(还有一地方用了双面胶,我换内存的时候发现键盘下部盖子左侧打不开,一直不敢用力 gold label: 1 score: 0.03008783608675003 43 | text: 用了6年的THINKPAD,一直认为是笔记本中最好的! 现在这台新的让我......哎!! gold label: 0 score: 0.029884012416005135 44 | 45 | 不支持当前测试文本的训练样本(负影响样本): 46 | text: 是LINUX系统 相当及其恶心 不知道这狗 日 的是什么想法 要强行逼我们使用啊 买了两台电脑 一个事VISTA系统 一个 是 LINUX 就没见一个XP的 网上销售这东西 最重要的是打架尽量不要涉及到售后服务这块 尽量是都搞好了相安无事 其实网上的售后服务比没有售后服务还差劲 我的THINKPAD SL400就是因为换货期间以为是键盘小问题就懒得换了 gold label: 1 score: -0.07112707197666168 47 | text: 盼了2周终于拿到本了,一开机就屏不亮,本人自己跑回总部退机,现在还在等着检测,说要等上15个工作日,呵呵,买个电脑容易吗?时间浪费的起吗?请问? gold label: 0 score: -0.07233154773712158 48 | text: 价格确实比较高,而且还没有早餐提供。 携程拿到的价格不好?还是自己保留起来不愿意让利给我们这些客户呢? 
到前台搞价格,430就可以了。 gold label: 1 score: -0.08243595063686371 49 | ``` 50 | 51 | 52 | #### 基于表示相似度的方法 53 | 调用代码: 54 | ```python 55 | from trustai.interpretation import FeatureSimilarityModel 56 | # initialization 57 | # 开发者需要传入模型及对应的训练数据,以及模型输出层中最后一层的layer name 58 | feature_sim_model = FeatureSimilarityModel(model, train_data_loader, classifier_layer_name="classifier") 59 | # 开发者可以通过sim_fn参数指定相似度计算方式,目前支持cos、dot、euc(分别是余弦距离,点积距离和欧式距离) 60 | #res为模型返回的结构化的结果,类型为list。list中的每一个元素对应一个测试实例的返回结果,该结果包括预测标签,正影响数据的索引,负影响数据的索引,正影响数据的分值和负影响数据的分值。 61 | res = [] 62 | for batch in test_dataloader: 63 | res += feature_sim_model(batch, sample_num=3, sim_fn='cos') 64 | ``` 65 | *注:返回证据数量由sample_num指定,若sample_num为-1则返回按影响度排序的所有训练数据。* 66 | 67 | 输出结果: 68 | ```txt 69 | 测试文本(来自情感分析任务): 没有光驱,重装Windows需要外接光驱,对于电脑新手会很麻烦(没什么人会用Linux吧) 70 | 预测结果: 0(负向情感) 71 | 72 | 支持当前测试文本的训练样本(正影响样本): 73 | text: Linux系统不太好用,平时习惯用Windows xp 系统,一下子用这个系统感觉很不习惯,建议开发或预装Windows xp系统. gold label: 0 score: 0.9393996000289917 74 | text: 1、机器较沉 2、VISTA用起来不习惯,且占系统较多 3、音频插口、右前侧的二个USB口在用鼠标时感觉手靠得太近了 gold label: 0 score: 0.9354583621025085 75 | text: vista系统下也没有无线网卡驱动,用驱动精灵可解决。 机器稍有点重。 散热有待改进。 gold label: 0 score: 0.9348428249359131 76 | 77 | 不支持当前测试文本的训练样本(负影响样本): 78 | text: “任务型教学”在我国外语教学界备受关注。国家教育部新《英语课程标准》将“倡导‘任务型’的教学途径,培养学生综合语言运用能力”写入教学建议。任务型教学被视为改革我国传统外语教学的良方。本书立足我国外语教学现状,全面分析了“任务型教学”的理论和实践基础、以实例说明“任务型教学”的具体操作步骤。为广大一线英语教师提供了教学和研究参考。 gold label: 1 score: -0.12422356754541397 79 | text: 当美国发生次贷危机乃至影响全世界以后,对于应对危机,我们国内的绝大多数专家对此都异口同声,观点基本雷同,而且又莫衷一是,人云亦云,本书的作者以其独特的视觉和丰富的知识,在书中告诉我们这次危机的来龙去脉,我们国家应该以怎样的方式去直面这次危机,如何转危为安,化危为机;作为普通读者也能从书中领会到一些对自己有益的知识。读完这本书以后,你更能体会到一种不一样的思维,非常值得一读。 gold label: 1 score: -0.128561332821846 80 | text: 我从06年开始几乎每月都有一次出差,山西很多酒店都住过了,印象最深的就是三晋国际,其他还有龙城国际,华苑宾馆,黄河京都,还有一个叫什么交通大厦的,下面我对这些酒店做个最真实、准确地点评: 三晋国际——这是我认为最让太原市骄傲的酒店,我们衡量一个酒店的最直接的就是你能够得到什么服务,在这家酒店里,我感觉到了家一般的照顾,第一次来这里,感冒了,嘴里冷不丁说了一句,服务生就听到了,然后熬了一碗姜汤到我房间,当然也是免费的,很感动;洗澡时,一不小心摔倒了,副总经理、总监等等都慰问了我,其实这也不完全是酒店的错,但是从那以后,我发现每个房间浴室都放置了防滑垫和塑料拖鞋;有一次我把袜子之类的放在洗手间了,谁知道我回来后竟然发现服务员帮我免费清洗了,还把我不小心掰断的心爱的梳子还用胶给我粘好了,还留了言给我,真的很让我意外也有点不敢相信!对一个出差特别频繁,时间特别紧张的人来说,办理入住和退房就是一个最让人烦躁的时间,但是我算过了,三晋国际前台办理退房、入住的时间没有超过一分钟!!!在北京都很难有这样的待遇!其他的,比如前台接待、门厅服务之类的就不用说了,真的很好; 当然我也有建议:1、酒店的被子能否换厚一点的,冬天冷啊;2、一些房间的电话没有免提,不是很方便;3、外面的电话打不进来,可能是酒店为了安全考虑吧,但还是希望能够有外线拨入的功能。 龙城国际——不知道五星级是谁给的评价?!酒店一般,还不如华苑宾馆,无法容忍的是,前台接待服务态度太差了!唯一的优点是,早餐挺好吃。 华苑宾馆——06、07年都挺好的,今天偶尔又住了一下,发现时间长了,枕头、被子不是很干净,其他倒是挺好的,服务态度、环境都还不错,早餐有点单一。 黄河京都——地方太偏了!看起来挺好,住进去不咋地,无法容忍的是,也给大家提个醒,我退房的时间整整用了29分钟,快半个钟头了,我差点晕倒!结帐的服务员只顾打电话,不理我。 交通大厦——噩梦般的酒店,我再也不会住了!!隔音效果太差,还不幸地让我听到了隔壁小两口的闺房密语,哈哈,让我坐噩梦的是,半夜不知道什么单位来查房,从好多房间带走了好多女孩子,好怕怕地说……还有就是前台一个戴眼镜的,白白的女孩子,态度可真差啊,郁闷! 太原还有好多酒店,可能我不会一一住到,但还是希望所有的酒店都能够像三晋国际一样,给山西人长脸! 
gold label: 1 score: -0.17390453815460205 81 | ``` 82 | 83 | 84 | #### 基于梯度的相似度方法示例 85 | 调用代码: 86 | 87 | ```python 88 | from trustai.interpretation import GradientSimilarityModel 89 | # initialization 90 | # 开发者需要传入模型及对应的训练数据,以及模型输出层中最后一层的layer name 91 | # 注意因为需要计算每一条数据对于模型参数的梯度,所以train_dataloader的batch_size需要设置为1,测试数据对应的dataloader的batch_size也需为1 92 | grad_sim_model = GradientSimilarityModel(model, train_data_loader, classifier_layer_name="classifier") 93 | # 开发者可以通过sim_fn参数指定相似度计算方式,目前支持cos、dot(分别是余弦距离,点积距离) 94 | # res为模型返回的结构化的结果,类型为list。list中的每一个元素对应一个测试实例的返回结果,该结果包括预测标签,正影响数据的索引,负影响数据的索引,正影响数据的分值和负影响数据的分值。 95 | res = [] 96 | for batch in test_dataloader: 97 | res += grad_sim_model(batch, sample_num=3, sim_fn='cos') 98 | ``` 99 | *注:返回证据数量由sample_num指定,若sample_num为-1则返回按影响度排序的所有训练数据。* 100 | 101 | 输出结果: 102 | ```txt 103 | 测试文本(来自情感分析任务): 没有光驱,重装Windows需要外接光驱,对于电脑新手会很麻烦(没什么人会用Linux吧) 104 | 预测结果: 0(负向情感) 105 | 106 | 支持当前测试文本的训练样本(正影响样本): 107 | text: Linux系统不太好用,平时习惯用Windows xp 系统,一下子用这个系统感觉很不习惯,建议开发或预装Windows xp系统. gold label: 0 score: 0.9395108222961426 108 | text: 1、机器较沉 2、VISTA用起来不习惯,且占系统较多 3、音频插口、右前侧的二个USB口在用鼠标时感觉手靠得太近了 gold label: 0 score: 0.9355786442756653 109 | text: vista系统下也没有无线网卡驱动,用驱动精灵可解决。 机器稍有点重。 散热有待改进。 gold label: 0 score: 0.9349631071090698 110 | 111 | 不支持当前测试文本的训练样本(负影响样本): 112 | text: 价格确实比较高,而且还没有早餐提供。 携程拿到的价格不好?还是自己保留起来不愿意让利给我们这些客户呢? 到前台搞价格,430就可以了。 gold label: 1 score: -0.49774348735809326 113 | text: 买机器送的移动硬盘2.5寸250G的,没开封,想卖出,感兴趣短息联系,北京13901019711 gold label: 1 score: -0.5244823694229126 114 | text: 买机器送的移动硬盘2.5寸250G的,没开封,想卖出,感兴趣短息联系,北京13901019711 gold label: 0 score: -0.5244823694229126 115 | ``` 116 | 117 | 118 | 详细示例见[examples](../../../examples/interpretation/example_level/)。 119 | -------------------------------------------------------------------------------- /trustai/interpretation/example_level/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """interpreter""" 15 | 16 | from .method import * -------------------------------------------------------------------------------- /trustai/interpretation/example_level/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
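# A minimal illustrative sketch of consuming the example-level results described in the README above:
# it maps the ExampleResult objects (pred_label, pos_indexes, neg_indexes, pos_scores, neg_scores,
# defined in common/data_class.py below) back to concrete training texts. It assumes `res`, `train_ds`
# and `test_ds` come from the README usage snippets, that `res` follows the order of `test_ds`, and that
# each dataset entry is a dict with a "text" field; the helper name `show_example_evidence` is
# hypothetical and not part of the TrustAI API.
def show_example_evidence(res, train_ds, test_ds, text_key="text"):
    """Print the supporting (positive) and opposing (negative) training examples for each test example."""
    for test_example, result in zip(test_ds, res):
        print("test text:", test_example[text_key])
        print("predicted label:", result.pred_label)
        for idx, score in zip(result.pos_indexes, result.pos_scores):
            print("  positive evidence:", train_ds[int(idx)][text_key], "score:", float(score))
        for idx, score in zip(result.neg_indexes, result.neg_scores):
            print("  negative evidence:", train_ds[int(idx)][text_key], "score:", float(score))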
-------------------------------------------------------------------------------- /trustai/interpretation/example_level/common/data_class.py: -------------------------------------------------------------------------------- 1 | """ 2 | data class 3 | """ 4 | 5 | from dataclasses import dataclass 6 | from typing import Any 7 | from typing import List 8 | from typing import Dict 9 | from typing import Tuple 10 | 11 | 12 | @dataclass 13 | class ExampleResult(object): 14 | pred_label: int 15 | pos_indexes: List[int] 16 | neg_indexes: List[int] 17 | pos_scores: List[float] 18 | neg_scores: List[float] -------------------------------------------------------------------------------- /trustai/interpretation/example_level/common/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Some useful functions.""" 15 | import paddle 16 | import paddle.nn.functional as F 17 | import numpy as np 18 | from .data_class import ExampleResult 19 | 20 | 21 | def get_sublayer(model, sublayer_name='classifier'): 22 | """ 23 | Get the sublayer named sublayer_name in model. 24 | Args: 25 | model (obj:`paddle.nn.Layer`): Any paddle model. 26 | sublayer_name (obj:`str`, defaults to classifier): The sublayer name. 27 | Returns: 28 | layer(obj:`paddle.nn.Layer.common.sublayer_name`):The sublayer named sublayer_name in model. 29 | """ 30 | for name, layer in model.named_children(): 31 | if name == sublayer_name: 32 | return layer 33 | 34 | 35 | def dot_similarity(inputs_a, inputs_b): 36 | """ 37 | calaculate dot-product similarity between the two inputs. 38 | """ 39 | return paddle.sum(inputs_a * inputs_b, axis=1) 40 | 41 | 42 | def cos_similarity(inputs_a, inputs_b, step=500000): 43 | """ 44 | calaculate cosine similarity between the two inputs. 45 | """ 46 | # Processing to avoid paddle bug 47 | start, end = 0, step 48 | res = [] 49 | while start < inputs_a.shape[0]: 50 | res.append(F.cosine_similarity(inputs_a[start:end], inputs_b.unsqueeze(0))) 51 | start = end 52 | end = end + step 53 | return paddle.concat(res, axis=0) 54 | 55 | 56 | def euc_similarity(inputs_a, inputs_b): 57 | """ 58 | calaculate euclidean similarity between the two inputs. 59 | """ 60 | return -paddle.linalg.norm(inputs_a - inputs_b.unsqueeze(0), axis=-1).squeeze(-1) 61 | 62 | 63 | def get_top_and_bottom_n_examples(scores, pred_label, sample_num=3): 64 | """ 65 | get n index of the highest and lowest score, return the structual result. 
66 | """ 67 | 68 | top_score, top_index = paddle.topk(scores, sample_num, axis=0, largest=True, sorted=True) 69 | 70 | bottom_score, bottom_index = paddle.topk(scores, sample_num, axis=0, largest=False, sorted=True) 71 | 72 | res = ExampleResult(pred_label=pred_label, 73 | pos_indexes=top_index.numpy(), 74 | neg_indexes=bottom_index.numpy(), 75 | pos_scores=top_score.numpy(), 76 | neg_scores=bottom_score.numpy()) 77 | 78 | return res 79 | -------------------------------------------------------------------------------- /trustai/interpretation/example_level/method/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """example method""" 15 | 16 | from .representer_point import RepresenterPointModel 17 | from .feature_similarity import FeatureSimilarityModel 18 | from .gradient_similarity import GradientSimilarityModel 19 | 20 | __all__ = ["RepresenterPointModel", "FeatureSimilarityModel", "GradientSimilarityModel"] -------------------------------------------------------------------------------- /trustai/interpretation/example_level/method/example_base_interpreter.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import functools 3 | 4 | import paddle 5 | 6 | from ...base_interpret import Interpreter 7 | from ..common.utils import get_sublayer 8 | 9 | 10 | class ExampleBaseInterpreter(Interpreter): 11 | """Interpreter is the base class for all interpretation algorithms. 12 | Args: 13 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions. 14 | device (str): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. default: None 15 | predict_fn(callabel: default=None): If the paddle_model prediction has special process, user can customize the prediction function. 16 | classifier_layer_name(str: default=classifier): Name of the classifier layer in paddle_model. 
17 | """ 18 | 19 | def __init__(self, 20 | paddle_model: callable, 21 | device: str = None, 22 | predict_fn=None, 23 | classifier_layer_name="classifier", 24 | **kwargs): 25 | Interpreter.__init__(self, paddle_model, device) 26 | self.paddle_model = paddle_model 27 | self.classifier_layer_name = classifier_layer_name 28 | self._build_predict_fn(predict_fn=predict_fn) 29 | 30 | def __call__(self, *args, **kwargs): 31 | return self.interpret(*args, **kwargs) 32 | 33 | def _build_predict_fn(self, predict_fn=None): 34 | if predict_fn is not None: 35 | self.predict_fn = functools.partial(predict_fn, 36 | classifier_layer_name=self.classifier_layer_name, 37 | paddle_model=self.paddle_model) 38 | return 39 | 40 | def predict_fn(inputs, classifier_layer_name=None, paddle_model=None): 41 | """predict_fn""" 42 | if paddle_model is None: 43 | paddle_model = self.paddle_model 44 | if classifier_layer_name is None: 45 | classifier_layer_name = self.classifier_layer_name 46 | 47 | cached_features = [] 48 | 49 | def forward_pre_hook(layer, input): 50 | """ 51 | Pre_hook for a given layer in model. 52 | """ 53 | cached_features.extend(input[0]) 54 | 55 | cached_logits = [] 56 | 57 | def forward_post_hook(layer, input, output): 58 | """ 59 | Post_hook for a given layer in model. 60 | """ 61 | cached_logits.append(output) 62 | 63 | classifier = get_sublayer(paddle_model, classifier_layer_name) 64 | 65 | forward_pre_hook_handle = classifier.register_forward_pre_hook(forward_pre_hook) 66 | forward_post_hook_handle = classifier.register_forward_post_hook(forward_post_hook) 67 | 68 | if isinstance(inputs, (tuple, list)): 69 | res = paddle_model(*inputs) # get logits, [bs, num_c] 70 | else: 71 | res = paddle_model(inputs) # get logits, [bs, num_c] 72 | 73 | forward_pre_hook_handle.remove() 74 | forward_post_hook_handle.remove() 75 | 76 | logits = cached_logits[-1] 77 | if len(logits.shape) < 2: 78 | logits = logits.unsqueeze(0) 79 | 80 | probas = paddle.nn.functional.softmax(cached_logits[-1], axis=1) # get probabilities. 81 | preds = paddle.argmax(probas, axis=1).tolist() # get predictions. 82 | return paddle.to_tensor(cached_features), probas, preds 83 | 84 | self.predict_fn = predict_fn 85 | 86 | @abc.abstractmethod 87 | def interpret(self, **kwargs): 88 | """Main function of the interpreter.""" 89 | raise NotImplementedError 90 | -------------------------------------------------------------------------------- /trustai/interpretation/example_level/method/feature_similarity.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | """ 3 | feature-based similarity method. 4 | cosine, cot and euc. 5 | """ 6 | import os 7 | import sys 8 | import functools 9 | import warnings 10 | 11 | import paddle 12 | import paddle.nn.functional as F 13 | from tqdm import tqdm 14 | 15 | from ..common.utils import get_sublayer, dot_similarity, cos_similarity, euc_similarity, get_top_and_bottom_n_examples 16 | from .example_base_interpreter import ExampleBaseInterpreter 17 | 18 | 19 | class FeatureSimilarityModel(ExampleBaseInterpreter): 20 | """ 21 | Feature-based similarity method for NLP tasks. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | paddle_model, 27 | train_dataloader, 28 | device=None, 29 | classifier_layer_name="classifier", 30 | predict_fn=None, 31 | cached_train_feature=None, 32 | ): 33 | """ 34 | Initialization. 35 | Args: 36 | paddle_model(callable): A model with ``forward``. 37 | train_dataloader(iterable): Dataloader of model's training data. 
38 | device(str: default=None): Device type, and it should be ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. 39 | classifier_layer_name(str: default=classifier): Name of the classifier layer in paddle_model. 40 | predict_fn(callabel: default=None): If the paddle_model prediction has special process, user can customize the prediction function. 41 | """ 42 | ExampleBaseInterpreter.__init__(self, paddle_model, device, predict_fn, classifier_layer_name) 43 | self.paddle_model = paddle_model 44 | self.classifier_layer_name = classifier_layer_name 45 | 46 | if cached_train_feature is not None and os.path.isfile(cached_train_feature): 47 | self.train_feature = paddle.load(cached_train_feature) 48 | else: 49 | self.train_feature, _ = self.extract_feature_from_dataloader(train_dataloader) 50 | if cached_train_feature is not None: 51 | try: 52 | paddle.save(self.train_feature, cached_train_feature) 53 | except IOError: 54 | import sys 55 | sys.stderr.write("save cached_train_feature fail") 56 | 57 | def interpret(self, data, sample_num=3, sim_fn="cos"): 58 | """ 59 | Select most similar and dissimilar examples for a given data using the `sim_fn` metric. 60 | Args: 61 | data(iterable): one batch of data to interpret. 62 | sample_num(int: default=3): the number of positive examples and negtive examples selected for each instance. Return all the training examples ordered by `influence score` if this parameter is -1. 63 | sim_fn(str: default=cos): the similarity metric to select examples. It should be ``cos``, ``dot`` or ``euc``. 64 | """ 65 | if sample_num == -1: 66 | sample_num = len(self.train_feature) 67 | 68 | val_feature, preds = self.extract_feature(self.paddle_model, data) 69 | if sim_fn == "dot": 70 | similarity_fn = dot_similarity 71 | elif sim_fn == "cos": 72 | similarity_fn = cos_similarity 73 | elif sim_fn == "euc": 74 | similarity_fn = euc_similarity 75 | else: 76 | raise ValueError(f"sim_fn only support ['dot', 'cos', 'euc'] in feature similarity, but gets `{sim_fn}`") 77 | res = [] 78 | preds = preds.tolist() 79 | for index in range(len(preds)): 80 | tmp = similarity_fn(self.train_feature, paddle.to_tensor(val_feature[index])) 81 | pred_label = preds[index] 82 | example_result = get_top_and_bottom_n_examples(tmp, pred_label, sample_num=sample_num) 83 | res.append(example_result) 84 | 85 | return res 86 | 87 | @paddle.no_grad() 88 | def extract_feature(self, paddle_model, data): 89 | """ 90 | extract feature from one batch of data. 91 | """ 92 | if self.paddle_model.training: 93 | self.paddle_model.eval() 94 | feature, _, pred = self.predict_fn(data) 95 | return paddle.to_tensor(feature), paddle.to_tensor(pred) 96 | 97 | def extract_feature_from_dataloader(self, dataloader): 98 | """ 99 | extract feature from data_loader. 100 | """ 101 | print("Extracting feature from given dataloader, it will take some time...") 102 | features, preds = [], [] 103 | 104 | for batch in tqdm(dataloader): 105 | feature, pred = self.extract_feature(self.paddle_model, batch) 106 | features.append(feature) 107 | preds.append(pred) 108 | return paddle.concat(features, axis=0), paddle.concat(preds, axis=0) -------------------------------------------------------------------------------- /trustai/interpretation/example_level/method/gradient_similarity.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | """ 3 | gradient-based similarity method. 4 | cosine and dot. 
5 | """ 6 | import os 7 | import functools 8 | import warnings 9 | 10 | import paddle 11 | import paddle.nn.functional as F 12 | from tqdm import tqdm 13 | 14 | from ..common.utils import get_sublayer, dot_similarity, cos_similarity, euc_similarity, get_top_and_bottom_n_examples 15 | from .example_base_interpreter import ExampleBaseInterpreter 16 | 17 | 18 | class GradientSimilarityModel(ExampleBaseInterpreter): 19 | """ 20 | Gradient-based similarity method for NLP tasks. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | paddle_model, 26 | train_dataloader, 27 | device=None, 28 | classifier_layer_name="classifier", 29 | predict_fn=None, 30 | criterion=None, 31 | cached_train_grad=None, 32 | ): 33 | """ 34 | Initialization. 35 | Args: 36 | paddle_model(callable): A model with ``forward``. 37 | train_dataloader(iterable): Dataloader of model's training data. 38 | device(str: default=None): Device type, and it should be ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. 39 | classifier_layer_name(str: default=classifier): Name of the classifier layer in paddle_model. 40 | predict_fn(callabel: default=None): If the paddle_model prediction has special process, user can customize the prediction function. 41 | criterion(paddle.nn.layer.loss: default=None): criterion to calculate model loss. 42 | cached_train_grad(str: default=None): Path of the cached train_dataloader gradient. In the first training time, it will take some time to generate the train_grad 43 | """ 44 | ExampleBaseInterpreter.__init__(self, paddle_model, device, predict_fn, classifier_layer_name) 45 | self.paddle_model = paddle_model 46 | self.classifier_layer_name = classifier_layer_name 47 | self.criterion = (criterion if criterion is not None else paddle.nn.loss.CrossEntropyLoss()) 48 | if cached_train_grad is not None and os.path.isfile(cached_train_grad): 49 | self.train_grad = paddle.load(cached_train_grad) 50 | else: 51 | self.train_grad, *_ = self.get_grad_from_dataloader(train_dataloader) 52 | if cached_train_grad is not None: 53 | try: 54 | paddle.save(self.train_grad, cached_train_grad) 55 | except IOError: 56 | import sys 57 | sys.stderr.write("save cached_train_grad fail") 58 | 59 | def interpret(self, data, sample_num=3, sim_fn="cos"): 60 | """ 61 | Select most similar and dissimilar examples for a given data using the `sim_fn` metric. 62 | Args: 63 | data(iterable): one batch of data to interpret. 64 | sample_num(int: default=3): the number of positive examples and negtive examples selected for each instance. Return all the training examples ordered by `influence score` if this parameter is -1. 65 | sim_fn(str: default=cos): the similarity metric to select examples. It should be ``cos`` or ``dot``. 66 | """ 67 | if sample_num == -1: 68 | sample_num = len(self.train_grad) 69 | 70 | val_feature, _, preds = self.get_grad(self.paddle_model, data) 71 | 72 | if sim_fn == "dot": 73 | similarity_fn = dot_similarity 74 | elif sim_fn == "cos": 75 | similarity_fn = cos_similarity 76 | else: 77 | raise ValueError(f"sim_fn only support ['dot', 'cos'] in gradient simmialrity, but gets `{sim_fn}`") 78 | res = [] 79 | preds = preds.tolist() 80 | for index in range(len(preds)): 81 | tmp = similarity_fn(self.train_grad, paddle.to_tensor(val_feature[index])) 82 | pred_label = preds[index] 83 | example_result = get_top_and_bottom_n_examples(tmp, pred_label, sample_num=sample_num) 84 | res.append(example_result) 85 | 86 | return res 87 | 88 | def get_grad(self, paddle_model, data): 89 | """ 90 | get grad from one batch of data. 
91 | """ 92 | if paddle_model.training: 93 | paddle_model.eval() 94 | if isinstance(data, (tuple, list)): 95 | assert len(data[0]) == 1, "batch_size must be 1" 96 | else: 97 | assert len(data) == 1, "batch_size must be 1" 98 | _, prob, pred = self.predict_fn(data) 99 | loss = self.criterion(prob, paddle.to_tensor(pred)) 100 | 101 | # adapt for paddle 2.4 102 | if paddle.version.full_version >= '2.4.0': 103 | for n, p in self.paddle_model.named_parameters(): 104 | if self.classifier_layer_name in n: 105 | p.retain_grads() 106 | 107 | loss.backward() 108 | grad = self._get_flat_param_grad() 109 | self._clear_all_grad() 110 | return paddle.to_tensor(grad), paddle.to_tensor(prob), paddle.to_tensor(pred) 111 | 112 | def get_grad_from_dataloader(self, data_loader): 113 | """ 114 | get grad from data_loader. 115 | """ 116 | print("Extracting gradient for given dataloader, it will take some time...") 117 | probas, preds, grads = [], [], [] 118 | 119 | for batch in tqdm(data_loader): 120 | grad, prob, pred = self.get_grad(self.paddle_model, batch) 121 | grads.append(grad) 122 | probas.append(prob) 123 | preds.append(pred) 124 | 125 | grads = paddle.concat(grads, axis=0) 126 | probas = paddle.concat(probas, axis=0) 127 | preds = paddle.concat(preds, axis=0) 128 | return grads, probas, preds 129 | 130 | def _get_flat_param_grad(self): 131 | """ 132 | get gradient 133 | """ 134 | return paddle.concat([ 135 | paddle.flatten(p.grad) for n, p in self.paddle_model.named_parameters() if self.classifier_layer_name in n 136 | ]).unsqueeze(axis=0) 137 | 138 | def _clear_all_grad(self): 139 | """ 140 | clear gradient 141 | """ 142 | for p in self.paddle_model.parameters(): 143 | p.clear_gradient() 144 | -------------------------------------------------------------------------------- /trustai/interpretation/python_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """python utils""" 15 | 16 | def versiontuple2tuple(v): 17 | """ref: https://stackoverflow.com/a/11887825/4834515""" 18 | return tuple(map(int, filter(str.isdigit, v.split(".")))) -------------------------------------------------------------------------------- /trustai/interpretation/token_level/README.md: -------------------------------------------------------------------------------- 1 | # 特征级证据分析 2 | 3 | ## 功能介绍 4 | 针对给定的模型和预测结果,特征级证据分析方法对测试输入中的每一特征(在以文本形式为输入的NLP任务中,特征为输入中的字或词)赋值一个分数,用其表示该特征对预测结果的影响度。然后影响度大的若干特征被选择作为模型预测依赖的证据,解释模型的预测结果。 5 | 6 | TrustAI提供了3种主流的特征级证据分析方法,分别是[Lime](https://arxiv.org/abs/1602.04938)、[Attention](https://arxiv.org/pdf/1902.10186.pdf)、[Integrated Gradient](https://arxiv.org/abs/1703.01365)方法。 7 | 8 | ## 使用示例 9 | 模型预测依赖证据的输出包含三步:待解释模型准备、测试数据准备、证据输出。 10 | 11 | 一、待解释模型准备 12 | ```python 13 | from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer 14 | model = ErnieForSequenceClassification.from_pretrained('ernie-1.0', num_classes=2) 15 | tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') 16 | ``` 17 | 18 | 二、测试数据准备:将输入测试文本转化为模型的输入 19 | ```python 20 | data = [{ "text": '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般'}] 21 | 22 | # preprocess_fn是开发者自定义的预处理函数,将文本转化为模型的输入格式 23 | model_inputs = preprocess_fn(data) 24 | ``` 25 | 26 | 三、证据获取:这里以Integrated Gradient方法为例 27 | ```python 28 | from trustai.interpretation.token_level import IntGradInterpreter 29 | ig = IntGradInterpreter(model) 30 | result = ig(preprocess_fn(data), steps=100) 31 | 32 | # attributions的长度与用户切词相同 33 | # 数值的大小表示对应特征对预测结果支持的程度 34 | print(result[0].attributions) 35 | # [ 0.02149865 0.13750568 0.03729623 0.20981199 0.11474895 0.00191162 36 | # 0.01166647 0.01939347 0.00499799 -0.01771647 0.05467343 -0.05574901 37 | # 0.0797711 0.02094495 -0.02661019 0.01423277 0.03983632 0.05040766 38 | # 0.03474617 0.10548145 -0.02475511 -0.06759283 -0.07004125 -0.0207927 39 | # 0.03771218 0.01511401 -0.01349011 0.01542336] 40 | ``` 41 | 42 | 43 | TrustAI支持将输出的重要度分数映射到更大粒度片段上,这里给出了一个基于jieba分词的使用示例。 44 | 45 | ```python 46 | import jieba 47 | 48 | from trustai.interpretation import get_word_offset 49 | # 分析的文本 50 | print(data[0]['text']) 51 | # 这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般 52 | 53 | # 由于示例使用的是ernie-1.0模型,若要与attribution对齐,需要在原始文本拼接[CLS], [SEP] 54 | context = "[CLS]" + " " + data[0]['text'] + " " + "[SEP]" 55 | # 开发者自定分词 56 | words = ["[CLS]"] + list(jieba.cut(data[0]['text'])) + ["[SEP]"] 57 | # ['[CLS]', '这个', '宾馆', '比较', '陈旧', '了', ',', '特价', '的', '房间', '也', '很', '一般', '。', '总体', '来说', '一般', '[SEP]'] 58 | 59 | # 获取用户自定切词的word_offset_map 60 | # word_offset_map表示开发者自定义切词在context中的字符的偏移位置 61 | word_offset_map = get_word_offset(context, words) 62 | # [[0, 5], [6, 8], [8, 10], [10, 12], [12, 14], [14, 15], [15, 16], [16, 18], [18, 19], [19, 21], [21, 22], [22, 23], [23, 25], [25, 26], [26, 28], [28, 30], [30, 32], [33, 38]] 63 | 64 | # 计算模型切词offset_map 65 | subword_offset_map = tokenizer.get_offset_mapping(context) 66 | # [(0, 1), (1, 3), (3, 4), (4, 5), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (33, 34), (34, 37), (37, 38)] 67 | 68 | # 将attributions对齐到words 69 | aligns = ig.alignment(result, [context], [batch_words], [word_offset_map], [subword_offset_map], special_tokens=["[CLS]", '[SEP]']) 70 | 71 | print(aligns[0].words) 72 | # ['[CLS]', '这个', '宾馆', '比较', '陈旧', '了', ',', '特价', '的', '房间', '也', '很', 
'一般', '。', '总体', '来说', '一般', '[SEP]'] 73 | print(aligns[0].word_attributions) 74 | # [0.021498650312423706, 0.17480190843343735, 0.3245609328150749, 0.013578088022768497, 0.02439146302640438, -0.01771647110581398, 0.05467343330383301, 0.024022094905376434, 0.020944949239492416, -0.012377424165606499, 0.03983632102608681, 0.05040765926241875, 0.14022761583328247, -0.024755112826824188, -0.13763408362865448, 0.01691947504878044, 0.001623895950615406, 0.015423357486724854] 75 | print(aligns[0].pred_label) 76 | # 0 77 | print(aligns[0].pred_proba) 78 | # [0.86797816 0.1320218 ] 79 | print(aligns[0].rationale) 80 | # (1, 2, 6, 11, 12) 81 | print(aligns[0].rationale_tokens) 82 | # ('这个', '宾馆', ',', '很', '一般') 83 | print(aligns[0].non_rationale) 84 | # (3, 4, 5, 7, 8, 9, 10, 13, 14, 15, 16) 85 | print(aligns[0].non_rationale_tokens) 86 | # ('比较', '陈旧', '了', '特价', '的', '房间', '也', '。', '总体', '来说', '一般') 87 | 88 | ``` 89 | 90 | 同时,TrustAI提供了可视化功能,调用代码如下: 91 | ```python 92 | # html为HTML格式的文本,可以保存为html文件 93 | html = visualize_text(VisualizationTextRecord(aligns[0], true_label=0)) 94 | ``` 95 | 96 | 可视化输出示例: 97 |

99 | 图1 可视化实例:实例来自情感分析任务 100 |

101 | 102 | 详细示例见[examples](../../../examples/interpretation/token_level)。 103 | -------------------------------------------------------------------------------- /trustai/interpretation/token_level/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """token-level""" 15 | 16 | from .method import * 17 | from .common import get_word_offset 18 | from .data_processor import * -------------------------------------------------------------------------------- /trustai/interpretation/token_level/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """init""" 15 | 16 | from .postprocess_attribution import * 17 | from .predict_functions import * 18 | -------------------------------------------------------------------------------- /trustai/interpretation/token_level/common/postprocess_attribution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """postprocess attribution""" 15 | 16 | import copy 17 | import warnings 18 | 19 | 20 | def get_word_offset(context, words): 21 | """get_word_offset""" 22 | pointer = 0 # point at the context 23 | offset_map = [] 24 | for i in range(len(words)): 25 | seg_start_idx = context.find(words[i], pointer) 26 | seg_end_idx = seg_start_idx + len(words[i]) 27 | offset_map.append([seg_start_idx, seg_end_idx]) 28 | pointer = seg_end_idx 29 | return offset_map 30 | 31 | 32 | def get_word_attributions(words, word_offset_map, subword_offset_map, attributions): 33 | """get_word_attributions""" 34 | result = [] 35 | 36 | pointer1 = 0 # point at the context 37 | pointer2 = 0 # point at the sorted_token array 38 | 39 | for i in range(len(word_offset_map)): 40 | # merge spcial offset position in subword_offset_map 41 | seg_start_idx, seg_end_idx = word_offset_map[i] 42 | cur_set = [] 43 | while pointer2 < len(subword_offset_map): 44 | while pointer2 < len(subword_offset_map) and subword_offset_map[pointer2][1] <= seg_start_idx: 45 | pointer2 += 1 46 | if subword_offset_map[pointer2][0] >= seg_end_idx: 47 | break 48 | cur_set.append(pointer2) 49 | pointer2 += 1 50 | result.append([cur_set, i, words[i]]) 51 | pointer2 -= 1 52 | pointer1 = seg_end_idx 53 | word_attributions = merge_attributions(result, attributions) 54 | return word_attributions 55 | 56 | 57 | def get_rationales_and_non_ratioanles(words, word_attributions, special_tokens=[], rationale_num=5): 58 | """"get_rationales_and_non_ratioanles""" 59 | assert len(words) == len(word_attributions) 60 | 61 | sorted_rationale_ids = list(sorted(range(len(words)), key=lambda i: word_attributions[i], reverse=True)) 62 | rationale_tokens = [] 63 | rationale_ids = [] 64 | non_rationale_tokens = [] 65 | non_rationale_ids = [] 66 | for idx in sorted_rationale_ids: 67 | if words[idx] in special_tokens: 68 | continue 69 | if len(rationale_ids) < rationale_num: 70 | rationale_ids.append(idx) 71 | rationale_tokens.append(words[idx]) 72 | else: 73 | non_rationale_ids.append(idx) 74 | non_rationale_tokens.append(words[idx]) 75 | rationale_ids, rationale_tokens = zip(*list(sorted(zip(rationale_ids, rationale_tokens), key=lambda ele: ele[0]))) 76 | if len(non_rationale_ids) == 0: 77 | non_rationale_ids = [] 78 | non_rationale_tokens = [] 79 | else: 80 | non_rationale_ids, non_rationale_tokens = zip( 81 | *list(sorted(zip(non_rationale_ids, non_rationale_tokens), key=lambda ele: ele[0]))) 82 | return { 83 | "rationale_ids": rationale_ids, 84 | "rationale_tokens": rationale_tokens, 85 | "non_rationale_ids": non_rationale_ids, 86 | "non_rationale_tokens": non_rationale_tokens 87 | } 88 | 89 | 90 | def merge_subword_special_idx(words, word_offset_map, subword_offset_map, special_tokens): 91 | """merge_subword_special_idx""" 92 | spcial_token_ids = [] 93 | for idx, word in enumerate(words): 94 | if word in special_tokens: 95 | spcial_token_ids.append(idx) 96 | special_token_offset = [] 97 | special_token_offset = [word_offset_map[idx] for idx in spcial_token_ids] 98 | subword_start_ids, subword_end_ids = list(zip(*subword_offset_map)) 99 | merge_idx = [] 100 | for token_start, token_end in special_token_offset: 101 | try: 102 | sub_start_id = subword_start_ids.index(token_start) 103 | sub_end_id = subword_end_ids.index(token_end) 104 | merge_idx.append([sub_start_id, sub_end_id]) 105 | except: 106 | warnings.warn("Error offset mapping! 
Please check your offset map.") 107 | new_subword_offset_map = copy.deepcopy(subword_offset_map) 108 | for merge_start, merge_end in merge_idx[::-1]: 109 | spceial_toekn_start_id = new_subword_offset_map[merge_start][0] 110 | spceial_toekn_end_id = new_subword_offset_map[merge_end][1] 111 | del new_subword_offset_map[merge_start:merge_end + 1] 112 | new_subword_offset_map.insert(merge_start, [spceial_toekn_start_id, spceial_toekn_end_id]) 113 | return new_subword_offset_map 114 | 115 | 116 | def merge_attributions(match_list, attributions): 117 | """merge_attributions""" 118 | over_all = [] 119 | miss = 0 120 | for i in match_list: 121 | over_all.extend(i[0]) 122 | 123 | attribution_dic = {} 124 | for i in range(len(attributions)): 125 | split_time = over_all.count(i) 126 | if split_time: 127 | attribution_dic[i] = attributions[i] / split_time 128 | else: 129 | attribution_dic[i] = 0.0 130 | if miss != 0: 131 | print(miss) 132 | 133 | attributions = [] 134 | for i in range(len(match_list)): 135 | cur_attribution = 0.0 136 | for j in match_list[i][0]: 137 | if j == -1: 138 | continue 139 | cur_attribution += attribution_dic[j] 140 | attributions.append(cur_attribution) 141 | return attributions -------------------------------------------------------------------------------- /trustai/interpretation/token_level/data_processor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """init""" 15 | 16 | from .visualizer import * 17 | from .data_class import * -------------------------------------------------------------------------------- /trustai/interpretation/token_level/data_processor/data_class.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """data class""" 15 | 16 | from dataclasses import dataclass 17 | from typing import Any 18 | from typing import List 19 | from typing import Dict 20 | from typing import Tuple 21 | 22 | 23 | @dataclass 24 | class TokenResult(object): 25 | attributions: List[float] 26 | pred_label: float 27 | pred_proba: List[float] 28 | 29 | 30 | @dataclass 31 | class AttentionResult(TokenResult): 32 | pass 33 | 34 | 35 | @dataclass 36 | class GradShapResult(TokenResult): 37 | pass 38 | 39 | 40 | @dataclass 41 | class IGResult(TokenResult): 42 | error_percent: float 43 | 44 | 45 | @dataclass 46 | class LimeResult(TokenResult): 47 | lime_score: float 48 | 49 | 50 | @dataclass 51 | class NormLIMEResult(object): 52 | # {id : (attribution, word_idx)} 53 | attributions: Dict[int, Tuple[float, int]] 54 | 55 | 56 | @dataclass 57 | class InterpretResult(object): 58 | words: List[str] 59 | word_attributions: List[float] 60 | pred_label: float 61 | pred_proba: List[float] 62 | rationale: List[int] 63 | non_rationale: List[int] 64 | rationale_tokens: List[str] 65 | non_rationale_tokens: List[str] 66 | rationale_pred_proba: float = None 67 | non_rationale_pred_proba: float = None 68 | -------------------------------------------------------------------------------- /trustai/interpretation/token_level/data_processor/visualizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """visualization function""" 15 | 16 | from IPython.core.display import display, HTML 17 | 18 | import numpy as np 19 | 20 | from .data_class import TokenResult 21 | from .data_class import InterpretResult 22 | 23 | 24 | class VisualizationTextRecord(object): 25 | """ 26 | A record for text visulization. 
27 | Part of the code is modified from https://github.com/pytorch/captum/blob/master/captum/attr/_utils/visualization.py 28 | """ 29 | 30 | def __init__(self, interpret_res, true_label=None, words=None): 31 | if words is not None: 32 | self.words = words 33 | else: 34 | self.words = interpret_res.words 35 | self.pred_label = interpret_res.pred_label 36 | if isinstance(self.pred_label, np.ndarray): 37 | self.pred_proba = [ 38 | round(proba[label], 2) for proba, label in zip(interpret_res.pred_proba, self.pred_label) 39 | ] 40 | self.pred_label = self.pred_label.tolist() 41 | else: 42 | self.pred_proba = interpret_res.pred_proba[self.pred_label] 43 | self.true_label = true_label if true_label is not None else '' 44 | 45 | # Normalization for attributions 46 | if isinstance(interpret_res, InterpretResult): 47 | word_attributions = interpret_res.word_attributions 48 | else: 49 | word_attributions = interpret_res.attributions 50 | _max = max(word_attributions) 51 | _min = min(word_attributions) 52 | self.word_attributions = [(word_imp - _min) / (_max - _min) for word_imp in word_attributions] 53 | 54 | def record_html(self): 55 | """change all informations to html""" 56 | return "".join([ 57 | "", 58 | self._format_class(self.true_label), 59 | self._format_class(self.pred_label, self.pred_proba), 60 | self._format_word_attributions(), 61 | "", 62 | ]) 63 | 64 | def _format_class(self, label, prob=None): 65 | if prob is None: 66 | return '{label}'.format(label=label) 67 | elif isinstance(prob, list): 68 | return '{label} ({prob})'\ 69 | .format(label=str(label), prob=str(prob)) 70 | else: 71 | return '{label} ({prob:.2f})'\ 72 | .format(label=label, prob=prob) 73 | 74 | def _format_word_attributions(self): 75 | tags = [""] 76 | for word, importance in zip(self.words, self.word_attributions[:len(self.words)]): 77 | color = self._background_color(importance) 78 | unwrapped_tag = ' {word}\ 80 | ' \ 81 | .format(color=color, word=word) 82 | tags.append(unwrapped_tag) 83 | tags.append("") 84 | return "".join(tags) 85 | 86 | def _background_color(self, importance): 87 | importance = max(-1, min(1, importance)) 88 | if importance > 0: 89 | hue = 120 90 | sat = 75 91 | lig = 100 - int(30 * importance) 92 | else: 93 | hue = 0 94 | sat = 75 95 | lig = 100 - int(-40 * importance) 96 | return "hsl({}, {}%, {}%)".format(hue, sat, lig) 97 | 98 | 99 | def visualize_text(text_records): 100 | """visualize text""" 101 | html = [""] 102 | rows = ["" 103 | "" 104 | ""] 105 | for record in text_records: 106 | rows.append(record.record_html()) 107 | html.append("".join(rows)) 108 | html.append("
Golden LabelPredicted Label (Prob)Important scores
") 109 | html = HTML("".join(html)) 110 | display(html) 111 | return html.data 112 | 113 | 114 | def visualize(interpret_res, true_labels=None, words=None): 115 | """ 116 | interpret_res: List[TokenResult, InterpretResult], Interpretability Results 117 | true_labels: List[int], Golden labels for test examples 118 | words: List[List[str]], The word segmentation result of the test examples, the length of words is equal to the attributions 119 | """ 120 | result_num = len(interpret_res) 121 | if true_labels is None: 122 | true_labels = [None] * result_num 123 | if words is None: 124 | words = [None] * result_num 125 | records = [] 126 | for i in range(result_num): 127 | records.append(VisualizationTextRecord(interpret_res[i], true_label=true_labels[i], words=words[i])) 128 | html = visualize_text(records) 129 | return html -------------------------------------------------------------------------------- /trustai/interpretation/token_level/method/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """all method""" 15 | 16 | from .attention import AttentionInterpreter 17 | from .gradient_shap import GradShapInterpreter 18 | from .integrated_gradients import IntGradInterpreter 19 | from .lime import LIMEInterpreter 20 | from .norm_lime import NormLIMEInterpreter 21 | 22 | __all__ = [ 23 | "AttentionInterpreter", "GradShapInterpreter", "IntGradInterpreter", "LIMEInterpreter", "NormLIMEInterpreter" 24 | ] -------------------------------------------------------------------------------- /trustai/interpretation/token_level/method/attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """attention interpreter""" 15 | 16 | import paddle 17 | 18 | from ..data_processor import AttentionResult 19 | from .base_interpret import TokenInterpreter 20 | 21 | 22 | class AttentionInterpreter(TokenInterpreter): 23 | """ 24 | Attention Interpreter for NLP tasks. 25 | """ 26 | 27 | def __init__(self, paddle_model, device=None, attention_name=None, predict_fn=None) -> None: 28 | """ 29 | Args: 30 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions. 
31 | device (str, optional): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. Default: None. 32 | attention_name(str, optional): The layer name of attention. The correct name of embedding can be found through ``print(model)``. Default: None. 33 | predict_fn(callable, optional): If the paddle_model prediction has special process, the user can customize the prediction function. Default: None. 34 | """ 35 | TokenInterpreter.__init__(self, paddle_model, device) 36 | 37 | # build predict function 38 | self._build_predict_fn(attention_name=attention_name, predict_fn=predict_fn) 39 | 40 | def interpret(self, data): 41 | """Main function of the interpreter. 42 | Args: 43 | data ([type]): The inputs of the paddle_model. 44 | 45 | Returns: 46 | List[AttentionResult]: a list of predicted labels, probabilities, and interpretations. 47 | """ 48 | 49 | if isinstance(data, (tuple, list)): 50 | bs = data[0].shape[0] 51 | else: 52 | bs = data.shape[0] 53 | 54 | attributions, pred_label, pred_proba = self._attention_interpret(data) 55 | 56 | # returns 57 | rets = [] 58 | for i in range(bs): 59 | attresult = AttentionResult(attributions=attributions[i], 60 | pred_label=pred_label[i], 61 | pred_proba=pred_proba[i]) 62 | rets.append(attresult) 63 | return rets 64 | 65 | def _build_predict_fn(self, attention_name=None, predict_fn=None): 66 | assert attention_name is not None or \ 67 | predict_fn is not None, "At least One of attention_name and predict_fn is not None." 68 | 69 | if attention_name is None: 70 | self.predict_fn = predict_fn 71 | return 72 | 73 | def predict_fn(inputs, paddle_model=None): 74 | if paddle_model is None: 75 | paddle_model = self.paddle_model 76 | target_feature_map = [] 77 | 78 | def hook(layer, input, output): 79 | target_feature_map.append(output) 80 | return output 81 | 82 | hooks = [] 83 | for name, v in paddle_model.named_sublayers(): 84 | if attention_name in name: 85 | h = v.register_forward_post_hook(hook) 86 | hooks.append(h) 87 | 88 | if isinstance(inputs, (tuple, list)): 89 | logits = paddle_model(*inputs) # get logits, [bs, num_c] 90 | else: 91 | logits = paddle_model(inputs) # get logits, [bs, num_c] 92 | 93 | bs = logits.shape[0] 94 | for h in hooks: 95 | h.remove() 96 | 97 | probas = paddle.nn.functional.softmax(logits, axis=1) # get probabilities. 98 | preds = paddle.argmax(probas, axis=1) # get predictions. 99 | # logits or probas 100 | preds = preds.reshape((bs, )) 101 | attention = target_feature_map[0].sum(1)[:, 0] 102 | return attention.numpy(), preds.numpy(), probas.numpy() 103 | 104 | self.predict_fn = predict_fn 105 | 106 | def _attention_interpret(self, data) -> tuple: 107 | attentions, labels, probas = self.predict_fn(data, paddle_model=self.paddle_model) 108 | return attentions, labels, probas 109 | -------------------------------------------------------------------------------- /trustai/interpretation/token_level/method/base_interpret.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TokenInterpreter""" 15 | 16 | import abc 17 | 18 | from ..data_processor import InterpretResult 19 | from ..common import merge_subword_special_idx 20 | from ..common import get_word_attributions 21 | from ..common import get_rationales_and_non_ratioanles 22 | from ...base_interpret import Interpreter 23 | 24 | 25 | class TokenInterpreter(Interpreter): 26 | """ 27 | Interpreter is the base class for all interpretation algorithms. 28 | """ 29 | 30 | def __init__(self, *args, **akwargs): 31 | Interpreter.__init__(self, *args, **akwargs) 32 | 33 | def __call__(self, *args, **kwargs): 34 | return self.interpret(*args, **kwargs) 35 | 36 | @abc.abstractmethod 37 | def interpret(self, **kwargs): 38 | """Main function of the interpreter.""" 39 | raise NotImplementedError 40 | 41 | @abc.abstractmethod 42 | def _build_predict_fn(self, **kwargs): 43 | """Build self.predict_fn for interpreters.""" 44 | raise NotImplementedError 45 | 46 | def alignment(self, 47 | interpret_results, 48 | contexts, 49 | batch_words, 50 | word_offset_maps, 51 | subword_offset_maps, 52 | special_tokens=[], 53 | rationale_num=5): 54 | """Align the subword's attributions to the word. Return top words with the top ``rationale_num`` as rationale and the other words as non-rationale. 55 | Args: 56 | interpret_results ([data_class]): The Interpreter functions ouputs, like ``AttentionResult``, ``LIMEResult`` etc. 57 | contexts ([str]): The input text with speical_tokens to tokenizer, like ``[CLS] How are you? [SEP]``. 58 | batch_words ([[str]]): The word segmentation resutls of the contexts. 59 | word_offset_maps ([(int, int)]): The offset mapping of word segationment. 60 | subword_offset_maps ([(int, int)]): The offset mapping of subwords. 61 | special_tokens ([str], optional): The speical tokens which not be extracted as rationales. 62 | rationale_num (int, optional): The number of rationales. Default: 5 63 | Returns: 64 | List[InterpretResult]: a list of predicted labels, probabilities, interpretations, rationales etc. 65 | """ 66 | 67 | result = [] 68 | assert len(contexts) == len(batch_words) == len(word_offset_maps) == len(subword_offset_maps) == len( 69 | interpret_results 70 | ), f"The lenght of contexts, batch_words, word_offset_maps, subword_offset_maps, interpret_results should be equal." 
71 | 72 | for i in range(len(contexts)): 73 | words = batch_words[i] 74 | context = contexts[i] 75 | word_offset_map = word_offset_maps[i] 76 | subword_offset_map = subword_offset_maps[i] 77 | interpret_result = interpret_results[i] 78 | assert subword_offset_map[-1][1] == word_offset_map[-1][ 79 | 1], "error offset_map, please check word_offset_maps and subword_offset_maps" 80 | 81 | # merge speical tokens for subword_offset_map 82 | subword_offset_map = merge_subword_special_idx(words, word_offset_map, subword_offset_map, special_tokens) 83 | 84 | attributions = interpret_result.attributions 85 | pred_label = interpret_result.pred_label 86 | pred_proba = interpret_result.pred_proba 87 | 88 | # get word attributions 89 | word_attributions = get_word_attributions(words, word_offset_map, subword_offset_map, attributions) 90 | # get ratioanles and non-rationales 91 | ratioanle_result = get_rationales_and_non_ratioanles(words, 92 | word_attributions, 93 | special_tokens=special_tokens, 94 | rationale_num=rationale_num) 95 | interpret_result = InterpretResult(words=words, 96 | word_attributions=word_attributions, 97 | pred_label=pred_label, 98 | pred_proba=pred_proba, 99 | rationale=ratioanle_result['rationale_ids'], 100 | non_rationale=ratioanle_result['non_rationale_ids'], 101 | rationale_tokens=ratioanle_result['rationale_tokens'], 102 | non_rationale_tokens=ratioanle_result['non_rationale_tokens']) 103 | result.append(interpret_result) 104 | return result -------------------------------------------------------------------------------- /trustai/interpretation/token_level/method/gradient_shap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """GradShapInterpreter""" 15 | 16 | from ..data_processor import GradShapResult 17 | from .base_interpret import TokenInterpreter 18 | 19 | 20 | class GradShapInterpreter(TokenInterpreter): 21 | """A wrap class of interpretdl.GradShapInterpreter, please refer to ``interpretdl/interpreter/gradient_shap.py`` for details""" 22 | 23 | def __init__(self, 24 | paddle_model, 25 | device='gpu', 26 | n_samples=5, 27 | noise_amount=0.1, 28 | embedding_name="word_embeddings") -> None: 29 | """ 30 | Args: 31 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions. 32 | device (str, optional): The device used for running `paddle_model`, options: ``cpu``, ``gpu``. Default: gpu. 33 | n_samples (int, optional): [description]. Defaults to 5. 34 | noise_amount (float, optional): Noise level of added noise to the embeddings. 35 | The std of Guassian random noise is ``noise_amount * embedding.mean() * (x_max - x_min)``. Default: 0.1 36 | embedding_name (str, optional): name of the embedding layer at which the noises will be applied. 37 | Defaults to 'word_embeddings'. The correct name of embedding can be found through ``print(model)``. 
38 | """ 39 | TokenInterpreter.__init__(self, paddle_model, device) 40 | 41 | # build predict function 42 | self.gradshap = self._build_predict_fn(paddle_model, device) 43 | 44 | self.n_samples = n_samples 45 | self.noise_amount = noise_amount 46 | self.embedding_name = embedding_name 47 | 48 | def interpret(self, data): 49 | """Main function of the interpreter. 50 | Args: 51 | data ([type]): The inputs of the paddle_model. 52 | labels ([type], optional): The target label to analyze. If None, the most likely label will be used. Default: None. 53 | Returns: 54 | List[GradShapResult]: a list of predicted labels, probabilities and interpretations. 55 | """ 56 | 57 | if isinstance(data, (tuple, list)): 58 | bs = data[0].shape[0] 59 | else: 60 | bs = data.shape[0] 61 | 62 | pred_label, pred_proba, attributions = self.gradshap.interpret(data, 63 | n_samples=self.n_samples, 64 | noise_amount=self.noise_amount, 65 | embedding_name=self.embedding_name, 66 | return_pred=True) 67 | # returns 68 | rets = [] 69 | for i in range(bs): 70 | shapresult = GradShapResult(attributions=attributions[i], 71 | pred_label=pred_label[i], 72 | pred_proba=pred_proba[i]) 73 | rets.append(shapresult) 74 | return rets 75 | 76 | def _build_predict_fn(self, paddle_model, device='gpu'): 77 | try: 78 | from interpretdl import GradShapNLPInterpreter 79 | except ImportError as e: 80 | import sys 81 | sys.stderr.write( 82 | '''Warning with import interpretdl: please install interpretdl firstly. cmd: pip install -U interpretdl''' 83 | ) 84 | raise e 85 | 86 | return GradShapNLPInterpreter(paddle_model, device) 87 | -------------------------------------------------------------------------------- /trustai/interpretation/token_level/method/norm_lime.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """norm lime""" 15 | 16 | from ..data_processor import NormLIMEResult 17 | from .base_interpret import TokenInterpreter 18 | 19 | 20 | class NormLIMEInterpreter(TokenInterpreter): 21 | """A wrap class of interpretdl.NormLIMENLPInterpreter, please refer to ``interpretdl/interpreter/_normlime_base.py`` for details""" 22 | 23 | def __init__(self, paddle_model, preprocess_fn, unk_id, pad_id=None, device=None, batch_size=50) -> None: 24 | """ 25 | Args: 26 | paddle_model (callable): A model with ``forward`` and possibly ``backward`` functions. 27 | preprocess_fn (Callable): A user-defined function that input raw string and outputs the a tuple of inputs to feed into the NLP model. 28 | unk_id (int): The word id to replace occluded words. Typical choices include "", , and . 29 | pad_id (int or None): The word id used to pad the sequences. If None, it means there is no padding. Default: None. 30 | device (str, optional): The device used for running `paddle_model`, options: ``cpu``, ``gpu``, ``gpu:0``, ``gpu:1`` etc. Default: None. 
31 | batch_size (int, optional): Number of samples to forward each time. Default: 50 32 | """ 33 | TokenInterpreter.__init__(self, paddle_model, device) 34 | 35 | # build predict function 36 | self.normlime = self._build_predict_fn(paddle_model, device) 37 | 38 | self.batch_size = batch_size 39 | self.preprocess_fn = preprocess_fn 40 | self.unk_id = unk_id 41 | self.pad_id = pad_id 42 | 43 | def interpret(self, data, num_samples=500, temp_data_file='all_lime_weights.npz', save_path='normlime_weights.npy'): 44 | """Main function of the interpreter. 45 | Args: 46 | data ([type]): The inputs of the paddle_model. 47 | labels ([type], optional): The target label to analyze. If None, the most likely label will be used. Default: None. 48 | num_samples (int, optional): LIME sampling numbers. Larger number of samples usually gives more accurate interpretation. Default: 1000 49 | temp_data_file (str, optinal): The .npz file to save/load the dictionary where key is word ids joined by '-' and value is another dictionary with lime weights. Default: 'all_lime_weights.npz' 50 | save_path (str, optional): The .npy path to save the normlime weights. It is a dictionary where the key is label and value is segmentation ids with their importance. Default: 'normlime_weights.npy' 51 | 52 | Returns: 53 | [NormLIMEResult] NormLIME weights: {label_i: weights on features} 54 | 55 | """ 56 | 57 | normlime_weights = self.normlime.interpret(data, 58 | self.preprocess_fn, 59 | unk_id=self.unk_id, 60 | pad_id=self.pad_id, 61 | num_samples=num_samples, 62 | batch_size=self.batch_size, 63 | temp_data_file=temp_data_file, 64 | save_path=save_path) 65 | 66 | normresult = NormLIMEResult(attributions=normlime_weights) 67 | return normresult 68 | 69 | def _build_predict_fn(self, paddle_model, device='gpu'): 70 | try: 71 | from interpretdl import NormLIMENLPInterpreter 72 | except ImportError as e: 73 | import sys 74 | sys.stderr.write( 75 | '''Warning with import interpretdl: please install interpretdl firstly. 
cmd: pip install -U interpretdl''' 76 | ) 77 | raise e 78 | return NormLIMENLPInterpreter(paddle_model, device) 79 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | # 应用案例 2 | 基于对模型预测依赖证据的分析,TrustAI提供了模型缺陷识别方案及对应的优化方案。在该目录,我们通过具体实例介绍几种方案,一是希望开发者可直接利用这些方案对其任务模型进行优化,二是希望启发研究者探索证据分析的更多价值。 3 | * 训练数据缺陷识别及针对性优化策略 4 | * [训练数据覆盖不足识别及有效数据增强](#训练数据覆盖不足识别及有效数据增强) 5 | * [训练数据中脏数据识别及标注修正](#训练数据中脏数据识别及标注修正) 6 | * [训练数据分布偏置识别及偏置消除](#训练数据偏置识别及偏置消除) 7 | * [数据权重修正](#数据权重修正) 8 | * [数据分布修正](#数据分布修正) 9 | * 基于证据指导的预测机制优化 10 | * [证据识别及基于证据的预测](#证据识别及基于证据的预测) 11 | 12 | 同样地,我们也探索了证据分析的其他价值,如: 13 | * [基于证据指导的模型增强方案](#基于证据指导的模型增强方案) 14 | * [基于证据指导的预测错误数据识别](#基于证据指导的预测错误数据识别) 15 | 16 | 17 | ## 训练数据覆盖不足识别及有效数据增强 18 | ### 方法介绍 19 | 训练数据覆盖不足会导致模型在对应的测试数据上表现不好。数据扩充是提升模型效果首选方法,然而数据标注是一个费时费力的工作,如何标注更少的数据带来更大的效果提升是大多数NLP开发者面临的难题。 20 | 21 | TrustAI可识别因训练数据覆盖不足而导致的预测效果差的测试样本(这些样本构成的集合称为目标集),并能帮助开发者从未标注数据中选择有效数据进行标注,提高训练数据对目标集的覆盖度,进而提升模型效果。 22 | 23 | ### 方法效果 24 | 25 | 由于标注数据成本高昂,下表给出了基于相似度计算任务LCQMC数据集进行的模拟实验效果。实验基于ERNIE-3.0-base-zh在LCQMC训练数据上微调得到模型,在LCQMC测试集和DuQM鲁棒性数据集上进行效果评估,评估指标为准确率。 26 | 27 | 28 | | 数据集 | 数据量 | LCQMCdev | LCQMCtest | DuQM | 目标集 | 29 | | :-------: | :-------: | :-----: | :-----: |:-----: |:-----: | 30 | | 基线 | 5000 | 86.42% | 84.49% | 69.17% | 55.19% | 31 | | 基线 + 随机1000条 | 6000 | 86.76% | 85.05% | 69.23% | 55.20% | 32 | | 基线 + 策略1000条 | 6000 | 87.04% | 85.58% | 70.20% | 69.60% | 33 | 34 | 实验结论:增加20%有效训练数据,该方案将目标集效果提升14.40%(随机选择同等规模数据加入训练数据,效果仅提升0.01%);同时在整个测试集上,该方案将效果提升1.03%(随机选择方案仅提升0.06%)。 35 | 36 | 同时,该策略接入了PaddleNLP的分类系统,在多分类、多标签分类、及层次分类任务上完成了效果验证,效果如图1所示:通过TrustAI提供的有效数据选择策略,增加10%训练数据带来的效果提升大于随机增加20%训练数据的效果,也就是说,该策略能够**节省一半标注成本**。 37 |

38 |
39 | 图1 在三个常见分类任务上应用“数据覆盖不足识别及有效数据增强”策略的效果 40 |

41 | 42 | 43 | 详细方案和实验介绍见应用示例[训练数据覆盖不足识别及有效数据增强](./sparse_data_identification)。 44 |
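To make the selection idea above concrete, here is a minimal, self-contained sketch: rank unlabeled examples by their feature similarity to the poorly-predicted target set and send the closest ones out for annotation. It uses plain NumPy on pre-extracted sentence embeddings (`target_feats` and `unlabeled_feats` are assumed inputs, e.g. [CLS] vectors from the fine-tuned model); the actual tutorial relies on TrustAI's example-level interpreters rather than this simplified cosine-similarity ranking.

```python
# Minimal sketch of "effective data selection": pick unlabeled examples closest to the target set.
import numpy as np

def select_candidates(target_feats, unlabeled_feats, top_k=1000):
    """Return indices of unlabeled examples most similar to any target-set example."""
    t = target_feats / np.linalg.norm(target_feats, axis=1, keepdims=True)
    u = unlabeled_feats / np.linalg.norm(unlabeled_feats, axis=1, keepdims=True)
    sim = u @ t.T                      # (num_unlabeled, num_target) cosine similarities
    score = sim.max(axis=1)            # closeness of each unlabeled example to the target set
    return np.argsort(-score)[:top_k]  # annotate these examples first

# Random features stand in for real embeddings here.
rng = np.random.default_rng(0)
print(select_candidates(rng.normal(size=(50, 768)), rng.normal(size=(500, 768)), top_k=10))
```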
45 | 46 | 47 | ## 训练数据中脏数据识别及标注修正 48 | ### 方法介绍 49 | 训练数据标注质量对模型效果有较大影响,往往会成为模型效果提升的瓶颈。但当标注数据规模较大时,数据检查就成为一个难题。 50 | 51 | TrustAI提供了脏数据(即标注质量差的数据)自动识别功能,降低人工检查数据成本。如图2所示,在三个公开数据集上,TrustAI提供的脏数据识别策略,其识别的脏数据比例远高于随机选择策略。 52 | 53 | 54 |

55 |
56 | 图2 在3个数据集上,不同策略识别的脏数据效果 57 |

58 | 59 | ### 方法效果 60 | 61 | 下表给出了基于相似度计算任务LCQMC数据集上进行的实验效果。实验基于ERNIE-3.0-base-zh在LCQMC训练数据上微调得到模型,并在LCQMC测试集和DuQM鲁棒性数据集上评估效果,评估指标为准确率。 62 | 63 | 64 | | 数据集 | LCQMCdev | LCQMCtest | DuQM | 65 | | :-------: | :-----: | :-----: |:-----: | 66 | | 基线 | 86.42% | 84.49% | 69.17% | 67 | | 数据修正 | 87.76% | 86.62% | 73.18% | 68 | 69 | 结果说明:对候选脏数据(规模为原始训练集的10%)进行人工标注修正,数据修正后重新训练模型,在LCQMC测试集上效果提升2.13%,在DuQM数据集上效果提升4.01%。 70 | 71 | 同时,该策略接入了PaddleNLP的分类系统,在多分类、多标签分类、及层次分类任务上完成了效果验证,效果如图3所示。 72 |

73 |
74 | 图3 在三个常见分类任务上应用“脏数据识别及标注修正”策略的效果 75 |

76 | 77 | 详细方案和实验介绍见应用示例[训练数据中脏数据识别](./dirty_data_identification)。 78 | 79 |
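As a rough illustration of the selection step only, the sketch below ranks training examples by a per-example influence score and keeps the top `dirty_num` as dirty-label candidates for manual review. In the tutorial these scores are produced by `find_dirty_data.py` with TrustAI's `RepresenterPointModel`; the arrays here are placeholders.

```python
# Sketch: turn per-example influence scores into a candidate dirty-data split.
import numpy as np

def split_dirty_candidates(examples, scores, dirty_num=500):
    order = np.argsort(-np.asarray(scores))          # most suspicious first
    dirty_idx = set(order[:dirty_num].tolist())
    dirty = [ex for i, ex in enumerate(examples) if i in dirty_idx]
    rest = [ex for i, ex in enumerate(examples) if i not in dirty_idx]
    return dirty, rest

examples = [("text_a", "text_b", 1)] * 10            # placeholder training rows
scores = np.random.rand(10)                          # placeholder influence scores
dirty, rest = split_dirty_candidates(examples, scores, dirty_num=3)
print(len(dirty), len(rest))
```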
80 | 81 | 82 | ## 训练数据偏置识别及偏置消除 83 | ### 方法介绍 84 | 研究表明,神经网络模型会利用数据集中的偏置作为预测捷径,如在情感分析任务中,遇到否定词模型会倾向预测为“负向”情感。这种偏置会导致模型没有真正理解语言,降低了模型的鲁棒性。 85 | 86 | TrustAI提供了数据权重修正和数据分布修正两种优化策略,在不需要人工介入的条件下,缓解训练数据偏置对模型训练的影响,提升模型的语义理解能力,进而提升模型的鲁棒性。 87 | * 数据权重修正:降低偏置样本对训练loss的影响,即减少模型从偏置样本中学习。具体方案详见[Du, Yanrui, et al. 2022](https://arxiv.org/abs/2205.12593),其提供了`lls_d`和`lls_d_f`两种样本偏置度计算策略,前者考虑了词的有偏性,而后者同时考虑词的有偏性和频次。 88 | * 数据分布修正:通过对非偏置数据多次重复采样,使训练数据分布尽量均衡。 89 | 90 | ### 方法效果 - 数据权重修正 91 | 92 | 实验基于ERNIE-3.0-base-zh在相似度计算任务LCQMC数据集上微调得到基线模型,在LCQMC测试集和DuQM鲁棒性数据集上评估效果,评估指标为准确率。 93 | 94 | 效果如下表所示:相比于基线,数据权重修正后,模型在鲁棒性数据集DuQM上准确率提升0.94%。 95 | 96 | | 数据集 | LCQMCdev | LCQMCtest | DuQM | 97 | | :-------: | :-------: | :-------: | :-------: | 98 | | 基线 | 90.93% | 87.06% | 73.82% | 99 | | lls_d | 90.76% | 87.58% | 74.76% | 100 | | lls_d_f | 90.80% | 87.22% | 74.44% | 101 | 102 | 详细见应用示例[数据权重修正](./data_bias_identification/less_learn_shortcut)。 103 | 104 |
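The weight-correction mechanism can be summarized in a few lines: the per-example bias degree written by `lls.py` into the training file scales that example's contribution to the cross-entropy loss, so heavily biased examples contribute less. This is only a sketch of the idea under that assumption; the exact loss computation in the tutorial's `train.py` may differ in detail.

```python
# Sketch of bias-degree weighted training loss for "数据权重修正".
import paddle
import paddle.nn.functional as F

def weighted_ce_loss(logits, labels, weights):
    per_example = F.cross_entropy(logits, labels, reduction='none')  # shape follows labels
    return (per_example * weights).mean()                            # biased examples contribute less

logits = paddle.randn([4, 2])
labels = paddle.to_tensor([[0], [1], [1], [0]], dtype='int64')
weights = paddle.to_tensor([[1.0], [0.85], [0.9], [1.0]])            # 1.0 = unbiased example
print(weighted_ce_loss(logits, labels, weights))
```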
105 | 106 | ### 方法效果 - 数据分布修正 107 | 108 | 实验基于ERNIE-3.0-base-zh在情感分析任务ChnsentiCorp数据集上微调得到基线模型,在情感分析鲁棒性数据集上评估效果,评估指标为准确率。 109 | 110 | 效果如下表所示:相比于基线,数据分布修正后,模型在鲁棒性数据集上准确率提升1.41%。 111 | | 数据集 | 鲁棒性数据集 | 112 | | :-------: | :-------: | 113 | | 基线 | 69.97 | 114 | | 分布修正 | 71.38 | 115 | 116 | 详细见应用示例[数据分布修正](./data_bias_identification/data_distribution_correction)。 117 | 118 |
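The distribution-correction step boils down to oversampling the label that is under-represented among examples containing a biased word. Below is a distilled toy version of what `balance_train_data.py` does; the real script additionally filters biased words by part of speech and frequency.

```python
# Distilled rebalancing sketch: repeat minority-label examples that contain a biased word.
import random

def rebalance(examples, bias_word):
    """examples: list of (label, text) tuples; returns extra examples to append to the train set."""
    hit = [ex for ex in examples if bias_word in ex[1]]
    pos = [ex for ex in hit if ex[0] == 1]
    neg = [ex for ex in hit if ex[0] == 0]
    minority, gap = (neg, len(pos) - len(neg)) if len(pos) > len(neg) else (pos, len(neg) - len(pos))
    if not minority or gap <= 0:
        return []
    return random.choices(minority, k=min(gap, 2 * len(minority)))  # cap duplication, as the script does

data = [(0, "不好用"), (0, "不推荐"), (0, "质量不行"), (1, "不错,很满意")]
print(rebalance(data, "不"))
```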
119 | 120 | ## 证据识别及基于证据的预测 121 | 122 | ### 方法介绍 123 | 124 | 在长文本理解问题上,输入中的冗余信息往往会干扰模型预测,导致模型鲁棒性差。如在机器阅读理解(MRC)任务中,模型容易受到输入中扰动信息干扰,即输入中加入一些与答案生成无关的信息,模型生成的答案却可能发生改变。 125 | 126 | 为了降低模型受无关信息干扰带来的影响,TrustAI构建“证据识别-基于证据的预测”二阶段流程。首先,通过证据抽取识别输入中有效信息,排除冗余数据;然后基于识别的有效信息进行最终答案生成,提高模型鲁棒性。 127 | 128 | ### 方法效果 129 | 我们在MRC任务上做了验证,基于ERNIE-3.0-base-zh在DuReader-robust训练数据上微调了基线模型,在DuReader-robust的验证集合、测试集合和Challenge Test(DuReader-checklist)集合上做了效果评估,评估指标为答案的EM(exact match)。 130 | 131 | 132 | | 模型 | DuReader-robust dev EM | DuReader-robust Test EM | **DuReader-checklist dev EM** | 133 | | :----------------: | ---------------------- | ----------------------- | :---------------------------: | 134 | | roberta-base | 73.18 | 45.97 | 27.56 | 135 | | Selector-Predictor | 74.31 | 50.91 | 31.04 | 136 | 137 | 实验结论:“证据识别-基于证据的预测”二阶段方案,将模型在测试集上的效果提升4.94%,同时将训练的模型直接在DuReader Checklist数据集评估,相较于官方基线汇报结果,EM提升3.48%。 138 | 139 | 详细见应用示例[解决文本冗余导致精度下降的问题](./redundancy_removal)。 140 | 141 | 142 | ## 基于证据指导的模型增强方案 143 | ### 方法介绍 144 | 经过对多个模型预测依赖证据的评估,发现NN模型提供证据的合理性偏弱。为进一步提高证据的合理性,TrustAI提供了基于证据指导的模型增强方案([Jayaram et al. 2021](https://aclanthology.org/2021.emnlp-main.450/)),即标注少量证据数据,通过联合学习原始任务和证据学习任务,用证据学习目标指导模型依赖合理的证据进行预测,提升模型可解释性。 145 | 146 |


149 | 150 | ### 方法效果 151 | 152 | 实验基于ERNIE-2.0-EN-Base在英文情感分析SST数据集上微调得到基线模型,然后选择1000条训练数据进行证据标注,在这些数据上进行证据学习。最终,在500条标有证据的验证数据上进行了效果评估。评估指标除了模型预测准确率外,还包括可解释评估指标,即证据的合理性、充分性和完备性。 153 | 154 | 实验结果如下表:在加入证据指导后,模型预测效果略有提升,准确率提升0.5%;模型可解释性提升明显:证据合理性提升5.0%、充分性降低0.185(该指标越低越好)、完备性提升0.044。 155 | 156 | | 数据集 | 准确率 | 合理性 | 充分性 | 完备性 | 157 | | :-------: | :-----: | :-----: | :-----: | :-----: | 158 | | base | 93.5% | 26.1% | 0.367 | 0.118 | 159 | | base + maw loss | 94.0% | 31.1% | 0.182 | 0.162 | 160 | 161 | 应用的详细示例见[基于证据指导的模型增强方案](./enhanced_by_rationale)。 162 | 163 |
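A hedged sketch of what such a joint objective can look like: the usual task cross-entropy plus a rationale-guidance term that pushes per-token importance scores toward the annotated rationales. The concrete "maw" loss used by the tutorial's `train.py` is not shown in this document and may be defined differently; `token_scores` and `alpha` below are illustrative names, not the tutorial's API.

```python
# Sketch of a joint objective: task loss + rationale-guidance loss on token importance.
import paddle
import paddle.nn.functional as F

def joint_loss(logits, labels, token_scores, rationale_labels, attention_mask, alpha=0.1):
    task_loss = F.cross_entropy(logits, labels)
    mask = attention_mask.astype('float32')
    # binary cross-entropy between normalized token importance and human rationale labels
    rationale_loss = F.binary_cross_entropy(F.sigmoid(token_scores),
                                            rationale_labels.astype('float32'),
                                            reduction='none')
    rationale_loss = (rationale_loss * mask).sum() / mask.sum()
    return task_loss + alpha * rationale_loss

logits = paddle.randn([2, 2])
labels = paddle.to_tensor([[0], [1]], dtype='int64')
token_scores = paddle.randn([2, 6])                      # e.g. attention-based importance scores
rationales = paddle.to_tensor([[0, 1, 1, 0, 0, 0], [0, 0, 1, 1, 0, 0]])
mask = paddle.ones([2, 6], dtype='int64')
print(joint_loss(logits, labels, token_scores, rationales, mask))
```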
164 | 165 | ## 基于证据指导的预测错误数据识别 166 | ### 方法介绍 167 | TrustAI可信分析方法提供了模型预测依赖的证据,用来解释模型为什么会做这个预测。当预测依赖证据合理性弱,是否说明预测结果不可信呢?基于这个假设,我们做了一些探索,发现可以基于对模型预测依赖证据的分析,识别潜在预测错误的数据。 168 | 169 | 170 | ### 方法效果 171 | 实验基于ERNIE-1.0-base在相似度计算任务LCQMC数据集上微调得到相似度计算模型,然后在LCQMC验证集合上进行预测,并利用TrustAI提供的特征级证据分析方法识别模型预测依赖证据。 172 | 173 | 在我们的实验中,选择模型判断为语义相似但MAP(评估两个输入文本证据的一致性)低于指定阈值(0.3)的数据,其占全部测试数据比例为3.4%。相比于全量测试数据,模型在该类数据上效果下降了9.67%。 174 | 175 | | 数据集 | acc | 176 | | :-------: | :-----: | 177 | | 全部数据 | 89.53% | 178 | | 候选预测错误数据 | 79.86% | 179 | 180 | 基于候选预测错误数据,通过数据自动增强方法生成相关数据,作为强正负例加入到原始训练数据中,重训模型后,模型在该类数据上准确率提升7%。 181 | 182 | 详细见应用示例[相似度计算任务的预测错误数据识别](./map_analysis/zh-similarity-application.ipynb)。 183 | 184 |
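Only the filtering step of this idea is sketched below: keep examples that the model predicts as semantically similar but whose rationale-consistency score (MAP) falls under the 0.3 threshold. Computing MAP itself is done with TrustAI's feature-level analysis in the linked notebook; the lists here are placeholders.

```python
# Sketch: flag likely prediction errors by thresholding the rationale-consistency (MAP) score.
def flag_suspicious(pred_labels, map_scores, threshold=0.3):
    return [i for i, (p, m) in enumerate(zip(pred_labels, map_scores))
            if p == 1 and m < threshold]          # predicted "similar" but weak evidence consistency

preds = [1, 1, 0, 1]
maps = [0.55, 0.12, 0.40, 0.28]
print(flag_suspicious(preds, maps))               # -> [1, 3]: candidates for review / augmentation
```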
185 | 186 | 187 | -------------------------------------------------------------------------------- /tutorials/data_bias_identification/data_distribution_correction/README.md: -------------------------------------------------------------------------------- 1 | # 训练数据偏置识别及偏置消除 - 数据分布修正 2 | 3 | ## 方法介绍 4 | 受限于数据集收集方法、标注人员经验等影响,构建的训练数据集中往往存在偏置现象。模型会利用数据集偏置作为预测捷径,如在情感分析任务中,遇到否定词或描述直接给出“负向”情感预测。这种偏置会导致模型没有学会真正的理解和推理能力,在与训练数据分布一致的测试数据上表现很好,但在与训练数据分布不一致的测试数据上往往会表现较差。 5 | 6 | TrustAI提供了数据集偏置识别及基于分布修正的偏置缓解策略。 7 | * 偏置识别:统计训练数据中词与标注标签的分布,在分布上不均衡的词可能是偏置词,这里需要使用任务相关词典对候选偏置词过滤,得到真正的偏置词。包含偏置词的样本为偏置样本。 8 | * 分布修正:对非偏置样本进行重复采样。 9 | 10 | 11 | 注:开发者可访问[ AI Studio示例 ](https://aistudio.baidu.com/aistudio/projectdetail/4434652)快速体验本案例。 12 | 13 | ## 实验步骤 14 | 实验基于ERNIE-3.0-base-zh在情感分析任务ChnsentiCorp数据集上微调得到基线模型,在情感分析鲁棒性数据集上评估效果,评估指标为准确率。 15 | 16 | 17 | **Step 1**:识别偏置词。基于特征级证据可信分析方法(`IntGradInterpreter`)获取训练数据预测依赖的证据,然后统计各证据频次信息。 18 | ```shell 19 | # 下载数据 20 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/distribution_data.tar && tar xf distribution_data.tar && rm distribution_data.tar 21 | # 训练基线模型 22 | python -u train.py --dataset_dir ./data --train_file train.tsv --dev_file robust.tsv --num_classes 2 --save_dir ./checkpoint 23 | 24 | # 统计重要证据和频次 25 | python -u get_rationale_importance.py --dataset_dir ./data --input_file train.tsv --num_classes 2 --rationale_path ./data/rationale_importance.txt --init_from_ckpt ./checkpoint/model_state.pdparams 26 | # rationale_path为证据及其频次保存的地址 27 | ``` 28 | 29 | **Step 2**:识别偏置样本,及对偏置样本重复采样以达到均衡。 30 | 31 | ```shell 32 | # 生成均衡训练数据 33 | python -u balance_train_data.py --input_path ./data/train.tsv --rationale_path ./data/rationale_importance.txt --output_path ./data/balanced_train.tsv 34 | ``` 35 | 36 | 基于生成的均衡数据`balanced_train.tsv`训练模型。 37 | 38 | ```shell 39 | python -u train.py --dataset_dir ./data --train_file balanced_train.tsv --dev_file robust.tsv --num_classes 2 --save_dir ./checkpoint 40 | ``` 41 | 实验效果如下表所示: 42 | | 数据集 | 鲁棒性数据集 | 43 | | :-------: | :-------: | 44 | | 基线 | 69.97 | 45 | | 分布修正 | 71.38 | 46 | 47 | 注:以上结果均为10次实验的平均值。 48 | -------------------------------------------------------------------------------- /tutorials/data_bias_identification/data_distribution_correction/balance_train_data.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import collections 4 | import random 5 | import time 6 | import os 7 | import argparse 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | import paddle 12 | import jieba 13 | from paddle.io import DataLoader, BatchSampler 14 | from paddlenlp.data import DataCollatorWithPadding 15 | from paddlenlp.datasets import load_dataset 16 | from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer 17 | from paddlenlp.utils.log import logger 18 | from trustai.interpretation import get_word_offset 19 | from trustai.interpretation import IntGradInterpreter 20 | from LAC import LAC 21 | from tqdm import tqdm 22 | 23 | from utils import evaluate, preprocess_function 24 | 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--input_path", type=str, default=None, help="file path of input data.") 27 | parser.add_argument("--output_path", type=str, default=None, help="file path of output data.") 28 | 29 | parser.add_argument("--seed", type=int, default=3, help="random seed for initialization") 30 | parser.add_argument("--rationale_path", 31 | type=str, 32 | 
default="./data/rationale_importance.txt", 33 | help="Path to save rationale importance data.") 34 | 35 | args = parser.parse_args() 36 | 37 | 38 | def set_seed(seed): 39 | """ 40 | Sets random seed 41 | """ 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | paddle.seed(seed) 45 | os.environ['PYTHONHASHSEED'] = str(seed) 46 | 47 | 48 | def run(): 49 | """ 50 | Get rationale importance 51 | """ 52 | set_seed(args.seed) 53 | 54 | # init lexical analyzer of chinese 55 | lac = LAC(mode='lac') 56 | 57 | # load ratioanle importance 58 | with open(args.rationale_path, 'r') as f: 59 | tokens = [] 60 | for line in f: 61 | if line.strip(): 62 | token, frequency = line.split('\t') 63 | frequency = int(frequency) 64 | if frequency > 2: 65 | tokens.append(token) 66 | # load ChnSentiCorp train data 67 | with open(args.input_path, 'r') as f: 68 | examples = [] 69 | for i, line in enumerate(tqdm(list(f))): 70 | label, text = line.strip().split('\t') 71 | examples.append((i, int(label), text, list(jieba.cut(text)))) 72 | 73 | # Statistics rationale index in positive and negative examples respectively 74 | pos_dict = collections.defaultdict(list) 75 | neg_dict = collections.defaultdict(list) 76 | rate_dict = {} 77 | for i, token in enumerate(tqdm(tokens[::-1])): 78 | for example in examples: 79 | if token in example[3]: 80 | if example[1] == 1: 81 | pos_dict[token].append(example[0]) 82 | else: 83 | neg_dict[token].append(example[0]) 84 | 85 | # filter rationale by postag and positive negative ratio 86 | for token in sorted(list(set(pos_dict.keys()) & set(neg_dict.keys()))): 87 | pos_list = pos_dict[token] 88 | neg_list = neg_dict[token] 89 | pos_ratio = len(pos_list) / (len(pos_list) + len(neg_list)) 90 | postags = lac.run(token)[1] 91 | if (pos_ratio <= 0.15 or pos_ratio >= 0.85) and not (set(['c', 'r', 'w', 'm']) & set(postags)): 92 | rate_dict[token] = [pos_ratio if pos_ratio < 0.5 else 1 - pos_ratio, len(pos_list), len(neg_list), postags] 93 | for k, v in rate_dict.items(): 94 | print(k, v, len(pos_dict[k]), len(neg_dict[k])) 95 | # sampling the data that will be added to the training set 96 | add_dict = defaultdict(int) 97 | add_list = [] 98 | for token in rate_dict: 99 | pos_num = len(pos_dict[token]) 100 | neg_num = len(neg_dict[token]) 101 | tmp_dict = defaultdict(int) 102 | if pos_num > neg_num: 103 | for idx in random.choices(neg_dict[token], k=min(pos_num - neg_num, neg_num * 2)): 104 | tmp_dict[idx] += 1 105 | else: 106 | for idx in random.choices(pos_dict[token], k=min(neg_num - pos_num, pos_num * 2)): 107 | tmp_dict[idx] += 1 108 | for idx, count in tmp_dict.items(): 109 | add_dict[idx] = max(add_dict[idx], count) 110 | for idx, count in add_dict.items(): 111 | add_list.extend([idx] * count) 112 | print(add_dict) 113 | random.shuffle(add_list) 114 | # write data to train data 115 | logger.info(f"add number: {len(add_list)}") 116 | with open(args.output_path, 'w') as f: 117 | for example in examples: 118 | f.write(str(example[1]) + '\t' + example[2] + '\n') 119 | for idx in add_list: 120 | example = examples[idx] 121 | f.write(str(example[1]) + '\t' + example[2] + '\n') 122 | 123 | 124 | if __name__ == "__main__": 125 | run() 126 | -------------------------------------------------------------------------------- /tutorials/data_bias_identification/data_distribution_correction/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | 17 | import paddle 18 | import paddle.nn.functional as F 19 | from paddlenlp.utils.log import logger 20 | 21 | 22 | @paddle.no_grad() 23 | def evaluate(model, criterion, metric, data_loader, name=''): 24 | """ 25 | Given a dataset, it evaluates model and computes the metric. 26 | Args: 27 | model(obj:`paddle.nn.Layer`): A model to classify texts. 28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss. 29 | metric(obj:`paddle.metric.Metric`): The evaluation metric. 30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches. 31 | """ 32 | 33 | model.eval() 34 | metric.reset() 35 | losses = [] 36 | for batch in data_loader: 37 | input_ids, token_type_ids, labels = batch['input_ids'], batch[ 38 | 'token_type_ids'], batch['labels'] 39 | logits = model(input_ids, token_type_ids) 40 | loss = criterion(logits, labels) 41 | losses.append(loss.numpy()) 42 | correct = metric.compute(logits, labels) 43 | metric.update(correct) 44 | 45 | acc = metric.accumulate() 46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc)) 47 | model.train() 48 | metric.reset() 49 | 50 | return acc 51 | 52 | 53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False): 54 | """ 55 | Builds model inputs from a sequence for sequence classification tasks 56 | by concatenating and adding special tokens. 57 | 58 | Args: 59 | example(obj:`list[str]`): input data, containing text and label if it have label. 60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods. 62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization. 63 | Sequences longer than this will be truncated, sequences shorter will be padded. 64 | label_nums(obj:`int`): The number of the labels. 65 | Returns: 66 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels. 
67 | """ 68 | if 'text_b' not in example: 69 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length) 70 | else: 71 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length) 72 | 73 | if not is_test: 74 | result["labels"] = np.array([example['label']], dtype='int64') 75 | return result 76 | -------------------------------------------------------------------------------- /tutorials/data_bias_identification/less_learn_shortcut/README.md: -------------------------------------------------------------------------------- 1 | # 解决训练数据分布偏置的问题 - 数据权重修正方案 2 | ## 方法介绍 3 | 受限于数据集收集方法、标注人员经验等影响,构建的训练数据集中往往存在偏置现象。模型会利用数据集偏置作为预测捷径,如在情感分析任务中,遇到否定词或描述直接给出“负向”情感预测。这种偏置会导致模型没有学会真正的理解和推理能力,在与训练数据分布一致的测试数据上表现很好,但在与训练数据分布不一致的测试数据上往往会表现较差。 4 | 5 | TrustAI提供了数据集偏置识别及基于权重修正的偏置缓解策略。 6 | * 偏置识别:统计训练数据中词与标注标签的分布,在分布上不均衡的词可能是偏置词,包含偏置词的样本为偏置样本。 7 | * 权重修正:降低偏置样本对训练loss的影响,即针对每一条样本计算一个偏置度,在训练loss计算时通过偏置度降低偏置样本影响,具体见[Du, Yanrui, et al. 2022](https://arxiv.org/abs/2205.12593)。 8 | 9 | 注:开发者可访问[ AI Studio示例 ](https://aistudio.baidu.com/aistudio/projectdetail/4434616)快速体验本案例。 10 | 11 | ## 实验步骤 12 | 实验基于ERNIE-3.0-base-zh在情感分析任务ChnsentiCorp数据集上微调得到基线模型,在情感分析鲁棒性数据集上评估效果,评估指标为准确率。 13 | 14 | 15 | **Step 1**:识别训练数据中的偏置词。在训练数据中,统计每个词在不同类别上的分布,对于频次大于`cnt_threshold`、且最少在一个类别上出现比例大于`p_threshold`的词视为偏置词。 16 | 17 | ```shell 18 | # 下载数据 19 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/lls_data.tar && tar xf lls_data.tar && rm lls_data.tar 20 | # 统计偏置词 21 | python -u find_bias_word.py --output_dir output --input_path ./data/train.tsv --num_classes 2 --cnt_threshold 3 --p_threshold 0.90 --output_dir output 22 | # cnt_threshold表示为偏置词最少需要出现的频次 23 | # p_threshold表示偏置比例的阈值,偏置词至少需要在一个类别上大于此阈值 24 | # output_dir表示统计结果的存储路径 25 | ``` 26 | 27 | **Step 2**:基于偏置词的统计结果,针对每一训练样本,计算偏置度,作为样本对训练loss的影响权重。 28 | 29 | 当前方案提供了`lls_d`和`lls_d_f`两种计算样本偏置度的策略,前者考虑词的有偏性,后者同时考虑词的有偏性和频次。 30 | 31 | ```shell 32 | # 基于`lls_d`策略计算样本偏置度 33 | python -u lls.py --input_path ./data/train.tsv --bias_dir ./output --stopwords_path ./data/stop_words.txt --num_classes 2 --mode lls_d --output_path ./data/train_lls_d.tsv 34 | # 基于`lls_d_f`策略计算样本偏置度 35 | python -u lls.py --input_path ./data/train.tsv --bias_dir ./output --stopwords_path ./data/stop_words.txt --num_classes 2 --mode lls_d_f --output_path ./data/train_lls_d_f.tsv 36 | # mode表示计算样本偏置度的策略,当前有`lls_d`和`lls_d_f`两种策略 37 | # output_path表示为生成带偏置度训练集的存储路径 38 | ``` 39 | 40 | **Step 3**:用带偏置度的训练数据训练模型,偏置度作用于loss计算。 41 | ```shell 42 | # 基于`lls_d`策略产生的数据训练模型 43 | python -u train.py --dataset_dir ./data --train_file train_lls_d.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./lls_d_checkpoint 44 | # 基于`lls_d_f`策略产生的数据训练模型 45 | python -u train.py --dataset_dir ./data --train_file train_lls_d_f.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./lls_d_f_checkpoint 46 | ``` 47 | 48 | 实验结果如下表所示:相比于基线,权重修正后,模型在鲁棒性数据集DuQM上准确率提升0.94%。 49 | 50 | | 数据集 | LCQMCdev | LCQMCtest | DuQM | 51 | | :-------: | :-------: | :-------: | :-------: | 52 | | 基线 | 90.93 | 87.06 | 73.82 | 53 | | lls_d | 90.76 | 87.58 | 74.76 | 54 | | lls_d_f | 90.80 | 87.22 | 74.44 | 55 | 56 | 注:以上结果均为3次实验的平均值。 57 | -------------------------------------------------------------------------------- /tutorials/data_bias_identification/less_learn_shortcut/find_bias_word.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import collections 4 | import argparse 5 | 6 | 
from LAC import LAC 7 | from tqdm import tqdm 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--output_dir", 11 | default="./output", 12 | type=str, 13 | help="The output directory where the result will be written.") 14 | parser.add_argument("--input_path", type=str, default=None, help="train data file path") 15 | parser.add_argument('--num_classes', type=int, default=2, help='Number of classification.') 16 | parser.add_argument('--cnt_threshold', type=int, default=3, help='Count threshold of statistical biased words') 17 | parser.add_argument('--p_threshold', type=float, default=0.85, help='Probability threshold of statistical biased words') 18 | 19 | args = parser.parse_args() 20 | 21 | 22 | class BiasWord(object): 23 | """ 24 | Statistic the biased words in the dataset 25 | """ 26 | 27 | def __init__(self, segments, labels, num_classes=2, cnt_threshold=3, p_threshold=0.85): 28 | self.cnt_threshold = cnt_threshold 29 | self.p_threshold = p_threshold 30 | self.num_classes = num_classes 31 | self.segments = segments 32 | self.labels = labels 33 | 34 | def process(self): 35 | """ 36 | process function 37 | """ 38 | self._get_dict() 39 | self._search_bias_word() 40 | print("number of bias_words:", len(self.bias_words)) 41 | return self.bias_words, self.bias_word_cnt, self.id2words 42 | 43 | def _get_dict(self): 44 | self.word2ids = collections.defaultdict(set) 45 | self.id2words = collections.defaultdict(set) 46 | for n, segs in enumerate(self.segments): 47 | for seg in segs: 48 | self.word2ids[seg].add(n) 49 | self.id2words[n] = set(segs) 50 | 51 | def _search_bias_word(self): 52 | self.bias_words = {} 53 | self.bias_word_cnt = {} 54 | for word, sentids in self.word2ids.items(): 55 | if len(sentids) >= self.cnt_threshold: 56 | cnts = [0] * self.num_classes 57 | 58 | for sentid in sentids: 59 | label = self.labels[sentid] 60 | cnts[label] += 1 61 | assert sum(cnts) != 0 62 | max_cnt = max(cnts) 63 | p = max_cnt / sum(cnts) 64 | if p >= self.p_threshold: 65 | self.bias_words[word] = p 66 | self.bias_word_cnt[word] = len(sentids) 67 | 68 | 69 | if __name__ == "__main__": 70 | # initialize tokenizer 71 | lac = LAC(mode='rank') 72 | 73 | # preprocess data, get segments、labels and lines 74 | segments = [] 75 | labels = [] 76 | lines = [] 77 | with open(args.input_path, 'r') as f: 78 | for line in tqdm(list(f)): 79 | lines.append(line) 80 | query, title, label = line.strip().split('\t') 81 | seg_res = lac.run([query, title]) 82 | query_segs = seg_res[0][0] 83 | title_segs = seg_res[1][0] 84 | segments.append(query_segs + title_segs) 85 | labels.append(int(label)) 86 | 87 | # get bias_words 88 | biasword = BiasWord(segments, labels, num_classes=2, cnt_threshold=args.cnt_threshold, p_threshold=args.p_threshold) 89 | # b_words: biased words, dict 90 | # b_word_cnt: count of biased words, dict 91 | # id2words: sentence index to words, dict 92 | b_words, b_word_cnt, id2words = biasword.process() 93 | 94 | # save result to output_dir 95 | if not os.path.exists(args.output_dir): 96 | os.makedirs(args.output_dir) 97 | with open(os.path.join(args.output_dir, "bias_word.json"), 'w') as f: 98 | json.dump(b_words, f, ensure_ascii=False) 99 | with open(os.path.join(args.output_dir, "bias_word_cnt.json"), 'w') as f: 100 | json.dump(b_word_cnt, f, ensure_ascii=False) 101 | with open(os.path.join(args.output_dir, "id2words.json"), 'w') as f: 102 | for k, v in id2words.items(): 103 | id2words[k] = list(v) 104 | json.dump(id2words, f, ensure_ascii=False) 105 | 
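A small usage sketch for the `BiasWord` class defined in `find_bias_word.py` above, run on a toy, already-segmented dataset. The thresholds are lowered so the toy data actually yields a biased word; the script itself builds `segments`/`labels` from the LCQMC-style train file with LAC, and note that importing the module also parses command-line arguments.

```python
# Toy usage of BiasWord: "不" co-occurs with label 0 in 3 of 4 sentences.
from find_bias_word import BiasWord   # the class defined in the file above

segments = [
    ["价格", "便宜", "不", "喜欢"],
    ["物流", "慢", "不", "满意"],
    ["不", "推荐", "购买"],
    ["质量", "很", "好"],
]
labels = [0, 0, 0, 1]

bias_word = BiasWord(segments, labels, num_classes=2, cnt_threshold=2, p_threshold=0.85)
b_words, b_word_cnt, id2words = bias_word.process()
print(b_words)      # e.g. {'不': 1.0} -> appears only with label 0
print(b_word_cnt)   # e.g. {'不': 3}
```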
-------------------------------------------------------------------------------- /tutorials/data_bias_identification/less_learn_shortcut/lls.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--input_path", type=str, default=None, help="input data file path") 7 | parser.add_argument("--output_path", type=str, default=None, help="output data file path") 8 | parser.add_argument("--bias_dir", type=str, default=None, help="bias data directory.") 9 | parser.add_argument("--stopwords_path", type=str, default=None, help="stopwords data file path") 10 | parser.add_argument('--num_classes', type=int, default=2, help='Number of classification.') 11 | parser.add_argument('--alpha', 12 | type=float, 13 | default=0.01, 14 | help='Hyperparameters for frequency of words when mode is lls_d_f.') 15 | parser.add_argument('--mode', 16 | type=str, 17 | default='lls_d', 18 | choices=['lls_d', 'lls_d_f'], 19 | help='Hyperparameters for frequency of words.') 20 | 21 | args = parser.parse_args() 22 | 23 | 24 | def filter_stopwords(score_w, stop_words): 25 | for word in list(score_w.keys()): 26 | if word in stop_words: 27 | del score_w[word] 28 | return score_w 29 | 30 | 31 | def word_score(d, num_classes): 32 | score_w = {} 33 | for k in d.keys(): 34 | score_w[k] = abs(d[k] - 1 / num_classes) 35 | return score_w 36 | 37 | 38 | def word_score_freq(d, d_cnt, num_classes, alpha): 39 | score_w = {} 40 | for k in d.keys(): 41 | score_w[k] = abs(d[k] - 1 / num_classes) + alpha * d_cnt[k] 42 | return score_w 43 | 44 | 45 | def lls_basic(score_w, id2words): 46 | sample_bias = {} 47 | for n in range(len(id2words)): 48 | 49 | sample_score = 0 50 | cnt = 0 51 | for word in id2words[str(n)]: 52 | if word in score_w: 53 | sample_score += score_w[word] 54 | cnt += 1 55 | if cnt != 0: 56 | sample_bias[n] = sample_score / cnt 57 | return sample_bias 58 | 59 | 60 | def softxmax(sample_bias, a=0, b=0.15): 61 | """ 62 | Score normalization 63 | """ 64 | scores = [] 65 | for k, v in sample_bias.items(): 66 | scores.append(v) 67 | maxn, minn = max(scores), min(scores) 68 | sample_bias_norm = {} 69 | for k, sc in sample_bias.items(): 70 | sc_softmax = a + (b - a) / (maxn - minn) * (sc - minn) 71 | sample_bias_norm[k] = (1 - sc_softmax) 72 | return sample_bias_norm 73 | 74 | 75 | if __name__ == "__main__": 76 | 77 | # load data 78 | with open(args.stopwords_path, 'r') as f: 79 | stop_words = [] 80 | for line in f.readlines(): 81 | stop_words.append(line.strip()) 82 | with open(os.path.join(args.bias_dir, 'id2words.json'), 'r') as f: 83 | id2words = json.load(f) 84 | with open(os.path.join(args.bias_dir, 'bias_word.json'), 'r') as f: 85 | d = json.load(f) 86 | with open(os.path.join(args.bias_dir, 'bias_word_cnt.json'), 'r') as f: 87 | d_cnt = json.load(f) 88 | with open(args.input_path, 'r') as f: 89 | lines = list(f) 90 | 91 | # get bias degree for example 92 | mode = args.mode 93 | if mode == 'lls_d': 94 | score_w = word_score(d, num_classes=2) 95 | score_w = filter_stopwords(score_w, stop_words) 96 | sample_bias = lls_basic(score_w, id2words) 97 | sample_bias_norm = softxmax(sample_bias) 98 | elif mode == 'lls_d_f': 99 | score_w = word_score_freq(d, d_cnt, num_classes=args.num_classes, alpha=args.alpha) 100 | score_w = filter_stopwords(score_w, stop_words) 101 | sample_bias = lls_basic(score_w, id2words) 102 | sample_bias_norm = softxmax(sample_bias) 103 | else: 104 | raise 
KeyError(f"Unknown mode: {mode}, mode should be chosen from [lls_d, lls_d_f].") 105 | 106 | # save result 107 | with open(args.output_path, 'w', encoding='utf-8') as f: 108 | for n, line in enumerate(lines): 109 | if n in sample_bias_norm: 110 | f.write(line.strip() + '\t' + str(sample_bias_norm[n]) + '\n') 111 | else: 112 | f.write(line.strip() + '\t' + str(1) + '\n') -------------------------------------------------------------------------------- /tutorials/data_bias_identification/less_learn_shortcut/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | 17 | import paddle 18 | import paddle.nn.functional as F 19 | from paddlenlp.utils.log import logger 20 | 21 | 22 | @paddle.no_grad() 23 | def evaluate(model, criterion, metric, data_loader, name=''): 24 | """ 25 | Given a dataset, it evaluates model and computes the metric. 26 | Args: 27 | model(obj:`paddle.nn.Layer`): A model to classify texts. 28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss. 29 | metric(obj:`paddle.metric.Metric`): The evaluation metric. 30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches. 31 | """ 32 | 33 | model.eval() 34 | metric.reset() 35 | losses = [] 36 | for batch in data_loader: 37 | input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels'] 38 | logits = model(input_ids, token_type_ids) 39 | loss = criterion(logits, labels) 40 | loss = loss.mean() 41 | losses.append(loss.numpy()) 42 | correct = metric.compute(logits, labels) 43 | metric.update(correct) 44 | 45 | acc = metric.accumulate() 46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc)) 47 | model.train() 48 | metric.reset() 49 | 50 | return acc 51 | 52 | 53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False): 54 | """ 55 | Builds model inputs from a sequence for sequence classification tasks 56 | by concatenating and adding special tokens. 57 | 58 | Args: 59 | example(obj:`list[str]`): input data, containing text and label if it have label. 60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods. 62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization. 63 | Sequences longer than this will be truncated, sequences shorter will be padded. 64 | label_nums(obj:`int`): The number of the labels. 65 | Returns: 66 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels. 
67 | """ 68 | if 'text_b' not in example: 69 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length) 70 | else: 71 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length) 72 | 73 | if not is_test: 74 | result["labels"] = np.array([example['label']], dtype='int64') 75 | result["weights"] = np.array([example['weight']], dtype='float32') 76 | return result 77 | -------------------------------------------------------------------------------- /tutorials/data_map/README.md: -------------------------------------------------------------------------------- 1 | # 基于训练信号的数据地图绘制 Dataset Cartography with Training Dynamics 2 | 3 | ## 方法介绍 4 | 现有工作表明,可以使用训练数据在训练过程中的信号绘制数据地图;根据信号特征划分数据,不同数据区域具有不同特点,如难学、标注错误等。通过绘制数据地图,可以帮助开发者更好地了解训练数据。 5 | 6 | TrustAI提供了"训练信号收集 -> 数据地图绘制"方案。首先,收集每条训练数据在训练过程中不同step下的训练信号;然后,根据得到的统计信号,基于指定的信号维度绘制数据地图。 7 | 8 | 注:开发者可访问[ AI Studio示例 ](https://aistudio.baidu.com/aistudio/projectdetail/5307701)快速体验本案例。 9 | 10 | ## 实验步骤 11 | 我们以相似度计算任务LCQMC数据集上的模拟实验为例,介绍该方案实现步骤和效果。 12 | 13 | 14 | **Step 1**:从LCQMC训练集中随机抽取100条数据作为训练数据样例。训练集文件为.tsv格式,内容如下 15 | 16 | ```shell 17 | # tsv格式数据,一行一条训练数据,不同字段用tab分隔,各字段如下介绍: 18 | [ 19 | "text_a" : xxx, // 训练数据文本a 20 | "text_b" : xxx, // 训练数据文本b 21 | "label" : xxx, // 训练数据对应的label 22 | "s_label" : xxx, // 训练数据的构造label (可省略) 23 | ] 24 | ``` 25 | 注: ``s_label``可省,为用户构造的label,如构造脏数据。 26 | 27 | 基于ERNIE-3.0-base-zh在新训练集`sample_100.tsv`上微调得到基线模型,运行命令如下所示: 28 | 29 | ```shell 30 | # 训练模型并收集训练信号 31 | sh run_train_pointwise.sh 32 | ``` 33 | 所有训练数据的训练信号按训练step保存在`outputs`路径下。 34 | 35 | 注: 训练信号的收集代码可参考代码`train_pointwise.py`中Lines 199-218,用户可根据自己模型代码进行修改。收集的训练信号如下(用户可自行设计更多信号进行收集): 36 | 37 | ```shell 38 | # .jsonl 格式数据 39 | [ 40 | {'id' : xxx, // 训练数据的id 41 | 'label' : xxx, // 训练数据对应的label 42 | 'pred_label' : xxx, // 训练数据的预测label 43 | 'correct' : xxx, // 训练数据是否被预测正确 44 | 'loss' : xxx, // 训练数据当前的loss 45 | 'probs' : [xxx, xxx], // 训练数据在当前每个类下的预测概率(one-hot形式) 46 | 'label_probs' : xxx // 训练数据在label类别下的预测概率 47 | } 48 | ... ... 49 | ] 50 | ``` 51 | 52 | **Step 2**:训练信号处理,即基于不同训练steps收集到的信号计算整体信号,如基于不同steps得到的预测概率计算整体平均预测概率。 53 | 54 | ```shell 55 | # 训练信号处理 56 | python -u sample_stat_summary.py 57 | ``` 58 | 产出数据保存在`outputs`路径下。 59 | 60 |
61 | 训练信号详细信息 62 | 63 | ```shell 64 | # tsv 格式数据,一行保存一条训练数据的所有训练信号,信号之间用tab进行分隔,各信号如下表示: 65 | [ 66 | "id" : xxx, // 训练数据的id 67 | "label" : xxx, // 训练数据对应的label 68 | "s_label" : xxx, // 训练数据的构造label,数据地图绘制允许标记困难数据(s_label = 1)和构造脏数据(s_label = 2) 69 | "correct_times" : xxx, // 总共预测正确的次数 70 | "correct_ratio" : xxx, // 预测正确次数占比 71 | "avg_probs" : xxx, // 多次预测的置信度的平均数 72 | "label_var" : xxx, // 多次预测的置信度的方差 73 | "max_label_probs" : xxx, // 多次预测的置信度的最大值 74 | "min_label_probs" : xxx, // 多次预测的置信度的最小值 75 | "forgetting_times" : xxx, // 多次预测反映出的,模型对本数据的遗忘次数(之前预测对了,后来又错了) 76 | "learnt_times" : xxx, // 多次预测反映出的,模型对本数据的学会次数(之前预测错了,后来又对了) 77 | "first_forget" : xxx, // 多次预测中,第一次遗忘本数据 78 | "first_learn" : xxx, // 多次预测中,第一次学会本数据 79 | ] 80 | ``` 81 |
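For intuition, the snippet below recomputes a few of the signals listed above from a toy sequence of per-step records; `sample_stat_summary.py` computes the full set for every training example.

```python
# How the summary signals relate to the per-step records collected during training.
import numpy as np

def summarize(label_probs, correct_flags):
    """label_probs: gold-label confidence at each logging step; correct_flags: prediction correctness."""
    return {
        "avg_probs": float(np.mean(label_probs)),
        "label_var": float(np.var(label_probs)),
        "correct_times": int(sum(correct_flags)),
        "correct_ratio": sum(correct_flags) / len(correct_flags),
        # a "forgetting" event: correct at step t-1 but wrong at step t
        "forgetting_times": sum(1 for a, b in zip(correct_flags, correct_flags[1:]) if a and not b),
    }

print(summarize([0.55, 0.82, 0.61, 0.9, 0.93], [True, True, False, True, True]))
```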
82 | 83 | **Step 3**:基于产出的训练信号,选择两个信号作为数据地图的主要维度(默认为平均置信度与置信方差),并选择其他信号(如正确比例、正确次数、遗忘次数、学习次数等)以颜色、形状等进行区别绘制数据地图。 84 | 85 | ```shell 86 | # 数据地图绘制 87 | python -u plot_map.py 88 | 89 | # 参数选择 90 | attr1: str类型, 默认值为"avg_probs",选择一个信号作为数据地图的纵轴 91 | attr2: str类型, 默认值为"label_var",选择一个信号作为数据地图的横轴 92 | criterion: str类型,默认值为空,选择一个训练信号作为数据筛选依据,仅满足条件的数据会被绘制在地图上 93 | threshold: float类型,默认值0,与criterion一同使用,为选择的训练信号设置阈值,筛选数据 94 | use_f_times: float类型,默认值-1,使用forgetting_times并选择所有遗忘次数不小于use_f_times的样本 95 | use_l_times: float类型,默认值-1,使用learnt_times并选择所有学会次数不小于use_l_times的样本 96 | 97 | # 数据地图样例 98 | python -u plot_map.py # 图1左 99 | python -u plot_map.py --criterion forgetting_times --threshold 1 # 图1中 100 | python -u plot_map.py --use_l_times 0 # 图1右 101 | ``` 102 | 103 |
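As a simplified stand-in for `plot_map.py`, the sketch below scatters the two default map dimensions (`avg_probs` against `label_var`) from the summary file produced in Step 2 and colors points by `correct_ratio`. The column names follow the summary format described above; the path assumes the default output location and should be adjusted if it was changed.

```python
# Minimal data-map plot from the tab-separated summary written by sample_stat_summary.py.
import pandas as pd
import matplotlib.pyplot as plt

stats = pd.read_csv("outputs/output_data.json.result", sep="\t")
plt.figure(figsize=(6, 5))
sc = plt.scatter(stats["label_var"], stats["avg_probs"],
                 c=stats["correct_ratio"], cmap="coolwarm", s=12)
plt.colorbar(sc, label="correct_ratio")
plt.xlabel("variability (label_var)")
plt.ylabel("confidence (avg_probs)")
plt.title("Data map")
plt.savefig("outputs/data_map.png", dpi=200)
```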

104 | 105 | 106 |
107 | 图1 数据地图样例。左: 默认参数设置的数据地图;中:指定criterion为forgetting_times,threshold为1的数据绘制地图;右:使用learnt_times(use_l_times=0)区分数据的颜色,其中左上和左下角数据分别使用correct_times做进一步区分。此数据地图根据全量LCQMC数据的训练信号绘制,而非提供的100条样例。 108 |

109 | -------------------------------------------------------------------------------- /tutorials/data_map/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import paddle 16 | import numpy as np 17 | 18 | from paddlenlp.datasets import MapDataset 19 | 20 | 21 | def create_dataloader(dataset, 22 | mode='train', 23 | batch_size=1, 24 | batchify_fn=None, 25 | trans_fn=None): 26 | if trans_fn: 27 | dataset = dataset.map(trans_fn) 28 | 29 | shuffle = True if mode == 'train' else False 30 | if mode == 'train': 31 | batch_sampler = paddle.io.DistributedBatchSampler( 32 | dataset, batch_size=batch_size, shuffle=shuffle) 33 | else: 34 | batch_sampler = paddle.io.BatchSampler( 35 | dataset, batch_size=batch_size, shuffle=shuffle) 36 | 37 | return paddle.io.DataLoader( 38 | dataset=dataset, 39 | batch_sampler=batch_sampler, 40 | collate_fn=batchify_fn, 41 | return_list=True) 42 | 43 | 44 | def read_text_pair(data_path): 45 | """Reads data.""" 46 | with open(data_path, 'r', encoding='utf-8') as f: 47 | for line in f: 48 | data = line.rstrip().split("\t") 49 | if len(data) != 2: 50 | continue 51 | yield {'query': data[0], 'title': data[1]} 52 | 53 | 54 | def convert_pointwise_example(example, 55 | tokenizer, 56 | max_seq_length=512, 57 | is_test=False, 58 | language='en'): 59 | if language == 'ch': 60 | q_name = "text_t" 61 | t_name = "text_q" 62 | s_name = "s_label" 63 | #t_name = "title" 64 | l_name = "label" 65 | else: 66 | q_name = "text_t" 67 | t_name = "text_q" 68 | l_name = "label" 69 | s_name = "s_label" 70 | #q_name = "sentence1" 71 | #t_name = "sentence2" 72 | #l_name = "labels" 73 | 74 | query, title = example[q_name], example[t_name] 75 | 76 | encoded_inputs = tokenizer( 77 | text=query, text_pair=title, max_seq_len=max_seq_length) 78 | 79 | input_ids = encoded_inputs["input_ids"] 80 | token_type_ids = encoded_inputs["token_type_ids"] 81 | sep_ids = len(input_ids) - 1 82 | 83 | #print("input_ids:",input_ids) 84 | if not is_test: 85 | label = np.array([example[l_name]], dtype="int64") 86 | s_label = np.array([example[s_name]],dtype="int64") 87 | sep_ids = np.array([sep_ids], dtype="int64") 88 | #print('label', label) 89 | return input_ids, token_type_ids, label, s_label, sep_ids 90 | else: 91 | return input_ids, token_type_ids 92 | 93 | 94 | def convert_pairwise_example(example, 95 | tokenizer, 96 | max_seq_length=512, 97 | phase="train"): 98 | 99 | if phase == "train": 100 | query, pos_title, neg_title = example["query"], example[ 101 | "title"], example["neg_title"] 102 | 103 | pos_inputs = tokenizer( 104 | text=query, text_pair=pos_title, max_seq_len=max_seq_length) 105 | neg_inputs = tokenizer( 106 | text=query, text_pair=neg_title, max_seq_len=max_seq_length) 107 | 108 | pos_input_ids = pos_inputs["input_ids"] 109 | pos_token_type_ids = pos_inputs["token_type_ids"] 110 | neg_input_ids = 
neg_inputs["input_ids"] 111 | neg_token_type_ids = neg_inputs["token_type_ids"] 112 | 113 | return (pos_input_ids, pos_token_type_ids, neg_input_ids, 114 | neg_token_type_ids) 115 | 116 | else: 117 | query, title = example["query"], example["title"] 118 | 119 | inputs = tokenizer( 120 | text=query, text_pair=title, max_seq_len=max_seq_length) 121 | 122 | input_ids = inputs["input_ids"] 123 | token_type_ids = inputs["token_type_ids"] 124 | if phase == "eval": 125 | return input_ids, token_type_ids, example["label"] 126 | elif phase == "predict": 127 | return input_ids, token_type_ids 128 | else: 129 | raise ValueError("not supported phase:{}".format(phase)) 130 | 131 | 132 | def gen_pair(dataset, pool_size=100): 133 | """ 134 | Generate triplet randomly based on dataset 135 | 136 | Args: 137 | dataset: A `MapDataset` or `IterDataset` or a tuple of those. 138 | Each example is composed of 2 texts: exampe["query"], example["title"] 139 | pool_size: the number of example to sample negative example randomly 140 | 141 | Return: 142 | dataset: A `MapDataset` or `IterDataset` or a tuple of those. 143 | Each example is composed of 2 texts: exampe["query"], example["pos_title"]、example["neg_title"] 144 | """ 145 | 146 | if len(dataset) < pool_size: 147 | pool_size = len(dataset) 148 | 149 | new_examples = [] 150 | pool = [] 151 | tmp_exmaples = [] 152 | 153 | for example in dataset: 154 | label = example["label"] 155 | 156 | # Filter negative example 157 | if label == 0: 158 | continue 159 | 160 | tmp_exmaples.append(example) 161 | pool.append(example["title"]) 162 | 163 | if len(pool) >= pool_size: 164 | np.random.shuffle(pool) 165 | for idx, example in enumerate(tmp_exmaples): 166 | example["neg_title"] = pool[idx] 167 | new_examples.append(example) 168 | tmp_exmaples = [] 169 | pool = [] 170 | else: 171 | continue 172 | return MapDataset(new_examples) 173 | -------------------------------------------------------------------------------- /tutorials/data_map/run_train_pointwise.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # This script is used to finetune pretrained models 3 | ### 4 | 5 | export CUDA_VISIBLE_DEVICES=3 6 | LANGUAGE="ch" 7 | timestamp=`date +"%Y%m%d_%H%M%S"` 8 | data_dir='./' 9 | LEARNING_RATE=3e-5 10 | MAX_SEQ_LENGTH=256 11 | 12 | [ -d "logs" ] || mkdir -p "logs" 13 | [ -d "outputs" ] || mkdir -p "outputs" 14 | set -x 15 | 16 | train_file=sample_100.tsv 17 | dev_file=$train_file 18 | train_size=100 19 | 20 | batch_size=32 21 | epoch=5 22 | save_model_num=5 23 | epoch_steps=$[$train_size/$batch_size] 24 | save_steps=$[$epoch_steps*$epoch/${save_model_num}] 25 | 26 | python3 ./train_pointwise.py \ 27 | --learning_rate $LEARNING_RATE \ 28 | --max_seq_length $MAX_SEQ_LENGTH \ 29 | --batch_size ${batch_size} \ 30 | --epochs ${epoch} \ 31 | --data_dir $data_dir \ 32 | --train_set ${train_file} \ 33 | --dev_set ${dev_file} \ 34 | --eval_step ${save_steps} \ 35 | --warmup_proportion 0.1 \ 36 | --save_dir saved_model/${timestamp} >> logs/log_${timestamp} 37 | 38 | -------------------------------------------------------------------------------- /tutorials/data_map/sample_100.tsv: -------------------------------------------------------------------------------- 1 | text_a text_b label type 2 | 御龙在天小还丹怎么做 御龙在天怎么上不去 0 0 3 | 东风日产轩逸怎么样? 东风日产新轩逸怎么样 1 0 4 | 穿越火线有哪些小说 穿越火线小说。 1 0 5 | 我的爸爸作文 爸爸的手作文 1 0 6 | 小米华为魅族哪个好? 酷派小米华为魅族哪个好 1 0 7 | 为什么苹果信号这么差? 
为什么苹果6手机信号这么差 1 0 8 | 微信朋友圈能看到访客吗 微信朋友圈都能看到吗 0 0 9 | 魔兽世界,猎人宏 魔兽世界猎人弓 1 0 10 | 怎么查别人微信的聊天记录 微信聊天记录怎么查 1 0 11 | 列方程解应用题, 列方程解应用题。 1 0 12 | 支付宝里的钱怎么转到银行卡 支付宝的钱怎么转到银行卡 1 0 13 | 北京夜店有哪些 北京有哪些夜店 1 0 14 | 这是什么颜色啊? 这是什么颜色阿! 1 0 15 | 优酷视频为什么不能下载 优酷为什么不能下载视频了 1 0 16 | 怎么样才能做好网店! 怎么做好网店? 1 0 17 | 这是哪个明星小时候 这是哪个明星的小时候 1 0 18 | 天天飞车哪个车手刷高分多 天天飞车高分用哪个车手? 1 0 19 | 不客气的,为防止掉线,请您在3-4分钟内回复我一下就好 为防止掉线,请您在3分钟内回复我一下,谢谢您的配合。您好,在否?谢谢 1 0 20 | 我的特一营全集观看那有了吗 电视剧我的特一营全集观看哪里有 1 0 21 | 如何从网上查询个人征信 在网上如何查个人征信,要明细版的 1 0 22 | 在家赚钱有些什么方法? 在家带孩子有什么赚钱的方法 1 0 23 | 现在有什么好看的连续剧或者电影? 有什么好看的电视剧,或者电影 1 0 24 | 天龙八部问题 新天龙八部问题 0 0 25 | 这个里番是什么类型的? 是什么类型 0 0 26 | 婴儿理发器哪个牌子好 婴儿理发器哪个牌子好? 1 0 27 | 手机版的百度知道可以签到嘛? 百度知道怎么看还有多少升级啊? 0 0 28 | 怎样跟自己喜欢的人表白? 怎么和自己喜欢的人表白? 1 0 29 | 发什么成绩? 发什么成绩 1 0 30 | 几岁才可以办银行卡啊? 办银行卡要几岁才可以办? 1 0 31 | 您的是人工审核的方式吗? 只有人工审核的方式吗 0 0 32 | 怎么让刘海紧贴着额头 怎么使刘海不贴额头 0 0 33 | 无聊的时候你们都在干嘛呢? 你们无聊的时候都干嘛呢? 1 0 34 | 我爱你用韩语怎么写? 我爱你用韩语怎么说? 0 0 35 | 世界上有没有外星人 世界有没有外星人 1 0 36 | 这爱已打烊是什么意思 打烊是什么意思 0 0 37 | 硅是由什么构成的 硅由什么构成 1 0 38 | 二本中电子信息工程专业哪个学校比较好 电子信息工程专业分流哪个方向比较好 0 0 39 | 男孩子名字,哪个字好 带氵,钅字旁的男孩名字有哪些? 0 0 40 | 天天酷跑怎么刷金币和钻石 天天酷跑怎么刷金币钻石 1 0 41 | 入团申请书500字左右 入团申请书600字 0 0 42 | 从马鞍山火车站怎么到博望汽车站? 从马鞍山博望到黄池怎么走 0 0 43 | 大写的我怎么写 给的大写怎么写 0 0 44 | 戴耳机的男生头像 头像耳机男生 1 0 45 | 纪念碑谷这关怎么过? 史上最坑爹的游戏,咋过, 0 0 46 | 这个是您转账的 这应该是您查询错误账户了 0 0 47 | 平板电脑哪个品牌好呢 国产哪个牌子的平板电脑好 1 0 48 | 小和尚念经下一句是什么 小和尚念经,下一句是什么? 1 0 49 | 男票,是什么意思? 男票、女票什么意思? 1 0 50 | 怎样摆脱手机依赖症 如何摆脱手机依赖症 1 0 51 | 世界上最大的岛屿是 世界最大的岛屿? 1 0 52 | 过年手机一般会降价么? 过年手机会降价吗 1 0 53 | 您好,您是本人操作的吗 请您让本人操作 0 0 54 | 想找个情侣头像 谁帮我找个情侣头像吖 1 0 55 | 您好您的情况小二已经帮您反馈您后续关注一下您的手机和邮箱的信息. 您好.您的情况已经帮您反馈您可以后续关注一下您的手机和邮箱信息. 1 0 56 | 西游记是什么小说 西游记是什么体的小说 1 0 57 | 求!这张图片的高清大图! 求这张图片的高清大图。 1 0 58 | 吃榴莲不能吃什么 榴莲吃了不能吃什么 1 0 59 | 苹果手机铃声叫什么啊 苹果手机是什么铃声 1 0 60 | 眼皮老是跳怎么回事 左眼皮老是跳是怎么回事 1 0 61 | 灵魂是什么? 灵魂是什么?由来是什么? 1 0 62 | 请问QQ游戏的欢乐斗地主比斗地主好玩很多吗? 欢乐斗地主里面的斗牛游戏好玩吗?可以下载吗 0 0 63 | 薏米红豆水一天喝多少 红豆薏米一天喝多少 1 0 64 | 都在说大老虎什么意思 老虎油,什么意思? 0 0 65 | 天津艺术职业学院有普通类专业 天津城市职业学院电脑艺术设计专业要考专业吗? 0 0 66 | 传说中的黑洞是怎样形成的? 斑竹是怎么形成的? 0 0 67 | 什么意思?翻译一下。 翻译一下什么意思? 1 0 68 | 想给我儿子起个名字: 我想给儿子起个名字 1 0 69 | 数学几何证明题 几何数学证明题 1 0 70 | 疯狂猜成语的答案是什么? 疯狂猜成语关于嘴的成语有哪些 0 0 71 | 什么办法才能让鼻梁长高 有什么让鼻梁变高的方法 1 0 72 | 室内设计和计算机信息管理哪个专业好 计算机信息管理和电子商务哪个专业更好 0 0 73 | 暗黑魔法师崛起大理石手 暗黑魔法师:崛起缺少文件 0 0 74 | 为什么女人喜欢男人吃下面 为什么男人喜欢亲女人下面? 0 0 75 | 蓝颜红颜是什么 红颜和蓝颜分别指什么? 1 0 76 | 如何坚持马克思主义社会科学方法论的指导地位 如何理解马克思主义社会科学方法论的革命性变革 0 0 77 | 魏晨你喜欢吗 那么爱你为什么的吉他谱简单吗 0 0 78 | 璀璨人生全集在哪里可以看? 哪里有璀璨人生的全集? 1 0 79 | 这是什么动漫?叫什么? 这是什么动漫叫什么名字? 0 0 80 | 我什么时候放假? 你什么时候放假? 0 0 81 | 武林外传好看吗 谁有武林外传体验区激活码啊 0 0 82 | wd做个假体丰胸手术多少钱?最近有人做了吗 怎样才能有效丰胸?假如做隆胸手术得多少钱? 0 0 83 | 这个头像的大图 求这个头像大图! 1 0 84 | 十全十美是什么动物? 十全十美的动物是什么 1 0 85 | 《步步惊情》什么时候上映 步步惊情什么时候上映? 1 0 86 | 什么是婚育证明怎么写 村里开一胎生育证明怎么写 0 0 87 | 和您购买这个手机号的时间,麻烦您了。 手机号的购买的时间麻烦您您提供一下 1 0 88 | 双子座与狮子座配吗 双子座和狮子座和吗 1 0 89 | 该不该请领导吃饭 怎么暗示请领导吃饭 0 0 90 | 请问这是什么字体? 请问大家这是什么字体? 1 0 91 | 为什么天会下雨了 过年了为什么反而不放假了 0 0 92 | 为什么奥比岛打不开? 为什么奥比岛打不开 1 0 93 | 梦见自己生了个男孩子。 我梦见自己生了男孩子 1 0 94 | 正确的反义词是什么? 正确的反义词是什么 1 0 95 | 珠海有什么好玩的地方 珠海有什么好玩的地方? 1 0 96 | 孕妇可以吃黄瓜吗加了醋的 孕妇能吃黄瓜吗 1 0 97 | 儿童音乐乐园雅马哈音乐中心怎么样 儿童钢琴专业课程雅马哈音乐中心怎么样 0 0 98 | 淘宝换货怎么换? 淘宝怎么换货 1 0 99 | 想学摄影应该买什么书比较好? 摄影技术去哪里学比较好。 0 0 100 | 传奇登陆器怎么下载 怎么下载传奇登陆器 1 0 101 | 男的做什么工作挣钱 做什么工作挣钱? 
1 0 102 | -------------------------------------------------------------------------------- /tutorials/data_map/sample_stat_summary.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import sys 4 | import json 5 | import numpy as np 6 | 7 | file_name = "output_data.json" 8 | output_file_names = ["correct_times", "correct_ratio", "avg_probs", "label_var", 9 | "max_label_probs", "min_label_probs", "forgetting_times", "learnt_times", "first_forget", "first_learn", "pred_label", "pred_dist"] 10 | num_samples = 100 11 | 12 | _input_path = "./outputs/" 13 | _output = open(_input_path + file_name + ".result", "w") 14 | 15 | def list_concat(score_dict, input_file, input_path = "./data/", sample_size=1, pred_idx=-2, label_idx=-1, score_idx=-3, get_max_probs=False): 16 | """ 17 | add info of each epoch (or given steps) into a dict of lists 18 | """ 19 | _input = open(input_path + input_file, "r") 20 | for i, line in enumerate(_input): 21 | info = json.loads(line.strip().replace("\'", "\"")) 22 | sid = int(info["id"]) 23 | label = int(info["label"]) # [1:-1] to avoid "[]" 24 | if "noisy_label" in info: 25 | s_label = int(info["noisy_label"]) 26 | else: 27 | s_label = 0 28 | # score = float(info["probs"]) 29 | label_probs = float(info["label_probs"]) # the score under GL 30 | pred_correctness = info["correct"] 31 | if get_max_probs: 32 | all_probs = [eval(j) for j in info["probs"]] 33 | max_probs = np.max(all_probs) 34 | else: 35 | max_probs = 1.0 36 | 37 | score_info = [] # list of scores under different class 38 | for score in info["probs"]: # the number of classes here 39 | score_float = float(score) 40 | score_info.append(score_float) 41 | 42 | if not score_dict["id"][sid]: 43 | score_dict["id"][sid] = sid 44 | score_dict["label"][sid] = label 45 | score_dict["s_label"][sid] = s_label 46 | score_dict["label_probs"][sid].append(label_probs) 47 | score_dict["max_probs"][sid].append(max_probs) 48 | score_dict["pred_info"][sid].append(pred_correctness) 49 | score_dict["pred_label"][sid].append(str(np.argmax(score_info))) 50 | 51 | # add forget info 52 | list_length = len(score_dict["pred_info"][sid]) 53 | if list_length > 1: 54 | if score_dict["pred_info"][sid][list_length - 1] == score_dict["pred_info"][sid][list_length - 2]: 55 | score_dict["forget_info"][sid].append("None") 56 | elif score_dict["pred_info"][sid][list_length - 1] == "true": 57 | score_dict["forget_info"][sid].append("Learn") 58 | else: 59 | score_dict["forget_info"][sid].append("Forget") 60 | else: 61 | score_dict["forget_info"][sid].append("None") 62 | #if sid == 1: 63 | # print(score_dict["forget_info"][sid]) 64 | 65 | # if i >= sample_size: 66 | # break 67 | 68 | _input.close() 69 | 70 | def check_correct_ratio(correct_lists): 71 | """ 72 | ratio that a model predict classes correctly in different epochs 73 | """ 74 | if len(correct_lists) == 0 or len(correct_lists[0]) == 0: 75 | return [0], [0] 76 | ratio_list = [] 77 | pos_list = [] 78 | for c_list in correct_lists: 79 | pos_cnt = 0 80 | for info in c_list: 81 | if info == "true": 82 | pos_cnt += 1 83 | ratio_list.append(float(pos_cnt)/len(c_list) if len(c_list)!=0 else 0) 84 | pos_list.append(pos_cnt) 85 | return pos_list, ratio_list 86 | 87 | def check_forget_time(forget_lists): 88 | if len(forget_lists) == 0 or len(forget_lists[0]) == 0: 89 | return [0], [0], 0, 0 90 | forgetting_list = [] 91 | learnt_list = [] 92 | first_forgetting_time = [] 93 | first_learnt_time = [] 94 | for f_list in forget_lists: 95 | 
forgetting_cnt = 0 96 | learnt_cnt = 0 97 | first_f_time = 0 98 | first_l_time = 0 99 | for i, info in enumerate(f_list): 100 | if info == "Forget": 101 | forgetting_cnt += 1 102 | if first_f_time == 0: 103 | first_f_time = i 104 | elif info == "Learn": 105 | learnt_cnt += 1 106 | if first_l_time == 0: 107 | first_l_time = i 108 | forgetting_list.append(forgetting_cnt) 109 | learnt_list.append(learnt_cnt) 110 | first_forgetting_time.append(first_f_time) 111 | first_learnt_time.append(first_l_time) 112 | 113 | return forgetting_list, learnt_list, first_forgetting_time, first_learnt_time 114 | 115 | def check_pred_distribution(pred_lists): 116 | pred_list = [] 117 | for scores in pred_lists: 118 | score_dist_dict = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0} 119 | for score in scores: 120 | score_dist_dict[score] += 1 121 | pred_list.append(score_dist_dict) 122 | return pred_list 123 | 124 | 125 | info_dict = {"id": [[] for i in range(num_samples)], "label": [[] for i in range(num_samples)], 126 | "s_label": [[] for i in range(num_samples)], "label_probs": [[] for i in range(num_samples)], 127 | "max_probs": [[] for i in range(num_samples)], "pred_info": [[] for i in range(num_samples)], 128 | "forget_info": [[] for i in range(num_samples)], "pred_label": [[] for i in range(num_samples)]} 129 | list_concat(info_dict, file_name, _input_path, sample_size=num_samples) 130 | 131 | print(len(info_dict["label_probs"]), len(info_dict["label_probs"][0])) 132 | 133 | info_dict["correct_times"], info_dict["correct_ratio"] = check_correct_ratio(info_dict["pred_info"]) 134 | info_dict["label_var"] = np.var(info_dict["label_probs"], axis=1) 135 | info_dict["max_var"] = np.var(info_dict["max_probs"], axis=1) 136 | info_dict["avg_probs"] = np.mean(info_dict["label_probs"], axis=1) 137 | info_dict["max_label_probs"] = np.max(info_dict["label_probs"], axis=1) 138 | info_dict["min_label_probs"] = np.min(info_dict["label_probs"], axis=1) 139 | info_dict["forgetting_times"], info_dict["learnt_times"], info_dict["first_forget"], info_dict["first_learn"] = check_forget_time(info_dict["forget_info"]) 140 | info_dict["pred_dist"] = check_pred_distribution(info_dict["pred_label"]) 141 | output_file_names = ["id", "label", "s_label"] + output_file_names 142 | 143 | _output.write("\t".join(output_file_names) + "\n") 144 | for i in range(num_samples): 145 | info_list = [] 146 | for name in output_file_names: 147 | info_list.append(str(info_dict[name][i])) 148 | _output.write("\t".join(info_list) + "\n") 149 | 150 | _output.close() -------------------------------------------------------------------------------- /tutorials/dirty_data_identification/README.md: -------------------------------------------------------------------------------- 1 | # 训练数据中脏数据识别 2 | 3 | ### 方法介绍 4 | 训练数据标注质量对模型效果有较大影响,但受限于标注人员水平、标注任务难易程度等影响,训练数据中都存在一定比例的标注错误的数据(称为**脏数据**)。当标注数据规模较大时,数据标注检查就成为一个难题。 5 | 6 |

7 |
8 | 图1 脏数据识别及标注修正策略流程 9 |

10 | 11 | TrustAI提供了"脏数据识别 -> 修正"方案,如图1所示。首先,基于实例级证据分析方法识别候选脏数据(对模型训练影响较大的数据)。然后,对候选脏数据进行标注修正,使用修正后的数据重新训练模型,可显著提升模型效果。 12 | 13 | 注:开发者可访问[ AI Studio示例 ](https://aistudio.baidu.com/aistudio/projectdetail/4434058)快速体验本案例。 14 | 15 | ## 实验步骤 16 | 我们以基于相似度计算任务LCQMC数据集上的模拟实验为例,介绍该方案实现步骤和效果。 17 | 18 | 19 | **Step 1**:从LCQMC训练集中随机抽取5000条数据作为新训练集。基于ERNIE-3.0-base-zh在新训练集`train_5000.tsv`微调得到基线模型,运行命令如下所示: 20 | 21 | ```shell 22 | # 下载数据 23 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/dirty_data.tar && tar xf dirty_data.tar && rm dirty_data.tar 24 | # 训练基线模型 25 | python -u train.py --dataset_dir ./data --train_file train_5000.tsv --dev_file dev.tsv --test_files test.tsv --num_classes 2 --save_dir ./checkpoint 26 | ``` 27 | 训练的基线模型保存在`checkpoint`路径下。 28 | 29 | 30 | **Step 2**:识别训练集中的脏数据。 31 | 脏数据选择方法:基于TrustAI提供的实例级可信分析方法`RepresenterPointModel`,计算每一条训练样本对模型loss的影响分数,一般该分数表明了样本作为脏数据的可能性。我们使用这个分数识别脏数据。 32 | 33 | ```shell 34 | # 从训练集中识别候选脏数据 35 | python -u find_dirty_data.py --dataset_dir ./data --train_file train_5000.tsv --num_classes 2 --rest_path ./data/rest_train.tsv --init_from_ckpt ./checkpoint/model_state.pdparams --dirty_path ./data/dirty_train.tsv --dirty_num 500 36 | # dirty_num表示选取候选脏数据的数量 37 | # dirty_path表示候选脏数据的存储路径 38 | ``` 39 | 40 | 41 | **Step 3**:对候选脏数据(在我们的实验中,其占比为全部训练集10%)进行标注修正,修正后的数据保存在`correction_data.tsv`(数据修正比例为**38.4%**,随机选择数据其需要修正的数据比例为**5.0%**)。 42 | 43 | 44 | **Step 4**:使用修正后的新训练集`train_5000_correction.tsv`重新训练模型,并评估模型效果。 45 | ```shell 46 | # 下载数据:含train_5000_correction.tsv文件 47 | python -u train.py --dataset_dir ./data --train_file train_5000_correction.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./new_checkpoint 48 | ``` 49 | 50 | 由下表可知,候选脏数据修正(规模为原始训练集的10%)后,模型在LCQMC测试集上提升2.13%,在DuQM数据集上提升4.01%。 51 | 52 | 53 | | 数据集 | LCQMCdev | LCQMCtest | DuQM | 54 | | :-------: | :-----: | :-----: |:-----: | 55 | | 基线 | 86.42% | 84.87% | 69.51% | 56 | | 数据修正 | 87.76% | 86.62% | 73.18% | 57 | 58 | 注:以上结果均为10次实验的平均值。 59 | -------------------------------------------------------------------------------- /tutorials/dirty_data_identification/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | 17 | import paddle 18 | import paddle.nn.functional as F 19 | from paddlenlp.utils.log import logger 20 | 21 | 22 | @paddle.no_grad() 23 | def evaluate(model, criterion, metric, data_loader, name=''): 24 | """ 25 | Given a dataset, it evaluates model and computes the metric. 26 | Args: 27 | model(obj:`paddle.nn.Layer`): A model to classify texts. 28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss. 29 | metric(obj:`paddle.metric.Metric`): The evaluation metric. 30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches. 
31 | """ 32 | 33 | model.eval() 34 | metric.reset() 35 | losses = [] 36 | for batch in data_loader: 37 | input_ids, token_type_ids, labels = batch['input_ids'], batch[ 38 | 'token_type_ids'], batch['labels'] 39 | logits = model(input_ids, token_type_ids) 40 | loss = criterion(logits, labels) 41 | losses.append(loss.numpy()) 42 | correct = metric.compute(logits, labels) 43 | metric.update(correct) 44 | 45 | acc = metric.accumulate() 46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc)) 47 | model.train() 48 | metric.reset() 49 | 50 | return acc 51 | 52 | 53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False): 54 | """ 55 | Builds model inputs from a sequence for sequence classification tasks 56 | by concatenating and adding special tokens. 57 | 58 | Args: 59 | example(obj:`list[str]`): input data, containing text and label if it have label. 60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods. 62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization. 63 | Sequences longer than this will be truncated, sequences shorter will be padded. 64 | label_nums(obj:`int`): The number of the labels. 65 | Returns: 66 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels. 67 | """ 68 | if 'text_b' not in example: 69 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length) 70 | else: 71 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length) 72 | 73 | if not is_test: 74 | result["labels"] = np.array([example['label']], dtype='int64') 75 | return result 76 | -------------------------------------------------------------------------------- /tutorials/enhanced_by_rationale/README.md: -------------------------------------------------------------------------------- 1 | # 基于证据指导的模型增强方案 2 | ## 方法介绍 3 | 4 | 经过对多个模型预测依赖证据的评估,发现深度学习模型提供的证据合理性偏弱。为进一步提高证据的合理性,TrustAI提供了基于证据指导的模型增强方案([Jayaram etc. 
2021](https://aclanthology.org/2021.emnlp-main.450/)),即标注少量证据数据,通过联合学习原始任务和证据学习任务,用证据学习目标指导模型依赖合理的证据进行预测,提升模型可解释性。 5 | 6 | 7 | ## 实验步骤 8 | 9 | 实验基于ERNIE-2.0-EN-Base在英文情感分析SST数据集上微调得到基线模型,然后选择1000条训练数据进行证据人工标注,并基于这1000条数据进行证据学习。最终,在500条标有证据的验证数据上进行效果评估。评估指标除了模型预测准确率外,还包括可解释评估指标,即证据的合理性、充分性和完备性。 10 | 11 | 12 | 我们实验中使用数据可通过如下命令下载: 13 | ```shell 14 | # 下载样例数据,每个文件仅包含两条样例数据,开发者可根据样例数据的格式自行标注证据数据 15 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/rationale_data.tar && tar xf rationale_data.ta && rm rationale_data.ta 16 | ``` 17 | 18 | 数据下载后,通过`train.py`进行模型训练,该训练过程引入了证据学习目标,指导模型依赖合理的证据进行预测。 19 | ```shell 20 | python -u train.py --dataset_dir ./data --train_file train.tsv --dev_file dev.tsv --num_classes 2 --save_dir ./maw --use_maw 21 | # user_maw表示是否使用证据增强模型效果 22 | ``` 23 | 24 | 实验结果如下表,在加入证据指导后,模型预测效果略有提升,准确率提升0.5%;模型可解释性提升明显:证据合理性提升5.0%、充分性降低0.185(该指标越低越好)、完备性提升0.044。 25 | 26 | | 数据集 | 准确率 | 合理性 | 充分性 | 完备性 | 27 | | :-------: | :-----: | :-----: | :-----: | :-----: | 28 | | base | 93.5% | 26.1% | 0.367 | 0.118 | 29 | | base + maw loss | 94.0% | 31.1% | 0.182 | 0.162 | 30 | 31 | 注:以上结果均为3次实验的平均值。 32 | -------------------------------------------------------------------------------- /tutorials/enhanced_by_rationale/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | import itertools 17 | 18 | import paddle 19 | import paddle.nn.functional as F 20 | from paddlenlp.utils.log import logger 21 | 22 | 23 | @paddle.no_grad() 24 | def evaluate(model, criterion, metric, data_loader, name=''): 25 | """ 26 | Given a dataset, it evaluates model and computes the metric. 27 | Args: 28 | model(obj:`paddle.nn.Layer`): A model to classify texts. 29 | criterion(obj:`paddle.nn.Layer`): It can compute the loss. 30 | metric(obj:`paddle.metric.Metric`): The evaluation metric. 31 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches. 32 | """ 33 | 34 | model.eval() 35 | metric.reset() 36 | losses = [] 37 | for batch in data_loader: 38 | input_ids, token_type_ids, labels, _, _ = batch 39 | logits = model(input_ids, token_type_ids) 40 | loss = criterion(logits, labels) 41 | losses.append(loss.numpy()) 42 | correct = metric.compute(logits, labels) 43 | metric.update(correct) 44 | 45 | acc = metric.accumulate() 46 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc)) 47 | model.train() 48 | metric.reset() 49 | 50 | return acc 51 | 52 | 53 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False): 54 | """ 55 | Builds model inputs from a sequence for sequence classification tasks 56 | by concatenating and adding special tokens. 57 | 58 | Args: 59 | example(obj:`list[str]`): input data, containing text and label if it have label. 
60 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 61 | which contains most of the methods. Users should refer to the superclass for more information regarding methods. 62 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization. 63 | Sequences longer than this will be truncated, sequences shorter will be padded. 64 | is_test(obj:`bool`, optional): Whether the example comes from the test set and therefore has no label or rationales. 65 | Returns: 66 | A tuple `(input_ids, token_type_ids)` if `is_test` is True, otherwise `(input_ids, token_type_ids, label, rationales, attention_mask)`. 67 | """ 68 | if is_test: 69 | result = tokenizer(text=example['text_a'], max_seq_len=max_seq_length, return_attention_mask=True) 70 | return result['input_ids'], result['token_type_ids'] 71 | else: 72 | tokens = example['tokens'] 73 | rationales = example['rationales'] 74 | tokens = [tokenizer._tokenize(token) for token in tokens] 75 | assert len(tokens) == len(rationales) 76 | rationales = list( 77 | itertools.chain(*[[rationale] * len(sub_tokens) for sub_tokens, rationale in zip(tokens, rationales)])) 78 | tokens = list(itertools.chain(*tokens)) 79 | result = tokenizer(text=tokens, 80 | max_seq_len=max_seq_length, 81 | is_split_into_words=True, 82 | return_attention_mask=True) 83 | input_ids = result["input_ids"] 84 | token_type_ids = result["token_type_ids"] 85 | attention_mask = result["attention_mask"] 86 | seq_len = len(input_ids) 87 | rationales = [0] + rationales[:seq_len - 2] + [0] 88 | assert len(rationales) == seq_len 89 | label = np.array([example['label']], dtype="int64") 90 | return input_ids, token_type_ids, label, rationales, attention_mask 91 |
-------------------------------------------------------------------------------- /tutorials/redundancy_removal/args.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import argparse 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description=__doc__) 20 | parser.add_argument("--model_name", type=str, required=True, help="Name of pre-trained model.") 21 | parser.add_argument("--output_dir", 22 | type=str, 23 | required=True, 24 | help="The output directory where the model predictions and checkpoints will be written.") 25 | parser.add_argument( 26 | "--data_dir", 27 | type=str, 28 | required=True, 29 | help="The data directory should include `train` and `dev` set to train model and `test` set to test model.") 30 | parser.add_argument("--max_seq_length", 31 | default=512, 32 | type=int, 33 | help="The maximum total input sequence length after tokenization.") 34 | parser.add_argument("--batch_size", default=24, type=int, help="Batch size per GPU/CPU for training.") 35 | parser.add_argument("--learning_rate", default=7e-5, type=float, help="The initial learning rate for Adam.") 36 | parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") 37 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") 38 | parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 39 | parser.add_argument("--num_train_epochs", default=20, type=int, help="Total number of train epochs to perform.") 40 | parser.add_argument("--max_steps", 41 | default=-1, 42 | type=int, 43 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 44 | parser.add_argument("--warmup_proportion", 45 | default=0.0, 46 | type=float, 47 | help="Proportion of training steps to perform linear learning rate warmup for.") 48 | parser.add_argument("--logging_steps", type=int, default=10, help="Log every X updates steps.") 49 | parser.add_argument("--save_steps", type=int, default=200, help="Save checkpoint every X updates steps.") 50 | parser.add_argument("--load_model_path", type=str, default=None, help="The checkpoint directory where the model") 51 | parser.add_argument("--get_k_sentences", type=int, default=0, help="load checkpoint path") 52 | parser.add_argument("--set_k_sentences_ground_true", type=int, default=0, help="set k sentences ground true") 53 | parser.add_argument("--early_stop_nums", type=int, default=5, help="probability threshold for selecting sentences") 54 | parser.add_argument("--one_alpha", type=float, default=0.4, help="probability threshold for selecting sentences") 55 | parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") 56 | parser.add_argument('--device', 57 | choices=['cpu', 'gpu'], 58 | default="gpu", 59 | help="Select which device to train model, defaults to gpu.") 60 | parser.add_argument("--doc_stride", 61 | type=int, 62 | default=128, 63 | help="When splitting up a long document into chunks, how much stride to take between chunks.") 64 | parser.add_argument( 65 | "--n_best_size", 66 | type=int, 67 | default=20, 68 | help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") 69 | parser.add_argument("--max_query_length", type=int, default=64, help="Max query length.") 70 | parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.") 71 | parser.add_argument("--hidden_size", type=int, default=768, help="hidden size") 72 | parser.add_argument("--verbose", action='store_true', help="Whether to output verbose log.") 73 | parser.add_argument("--do_train", action='store_true', 
help="Whether to train the model.") 74 | parser.add_argument("--do_predict", action='store_true', help="Whether to predict.") 75 | parser.add_argument("--use_loose_metric", action='store_true', help="whether to use loose metric to choose model.") 76 | parser.add_argument("--use_similarity", action='store_true', help="whether to use similarity to choose sentence.") 77 | parser.add_argument("--early_stop", action='store_true', help="whether to use early stop.") 78 | args = parser.parse_args() 79 | return args 80 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Download dataset and model parameters 3 | set -e 4 | 5 | if [ ! -d "./data" ]; then 6 | mkdir "./data" 7 | fi 8 | 9 | echo "Download DuReader-robust dataset" 10 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/dureader_robust/data/dureader_robust-data.tar.gz 11 | tar -zxvf dureader_robust-data.tar.gz 12 | mv dureader_robust-data data/robust 13 | rm dureader_robust-data.tar.gz 14 | 15 | echo "Download DuReader-checklist dataset" 16 | wget --no-check-certificate https://dataset-bj.cdn.bcebos.com/lic2021/dureader_checklist.dataset.tar.gz 17 | tar -zxvf dureader_checklist.dataset.tar.gz 18 | mv dataset data/checklist 19 | rm dureader_checklist.dataset.tar.gz 20 | mkdir ./data/checklist_wo_no_answer 21 | python ./utils/checklist_process.py --input_data_dir ./data/checklist --output_data_dir ./data/checklist_wo_no_answer -------------------------------------------------------------------------------- /tutorials/redundancy_removal/predictor/model.py: -------------------------------------------------------------------------------- 1 | import paddle 2 | from paddlenlp.transformers import AutoModelForQuestionAnswering 3 | 4 | 5 | class Predictor(paddle.nn.Layer): 6 | 7 | def __init__(self, args): 8 | super(Predictor, self).__init__() 9 | self.model = AutoModelForQuestionAnswering.from_pretrained(args.model_name) 10 | 11 | def forward(self, x, mode="train"): 12 | data = { 13 | "input_ids": x[0], 14 | "token_type_ids": x[1], 15 | } 16 | if mode == "train": 17 | data["start_positions"] = x[2] 18 | data["end_positions"] = x[3] 19 | 20 | logits = self.model(input_ids=data["input_ids"], token_type_ids=data["token_type_ids"]) 21 | 22 | if mode == "dev" or mode == "test": 23 | return logits 24 | 25 | # Compute loss 26 | start_logits, end_logits = logits 27 | start_position = paddle.unsqueeze(data["start_positions"], axis=-1) 28 | end_position = paddle.unsqueeze(data["end_positions"], axis=-1) 29 | start_loss = paddle.nn.functional.cross_entropy(input=start_logits, label=start_position) 30 | end_loss = paddle.nn.functional.cross_entropy(input=end_logits, label=end_position) 31 | loss = (start_loss + end_loss) / 2 32 | return loss, logits 33 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | paddlenlp 3 | paddlepaddle-gpu >= 2.0.0 4 | scikit-learn 5 | tqdm 6 | matplotlib 7 | IPython 8 | pre-commit -------------------------------------------------------------------------------- /tutorials/redundancy_removal/run_predict.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import random 16 | 17 | import numpy as np 18 | import paddle 19 | 20 | from args import parse_args 21 | from predictor import model, model_manager 22 | 23 | 24 | def set_seed(args): 25 | random.seed(args.seed) 26 | np.random.seed(args.seed) 27 | paddle.seed(args.seed) 28 | 29 | 30 | def run(args): 31 | paddle.set_device(args.device) 32 | if paddle.distributed.get_world_size() > 1: 33 | paddle.distributed.init_parallel_env() 34 | rank = paddle.distributed.get_rank() 35 | 36 | set_seed(args) 37 | 38 | predictor = model.Predictor(args) 39 | if paddle.distributed.get_world_size() > 1: 40 | predictor = paddle.DataParallel(predictor) 41 | 42 | # Prepare model manager 43 | manager = model_manager.ModelManager(args, predictor) 44 | 45 | if args.do_train: 46 | manager.train(rank) 47 | if args.do_predict and rank == 0: 48 | manager.test() 49 | 50 | 51 | if __name__ == "__main__": 52 | args = parse_args() 53 | run(args) 54 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/run_select.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import random 16 | 17 | import numpy as np 18 | import paddle 19 | 20 | from args import parse_args 21 | from selector import model, model_manager 22 | 23 | 24 | def set_seed(args): 25 | random.seed(args.seed) 26 | np.random.seed(args.seed) 27 | paddle.seed(args.seed) 28 | 29 | 30 | def run(args): 31 | set_seed(args) 32 | 33 | # Prepare device and model 34 | paddle.set_device(args.device) 35 | if paddle.distributed.get_world_size() > 1: 36 | paddle.distributed.init_parallel_env() 37 | rank = paddle.distributed.get_rank() 38 | 39 | selector = model.Selector(args) 40 | if paddle.distributed.get_world_size() > 1: 41 | selector = paddle.DataParallel(selector) 42 | 43 | # Prepare model manager 44 | manager = model_manager.ModelManager(args, selector) 45 | 46 | if args.do_train: 47 | manager.train(rank) 48 | 49 | if args.do_predict and rank == 0: 50 | manager.test() 51 | 52 | 53 | if __name__ == "__main__": 54 | args = parse_args() 55 | run(args) 56 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | data_dir="./data/robust" 3 | output_dir="./output" 4 | load_selector_model_dir="./" 5 | load_predictor_model_dir="./" 6 | split="dev" 7 | 8 | ARGS=`getopt -a -o :d:o:s:p:S:h -l data_dir:,output_dir:,selector_model_dir:,predictor_model_dir:,split:,help -- "$@"` 9 | eval set -- "${ARGS}" 10 | while true 11 | do 12 | case "$1" in 13 | -d|--data_dir) 14 | data_dir="$2" 15 | shift 16 | ;; 17 | -o|--output_dir) 18 | output_dir="$2" 19 | shift 20 | ;; 21 | -s|--selector_model_dir) 22 | load_selector_model_dir="$2" 23 | shift 24 | ;; 25 | -p|--predictor_model_dir) 26 | load_predictor_model_dir="$2" 27 | shift 28 | ;; 29 | -S|--split) 30 | split="$2" 31 | shift 32 | ;; 33 | -h|--help) 34 | echo "help" 35 | ;; 36 | --) 37 | shift 38 | break 39 | ;; 40 | esac 41 | shift 42 | done 43 | 44 | if [ ! -d "./cache" ]; then 45 | mkdir "./cache" 46 | fi 47 | if [ ! -d "${output_dir}" ]; then 48 | mkdir "${output_dir}" 49 | fi 50 | if [ ! -d "${output_dir}/selected-test-data" ]; then 51 | mkdir "${output_dir}/selected-test-data" 52 | fi 53 | if [ $split = "dev" ]; then 54 | if [ ! 
-d "${output_dir}/tmp" ]; then 55 | mkdir "${output_dir}/tmp" 56 | fi 57 | cp "${data_dir}/dev.json" "${output_dir}/tmp/test.json" 58 | echo "Data path: ${data_dir}/dev.json" 59 | data_dir="${output_dir}/tmp" 60 | else 61 | echo "Data path: ${data_dir}/test.json" 62 | fi 63 | 64 | echo "Output path: ${output_dir}/selected-test-data" 65 | 66 | python -u run_select.py \ 67 | --model_name hfl/roberta-wwm-ext \ 68 | --do_predict \ 69 | --batch_size 24 \ 70 | --data_dir $data_dir \ 71 | --load_model_path "${load_selector_model_dir}/model_state.pdparams" \ 72 | --one_alpha 0.1 \ 73 | --output_dir "${output_dir}/selected-test-data/" \ 74 | --device gpu 75 | 76 | mv "${output_dir}/selected-test-data/test_prediction.json" "${output_dir}/selected-test-data/test.json" 77 | 78 | python -u run_predict.py \ 79 | --model_name hfl/roberta-wwm-ext \ 80 | --do_predict \ 81 | --batch_size 24 \ 82 | --max_seq_length 384 \ 83 | --data_dir "${output_dir}/selected-test-data/" \ 84 | --load_model_path "${load_predictor_model_dir}/model_state.pdparams" \ 85 | --output_dir "${output_dir}/predict-result/" \ 86 | --device gpu 87 | 88 | 89 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/train.sh: -------------------------------------------------------------------------------- 1 | 2 | data_dir="./data/robust" 3 | output_dir="./output" 4 | 5 | # Read Parameter 6 | # -d data dir path 7 | # -o output_dir 8 | while getopts ":d:o:" optname 9 | do 10 | case "$optname" in 11 | "d") 12 | data_dir=$OPTARG 13 | ;; 14 | "o") 15 | output_dir=$OPTARG 16 | ;; 17 | ":") 18 | echo "No argument value for option $OPTARG" 19 | ;; 20 | "?") 21 | echo "Unknown option $OPTARG" 22 | ;; 23 | *) 24 | echo "Unknown error while processing options" 25 | ;; 26 | esac 27 | done 28 | 29 | echo "Data path: ${data_dir}" 30 | echo "Selector Output path: ${output_dir}/selector/" 31 | 32 | # prepare dir 33 | 34 | # clean cache 35 | if [ ! -d "./cache" ]; then 36 | mkdir "./cache" 37 | else 38 | rm -rf "./cache" 39 | mkdir "./cache" 40 | fi 41 | if [ ! -d "${output_dir}" ]; then 42 | mkdir "${output_dir}" 43 | fi 44 | if [ ! -d "${output_dir}/selector" ]; then 45 | mkdir "${output_dir}/selector" 46 | fi 47 | if [ ! -d "${output_dir}/selected-data" ]; then 48 | mkdir "${output_dir}/selected-data" 49 | fi 50 | if [ ! -d "${output_dir}/predictor" ]; then 51 | mkdir "${output_dir}/predictor" 52 | fi 53 | if [ ! 
-d "${output_dir}/tmp" ]; then 54 | mkdir "${output_dir}/tmp" 55 | fi 56 | 57 | echo "########## Selector Training #############" 58 | 59 | # train selector 60 | python -u ./run_select.py \ 61 | --model_name hfl/roberta-wwm-ext \ 62 | --max_seq_length 512 \ 63 | --batch_size 24 \ 64 | --learning_rate 8e-5 \ 65 | --num_train_epochs 100 \ 66 | --logging_steps 10 \ 67 | --save_steps 200 \ 68 | --warmup_proportion 0.1 \ 69 | --weight_decay 0.01 \ 70 | --output_dir "${output_dir}/selector/" \ 71 | --data_dir ${data_dir} \ 72 | --set_k_sentences_ground_true 0 \ 73 | --early_stop_nums 5 \ 74 | --one_alpha -1 \ 75 | --do_train \ 76 | --use_loose_metric \ 77 | --early_stop \ 78 | --device gpu 79 | 80 | 81 | echo "########## Dev Processing #############" 82 | cp "${data_dir}/dev.json" "${output_dir}/tmp/test.json" 83 | 84 | # predict selector 85 | python -u ./run_select.py \ 86 | --model_name hfl/roberta-wwm-ext \ 87 | --max_seq_length 512 \ 88 | --batch_size 24 \ 89 | --load_model_path "${output_dir}/selector/best_model/model_state.pdparams" \ 90 | --data_dir "${output_dir}/tmp/" \ 91 | --output_dir "${output_dir}/selector/" \ 92 | --set_k_sentences_ground_true 0 \ 93 | --one_alpha 0.1 \ 94 | --do_predict \ 95 | --use_loose_metric \ 96 | --device gpu 97 | 98 | rm -rf "${output_dir}/tmp" 99 | # Postprocess selected data 100 | temp_dir="${data_dir}/*" 101 | cp -f ${temp_dir} "${output_dir}/selected-data" 102 | rm -f "${output_dir}/selected-data/dev.json" 103 | origin_dev_path="${output_dir}/selector/test_prediction.json" 104 | mv ${origin_dev_path} "${output_dir}/selected-data/dev.json" 105 | 106 | 107 | rm -rf "${output_dir}/tmp" 108 | # Postprocess selected data 109 | temp_dir="${data_dir}/*" 110 | cp -f ${temp_dir} "${output_dir}/selected-data" 111 | rm -f "${output_dir}/selected-data/dev.json" 112 | origin_dev_path="${output_dir}/selector/test_prediction.json" 113 | mv ${origin_dev_path} "${output_dir}/selected-data/dev.json" 114 | 115 | echo "########## Predictor Training #############" 116 | 117 | # Train predictor 118 | python -u run_predict.py \ 119 | --model_name hfl/roberta-wwm-ext \ 120 | --max_seq_length 384 \ 121 | --batch_size 24 \ 122 | --learning_rate 5e-5 \ 123 | --num_train_epochs 8 \ 124 | --logging_steps 10 \ 125 | --save_steps 100 \ 126 | --warmup_proportion 0.1 \ 127 | --weight_decay 0.01 \ 128 | --output_dir "$output_dir/predictor/" \ 129 | --data_dir "$output_dir/selected-data/" \ 130 | --do_train \ 131 | --device gpu 132 | 133 | python -u run_predict.py \ 134 | --model_name hfl/roberta-wwm-ext \ 135 | --max_seq_length 384 \ 136 | --batch_size 24 \ 137 | --learning_rate 5e-5 \ 138 | --num_train_epochs 8 \ 139 | --logging_steps 10 \ 140 | --save_steps 200 \ 141 | --warmup_proportion 0.1 \ 142 | --weight_decay 0.01 \ 143 | --output_dir "$output_dir/predictor/" \ 144 | --data_dir "$output_dir/selected-data/" \ 145 | --do_predict \ 146 | --device gpu -------------------------------------------------------------------------------- /tutorials/redundancy_removal/train_predictor.sh: -------------------------------------------------------------------------------- 1 | data_dir="./data/robust" 2 | output_dir="./output" 3 | 4 | # Read Parameter 5 | # -d data dir path 6 | # -o output_dir 7 | while getopts ":d:o:" optname 8 | do 9 | case "$optname" in 10 | "d") 11 | data_dir=$OPTARG 12 | ;; 13 | "o") 14 | output_dir=$OPTARG 15 | ;; 16 | ":") 17 | echo "No argument value for option $OPTARG" 18 | ;; 19 | "?") 20 | echo "Unknown option $OPTARG" 21 | ;; 22 | *) 23 | echo "Unknown error while 
processing options" 24 | ;; 25 | esac 26 | done 27 | 28 | echo "Data path: ${data_dir}" 29 | 30 | # prepare dir 31 | 32 | # clean cache 33 | if [ ! -d "./cache" ]; then 34 | mkdir "./cache" 35 | else 36 | rm -rf "./cache" 37 | mkdir "./cache" 38 | fi 39 | if [ ! -d "${output_dir}" ]; then 40 | mkdir "${output_dir}" 41 | fi 42 | if [ ! -d "${output_dir}/selector" ]; then 43 | mkdir "${output_dir}/selector" 44 | fi 45 | if [ ! -d "${output_dir}/selected-data" ]; then 46 | mkdir "${output_dir}/selected-data" 47 | fi 48 | if [ ! -d "${output_dir}/predictor" ]; then 49 | mkdir "${output_dir}/predictor" 50 | fi 51 | if [ ! -d "${output_dir}/tmp" ]; then 52 | mkdir "${output_dir}/tmp" 53 | fi 54 | 55 | echo "########## Predictor Training #############" 56 | 57 | # Train predictor 58 | python -u run_predict.py \ 59 | --model_name hfl/roberta-wwm-ext \ 60 | --max_seq_length 384 \ 61 | --batch_size 24 \ 62 | --learning_rate 5e-5 \ 63 | --num_train_epochs 8 \ 64 | --logging_steps 10 \ 65 | --save_steps 100 \ 66 | --warmup_proportion 0.1 \ 67 | --weight_decay 0.01 \ 68 | --output_dir "$output_dir/predictor/" \ 69 | --data_dir "$output_dir/selected-data/" \ 70 | --do_train \ 71 | --device gpu 72 | 73 | python -u run_predict.py \ 74 | --model_name hfl/roberta-wwm-ext \ 75 | --max_seq_length 384 \ 76 | --batch_size 24 \ 77 | --learning_rate 5e-5 \ 78 | --num_train_epochs 8 \ 79 | --logging_steps 10 \ 80 | --save_steps 200 \ 81 | --warmup_proportion 0.1 \ 82 | --weight_decay 0.01 \ 83 | --output_dir "$output_dir/predictor/" \ 84 | --data_dir "$output_dir/selected-data/" \ 85 | --do_predict \ 86 | --device gpu -------------------------------------------------------------------------------- /tutorials/redundancy_removal/train_select_data.sh: -------------------------------------------------------------------------------- 1 | 2 | data_dir="./data/robust" 3 | output_dir="./output" 4 | 5 | # Read Parameter 6 | # -d data dir path 7 | # -o output_dir 8 | while getopts ":d:o:" optname 9 | do 10 | case "$optname" in 11 | "d") 12 | data_dir=$OPTARG 13 | ;; 14 | "o") 15 | output_dir=$OPTARG 16 | ;; 17 | ":") 18 | echo "No argument value for option $OPTARG" 19 | ;; 20 | "?") 21 | echo "Unknown option $OPTARG" 22 | ;; 23 | *) 24 | echo "Unknown error while processing options" 25 | ;; 26 | esac 27 | done 28 | 29 | echo "Data path: ${data_dir}" 30 | echo "Selector Output path: ${output_dir}/selector/" 31 | 32 | # prepare dir 33 | 34 | # clean cache 35 | if [ ! -d "./cache" ]; then 36 | mkdir "./cache" 37 | else 38 | rm -rf "./cache" 39 | mkdir "./cache" 40 | fi 41 | if [ ! -d "${output_dir}" ]; then 42 | mkdir "${output_dir}" 43 | fi 44 | if [ ! -d "${output_dir}/selector" ]; then 45 | mkdir "${output_dir}/selector" 46 | fi 47 | if [ ! -d "${output_dir}/selected-data" ]; then 48 | mkdir "${output_dir}/selected-data" 49 | fi 50 | if [ ! -d "${output_dir}/predictor" ]; then 51 | mkdir "${output_dir}/predictor" 52 | fi 53 | if [ ! 
-d "${output_dir}/tmp" ]; then 54 | mkdir "${output_dir}/tmp" 55 | fi 56 | 57 | 58 | echo "########## Dev Processing #############" 59 | cp "${data_dir}/dev.json" "${output_dir}/tmp/test.json" 60 | 61 | # predict selector 62 | python -u ./run_select.py \ 63 | --model_name hfl/roberta-wwm-ext \ 64 | --max_seq_length 512 \ 65 | --batch_size 24 \ 66 | --load_model_path "${output_dir}/selector/best_model/model_state.pdparams" \ 67 | --data_dir "${output_dir}/tmp/" \ 68 | --output_dir "${output_dir}/selector/" \ 69 | --set_k_sentences_ground_true 0 \ 70 | --one_alpha 0.1 \ 71 | --do_predict \ 72 | --use_loose_metric \ 73 | --device gpu 74 | 75 | rm -rf "${output_dir}/tmp" 76 | # Postprocess selected data 77 | temp_dir="${data_dir}/*" 78 | cp -f ${temp_dir} "${output_dir}/selected-data" 79 | rm -f "${output_dir}/selected-data/dev.json" 80 | origin_dev_path="${output_dir}/selector/test_prediction.json" 81 | mv ${origin_dev_path} "${output_dir}/selected-data/dev.json" 82 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/train_selector.sh: -------------------------------------------------------------------------------- 1 | 2 | data_dir="./data/robust" 3 | output_dir="./output" 4 | 5 | # Read Parameter 6 | # -d data dir path 7 | # -o output_dir 8 | while getopts ":d:o:" optname 9 | do 10 | case "$optname" in 11 | "d") 12 | data_dir=$OPTARG 13 | ;; 14 | "o") 15 | output_dir=$OPTARG 16 | ;; 17 | ":") 18 | echo "No argument value for option $OPTARG" 19 | ;; 20 | "?") 21 | echo "Unknown option $OPTARG" 22 | ;; 23 | *) 24 | echo "Unknown error while processing options" 25 | ;; 26 | esac 27 | done 28 | 29 | echo "Data path: ${data_dir}" 30 | echo "Selector Output path: ${output_dir}/selector/" 31 | 32 | # prepare dir 33 | 34 | # clean cache 35 | if [ ! -d "./cache" ]; then 36 | mkdir "./cache" 37 | else 38 | rm -rf "./cache" 39 | mkdir "./cache" 40 | fi 41 | if [ ! -d "${output_dir}" ]; then 42 | mkdir "${output_dir}" 43 | fi 44 | if [ ! -d "${output_dir}/selector" ]; then 45 | mkdir "${output_dir}/selector" 46 | fi 47 | if [ ! -d "${output_dir}/selected-data" ]; then 48 | mkdir "${output_dir}/selected-data" 49 | fi 50 | if [ ! -d "${output_dir}/predictor" ]; then 51 | mkdir "${output_dir}/predictor" 52 | fi 53 | if [ ! 
-d "${output_dir}/tmp" ]; then 54 | mkdir "${output_dir}/tmp" 55 | fi 56 | 57 | echo "########## Selector Training #############" 58 | 59 | # train selector 60 | python -u ./run_select.py \ 61 | --model_name hfl/roberta-wwm-ext \ 62 | --max_seq_length 512 \ 63 | --batch_size 24 \ 64 | --learning_rate 8e-5 \ 65 | --num_train_epochs 100 \ 66 | --logging_steps 10 \ 67 | --save_steps 200 \ 68 | --warmup_proportion 0.1 \ 69 | --weight_decay 0.01 \ 70 | --output_dir "${output_dir}/selector/" \ 71 | --data_dir ${data_dir} \ 72 | --set_k_sentences_ground_true 0 \ 73 | --early_stop_nums 5 \ 74 | --one_alpha -1 \ 75 | --do_train \ 76 | --use_loose_metric \ 77 | --early_stop \ 78 | --device gpu -------------------------------------------------------------------------------- /tutorials/redundancy_removal/utils/checklist_process.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tools 3 | import argparse 4 | import os 5 | 6 | 7 | def process(input_data_path, output_data_path): 8 | with open(input_data_path, 'r', encoding='utf-8') as f: 9 | obj = json.load(f) 10 | for i, data in enumerate(obj['data']): 11 | x = [] 12 | for j, paragraphs in enumerate(data["paragraphs"]): 13 | for k, ans in enumerate(paragraphs["qas"][0]["answers"]): 14 | answer = ans["text"] 15 | if answer != "" and len(tools.split_sentence(answer)) == 1: 16 | x.append(paragraphs) 17 | else: 18 | break 19 | obj["data"][i]["paragraphs"] = x 20 | 21 | with open(output_data_path, 'w+', encoding="utf8") as outfile: 22 | json.dump(obj, outfile, ensure_ascii=False) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser(description=__doc__) 27 | parser.add_argument("--input_data_dir", type=str, required=True, help="checklist input data dir") 28 | parser.add_argument("--output_data_dir", type=str, required=True, help="checklist output data dir") 29 | args = parser.parse_args() 30 | process(os.path.join(args.input_data_dir, 'train.json'), os.path.join(args.output_data_dir, 'train.json')) 31 | process(os.path.join(args.input_data_dir, 'dev.json'), os.path.join(args.output_data_dir, 'dev.json')) 32 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/utils/dureader_robust.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 3 | # Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
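# NOTE: The module below is a dataset loading script for the HuggingFace `datasets` library.
# Its `DureaderRobust` builder reads SQuAD-style train.json / dev.json / test.json files from
# `config.data_dir` and yields examples with "id", "title", "context", "question" and "answers"
# fields. How the script is invoked (presumably via `datasets.load_dataset` in the selector/
# predictor dataloader_factory modules, which are not shown in this file) is an assumption.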
16 | 17 | # Lint as: python3 18 | 19 | import json 20 | import os 21 | 22 | import datasets 23 | from datasets.tasks import QuestionAnsweringExtractive 24 | 25 | logger = datasets.logging.get_logger(__name__) 26 | 27 | _DESCRIPTION = """\ 28 | DureaderRobust is a chinese reading comprehension \ 29 | dataset, designed to evaluate the MRC models from \ 30 | three aspects: over-sensitivity, over-stability \ 31 | and generalization. 32 | """ 33 | 34 | _URL = "https://bj.bcebos.com/paddlenlp/datasets/dureader_robust-data.tar.gz" 35 | 36 | 37 | class DureaderRobustConfig(datasets.BuilderConfig): 38 | """BuilderConfig for DureaderRobust.""" 39 | data_dir: str = None 40 | do_train: bool = True 41 | do_predict: bool = True 42 | 43 | def __init__(self, **kwargs): 44 | """BuilderConfig for DureaderRobust. 45 | 46 | Args: 47 | **kwargs: keyword arguments forwarded to super. 48 | """ 49 | super(DureaderRobustConfig, self).__init__(**kwargs) 50 | 51 | 52 | class DureaderRobust(datasets.GeneratorBasedBuilder): 53 | BUILDER_CONFIGS = [ 54 | DureaderRobustConfig( 55 | name="plain_text", 56 | version=datasets.Version("1.0.0", ""), 57 | description="Plain text", 58 | ), 59 | ] 60 | 61 | def _info(self): 62 | return datasets.DatasetInfo( 63 | description=_DESCRIPTION, 64 | features=datasets.Features({ 65 | "id": 66 | datasets.Value("string"), 67 | "title": 68 | datasets.Value("string"), 69 | "context": 70 | datasets.Value("string"), 71 | "question": 72 | datasets.Value("string"), 73 | "answers": 74 | datasets.features.Sequence({ 75 | "text": datasets.Value("string"), 76 | "answer_start": datasets.Value("int32"), 77 | }), 78 | }), 79 | # No default supervised_keys (as we have to pass both question 80 | # and context as input). 81 | supervised_keys=None, 82 | homepage="https://arxiv.org/abs/2004.11142", 83 | task_templates=[ 84 | QuestionAnsweringExtractive(question_column="question", 85 | context_column="context", 86 | answers_column="answers") 87 | ], 88 | ) 89 | 90 | def _split_generators(self, dl_manager): 91 | # dl_dir = dl_manager.download_and_extract(_URL) 92 | result = [] 93 | if self.config.do_train: 94 | result += [ 95 | datasets.SplitGenerator(name=datasets.Split.TRAIN, 96 | gen_kwargs={"filepath": os.path.join(self.config.data_dir, "train.json")}), 97 | datasets.SplitGenerator(name=datasets.Split.VALIDATION, 98 | gen_kwargs={"filepath": os.path.join(self.config.data_dir, "dev.json")}), 99 | ] 100 | if self.config.do_predict: 101 | result.append( 102 | datasets.SplitGenerator(name=datasets.Split.TEST, 103 | gen_kwargs={"filepath": os.path.join(self.config.data_dir, 'test.json')})) 104 | return result 105 | 106 | def _generate_examples(self, filepath): 107 | """This function returns the examples in the raw (text) form.""" 108 | logger.info("generating examples from = %s", filepath) 109 | key = 0 110 | with open(filepath, encoding="utf-8") as f: 111 | durobust = json.load(f) 112 | for article in durobust["data"]: 113 | title = article.get("title", "") 114 | for paragraph in article["paragraphs"]: 115 | context = paragraph["context"] # do not strip leading blank spaces GH-2585 116 | for qa in paragraph["qas"]: 117 | answer_starts = [answer["answer_start"] for answer in qa.get("answers", '')] 118 | answers = [answer["text"] for answer in qa.get("answers", '')] 119 | # Features currently used are "context", "question", and "answers". 120 | # Others are extracted here for the ease of future expansions. 
121 | yield key, { 122 | "title": title, 123 | "context": context, 124 | "question": qa["question"], 125 | "id": qa["id"], 126 | "answers": { 127 | "answer_start": answer_starts, 128 | "text": answers, 129 | }, 130 | } 131 | key += 1 132 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import os 17 | import time 18 | 19 | 20 | class Logger(): 21 | 22 | def __init__(self, args): 23 | self.args = args 24 | self.tic_train = time.time() 25 | self.performance_list = [] 26 | self.mode_dict = { 27 | "ERROR": "31", 28 | "INFO": "32", 29 | "WARNING": "33", 30 | } 31 | 32 | def __get_time_str(self): 33 | return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "," + str( 34 | int(round(time.time() * 1000)) % 1000).zfill(3) 35 | 36 | def __log(self, log_str, mode="INFO"): 37 | print("\033[" + self.mode_dict[mode] + "m[" + self.__get_time_str() + "] [ " + mode + "]\033[0m - " + 38 | log_str) 39 | 40 | def info(self, info_str: str): 41 | self.__log(info_str, mode="INFO") 42 | 43 | def error(self, info_str: str): 44 | self.__log(info_str, mode="ERROR") 45 | 46 | def warn(self, info_str: str): 47 | self.__log(info_str, mode="WARNING") 48 | 49 | def logging_step(self, global_step, epoch, step, loss): 50 | if global_step % self.args.logging_steps == 0: 51 | self.info("Global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % 52 | (global_step, epoch + 1, step + 1, loss, self.args.logging_steps / 53 | (time.time() - self.tic_train))) 54 | self.tic_train = time.time() 55 | 56 | def add_performance(self, performance): 57 | self.performance_list.append(performance) 58 | 59 | def save_performance(self): 60 | output_dir = os.path.join(self.args.output_dir, "logging.json") 61 | with open(output_dir, "w", encoding="utf8") as f: 62 | json.dump(self.performance_list, f) 63 | 64 | def logging_result(self, x): 65 | print(x) 66 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/utils/predict.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from paddlenlp.metrics.squad import squad_evaluate 5 | 6 | 7 | def get_data(filepath): 8 | with open(filepath, encoding="utf-8") as f: 9 | durobust = json.load(f) 10 | data = [] 11 | for article in durobust["data"]: 12 | title = article.get("title", "") 13 | for paragraph in article["paragraphs"]: 14 | context = paragraph["context"] # do not strip leading blank spaces GH-2585 15 | for qa in paragraph["qas"]: 16 | answer_starts = [answer["answer_start"] for answer in qa.get("answers", '')] 17 | answers = [answer["text"] for answer in qa.get("answers", '')] 18 | # Features currently used are 
"context", "question", and "answers". 19 | # Others are extracted here for the ease of future expansions. 20 | data.append({ 21 | "title": title, 22 | "context": context, 23 | "question": qa["question"], 24 | "id": qa["id"], 25 | "answers": { 26 | "answer_start": answer_starts, 27 | "text": answers, 28 | }, 29 | }) 30 | return data 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser(description=__doc__) 35 | parser.add_argument("--test_data_dir", type=str, required=True, help="test data dir") 36 | parser.add_argument("--pred_data_dir", type=str, required=True, help="prediction data dir") 37 | args = parser.parse_args() 38 | raw_datasets = get_data(args.test_data_dir) 39 | 40 | with open(args.pred_data_dir, encoding="utf-8") as f: 41 | all_predictions = json.load(f) 42 | 43 | result = squad_evaluate(examples=raw_datasets, preds=all_predictions, is_whitespace_splited=False) 44 | -------------------------------------------------------------------------------- /tutorials/redundancy_removal/utils/tools.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | 17 | def remove_None(str_list: list) -> list: 18 | return [t for t in str_list if t is not None] 19 | 20 | 21 | def remove_blank_str(str_list: list) -> list: 22 | return [t for t in str_list if t != ""] 23 | 24 | 25 | def process_single_sign(str_list: list) -> list: 26 | x = [] 27 | for i in str_list: 28 | if i in ["|", "!", "。", "!", "?", "\\x0d", ";", ";", "?", "!"]: 29 | continue 30 | x.append(i) 31 | return x 32 | 33 | 34 | def strip_str(s: str) -> str: 35 | return s.strip().strip("\n").strip("\\x0d") 36 | 37 | 38 | def print_red(s: str, flag=True): 39 | if flag: 40 | print('\033[31m' + s + '\033[0m') 41 | 42 | 43 | def split_sentence(s: str, remove_illegal_sign=True) -> list: 44 | # s.replace(" ","。") 45 | c_list = re.split('(。|!|?|\\x0d|;|;|\?|!|\||\.{2,}|[\u4E00-\u9FA5]\.{1,} *|\. 
)', s) 46 | c_list.append("") 47 | c_list = ["".join(i) for i in zip(c_list[0::2], c_list[1::2])] 48 | if remove_illegal_sign: 49 | c_list = remove_None(c_list) 50 | c_list = process_single_sign(c_list) 51 | c_list = remove_blank_str(c_list) 52 | return [strip_str(c) for c in c_list] 53 | else: 54 | return c_list 55 | 56 | 57 | def batchify(dataset, batch_size): 58 | batch = [] 59 | for i, data in enumerate(dataset): 60 | if (i % batch_size == 0): 61 | batch.append([data]) 62 | else: 63 | batch[int(i / batch_size)].append(data) 64 | return batch 65 | 66 | 67 | def padding_sentence(data, padding_value=0): 68 | max_x_len = 0 69 | max_y_len = 0 70 | for x in data: 71 | if len(x) > max_x_len: 72 | max_x_len = len(x) 73 | for y in x: 74 | if len(y) > max_y_len: 75 | max_y_len = len(y) 76 | for x in data: 77 | x += [[0] * max_y_len] * (max_x_len - len(x)) 78 | for y in x: 79 | y += [0] * (max_y_len - len(y)) 80 | return data 81 | 82 | 83 | def padding_batch(data, padding_value=0): 84 | for x in data: 85 | max_y_len = 0 86 | for y in x: 87 | if len(y) > max_y_len: 88 | max_y_len = len(y) 89 | for y in x: 90 | y += [0] * (max_y_len - len(y)) 91 | return data 92 |
-------------------------------------------------------------------------------- /tutorials/sparse_data_identification/README.md: --------------------------------------------------------------------------------
1 | # Identifying Insufficient Training-Data Coverage and Effective Data Augmentation 2 | 3 | ## Method 4 | 5 | When the training data does not cover part of the input space, the model performs poorly on the corresponding test data. Adding training data is the first choice for improving a model, but annotation is time-consuming and expensive, so how to obtain the largest improvement while labeling as little data as possible is a problem most NLP developers face. 6 | 7 |

8 | [figure omitted] 9 | Figure 1: Workflow of insufficient-coverage identification and effective data augmentation 10 |
11 | 12 | Following the workflow in Figure 1, TrustAI improves the model with as little annotation as possible. First, the example-level evidence-analysis methods from TrustAI's interpretability module are used to identify, among the test data, the samples whose predictions suffer from insufficient training-data coverage; these form the target set (i.e. test data with insufficient evidence). Then, unlabeled examples that can serve as prediction evidence for the target set are selected for annotation. Finally, the newly annotated data is added to the training set and the model is retrained. 13 | 14 | Note: developers can try this case quickly in the [AI Studio demo](https://aistudio.baidu.com/aistudio/projectdetail/4434403). 15 | 16 | ## Experimental Steps 17 | 18 | We use a simulated experiment on the LCQMC similarity-matching dataset to walk through the steps and the effect of this approach. 19 | 20 | **Step 1**: Randomly sample 5,000 examples from the LCQMC training data as the new training set and keep the rest as unlabeled data. Fine-tune ERNIE-3.0-base-zh on the new training set `train_5000.tsv` to obtain the baseline similarity model: 21 | 22 | ```shell 23 | # Download the data 24 | wget --no-check-certificate https://trustai.bj.bcebos.com/application_data/sparse_data.tar && tar xf sparse_data.tar && rm sparse_data.tar 25 | # Train the baseline model 26 | python -u train.py --dataset_dir ./data --train_file train_5000.tsv --dev_file dev.tsv --test_files test.tsv DuQM --num_classes 2 --save_dir ./checkpoint 27 | ``` 28 | The trained baseline model is saved in the `checkpoint` directory. 29 | 30 | 31 | **Step 2**: Use the baseline model to select target data from the validation set, i.e. the **target set**. 32 | 33 | Target-set selection: for every validation example, TrustAI's example-level evidence-analysis method `FeatureSimilarityModel` scores each training example as positive evidence for that validation example, and the evidence scores are averaged over all training data. Examples with a low average score lack supporting evidence and are added to the target set (a minimal usage sketch of this scoring step is shown after Figure 2 below). 34 | 35 | ```shell 36 | # Select the target data 37 | python -u find_sparse_data.py --dataset_dir ./data --train_file train_5000.tsv --dev_file dev.tsv --num_classes 2 --init_from_ckpt ./checkpoint/model_state.pdparams --sparse_num 50 --sparse_path ./data/sparse_data.tsv 38 | # sparse_num is the number of target examples to select 39 | # sparse_path is the path where the target set is stored 40 | ``` 41 | 42 | As shown in Figure 2, the model performs markedly worse on the target set. 43 |

44 | [figure omitted] 45 | Figure 2: Model performance on the full test set and on the target set 46 |
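The scoring step from Step 2 can be sketched in a few lines of Python. This is a minimal sketch rather than the actual `find_sparse_data.py`: it assumes that `model`, `train_data_loader` and `dev_data_loader` are built as in `train.py`, that `FeatureSimilarityModel` is importable from `trustai.interpretation`, and that each returned result exposes similarity scores against the training examples; the constructor arguments, the meaning of `sample_num`, and the `pos_scores` field are assumptions that may differ from the installed TrustAI version.

```python
import numpy as np
from trustai.interpretation import FeatureSimilarityModel

# Build the example-level interpreter from the fine-tuned baseline model and its training data.
# `classifier_layer_name` is assumed to name the model's final classification layer.
feature_sim = FeatureSimilarityModel(model, train_data_loader, classifier_layer_name="classifier")

# Interpret every validation example; `sample_num=-1` is assumed to mean
# "return similarity scores against all training examples".
results = feature_sim(dev_data_loader, sample_num=-1)

# Average each validation example's positive-evidence scores over the training data;
# a low average means the training set gives the example little support.
avg_scores = [float(np.mean(res.pos_scores)) for res in results]  # `pos_scores` is an assumed field name

# The least-supported examples form the target set (mirrors --sparse_num 50 above).
target_indices = np.argsort(avg_scores)[:50]
```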
47 | 48 | 49 | **Step 3**: For the examples in the target set, use `FeatureSimilarityModel` again to select, from the unlabeled data `rest_train.tsv`, the examples that can support their predictions; these are annotated manually as augmentation data. 50 | 51 | Note: this is a simulated experiment, so the data in `rest_train.tsv` is in fact already labeled. 52 | 53 | ```shell 54 | # Select the effective data 55 | python -u find_valid_data.py --dataset_dir ./data --unlabeled_file rest_train.tsv --target_file sparse_data.tsv --num_classes 2 --init_from_ckpt ./checkpoint/model_state.pdparams --valid_threshold 0.7 --valid_num 1000 --valid_path ./data/valid_data.tsv 56 | # valid_threshold is the evidence-score threshold for the target set; developers can tune it for their own data (default 0.7) 57 | # valid_num is the number of effective examples to select 58 | # valid_path is the path where the effective data is stored 59 | ``` 60 | 61 | **Step 4**: After the augmentation data has been annotated, add it to the original training set, retrain the model, and evaluate it. 62 | 63 | ```shell 64 | # Add the annotated effective data to the original training data 65 | cat ./data/train_5000.tsv ./data/valid_data.tsv > ./data/merge_valid.tsv 66 | # Train the model on the augmented data 67 | python -u train.py --dataset_dir ./data --train_file merge_valid.tsv --dev_file dev.tsv --test_files test.tsv DuQM sparse_data.tsv --num_classes 2 --save_dir ./valid_checkpoint 68 | ``` 69 | The results are shown in the table below (the comparison baseline annotates the same number of randomly selected examples as augmentation data): 70 | 71 | | Training data | Size | LCQMC-dev | LCQMC-test | DuQM | Target set | 72 | | :-------: | :-------: | :-----: | :-----: |:-----: |:-----: | 73 | | Baseline | 5,000 | 86.31% | 84.49% | 69.17% | 55.20% | 74 | | Baseline + 1,000 random | 6,000 | 86.76% | 85.05% | 69.23% | 55.20% | 75 | | Baseline + 1,000 selected by this strategy | 6,000 | 87.04% | 85.58% | 70.20% | 69.60% | 76 | 77 | Note: all results are averaged over 10 runs. 78 |
-------------------------------------------------------------------------------- /tutorials/sparse_data_identification/utils.py: --------------------------------------------------------------------------------
1 | # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | 17 | import paddle 18 | import paddle.nn.functional as F 19 | from paddlenlp.utils.log import logger 20 | 21 | 22 | @paddle.no_grad() 23 | def evaluate(model, criterion, metric, data_loader, name=''): 24 | """ 25 | Given a dataset, it evaluates model and computes the metric. 26 | Args: 27 | model(obj:`paddle.nn.Layer`): A model to classify texts. 28 | criterion(obj:`paddle.nn.Layer`): It can compute the loss. 29 | metric(obj:`paddle.metric.Metric`): The evaluation metric. 30 | data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
31 | """ 32 | 33 | model.eval() 34 | metric.reset() 35 | losses = [] 36 | for batch in data_loader: 37 | input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels'] 38 | logits = model(input_ids, token_type_ids) 39 | loss = criterion(logits, labels) 40 | losses.append(loss.numpy()) 41 | correct = metric.compute(logits, labels) 42 | metric.update(correct) 43 | 44 | acc = metric.accumulate() 45 | logger.info("%s: eval loss: %.5f, acc: %.5f" % (name, np.mean(losses), acc)) 46 | model.train() 47 | metric.reset() 48 | 49 | return acc 50 | 51 | 52 | def preprocess_function(example, tokenizer, max_seq_length, is_test=False): 53 | """ 54 | Builds model inputs from a sequence for sequence classification tasks 55 | by concatenating and adding special tokens. 56 | 57 | Args: 58 | example(obj:`list[str]`): input data, containing text and label if it has one. 59 | tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 60 | which contains most of the methods. Users should refer to the superclass for more information regarding methods. 61 | max_seq_length(obj:`int`): The maximum total input sequence length after tokenization. 62 | Sequences longer than this will be truncated, sequences shorter will be padded. 63 | is_test(obj:`bool`, optional): Whether the example comes from the test set; if False, the label is added to the result. 64 | Returns: 65 | result(obj:`dict`): The preprocessed data including input_ids, token_type_ids, labels. 66 | """ 67 | if 'text_b' not in example: 68 | result = tokenizer(text=example["text_a"], max_seq_len=max_seq_length) 69 | else: 70 | result = tokenizer(text=example["text_a"], text_pair=example['text_b'], max_seq_len=max_seq_length) 71 | 72 | if not is_test: 73 | result["labels"] = np.array([example['label']], dtype='int64') 74 | return result 75 | --------------------------------------------------------------------------------
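To make the role of the `preprocess_function` and `evaluate` helpers in `tutorials/sparse_data_identification/utils.py` above concrete, here is a minimal usage sketch. It is not taken from the tutorial's `train.py`: it assumes the snippet is run from `tutorials/sparse_data_identification/` (so `from utils import ...` resolves), it uses a standard PaddleNLP setup (`AutoTokenizer`, `AutoModelForSequenceClassification`, `DataCollatorWithPadding`), and the two toy text pairs and `max_seq_length=128` are illustrative values only.

```python
from functools import partial

import paddle
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

from utils import evaluate, preprocess_function  # the helpers defined above

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
model = AutoModelForSequenceClassification.from_pretrained("ernie-3.0-base-zh", num_classes=2)

# Two toy text-pair examples in the same dict format the tutorial's TSV loader produces.
dev_ds = MapDataset([
    {"text_a": "今天天气怎么样", "text_b": "今天天气如何", "label": 1},
    {"text_a": "怎么开发票", "text_b": "如何绑定银行卡", "label": 0},
])
# Tokenize each example; labels are attached because is_test defaults to False.
dev_ds = dev_ds.map(partial(preprocess_function, tokenizer=tokenizer, max_seq_length=128))

# DataCollatorWithPadding pads input_ids/token_type_ids and keeps the labels,
# producing the dict batches that evaluate() indexes by key.
data_loader = paddle.io.DataLoader(dev_ds, batch_size=2, collate_fn=DataCollatorWithPadding(tokenizer))

criterion = paddle.nn.CrossEntropyLoss()
metric = paddle.metric.Accuracy()
evaluate(model, criterion, metric, data_loader, name="dev")
```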