├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.yml
│   │   ├── config.yml
│   │   ├── feature_request.yml
│   │   └── general_issue.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       ├── release.yml
│       └── unittest.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── DOCUMENT.md
├── LICENSE
├── Makefile
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── Transparency_FAQ.md
├── examples
│   ├── CoT.ipynb
│   ├── Code.ipynb
│   ├── LLMLingua2.ipynb
│   ├── OnlineMeeting.ipynb
│   ├── RAG.ipynb
│   ├── RAGLlamaIndex.ipynb
│   └── Retrieval.ipynb
├── experiments
│   └── llmlingua2
│       ├── README.md
│       ├── data_collection
│       │   ├── GPT4_compressor.py
│       │   ├── README.md
│       │   ├── collect_data.sh
│       │   ├── compress.py
│       │   ├── compression_instructions.json
│       │   ├── filter.py
│       │   ├── format_data.py
│       │   ├── label_word.py
│       │   └── utils.py
│       ├── evaluation
│       │   ├── compress.py
│       │   ├── eval_bbh.py
│       │   ├── eval_gsm8k.py
│       │   ├── eval_longbench.py
│       │   ├── eval_meetingbank_qa.py
│       │   ├── eval_meetingbank_summary.py
│       │   ├── eval_zero_scrolls.py
│       │   ├── metrics.py
│       │   ├── scripts
│       │   │   ├── compress.sh
│       │   │   └── evaluate.sh
│       │   └── utils.py
│       └── model_training
│           ├── train.sh
│           ├── train_roberta.py
│           └── utils.py
├── images
│   ├── LLMLingua-2.png
│   ├── LLMLingua.png
│   ├── LLMLingua_demo.png
│   ├── LLMLingua_framework.png
│   ├── LLMLingua_logo.png
│   ├── LLMLingua_motivation.png
│   ├── LongLLMLingua.png
│   ├── LongLLMLingua_Motivation.png
│   └── motivation.png
├── llmlingua
│   ├── __init__.py
│   ├── prompt_compressor.py
│   ├── utils.py
│   └── version.py
├── pyproject.toml
├── setup.cfg
├── setup.py
└── tests
    ├── test_llmlingua.py
    ├── test_llmlingua2.py
    └── test_longllmlingua.py
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
1 | name: "\U0001F41B Bug Report"
2 | description: Submit a bug report to help us improve LLMLingua
3 | title: "[Bug]: "
4 | labels: ["bug"]
5 |
6 | body:
7 |   - type: textarea
8 |     id: description
9 |     attributes:
10 |       label: Describe the bug
11 |       description: A clear and concise description of what the bug is.
12 |       placeholder: What went wrong?
13 |   - type: textarea
14 |     id: reproduce
15 |     attributes:
16 |       label: Steps to reproduce
17 |       description: |
18 |         Steps to reproduce the behavior:
19 |
20 |         1. Step 1
21 |         2. Step 2
22 |         3. ...
23 |         4. See error
24 |       placeholder: How can we replicate the issue?
25 |   - type: textarea
26 |     id: expected_behavior
27 |     attributes:
28 |       label: Expected Behavior
29 |       description: A clear and concise description of what you expected to happen.
30 |       placeholder: What should have happened?
31 |   - type: textarea
32 |     id: logs
33 |     attributes:
34 |       label: Logs
35 |       description: If applicable, add logs or screenshots to help explain your problem.
36 |       placeholder: Add logs here
37 |   - type: textarea
38 |     id: additional_information
39 |     attributes:
40 |       label: Additional Information
41 |       description: |
42 |         - LLMLingua Version:
43 |         - Operating System:
44 |         - Python Version:
45 |         - Related Issues:
46 |         - Any other relevant information.
47 |       placeholder: Any additional details
48 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yml:
--------------------------------------------------------------------------------
1 | name: "\U0001F680 Feature request"
2 | description: Submit a proposal/request for a new LLMLingua feature
3 | labels: ["feature request"]
4 | title: "[Feature Request]: "
5 |
6 | body:
7 |   - type: textarea
8 |     id: problem_description
9 |     attributes:
10 |       label: Is your feature request related to a problem? Please describe.
11 |       description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |       placeholder: What problem are you trying to solve?
13 |
14 |   - type: textarea
15 |     id: solution_description
16 |     attributes:
17 |       label: Describe the solution you'd like
18 |       description: A clear and concise description of what you want to happen.
19 |       placeholder: How do you envision the solution?
20 |
21 |   - type: textarea
22 |     id: additional_context
23 |     attributes:
24 |       label: Additional context
25 |       description: Add any other context or screenshots about the feature request here.
26 |       placeholder: Any additional information
27 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/general_issue.yml:
--------------------------------------------------------------------------------
1 | name: "\U0001F31F General Question"
2 | description: File a general question
3 | title: "[Question]: "
4 | labels: ["question"]
5 |
6 | body:
7 |   - type: textarea
8 |     id: description
9 |     attributes:
10 |       label: Describe the issue
11 |       description: A clear and concise description of what the question is.
12 |       placeholder: The details of the question.
13 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # What does this PR do?
2 |
3 |
12 |
13 |
14 |
15 | Fixes # (issue)
16 |
17 |
18 | ## Before submitting
19 | - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
20 | - [ ] Was this discussed/approved via a Github issue? Please add a link
21 | to it if that's the case.
22 | - [ ] Did you make sure to update the documentation with your changes?
23 | - [ ] Did you write any new necessary tests?
24 |
25 |
26 | ## Who can review?
27 |
28 | Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
29 | members/contributors who may be interested in your PR.
30 |
31 |
44 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | # This workflow will build and upload a Python package using Twine when a release is published.
2 | # The conda-forge bot will pick up the new PyPI version and automatically create a new version.
3 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
4 |
5 | name: release
6 | run-name: Release LLMLingua by @${{ github.actor }}
7 |
8 | on:
9 |   release:
10 |     types: [published]
11 | permissions: {}
12 |
13 | jobs:
14 |   deploy:
15 |     strategy:
16 |       matrix:
17 |         os: ['ubuntu-latest']
18 |         python-version: ["3.10"]
19 |     runs-on: ${{ matrix.os }}
20 |     environment:
21 |       name: pypi
22 |       url: https://pypi.org/project/llmlingua/
23 |     permissions:
24 |       id-token: write
25 |     steps:
26 |       - name: Checkout
27 |         uses: actions/checkout@v3
28 |       - name: Set up Python ${{ matrix.python-version }}
29 |         uses: actions/setup-python@v5
30 |         with:
31 |           python-version: ${{ matrix.python-version }}
32 |
33 |       - name: Install from source
34 |         # This is required for the pre-commit tests
35 |         shell: pwsh
36 |         run: pip install .
37 |
38 |       - name: Build
39 |         shell: pwsh
40 |         run: |
41 |           pip install twine wheel
42 |           python setup.py sdist bdist_wheel
43 |       - name: Publish package distributions to PyPI
44 |         uses: pypa/gh-action-pypi-publish@release/v1
45 |         with:
46 |           print-hash: true
47 |
--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
1 | name: Unit Test
2 |
3 | # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
4 | on:  # Trigger the workflow on pull request or merge
5 |   pull_request:
6 |   merge_group:
7 |     types: [checks_requested]
8 |
9 | defaults:
10 |   run:
11 |     shell: bash
12 | permissions: {}
13 |
14 | jobs:
15 |   UnitTest:
16 |     runs-on: ${{ matrix.os }}
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         os: [ubuntu-latest, macos-latest, windows-2019]
21 |         python-version: ["3.9", "3.10", "3.11"]
22 |         exclude:
23 |           - os: macos-latest
24 |             python-version: '3.9'
25 |     steps:
26 |       - uses: actions/checkout@v4
27 |       - name: Set up Python ${{ matrix.python-version }}
28 |         uses: actions/setup-python@v5
29 |         with:
30 |           python-version: ${{ matrix.python-version }}
31 |
32 |       - name: Install packages and dependencies for all tests
33 |         run: |
34 |           python -m pip install --upgrade pip wheel
35 |           pip install pytest pytest-xdist nltk
36 |           python -c "import nltk; nltk.download('punkt_tab')"
37 |
38 |       - name: Install packages
39 |         run: |
40 |           pip install -e .
41 |
42 |       - name: Run core tests
43 |         shell: bash
44 |         env:
45 |           HF_TOKEN: ${{ secrets.HF_TOKEN }}
46 |         run: |
47 |           make test
48 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Ll]og/
33 | [Ll]ogs/
34 |
35 | # Visual Studio 2015/2017 cache/options directory
36 | .vs/
37 | # Uncomment if you have tasks that create the project's static files in wwwroot
38 | #wwwroot/
39 |
40 | # Visual Studio 2017 auto generated files
41 | Generated\ Files/
42 |
43 | # MSTest test Results
44 | [Tt]est[Rr]esult*/
45 | [Bb]uild[Ll]og.*
46 |
47 | # NUnit
48 | *.VisualState.xml
49 | TestResult.xml
50 | nunit-*.xml
51 |
52 | # Build Results of an ATL Project
53 | [Dd]ebugPS/
54 | [Rr]eleasePS/
55 | dlldata.c
56 |
57 | # Benchmark Results
58 | BenchmarkDotNet.Artifacts/
59 |
60 | # .NET Core
61 | project.lock.json
62 | project.fragment.lock.json
63 | artifacts/
64 |
65 | # ASP.NET Scaffolding
66 | ScaffoldingReadMe.txt
67 |
68 | # StyleCop
69 | StyleCopReport.xml
70 |
71 | # Files built by Visual Studio
72 | *_i.c
73 | *_p.c
74 | *_h.h
75 | *.ilk
76 | *.meta
77 | *.obj
78 | *.iobj
79 | *.pch
80 | *.pdb
81 | *.ipdb
82 | *.pgc
83 | *.pgd
84 | *.rsp
85 | *.sbr
86 | *.tlb
87 | *.tli
88 | *.tlh
89 | *.tmp
90 | *.tmp_proj
91 | *_wpftmp.csproj
92 | *.log
93 | *.tlog
94 | *.vspscc
95 | *.vssscc
96 | .builds
97 | *.pidb
98 | *.svclog
99 | *.scc
100 |
101 | # Chutzpah Test files
102 | _Chutzpah*
103 |
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 |
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 |
121 | # Visual Studio Trace Files
122 | *.e2e
123 |
124 | # TFS 2012 Local Workspace
125 | $tf/
126 |
127 | # Guidance Automation Toolkit
128 | *.gpState
129 |
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 |
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 |
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 |
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 |
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 |
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 |
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 |
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 |
163 | # Web workbench (sass)
164 | .sass-cache/
165 |
166 | # Installshield output folder
167 | [Ee]xpress/
168 |
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 |
179 | # Click-Once directory
180 | publish/
181 |
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 |
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 |
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 |
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 |
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 |
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 |
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 |
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 |
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 |
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 |
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 |
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 |
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 |
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 |
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 |
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 |
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 |
288 | # Visual Studio 6 build log
289 | *.plg
290 |
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 |
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 |
297 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298 | *.vbp
299 |
300 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301 | *.dsw
302 | *.dsp
303 |
304 | # Visual Studio 6 technical files
305 | *.ncb
306 | *.aps
307 |
308 | # Visual Studio LightSwitch build output
309 | **/*.HTMLClient/GeneratedArtifacts
310 | **/*.DesktopClient/GeneratedArtifacts
311 | **/*.DesktopClient/ModelManifest.xml
312 | **/*.Server/GeneratedArtifacts
313 | **/*.Server/ModelManifest.xml
314 | _Pvt_Extensions
315 |
316 | # Paket dependency manager
317 | .paket/paket.exe
318 | paket-files/
319 |
320 | # FAKE - F# Make
321 | .fake/
322 |
323 | # CodeRush personal settings
324 | .cr/personal
325 |
326 | # Python Tools for Visual Studio (PTVS)
327 | __pycache__/
328 | *.pyc
329 |
330 | # Cake - Uncomment if you are using it
331 | # tools/**
332 | # !tools/packages.config
333 |
334 | # Tabs Studio
335 | *.tss
336 |
337 | # Telerik's JustMock configuration file
338 | *.jmconfig
339 |
340 | # BizTalk build output
341 | *.btp.cs
342 | *.btm.cs
343 | *.odx.cs
344 | *.xsd.cs
345 |
346 | # OpenCover UI analysis results
347 | OpenCover/
348 |
349 | # Azure Stream Analytics local run output
350 | ASALocalRun/
351 |
352 | # MSBuild Binary and Structured Log
353 | *.binlog
354 |
355 | # NVidia Nsight GPU debugger configuration file
356 | *.nvuser
357 |
358 | # MFractors (Xamarin productivity tool) working folder
359 | .mfractor/
360 |
361 | # Local History for Visual Studio
362 | .localhistory/
363 |
364 | # Visual Studio History (VSHistory) files
365 | .vshistory/
366 |
367 | # BeatPulse healthcheck temp database
368 | healthchecksdb
369 |
370 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
371 | MigrationBackup/
372 |
373 | # Ionide (cross platform F# VS Code tools) working folder
374 | .ionide/
375 |
376 | # Fody - auto-generated XML schema
377 | FodyWeavers.xsd
378 |
379 | # VS Code files for those working on multiple tools
380 | .vscode/*
381 | !.vscode/settings.json
382 | !.vscode/tasks.json
383 | !.vscode/launch.json
384 | !.vscode/extensions.json
385 | *.code-workspace
386 |
387 | # Local History for Visual Studio Code
388 | .history/
389 |
390 | # Windows Installer files from build outputs
391 | *.cab
392 | *.msi
393 | *.msix
394 | *.msm
395 | *.msp
396 |
397 | # JetBrains Rider
398 | *.sln.iml
399 | *.egg-info
400 |
401 | # build
402 | build/*
403 | dist/*
404 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 |   python: python3
3 | exclude: 'dotnet'
4 | ci:
5 |   autofix_prs: true
6 |   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
7 |   autoupdate_schedule: 'quarterly'
8 |
9 | repos:
10 |   - repo: https://github.com/pre-commit/pre-commit-hooks
11 |     rev: v4.4.0
12 |     hooks:
13 |       - id: check-added-large-files
14 |       - id: check-ast
15 |       - id: check-yaml
16 |       - id: check-toml
17 |       - id: check-json
18 |       - id: check-byte-order-marker
19 |         exclude: .gitignore
20 |       - id: check-merge-conflict
21 |       - id: detect-private-key
22 |       - id: trailing-whitespace
23 |       - id: end-of-file-fixer
24 |       - id: no-commit-to-branch
25 |   - repo: https://github.com/pycqa/isort
26 |     rev: 5.13.2
27 |     hooks:
28 |       - id: isort
29 |   - repo: https://github.com/psf/black
30 |     rev: 23.3.0
31 |     hooks:
32 |       - id: black
33 |   - repo: https://github.com/nbQA-dev/nbQA
34 |     rev: 1.7.1
35 |     hooks:
36 |       - id: nbqa-black
37 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: install style test
2 |
3 | PYTHON := python
4 | CHECK_DIRS := llmlingua tests
5 |
6 | install:
7 | 	@${PYTHON} setup.py bdist_wheel
8 | 	@${PYTHON} -m pip install dist/llmlingua*
9 |
10 | style:
11 | 	black $(CHECK_DIRS)
12 | 	isort -rc $(CHECK_DIRS)
13 | 	flake8 $(CHECK_DIRS)
14 |
15 | test:
16 | 	@${PYTHON} -m pytest -n auto --dist=loadfile -s -v ./tests/
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | 
4 |
5 |
6 | LLMLingua Series | Effectively Deliver Information to LLMs via Prompt Compression
7 |
8 |
9 |
10 |
11 | Project Page | LLMLingua | LongLLMLingua | LLMLingua-2 | LLMLingua Demo | LLMLingua-2 Demo
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | https://github.com/microsoft/LLMLingua/assets/30883354/eb0ea70d-6d4c-4aa7-8977-61f94bb87438
20 |
21 | ## News
22 | - 🍩 [24/12/13] We are excited to announce the release of our KV cache-centric analysis work, [SCBench](https://aka.ms/SCBench), which evaluates long-context methods from a KV cache perspective.
23 | - 👘 [24/09/16] We are pleased to announce the release of our KV cache offloading work, [RetrievalAttention](https://aka.ms/RetrievalAttention), which accelerates long-context LLM inference via vector retrieval.
24 | - 🌀 [24/07/03] We're excited to announce the release of [MInference](https://aka.ms/MInference) to speed up long-context LLM inference; it reduces inference latency by up to **10x** for pre-filling on an A100 while maintaining accuracy on **1M-token prompts**! For more information, check out our [paper](https://arxiv.org/abs/2407.02490) and visit the [project page](https://aka.ms/MInference).
25 | - 🧩 LLMLingua has been integrated into [Prompt flow](https://microsoft.github.io/promptflow/integrations/tools/llmlingua-prompt-compression-tool.html), a streamlined tool framework for LLM-based AI applications.
26 | - 🦚 We're excited to announce the release of **LLMLingua-2**, boasting a 3x-6x speed improvement over LLMLingua! For more information, check out our [paper](https://aclanthology.org/2024.findings-acl.57/), visit the [project page](https://llmlingua.com/llmlingua2.html), and explore our [demo](https://huggingface.co/spaces/microsoft/LLMLingua-2).
27 | - 👾 LLMLingua has been integrated into [LangChain](https://github.com/langchain-ai/langchain/blob/master/docs/docs/integrations/retrievers/llmlingua.ipynb) and [LlamaIndex](https://github.com/run-llama/llama_index/blob/main/docs/examples/node_postprocessor/LongLLMLingua.ipynb), two widely-used RAG frameworks.
28 | - 🤳 Talk slides are available in [AI Time Jan, 24](https://drive.google.com/file/d/1fzK3wOvy2boF7XzaYuq2bQ3jFeP1WMk3/view?usp=sharing).
29 | - 🖥 EMNLP'23 slides are available in [Session 5](https://drive.google.com/file/d/1GxQLAEN8bBB2yiEdQdW4UKoJzZc0es9t/view) and [BoF-6](https://drive.google.com/file/d/1LJBUfJrKxbpdkwo13SgPOqugk-UjLVIF/view).
30 | - 📚 Check out our new [blog post](https://medium.com/@iofu728/longllmlingua-bye-bye-to-middle-loss-and-save-on-your-rag-costs-via-prompt-compression-54b559b9ddf7) discussing RAG benefits and cost savings through prompt compression. See the script example [here](https://github.com/microsoft/LLMLingua/blob/main/examples/Retrieval.ipynb).
31 | - 🎈 Visit our [project page](https://llmlingua.com/) for real-world case studies in RAG, Online Meetings, CoT, and Code.
32 | - 👨🦯 Explore our ['./examples'](./examples) directory for practical applications, including [LLMLingua-2](./examples/LLMLingua2.ipynb), [RAG](./examples/RAG.ipynb), [Online Meeting](./examples/OnlineMeeting.ipynb), [CoT](./examples/CoT.ipynb), [Code](./examples/Code.ipynb), and [RAG using LlamaIndex](./examples/RAGLlamaIndex.ipynb).
33 |
34 | ## TL;DR
35 |
36 | LLMLingua utilizes a compact, well-trained language model (e.g., GPT2-small, LLaMA-7B) to identify and remove non-essential tokens in prompts. This approach enables efficient inference with large language models (LLMs), achieving up to 20x compression with minimal performance loss.
37 |
38 | - [LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models](https://aclanthology.org/2023.emnlp-main.825/) (EMNLP 2023)
39 | _Huiqiang Jiang, Qianhui Wu, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
40 |
41 | LongLLMLingua mitigates the 'lost in the middle' issue in LLMs, enhancing long-context information processing. It reduces costs and boosts efficiency with prompt compression, improving RAG performance by up to 21.4% using only 1/4 of the tokens.
42 |
43 | - [LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios via Prompt Compression](https://aclanthology.org/2024.acl-long.91/) (ACL 2024 and ICLR ME-FoMo 2024)
44 | _Huiqiang Jiang, Qianhui Wu, Xufang Luo, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
45 |
46 | LLMLingua-2, a small-size yet powerful prompt compression method trained via data distillation from GPT-4 for token classification with a BERT-level encoder, excels in task-agnostic compression. It surpasses LLMLingua in handling out-of-domain data, offering 3x-6x faster performance.
47 |
48 | - [LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression](https://aclanthology.org/2024.findings-acl.57/) (ACL 2024 Findings)
49 | _Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Menglin Xia, Xufang Luo, Jue Zhang, Qingwei Lin, Victor Ruhle, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, Dongmei Zhang_
50 |
51 | ## 🎥 Overview
52 |
53 | 
54 |
55 | - Ever encountered the token limit when asking ChatGPT to summarize lengthy texts?
56 | - Frustrated with ChatGPT forgetting previous instructions after extensive fine-tuning?
57 | - Experienced high costs using GPT3.5/4 API for experiments despite excellent results?
58 |
59 | While Large Language Models like ChatGPT and GPT-4 excel in generalization and reasoning, they often face challenges like prompt length limits and prompt-based pricing schemes.
60 |
61 | 
62 |
63 | Now you can use **LLMLingua**, **LongLLMLingua**, and **LLMLingua-2**!
64 |
65 | These tools offer an efficient solution to compress prompts by up to **20x**, enhancing the utility of LLMs.
66 |
67 | - 💰 **Cost Savings**: Reduces both prompt and generation lengths with minimal overhead.
68 | - 📝 **Extended Context Support**: Enhances support for longer contexts, mitigates the "lost in the middle" issue, and boosts overall performance.
69 | - ⚖️ **Robustness**: No additional training needed for LLMs.
70 | - 🕵️ **Knowledge Retention**: Maintains original prompt information like ICL and reasoning.
71 | - 📜 **KV-Cache Compression**: Accelerates inference process.
72 | - 🪃 **Comprehensive Recovery**: GPT-4 can recover all key information from compressed prompts.
73 |
74 | 
75 |
76 | 
77 |
78 | 
79 |
80 | PS: This demo is based on the [alt-gpt](https://github.com/feedox/alt-gpt) project. Special thanks to @Livshitz for their valuable contribution.
81 |
82 | If you find this repo helpful, please cite the following papers:
83 |
84 | ```bibtex
85 | @inproceedings{jiang-etal-2023-llmlingua,
86 | title = "{LLML}ingua: Compressing Prompts for Accelerated Inference of Large Language Models",
87 | author = "Huiqiang Jiang and Qianhui Wu and Chin-Yew Lin and Yuqing Yang and Lili Qiu",
88 | editor = "Bouamor, Houda and
89 | Pino, Juan and
90 | Bali, Kalika",
91 | booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
92 | month = dec,
93 | year = "2023",
94 | address = "Singapore",
95 | publisher = "Association for Computational Linguistics",
96 | url = "https://aclanthology.org/2023.emnlp-main.825",
97 | doi = "10.18653/v1/2023.emnlp-main.825",
98 | pages = "13358--13376",
99 | }
100 | ```
101 |
102 | ```bibtex
103 | @inproceedings{jiang-etal-2024-longllmlingua,
104 | title = "{L}ong{LLML}ingua: Accelerating and Enhancing {LLM}s in Long Context Scenarios via Prompt Compression",
105 | author = "Huiqiang Jiang and Qianhui Wu and Xufang Luo and Dongsheng Li and Chin-Yew Lin and Yuqing Yang and Lili Qiu",
106 | editor = "Ku, Lun-Wei and
107 | Martins, Andre and
108 | Srikumar, Vivek",
109 | booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
110 | month = aug,
111 | year = "2024",
112 | address = "Bangkok, Thailand",
113 | publisher = "Association for Computational Linguistics",
114 | url = "https://aclanthology.org/2024.acl-long.91",
115 | pages = "1658--1677",
116 | }
117 | ```
118 |
119 | ```bibtex
120 | @inproceedings{pan-etal-2024-llmlingua,
121 | title = "{LLML}ingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression",
122 | author = "Zhuoshi Pan and Qianhui Wu and Huiqiang Jiang and Menglin Xia and Xufang Luo and Jue Zhang and Qingwei Lin and Victor Ruhle and Yuqing Yang and Chin-Yew Lin and H. Vicky Zhao and Lili Qiu and Dongmei Zhang",
123 | editor = "Ku, Lun-Wei and
124 | Martins, Andre and
125 | Srikumar, Vivek",
126 | booktitle = "Findings of the Association for Computational Linguistics ACL 2024",
127 | month = aug,
128 | year = "2024",
129 | address = "Bangkok, Thailand and virtual meeting",
130 | publisher = "Association for Computational Linguistics",
131 | url = "https://aclanthology.org/2024.findings-acl.57",
132 | pages = "963--981",
133 | }
134 | ```
135 |
136 | ## 🎯 Quick Start
137 |
138 | #### 1. **Installing LLMLingua:**
139 |
140 | To get started with LLMLingua, simply install it using pip:
141 |
142 | ```bash
143 | pip install llmlingua
144 | ```
145 |
146 | #### 2. **Using LLMLingua Series Methods for Prompt Compression:**
147 |
148 | With **LLMLingua**, you can easily compress your prompts. Here’s how you can do it:
149 |
150 | ```python
151 | from llmlingua import PromptCompressor
152 |
153 | llm_lingua = PromptCompressor()
154 | compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question="", target_token=200)
155 |
156 | # > {'compressed_prompt': 'Question: Sam bought a dozen boxes, each with 30 highlighter pens inside, for $10 each box. He reanged five of boxes into packages of sixlters each and sold them $3 per. He sold the rest theters separately at the of three pens $2. How much did make in total, dollars?\nLets think step step\nSam bought 1 boxes x00 oflters.\nHe bought 12 * 300ters in total\nSam then took 5 boxes 6ters0ters.\nHe sold these boxes for 5 *5\nAfterelling these boxes there were 3030 highlighters remaining.\nThese form 330 / 3 = 110 groups of three pens.\nHe sold each of these groups for $2 each, so made 110 * 2 = $220 from them.\nIn total, then, he earned $220 + $15 = $235.\nSince his original cost was $120, he earned $235 - $120 = $115 in profit.\nThe answer is 115',
157 | # 'origin_tokens': 2365,
158 | # 'compressed_tokens': 211,
159 | # 'ratio': '11.2x',
160 | # 'saving': ', Saving $0.1 in GPT-4.'}
161 |
162 | ## Or use the phi-2 model,
163 | llm_lingua = PromptCompressor("microsoft/phi-2")
164 |
165 | ## Or use a quantized model, like TheBloke/Llama-2-7b-Chat-GPTQ, which only needs <8GB of GPU memory.
166 | ## Before that, you need to pip install optimum auto-gptq
167 | llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
168 | ```
169 |
170 | To try **LongLLMLingua** in your scenarios, you can use
171 |
172 | ```python
173 | from llmlingua import PromptCompressor
174 |
175 | llm_lingua = PromptCompressor()
176 | compressed_prompt = llm_lingua.compress_prompt(
177 |     prompt_list,
178 |     question=question,
179 |     rate=0.55,
180 |     # Set the special parameter for LongLLMLingua
181 |     condition_in_question="after_condition",
182 |     reorder_context="sort",
183 |     dynamic_context_compression_ratio=0.3,  # or 0.4
184 |     condition_compare=True,
185 |     context_budget="+100",
186 |     rank_method="longllmlingua",
187 | )
188 | ```
189 |
190 | To try **LLMLingua-2** in your scenarios, you can use
191 |
192 | ```python
193 | from llmlingua import PromptCompressor
194 |
195 | llm_lingua = PromptCompressor(
196 |     model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
197 |     use_llmlingua2=True,  # Whether to use llmlingua-2
198 | )
199 | compressed_prompt = llm_lingua.compress_prompt(prompt, rate=0.33, force_tokens=['\n', '?'])
200 |
201 | ## Or use LLMLingua-2-small model
202 | llm_lingua = PromptCompressor(
203 |     model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
204 |     use_llmlingua2=True,  # Whether to use llmlingua-2
205 | )
206 | ```
207 |
208 | #### 3. **Advanced usage - Structured Prompt Compression:**
209 |
210 | Split the text into sections and decide whether to compress each one and at what rate. Use `<llmlingua></llmlingua>` tags for context segmentation, with optional `rate` and `compress` parameters.
211 |
212 | ```python
213 | structured_prompt = """Speaker 4: Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.
214 | Speaker 0: Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.
215 | Speaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments."""
216 | compressed_prompt = llm_lingua.structured_compress_prompt(structured_prompt, instruction="", question="", rate=0.5)
217 | print(compressed_prompt['compressed_prompt'])
218 |
219 | # > Speaker 4:. And can we do the functions for content? Items I believe are11,,116 28,.
220 | # Speaker 0: a from Council on Price to increase the fund group the Manager0 provide a the the1 is Councilman Super Now. the special group the provide the summerman a the Jazzels a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.
221 | # Speaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.
222 | ```
223 |
224 | #### 4. **Learning More:**
225 |
226 | To understand how to apply LLMLingua and LongLLMLingua in real-world scenarios like RAG, Online Meetings, CoT, and Code, please refer to our [**examples**](./examples). For detailed guidance, the [**documentation**](./DOCUMENT.md) provides extensive recommendations on effectively utilizing LLMLingua.
227 |
228 | #### 5. **Data collection and model training of LLMLingua-2:**
229 |
230 | To train the compressor on your custom data, please refer to our [**data_collection**](./experiments/llmlingua2/data_collection) and [**model_training**](./experiments/llmlingua2/model_training).
231 |
232 | ## Frequently Asked Questions
233 |
234 | For more insights and answers, visit our [FAQ section](./Transparency_FAQ.md).
235 |
236 | ## Contributing
237 |
238 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
239 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
240 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
241 |
242 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
243 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
244 | provided by the bot. You will only need to do this once across all repos using our CLA.
245 |
246 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
247 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
248 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
249 |
250 | ## Trademarks
251 |
252 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
253 | trademarks or logos is subject to and must follow
254 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
255 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
256 | Any use of third-party trademarks or logos is subject to those third parties' policies.
257 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # Support
2 |
3 | ## How to file issues and get help
4 |
5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
7 | feature request as a new Issue.
8 |
9 | For help and questions about using this project, please refer to the [document](./DOCUMENT.md).
10 |
11 | ## Microsoft Support Policy
12 |
13 | Support for this project is limited to the resources listed above.
14 |
--------------------------------------------------------------------------------
/Transparency_FAQ.md:
--------------------------------------------------------------------------------
1 | # LLMLingua's Responsible AI FAQ
2 |
3 | ## What is LLMLingua?
4 |
5 | - LLMLingua is a simple and efficient method to compress prompts by up to 20x while keeping the original prompt knowledge, such as ICL and reasoning.
6 | - LLMLingua takes user-defined prompts and compression goals as input, and outputs a compressed prompt, which may often result in a form of expression that is difficult for humans to understand.
7 |
8 | ## What can LLMLingua do?
9 |
10 | - LLMLingua can simultaneously reduce the length of prompts and the output of LLMs (20%-30%), thus saving API calls;
11 | - Compressed prompts from LLMLingua can be directly used with black-box LLMs, such as ChatGPT, GPT-4, and Claude;
12 | - By compressing prompts, LLMLingua allows for more information to be included within the original token length, thereby improving model performance;
13 | - LLMLingua relies on a small language model, like GPT-2 or LLaMA-7b, for perplexity calculations, which is a relatively low-cost approach;
14 | - Compressed prompts generated by LLMLingua can be understood by LLMs, preserving their original capabilities in downstream tasks and keeping the original prompt knowledge like ICL, reasoning, etc. LLMs can also recover the essential information from the compressed prompts;
15 | - LLMLingua is a robust method that requires no additional training of the LLMs;
16 | - Additionally, LLMLingua can be used to compress KV-Cache, which speeds up inference.
17 |
18 | ## What is/are LLMLingua’s intended use(s)?
19 |
20 | - Users who call black-box LLM APIs similar to GPT-4, those who utilize ChatGPT to handle longer content, as well as model deployers and cloud service providers, can benefit from these techniques.
21 |
22 | ## How was LLMLingua evaluated? What metrics are used to measure performance?
23 |
24 | - In our experiments, we conducted a detailed evaluation of the performance of compressed prompts across various tasks, particularly in those involving LLM-specific capabilities, such as In-Context Learning, reasoning tasks, summarization, and conversation tasks. We assessed our approach using compression ratio and performance loss as evaluation metrics.
25 |
26 | ## What are the limitations of LLMLingua? How can users minimize the impact of LLMLingua’s limitations when using the system?
27 |
28 | - The potential harmful, false or biased responses using the compressed prompts would likely be unchanged. Thus using LLMLingua has no inherent benefits or risks when it comes to those types of responsible AI issues.
29 | - LLMLingua may struggle to perform well at particularly high compression ratios, especially when the original prompts are already quite short.
30 |
31 | ## What operational factors and settings allow for effective and responsible use of LLMLingua?
32 |
33 | - Users can set parameters such as the boundaries between different components (instruction, context, question) in the prompt, compression goals, and the small model used for compression calculations. Afterward, they can input the compressed prompt into black-box LLMs for use.
34 |
35 | ## What is instruction, context, and question?
36 |
37 | In our approach, we divide the prompts into three distinct modules: instruction, context, and question. Each prompt necessarily contains a question, but the presence of context and instruction is not always guaranteed.
38 |
39 | - Question: This refers to the directives given by the user to the LLMs, such as inquiries, questions, or requests. Positioned after the instruction and context modules, the question module has a high sensitivity to compression.
40 | - Context: This module provides the supplementary context needed to address the question, such as documents, demonstrations, web search results, or API call results. Located between the instruction and question modules, its sensitivity to compression is relatively low.
41 | - Instruction: This module consists of directives given by the user to the LLMs, such as task descriptions. Placed before the context and question modules, the instruction module exhibits a high sensitivity to compression.
42 |
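To make the above concrete, here is a minimal usage sketch (the prompt strings and token budget are placeholders, not values from our experiments): the three modules are passed to `compress_prompt` separately so that each one receives the compression sensitivity described above.

```python
from llmlingua import PromptCompressor

llm_lingua = PromptCompressor()  # the small LM used to score token importance
result = llm_lingua.compress_prompt(
    context=["Document 1 ...", "Document 2 ..."],             # context: lowest sensitivity, compressed the most
    instruction="Answer the question using the documents.",   # instruction: high sensitivity
    question="What did the council approve?",                 # question: high sensitivity
    target_token=200,                                          # compression goal
)
print(result["compressed_prompt"])
```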
43 | ## Is there a need or benefit to finetune a small model specifically for this purpose?
44 |
45 | Refer to the [discussion](https://github.com/microsoft/LLMLingua/discussions/57).
46 |
47 | **TL;DR**: Fine-tuning is beneficial, but the improvement is not very significant.
48 |
49 | Our current understanding is that any language model can be used to estimate the importance distribution of tokens. We believe that the higher the compression rate of the LM itself (following the "LM is a compressor" view), the more accurate the estimation will be, particularly because such a model has been exposed to more tokens during pre-training.
50 |
51 | Therefore, we consider that any LM can potentially serve as a compressor for prompt compression, with different LMs sharing the same essential token distribution. In our previous experiments, we found that alignment might have some impact, but it is minimal – about 1-2 points. Perhaps a more refined alignment method could significantly enhance performance.
52 |
53 | ## How to choose the compressor model (small language model)?
54 |
55 | Refer to the [discussion](https://github.com/microsoft/LLMLingua/discussions/57) and this [issue](https://github.com/microsoft/LLMLingua/issues/83).
56 |
57 | Our current understanding is that any language model can be used to estimate the importance distribution of tokens. We believe that the higher the compression rate of the LM itself (following the "LM is a compressor" view), the more accurate the estimation will be, particularly because such a model has been exposed to more tokens during pre-training.
58 |
59 | Therefore, we consider that any LM can potentially serve as a compressor for prompt compression, with different LMs sharing the same essential token distribution. In our previous experiments, we found that alignment might have some impact, but it is minimal – about 1-2 points. Perhaps a more refined alignment method could significantly enhance performance.
60 |
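In practice, swapping the compressor only means pointing `PromptCompressor` at a different small model. A minimal sketch (the model names below are just examples taken from elsewhere in this repo; pick whichever checkpoint suits your hardware):

```python
from llmlingua import PromptCompressor

# Example: a lightweight GPT-2 compressor running on CPU.
llm_lingua = PromptCompressor(model_name="openai-community/gpt2", device_map="cpu")

# Example: a larger compressor (phi-2) on GPU, which usually gives
# more accurate token-importance estimates at a higher cost.
# llm_lingua = PromptCompressor(model_name="microsoft/phi-2", device_map="cuda")
```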
61 | ## How to use LLMLingua in web-deploy model?
62 |
63 | Refer to [issue1](https://github.com/microsoft/LLMLingua/issues/44), [issue2](https://github.com/microsoft/LLMLingua/issues/65), and [issue3](https://github.com/microsoft/LLMLingua/issues/70).
64 |
65 | We require an API that can return the logprobs of the input prompt. Currently, we have found that OpenAI and [FastChat](https://github.com/lm-sys/FastChat/pull/2612) offer this feature. We plan to support it soon.
66 |
67 | ```python
68 | logp = openai.Completion.create(
69 |     model="davinci-002",
70 |     prompt="Please return the logprobs",
71 |     logprobs=0,
72 |     max_tokens=0,
73 |     echo=True,
74 |     temperature=0,
75 | )
76 | Out[3]:
77 | JSON: {
78 |   "id": "",
79 |   "object": "text_completion",
80 |   "created": 1707295146,
81 |   "model": "davinci-002",
82 |   "choices": [
83 |     {
84 |       "text": "Please return the logprobs",
85 |       "index": 0,
86 |       "logprobs": {
87 |         "tokens": [
88 |           "Please",
89 |           " return",
90 |           " the",
91 |           " log",
92 |           "pro",
93 |           "bs"
94 |         ],
95 |         "token_logprobs": [
96 |           null,
97 |           -6.9668007,
98 |           -2.047512,
99 |           -8.885729,
100 |           -13.960022,
101 |           -5.479665
102 |         ],
103 |         "top_logprobs": null,
104 |         "text_offset": [
105 |           0,
106 |           6,
107 |           13,
108 |           17,
109 |           21,
110 |           24
111 |         ]
112 |       },
113 |       "finish_reason": "length"
114 |     }
115 |   ],
116 |   "usage": {
117 |     "prompt_tokens": 6,
118 |     "total_tokens": 6
119 |   }
120 | }
121 | ```
122 |
123 | ## How to reproduce the results of the LLMLingua series work?
124 |
125 | We released the parameters in [issue1](https://github.com/microsoft/LLMLingua/issues/76) and [issue2](https://github.com/microsoft/LLMLingua/issues/86).
126 |
127 | **LLMLingua**:
128 |
129 | ```python
130 | prompt = compressor.compress_prompt(
131 |     context=xxx,
132 |     instruction=xxx,
133 |     question=xxx,
134 |     ratio=0.75,
135 |     iterative_size=100,
136 |     context_budget="*2",
137 | )
138 | ```
139 |
140 | **LongLLMLingua**:
141 |
142 | ```python
143 | compressed_prompt = llm_lingua.compress_prompt(
144 |     demonstration.split("\n"),
145 |     instruction,
146 |     question,
147 |     0.55,
148 |     use_sentence_level_filter=False,
149 |     condition_in_question="after_condition",
150 |     reorder_context="sort",
151 |     dynamic_context_compression_ratio=0.3,  # or 0.4
152 |     condition_compare=True,
153 |     context_budget="+100",
154 |     rank_method="longllmlingua",
155 | )
156 | ```
157 |
158 | Experiments in LLMLingua and most experiments in LongLLMLingua were conducted in completion mode, whereas chat mode tends to be more sensitive to token-level compression. However, OpenAI has since disabled completion mode for GPT-3.5-turbo; you can use gpt-3.5-turbo-instruct or the Azure OpenAI Service instead.
159 |
160 | **LLMLingua-2**:
161 |
162 | ```python
163 | from llmlingua import PromptCompressor
164 |
165 | llm_lingua = PromptCompressor(
166 |     model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
167 |     use_llmlingua2=True,  # Whether to use llmlingua-2
168 | )
169 | compressed_prompt = llm_lingua.compress_prompt(prompt, rate=0.33, force_tokens=['\n', '?'])
170 |
171 | ## Or use LLMLingua-2-small model
172 | llm_lingua = PromptCompressor(
173 |     model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
174 |     use_llmlingua2=True,  # Whether to use llmlingua-2
175 | )
176 | ```
177 |
178 | You can find the details of the LLMLingua-2 experiments at [experiments/llmlingua2](./experiments/llmlingua2).
179 |
180 | ## How to use LLMLingua in LangChain and LlamaIndex?
181 |
182 | ### Integration with LangChain
183 |
184 | Thanks to the contributions of Ayo Ayibiowu (@thehapyone), (Long)LLMLingua can be seamlessly integrated into LangChain. Here's an example of how to initialize (Long)LLMLingua within LangChain:
185 |
186 | ```python
187 | from langchain.retrievers import ContextualCompressionRetriever
188 | from langchain_community.retrievers.document_compressors import LLMLinguaCompressor
189 | from langchain_openai import ChatOpenAI
190 |
191 | llm = ChatOpenAI(temperature=0)
192 |
193 | compressor = LLMLinguaCompressor(model_name="openai-community/gpt2", device_map="cpu")
194 | compression_retriever = ContextualCompressionRetriever(
195 |     base_compressor=compressor, base_retriever=retriever
196 | )
197 |
198 | compressed_docs = compression_retriever.get_relevant_documents(
199 |     "What did the president say about Ketanji Jackson Brown"
200 | )
201 | pretty_print_docs(compressed_docs)
202 | ```
203 |
204 | For a more detailed guide, please refer to [Notebook](https://github.com/langchain-ai/langchain/blob/master/docs/docs/integrations/retrievers/llmlingua.ipynb).
205 |
206 | ### Integration with LlamaIndex
207 |
208 | Thanks to the contributions of Jerry Liu (@jerryjliu), (Long)LLMLingua can be seamlessly integrated into LlamaIndex. Here's an example of how to initialize (Long)LLMLingua within LlamaIndex:
209 |
210 | ```python
211 | from llama_index.query_engine import RetrieverQueryEngine
212 | from llama_index.response_synthesizers import CompactAndRefine
213 | from llama_index.indices.postprocessor import LongLLMLinguaPostprocessor
214 |
215 | node_postprocessor = LongLLMLinguaPostprocessor(
216 |     instruction_str="Given the context, please answer the final question",
217 |     target_token=300,
218 |     rank_method="longllmlingua",
219 |     additional_compress_kwargs={
220 |         "condition_compare": True,
221 |         "condition_in_question": "after",
222 |         "context_budget": "+100",
223 |         "reorder_context": "sort",  # Enables document reordering
224 |         "dynamic_context_compression_ratio": 0.4,  # Enables dynamic compression ratio
225 |     },
226 | )
227 | ```
228 |
229 | For a more detailed guide, please refer to [RAGLlamaIndex Example](https://github.com/microsoft/LLMLingua/blob/main/examples/RAGLlamaIndex.ipynb).
230 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/README.md:
--------------------------------------------------------------------------------
1 | # LLMLingua-2 Experiments
2 |
3 | ## Getting Started
4 |
5 | To get started with LLMLingua-2 experiments, simply install it using pip:
6 |
7 | ```bash
8 | pip install llmlingua
9 | ```
10 |
11 | To collect your own data using GPT-4, install the following packages:
12 | ```bash
13 | pip install openai==0.28
14 |
15 | pip install spacy
16 | python -m spacy download en_core_web_sm
17 | ```
18 |
19 | To train your own compressor on the collected data, install:
20 | ```bash
21 | pip install scikit-learn
22 | pip install tensorboard
23 | ```
24 |
25 | ## Data collection
26 |
27 | We release our collected GPT-4 compression results at [HF](https://huggingface.co/datasets/microsoft/MeetingBank-LLMCompressed) after review. We also provide the whole data collection pipeline at [**collect_data.sh**](data_collection/collect_data.sh) to help you construct your own compression dataset.
28 |
29 | ## Model Training
30 |
31 | To train a compressor on the collected data, simply run [**train.sh**](model_training/train.sh)
32 |
33 | ## Evaluation
34 |
35 | We provide a script, [**compress.sh**](evaluation/scripts/compress.sh), to compress the original context on several benchmarks. After compression, run [**evaluate.sh**](evaluation/scripts/evaluate.sh) to evaluate on the downstream tasks using the compressed prompts.
36 |
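37 | Putting these steps together, an end-to-end run follows the outline below. Each script hard-codes its own input/output paths (and may expect to be launched from its own directory), so treat this as an ordered sketch rather than a verbatim recipe:
38 |
39 | ```bash
40 | # Ordered sketch of the full LLMLingua-2 pipeline; adjust paths/flags inside each script first.
41 | bash data_collection/collect_data.sh     # 1. label and filter the GPT-4 compressed data
42 | bash model_training/train.sh             # 2. train the compressor on the labeled data
43 | bash evaluation/scripts/compress.sh      # 3. compress benchmark prompts with the trained compressor
44 | bash evaluation/scripts/evaluate.sh      # 4. evaluate downstream tasks on the compressed prompts
45 | ```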
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/GPT4_compressor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | from time import sleep
5 |
6 | from utils import load_model_and_tokenizer
7 |
8 | SLEEP_TIME_SUCCESS = 10
9 | SLEEP_TIME_FAILED = 62
10 |
11 |
12 | class PromptCompressor:
13 | def __init__(
14 | self,
15 | model_name,
16 | user_prompt,
17 | system_prompt=None,
18 | temperature=0.3,
19 | top_p=1.0,
20 | n_max_token=32700,
21 | ):
22 | self.model_name = model_name
23 | self.temperature = temperature
24 | self.top_p = top_p
25 |
26 | self.system_prompt = system_prompt
27 | self.user_prompt = user_prompt
28 | print(self.system_prompt)
29 | print(self.user_prompt)
30 |
31 | self.model, self.tokenizer = load_model_and_tokenizer(
32 | self.model_name, chat_completion=True
33 | )
34 | self.n_max_token = n_max_token
35 |
36 | def query_template(self, text, n_max_new_token=4096):
37 | if self.user_prompt and "{text_to_compress}" in self.user_prompt:
38 | prompt = self.user_prompt.format(text_to_compress=text)
39 | else:
40 | prompt = text
41 |
42 |         messages, len_sys_prompt = [], 0
43 | if self.system_prompt:
44 | messages = [{"role": "system", "content": self.system_prompt}]
45 | len_sys_prompt = len(self.tokenizer.encode(self.system_prompt))
46 | token_ids = self.tokenizer.encode(prompt)
47 | if len(token_ids) > (self.n_max_token - n_max_new_token - len_sys_prompt):
48 | half = int((self.n_max_token - n_max_new_token - len_sys_prompt) / 2) - 1
49 | prompt = self.tokenizer.decode(token_ids[:half]) + self.tokenizer.decode(
50 | token_ids[-half:]
51 | )
52 | messages.append({"role": "user", "content": prompt})
53 | return messages
54 |
55 | def compress(self, text, n_max_new_token=4096):
56 | messages = self.query_template(text, n_max_new_token)
57 | comp = None
58 | while comp is None:
59 | try:
60 | request = {
61 | "messages": messages,
62 | "temperature": self.temperature,
63 | "top_p": self.top_p,
64 | "max_tokens": n_max_new_token,
65 | }
66 | response = self.model.create(engine=self.model_name, **request)
67 | if "choices" not in response:
68 | print(response)
69 | comp = response["choices"][0]["message"]["content"]
70 | except Exception as e:
71 | print(f"error: {e}")
72 | sleep(SLEEP_TIME_FAILED)
73 | # sleep(SLEEP_TIME_SUCCESS)
74 | return comp
75 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/README.md:
--------------------------------------------------------------------------------
1 | ### Use our collected data
2 |
3 | We release our collected GPT-4 compression results at [HF](https://huggingface.co/datasets/microsoft/MeetingBank-LLMCompressed). To load the data, simply use
4 |
5 | ```python
6 | from datasets import load_dataset
7 | data = load_dataset("microsoft/MeetingBank-LLMCompressed", split="train")
8 | print(len(data))
9 | for idx, sample in enumerate(data):
10 | # concatenation of all chunks
11 | prompt = sample["prompt"]
12 | compressed_prompt = sample["compressed_prompt"]
13 | ```
14 | **prompt** is the original meeting transcript. **compressed_prompt** is the compression result after merging all compressed chunks of a transcript.
15 |
16 | To load compressed chunks along with original chunks, simply use
17 | ```python
18 | from datasets import load_dataset
19 | data = load_dataset("microsoft/MeetingBank-LLMCompressed", split="train")
20 | print(len(data))
21 | for idx, sample in enumerate(data):
22 | # chunk list
23 | prompt_list = sample["prompt_list"]
24 | compressed_prompt_list = sample["compressed_prompt_list"]
25 | ```
26 |
27 | ### Construct your custom compression dataset
28 |
29 | First, format your data as a list of dicts, each containing at least two keys: *idx* and *prompt* (a minimal sketch of the expected format is given at the end of this section). [**format_data.py**](format_data.py) illustrates how we format the MeetingBank data.
30 |
31 | Then, instruct GPT-4 to compress the original context.
32 |
33 | ```bash
34 | python compress.py --load_origin_from \
35 | --chunk_size 512 \
36 | --compressor llmcomp \
37 | --model_name gpt-4-32k \
38 | --save_path
39 |
40 | ```
41 |
42 | Then, assign labels to the original words according to the GPT-4 compression results.
43 |
44 |
45 | ```bash
46 | python label_word.py \
47 | --load_prompt_from \
48 | --window_size 400 \
49 | --save_path \
50 |
51 | ```
52 |
53 | Finally, filter out poorly compressed or poorly labeled samples.
54 | ```bash
55 | python filter.py --load_path \
56 | --save_path
57 | ```
58 |
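59 | For reference, here is a minimal sketch of the input expected by `compress.py` in step one above: a JSON list of dicts with at least *idx* and *prompt* (the file name and any extra fields, such as *summary*, are illustrative):
60 |
61 | ```python
62 | # Illustrative only: mirrors what format_data.py produces for MeetingBank.
63 | import json
64 |
65 | data = [
66 |     {"idx": 0, "prompt": "Full text of the first document/transcript to compress ..."},
67 |     {"idx": 1, "prompt": "Full text of the second document/transcript to compress ..."},
68 | ]
69 | json.dump(data, open("my_dataset_formatted.json", "w"), indent=4)
70 | # then: python compress.py --load_origin_from my_dataset_formatted.json --save_path <output_path> ...
71 | ```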
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/collect_data.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | # python format_data.py
5 |
6 | # python compress.py --load_origin_from ../results/meetingbank/origin/meetingbank_train_formated.json \
7 | # --compressor gpt4 \
8 | # --chunk_size 512 \
9 | # --save_path ../results/meetingbank/gpt-4-32k_comp/compression_cs512_meetingbank_train_formated.json
10 |
11 | python label_word.py --load_prompt_from microsoft/MeetingBank-LLMCompressed \
12 | --window_size 400 \
13 | --save_path ../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.json
14 |
15 | python filter.py --load_path ../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.pt \
16 | --save_path ../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt
17 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/compress.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import copy
6 | import json
7 | import os
8 | import time
9 |
10 | import tiktoken
11 | from tqdm import tqdm
12 |
13 | parser = argparse.ArgumentParser(description="compress any prompt.")
14 |
15 | parser.add_argument("--compressor", help="compress method", default="gpt4")
16 | parser.add_argument("--model_name", help="llm used to compress", default="gpt-4-32k")
17 |
18 | parser.add_argument(
19 | "--load_origin_from", help="dataset used to compress", required=True
20 | )
21 | parser.add_argument(
22 | "--load_key", help="the key to load the text to compress", default="prompt"
23 | )
24 | parser.add_argument(
25 | "--save_key",
26 | help="the key to save the compressed text",
27 | default="compressed_prompt",
28 | )
29 |
30 | parser.add_argument("--save_path", help="path to save results", required=True)
31 | # for gpt-4 compression
32 | parser.add_argument(
33 | "--load_prompt_from", help="", default="compression_instructions.json"
34 | )
35 | parser.add_argument("--prompt_id", type=int, default=4)
36 | parser.add_argument("--n_max_new_token", type=int, default=4000)
37 | # for gpt-4 compression and selective-context
38 | parser.add_argument("--chunk_size", type=int, default=-1)
39 | # for llmlingua
40 | parser.add_argument(
41 | "--compression_rate", help="compression rate", type=float, default=0.5
42 | )
43 | parser.add_argument(
44 | "--n_target_token", help="number of target tokens", type=int, default=-1
45 | )
46 |
47 | args = parser.parse_args()
48 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
49 |
50 | data = json.load(open(args.load_origin_from))
51 | print(f"num data: {len(data)}")
52 |
53 | if args.compressor == "gpt4":
54 | from GPT4_compressor import PromptCompressor
55 |
56 | prompts = json.load(open(args.load_prompt_from))
57 | system_prompt = prompts[str(args.prompt_id)]["system_prompt"]
58 | user_prompt = prompts[str(args.prompt_id)]["user_prompt"]
59 | compressor = PromptCompressor(
60 | model_name=args.model_name, system_prompt=system_prompt, user_prompt=user_prompt
61 | )
62 | elif args.compressor == "llmlingua" or args.compressor == "longllmlingua":
63 | from llmlingua import PromptCompressor
64 |
65 | compressor = PromptCompressor()
66 | elif args.compressor == "sc":
67 | from select_context import SelectiveContext
68 |
69 | compressor = SelectiveContext(model_type="NousResearch/Llama-2-7b-hf", lang="en")
70 | else:
71 | raise NotImplementedError()
72 |
73 | results = {}
74 | results_list = []
75 | total_time = 0
76 |
77 | if os.path.exists(args.save_path):
78 | results = json.load(open(args.save_path))
79 |
80 | tokenizer = tiktoken.encoding_for_model("gpt-4")
81 |
82 |
83 | def chunk_origin(origin_text):
84 | origin_list = []
85 | origin_token_ids = tokenizer.encode(origin_text)
86 | end_token_ids = set(tokenizer.encode(".") + tokenizer.encode("\n"))
87 | n = len(origin_token_ids)
88 | st = 0
89 | while st < n:
90 | if st + args.chunk_size > n - 1:
91 | chunk = tokenizer.decode(origin_token_ids[st:n])
92 | origin_list.append(chunk)
93 | break
94 | else:
95 | ed = st + args.chunk_size
96 | for j in range(0, ed - st):
97 | if origin_token_ids[ed - j] in end_token_ids:
98 | ed = ed - j
99 | break
100 | chunk = tokenizer.decode(origin_token_ids[st : ed + 1])
101 | origin_list.append(chunk)
102 | st = ed + 1
103 | return origin_list
104 |
105 |
106 | for sample in tqdm(data):
107 | idx = int(sample["idx"])
108 | origin = copy.deepcopy(sample[args.load_key])
109 | if origin is None:
110 | continue
111 | if idx in results or str(idx) in results:
112 | print(f"{idx}-th sample is processed")
113 | continue
114 |
115 | t = time.time()
116 | if args.compressor == "llmlingua" or args.compressor == "longllmlingua":
117 | comp_dict = compressor.compress_prompt(
118 | origin, ratio=args.compression_rate, target_token=args.n_target_token
119 | )
120 | comp = comp_dict["compressed_prompt"]
121 | else:
122 | # multi document
123 | if isinstance(origin, list):
124 | if args.chunk_size > 0:
125 | chunk_list = []
126 | for j, document in enumerate(origin):
127 | ori_list = chunk_origin(document)
128 | chunk_list.extend(ori_list)
129 | origin = chunk_list
130 | # single document
131 | else:
132 | origin = [origin]
133 | if args.chunk_size > 0:
134 | origin = chunk_origin(origin[0])
135 | print(f"num chunk: {len(origin)}")
136 | comp_list = []
137 | for j, chunk in enumerate(origin):
138 | if args.compressor == "gpt4":
139 | comp = compressor.compress(chunk, args.n_max_new_token)
140 | elif args.compressor == "sc":
141 | if args.n_target_token > 0:
142 | reduce_ratio = 1 - min(
143 | (args.n_target_token // len(origin))
144 | / len(tokenizer.encode(chunk)),
145 | 1.0,
146 | )
147 | else:
148 |                 reduce_ratio = 1.0 - args.compression_rate
149 | comp, reduced = compressor(
150 | chunk, reduce_ratio=reduce_ratio, reduce_level="token"
151 | )
152 |             comp = comp.replace("<s>", "").replace("</s>", "")
153 | comp_list.append(comp)
154 | assert len(origin) == len(comp_list)
155 | comp = "".join(comp_list)
156 |
157 | total_time += time.time() - t
158 | new_sample = copy.deepcopy(sample)
159 | new_sample[args.save_key] = comp
160 | if (
161 | not (args.compressor == "llmlingua" or args.compressor == "longllmlingua")
162 | and len(comp_list) > 0
163 | ):
164 | assert len(origin) == len(comp_list)
165 | new_sample["prompt_list"] = origin[:]
166 | new_sample["compressed_prompt_list"] = comp_list[:]
167 |
168 | results[idx] = new_sample
169 | json.dump(
170 | results,
171 | open(args.save_path, "w", encoding="utf8"),
172 | indent=4,
173 | ensure_ascii=False,
174 | )
175 |
176 | print(args.save_path, total_time)
177 | json.dump(
178 | results, open(args.save_path, "w", encoding="utf8"), indent=4, ensure_ascii=False
179 | )
180 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/compression_instructions.json:
--------------------------------------------------------------------------------
1 | {
2 | "0":{"system_prompt": "Could you please rephrase the paragraph to make it short, and keep 5% tokens?", "user_prompt": ""},
3 | "1":{"system_prompt": "Summarize the provided examples in a few sentences, maintaining all essential reasoning aspects", "user_prompt": ""},
4 | "2":{"system_prompt": "Follow these steps to shorten the given text content: 1. First, calculate the amount of information contained in each sentence, and remove sentences with less information. 2. Next, further condense the text by removing stop words, unnecessary punctuation, and redundant expressions. Refine the content while ensuring that all key information is retained. Let's do it step by step.", "user_prompt": ""},
5 | "3":{"system_prompt": "Remove redundancy and express the text concisely in English, ensuring that all key information and reasoning processes are preserved.", "user_prompt": ""},
6 | "4":{"system_prompt": "You are an excellent linguist and very good at compressing passages into short expressions by removing unimportant words, while retaining as much information as possible.", "user_prompt": "Compress some text to short expressions, and such that you (GPT-4) can reconstruct it as close as possible to the original. Unlike the usual text compression, I need you to comply with the 5 conditions below: 1. You can ONLY remove unimportant words. 2. Do not change the order of words. 3. Do not change the original words, e.g. 'asking'->'ask' is NOT OK, 'current'->'now' is NOT OK. 4. Do not use abbreviations or emojis, e.g. 'without'->'w/o' is NOT OK, 'as soon as possible'->'ASAP' is NOT OK. 5. Do not add new words or symbols, this is very important. For example, 'dedicate 3 hours to each chapter'->'3 hours/chapter' is NOT OK because you add new token '/', just compress it into '3 hours each chapter'. '30 eggs plus 20 eggs equals 50 eggs'->'30+20=50' is also NOT OK becuase you add new symbols + and =, just compress it into '30 plus 20 equals 50'. \nCompress the origin aggressively by removing words only. Compress the origin as short as you can, while retaining as much information as possible. \nIf you understand, please compress the following text: \n{text_to_compress}\nThe compressed text is: "}
7 | }
8 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/filter.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | from collections import defaultdict
6 |
7 | import numpy as np
8 | import torch
9 |
10 | parser = argparse.ArgumentParser(description="compress any prompt.")
11 | parser.add_argument(
12 | "--load_path",
13 | help="path to load data",
14 | default="../../../results/meetingbank/gpt-4-32k_comp/annotation_cs512_meetingbank_train_formated.pt",
15 | )
16 | parser.add_argument(
17 | "--save_path",
18 | help="path to save filtered data",
19 | default="../../../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt",
20 | )
21 | args = parser.parse_args()
22 |
23 | res_pt = torch.load(args.load_path)
24 |
25 | ## filtering
26 | variation_rate_list = res_pt["variation_rate"]
27 | print(len(variation_rate_list))
28 | threshold = np.percentile(variation_rate_list, 90)
29 | kept, filtered = defaultdict(list), defaultdict(list)
30 | for labels, origin, comp, retrieval, cr, vr, hr, mr, ag in zip(
31 | res_pt["labels"],
32 | res_pt["origin"],
33 | res_pt["comp"],
34 | res_pt["retrieval"],
35 | res_pt["comp_rate"],
36 | res_pt["variation_rate"],
37 | res_pt["hitting_rate"],
38 | res_pt["matching_rate"],
39 | res_pt["alignment_gap"],
40 | ):
41 | if vr >= threshold:
42 | filtered["labels"].append(labels)
43 | filtered["origin"].append(origin)
44 | filtered["comp"].append(comp)
45 | filtered["retrieval"].append(retrieval)
46 | filtered["comp_rate"].append(cr)
47 | filtered["variation_rate"].append(vr)
48 | filtered["hitting_rate"].append(hr)
49 | filtered["matching_rate"].append(mr)
50 | filtered["alignment_gap"].append(ag)
51 | else:
52 | kept["labels"].append(labels)
53 | kept["origin"].append(origin)
54 | kept["comp"].append(comp)
55 | kept["retrieval"].append(retrieval)
56 | kept["comp_rate"].append(cr)
57 | kept["variation_rate"].append(vr)
58 | kept["hitting_rate"].append(hr)
59 | kept["matching_rate"].append(mr)
60 | kept["alignment_gap"].append(ag)
61 |
62 | alignment_gap_list = kept["alignment_gap"]
63 | threshold = np.percentile(alignment_gap_list, 90)
64 | kept2 = defaultdict(list)
65 | for labels, origin, comp, retrieval, cr, vr, hr, mr, ag in zip(
66 | kept["labels"],
67 | kept["origin"],
68 | kept["comp"],
69 |     kept["retrieval"],
70 | kept["comp_rate"],
71 | kept["variation_rate"],
72 | kept["hitting_rate"],
73 | kept["matching_rate"],
74 | kept["alignment_gap"],
75 | ):
76 | if ag >= threshold:
77 | filtered["labels"].append(labels)
78 | filtered["origin"].append(origin)
79 | filtered["comp"].append(comp)
80 | filtered["retrieval"].append(retrieval)
81 | filtered["comp_rate"].append(cr)
82 | filtered["variation_rate"].append(vr)
83 | filtered["hitting_rate"].append(hr)
84 | filtered["matching_rate"].append(mr)
85 | filtered["alignment_gap"].append(ag)
86 | else:
87 | kept2["labels"].append(labels)
88 | kept2["origin"].append(origin)
89 | kept2["comp"].append(comp)
90 | kept2["retrieval"].append(retrieval)
91 | kept2["comp_rate"].append(cr)
92 | kept2["variation_rate"].append(vr)
93 | kept2["hitting_rate"].append(hr)
94 | kept2["matching_rate"].append(mr)
95 | kept2["alignment_gap"].append(ag)
96 |
97 | torch.save(kept2, args.save_path)
98 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/format_data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import json
5 | import os
6 |
7 | from datasets import load_dataset
8 |
9 | dataset = load_dataset("huuuyeah/meetingbank", split="train")
10 | data = []
11 | for idx, instance in enumerate(dataset):
12 | temp = {}
13 | temp["idx"] = idx
14 | temp["prompt"] = instance["transcript"]
15 | temp["summary"] = instance["summary"]
16 | data.append(temp)
17 | os.makedirs("../../../results/meetingbank/origin/", exist_ok=True)
18 | json.dump(
19 | data,
20 | open("../../../results/meetingbank/origin/meetingbank_train_formated.json", "w"),
21 | indent=4,
22 | )
23 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/label_word.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import logging
7 | import os
8 | from collections import defaultdict
9 | from datasets import load_dataset
10 | import spacy
11 | import torch
12 | from tqdm import tqdm
13 |
14 | parser = argparse.ArgumentParser(description="annotate token")
15 | parser.add_argument(
16 | "--dataset_name", help="dataset used to compress", default="meetingbank"
17 | )
18 | parser.add_argument("--split", help="dataset part", default="train")
19 | parser.add_argument(
20 | "--load_prompt_from",
21 | help="where to load compressed prompt",
22 | default="results/meetingbank/origin-comp-list_llmcomp_cs512.json",
23 | )
24 | parser.add_argument(
25 | "--save_path",
26 | help="path to save results",
27 | default="results/meetingbank/annotation/label_word.json",
28 | )
29 | parser.add_argument("--window_size", help="window size", type=int, default=150)
30 | parser.add_argument(
31 | "--verbose",
32 | help="print debug info",
33 | action=argparse.BooleanOptionalAction,
34 | default=False,
35 | )
36 |
37 | args = parser.parse_args()
38 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
39 | logging.basicConfig(
40 | filename=f"{os.path.dirname(args.save_path)}/log.log",
41 | level=logging.INFO,
42 | format="%(asctime)s - %(levelname)s - %(message)s",
43 | )
44 | logger = logging.getLogger()
45 |
46 | nlp = spacy.load("en_core_web_sm")
47 |
48 |
49 | def split_string(input_string, ignore_tokens=set([","])):
50 | doc = nlp(input_string)
51 | word_list = []
52 | for word in doc:
53 | if word.lemma_ not in ignore_tokens:
54 | word_list.append(word.lemma_)
55 | return word_list
56 |
57 |
58 | def is_equal(token1, token2):
59 | return token1.lower() == token2.lower()
60 |
61 | origins, comps = [], []
62 | meeting_bank_comp = load_dataset(args.load_prompt_from, split="train")
63 | for i, sample in enumerate(meeting_bank_comp):
64 | if len(sample["prompt_list"]) != len(sample["compressed_prompt_list"]):
65 | print(f"{i}-th length not equal")
66 | continue
67 | origins.extend(sample["prompt_list"])
68 | comps.extend(sample["compressed_prompt_list"])
69 |
70 | res = {}
71 | res_pt = defaultdict(list)
72 |
73 | num_sample = 0
74 | compression_rate_avg = 0
75 | find_rate_avg = 0
76 | variation_rate_avg = 0
77 | matching_rate_avg = 0
78 | hitting_rate_avg = 0
79 | alignment_gap_avg = 0
80 |
81 | for chunk_idx, (origin, comp) in tqdm(enumerate(zip(origins, comps))):
82 | num_sample += 1
83 | origin_tokens = split_string(origin)
84 | comp_tokens = split_string(comp)
85 | origin_tokens_set = set(origin_tokens)
86 | for token in origin_tokens:
87 | origin_tokens_set.add(token.lower())
88 |
89 | num_find = 0
90 | prev_idx = 0
91 | back_cnt = 0
92 | num_origin_tokens = len(origin_tokens)
93 | labels = [False] * num_origin_tokens
94 | for token in comp_tokens:
95 | flag = False
96 | if token in origin_tokens_set or token.lower() in origin_tokens_set:
97 | num_find += 1
98 | for i in range(args.window_size):
99 | # look forward
100 | token_idx = min(prev_idx + i, num_origin_tokens - 1)
101 | if is_equal(origin_tokens[token_idx], token) and not labels[token_idx]:
102 | labels[token_idx] = True
103 | # window do not go too fast
104 | if token_idx - prev_idx > args.window_size // 2:
105 | prev_idx += args.window_size // 2
106 | else:
107 | prev_idx = token_idx
108 | if args.verbose:
109 | print(
110 | token,
111 | token_idx,
112 | prev_idx,
113 | origin_tokens[token_idx - 1 : token_idx + 2],
114 | )
115 | flag = True
116 | break
117 | # look backward
118 | token_idx = max(prev_idx - i, 0)
119 | if is_equal(origin_tokens[token_idx], token) and not labels[token_idx]:
120 | labels[token_idx] = True
121 | prev_idx = token_idx
122 | if args.verbose:
123 | print(
124 | token,
125 | token_idx,
126 | prev_idx,
127 | origin_tokens[token_idx - 1 : token_idx + 2],
128 | )
129 | flag = True
130 | break
131 |
132 | retrieval_tokens = []
133 | for idx, token in enumerate(origin_tokens):
134 | if labels[idx]:
135 | retrieval_tokens.append(token)
136 | retrieval = " ".join(retrieval_tokens)
137 |
138 | comp_rate = len(comp_tokens) / len(origin_tokens)
139 | if len(comp_tokens) > 0:
140 | find_rate = num_find / len(comp_tokens)
141 | else:
142 | find_rate = 0.0
143 | variation_rate = 1 - find_rate
144 | hitting_rate = num_find / len(origin_tokens)
145 | matching_rate = sum(labels) / len(labels)
146 | alignment_gap = hitting_rate - matching_rate
147 |
148 | compression_rate_avg += comp_rate
149 | find_rate_avg += find_rate
150 | variation_rate_avg += variation_rate
151 | hitting_rate_avg += hitting_rate
152 | matching_rate_avg += matching_rate
153 | alignment_gap_avg += alignment_gap
154 |
155 | if alignment_gap > 0.1:
156 | print(origin)
157 | print("-" * 50)
158 | print(comp)
159 | print("-" * 50)
160 | print(retrieval)
161 | print("-" * 50)
162 | print(origin_tokens)
163 | print("-" * 50)
164 | print(comp_tokens)
165 | print("-" * 50)
166 | print(retrieval_tokens)
167 | print("=" * 50)
168 |
169 | print(
170 | f"comp rate: {comp_rate}, variation_rate: {variation_rate}, alignment_gap: {alignment_gap}"
171 | )
172 |
173 | res[chunk_idx] = {
174 | "labels": labels,
175 | "origin": origin,
176 | "comp": comp,
177 | "retrieval": retrieval,
178 | "origin_tokens": origin_tokens,
179 | "comp_rate": comp_rate,
180 | "variation_rate": variation_rate,
181 | "hitting_rate": hitting_rate,
182 | "matching_rate": matching_rate,
183 | "alignment_gap": alignment_gap,
184 | }
185 |
186 | res_pt["labels"].append(labels)
187 | res_pt["origin"].append(origin)
188 | res_pt["comp"].append(comp)
189 | res_pt["retrieval"].append(retrieval)
190 | res_pt["origin_tokens"].append(origin_tokens)
191 | res_pt["comp_rate"].append(comp_rate)
192 | res_pt["variation_rate"].append(variation_rate)
193 | res_pt["hitting_rate"].append(hitting_rate)
194 | res_pt["matching_rate"].append(matching_rate)
195 | res_pt["alignment_gap"].append(alignment_gap)
196 |
197 | if int(chunk_idx) % 1000 == 0:
198 | json.dump(res, open(args.save_path, "w"), indent=4)
199 | torch.save(res_pt, args.save_path.replace(".json", ".pt"))
200 |
201 | json.dump(res, open(args.save_path, "w"), indent=4)
202 | torch.save(res_pt, args.save_path.replace(".json", ".pt"))
203 |
204 | compression_rate_avg = compression_rate_avg / num_sample
205 | find_rate_avg = find_rate_avg / num_sample
206 | variation_rate_avg = variation_rate_avg / num_sample
207 | matching_rate_avg = matching_rate_avg / num_sample
208 | hitting_rate_avg = hitting_rate_avg / num_sample
209 | alignment_gap_avg = alignment_gap_avg / num_sample
210 |
211 | print_info = f"window size: {args.window_size}, comp rate: {compression_rate_avg}, hitting_rate: {hitting_rate_avg}, retrieval rate: {matching_rate_avg}"
212 | print(print_info)
213 | logger.info(print_info)
214 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/data_collection/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | from time import sleep
5 |
6 | import openai
7 | import tiktoken
8 |
9 |
10 | def query_llm(
11 | prompt,
12 | model,
13 | model_name,
14 | max_tokens,
15 | tokenizer=None,
16 | chat_completion=False,
17 | **kwargs,
18 | ):
19 | SLEEP_TIME_FAILED = 62
20 |
21 | request = {
22 | "temperature": kwargs["temperature"] if "temperature" in kwargs else 0.0,
23 | "top_p": kwargs["top_p"] if "top_p" in kwargs else 1.0,
24 | "seed": kwargs["seed"] if "seed" in kwargs else 42,
25 | "max_tokens": max_tokens,
26 | "n": 1,
27 | "stream": False,
28 | }
29 | if chat_completion:
30 | request["messages"] = [
31 | {"role": "system", "content": "You are a helpful assistant."},
32 | {"role": "user", "content": prompt},
33 | ]
34 | else:
35 | request["prompt"] = prompt
36 |
37 | answer = None
38 | response = None
39 | while answer is None:
40 | try:
41 | response = model.create(engine=model_name, **request)
42 | answer = (
43 | response["choices"][0]["message"]["content"]
44 | if chat_completion
45 | else response["choices"][0]["text"]
46 | )
47 | except Exception as e:
48 | answer = None
49 | print(f"error: {e}, response: {response}")
50 | sleep(SLEEP_TIME_FAILED)
51 | # sleep(SLEEP_TIME_SUCCESS)
52 | return answer
53 |
54 |
55 | def load_model_and_tokenizer(model_name_or_path, chat_completion=False):
56 | openai.api_key = "your_api_key"
57 | openai.api_base = "your_api_base"
58 | openai.api_type = "azure"
59 | openai.api_version = "2023-05-15"
60 |
61 | if chat_completion:
62 | model = openai.ChatCompletion
63 | else:
64 | model = openai.Completion
65 |
66 | tokenizer = tiktoken.encoding_for_model("gpt-4")
67 | return model, tokenizer
68 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/compress.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import copy
6 | import json
7 | import os
8 | import time
9 |
10 | from tqdm import tqdm
11 |
12 | from llmlingua.prompt_compressor import PromptCompressor
13 |
14 | parser = argparse.ArgumentParser(description="compress any prompt.")
15 |
16 | parser.add_argument("--compressor", help="compress method", default="llmcomp")
17 | parser.add_argument(
18 | "--model_name",
19 | help="llm used to compress",
20 | default="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
21 | )
22 | parser.add_argument(
23 | "--load_origin_from", help="dataset used to compress", required=True
24 | )
25 | parser.add_argument(
26 | "--load_key", help="the key to load the text to compress", default="prompt"
27 | )
28 | parser.add_argument(
29 | "--save_key",
30 | help="the key to save the compressed text",
31 | default="compressed_prompt",
32 | )
33 |
34 | parser.add_argument("--save_path", help="path to save results", required=True)
35 |
36 | # for llmlingua2
37 | parser.add_argument(
38 | "--compression_rate", help="compression rate", type=float, default=0.5
39 | )
40 | parser.add_argument(
41 | "--target_token", help="number of target tokens", type=int, default=-1
42 | )
43 | # llmlingua2 coarse to fine
44 | parser.add_argument(
45 | "--use_token_level_filter", action=argparse.BooleanOptionalAction, default=True
46 | )
47 | parser.add_argument(
48 | "--use_context_level_filter", action=argparse.BooleanOptionalAction, default=False
49 | )
50 | parser.add_argument("--target_context", type=int, default=-1)
51 | parser.add_argument("--context_level_compression_rate", type=float, default=1.0)
52 | parser.add_argument("--context_level_target_token", type=int, default=-1)
53 | # llmlingua2 details
54 | parser.add_argument(
55 | "--force_tokens",
56 | help="the tokens which will be forcely preserved, comma separated",
57 | type=str,
58 | default=None,
59 | )
60 | parser.add_argument(
61 | "--drop_consecutive", action=argparse.BooleanOptionalAction, default=True
62 | )
63 | parser.add_argument(
64 | "--force_reserve_digit", action=argparse.BooleanOptionalAction, default=False
65 | )
66 |
67 | args = parser.parse_args()
68 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
69 | if args.force_tokens is not None:
70 | args.force_tokens = [
71 | str(item).replace("\\n", "\n") for item in args.force_tokens.split(",")
72 | ]
73 | else:
74 | args.force_tokens = []
75 | print(f"force tokens: {args.force_tokens}")
76 |
77 | data = json.load(open(args.load_origin_from))
78 | print(f"num data: {len(data)}")
79 |
80 | compressor = PromptCompressor(
81 | model_name=args.model_name,
82 | model_config={},
83 | use_llmlingua2=True,
84 | )
85 |
86 | results = {}
87 | results_list = []
88 | total_time = 0
89 |
90 | if os.path.exists(args.save_path):
91 | results = json.load(open(args.save_path))
92 |
93 | for sample in tqdm(data):
94 | idx = int(sample["idx"])
95 | origin = copy.deepcopy(sample[args.load_key])
96 | if origin is None:
97 | continue
98 | if idx in results or str(idx) in results:
99 | print(f"{idx}-th sample is processed")
100 | continue
101 | t = time.time()
102 | comp_dict = compressor.compress_prompt_llmlingua2(
103 | origin,
104 | rate=args.compression_rate,
105 | target_token=args.target_token,
106 | use_context_level_filter=args.use_context_level_filter,
107 | use_token_level_filter=args.use_token_level_filter,
108 | target_context=args.target_context,
109 | context_level_rate=args.context_level_compression_rate,
110 | context_level_target_token=args.context_level_target_token,
111 | force_tokens=args.force_tokens,
112 | drop_consecutive=args.drop_consecutive,
113 | force_reserve_digit=args.force_reserve_digit,
114 | )
115 | total_time += time.time() - t
116 | comp = comp_dict["compressed_prompt"]
117 | comp_list = comp_dict["compressed_prompt_list"]
118 |
119 | new_sample = copy.deepcopy(sample)
120 | new_sample[args.save_key] = comp
121 | if comp_list is not None and args.load_key == "prompt_list":
122 | new_sample["compressed_prompt_list"] = comp_list
123 | print(len(new_sample["prompt_list"]), len(new_sample["compressed_prompt_list"]))
124 |
125 | results[idx] = new_sample
126 | json.dump(
127 | results,
128 | open(args.save_path, "w", encoding="utf8"),
129 | indent=4,
130 | ensure_ascii=False,
131 | )
132 |
133 | print(args.save_path, total_time)
134 | json.dump(
135 | results, open(args.save_path, "w", encoding="utf8"), indent=4, ensure_ascii=False
136 | )
137 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/eval_bbh.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import os
7 | import re
8 | from collections import defaultdict
9 |
10 | import tiktoken
11 | from tqdm import tqdm
12 | from utils import load_model_and_tokenizer, query_llm
13 |
14 | parser = argparse.ArgumentParser(description="compress any prompt.")
15 | parser.add_argument(
16 | "--model_name_or_path", help="LLM used to answer", default="gpt-3.5-turbo-0613"
17 | )
18 |
19 | parser.add_argument("--n_max_token", type=int, default=8100)
20 | parser.add_argument(
21 | "--n_max_token_ans",
22 | type=int,
23 | default=400,
24 | help="token num in answer, following llmlingua",
25 | )
26 |
27 | parser.add_argument(
28 | "--load_prompt_from",
29 | help="where to load compressed prompt",
30 | default="results/gsm8k/origin/gsm8k_test.json",
31 | )
32 | parser.add_argument("--load_key", default="prompt", type=str)
33 | parser.add_argument(
34 | "--save_path",
35 | help="path to save results",
36 | default="results/gsm8k/origin/gpt35_answer/answer_gsm8k_test.json",
37 | )
38 |
39 | parser.add_argument("--num_sample", default=-1, type=int)
40 | args = parser.parse_args()
41 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
42 |
43 |
44 | MULTIPLE_CHOICE_TASKS = [
45 | "temporal_sequences",
46 | "disambiguation_qa",
47 | "date_understanding",
48 | "tracking_shuffled_objects_three_objects",
49 | "penguins_in_a_table",
50 | "geometric_shapes",
51 | "snarks",
52 | "ruin_names",
53 | "tracking_shuffled_objects_seven_objects",
54 | "tracking_shuffled_objects_five_objects",
55 | "logical_deduction_three_objects",
56 | "hyperbaton",
57 | "logical_deduction_five_objects",
58 | "logical_deduction_seven_objects",
59 | "movie_recommendation",
60 | "salient_translation_error_detection",
61 | "reasoning_about_colored_objects",
62 | ]
63 | FREE_FORM_TASKS = [
64 | "multistep_arithmetic_two",
65 | "navigate",
66 | "dyck_languages",
67 | "word_sorting",
68 | "sports_understanding",
69 | "boolean_expressions",
70 | "object_counting",
71 | "formal_fallacies",
72 | "causal_judgement",
73 | "web_of_lies",
74 | ]
75 |
76 |
77 | def extract_ans(ans, mode):
78 | ans_line = ans.split("answer is ", 1)
79 | # Expect to see 'answer is'. If not return whole string
80 | if len(ans_line) == 1:
81 | return ans
82 | else:
83 | ans = ans_line[-1].strip()
84 |
85 | if mode == "multiple_choice":
86 | options = [
87 | "(A)",
88 | "(B)",
89 | "(C)",
90 | "(D)",
91 | "(E)",
92 | "(F)",
93 | "(G)",
94 | "(H)",
95 | "(I)",
96 | "(J)",
97 | "(K)",
98 | "(L)",
99 | "(M)",
100 | "(N)",
101 | "(O)",
102 | "(P)",
103 | "(Q)",
104 | "(R)",
105 | "(S)",
106 | "(T)",
107 | "(U)",
108 | "(V)",
109 | "(W)",
110 | "(X)",
111 | "(Y)",
112 | "(Z)",
113 | ]
114 | match_g = []
115 | for option in options:
116 | if option in ans:
117 | # ans = option[1]
118 | match_g.append((ans.index(option), option[1]))
119 | if match_g:
120 | match_g.sort(key=lambda x: x[0])
121 | return match_g[0][1]
122 | elif mode == "free_form":
123 | ans = ans.split(".", 1)[0]
124 |         if ans.endswith("."):
125 | ans = ans[:-1]
126 | return ans
127 |
128 |
129 | def analyze_cases(good, bad, task):
130 | _, good_questions, good_ans_pred, good_ans_gold = good
131 | _, bad_questions, bad_ans_pred, bad_ans_gold = bad
132 | mode = "multiple_choice" if task in MULTIPLE_CHOICE_TASKS else "free_form"
133 | true_map, x_map = {}, {}
134 | for q, p, g in zip(good_questions[task], good_ans_pred[task], good_ans_gold[task]):
135 | p_ans, g_ans = extract_ans(p, mode), g
136 | if p_ans == g_ans:
137 | true_map[q] = (p, g, p_ans, g_ans)
138 | x_map[q] = (p, g, p_ans, g_ans)
139 | false_map = {}
140 | for q, p, g in zip(bad_questions[task], bad_ans_pred[task], bad_ans_gold[task]):
141 | p_ans, g_ans = extract_ans(p, mode), g
142 | if p_ans != g_ans and q in true_map:
143 | false_map[q] = (p, g, p_ans, g_ans)
144 |
145 |
146 | def parse_pred_ans(path: str):
147 | res = open(path).read()
148 | pattern = "Task:(.*?)\n(.*?)\nA_model:(.*?)\nA_target:(.*?)\n\n"
149 | g, ans = defaultdict(int), defaultdict(list)
150 | questions, ans_models, ans_targets = (
151 | defaultdict(list),
152 | defaultdict(list),
153 | defaultdict(list),
154 | )
155 | for m in re.findall(pattern, res, re.S):
156 | task, question, ans_model, ans_target = m
157 | task = task.strip()
158 | mode = "multiple_choice" if task in MULTIPLE_CHOICE_TASKS else "free_form"
159 | question = question.strip()
160 | ans_model = ans_model.strip()
161 | ans_target = ans_target.strip()
162 | p, gg = extract_ans(ans_model, mode), ans_target
163 | g[task] += int(p == gg)
164 | ans[task].append((ans_model, gg))
165 | questions[task].append(question)
166 | ans_models[task].append(ans_model)
167 | ans_targets[task].append(ans_target)
168 | scores = defaultdict(dict)
169 | total_num = 0
170 | for task, correct in g.items():
171 | scores[task]["acc"] = correct / len(ans[task])
172 | scores[task]["num"] = len(ans[task])
173 | print(task, correct, len(ans[task]), correct / len(ans[task]))
174 | total_num += len(ans[task])
175 | print(total_num)
176 | score_list = [v["acc"] for v in scores.values()]
177 | scores["avg"] = sum(score_list) / len(score_list)
178 | # return ans, questions, ans_models, ans_targets
179 | return scores
180 |
181 |
182 | def get_generation_token_length(path):
183 | res = open(path, "r").read()
184 | pattern = "Task:(.*?)\n(.*?)\nA_model:(.*?)\nA_target:(.*?)\n\n"
185 | tokenizer = tiktoken.encoding_for_model("gpt-4")
186 | tokens = []
187 | for m in re.findall(pattern, res, re.S):
188 | task, question, ans_model, ans_target = m
189 | tokens.append(len(tokenizer.encode(ans_model)))
190 | return sum(tokens) / len(tokens)
191 |
192 |
193 | def predict():
194 | model, tokenizer = load_model_and_tokenizer(args.model_name_or_path)
195 |
196 | results = {}
197 | if os.path.exists(args.save_path):
198 | results = json.load(open(args.save_path))
199 |
200 | demonstration = json.load(open(args.load_prompt_from))
201 | prompts = {}
202 | instructions = {}
203 | for demon in demonstration.values():
204 | task = demon["task"]
205 | prompt = demon[args.load_key]
206 | instructions[task] = demon["instruction"]
207 | prompts[task] = prompt
208 | print(prompts)
209 | print(instructions)
210 |
211 | dataset = json.load(open("results/bbh/origin/bbh.json"))
212 | for sample in tqdm(dataset):
213 | idx = sample["idx"]
214 | task = sample["task"]
215 | task_type = "multiple_choice" if task in MULTIPLE_CHOICE_TASKS else "free_form"
216 | cot_prompt = prompts[task]
217 | instruction = instructions[task]
218 | if args.num_sample > 0 and int(idx) > args.num_sample:
219 | break
220 | if idx in results or str(idx) in results:
221 | print(f"{idx}-th processed")
222 | continue
223 | q = sample["question"]
224 | a = sample["answer"]
225 |
226 | if cot_prompt[0] != "\n":
227 | cot_prompt = "\n\n" + cot_prompt
228 | # print(cot_prompt)
229 | prompt = (
230 | f"{instruction}{cot_prompt}\n\nQ: {q}" + "\nA:Let's think step by step.\n"
231 | )
232 | token_ids = tokenizer.encode(prompt)
233 | # drop in middle
234 | if len(token_ids) > (args.n_max_token - args.n_max_token_ans):
235 | half = int((args.n_max_token - args.n_max_token_ans) / 2) - 1
236 | prompt = tokenizer.decode(token_ids[:half]) + tokenizer.decode(
237 | token_ids[-half:]
238 | )
239 | answer = query_llm(
240 | prompt,
241 | model,
242 | args.model_name_or_path,
243 | 400 if task != "geometric_shapes" else 800,
244 | )
245 |
246 | results[idx] = {"question": q, "model_answer": answer, "truth_answer": a}
247 | json.dump(results, open(args.save_path, "w"), indent=4)
248 |
249 | ans_ = extract_ans(answer, task_type)
250 | if task_type == "multiple_choice":
251 | a = a[1]
252 | res = "%dTask:%s\n%s\nA_model:%s\nA_target:%s\n\n" % (
253 | idx,
254 | task,
255 | q.replace("\n", ""),
256 | answer.replace("\n", "").replace("Q:", "").replace("A:", ""),
257 | a.replace("\n", ""),
258 | )
259 | with open(args.save_path.replace(".json", ".txt"), "a") as fd:
260 | fd.write(res)
261 |
262 |
263 | predict()
264 | scores = parse_pred_ans(args.save_path.replace(".json", ".txt"))
265 | save_path2 = os.path.join(
266 | os.path.dirname(args.save_path),
267 | os.path.basename(args.save_path).replace("answer", "metrics"),
268 | )
269 | json.dump(scores, open(save_path2, "w"))
270 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/eval_gsm8k.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import os
7 | import re
8 |
9 | from tqdm import tqdm
10 | from utils import load_model_and_tokenizer, query_llm
11 |
12 | parser = argparse.ArgumentParser(description="compress any prompt.")
13 | parser.add_argument(
14 | "--model_name_or_path", help="LLM used to answer", default="gpt-3.5-turbo-0613"
15 | )
16 |
17 | parser.add_argument("--n_max_token", type=int, default=8100)
18 | parser.add_argument(
19 | "--n_max_token_ans",
20 | type=int,
21 | default=400,
22 | help="token num in answer, following llmlingua",
23 | )
24 |
25 | parser.add_argument(
26 | "--load_prompt_from",
27 | help="where to load compressed prompt",
28 | default="results/gsm8k/origin/gsm8k_test.json",
29 | )
30 | parser.add_argument("--load_key", default="prompt", type=str)
31 | parser.add_argument(
32 | "--save_path",
33 | help="path to save results",
34 | default="results/gsm8k/origin/gpt35_answer/answer_gsm8k_test.json",
35 | )
36 |
37 | parser.add_argument("--num_sample", default=-1, type=int)
38 | args = parser.parse_args()
39 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
40 |
41 |
42 | def extract_ans(ans_model):
43 | ans_model = ans_model.split("\n")
44 | ans = []
45 | residual = []
46 | for li, al in enumerate(ans_model):
47 | ans.append(al)
48 | if "answer is" in al:
49 | break
50 | residual = list(ans_model[li + 1 :])
51 | ans = "\n".join(ans)
52 | residual = "\n".join(residual)
53 | return ans, residual
54 |
55 |
56 | def parse_pred_ans(filename):
57 | with open(filename) as fd:
58 | lines = fd.readlines()
59 | am, a = None, None
60 | num_q, acc = 0, 0
61 | current_mode = "none"
62 | questions = []
63 | ans_pred = []
64 | ans_gold = []
65 | for l in lines:
66 | l = l.replace(",", "")
67 | if l.startswith("Q: "):
68 | if am is not None and a is not None:
69 | questions.append(q)
70 | ans_pred.append(am)
71 | ans_gold.append(a)
72 | if test_answer(am, a):
73 | acc += 1
74 | current_mode = "q"
75 | q = l
76 | num_q += 1
77 | elif l.startswith("A_model:"):
78 | current_mode = "am"
79 | am = l
80 | elif l.startswith("A:"):
81 | current_mode = "a"
82 | a = l
83 | else:
84 | if current_mode == "q":
85 | q += l
86 | elif current_mode == "am":
87 | am += l
88 | elif current_mode == "a":
89 | a += l
90 | else:
91 | raise ValueError(current_mode)
92 |
93 | questions.append(q)
94 | ans_pred.append(am)
95 | ans_gold.append(a)
96 | if test_answer(am, a):
97 | acc += 1
98 | print("num_q %d correct %d ratio %.4f" % (num_q, acc, float(acc / num_q)))
99 | return questions, ans_pred, ans_gold
100 |
101 |
102 | def get_result(text: str):
103 |     pattern = r"\d*\.?\d+"
104 | res = re.findall(pattern, text)
105 | return res[-1] if res else ""
106 |
107 |
108 | def test_answer(pred_str, ans_str):
109 | pred, gold = get_result(pred_str), get_result(ans_str)
110 | return pred == gold
111 |
112 |
113 | def predict():
114 | model, tokenizer = load_model_and_tokenizer(args.model_name_or_path)
115 | dataset = json.load(open("../../../results/gsm8k/origin/gsm8k_test.json"))
116 |
117 | results = {}
118 | if os.path.exists(args.save_path):
119 | results = json.load(open(args.save_path))
120 |
121 | demon_dict = json.load(open(args.load_prompt_from))
122 | demonstrations = []
123 | for demon in demon_dict["0"][args.load_key]:
124 | demonstrations.append("\n\nQuestion: " + demon)
125 | demonstrations = "".join(demonstrations)
126 |
127 | for sample in tqdm(dataset):
128 | idx = sample["idx"]
129 | if idx in results or str(idx) in results:
130 | print(f"{idx}-th processed")
131 | continue
132 | q = sample["question"]
133 | a = sample["answer"]
134 |
135 | prompt = f"Please reference the following examples to answer the math question. \n {demonstrations}"
136 | query = f"\n\nQuestion: {q}" + "\nLet's think step by step."
137 | token_ids = tokenizer.encode(prompt)
138 | len2 = len(tokenizer.encode(query))
139 | # drop in middle
140 | if len(token_ids) > (args.n_max_token - args.n_max_token_ans - len2):
141 | half = int((args.n_max_token - args.n_max_token_ans - len2) / 2) - 1
142 | prompt = tokenizer.decode(token_ids[:half]) + tokenizer.decode(
143 | token_ids[-half:]
144 | )
145 | prompt = prompt + query
146 | answer = query_llm(prompt, model, args.model_name_or_path, args.n_max_token_ans)
147 |
148 | results[idx] = {"question": q, "model_answer": answer, "truth_answer": a}
149 | json.dump(results, open(args.save_path, "w"), indent=4)
150 |
151 | ans_, _ = extract_ans(answer)
152 | res = "Q: %s\nA_model:\n%s\nA:\n%s\n\n" % (
153 | q,
154 | ans_.replace("Q:", "").replace("A:", ""),
155 | a,
156 | )
157 | with open(args.save_path.replace(".json", ".txt"), "a") as fd:
158 | fd.write(res)
159 |
160 |
161 | predict()
162 | scores = parse_pred_ans(args.save_path.replace(".json", ".txt"))
163 | save_path2 = os.path.join(
164 | os.path.dirname(args.save_path),
165 | os.path.basename(args.save_path).replace("answer", "metrics"),
166 | )
167 | json.dump(scores, open(save_path2, "w"))
168 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/eval_longbench.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import os
7 | from collections import defaultdict
8 |
9 | import numpy as np
10 | from metrics import (
11 | classification_score,
12 | code_sim_score,
13 | count_score,
14 | qa_f1_score,
15 | qa_f1_zh_score,
16 | retrieval_score,
17 | retrieval_zh_score,
18 | rouge_score,
19 | rouge_zh_score,
20 | )
21 | from tqdm import tqdm
22 | from utils import load_model_and_tokenizer, query_llm
23 |
24 | dataset2metric = {
25 | "narrativeqa": qa_f1_score,
26 | "qasper": qa_f1_score,
27 | "multifieldqa_en": qa_f1_score,
28 | "multifieldqa_zh": qa_f1_zh_score,
29 | "hotpotqa": qa_f1_score,
30 | "2wikimqa": qa_f1_score,
31 | "musique": qa_f1_score,
32 | "dureader": rouge_zh_score,
33 | "gov_report": rouge_score,
34 | "qmsum": rouge_score,
35 | "multi_news": rouge_score,
36 | "vcsum": rouge_zh_score,
37 | "trec": classification_score,
38 | "triviaqa": qa_f1_score,
39 | "samsum": rouge_score,
40 | "lsht": classification_score,
41 | "passage_retrieval_en": retrieval_score,
42 | "passage_count": count_score,
43 | "passage_retrieval_zh": retrieval_zh_score,
44 | "lcc": code_sim_score,
45 | "repobench-p": code_sim_score,
46 | }
47 |
48 | parser = argparse.ArgumentParser(description="compress any prompt.")
49 | parser.add_argument(
50 | "--model_name_or_path", help="LLM used to answer", default="gpt-3.5-turbo-0613"
51 | )
52 |
53 | parser.add_argument("--n_max_token", type=int, default=8100)
54 | # parser.add_argument('--n_max_token_ans', type=int, default=400, help='token num in answer, following llmlingua')
55 |
56 | parser.add_argument(
57 | "--load_prompt_from",
58 | help="where to load compressed prompt",
59 | default="results/longbench/origin/longbench_test_single_doc_qa_formated.json",
60 | )
61 | parser.add_argument("--load_key", default="prompt", type=str)
62 | parser.add_argument(
63 | "--save_path",
64 | help="path to save results",
65 | default="results/longbench/origin/gpt35_chat_answer/answer_longbench_test_single_doc_qa_formated.json",
66 | )
67 |
68 | parser.add_argument("--e", action=argparse.BooleanOptionalAction, default=True)
69 | args = parser.parse_args()
70 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
71 | eng_datasets = [
72 | "narrativeqa",
73 | "qasper",
74 | "multifieldqa_en",
75 | "hotpotqa",
76 | "2wikimqa",
77 | "musique",
78 | "gov_report",
79 | "qmsum",
80 | "multi_news",
81 | "trec",
82 | "triviaqa",
83 | "samsum",
84 | "passage_count",
85 | "passage_retrieval_en",
86 | "lcc",
87 | "repobench-p",
88 | ]
89 | all_datasets = [
90 | "narrativeqa",
91 | "qasper",
92 | "multifieldqa_en",
93 | "multifieldqa_zh",
94 | "hotpotqa",
95 | "2wikimqa",
96 | "musique",
97 | "dureader",
98 | "gov_report",
99 | "qmsum",
100 | "multi_news",
101 | "vcsum",
102 | "trec",
103 | "triviaqa",
104 | "samsum",
105 | "lsht",
106 | "passage_count",
107 | "passage_retrieval_en",
108 | "passage_retrieval_zh",
109 | "lcc",
110 | "repobench-p",
111 | ]
112 |
113 |
114 | def scorer_e(dataset, predictions, answers, lengths, all_classes):
115 | scores = {"0-4k": [], "4-8k": [], "8k+": []}
116 | for prediction, ground_truths, length in zip(predictions, answers, lengths):
117 | score = 0.0
118 | if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
119 | prediction = prediction.lstrip("\n").split("\n")[0]
120 | for ground_truth in ground_truths:
121 | score = max(
122 | score,
123 | dataset2metric[dataset](
124 | prediction, ground_truth, all_classes=all_classes
125 | ),
126 | )
127 | if length < 4000:
128 | scores["0-4k"].append(score)
129 | elif length < 8000:
130 | scores["4-8k"].append(score)
131 | else:
132 | scores["8k+"].append(score)
133 | for key in scores.keys():
134 | scores[key] = round(100 * np.mean(scores[key]), 2)
135 | return scores
136 |
137 |
138 | def scorer(dataset, predictions, answers, all_classes):
139 | total_score = 0.0
140 | for prediction, ground_truths in zip(predictions, answers):
141 | score = 0.0
142 | if dataset in [
143 | "trec",
144 | "triviaqa",
145 | "samsum",
146 | "lsht",
147 | "narrativeqa",
148 | "qasper",
149 | "multifieldqa_en",
150 | "multifieldqa_zh",
151 | "hotpotqa",
152 | "2wikimqa",
153 | "musique",
154 | "dureader",
155 | "vcsum",
156 | ]:
157 | prediction = prediction.lstrip("\n").split("\n")[0]
158 | # if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
159 | # prediction = prediction.lstrip('\n').split('\n')[0]
160 | # for ground_truth in ground_truths:
161 | # score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
162 | # prediction = prediction.lstrip('\n').split('\n')[0]
163 | # prediction = prediction.strip("")
164 | for ground_truth in ground_truths:
165 | score = max(
166 | score,
167 | dataset2metric[dataset](
168 | prediction, ground_truth, all_classes=all_classes
169 | ),
170 | )
171 | total_score += score
172 | return round(100 * total_score / len(predictions), 2)
173 |
174 |
175 | def eval(load_path):
176 | results = json.load(open(load_path))
177 | predictions, answers, lengths = (
178 | defaultdict(list),
179 | defaultdict(list),
180 | defaultdict(list),
181 | )
182 | all_classes = {}
183 | for idx, data in results.items():
184 | predictions[data["task"]].append(data["pred"])
185 | answers[data["task"]].append(data["answers"])
186 | all_classes[data["task"]] = data["all_classes"]
187 | if "length" in data:
188 | lengths[data["task"]].append(data["length"])
189 | scores = {}
190 | for task in predictions.keys():
191 | pred_list, ans_list, length_list = (
192 | predictions[task],
193 | answers[task],
194 | lengths[task],
195 | )
196 | score = scorer(task, pred_list, ans_list, all_classes[task])
197 | print(score)
198 | scores[task] = {"score": score, "num": len(pred_list)}
199 | score_list = [s["score"] for s in scores.values()]
200 | scores["avg"] = sum(score_list) / len(score_list)
201 | return scores
202 |
203 |
204 | dataset2prompt = {
205 | "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
206 | "qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
207 | "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
208 | "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
209 | "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
210 | "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
211 | "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
212 | "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
213 | "gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
214 | "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
215 | "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
216 | "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
217 | "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
218 | "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
219 | "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
220 | "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
221 | "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
222 | "passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
223 | "passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
224 | "lcc": "Please complete the code given below. \n{context}Next line of code:\n",
225 | "repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n",
226 | }
227 |
228 | dataset2maxlen = {
229 | "narrativeqa": 128,
230 | "qasper": 128,
231 | "multifieldqa_en": 64,
232 | "multifieldqa_zh": 64,
233 | "hotpotqa": 32,
234 | "2wikimqa": 32,
235 | "musique": 32,
236 | "dureader": 128,
237 | "gov_report": 512,
238 | "qmsum": 512,
239 | "multi_news": 512,
240 | "vcsum": 512,
241 | "trec": 64,
242 | "triviaqa": 32,
243 | "samsum": 128,
244 | "lsht": 64,
245 | "passage_count": 32,
246 | "passage_retrieval_en": 32,
247 | "passage_retrieval_zh": 32,
248 | "lcc": 64,
249 | "repobench-p": 64,
250 | }
251 |
252 |
253 | def predict():
254 | model, tokenizer = load_model_and_tokenizer(args.model_name_or_path)
255 |
256 | dataset = json.load(open(args.load_prompt_from))
257 | print(len(dataset))
258 | if isinstance(dataset, dict):
259 | dataset = dataset.values()
260 | # dataset2prompt = json.load(
261 | # open("../data/LongBench/config/dataset2prompt.json", "r")
262 | # )
263 | # dataset2maxlen = json.load(
264 | # open("../data/LongBench/config/dataset2maxlen.json", "r")
265 | # )
266 | # prompt_format = dataset2prompt[args.task]
267 | # max_gen = int(dataset2maxlen[args.task])
268 |
269 | results = {}
270 | if os.path.exists(args.save_path):
271 | results = json.load(open(args.save_path))
272 |
273 | for sample in tqdm(dataset):
274 | idx = int(sample["idx"])
275 | task = sample["task"]
276 | if idx in results or str(idx) in results:
277 | print(f"{idx} processed")
278 | continue
279 | new_sample = {}
280 | new_sample["context"] = sample[args.load_key]
281 | new_sample["input"] = sample["question"]
282 |
283 | prompt_format = dataset2prompt[sample["task"]]
284 | max_gen = int(dataset2maxlen[sample["task"]])
285 | prompt = prompt_format.format(**new_sample)
286 | token_ids = tokenizer.encode(prompt)
287 |
288 | if len(token_ids) > (args.n_max_token - max_gen):
289 | half = int((args.n_max_token - max_gen) / 2) - 1
290 | prompt = tokenizer.decode(token_ids[:half]) + tokenizer.decode(
291 | token_ids[-half:]
292 | )
293 |
294 | pred = query_llm(
295 | prompt, model, args.model_name_or_path, max_gen, tokenizer=tokenizer
296 | )
297 | results[idx] = {
298 | "pred": pred,
299 | "answers": sample["answers"],
300 | "model_name": args.model_name_or_path,
301 | "task": sample["task"],
302 | "idx": idx,
303 | "all_classes": sample["all_classes"],
304 | "length": sample["length"],
305 | }
306 | json.dump(
307 | results,
308 | open(args.save_path, "w", encoding="utf8"),
309 | indent=4,
310 | ensure_ascii=False,
311 | )
312 |
313 |
314 | predict()
315 | score_dict = eval(load_path=args.save_path)
316 | print(score_dict)
317 | json.dump(
318 | score_dict,
319 | open(
320 | os.path.join(
321 | os.path.dirname(args.save_path),
322 | os.path.basename(args.save_path).replace("answer", "metrics"),
323 | ),
324 | "w",
325 | ),
326 | )
327 |
--------------------------------------------------------------------------------
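Note: the head-and-tail truncation inside `predict()` above can be read as the following standalone sketch; the helper name and the `tiktoken` encoding choice are illustrative assumptions, not part of the repository.

import tiktoken

def truncate_middle(prompt: str, n_max_token: int, max_gen: int) -> str:
    # Keep the first and last halves of the token budget and drop the middle,
    # mirroring the truncation applied before query_llm() in eval_longbench.py.
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # assumption: encoding choice is illustrative
    token_ids = enc.encode(prompt)
    budget = n_max_token - max_gen
    if len(token_ids) <= budget:
        return prompt
    half = budget // 2 - 1
    return enc.decode(token_ids[:half]) + enc.decode(token_ids[-half:])

# e.g. truncate_middle(long_prompt, n_max_token=8100, max_gen=128)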
/experiments/llmlingua2/evaluation/eval_meetingbank_qa.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import os
7 | from collections import defaultdict
8 |
9 | from metrics import evaluate_with_gt
10 | from tqdm import tqdm
11 | from utils import load_model_and_tokenizer, query_llm
12 |
13 | parser = argparse.ArgumentParser(description="answer questions over a (compressed) meeting transcript.")
14 | parser.add_argument(
15 | "--model_name_or_path", help="LLM used to answer", default="gpt-3.5-turbo-0613"
16 | )
17 |
18 | parser.add_argument("--n_max_token", type=int, default=8100)
19 | parser.add_argument(
20 | "--n_max_token_ans",
21 | type=int,
22 | default=100,
23 | help="token num in answer, following llmlingua",
24 | )
25 |
26 | parser.add_argument(
27 | "--load_prompt_from", help="where to load compressed prompt", required=True
28 | )
29 | parser.add_argument("--load_key", default="prompt", type=str)
30 | parser.add_argument("--save_path", help="path to save results", required=True)
31 | parser.add_argument("--num_sample", type=int, default=-1)
32 |
33 | args = parser.parse_args()
34 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
35 |
36 |
37 | def predict():
38 | data = json.load(open(args.load_prompt_from))
39 | data = data.values() if isinstance(data, dict) else data
40 |
41 | print(f"num data: {len(data)}")
42 |
43 | model, tokenizer = load_model_and_tokenizer(args.model_name_or_path)
44 |
45 | results = defaultdict(dict)
46 | results_list = defaultdict(list)
47 | if os.path.exists(args.save_path):
48 | prev_results = json.load(open(args.save_path))
49 | results.update(prev_results)
50 | if os.path.exists(
51 | os.path.join(
52 | os.path.dirname(args.save_path),
53 | os.path.basename(args.save_path).replace("answer", "answer_list"),
54 | )
55 | ):
56 | results_list = json.load(
57 | open(
58 | os.path.join(
59 | os.path.dirname(args.save_path),
60 | os.path.basename(args.save_path).replace("answer", "answer_list"),
61 | )
62 | )
63 | )
64 |
65 | prompt = "Write a high-quality answer for the given question using the provided meeting transcript (which may be compressed).\n{transcript}\nQuestion:{question}\nAnswer:"
66 | for sample in tqdm(data):
67 | sample_idx = int(sample["idx"])
68 | if sample_idx in results or str(sample_idx) in results:
69 | print(f"{sample_idx}-th already processed.")
70 | continue
71 | if args.num_sample > 0 and int(sample_idx) > args.num_sample:
72 | break
73 | transcript = sample[args.load_key]
74 | token_ids = tokenizer.encode(transcript)
75 | if len(token_ids) > args.n_max_token - args.n_max_token_ans:
76 | transcript = tokenizer.decode(
77 | token_ids[: args.n_max_token - args.n_max_token_ans]
78 | )
79 | qa_list = sample["QA_pairs"]
80 | q_list = []
81 | a_list = []
82 | a_list_model = []
83 | for qa in qa_list:
84 | q = qa["question"]
85 | a = qa["answer"]
86 | query = prompt.format(transcript=transcript, question=q)
87 | answer = query_llm(
88 | query,
89 | model,
90 | args.model_name_or_path,
91 | args.n_max_token_ans,
92 | tokenizer=tokenizer,
93 | )
94 | q_list.append(q)
95 | a_list.append(a)
96 | a_list_model.append(answer)
97 |
98 | results[sample_idx]["transcript"] = transcript
99 | results[sample_idx]["questions"] = q_list[:]
100 | results[sample_idx]["answers"] = a_list[:]
101 | results[sample_idx]["model_answers"] = a_list_model[:]
102 |
103 | results_list["questions"].extend(q_list[:])
104 | results_list["answers"].extend(a_list[:])
105 | results_list["model_answers"].extend(a_list_model[:])
106 |
107 | json.dump(results, open(args.save_path, "w"), indent=4)
108 | json.dump(
109 | results_list,
110 | open(
111 | os.path.join(
112 | os.path.dirname(args.save_path),
113 | os.path.basename(args.save_path).replace("answer", "answer_list"),
114 | ),
115 | "w",
116 | ),
117 | indent=4,
118 | )
119 |
120 |
121 | predict()
122 | results_list = json.load(
123 | open(
124 | os.path.join(
125 | os.path.dirname(args.save_path),
126 | os.path.basename(args.save_path).replace("answer", "answer_list"),
127 | )
128 | )
129 | )
130 | for i, ans in enumerate(results_list["answers"]):
131 | results_list["answers"][i] = [results_list["answers"][i]]
132 | score_dict = evaluate_with_gt(results_list["model_answers"], results_list["answers"])
133 | json.dump(
134 | score_dict,
135 | open(
136 | os.path.join(
137 | os.path.dirname(args.save_path),
138 | os.path.basename(args.save_path).replace("answer", "metrics"),
139 | ),
140 | "w",
141 | ),
142 | indent=4,
143 | )
144 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/eval_meetingbank_summary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import os
7 | from collections import defaultdict
8 |
9 | from metrics import evaluate_sim
10 | from tqdm import tqdm
11 | from utils import load_model_and_tokenizer, query_llm
12 |
13 | parser = argparse.ArgumentParser(description="summarize a (compressed) meeting transcript.")
14 | parser.add_argument(
15 | "--model_name_or_path", help="LLM used to answer", default="gpt-3.5-turbo-0613"
16 | )
17 |
18 | parser.add_argument("--n_max_token", type=int, default=8100)
19 | parser.add_argument(
20 | "--n_max_token_ans",
21 | type=int,
22 | default=400,
23 | help="token num in answer, following llmlingua",
24 | )
25 |
26 | parser.add_argument(
27 | "--load_prompt_from", help="where to load compressed prompt", required=True
28 | )
29 | parser.add_argument("--load_key", default="prompt", type=str)
30 | parser.add_argument("--save_path", help="path to save results", required=True)
31 | parser.add_argument("--num_sample", type=int, default=-1)
32 |
33 | args = parser.parse_args()
34 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
35 |
36 |
37 | def predict():
38 | model, tokenizer = load_model_and_tokenizer(args.model_name_or_path)
39 |
40 | data = json.load(open(args.load_prompt_from))
41 | data = data.values() if isinstance(data, dict) else data
42 | print(f"num data: {len(data)}")
43 |
44 | results = defaultdict(dict)
45 | results_list = defaultdict(list)
46 | if os.path.exists(args.save_path):
47 | prev_results = json.load(open(args.save_path))
48 | results.update(prev_results)
49 | if os.path.exists(
50 | os.path.join(
51 | os.path.dirname(args.save_path),
52 | os.path.basename(args.save_path).replace("answer", "answer_list"),
53 | )
54 | ):
55 | results_list = json.load(
56 | open(
57 | os.path.join(
58 | os.path.dirname(args.save_path),
59 | os.path.basename(args.save_path).replace("answer", "answer_list"),
60 | )
61 | )
62 | )
63 |
64 | prompt = "Summarize the provided meeting transcript (which may be compressed).\n{transcript}\nSummary:"
65 | for sample in tqdm(data):
66 | if isinstance(sample, float):
67 | continue
68 | sample_idx = int(sample["idx"])
69 | if sample_idx in results or str(sample_idx) in results:
70 | print(f"{sample_idx}-th already processed.")
71 | continue
72 | if args.num_sample > 0 and int(sample_idx) > args.num_sample:
73 | break
74 | transcript = sample[args.load_key]
75 | token_ids = tokenizer.encode(transcript)
76 | if len(token_ids) > args.n_max_token - args.n_max_token_ans:
77 | transcript = tokenizer.decode(
78 | token_ids[: args.n_max_token - args.n_max_token_ans]
79 | )
80 |
81 | query = prompt.format(transcript=transcript)
82 |
83 | # t = time.time()
84 | model_summary = query_llm(
85 | query,
86 | model,
87 | args.model_name_or_path,
88 | args.n_max_token_ans,
89 | tokenizer=tokenizer,
90 | )
91 | # total_time += time.time() - t
92 |
93 | summary = sample["gpt4_summary"]
94 |
95 | results[sample_idx]["transcript"] = transcript
96 | results[sample_idx]["model_summary"] = model_summary
97 | results[sample_idx]["gpt4_summary"] = summary
98 |
99 | results_list["model_summary"].append(model_summary)
100 | results_list["gpt4_summary"].append(summary)
101 |
102 | json.dump(results, open(args.save_path, "w"), indent=4)
103 | json.dump(
104 | results_list,
105 | open(
106 | os.path.join(
107 | os.path.dirname(args.save_path),
108 | os.path.basename(args.save_path).replace("answer", "answer_list"),
109 | ),
110 | "w",
111 | ),
112 | indent=4,
113 | )
114 |
115 |
116 | predict()
117 | results_list = defaultdict(list)
118 | results_list = json.load(
119 | open(
120 | os.path.join(
121 | os.path.dirname(args.save_path),
122 | os.path.basename(args.save_path).replace("answer", "answer_list"),
123 | )
124 | )
125 | )
126 | score_dict = evaluate_sim(results_list["model_summary"], results_list["gpt4_summary"])
127 | json.dump(
128 | score_dict,
129 | open(
130 | os.path.join(
131 | os.path.dirname(args.save_path),
132 | os.path.basename(args.save_path).replace("answer", "metrics"),
133 | ),
134 | "w",
135 | ),
136 | indent=4,
137 | )
138 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/eval_zero_scrolls.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import json
6 | import os
7 | import shutil
8 | from collections import defaultdict
9 |
10 | import datasets
11 | from huggingface_hub import hf_hub_download
12 | from tqdm import tqdm
13 | from utils import load_model_and_tokenizer, query_llm
14 |
15 | parser = argparse.ArgumentParser(description="answer (compressed) ZeroSCROLLS prompts.")
16 | parser.add_argument(
17 | "--model_name_or_path", help="LLM used to answer", default="gpt-3.5-turbo-0613"
18 | )
19 |
20 | parser.add_argument("--n_max_token", type=int, default=8100)
21 | # parser.add_argument('--n_max_token_ans', type=int, default=400, help='token num in answer, following llmlingua')
22 |
23 | parser.add_argument(
24 | "--load_prompt_from",
25 | help="where to load compressed prompt",
26 | default="results/zero_scrolls/origin/zero_scrolls_validation.json",
27 | )
28 | parser.add_argument("--load_key", default="prompt", type=str)
29 | parser.add_argument(
30 | "--save_path",
31 | help="path to save results",
32 | default="results/zero_scrolls/origin/gpt35_chat_16k_answer/answer_zero_scrolls_validation.json",
33 | )
34 | args = parser.parse_args()
35 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
36 | save_path2 = os.path.join(
37 | os.path.dirname(args.save_path),
38 | os.path.basename(args.save_path).replace("answer", "answer2"),
39 | )
40 |
41 |
42 | def eval(predict_path: str):
43 | def download_metric():
44 | zero_scrolls_metric_path = hf_hub_download(
45 | repo_id="tau/zero_scrolls",
46 | repo_type="dataset",
47 | filename="metrics/zero_scrolls.py",
48 | )
49 | updated_zero_scrolls_metric_path = (
50 | os.path.dirname(zero_scrolls_metric_path)
51 | + os.path.basename(zero_scrolls_metric_path).replace(".", "_")
52 | + ".py"
53 | )
54 | shutil.copy(zero_scrolls_metric_path, updated_zero_scrolls_metric_path)
55 | return updated_zero_scrolls_metric_path
56 |
57 | zero_scrolls_metric_path = download_metric()
58 | preds = json.load(open(predict_path))
59 | preds_g, refers_g = defaultdict(list), defaultdict(list)
60 | for v in preds.values():
61 | task, refer, pred = [v[k] for k in ["task", "reference", "pred"]]
62 | # if task == "narrative_qa":
63 | pred = (
64 | pred.split("\n\nQuestion:", 1)[0]
65 | .split("\n\nExplanation:", 1)[0]
66 | .replace("<|im_end|>", "")
67 | .replace("\end{document}", "")
68 | .strip()
69 | )
70 | # .split("\n\nExplanation:", 1)[0]
71 | if task == "space_digest":
72 | if pred.startswith("0.") and "%" not in pred[:4]:
73 | pred = "{:.2f}%".format(float(pred[:4]) * 100)
74 | else:
75 | pred = pred[:5].strip().replace("%", "") + "%"
76 | preds_g[task].append(pred)
77 | refers_g[task].append([refer])
78 |
79 | zero_scrolls = []
80 | score_dict = {}
81 | OUT_TASKS = [
82 | "gov_report",
83 | "summ_screen_fd",
84 | "qmsum",
85 | "squality",
86 | "quality",
87 | "narrative_qa",
88 | "qasper",
89 | "musique",
90 | "space_digest",
91 | "book_sum_sort",
92 | ]
93 | for task in OUT_TASKS:
94 | if task not in preds_g:
95 | zero_scrolls.append(0)
96 | continue
97 | p, r = preds_g[task], refers_g[task]
98 | zero_scrolls_metric = datasets.load_metric(zero_scrolls_metric_path, task)
99 | results = zero_scrolls_metric.compute(predictions=p, references=r)
100 | print(task, len(p), results)
101 | zero_scrolls.append(results["zero_scrolls_score"])
102 | score_dict[task] = {
103 | "zero_scrolls_score": results["zero_scrolls_score"],
104 | "length": len(p),
105 | }
106 | print(",".join([f"{ii:.2f}" for ii in zero_scrolls]))
107 | score_avg = sum(zero_scrolls) / len(zero_scrolls)
108 | score_dict["avg"] = score_avg
109 | return score_dict
110 |
111 |
112 | def predict():
113 | model, tokenizer = load_model_and_tokenizer(args.model_name_or_path)
114 |
115 | dataset = json.load(open(args.load_prompt_from))
116 | if isinstance(dataset, dict):
117 | dataset = dataset.values()
118 |
119 | res = {}
120 | res2 = {}
121 | if os.path.exists(args.save_path):
122 | res = json.load(open(args.save_path))
123 | if os.path.exists(save_path2):
124 | res2 = json.load(open(save_path2))
125 |
126 | for sample in tqdm(dataset):
127 | idx = int(sample["idx"])
128 | if idx in res or str(idx) in res:
129 | print(f"{idx} processed")
130 | continue
131 |
132 | prompt = sample[args.load_key]
133 | max_gen = sample["n_max_token_ans"]
134 | token_ids = tokenizer.encode(prompt)
135 |
136 | if len(token_ids) > (args.n_max_token - max_gen):
137 | half = int((args.n_max_token - max_gen) / 2) - 1
138 | prompt = tokenizer.decode(token_ids[:half]) + tokenizer.decode(
139 | token_ids[-half:]
140 | )
141 |
142 | pred = query_llm(prompt, model, args.model_name_or_path, max_gen)
143 |
144 | res[idx] = {
145 | "pred": pred,
146 | "answer": sample["answer"],
147 | "model_name": args.model_name_or_path,
148 | "task": sample["task"],
149 | "idx": idx,
150 | }
151 | json.dump(res, open(args.save_path, "w"), indent=4)
152 | res2[f"{idx},{sample['task']}"] = {
153 | "idx": idx,
154 | "task": sample["task"],
155 | "pred": pred,
156 | "reference": sample["answer"],
157 | }
158 | json.dump(res2, open(save_path2, "w"), indent=4)
159 |
160 |
161 | predict()
162 | score_dict = eval(save_path2)
163 | json.dump(
164 | score_dict,
165 | open(
166 | os.path.join(
167 | os.path.dirname(args.save_path),
168 | os.path.basename(args.save_path).replace("answer", "metrics"),
169 | ),
170 | "w",
171 | ),
172 | )
173 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import re
5 | import string
6 | from collections import Counter
7 | from typing import List
8 |
9 | import evaluate
10 | import jieba
11 | from fuzzywuzzy import fuzz
12 | from rouge import Rouge
13 |
14 |
15 | def normalize_answer(s):
16 | """Lower text and remove punctuation, articles and extra whitespace."""
17 |
18 | def remove_articles(text):
19 | return re.sub(r"\b(a|an|the)\b", " ", text)
20 |
21 | def white_space_fix(text):
22 | return " ".join(text.split())
23 |
24 | def remove_punc(text):
25 | exclude = set(string.punctuation)
26 | return "".join(ch for ch in text if ch not in exclude)
27 |
28 | def lower(text):
29 | return text.lower()
30 |
31 | return white_space_fix(remove_articles(remove_punc(lower(s))))
32 |
33 |
34 | def normalize_zh_answer(s):
35 | """Lower text and remove punctuation, extra whitespace."""
36 |
37 | def white_space_fix(text):
38 | return "".join(text.split())
39 |
40 | def remove_punc(text):
41 | cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
42 | all_punctuation = set(string.punctuation + cn_punctuation)
43 | return "".join(ch for ch in text if ch not in all_punctuation)
44 |
45 | def lower(text):
46 | return text.lower()
47 |
48 | return white_space_fix(remove_punc(lower(s)))
49 |
50 |
51 | def count_score(prediction, ground_truth, **kwargs):
52 | numbers = re.findall(r"\d+", prediction)
53 | right_num = 0
54 | for number in numbers:
55 | if str(number) == str(ground_truth):
56 | right_num += 1
57 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
58 | return float(final_score)
59 |
60 |
61 | def retrieval_score(prediction, ground_truth, **kwargs):
62 | pattern = r"Paragraph (\d+)"
63 | matches = re.findall(pattern, ground_truth)
64 | ground_truth_id = matches[0]
65 | numbers = re.findall(r"\d+", prediction)
66 | right_num = 0
67 | for number in numbers:
68 | if str(number) == str(ground_truth_id):
69 | right_num += 1
70 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
71 | return float(final_score)
72 |
73 |
74 | def retrieval_zh_score(prediction, ground_truth, **kwargs):
75 | pattern = r"段落(\d+)"
76 | matches = re.findall(pattern, ground_truth)
77 | ground_truth_id = matches[0]
78 | numbers = re.findall(r"\d+", prediction)
79 | right_num = 0
80 | for number in numbers:
81 | if str(number) == str(ground_truth_id):
82 | right_num += 1
83 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
84 | return float(final_score)
85 |
86 |
87 | def code_sim_score(prediction, ground_truth, **kwargs):
88 | all_lines = prediction.lstrip("\n").split("\n")
89 | prediction = ""
90 | for line in all_lines:
91 | if ("`" not in line) and ("#" not in line) and ("//" not in line):
92 | prediction = line
93 | break
94 | return fuzz.ratio(prediction, ground_truth) / 100
95 |
96 |
97 | def classification_score(prediction, ground_truth, **kwargs):
98 | em_match_list = []
99 | all_classes = kwargs["all_classes"]
100 | for class_name in all_classes:
101 | if class_name in prediction:
102 | em_match_list.append(class_name)
103 | for match_term in em_match_list:
104 | if match_term in ground_truth and match_term != ground_truth:
105 | em_match_list.remove(match_term)
106 | if ground_truth in em_match_list:
107 | score = 1.0 / len(em_match_list)
108 | else:
109 | score = 0.0
110 | return score
111 |
112 |
113 | def rouge_score(prediction, ground_truth, **kwargs):
114 | rouge = Rouge()
115 | try:
116 | scores = rouge.get_scores([prediction], [ground_truth], avg=True)
117 |     except Exception:  # rouge can raise on degenerate inputs (e.g. empty strings)
118 | return 0.0
119 | return scores["rouge-l"]["f"]
120 |
121 |
122 | def rouge_zh_score(prediction, ground_truth, **kwargs):
123 | prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
124 | ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
125 | score = rouge_score(prediction, ground_truth)
126 | return score
127 |
128 |
129 | def f1_score(prediction, ground_truth, **kwargs):
130 | common = Counter(prediction) & Counter(ground_truth)
131 | num_same = sum(common.values())
132 | if num_same == 0:
133 | return 0
134 | precision = 1.0 * num_same / len(prediction)
135 | recall = 1.0 * num_same / len(ground_truth)
136 | f1 = (2 * precision * recall) / (precision + recall)
137 | return f1
138 |
139 |
140 | def qa_f1_score(prediction, ground_truth, **kwargs):
141 | normalized_prediction = normalize_answer(prediction)
142 | normalized_ground_truth = normalize_answer(ground_truth)
143 |
144 | prediction_tokens = normalized_prediction.split()
145 | ground_truth_tokens = normalized_ground_truth.split()
146 | return f1_score(prediction_tokens, ground_truth_tokens)
147 |
148 |
149 | def qa_f1_zh_score(prediction, ground_truth, **kwargs):
150 | prediction_tokens = list(jieba.cut(prediction, cut_all=False))
151 | ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
152 | prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
153 | ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
154 | prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
155 | ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
156 | return f1_score(prediction_tokens, ground_truth_tokens)
157 |
158 |
159 | def qa_score(prediction, ground_truths):
160 | normalized_prediction = normalize_answer2(prediction)
161 |
162 | for ground_truth in ground_truths:
163 | normalized_ground_truth = normalize_answer2(ground_truth)
164 | if normalized_ground_truth.lower() in normalized_prediction.lower():
165 | return 1.0
166 | return 0.0
167 |
168 |
169 | import regex
170 |
171 |
172 | def normalize_answer2(s: str) -> str:
173 | """Normalization from the SQuAD evaluation script.
174 |
175 | See https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
176 | """
177 |
178 | def remove_articles(text):
179 | return regex.sub(r"\b(a|an|the)\b", " ", text)
180 |
181 | def white_space_fix(text):
182 | return " ".join(text.split())
183 |
184 | def remove_punc(text):
185 | exclude = set(string.punctuation)
186 | return "".join(ch for ch in text if ch not in exclude)
187 |
188 | def lower(text):
189 | return text.lower()
190 |
191 | return white_space_fix(remove_articles(remove_punc(lower(s))))
192 |
193 |
194 | def best_subspan_em(prediction: str, ground_truths: List[str]) -> float:
195 | normalized_prediction = normalize_answer2(prediction)
196 |
197 | for ground_truth in ground_truths:
198 | normalized_ground_truth = normalize_answer2(ground_truth)
199 | if normalized_ground_truth.lower() in normalized_prediction.lower():
200 | return 1.0
201 | return 0.0
202 |
203 |
204 | def evaluate_with_gt(pred_list, gt_list, truncate_pred=True, logger=None):
205 | def eval_qa_f1_score(pred, ground_truths):
206 | score = 0.0
207 | for gt in ground_truths:
208 | score = max(score, qa_f1_score(pred, gt))
209 | score = score
210 | return score
211 |
212 | if truncate_pred:
213 | pred_list_truncated = []
214 | for pred in pred_list:
215 | pred = pred.lstrip("\n").split("\n")[0].strip()
216 | pred_list_truncated.append(pred)
217 | pred_list = pred_list_truncated
218 |
219 | metrics = {
220 | "qa_score": 0.0,
221 | }
222 | for pred, gts in zip(pred_list, gt_list):
223 | metrics["qa_score"] += qa_score(pred, gts)
224 | # average
225 | for metric_name, score in metrics.items():
226 | metrics[metric_name] = score * 100 / len(pred_list)
227 | print(f"{metric_name}: {metrics[metric_name]:.3f}")
228 | if logger is not None:
229 | logger.info(f"{metric_name}: {metrics[metric_name]:.3f}")
230 |
231 | return metrics
232 |
233 |
234 | def evaluate_sim(pred_list, gt_list, truncate_pred=True, truncate_gt=False):
235 | if truncate_pred:
236 | pred_list_truncated = []
237 | for pred in pred_list:
238 | pred = pred.lstrip("\n").split("\n")[0].strip()
239 | pred_list_truncated.append(pred)
240 | pred_list = pred_list_truncated
241 | if truncate_gt:
242 | gt_list_truncated = []
243 | for gt in gt_list:
244 | gt = gt.lstrip("\n").split("\n")[0].strip()
245 | gt_list_truncated.append(gt)
246 | gt_list = gt_list_truncated
247 |
248 | bleu = evaluate.load("bleu")
249 | rouge = evaluate.load("rouge")
250 | bertscore = evaluate.load("bertscore")
251 | bleu_results = bleu.compute(predictions=pred_list, references=gt_list)
252 | rouge_results = rouge.compute(predictions=pred_list, references=gt_list)
253 | bertscore_results = bertscore.compute(
254 | predictions=pred_list, references=gt_list, lang="en"
255 | )
256 | p, r, f1 = [bertscore_results[k] for k in ["precision", "recall", "f1"]]
257 | evs = [
258 | bleu_results["bleu"],
259 | *[rouge_results[k] for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]],
260 | sum(p) / len(p),
261 | sum(r) / len(r),
262 | sum(f1) / len(f1),
263 | ]
264 | metrics = {}
265 | for i, metric_name in enumerate(
266 | [
267 | "bleu",
268 | "rouge1",
269 | "rouge2",
270 | "rougeL",
271 | "rougeLsum",
272 | "bertscore_precision",
273 | "bertscore_recall",
274 | "bertscore_f1",
275 | ]
276 | ):
277 | metrics[metric_name] = evs[i]
278 | print(",".join([f"{ii * 100:.2f}" for ii in evs]))
279 |
280 | return metrics
281 |
--------------------------------------------------------------------------------
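The metrics above can be smoke-tested on toy strings; the sketch below assumes it is run from the `evaluation/` directory so `metrics.py` is importable, and all inputs are made up.

from metrics import evaluate_sim, evaluate_with_gt, qa_f1_score

preds = ["The council approved the budget."]
refs = [["The city council approved the annual budget."]]  # one list of references per prediction

print(qa_f1_score(preds[0], refs[0][0]))  # token-level F1 over normalized answers
print(evaluate_with_gt(preds, refs))      # substring-match QA score, averaged over samples
# evaluate_sim(preds, [r[0] for r in refs]) additionally computes BLEU / ROUGE / BERTScore,
# downloading the corresponding `evaluate` metrics on first use.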
/experiments/llmlingua2/evaluation/scripts/compress.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | python compress.py --load_origin_from ../../../results/meetingbank_short/origin/meetingbank_test_3qa_pairs_summary_formated.json \
5 | --compression_rate 0.33 \
6 | --force_tokens "\n,?,!,." \
7 | --save_path ../../../results/meetingbank_short/llmlingua2/compression_ratio33_meetingbank_test_3qa_pairs_summary_formated.json
8 |
9 | python compress.py --load_origin_from ../../../results/longbench/origin/longbench_test_single_doc_qa_formated.json \
10 | --target_token 2000 \
11 | --force_tokens "\n,?,!,." \
12 | --save_path ../../../results/longbench/llmlingua2/compression_target2000_longbench_test_single_doc_qa_formated.json
13 |
14 | python compress.py --load_origin_from ../../../results/zero_scrolls/origin/zero_scrolls_validation.json \
15 | --target_token 2000 \
16 | --force_tokens "\n,?,!,." \
17 | --save_path ../../../results/zero_scrolls/llmlingua2/compression_target2000_zero_scrolls_validation.json
18 |
19 | python compress.py --load_origin_from ../../../results/gsm8k/origin/gsm8k_cot_example_all_in_one.json \
20 | --load_key prompt_list \
21 | --target_token 250 \
22 | --force_tokens "+,-,*,×,/,÷,=,The answer is,\n" \
23 | --use_context_level_filter \
24 | --force_reserve_digit \
25 | --save_path ../../../results/gsm8k/llmlingua2/compression_target250_gsm8k_cot_example_all_in_one.json
26 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/scripts/evaluate.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | python eval_meetingbank_qa.py --load_prompt_from ../../../results/meetingbank_short/llmlingua2/compression_ratio33_meetingbank_test_3qa_pairs_summary_formated.json \
5 | --load_key compressed_prompt \
6 | --model_name_or_path gpt-35-turbo-instruct \
7 | --save_path ../../../results/meetingbank_short/llmlingua2/gpt35_answer/answer_ratio33_meetingbank_test_3qa_pairs_summary_formated.json
8 |
9 | python eval_longbench.py --load_prompt_from ../../../results/longbench/llmlingua2/compression_target2000_longbench_test_single_doc_qa_formated.json \
10 | --load_key compressed_prompt \
11 | --model_name_or_path gpt-35-turbo-instruct \
12 | --save_path ../../../results/longbench/llmlingua2/gpt35_answer/answer_target2000_longbench_test_single_doc_qa_formated.json
13 |
14 | python eval_zero_scrolls.py --load_prompt_from ../../../results/zero_scrolls/llmlingua2/compression_target2000_zero_scrolls_validation.json \
15 | --load_key compressed_prompt \
16 | --model_name_or_path gpt-35-turbo-instruct \
17 | --save_path ../../../results/zero_scrolls/llmlingua2/gpt35_answer/answer_target2000_zero_scrolls_validation.json
18 |
19 | python eval_gsm8k.py --load_prompt_from ../../../results/gsm8k/llmlingua2/compression_target250_gsm8k_cot_example_all_in_one.json \
20 |     --load_key compressed_prompt_list \
21 |     --model_name_or_path gpt-35-turbo-instruct \
22 |     --save_path ../../../results/gsm8k/llmlingua2/gpt35_answer/answer_target250_gsm8k_cot_example_all_in_one.json
23 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/evaluation/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | from time import sleep
5 |
6 | import openai
7 | import tiktoken
8 |
9 |
10 | def query_llm(
11 | prompt,
12 | model,
13 | model_name,
14 | max_tokens,
15 | tokenizer=None,
16 | chat_completion=False,
17 | **kwargs,
18 | ):
19 | SLEEP_TIME_FAILED = 62
20 |
21 | request = {
22 | "temperature": kwargs["temperature"] if "temperature" in kwargs else 0.0,
23 | "top_p": kwargs["top_p"] if "top_p" in kwargs else 1.0,
24 | "seed": kwargs["seed"] if "seed" in kwargs else 42,
25 | "max_tokens": max_tokens,
26 | "n": 1,
27 | "stream": False,
28 | }
29 | if chat_completion:
30 | request["messages"] = [
31 | {"role": "system", "content": "You are a helpful assistant."},
32 | {"role": "user", "content": prompt},
33 | ]
34 | else:
35 | request["prompt"] = prompt
36 |
37 | answer = None
38 | response = None
39 | while answer is None:
40 | try:
41 | response = model.create(engine=model_name, **request)
42 | answer = (
43 | response["choices"][0]["message"]["content"]
44 | if chat_completion
45 | else response["choices"][0]["text"]
46 | )
47 | except Exception as e:
48 | answer = None
49 | print(f"error: {e}, response: {response}")
50 | sleep(SLEEP_TIME_FAILED)
51 | # sleep(SLEEP_TIME_SUCCESS)
52 | return answer
53 |
54 |
55 | def load_model_and_tokenizer(model_name_or_path, chat_completion=False):
56 | openai.api_key = "your_api_key"
57 | openai.api_base = "your_api_base"
58 | openai.api_type = "azure"
59 | openai.api_version = "2023-05-15"
60 |
61 | if chat_completion:
62 | model = openai.ChatCompletion
63 | else:
64 | model = openai.Completion
65 |
66 | tokenizer = tiktoken.encoding_for_model("gpt-4")
67 | return model, tokenizer
68 |
--------------------------------------------------------------------------------
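A minimal usage sketch of the two helpers above, assuming the Azure credentials are filled in first; the deployment name is only an example, and since query_llm retries indefinitely on API errors, placeholder keys will hang.

from utils import load_model_and_tokenizer, query_llm

# load_model_and_tokenizer() hard-codes "your_api_key" / "your_api_base";
# edit those fields before running (these scripts target the legacy openai 0.x Azure SDK).
model, tokenizer = load_model_and_tokenizer("gpt-35-turbo-instruct", chat_completion=False)

answer = query_llm(
    "Summarize: the meeting covered budget and staffing.",
    model,
    "gpt-35-turbo-instruct",
    max_tokens=64,
    tokenizer=tokenizer,
)
print(answer)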
/experiments/llmlingua2/model_training/train.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | python train_roberta.py --data_path ../../../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt \
5 | --save_path ../../../results/models/xlm_roberta_large_meetingbank_only
6 |
--------------------------------------------------------------------------------
/experiments/llmlingua2/model_training/train_roberta.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import argparse
5 | import os
6 | import random
7 | import time
8 |
9 | import torch
10 | from sklearn.metrics import accuracy_score
11 | from torch import cuda
12 | from torch.utils.data import DataLoader
13 | from torch.utils.tensorboard import SummaryWriter
14 | from tqdm import tqdm
15 | from transformers import AutoModelForTokenClassification, AutoTokenizer
16 | from utils import TokenClfDataset
17 |
18 | MAX_LEN = 512
19 | MAX_GRAD_NORM = 10
20 |
21 | parser = argparse.ArgumentParser(
22 | description="train bert to do compression (by token classification)"
23 | )
24 | parser.add_argument(
25 | "--model_name",
26 | help="token classification model",
27 | default="FacebookAI/xlm-roberta-large",
28 | )
29 | parser.add_argument(
30 | "--data_path",
31 | help="training and validation data path",
32 | default="../../../results/meetingbank/gpt-4-32k_comp/annotation_kept_cs512_meetingbank_train_formated.pt",
33 | )
34 | parser.add_argument(
35 | "--label_type",
36 | help="word label or token label",
37 | default="word_label",
38 | choices=["word_label", "token_label"],
39 | )
40 | parser.add_argument(
41 | "--save_path",
42 | help="save path",
43 | default="../../../results/models/xlm_roberta_large_meetingbank_only",
44 | )
45 | parser.add_argument("--lr", help="learning rate", default=1e-5, type=float)
46 | parser.add_argument(
47 | "--num_epoch", help="number of training epoch", default=10, type=int
48 | )
49 | parser.add_argument("--batch_size", type=int, default=10)
50 |
51 | args = parser.parse_args()
52 | os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
53 | writer = SummaryWriter(log_dir=os.path.dirname(args.save_path).replace("model", "log"))
54 |
55 |
56 | def train(epoch):
57 | tr_loss, tr_accuracy = 0, 0
58 | nb_tr_examples, nb_tr_steps = 0, 0
59 | tr_preds, tr_labels = [], []
60 | model.train()
61 |
62 | for idx, batch in enumerate(train_dataloader):
63 | t = time.time()
64 | ids = batch["ids"].to(device, dtype=torch.long)
65 | mask = batch["mask"].to(device, dtype=torch.long)
66 | targets = batch["targets"].to(device, dtype=torch.long)
67 |
68 | outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
69 | loss, tr_logits = outputs.loss, outputs.logits
70 | tr_loss += loss.item()
71 |
72 | nb_tr_steps += 1
73 | nb_tr_examples += targets.size(0)
74 |
75 | flattened_targets = targets.view(-1)
76 | active_logits = tr_logits.view(-1, model.num_labels)
77 | flattened_predictions = torch.argmax(active_logits, axis=1)
78 | active_accuracy = mask.view(-1) == 1
79 | targets = torch.masked_select(flattened_targets, active_accuracy)
80 | predictions = torch.masked_select(flattened_predictions, active_accuracy)
81 |
82 | tr_preds.extend(predictions)
83 | tr_labels.extend(targets)
84 |
85 | tmp_tr_accuracy = accuracy_score(
86 | targets.cpu().numpy(), predictions.cpu().numpy()
87 | )
88 | tr_accuracy += tmp_tr_accuracy
89 |
90 | if idx % 100 == 0:
91 | loss_step = tr_loss / nb_tr_steps
92 | acc_step = tr_accuracy / nb_tr_steps
93 | writer.add_scalar(
94 | "Loss/train", loss_step, idx + epoch * len(train_dataloader)
95 | )
96 | writer.add_scalar(
97 | "Acc/train", acc_step, idx + epoch * len(train_dataloader)
98 | )
99 | writer.flush()
100 | print(f"Training loss per 100 training steps: {loss_step}")
101 |
102 |         optimizer.zero_grad()
103 |         loss.backward()
104 |         # clip gradients after backward() so the clipped values are what optimizer.step() applies
105 |         torch.nn.utils.clip_grad_norm_(
106 |             parameters=model.parameters(), max_norm=MAX_GRAD_NORM
107 |         )
108 |         optimizer.step()
109 |
110 | tr_loss = tr_loss / nb_tr_steps
111 | tr_accuracy = tr_accuracy / nb_tr_steps
112 | print(f"Training loss epoch: {tr_loss}")
113 | print(f"Training accuracy epoch: {tr_accuracy}")
114 |
115 |
116 | def test(model, eval_dataloader):
117 | model.eval()
118 |
119 | eval_loss, eval_accuracy = 0, 0
120 | nb_eval_examples, nb_eval_steps = 0, 0
121 | eval_preds, eval_labels = [], []
122 |
123 | with torch.no_grad():
124 | for idx, batch in enumerate(eval_dataloader):
125 | ids = batch["ids"].to(device, dtype=torch.long)
126 | mask = batch["mask"].to(device, dtype=torch.long)
127 | targets = batch["targets"].to(device, dtype=torch.long)
128 |
129 | outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
130 | loss, eval_logits = outputs.loss, outputs.logits
131 |
132 | eval_loss += loss.item()
133 |
134 | nb_eval_steps += 1
135 | nb_eval_examples += targets.size(0)
136 |
137 | flattened_targets = targets.view(-1)
138 | active_logits = eval_logits.view(-1, model.num_labels)
139 | flattened_predictions = torch.argmax(active_logits, axis=1)
140 | active_accuracy = mask.view(-1) == 1
141 | targets = torch.masked_select(flattened_targets, active_accuracy)
142 | predictions = torch.masked_select(flattened_predictions, active_accuracy)
143 |
144 | eval_labels.extend(targets)
145 | eval_preds.extend(predictions)
146 |
147 | tmp_eval_accuracy = accuracy_score(
148 | targets.cpu().numpy(), predictions.cpu().numpy()
149 | )
150 | eval_accuracy += tmp_eval_accuracy
151 |
152 | labels = [label.item() for label in eval_labels]
153 | predictions = [pred.item() for pred in eval_preds]
154 |
155 | eval_loss = eval_loss / nb_eval_steps
156 | eval_accuracy = eval_accuracy / nb_eval_steps
157 | print(f"Validation Loss: {eval_loss}")
158 | print(f"Validation Accuracy: {eval_accuracy}")
159 |
160 | writer.add_scalar("Loss/eval", eval_loss, epoch * len(eval_dataloader))
161 | writer.add_scalar("Acc/eval", eval_accuracy, epoch * len(eval_dataloader))
162 | writer.flush()
163 |
164 | return eval_accuracy
165 |
166 |
167 | device = "cuda" if cuda.is_available() else "cpu"
168 | data = torch.load(args.data_path)
169 |
170 | tokenizer = AutoTokenizer.from_pretrained(args.model_name)
171 | model = AutoModelForTokenClassification.from_pretrained(
172 | args.model_name, num_labels=2, ignore_mismatched_sizes=True
173 | )
174 | model.to(device)
175 |
176 | assert len(data["origin"]) == len(data["labels"])
177 | text_label = [(text, label) for text, label in zip(data["origin"], data["labels"])]
178 | random.shuffle(text_label)
179 | train_data = text_label[: int(len(text_label) * 0.8)]
180 | val_data = text_label[int(len(text_label) * 0.8) :]
181 |
182 | train_text = [text for text, label in train_data]
183 | train_label = [label for text, label in train_data]
184 | val_text = [text for text, label in val_data]
185 | val_label = [label for text, label in val_data]
186 |
187 | train_dataset = TokenClfDataset(
188 | train_text, train_label, MAX_LEN, tokenizer=tokenizer, model_name=args.model_name
189 | )
190 | val_dataset = TokenClfDataset(
191 | val_text, val_label, MAX_LEN, tokenizer=tokenizer, model_name=args.model_name
192 | )
193 |
194 | print(f"len taining set: {len(train_dataset)}, len validation set: {len(val_dataset)}")
195 | print(train_dataset[0])
196 | for token, label in zip(
197 | tokenizer.convert_ids_to_tokens(train_dataset[0]["ids"][:30]),
198 | train_dataset[0]["targets"][:30],
199 | ):
200 | print("{0:10} {1}".format(token, label.item()))
201 | train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
202 |
203 | val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
204 |
205 | ids = train_dataset[0]["ids"].unsqueeze(0)
206 | mask = train_dataset[0]["mask"].unsqueeze(0)
207 | targets = train_dataset[0]["targets"].unsqueeze(0)
208 | ids = ids.to(device)
209 | mask = mask.to(device)
210 | targets = targets.to(device)
211 | outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
212 | initial_loss = outputs[0]
213 | print(initial_loss)
214 |
215 | tr_logits = outputs[1]
216 | print(tr_logits.shape)
217 |
218 | optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
219 |
220 | best_acc = 0
221 |
222 | for epoch in tqdm(range(args.num_epoch)):
223 | print(f"Training epoch: {epoch + 1}")
224 | train(epoch)
225 | acc = test(model, val_dataloader)
226 | if acc > best_acc:
227 | best_acc = acc
228 |         model.save_pretrained(args.save_path)  # save_pretrained creates save_path if it does not exist yet
229 |         tokenizer.save_pretrained(args.save_path)
230 |         torch.save(model.state_dict(), f"{args.save_path}/state_dict.pth")
231 |
--------------------------------------------------------------------------------
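Once training finishes, the checkpoint written by `save_pretrained` can be reloaded for inference. This is a sketch under the default `--save_path`, assuming at least one epoch improved validation accuracy so the checkpoint exists; the sample sentence is made up.

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

save_path = "../../../results/models/xlm_roberta_large_meetingbank_only"
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForTokenClassification.from_pretrained(save_path).eval()

inputs = tokenizer("The committee approved the resolution.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
# num_labels=2 above: class 1 is presumably "keep this token", class 0 "drop it".
keep = logits.argmax(dim=-1)[0].tolist()
print(list(zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), keep)))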
/experiments/llmlingua2/model_training/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import spacy
5 | import torch
6 | from torch.utils.data import Dataset
7 |
8 |
9 | class TokenClfDataset(Dataset):
10 | def __init__(
11 | self,
12 | texts,
13 | labels=None,
14 | max_len=512,
15 | tokenizer=None,
16 | model_name="bert-base-multilingual-cased",
17 | ):
18 | self.len = len(texts)
19 | self.texts = texts
20 | self.tokenizer = tokenizer
21 | self.max_len = max_len
22 | self.labels = labels
23 | self.model_name = model_name
24 | if "bert-base-multilingual-cased" in model_name:
25 | self.cls_token = "[CLS]"
26 | self.sep_token = "[SEP]"
27 | self.unk_token = "[UNK]"
28 | self.pad_token = "[PAD]"
29 | self.mask_token = "[MASK]"
30 | elif "xlm-roberta-large" in model_name:
31 | self.bos_token = ""
32 | self.eos_token = ""
33 | self.sep_token = ""
34 | self.cls_token = ""
35 | self.unk_token = ""
36 | self.pad_token = ""
37 | self.mask_token = ""
38 | else:
39 | raise NotImplementedError()
40 |
41 | self.nlp = spacy.load("en_core_web_sm")
42 |
43 | def __getitem__(self, index):
44 | text = self.texts[index]
45 | if self.labels is not None:
46 | labels = self.labels[index][:]
47 | tokenized_text, labels = self.tokenize_and_preserve_labels(
48 | text, labels, self.tokenizer
49 | )
50 | assert len(tokenized_text) == len(labels)
51 | labels.insert(0, False)
52 | labels.insert(-1, False)
53 | else:
54 | tokenized_text = self.tokenizer.tokenize(text)
55 |
56 | tokenized_text = [self.cls_token] + tokenized_text + [self.sep_token]
57 |
58 | if len(tokenized_text) > self.max_len:
59 | tokenized_text = tokenized_text[: self.max_len]
60 | if self.labels is not None:
61 | labels = labels[: self.max_len]
62 | else:
63 | tokenized_text = tokenized_text + [
64 | self.pad_token for _ in range(self.max_len - len(tokenized_text))
65 | ]
66 | if self.labels is not None:
67 | labels = labels + [False for _ in range(self.max_len - len(labels))]
68 |
69 | attn_mask = [1 if tok != self.pad_token else 0 for tok in tokenized_text]
70 |
71 | ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)
72 |
73 | sample = {
74 | "ids": torch.tensor(ids, dtype=torch.long),
75 | "mask": torch.tensor(attn_mask, dtype=torch.long),
76 | }
77 | if self.labels is not None:
78 | sample["targets"] = torch.tensor(labels, dtype=torch.long)
79 |
80 | return sample
81 |
82 | def __len__(self):
83 | return self.len
84 |
85 | def split_string(self, input_string, ignore_tokens=set([","])):
86 | doc = self.nlp(input_string)
87 | word_list = []
88 | for word in doc:
89 | if word.lemma_ not in ignore_tokens:
90 | word_list.append(word.lemma_)
91 | return word_list
92 |
93 | def tokenize_and_preserve_labels(self, text, text_labels, tokenizer):
94 | """
95 | Word piece tokenization makes it difficult to match word labels
96 | back up with individual word pieces. This function tokenizes each
97 | word one at a time so that it is easier to preserve the correct
98 | label for each subword. It is, of course, a bit slower in processing
99 | time, but it will help our model achieve higher accuracy.
100 | """
101 |
102 | tokenized_text = []
103 | labels = []
104 |
105 | assert len(self.split_string(text)) == len(text_labels)
106 |
107 | for word, label in zip(self.split_string(text), text_labels):
108 | # Tokenize the word and count # of subwords the word is broken into
109 | tokenized_word = tokenizer.tokenize(word)
110 | n_subwords = len(tokenized_word)
111 |
112 | # Add the tokenized word to the final tokenized word list
113 | tokenized_text.extend(tokenized_word)
114 |
115 | # Add the same label to the new list of labels `n_subwords` times
116 | labels.extend([label] * n_subwords)
117 |
118 | return tokenized_text, labels
119 |
--------------------------------------------------------------------------------
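For reference, a toy construction of `TokenClfDataset`; it assumes it is run from `model_training/` so `utils.py` is importable, that `en_core_web_sm` is installed (`python -m spacy download en_core_web_sm`), and the text and labels below are invented.

from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from utils import TokenClfDataset

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")
texts = ["The committee approved the resolution ."]
labels = [[True, False, True, False, True, False]]  # one keep/drop flag per spaCy-split word

dataset = TokenClfDataset(
    texts, labels, max_len=512, tokenizer=tokenizer,
    model_name="FacebookAI/xlm-roberta-large",
)
batch = next(iter(DataLoader(dataset, batch_size=1)))
print(batch["ids"].shape, batch["mask"].shape, batch["targets"].shape)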
/images/LLMLingua-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LLMLingua-2.png
--------------------------------------------------------------------------------
/images/LLMLingua.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LLMLingua.png
--------------------------------------------------------------------------------
/images/LLMLingua_demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LLMLingua_demo.png
--------------------------------------------------------------------------------
/images/LLMLingua_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LLMLingua_framework.png
--------------------------------------------------------------------------------
/images/LLMLingua_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LLMLingua_logo.png
--------------------------------------------------------------------------------
/images/LLMLingua_motivation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LLMLingua_motivation.png
--------------------------------------------------------------------------------
/images/LongLLMLingua.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LongLLMLingua.png
--------------------------------------------------------------------------------
/images/LongLLMLingua_Motivation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/LongLLMLingua_Motivation.png
--------------------------------------------------------------------------------
/images/motivation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/LLMLingua/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/images/motivation.png
--------------------------------------------------------------------------------
/llmlingua/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | # flake8: noqa
5 | from .prompt_compressor import PromptCompressor
6 | from .version import VERSION as __version__
7 |
8 | __all__ = ["PromptCompressor"]
9 |
--------------------------------------------------------------------------------
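The package exposes `PromptCompressor` as its public entry point; the snippet below mirrors the LLMLingua-2 usage documented in the project README (the prompt text is made up, and the model weights are downloaded from the Hugging Face Hub on first use).

from llmlingua import PromptCompressor

compressor = PromptCompressor(
    model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
    use_llmlingua2=True,
)
result = compressor.compress_prompt(
    "The city council meeting opened with a discussion of the annual budget ...",
    rate=0.33,
    force_tokens=["\n", "?", "!", "."],
)
print(result["compressed_prompt"])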
/llmlingua/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import random
4 | import re
5 | import string
6 |
7 | import numpy as np
8 | import torch
9 | import yaml
10 | from torch.utils.data import Dataset
11 |
12 |
13 | class TokenClfDataset(Dataset):
14 | def __init__(
15 | self,
16 | texts,
17 | max_len=512,
18 | tokenizer=None,
19 | model_name="bert-base-multilingual-cased",
20 | ):
21 | self.len = len(texts)
22 | self.texts = texts
23 | self.tokenizer = tokenizer
24 | self.max_len = max_len
25 | self.model_name = model_name
26 | if "bert-base-multilingual-cased" in model_name:
27 | self.cls_token = "[CLS]"
28 | self.sep_token = "[SEP]"
29 | self.unk_token = "[UNK]"
30 | self.pad_token = "[PAD]"
31 | self.mask_token = "[MASK]"
32 | elif "xlm-roberta-large" in model_name:
33 | self.bos_token = ""
34 | self.eos_token = ""
35 | self.sep_token = ""
36 | self.cls_token = ""
37 | self.unk_token = ""
38 | self.pad_token = ""
39 | self.mask_token = ""
40 | else:
41 | raise NotImplementedError()
42 |
43 | def __getitem__(self, index):
44 | text = self.texts[index]
45 | tokenized_text = self.tokenizer.tokenize(text)
46 |
47 | tokenized_text = (
48 | [self.cls_token] + tokenized_text + [self.sep_token]
49 | ) # add special tokens
50 |
51 | if len(tokenized_text) > self.max_len:
52 | tokenized_text = tokenized_text[: self.max_len]
53 | else:
54 | tokenized_text = tokenized_text + [
55 | self.pad_token for _ in range(self.max_len - len(tokenized_text))
56 | ]
57 |
58 | attn_mask = [1 if tok != self.pad_token else 0 for tok in tokenized_text]
59 |
60 | ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)
61 |
62 | return {
63 | "ids": torch.tensor(ids, dtype=torch.long),
64 | "mask": torch.tensor(attn_mask, dtype=torch.long),
65 | }
66 |
67 | def __len__(self):
68 | return self.len
69 |
70 |
71 | def seed_everything(seed: int):
72 | random.seed(seed)
73 | os.environ["PYTHONHASHSEED"] = str(seed)
74 | np.random.seed(seed)
75 | torch.manual_seed(seed)
76 | torch.cuda.manual_seed(seed)
77 | torch.backends.cudnn.deterministic = True
78 | torch.backends.cudnn.benchmark = False
79 |
80 |
81 | def is_begin_of_new_word(token, model_name, force_tokens, token_map):
82 | if "bert-base-multilingual-cased" in model_name \
83 | or "tinybert" in model_name.lower() \
84 | or "mobilebert" in model_name.lower():
85 | if token.lstrip("##") in force_tokens or token.lstrip("##") in set(
86 | token_map.values()
87 | ):
88 | return True
89 | return not token.startswith("##")
90 | elif "xlm-roberta-large" in model_name:
91 | if (
92 | token in string.punctuation
93 | or token in force_tokens
94 | or token in set(token_map.values())
95 | ):
96 | return True
97 | return token.startswith("▁")
98 | else:
99 | raise NotImplementedError()
100 |
101 |
102 | def replace_added_token(token, token_map):
103 | for ori_token, new_token in token_map.items():
104 | token = token.replace(new_token, ori_token)
105 | return token
106 |
107 |
108 | def get_pure_token(token, model_name):
109 | if "bert-base-multilingual-cased" in model_name \
110 | or "tinybert" in model_name.lower() \
111 | or "mobilebert" in model_name.lower():
112 | return token.lstrip("##")
113 | elif "xlm-roberta-large" in model_name:
114 | return token.lstrip("▁")
115 | else:
116 | raise NotImplementedError()
117 |
118 |
119 | def process_structured_json_data(json_data, json_config):
120 | if isinstance(json_config, str):
121 | with open(json_config, "r") as file:
122 | json_config = yaml.safe_load(file)
123 | elif not isinstance(json_config, dict):
124 | raise ValueError(
125 | "Invalid json config file. It should be a dictionary or a path to a yaml file."
126 | )
127 | assert set(json_data.keys()) == set(
128 | json_config.keys()
129 | ), "Keys in json data and json config file do not match."
130 | context = ["{"]
131 | forced_context_ids = [0]
132 | for i, (k, v) in enumerate(json_data.items()):
133 | if not json_config[k]["pair_remove"]:
134 | forced_context_ids.append(i + 1)
135 | rate, compress, value_type = (
136 | json_config[k]["rate"],
137 | json_config[k]["compress"],
138 | json_config[k]["value_type"],
139 | )
140 | if not compress:
141 | rate = 1
142 | context.append(precess_jsonKVpair(k, v, value_type, rate))
143 | context[-1] = context[-1][:-14] + ""
144 | context.append("}")
145 | forced_context_ids.append(len(json_data) + 1)
146 |
147 | return context, forced_context_ids
148 |
149 |
150 | def precess_jsonKVpair(k, v, value_type, rate):
151 | if rate == 1:
152 | return (
153 | ""
154 | + f"{json.dumps({k:v})[1:-1]}, "
155 | + ""
156 | )
157 | if value_type == "str" or value_type == "string":
158 | v = str(v)
159 | new_v = (
160 | f""
161 | + v
162 | + ""
163 | )
164 | return (
165 | ""
166 | + f"{json.dumps({k:new_v})[1:-1]}, "
167 | + ""
168 | )
169 | elif value_type in ["int", "float", "integer", "number"]:
170 | if value_type in ["int", "integer"]:
171 | v = int(v)
172 | if value_type in ["float", "number"]:
173 | v = float(v)
174 | return (
175 | ""
176 | + f'"{k}": {v}, '
177 | )
178 | elif value_type == "bool" or value_type == "boolean":
179 | if v in ["True", "true", "TRUE", True]:
180 | v = "true"
181 | elif v in ["False", "false", "FALSE", False]:
182 | v = "false"
183 | else:
184 | raise ValueError(f"Invalid boolean value: {v}")
185 | new_v = (
186 | f""
187 | + v
188 | + ""
189 | )
190 | return (
191 | ""
192 | + f"{json.dumps({k:new_v})[1:-1]}, "
193 | + ""
194 | )
195 | elif value_type == "list" or value_type == "List":
196 | return (
197 | ""
198 | + f'"{k}": {process_sequence_data(rate, "[", "]", v)}'
199 | )
200 | elif value_type == "dict" or value_type == "dictionary":
201 | return (
202 | ""
203 | + f'"{k}": {process_sequence_data(rate, "[", "]", v, is_dict=True)}'
204 | )
205 | elif value_type == "set":
206 | raise ValueError(f"Invalid value type: {value_type}")
207 | # return '' + f'"{k}": {process_sequence_data(rate, "{", "}", v)}'
208 | elif value_type == "tuple":
209 | return (
210 | ""
211 | + f'"{k}": {process_sequence_data(rate, "(", ")", v)}'
212 | )
213 | else:
214 | raise ValueError(f"Invalid value type: {value_type}")
215 |
216 |
217 | def process_sequence_data(rate, start, end, sequence, is_dict=False):
218 | res = f'{start}"'
219 | n = len(sequence)
220 | if not is_dict:
221 | for i, item in enumerate(sequence):
222 | item = str(item)
223 | res += f"{item}"
224 | if i != n - 1:
225 | res += '", "'
226 | else:
227 | for i, (k, v) in enumerate(sequence.items()):
228 | item = f"{k}: {v}"
229 | item.replace('"', "'")
230 | res += f"{item}"
231 | if i != n - 1:
232 | res += '", "'
233 | res += f'"{end}, '
234 | return res
235 |
236 |
237 | def remove_consecutive_commas(text):
238 | text = re.sub(r",\s*", ",", text)
239 | text = re.sub(r",+", ",", text)
240 | return text
241 |
--------------------------------------------------------------------------------
/llmlingua/version.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | _MAJOR = "0"
5 | _MINOR = "2"
6 | # On master and in a nightly release the patch should be one ahead of the last
7 | # released build.
8 | _PATCH = "2"
9 | # This is mainly for nightly builds which have the suffix ".dev$DATE". See
10 | # https://semver.org/#is-v123-a-semantic-version for the semantics.
11 | _SUFFIX = ""
12 |
13 | VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
14 | VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
15 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 88
3 | target-version = ['py38']
4 | include = '\.pyi?$'
5 |
6 | [tool.isort]
7 | atomic = true
8 | profile = "black"
9 | line_length = 88
10 | skip_gitignore = true
11 | known_first_party = ["llmlingua"]
12 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | default_section = FIRSTPARTY
3 | ensure_newline_before_comments = True
4 | force_grid_wrap = 0
5 | include_trailing_comma = True
6 | known_first_party = llmlingua
7 | known_third_party =
8 | imblearn
9 | numpy
10 | pandas
11 | pytorch-tabnet
12 | scipy
13 | sklearn
14 | torch
15 | torchaudio
16 | torchvision
17 | torch_xla
18 | tqdm
19 | xgboost
20 |
21 | line_length = 119
22 | lines_after_imports = 2
23 | multi_line_output = 3
24 | use_parentheses = True
25 |
26 | [flake8]
27 | ignore = E203, E501, E741, W503, W605
28 | max-line-length = 119
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | from setuptools import find_packages, setup
5 |
6 | # PEP0440 compatible formatted version, see:
7 | # https://www.python.org/dev/peps/pep-0440/
8 | #
9 | # release markers:
10 | # X.Y
11 | # X.Y.Z # For bugfix releases
12 | #
13 | # pre-release markers:
14 | # X.YaN # Alpha release
15 | # X.YbN # Beta release
16 | # X.YrcN # Release Candidate
17 | # X.Y # Final release
18 |
19 | # version.py defines the VERSION and VERSION_SHORT variables.
20 | # We use exec here so we don't import llmlingua whilst setting up.
21 | VERSION = {} # type: ignore
22 | with open("llmlingua/version.py", "r") as version_file:
23 | exec(version_file.read(), VERSION)
24 |
25 | INSTALL_REQUIRES = [
26 | "transformers>=4.26.0",
27 | "accelerate",
28 | "torch",
29 | "tiktoken",
30 | "nltk",
31 | "numpy",
32 | ]
33 | QUALITY_REQUIRES = [
34 | "black==21.4b0",
35 | "flake8>=3.8.3",
36 | "isort>=5.5.4",
37 | "pre-commit",
38 | "pytest",
39 | "pytest-xdist",
40 | ]
41 | DEV_REQUIRES = INSTALL_REQUIRES + QUALITY_REQUIRES
42 |
43 | setup(
44 | name="llmlingua",
45 | version=VERSION["VERSION"],
46 | author="The LLMLingua team",
47 | author_email="hjiang@microsoft.com",
48 |     description="To speed up LLMs' inference and enhance LLMs' perception of key information, compress the prompt and KV-Cache, which achieves up to 20x compression with minimal performance loss.",
49 | long_description=open("README.md", encoding="utf8").read(),
50 | long_description_content_type="text/markdown",
51 | keywords="Prompt Compression, LLMs, Inference Acceleration, Black-box LLMs, Efficient LLMs",
52 | license="MIT License",
53 | url="https://github.com/microsoft/LLMLingua",
54 | classifiers=[
55 | "Intended Audience :: Science/Research",
56 | "Development Status :: 3 - Alpha",
57 | "Programming Language :: Python :: 3",
58 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
59 | ],
60 | package_dir={"": "."},
61 | packages=find_packages("."),
62 | extras_require={
63 | "dev": DEV_REQUIRES,
64 |         "quality": QUALITY_REQUIRES,
65 | },
66 | install_requires=INSTALL_REQUIRES,
67 | include_package_data=True,
68 | python_requires=">=3.8.0",
69 | zip_safe=False,
70 | )
71 |
--------------------------------------------------------------------------------
/tests/test_llmlingua.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import unittest
5 |
6 | from llmlingua import PromptCompressor
7 |
8 |
9 | class LLMLinguaTester(unittest.TestCase):
10 | """
11 | End2end Test for LLMLingua
12 | """
13 |
14 | GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1"
15 | GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo and Melanie to plan how many hours they should together their test have 2 their textbook and 4 to They out should and 1 hours. they study, many they study total week they a break every hour, include 3minute and lunch day\n's think step\n Melanie should the chapters hours 2 = hours\n the to dedicate x\n Melanie to with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4"
16 | GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "Question: You can buy 4 apples or 1 for. You bought 36 fruits evenly split between, waterons and of 1 orange $.. much does cost if your total bill $\n's think step\nIf were between 3 of, then I 36/3 = 12 of fruitIf 1 orange50 then oranges50 * $If66 I $ oranges I $66 $60 on the other 2 fruit\nAssuming the of is W, and that you price and of is then 1W=4AIf we know we bought 12 and, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1"
17 | JSON_PROMPT = """
18 | {
19 | "id": "987654",
20 | "name": "John Doe",
21 | "isActive": "true",
22 | "biography": "John Doe, born in New York in 1985, is a renowned software engineer with over 10 years of experience in the field. John graduated from MIT with a degree in Computer Science and has since worked with several Fortune 500 companies. He has a passion for developing innovative software solutions and has contributed to numerous open source projects. John is also an avid writer and speaker at tech conferences, sharing his insights on emerging technologies and their impact on the business world. In his free time, John enjoys hiking, reading science fiction novels, and playing the piano.",
23 | "employmentHistory": [
24 | {
25 | "company": "TechCorp",
26 | "role": "Senior Software Engineer",
27 | "description": "At TechCorp, John was responsible for leading a team of software engineers and overseeing the development of scalable web applications. He played a key role in driving the adoption of cloud technologies within the company, significantly enhancing the efficiency of their digital operations."
28 | },
29 | {
30 | "company": "Innovatech",
31 | "role": "Lead Developer",
32 | "description": "In his role at Innovatech, John focused on developing cutting-edge AI algorithms and implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making."
33 | }
34 | ],
35 | "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development"
36 | }"""
37 | JSON_COMPRESSED_PROMPT = """
38 | {
39 | "id": "987654",
40 | "name": "John Doe",
41 | "isActive": "true",
42 | "biography": " Doe, born in York in 1985 a renowned engineer with over in the field. John from MIT a Science and has since worked with several He has a for developing innovative solutions and has to numerous projects John is also avid and speaker at conferences, his on technologies and their the business. In his enjoys, reading fiction and playing piano.",
43 | "employmentHistory": [
44 | {
45 | "company": "TechCorp",
46 | "role": "Senior Engineer",
47 | "description": " John was for leading of engineers and of scalable He in the of cloud technologies company, significantly the of their digital operations."
48 | },
49 | {
50 | "company": "Innovatech",
51 | "role": "Lead",
52 | "description": "In his John on developingedge AI and implementing machine learning solutions for various business applications. He was instrumental in developing a predictive analytics tool that transformed the company's approach to data-driven decision making."
53 | }
54 | ],
55 | "skills": "Java, Python, Machine Learning, Cloud Computing, AI Development"
56 | }"""
57 | MEETINGBANK_TRANSCRIPT_0_PROMPT = "Speaker 4: Thank you. And can we do the functions for content? Items I believe are 11, three, 14, 16 and 28, I believe.\nSpeaker 0: Item 11 is a communication from Council on Price recommendation to increase appropriation in the general fund group in the City Manager Department by $200 to provide a contribution to the Friends of the Long Beach Public Library. Item 12 is communication from Councilman Super Now. Recommendation to increase appropriation in the special advertising and promotion fund group and the city manager's department by $10,000 to provide support for the end of summer celebration. Item 13 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the city manager department by $500 to provide a donation to the Jazz Angels . Item 14 is a communication from Councilman Austin. Recommendation to increase appropriation in the general fund group in the City Manager department by $300 to provide a donation to the Little Lion Foundation. Item 16 is a communication from Councilman Allen recommendation to increase appropriation in the general fund group in the city manager department by $1,020 to provide contribution to Casa Korero, Sew Feria Business Association, Friends of Long Beach Public Library and Dave Van Patten. Item 28 is a communication. Communication from Vice Mayor Richardson and Council Member Muranga. Recommendation to increase appropriation in the general fund group in the City Manager Department by $1,000 to provide a donation to Ron Palmer Summit. Basketball and Academic Camp.\nSpeaker 4: We have a promotion and a second time as councilman served Councilman Ringa and customers and they have any comments.\nSpeaker 2: Now. I had queued up to motion, but.\nSpeaker 4: Great that we have any public comment on this.\nSpeaker 5: If there are any members of the public that would like to speak on items 11, 12, 13, 14, 16 and 28 in person, please sign up at the podium in Zoom. Please use the raise hand feature or dial star nine now. Seen on the concludes public comment.\nSpeaker 4: Thank you. Please to a roll call vote, please.\nSpeaker 0: Councilwoman Sanchez.\nSpeaker 2: I am.\nSpeaker 0: Councilwoman Allen. I. Councilwoman Price.\nSpeaker 2: I.\nSpeaker 0: Councilman Spooner, i. Councilwoman Mongo i. Councilwoman Sarah I. Councilmember Waronker I. Councilman Alston.\nSpeaker 1: I.\nSpeaker 0: Vice Mayor Richardson.\nSpeaker 3: I.\nSpeaker 0: The motion is carried nine zero.\nSpeaker 4: Thank you. That concludes the consent. Just a couple announcements for the regular agenda. So we do have a very long and full agenda today. We have the budget hearing, which will happen first and then right after the budget hearing. We have a variety of other hearings as it relate to the our local control program and sales tax agreement. And then we have we're going to go right into some issues around and bonds around the aquarium and also the second reading of the health care worker ordinance, which we're going to try to do all of that towards the beginning of the agenda. And then we have a long agenda for the rest of of the council. So I just want to warn folks that we do have a we do have a long meeting. We're going to go right into the budget hearings. That's the first thing on the agenda. And they're going to try to move through that, through the council as expeditiously as possible. And so with that, let's continue the budget hearing, which we are doing for fire, police and parks. 
We're going to hear all of the presentations at once. And then after we go through all the presentations, we'll do all the all of the questions at once and then any any public comment, and we'll go from there."
58 | COMPRESSED_MULTIPLE_STRUCTURED_CONTEXT_PROMPT = '\n {\n "id": "987654",\n "name": "John Doe",\n "isActive": "true",\n "biography": " Doe, born in York in a renowned engineer over the field John from and has worked with several has a for developing has to numerous avid and speaker at, his on and the. In his enjoys, reading fiction and playing piano.",\n "employmentHistory": [\n {\n "company": "TechCorp",\n "role": "Senior",\n "description": " John was for of engineers and of scalable He in the of cloud technologies, significantly the of their digital operations."\n },\n {\n "company": "Innovatech",\n "role": "Lead",\n "description": "In John on developingedge AI and implementing learning for various He was developing a analytics that the\'s to datadriven making."\n }\n ],\n "skills": ",,, AI Development"\n }\n\nSpeaker 4: you. And we do the for content? Items I are 11,,, and, believe.\nSpeaker 0: a communication from on Price to increase the fund the City Manager a the the is Councilman Super. the special provide the of summer.man. increase fund manager a the Jazz theman Allen, provide toa, Sew Feria, of and Item communication. from Mayor Member Mur to Ron Palmer. and Academic\nSpeaker 4: We have a promotion and a second time asman servedman Ringa and customers and they have any comments.\nSpeaker 2: Now. I had queued up to motion, but.\nSpeaker 4: Great that we have any public comment on this.\nSpeaker 5: If there any members the public that to speak on items 11,, 16 and 28 in person please sign up at the podium in Zoom. Please use the raise hand feature or dial star nine. Seen on the concludes public comment\nSpeaker 4:. Please to a\nSpeaker 0:woman\nSpeaker 2:\nSpeaker 0:woman.woman\nSpeaker 2:\nSpeaker 0:man,woman Mongowomanmemberman\nSpeaker 1:\nSpeaker 0: Mayor\nSpeaker 3:\nSpeaker 0: The is carried nine\nSpeaker 4: the and the the budget hearings. That\'s the first thing on the agenda. And they\'re going to try to move through that, through the council as expeditiously as possible. And so with that, let\'s continue the budget hearing, which we are doing for fire, police and parks. We\'re going to hear all of the presentations at once. And then after we go through all the presentations, we\'ll do all the all of the questions at once and then any any public comment, and we\'ll go from there.'
59 |
60 | def __init__(self, *args, **kwargs):
61 | super(LLMLinguaTester, self).__init__(*args, **kwargs)
62 | try:
63 | import nltk
64 |
65 | nltk.download("punkt")
66 |         except Exception:
67 |             print("nltk_data already exists.")
68 | self.llmlingua = PromptCompressor("lgaalves/gpt2-dolly", device_map="cpu")
69 |
70 | def test_general_compress_prompt(self):
71 | # Single Context
72 | compressed_prompt = self.llmlingua.compress_prompt(
73 | self.GSM8K_PROMPT.split("\n\n")[0], target_token=150
74 | )
75 | self.assertEqual(
76 | compressed_prompt["compressed_prompt"],
77 | self.GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT,
78 | )
79 | self.assertEqual(compressed_prompt["origin_tokens"], 422)
80 | self.assertEqual(compressed_prompt["compressed_tokens"], 293)
81 | self.assertEqual(compressed_prompt["ratio"], "1.4x")
82 | self.assertEqual(compressed_prompt["rate"], "69.4%")
83 |
84 | # Multiple Context
85 | compressed_prompt = self.llmlingua.compress_prompt(
86 | self.GSM8K_PROMPT.split("\n\n"), target_token=150
87 | )
88 | self.assertEqual(
89 | compressed_prompt["compressed_prompt"],
90 | self.GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT,
91 | )
92 | self.assertEqual(compressed_prompt["origin_tokens"], 727)
93 | self.assertEqual(compressed_prompt["compressed_tokens"], 206)
94 | self.assertEqual(compressed_prompt["ratio"], "3.5x")
95 | self.assertEqual(compressed_prompt["rate"], "28.3%")
96 |
97 | def test_general_structured_compress_prompt(self):
98 |         # Single Structured Context
99 | import json
100 |
101 | context, _, _, _ = self.llmlingua.segment_structured_context(
102 | [self.JSON_PROMPT], 0.5
103 | )
104 | _ = json.loads(context[0])
105 | compressed_prompt = self.llmlingua.structured_compress_prompt(
106 | [self.JSON_PROMPT],
107 | rate=0.5,
108 | use_sentence_level_filter=True,
109 | use_token_level_filter=True,
110 | )
111 | _ = json.loads(compressed_prompt["compressed_prompt"])
112 | self.assertEqual(
113 | compressed_prompt["compressed_prompt"],
114 | self.JSON_COMPRESSED_PROMPT,
115 | )
116 | self.assertEqual(compressed_prompt["origin_tokens"], 318)
117 | self.assertEqual(compressed_prompt["compressed_tokens"], 241)
118 | self.assertEqual(compressed_prompt["ratio"], "1.3x")
119 | self.assertEqual(compressed_prompt["rate"], "75.8%")
120 |
121 |         # Multiple Structured Context
122 | compressed_prompt = self.llmlingua.structured_compress_prompt(
123 | [self.JSON_PROMPT, self.MEETINGBANK_TRANSCRIPT_0_PROMPT],
124 | rate=0.5,
125 | use_sentence_level_filter=False,
126 | use_token_level_filter=True,
127 | )
128 | self.assertEqual(
129 | compressed_prompt["compressed_prompt"],
130 | self.COMPRESSED_MULTIPLE_STRUCTURED_CONTEXT_PROMPT,
131 | )
132 | self.assertEqual(compressed_prompt["origin_tokens"], 1130)
133 | self.assertEqual(compressed_prompt["compressed_tokens"], 567)
134 | self.assertEqual(compressed_prompt["ratio"], "2.0x")
135 | self.assertEqual(compressed_prompt["rate"], "50.2%")
136 |
--------------------------------------------------------------------------------
/tests/test_llmlingua2.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Microsoft
2 | # Licensed under The MIT License [see LICENSE for details]
3 |
4 | import unittest
5 |
6 | from llmlingua import PromptCompressor
7 |
8 |
9 | class LLMLingua2Tester(unittest.TestCase):
10 | """
11 | End2end Test for LLMLingua-2
12 | """
13 |
14 | PROMPT = "John: So, um, I've been thinking about the project, you know, and I believe we need to, uh, make some changes. I mean, we want the project to succeed, right? So, like, I think we should consider maybe revising the timeline.\n\nSarah: I totally agree, John. I mean, we have to be realistic, you know. The timeline is, like, too tight. You know what I mean? We should definitely extend it."
15 | COMPRESSED_SINGLE_CONTEXT_PROMPT = "John: thinking project believe need make changes. want project succeed? consider revising timeline.\n\n Sarah agree. be realistic. timeline too tight.? extend."
16 | COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "John: So, I've been thinking about project believe we need to make changes. we want project to succeed, right? think we should consider maybe revising timeline."
17 |
18 | GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1"
19 | GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo Melanie plan test 2 chapters 4 worksheets 3 hours each chapter 1.5 hours each worksheet study 4 hours day how days 10-minute break every 3 10-minute snack breaks 30 minutes lunch\n\n dedicate 3 hours 2 chapters 3 2 = 6 hours total\n worksheets 1.5 hours each worksheet 1.5 4 = 6 hours total\n 12 hours study 4 hours a day 12 / 4 = 3 days\n breaks lunch 10-minute break 12 hours 10 = 120 minutes\n 3 10-minute snack breaks 3 10 = 30 minutes\n 30 minutes lunch 120 + 30 + 30 = 180 minutes 180 / 60 = 3 extra hours\n 12 hours study + 3 hours breaks = 15 hours total\n 4 hours each day 15 / 4 = 3.75\n 4 days\nThe answer is 4"
20 | GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "4 apples 1 watermelon 36 fruits oranges watermelons 1 orange $0.50 1 apple bill $66\n\n 36 fruits 3 36/3 = 12 units\n 1 orange $0.50 12 oranges $0.50 * 12 = $6\n total bill $66 spent $6 oranges $66 - $6 = $60 other 2\n watermelon W 4 apples one apple A 1W=4A\n 12 watermelons 12 apples $60 $60 = 12W + 12A\n $60 = 12(4A + 12A\n = 48A + 12A\n = 60A\n one apple $60/60= $1\nThe answer is 1"
21 |
22 | MEETINGBANK_PROMPT = "Item 28 Report from Development. Services Recommendation to declare ordinance amending the Land Use District Map from institutional to IRP 13 read and adopted as read District eight. Councilman Austin. So moved. Wonderful. And I want to ask Councilman Andrews so any member of the public that wishes to address item 28 saying none, members, cast your vote. Oh, I'm sorry, sir. I did not see you. Can we? I know this sounds picky and stupid. But this is an illogical motion because you haven't yet created ARP 13. By the way, unlike some other speakers, I will furnish you my name. I'm Joe Weinstein. I did speak last week. I do not like to come down here again to talk on the same subjects. But. There is a minor little matter. As to whether a. The proposed zoning is a good idea. And B, whether. The project, which it is intended. To permit. In fact. Meets the specifications of the zoning. I have not check that out, but someone else did raise that question and there may be some question as to whether all of the conditions of that zoning have, in fact, been met by the details of this project. This particular zoning, perhaps in the abstract, need not be a bad idea, but the way you see it realized in the project. Is not a very good idea. You could have the same density and more without destroying the usability, the usable green space that this design does. Because really, although it looks impressive from a top down view, it looks like you see plenty of green space between the buildings, that that space is pretty well wasted and useless because the buildings are high enough to pretty well shade and dominate the green space that's in that project. So I'm not saying that the density that you're going for is a bad thing. But doing it in this way doesn't work, and any zoning that just permits this without further control is not a good idea. Thank you. Okay. Thank you, sir. Members, please cast your vote. Councilman Andrew's motion carries. Next time, please. Report from Development Services recommendation to declare ordinance amending the Land Use District Map from institutional to park red and adopted as Red District eight."
23 | MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Item 28 Report Development. Services Recommendation declare ordinance amending Land Use District Map institutional IRP 13 adopted District eight. Councilman Austin. ask Councilman Andrews public address item 28 cast vote. see?. illogical motion created ARP 13. Joe Weinstein. last week. same subjects. minor matter. proposed zoning good idea. project intended. permit Meets specifications zoning. question conditions zoning met details project. zoning not bad project. not good. same density more without destroying usability green space. green space between buildings wasted useless buildings high shade dominate green space. not density bad. doesn't work zoning permits without control not good idea. Thank you. cast vote. Councilman Andrew's motion carries. Next time.Development Services ordinance Land District Map park District."
24 |
25 | LONGBENCH_PROMPT_LIST = [
26 | "新闻内容:\n(服务·健康)专家提醒:寒冷气候易诱发心脑血管疾病\n新华社海口2月9日专电(张苏民、李建国)海口市疾病预防控制中心专家介绍,持续的寒冷气候是心脑血管疾病的杀手,尤其患有高血压或高血脂疾病的老人更应做好防范,防止脑中风发生。\n 在寒冷的气候环境当中要注意保暖,增添衣服,饮食以清淡为主,多食用蔬菜,忌暴食荤类。尤其过年时,切忌熬夜,平时要加强身体锻炼,劳逸结合。除此之外,冬季还是呼吸道传染病暴发和流行的季节,应该注意预防流感、麻疹、流脑、水痘等呼吸道传染病的发生。\n 专家还指出,由于寒冷气候影响,人们习惯门窗紧闭,空气不对流,一旦有传染源传入,极容易造成疾病的暴发。春节期间,一些商场或公共娱乐场所人群密集,有关单位应加强通风。(完)\n类别:医药、卫生",
27 | "\n\n新闻内容:\n李明波在恩施调研时强调 大力推进基层党内民主建设\n本报讯 (记者吴畏、通讯员曾言、周恩祖)11日至13日,省委常委、秘书长李明波到恩施州调研基层党建工作时强调,要以增强党的创新活力、巩固党的团结统一为目标,以改革创新精神大力抓好基层党内民主建设。\n 李明波视察了非公有制企业党建、党代表常任制、基层党务公开、以党内和谐推进社区和谐等党建工作现场,与基层党务工作者座谈。李明波强调,在新形势下,要把握好民主进程与经济社会发展、尊重党员主体地位与提高党员民主素质、履行党员民主权利与保证党的统一意志、发挥党员民主监督作用与加强党纪教育管理等的关系,进一步深入探索,在丰富形式、拓宽渠道、完善机制等方面取得更大成绩。\n类别:政治",
28 | "\n\n新闻内容:\n第38届世界贸易中心年会及经贸洽谈会\n第38届世界贸易中心年会将于2007年10月21至24日在美国路易斯\n安那州首府新奥尔良召开。该会由美国纽约世界贸易中心总部和美国贸\n易服务管理总局、新奥尔良世贸中心共同举办,届时将有来自60多个国\n家和地区的经贸代表团约600余人与会。天津贸促会与天津世贸中心协\n会将共同组织天津经贸代表团赴美国参加“世贸中心2007年年会及经贸\n洽谈会”。\n 联系人:王岭 刘鹏\n 电话:022-2520231725202123\n 传真:022-25201975\n 地址:天津经济技术开发区宏达街19号A区2楼\n类别:商业、外贸、海关",
29 | "\n\n新闻内容:\n(全运会)第十一届全运会开闭幕时间确定\n新华社济南6月5日体育专电(记者赵仁伟)第十一届全国运动会组委会5日在济南宣布,十一运会将于今年10月16日在济南奥体中心开幕,闭幕时间为10月28日。\n 十一运会组委会常务副秘书长、山东省体育局局长张洪涛介绍,十一运会的比赛项目共设33个大项、43个分项、362个小项,其中包括28个夏季奥运会项目、4个冬季项目以及武术项目。与2005年十运会相比,大项增加了1个,即自由式滑雪;小项增加了5个,分别是自由式滑雪男子个人、女子个人,女子水球项目,足球男子16岁以下组和女子18岁以下组。\n 在十一运会全部362个小项中,马拉松男、女2个小项的比赛在北京举办,速度滑冰4个小项、自由式滑雪2个小项的比赛分别在沈阳和长春举办,其余354个小项的比赛在山东省17个赛区举行。其中,济南赛区共举办小项212个,青岛48个,日照40个,滨州28个,枣庄8个,菏泽7个,威海5个,烟台、德州各3个;淄博、东营、潍坊、济宁、泰安、莱芜、临沂、聊城8个赛区只举办小组赛和第四名以后的比赛,不产生金牌。\n 张洪涛介绍,十一运会冰雪项目已于1月至4月举行,占全部小项的4.4%。因部分夏季项目的世界锦标赛或国际重要赛事的时间与十一运会比赛时间冲突或相距较近,国家体育总局确定把这些项目的比赛安排在开幕式前举行,共有15个项目、80个小项,占全部小项的22.1%。(完)\n类别:体育",
30 | "\n\n新闻内容:\n(教育)河北整顿公办初中、小学招收择校生\n(教育)河北整顿公办初中、小学招收择校生\n 新华社石家庄3月12日电(冯月静)记者从河北省教育纪检监察审计工作会议上了解到,从今年起,河北省不再审批新的改制学校。对已审批的改制学校进行一次全面整顿和规范,重点解决公办初中、小学以改制为名或以民办为名举办“校中校”“校中班”高收费问题。\n 据了解,河北省规定达不到要求的,要限期整改;年内仍达不到标准要求的,一律停止招生。公办学校一律不准搞“一校两制”,更不准以改制为名高收费。\n 同时,今年秋季新学年开始,设区市市区的公办省级示范性普通高中(含在县镇办学的市直属省级示范性高中)择校生比例最高限额由原定的40%一律下调为30%。严禁学校擅自扩大择校生招生比例、降低录取分数线、提高收费标准或在限定金额外加收任何其他费用。(完)\n类别:教育",
31 | "\n\n新闻内容:\n(服务·关注“过劳死”) “过劳死”青睐什么人?\n人?\n 新华社郑州3月16日专电(记者李丽静) 有关专家\n研究表明:受教育程度高、中青年、女性是“过劳死”这\n一疾病的危险人群。这是因为这些人事业上强力拼搏,生\n活负荷过重,自身经常处于紧张状态之中,过度疲劳难以\n避免。\n 随着社会竞争的日趋激烈,该病也越来越多地困扰着\n我国的都市人。据一项在上海、无锡、深圳等地对\n1197位中年人健康状况调查显示:其中66%的人有\n失眠、多梦、不易入睡等现象;62%的人经常腰酸背痛;\n58%的人一干活就累;57%的人爬楼时感到吃力或记\n忆力明显减退;48%的人皮肤干燥、瘙痒、面色晦暗、\n脾气暴躁、焦虑。据国家有关部门的一项调查结果表明,\n慢性疲劳综合征在城市新兴行业人群中的发病率为10%\n至20%,在科技、新闻、广告、公务人员、演艺、出租\n车司机等行业中发病率则更高。\n 有关专家通过统计认为,“过劳死”特别“青睐”三\n种人:\n 第一种是有钱但不知保养的人。这部分人“富裕”的\n背后,往往有一条铺满辛酸的路。由于对贫穷的恐惧,使\n他们对财富永远不满足。为了追逐更多的财富,即使赴汤\n蹈火也在所不辞,而对他们最初惟一的资本———身体,\n则很不在乎。 \n 第二种是有事业心,特别是称得上“工作狂”的人。\n主要以从事科研、教学、新型高科技,如网络等职业者居\n多。\n 第三种是有家族遗传背景者。如果父母亲、爷爷奶奶\n等直系亲属中有心绞痛、心肌梗死、脑中风的患者,就要\n特别小心了,千万别让自己累着,否则很有可能在年轻时\n就诱发疾病。\n 而在对“过劳死”人群深入研究中发现,猝死直接死\n因的前5位是冠状动脉疾病、主动脉瘤、心瓣膜病、心肌\n病和脑出血。一些无症状冠心病,特别是无症状心肌梗死\n是首要的危险因素,一般的体检和心电图不易发现隐性冠\n心病。一旦发作,措手不及。此外,高血压也是一个潜在\n的危险因素。在遇到某些诱因时,便会引发高血压、脑中\n风等。(完)\n类别:医药、卫生",
32 | "\n\n新闻内容:\n五项措施应对技术性贸易壁垒\n调查结果显示,2006年我国有31\n .4%的出口企业受到国外技术性贸易措施不同程度的影响,比2005年增长6.3个百分点;全年出口贸易直接损失359.20亿美元,占同期出口额的3.71%,企业新增成本191.55亿美元。\n 会议通报的情况显示,对中国企业出口影响较大的技术性贸易措施类型集中在认证要求、技术标准要求、有毒有害物质限量要求、包装及材料的要求和环保要求(包括节能及产品回收),食品中农兽药残留要求、重金属等有害物质限量要求、细菌等卫生指标要求、食品标签要求和食品接触材料的要求等方面。受国外技术性贸易措施影响较大的行业排在前五位的是机电、农食产品、化矿、塑料皮革和纺织鞋帽。\n 会议提出了加强应对的5点意见。一是要强化进出口质量监管措施,在“严”字上下功夫,重点从源头上抓好农兽药残留、有毒化学物质残留、微生物等问题,同时要完善监管机制,提高检测能力,要检得出,检得快,检得准。二是要加快实施技术标准战略,在“高”字上下功夫,不断提高采标率,加快标准的制修订步伐。三是要加大信息共享力度,在“准”字上下功夫,各部门要密切配合,建立沟通机制,做到信息资源的充分利用。四是要果断迅速应对突发事件,在“快”字上下功夫。五是要加强技术性贸易措施的积极应对,在“实”字上下功夫,协调配合、相互支持。\n类别:商业、外贸、海关",
33 | "\n\n新闻内容:\n(新华时评·奥运会倒计时一百天)让我们共同守护奥林匹克精神\n新华社北京4月30日电 题:让我们共同守护奥林匹克精神\n 新华社记者张旭\n 在北京奥运会倒计时一百天之际,奥运圣火结束在其他国家的传递进入中国香港。在这两个重要时间节点重合之时,让我们以奥林匹克精神为依归,回味今年以来围绕北京奥运的风风雨雨,并以百倍的努力在接下来的日子里守护这一美好理想。\n 奥林匹克运动会是古希腊人的体育盛会,许多比赛项目源于古希腊文化。顾拜旦说:“古希腊人之所以组织竞赛活动,不仅仅只是为了锻炼体格和显示一种廉价的壮观场面,更是为了教育人”。更高更快更强并不是现代奥林匹克运动的全部价值诉求。现代奥林匹克运动经过了一百年的历史变迁,向世界传达的精神与主题始终如一,那就是在共同创造、共同分享、平等友爱的旗帜下,展现人类最美好的情感。奥林匹克是迄今为止人类社会不同种族、地域乃至不同意识形态间最大的交集。\n 2001年7月13日,时任国际奥委会主席的萨马兰奇宣布北京取得2008年奥运会主办权,现代奥林匹克运动从奥林匹亚来到万里长城。7年后的春天,当奥运圣火开始在中国境外传递时,妖魔化中国的舆论攻势和扰乱奥运火炬传递的暴力举动让海内外目光聚焦中国。我们可以肯定地说,这些人在为一己之私对奥林匹克精神进行亵渎。\n 北京奥运圣火一路走来,虽然遇到了噪音和干扰,但更多面对的还是像火一样热情的世界人民和对奥林匹克精神充分尊重的各国人士。他们因为懂得尊重奥林匹克精神,因此也能够享受奥林匹克带来的快乐。\n 2008年4月30日,“北京欢迎你”的歌声回荡在有着近600年历史的紫禁城太庙上空。8月8日,中国人民将第一次以东道主的身份在北京承办举世瞩目的奥林匹克运动会。北京奥运会对中国来说不仅仅是一次体育盛会,更是一次与世界各国开展文化交流的机会。如同当年奥林匹亚为神圣的无战争区域一样,体育竞技的目标是为了全世界的和平与发展。北京奥运会也完全可以成为世界各种文明一个共同的精神家园,通过沟通交流,达到良性互动。\n 奥运会的脚步声离我们越来越近的时候,奥林匹克运动正在为13亿中国人民所熟悉,奥林匹克精神也继续在世界范围内承载起人类追求幸福生活的梦想。中国人民真诚地邀请各国运动员、教练员和朋友们参与2008年北京奥运会。中国人民同时真诚地邀请全世界热爱奥林匹克精神和奥林匹克运动的人们一起,共同守护这一人类美好理想,让它在北京奥运会上开放出更加美丽的花朵。(完)\n类别:体育",
34 | "\n\n新闻内容:\n海口“接管”省 特殊教育 学校\n创建于1989年的海南省特殊教育学校原属省教育厅直属正处级事业单位,为海南省惟一一所全日寄宿的公立特殊教育学校。\n 我市“接管”省特殊教育学校之后,将继续面向全省招收视障、听障两类适龄儿童,优化教育布局调整,促进特殊教育又好又快发展。\n类别:教育",
35 | "\n\n新闻内容:\n9月7日特稿(加1)(美国-大学流感)\n美一大学两千学生恐染流感\n 马震\n 美国华盛顿州立大学大约2000名学生报告甲型H1N1流感症状。校方和医护人员说,这可能是最严重的一起大学生感染新型流感事件。\n (小标题)人数众多\n 这所大学位于华盛顿州普尔曼,主校区大约有1.9万名学生。据美国《纽约时报》网络版6日报道,华盛顿州注册护士萨莉·雷德曼证实了大约2000名华盛顿州立大学学生报告流感症状一事。\n 雷德曼在华盛顿州立大学学生医疗部门工作。她说,流感暴发情况出现在8月21日,那时学校还没开学。但如今为学生提供医疗服务的部门总是门庭若市。有一天,大约有200名学生就诊或给医疗机构打电话报告喉咙疼、发烧、咳嗽等症状。\n 华盛顿州立大学所在惠特曼县的卫生部门官员说,州实验室上周的检测结果显示,这所大学的疫情确实是因甲型H1N1流感病毒引起。\n 学校现已开学。法新社本月6日报道,学校上周开了关于流感疫情的博客,博客上最新的信息说:“秋季学期的前10天,我们估计已与大约2000名有流感症状的人联络。”\n 校方管理人员说,一些学生可能到社区医院就诊,一些学生可能居家自我治疗。校方无法掌握这些人的人数,已要求当地卫生部门提供相关数据,以便校方更好了解疫情情况。\n (小标题)无一死亡\n 华盛顿州立大学已根据国家疾病控制和预防中心的防流感指南向学生提供咨询服务,以避免疫情进一步加重。学校还向学生发放了一些防流感的药品和护具等。\n 为防止甲型流感传播,美国的一些大学已建立起隔离机制,但华盛顿州立大学没有类似机制。雷德曼说,在华盛顿州立大学上报的大部分流感疫情案例中,疑似染病的学生被要求待在居所内休息并吃退烧药。如果这些人在不吃退烧药24小时后体温仍旧正常,就可以正常来上课。\n 美国已有593例与甲型流感有关的死亡病例,但华盛顿州立大学尚未发现一起死亡病例。到目前为止,学生的流感症状相对温和,只有两个不是学生的患者入院治疗。\n 校方在声明中说:“我校患者中的绝大部分症状温和,通常3到5天就能见强。”\n (小标题)担心传播\n 华盛顿州立大学大规模流感疫情出现前,美国大学健康协会于8月28日对165所大学实施了流感疫情调查。调查结果显示,全国超过2000名学生报告说有甲型流感症状。\n 惠特曼县公共卫生部门负责人蒂莫西·穆迪认为本月晚些时候开学的其他大学可能会遭遇类似华盛顿州立大学的情况,而地方医疗机构会担心疫情可能向校外蔓延。\n 国家疾病控制和预防中心主任托马斯·弗里登6日接受美国有线电视新闻网采访时说,学校医务人员本学年报告的流感数字不同寻常。疾病控制和预防中心此前未遭遇过8月和9月数字增长这么快的情况。\n 国家疾病控制和预防中心现在特别重视流感疫情。弗里登说:“如果它的致命性增加,可能会造成特别严重的情形,可能会给上学和上班的人带来特别多麻烦。”(完)(新华社供本报特稿)\n 关键词:华盛顿州立大学(Washington State University)\n类别:医药、卫生",
36 | "\n\n新闻内容:\n在国防教育的落实上下功夫\n在国防教育的落实上下功夫 赵荣\n 加强全民国防教育是增强国防观念和忧患意识、促进国防和军队建设的基础性工程。鉴此,在今后的实践中,要坚持以科学发展观为指导,科学谋划、创新形式、狠抓落实,使全民国防教育深入人心,扎实有效地开展下去。\n 抓好责任落实。《国防教育法》第三章第十八条规定:各地区各部门的领导人员应当依法履行组织、领导本地区、本部门开展国防教育的职责。因而,要使全民国防教育扎实有效地开展下去,各级领导和职能部门要依法负起抓好全民国防教育的责任,对本地区、本单位、本行业的国防教育,从计划安排到组织实施都要认真负责地抓好落实。\n 抓好人员落实。国防教育是面向全民的教育,它的开展必须面向全社会,而不能只针对个别地区、个别单位和个别人员。因而,各地要对一切有接受能力的公民实施国防教育,以提高全民的政治、思想和道德素质,使全体公民积极争当热爱祖国、热爱国防的好公民。\n 抓好效果落实。国防教育的开展,效果的落实极为重要。为此,教育中应着重抓好国防理论、国防精神、国防知识、国防历史、国防技能、国防法制的教育,以强化爱国精神、增长国防知识、强化国防观念。通过教育,使全体公民进一步了解我国安全面临的新形势、世界军事变革的新发展、我国国防和军队建设面临的新挑战、以及在对国防建设中应承担的义务和责任等,不断提高他们支持和关心国防建设的积极性和自觉性。\n (来源:中国国防报 发布时间: 2007-11-22 08:19)\n类别:军事",
37 | "\n\n新闻内容:\n中国又一学者当选瑞典皇家工程科学院外籍院士\n新华社北京8月20日电 北京航空航天大学中国循环经济研究中心主任、北京循环经济促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n 作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济发展”专题研究,并由联合国教科文组织发表项目研究报告创意知识经济。\n 他在中国科技和产业领域作出了多项贡献,主要包括:创意“知识经济”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济学等。\n 瑞典皇家工程科学院创建于1919年,是世界上第一个工程院,现有机械工程、电机工程等学部。该院参与相关诺贝尔奖项的提名和评审工作。目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。(完)\n类别:科学技术",
38 | ]
39 | LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "\n 新闻内容 第38届世界贸易中心年会及经贸洽谈会\n 安那州首府新奥尔良召开。\n 易服务管理总局、新奥尔良世贸中心共同举办\n 家和地区的经贸代表团约600余人与会。 天津贸促会与天津世贸中心协\n 会将共同组织天津经贸代表团赴美国参加“世贸中心2007年年会及经贸\n 洽谈会”。\n 联系人:王岭 刘鹏\n 电话:022-2520231725202123\n 传真:022-25201975\n 地址:天津经济 技术开发区宏达街19号A区2楼\n类别:商业、外贸、海关\n\n\n 新闻内容\n 海口“接管”省 特殊教育 学校\n 创建于1989年的海南省特殊教育 学校原属省教育 厅直属正处级事业单位,为海南省惟一一所全日寄宿的公立特殊教育 学校。\n教育 学校之后,将继续面向全省招收视障、听障两类适龄儿童教育 布局调整教育。\n类别:教育\n\n\n 中国又一学者当选瑞典皇家工程科学院外籍院士\n 新华社北京8月20日电 北京航空航天大学中国循环经济 研究中心主任、北京循环经济 促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n 作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。 1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济 发展”专题研究经济。\n:创意“知识经济 ”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济 学等。\n 瑞典皇家工程科学院创建于1919年,是世界上第一个工程院,现有机械工程、电机工程等学部。 目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。\n类别:科学技术"
40 |
41 | def __init__(self, *args, **kwargs):
42 | super(LLMLingua2Tester, self).__init__(*args, **kwargs)
43 | self.llmlingua = PromptCompressor(
44 | model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
45 | device_map="cpu",
46 | use_llmlingua2=True,
47 | )
48 |
49 | def test_general_compress_prompt(self):
50 | compressed_prompt = self.llmlingua.compress_prompt(
51 | self.PROMPT,
52 | rate=0.33,
53 | force_tokens=["\n", ".", "!", "?"],
54 | drop_consecutive=False,
55 | force_reserve_digit=False,
56 | )
57 | self.assertEqual(
58 | compressed_prompt["compressed_prompt"],
59 | self.COMPRESSED_SINGLE_CONTEXT_PROMPT,
60 | )
61 | self.assertEqual(compressed_prompt["origin_tokens"], 98)
62 | self.assertEqual(compressed_prompt["compressed_tokens"], 30)
63 | self.assertEqual(compressed_prompt["ratio"], "3.3x")
64 | self.assertEqual(compressed_prompt["rate"], "30.6%")
65 |
66 | compressed_prompt = self.llmlingua.compress_prompt(
67 | self.PROMPT.split("\n\n"),
68 | target_token=40,
69 | use_context_level_filter=True,
70 | force_tokens=["\n", ".", "!", "?"],
71 | drop_consecutive=False,
72 | force_reserve_digit=False,
73 | )
74 | self.assertEqual(
75 | compressed_prompt["compressed_prompt"],
76 | self.COMPRESSED_MULTIPLE_CONTEXT_PROMPT,
77 | )
78 | self.assertEqual(compressed_prompt["origin_tokens"], 98)
79 | self.assertEqual(compressed_prompt["compressed_tokens"], 34)
80 | self.assertEqual(compressed_prompt["ratio"], "2.9x")
81 | self.assertEqual(compressed_prompt["rate"], "34.7%")
82 |
83 | # Single Context
84 | compressed_prompt = self.llmlingua.compress_prompt(
85 | self.GSM8K_PROMPT.split("\n\n")[0],
86 | target_token=170,
87 | force_tokens=[
88 | "+",
89 | "-",
90 | "*",
91 | "×",
92 | "/",
93 | "÷",
94 | "=",
95 | "The answer is",
96 | "\n",
97 | "Question:",
98 | ],
99 | drop_consecutive=False,
100 | force_reserve_digit=True,
101 | )
102 | self.assertEqual(
103 | compressed_prompt["compressed_prompt"],
104 | self.GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT,
105 | )
106 | self.assertEqual(compressed_prompt["origin_tokens"], 422)
107 | self.assertEqual(compressed_prompt["compressed_tokens"], 203)
108 | self.assertEqual(compressed_prompt["ratio"], "2.1x")
109 | self.assertEqual(compressed_prompt["rate"], "48.1%")
110 |
111 | # Single Context
112 | compressed_prompt = self.llmlingua.compress_prompt(
113 | self.MEETINGBANK_PROMPT.split("\n\n")[0],
114 | target_token=150,
115 | force_tokens=["\n", ".", "?", "!"],
116 | drop_consecutive=True,
117 | force_reserve_digit=False,
118 | )
119 | self.assertEqual(
120 | compressed_prompt["compressed_prompt"],
121 | self.MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT,
122 | )
123 | self.assertEqual(compressed_prompt["origin_tokens"], 464)
124 | self.assertEqual(compressed_prompt["compressed_tokens"], 154)
125 | self.assertEqual(compressed_prompt["ratio"], "3.0x")
126 | self.assertEqual(compressed_prompt["rate"], "33.2%")
127 |
128 | # Multiple Context
129 | compressed_prompt = self.llmlingua.compress_prompt(
130 | self.GSM8K_PROMPT.split("\n\n"),
131 | target_token=150,
132 | use_context_level_filter=True,
133 | force_tokens=["+", "-", "*", "×", "/", "÷", "=", "The answer is", "\n"],
134 | drop_consecutive=False,
135 | force_reserve_digit=True,
136 | )
137 | self.assertEqual(
138 | compressed_prompt["compressed_prompt"],
139 | self.GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT,
140 | )
141 | self.assertEqual(compressed_prompt["origin_tokens"], 726)
142 | self.assertEqual(compressed_prompt["compressed_tokens"], 161)
143 | self.assertEqual(compressed_prompt["ratio"], "4.5x")
144 | self.assertEqual(compressed_prompt["rate"], "22.2%")
145 |
146 | # Multiple Context
147 | compressed_prompt = self.llmlingua.compress_prompt(
148 | self.LONGBENCH_PROMPT_LIST,
149 | target_token=1000,
150 | use_context_level_filter=True,
151 | force_tokens=[
152 | "\n",
153 | "。",
154 | ":",
155 | "?",
156 | "类别:",
157 | "农业、农村",
158 | "军事",
159 | "文学、艺术",
160 | "体育",
161 | "传媒业",
162 | "电子信息产业",
163 | "文化、休闲娱乐",
164 | "社会、劳动",
165 | "经济",
166 | "服务业、旅游业",
167 | "环境、气象",
168 | "能源、水务、水利",
169 | "财政、金融",
170 | "教育",
171 | "科学技术",
172 | "对外关系、国际关系",
173 | "矿业、工业",
174 | "政治",
175 | "交通运输、邮政、物流",
176 | "灾难、事故",
177 | "基本建设、建筑业、房地产",
178 | "医药、卫生",
179 | "法律、司法",
180 | "商业、外贸、海关",
181 | ],
182 | drop_consecutive=True,
183 | force_reserve_digit=False,
184 | )
185 | self.assertEqual(
186 | compressed_prompt["compressed_prompt"],
187 | self.LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT,
188 | )
189 | self.assertEqual(compressed_prompt["origin_tokens"], 8389)
190 | self.assertEqual(compressed_prompt["compressed_tokens"], 870)
191 | self.assertEqual(compressed_prompt["ratio"], "9.6x")
192 | self.assertEqual(compressed_prompt["rate"], "10.4%")
193 |
--------------------------------------------------------------------------------