├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── VaLProbing
├── README.md
├── VaLProbing-32K
│ ├── code_backward_32k_skip_list.json
│ ├── database_forward_32k_skip_list.json
│ ├── document_bi_32k_skip_list.json
│ └── results
│ │ ├── FILM-7B
│ │ ├── sample_code_backward_32k.jsonl
│ │ ├── sample_database_forward_32k.jsonl
│ │ └── sample_document_bi_32k.jsonl
│ │ ├── Mistral-7B-Instruct-v0.2
│ │ ├── sample_code_backward_32k.jsonl
│ │ ├── sample_database_forward_32k.jsonl
│ │ └── sample_document_bi_32k.jsonl
│ │ └── gpt4-turbo
│ │ ├── sample_code_backward_32k.jsonl
│ │ ├── sample_database_forward_32k.jsonl
│ │ └── sample_document_bi_32k.jsonl
├── download.py
├── figures
│ └── probing_fig.pdf
├── pieces
│ ├── code_functions.jsonl
│ ├── database_entities.jsonl
│ └── document_sentences.jsonl
└── plot.py
├── figures
├── probing_results.png
├── probing_results_new.png
├── real_world_long.png
└── short.png
├── real_world_long
├── README.md
├── evaluate.py
├── metrics.py
├── prompts
│ ├── LongBench_output_128_512.zip
│ └── LongBench_output_32_64.zip
└── results
│ ├── FILM-7B
│ ├── sample_LongBench_output_128.jsonl
│ ├── sample_LongBench_output_32.jsonl
│ ├── sample_LongBench_output_512.jsonl
│ └── sample_LongBench_output_64.jsonl
│ ├── Mistral-7B-Instruct-v0.2
│ ├── sample_LongBench_output_128.jsonl
│ ├── sample_LongBench_output_32.jsonl
│ ├── sample_LongBench_output_512.jsonl
│ └── sample_LongBench_output_64.jsonl
│ └── gpt4-turbo
│ ├── sample_LongBench_output_128.jsonl
│ ├── sample_LongBench_output_32.jsonl
│ ├── sample_LongBench_output_512.jsonl
│ └── sample_LongBench_output_64.jsonl
├── requirements.txt
├── short_tasks
├── README.md
├── evaluation.py
├── prompts
│ ├── csqa_0shot.jsonl
│ ├── gsm8k_8shot.jsonl
│ └── math_4shot.jsonl
├── results
│ ├── FILM-7B
│ │ ├── sample_csqa_0shot.jsonl
│ │ ├── sample_gsm8k_8shot.jsonl
│ │ └── sample_math_4shot.jsonl
│ └── Mistral-7B-Instruct-v0.2
│ │ ├── sample_csqa_0shot.jsonl
│ │ ├── sample_gsm8k_8shot.jsonl
│ │ └── sample_math_4shot.jsonl
└── utils.py
└── vllm_inference
└── vllm_inference.py
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Ll]og/
33 | [Ll]ogs/
34 |
35 | # Visual Studio 2015/2017 cache/options directory
36 | .vs/
37 | # Uncomment if you have tasks that create the project's static files in wwwroot
38 | #wwwroot/
39 |
40 | # Visual Studio 2017 auto generated files
41 | Generated\ Files/
42 |
43 | # MSTest test Results
44 | [Tt]est[Rr]esult*/
45 | [Bb]uild[Ll]og.*
46 |
47 | # NUnit
48 | *.VisualState.xml
49 | TestResult.xml
50 | nunit-*.xml
51 |
52 | # Build Results of an ATL Project
53 | [Dd]ebugPS/
54 | [Rr]eleasePS/
55 | dlldata.c
56 |
57 | # Benchmark Results
58 | BenchmarkDotNet.Artifacts/
59 |
60 | # .NET Core
61 | project.lock.json
62 | project.fragment.lock.json
63 | artifacts/
64 |
65 | # ASP.NET Scaffolding
66 | ScaffoldingReadMe.txt
67 |
68 | # StyleCop
69 | StyleCopReport.xml
70 |
71 | # Files built by Visual Studio
72 | *_i.c
73 | *_p.c
74 | *_h.h
75 | *.ilk
76 | *.meta
77 | *.obj
78 | *.iobj
79 | *.pch
80 | *.pdb
81 | *.ipdb
82 | *.pgc
83 | *.pgd
84 | *.rsp
85 | *.sbr
86 | *.tlb
87 | *.tli
88 | *.tlh
89 | *.tmp
90 | *.tmp_proj
91 | *_wpftmp.csproj
92 | *.log
93 | *.tlog
94 | *.vspscc
95 | *.vssscc
96 | .builds
97 | *.pidb
98 | *.svclog
99 | *.scc
100 |
101 | # Chutzpah Test files
102 | _Chutzpah*
103 |
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 |
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 |
121 | # Visual Studio Trace Files
122 | *.e2e
123 |
124 | # TFS 2012 Local Workspace
125 | $tf/
126 |
127 | # Guidance Automation Toolkit
128 | *.gpState
129 |
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 |
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 |
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 |
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 |
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 |
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 |
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 |
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 |
163 | # Web workbench (sass)
164 | .sass-cache/
165 |
166 | # Installshield output folder
167 | [Ee]xpress/
168 |
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 |
179 | # Click-Once directory
180 | publish/
181 |
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 |
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 |
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 |
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 |
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 |
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 |
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 |
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 |
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 |
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 |
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 |
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 |
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 |
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 |
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 |
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 |
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 |
288 | # Visual Studio 6 build log
289 | *.plg
290 |
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 |
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 |
297 | # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298 | *.vbp
299 |
300 | # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301 | *.dsw
302 | *.dsp
303 |
304 | # Visual Studio 6 technical files
305 | *.ncb
306 | *.aps
307 |
308 | # Visual Studio LightSwitch build output
309 | **/*.HTMLClient/GeneratedArtifacts
310 | **/*.DesktopClient/GeneratedArtifacts
311 | **/*.DesktopClient/ModelManifest.xml
312 | **/*.Server/GeneratedArtifacts
313 | **/*.Server/ModelManifest.xml
314 | _Pvt_Extensions
315 |
316 | # Paket dependency manager
317 | .paket/paket.exe
318 | paket-files/
319 |
320 | # FAKE - F# Make
321 | .fake/
322 |
323 | # CodeRush personal settings
324 | .cr/personal
325 |
326 | # Python Tools for Visual Studio (PTVS)
327 | __pycache__/
328 | *.pyc
329 |
330 | # Cake - Uncomment if you are using it
331 | # tools/**
332 | # !tools/packages.config
333 |
334 | # Tabs Studio
335 | *.tss
336 |
337 | # Telerik's JustMock configuration file
338 | *.jmconfig
339 |
340 | # BizTalk build output
341 | *.btp.cs
342 | *.btm.cs
343 | *.odx.cs
344 | *.xsd.cs
345 |
346 | # OpenCover UI analysis results
347 | OpenCover/
348 |
349 | # Azure Stream Analytics local run output
350 | ASALocalRun/
351 |
352 | # MSBuild Binary and Structured Log
353 | *.binlog
354 |
355 | # NVidia Nsight GPU debugger configuration file
356 | *.nvuser
357 |
358 | # MFractors (Xamarin productivity tool) working folder
359 | .mfractor/
360 |
361 | # Local History for Visual Studio
362 | .localhistory/
363 |
364 | # Visual Studio History (VSHistory) files
365 | .vshistory/
366 |
367 | # BeatPulse healthcheck temp database
368 | healthchecksdb
369 |
370 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
371 | MigrationBackup/
372 |
373 | # Ionide (cross platform F# VS Code tools) working folder
374 | .ionide/
375 |
376 | # Fody - auto-generated XML schema
377 | FodyWeavers.xsd
378 |
379 | # VS Code files for those working on multiple tools
380 | .vscode/*
381 | !.vscode/settings.json
382 | !.vscode/tasks.json
383 | !.vscode/launch.json
384 | !.vscode/extensions.json
385 | *.code-workspace
386 |
387 | # Local History for Visual Studio Code
388 | .history/
389 |
390 | # Windows Installer files from build outputs
391 | *.cab
392 | *.msi
393 | *.msix
394 | *.msm
395 | *.msp
396 |
397 | # JetBrains Rider
398 | *.sln.iml
399 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FILM: Make Your LLM Fully Utilize the Context
2 |
3 |
4 | 🤗 [Model] • 📃 [Paper] • ⚓ [VaLProbing-32K]
5 |
6 |
7 | This is the official repo for the paper *Make Your LLM Fully Utilize the Context*.
8 | This repo can help you to reproduce the results of **FILM-7B, a 32K-context LLM that overcomes the lost-in-the-middle problem**.
9 | FILM-7B is trained from Mistral-7B-Instruct-v0.2 by applying Information-Intensive (In2) Training.
10 | FILM-7B achieves near-perfect performance on probing tasks, SOTA-level performance on real-world long-context tasks among ~7B size LLMs, and does not compromise the short-context performance.
11 |
12 | Disclaimer: This repo is strictly for research purposes, and not an official product or service from Microsoft.
13 |
14 |
15 | ## Setup
16 |
17 | We recommend using [Conda](https://docs.conda.io/projects/miniconda) or the official [Pytorch Docker](https://hub.docker.com/layers/pytorch/pytorch/2.0.1-cuda11.7-cudnn8-devel/images/sha256-4f66166dd757752a6a6a9284686b4078e92337cd9d12d2e14d2d46274dfa9048?context=explore) to build up the environment.
18 |
19 | ```sh
20 | git clone https://github.com/microsoft/FILM.git
21 | cd FILM
22 | conda create -n FILM python=3.10.11
23 | conda activate FILM
24 | pip install torch==2.0.1 # cuda11.7 and cudnn8
25 | pip install -r requirements.txt
26 | ```
27 |
28 | ## Model Usage
29 |
30 | The system template for FILM-7B:
31 | ```text
32 | '''[INST] Below is a context and an instruction. Based on the information provided in the context, write a response for the instruction.
33 |
34 | ### Context:
35 | {YOUR LONG CONTEXT}
36 |
37 | ### Instruction:
38 | {YOUR QUESTION & INSTRUCTION} [/INST]
39 | '''
40 | ```
41 |
42 | ## Probing Results
43 |
44 | To reproduce the results on our VaL Probing, see the guidance in [VaLProbing](./VaLProbing).
45 |
46 |
47 |
48 |
49 |
50 |
51 | ## Real-World Long-Context Tasks
52 |
53 | To reproduce the results on real-world long-context tasks, see the guidance in [real_world_long](./real_world_long).
54 |
55 |
56 |
57 |
58 |
59 |
60 | ## Short-Context Tasks
61 |
62 | To reproduce the results on short-context tasks, see the guidance in [short_tasks](./short_tasks).
63 |
64 |
65 |
66 |
67 |
68 |
69 | ## 📝 Citation
70 | ```
71 | @misc{an2024make,
72 | title={Make Your LLM Fully Utilize the Context},
73 | author={Shengnan An and Zexiong Ma and Zeqi Lin and Nanning Zheng and Jian-Guang Lou},
74 | year={2024},
75 | eprint={2404.16811},
76 | archivePrefix={arXiv},
77 | primaryClass={cs.CL}
78 | }
79 | ```
80 |
81 | ## Contributing
82 |
83 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
84 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
85 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
86 |
87 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
88 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
89 | provided by the bot. You will only need to do this once across all repos using our CLA.
90 |
91 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
92 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
93 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
94 |
95 |
96 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # Support
2 |
3 | ## How to file issues and get help
4 |
5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
7 | feature request as a new Issue.
8 |
9 | For help and questions about using this project, please contact an1006634493@stu.xjtu.edu.cn.
10 |
11 | ## Microsoft Support Policy
12 |
13 | Support for this project is limited to the resources listed above.
14 |
--------------------------------------------------------------------------------
/VaLProbing/README.md:
--------------------------------------------------------------------------------
1 | ## VaL Probing
2 |
3 | The following guidance will help you to reproduce our results on VaL Probing mentioned in the paper or construct your own probing data.
4 |
5 | ### To Reproduce Our Results
6 |
7 | **Step 1: Download Data.**
8 |
9 | Our synthesized data for the three probing tasks mentioned in the paper are contained in [VaLProbing-32K](https://huggingface.co/datasets/In2Training/VaLProbing-32K/).
10 | Download the data locally for the later inference and plotting stage.
11 | ```bash
12 | python download.py
13 | ```
14 | The data will be downloaded into the folder `./VaLProbing-32K/`.
15 | Each line in the data files contains an input prompt and a ground-truth completion (label/description).
16 |
17 | **Step 2: Inference with vLLM.**
18 |
19 | You can directly use the downloaded data for inference without pre-processing, as these data have been formatted into the system template for FILM-7B.
20 | **To run inference with other LLMs, please change the system message and the template format.**
21 | ```bash
22 | export NCCL_IGNORE_DISABLED_P2P=1
23 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
24 | --testdata_file document_bi_32k.jsonl \
25 | --testdata_folder ./VaLProbing-32K/ \
26 | --output_folder ./VaLProbing-32K/results/FILM-7B/ \
27 | --max_length 128 \
28 | --tensor_parallel_size 8
29 |
30 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
31 | --testdata_file code_backward_32k.jsonl \
32 | --testdata_folder ./VaLProbing-32K/ \
33 | --output_folder ./VaLProbing-32K/results/FILM-7B/ \
34 | --max_length 128 \
35 | --tensor_parallel_size 8
36 |
37 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
38 | --testdata_file database_forward_32k.jsonl \
39 | --testdata_folder ./VaLProbing-32K/ \
40 | --output_folder ./VaLProbing-32K/results/FILM-7B/ \
41 | --max_length 128 \
42 | --tensor_parallel_size 8
43 | ```
44 |
45 | We provide our generation results in `./VaLProbing-32K/results/`, including FILM-7B, Mistral-7B-Instruct-v0.2, and GPT-4-Turbo.
46 |
47 | **Step 3: Plot.**
48 |
49 | Run `plot.py` to reproduce Figure 1 in our paper.
50 | Note that the examples in `*_32k_skip_list.json` are skipped during evaluation due to the ambiguity in the context (i.e., the retrieval keyword is mentioned more than once in the context).
51 | The figure is saved under `./VaLProbing-32K/figures/`.
52 | ```bash
53 | python plot.py
54 | ```
55 |
56 |
57 |
58 | ### To Construct Your Own VaL Probing
59 |
60 | We provide the [raw data](./pieces) for constructing the three probing tasks.
61 | You can use it to construct longer contexts and change the retrieval pattern.
62 | Note that you should check the ambiguity before evaluation.
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/VaLProbing/VaLProbing-32K/code_backward_32k_skip_list.json:
--------------------------------------------------------------------------------
1 | [8, 17, 51, 73, 77, 134, 150, 181, 184, 189, 197, 225, 246, 255, 258, 301, 310, 314, 315, 325, 336, 349, 360, 386, 390, 393, 418, 424, 444, 463, 466, 480, 492, 493, 495, 502, 505, 506, 536, 537, 541, 560, 573, 580, 584, 591, 593, 596, 599, 607, 609, 618, 625, 626, 627, 633, 636, 637, 639, 682, 688, 697, 706, 726, 729, 776, 777, 793, 805, 806, 810, 811, 815, 822, 824, 827, 829, 836, 844, 847, 849, 862, 869, 882, 883, 885, 895, 908, 917, 921, 933, 942, 943, 949, 974, 977, 996, 1000, 1014, 1022, 1041, 1042, 1049, 1057, 1058, 1087, 1125, 1129, 1132, 1145, 1162, 1186, 1207, 1224, 1235, 1237, 1257, 1273, 1280, 1291, 1295, 1297, 1298, 1306, 1307, 1314, 1316, 1323, 1324, 1336, 1337, 1338, 1360, 1363, 1367, 1373, 1382, 1386, 1415, 1416, 1443, 1444, 1448, 1457, 1469, 1471, 1476, 1499, 1512, 1515, 1516, 1524, 1528, 1532, 1547, 1585, 1617, 1638, 1646, 1648, 1651, 1660, 1670, 1679, 1682, 1717, 1728, 1742, 1752, 1753, 1757, 1767, 1769, 1771, 1774, 1802, 1810, 1817, 1838, 1839, 1844, 1851, 1853, 1880, 1898, 1918, 1921, 1935, 1938, 1943, 1948, 1953, 1957, 1962, 1963, 1975, 1993, 1998, 2009, 2058, 2059, 2063, 2071, 2099, 2106, 2117, 2127, 2131, 2155, 2172, 2181, 2182, 2183, 2189, 2197, 2198, 2208, 2215, 2218, 2222, 2228, 2236, 2240, 2257, 2260, 2261, 2275, 2279, 2283, 2288, 2298, 2328, 2329, 2337, 2346, 2347, 2350, 2358, 2364, 2370, 2373, 2395, 2407, 2414, 2417, 2450, 2459, 2462, 2473, 2485, 2524, 2528, 2568, 2587, 2615, 2627, 2635, 2636, 2653, 2661, 2664, 2675, 2688, 2699, 2706, 2718, 2757, 2769, 2791, 2794, 2795, 2799, 2808, 2812, 2813, 2814, 2820, 2823, 2831, 2835, 2836, 2839, 2845, 2870, 2879, 2895, 2900, 2901, 2902, 2908, 2918, 2931, 2932, 2941, 2943, 2947, 2949, 2959, 2961, 2964, 2968, 2970, 3045, 3057, 3064, 3089, 3120, 3121, 3131, 3146, 3151, 3174, 3188]
--------------------------------------------------------------------------------
/VaLProbing/VaLProbing-32K/database_forward_32k_skip_list.json:
--------------------------------------------------------------------------------
1 | [774, 1126]
--------------------------------------------------------------------------------
/VaLProbing/VaLProbing-32K/document_bi_32k_skip_list.json:
--------------------------------------------------------------------------------
1 | []
--------------------------------------------------------------------------------
/VaLProbing/download.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | import json
4 | from datasets import load_dataset
5 | import os
6 | from tqdm import tqdm
7 |
# Folder that the inference commands in the README and plot.py read from.
# The spelling (capital "L" in "VaL") must match them exactly: on a
# case-sensitive filesystem './ValProbing-32K/' is a different directory,
# so data written there would never be found by the later steps.
OUTPUT_DIR = './VaLProbing-32K/'


def drop_blank_fields(info):
    """Return a copy of ``info`` without the fields whose value is exactly ' '.

    The input mapping is left untouched.

    NOTE(review): a single-space value presumably marks a field that does not
    apply to the example -- confirm against the dataset card.
    """
    return {key: value for key, value in info.items() if value != ' '}


def main():
    """Download VaLProbing-32K and write one ``<split>.jsonl`` file per split.

    Every example becomes one JSON object per line in
    ``OUTPUT_DIR/<split>.jsonl``, with blank (' ') fields removed.
    """
    dataset = load_dataset("In2Training/VaLProbing-32K")

    # Creates the folder if needed and is a no-op when it already exists.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for cate in dataset.keys():
        out_path = os.path.join(OUTPUT_DIR, cate + '.jsonl')
        with open(out_path, 'w', encoding='utf-8') as f_write:
            for info in tqdm(dataset[cate]):
                f_write.write(json.dumps(drop_blank_fields(info)) + '\n')


if __name__ == '__main__':
    main()
22 |
--------------------------------------------------------------------------------
/VaLProbing/figures/probing_fig.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/VaLProbing/figures/probing_fig.pdf
--------------------------------------------------------------------------------
/VaLProbing/plot.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | import json
4 | import os
5 | import pdb
6 |
7 | import matplotlib.pyplot as plt
8 | from matplotlib.backends.backend_pdf import PdfPages
9 |
10 | import numpy as np
11 |
12 | from tqdm import tqdm
13 |
14 |
15 | model_infos = [
16 | ['FILM-7B', '#E97132', '#F4B898', '-'],
17 | ['Mistral-7B-Instruct-v0.2', '#7F7F7F', '#BFBFBF', '-'],
18 | ['gpt4-turbo', '#0F9ED5', '#87CEEA', ':'],
19 | ]
20 |
21 |
22 | fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(30, 10), dpi=200)
23 | fontsize = 20
24 | # fig, ax = plt.subplots(figsize=(10, 7), dpi=200)
25 |
26 |
27 |
28 | ####### Document
29 |
30 | total_len = 800
31 | span_len = 50
32 | span_num = int(total_len/span_len)
33 |
34 | set_ids = ['set_' + str(i) for i in range(4)]
35 |
36 | with open('./VaLProbing-32K/document_bi_32k.jsonl', 'r', encoding='utf-8') as f:
37 | label_infos = []
38 | for line in tqdm(f.readlines()):
39 | info = json.loads(line)
40 | gt = info['completion']
41 | position_id = info['position_id']
42 | set_id = info['set_id']
43 | set_id = 'set_' + str(set_id)
44 | label_infos.append({'gt': gt, 'position_id': position_id, 'set_id': set_id})
45 |
46 | with open('./VaLProbing-32K/document_bi_32k_skip_list.json', 'r', encoding='utf-8') as f:
47 | skip_list = json.load(f)
48 |
49 | for model_name, color_str, ecolor_str, linestyle in model_infos:
50 | set_ids_position2acc = {}
51 | for set_id in set_ids:
52 | set_ids_position2acc[set_id] = {}
53 | for i in range(total_len):
54 | set_ids_position2acc[set_id][i] = []
55 |
56 | with open('./VaLProbing-32K/results/' + model_name + '/sample_document_bi_32k.jsonl', 'r', encoding='utf-8') as f:
57 | pred_infos = [json.loads(line) for line in f.readlines()]
58 | for idx, (pred_info, label_info) in enumerate(tqdm(zip(pred_infos, label_infos))):
59 |
60 | if idx in skip_list:
61 | continue
62 |
63 | if 'gpt4' in model_name:
64 | pred = pred_info['sample']
65 | else:
66 | pred = pred_info['samples'][0]
67 |
68 | gt = label_info['gt']
69 | position_id = label_info['position_id']
70 | set_id = label_info['set_id']
71 |
72 | gt_words = set(gt.strip().lower().split())
73 | pred_words = set(pred.strip().lower().split())
74 | recall_score = len(gt_words & pred_words) / len(gt_words)
75 | set_ids_position2acc[set_id][position_id].append(recall_score)
76 |
77 | # if recall_score < 0.5:
78 | # pdb.set_trace()
79 |
80 | set_ids2span_acc_list = {}
81 | for set_id in set_ids:
82 | set_ids2span_acc_list[set_id] = []
83 | for i in range(span_num):
84 | span_start = span_len*i
85 | span_end = span_len*i + span_len - 1
86 | accs = []
87 | for position_id in range(span_start, span_end):
88 | accs += set_ids_position2acc[set_id][position_id]
89 | acc = sum(accs) / len(accs)
90 | set_ids2span_acc_list[set_id].append(acc)
91 |
92 | span_acc_list = []
93 | span_std_list = []
94 | for i in range(span_num):
95 | accs = []
96 | for set_id in set_ids:
97 | accs.append(set_ids2span_acc_list[set_id][i])
98 | span_acc_list.append(sum(accs) / len(accs))
99 | span_std_list.append(np.std(np.array(accs)))
100 |
101 | x = [i for i in range(span_num)]
102 |
103 | legend_label = model_name
104 | ax1.errorbar(x, span_acc_list, yerr=span_std_list,
105 | color=color_str, linewidth=3, marker='o', markersize=10, linestyle=linestyle,
106 | ecolor=ecolor_str, elinewidth=3, capsize=6, label=legend_label)
107 |
108 | print(model_name, 'Statistics:')
109 | long_acc = sum(span_acc_list) / len(span_acc_list)
110 | print('long avg:', long_acc)
111 | print('max-min gap:', max(span_acc_list) - min(span_acc_list))
112 | print('\n')
113 |
114 |
115 | x = [i for i in range(span_num)]
116 | x_tickets = []
117 | for i in range(span_num):
118 | span_name = str(span_len * i + span_len)
119 | x_tickets.append(span_name)
120 |
121 |
122 | ax1.set_xticks(x)
123 | ax1.set_xticklabels(x_tickets, fontsize=fontsize*0.5, rotation=45)
124 | ax1.set_xlabel('Relative Positions in ' + str(total_len) + ' Sentences', fontsize=fontsize*1.5)
125 |
126 | ax1.set_yticks([0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
127 | ax1.set_yticklabels([0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], fontsize=fontsize)
128 | ax1.set_ylabel('Performance (%)', fontsize=fontsize*1.5)
129 |
130 | ax1.set_title('Document Sentence Retrieval (Bi-Direction)', fontsize=fontsize*1.5)
131 |
132 | ax1.legend(loc='lower left', fontsize=fontsize)
133 |
134 |
135 |
####### Code
# Code Function Retrieval (Backward): for every model, plot the accuracy of
# each span of positions on ax2 (mean and standard deviation across sets).

total_len = 800  # number of positions on the x-axis
span_len = 50  # positions pooled into one plotted point
span_num = total_len // span_len

set_ids = ['set_' + str(i) for i in range(4)]

# Ground-truth records; they are paired with the predictions by line order.
with open('./VaLProbing-32K/code_backward_32k.jsonl', 'r', encoding='utf-8') as f:
    label_infos = []
    for line in tqdm(f.readlines()):
        info = json.loads(line)
        label_infos.append({
            'gt': info['completion'],
            'position_id': info['position_id'],
            'set_id': 'set_' + str(info['set_id']),
        })

with open('./VaLProbing-32K/code_backward_32k_skip_list.json', 'r', encoding='utf-8') as f:
    skip_list = json.load(f)
# Membership is tested once per sample, so use a set instead of scanning the list.
skip_ids = frozenset(skip_list)

for model_name, color_str, ecolor_str, linestyle in model_infos:
    # set id -> position id -> list of 0/1 correctness flags
    set_ids_position2acc = {
        set_id: {position_id: [] for position_id in range(total_len)}
        for set_id in set_ids
    }

    with open('./VaLProbing-32K/results/' + model_name + '/sample_code_backward_32k.jsonl', 'r', encoding='utf-8') as f:
        pred_infos = [json.loads(line) for line in f]

    for idx, (pred_info, label_info) in enumerate(tqdm(zip(pred_infos, label_infos))):

        if idx in skip_ids:
            continue

        # GPT-4 result files store one string under 'sample'; the other
        # models store a list under 'samples', of which the first is used.
        if 'gpt4' in model_name:
            pred = pred_info['sample']
        else:
            pred = pred_info['samples'][0]

        # A prediction is correct when it contains the ground truth
        # (leading/trailing periods of the ground truth are ignored).
        hit = 1 if label_info['gt'].strip('.') in pred else 0
        set_ids_position2acc[label_info['set_id']][label_info['position_id']].append(hit)

    # Pool the flags of all positions in a span into one accuracy per set.
    set_ids2span_acc_list = {}
    for set_id in set_ids:
        set_ids2span_acc_list[set_id] = []
        for i in range(span_num):
            span_start = span_len * i
            # range() is end-exclusive, so the end must be span_start + span_len
            # for the last position of the span to be included.
            span_end = span_start + span_len
            accs = []
            for position_id in range(span_start, span_end):
                accs += set_ids_position2acc[set_id][position_id]
            set_ids2span_acc_list[set_id].append(sum(accs) / len(accs))

    # Mean and standard deviation of each span across the sets.
    span_acc_list = []
    span_std_list = []
    for i in range(span_num):
        accs = [set_ids2span_acc_list[set_id][i] for set_id in set_ids]
        span_acc_list.append(sum(accs) / len(accs))
        span_std_list.append(np.std(np.array(accs)))

    x = list(range(span_num))

    legend_label = model_name
    ax2.errorbar(x, span_acc_list, yerr=span_std_list,
                 color=color_str, linewidth=3, marker='o', markersize=10, linestyle=linestyle,
                 ecolor=ecolor_str, elinewidth=3, capsize=6, label=legend_label)

    print(model_name, 'Statistics:')
    long_acc = sum(span_acc_list) / len(span_acc_list)
    print('long avg:', long_acc)
    print('max-min gap:', max(span_acc_list) - min(span_acc_list))
    print('\n')


# Each tick is labelled with the upper bound of its span (50, 100, ...).
x = list(range(span_num))
x_tickets = [str(span_len * i + span_len) for i in range(span_num)]

ax2.set_xticks(x)
ax2.set_xticklabels(x_tickets, fontsize=fontsize*0.5, rotation=45)
ax2.set_xlabel('Relative Positions in ' + str(total_len) + ' Functions', fontsize=fontsize*1.5)

ax2.set_yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax2.set_yticklabels([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=fontsize)
# ax1.set_ylabel('Accuracy (%)')

ax2.set_title('Code Function Retrieval (Backward)', fontsize=fontsize*1.5)

# ax2.legend(loc='lower left', fontsize=fontsize*0.8)
242 |
243 |
244 |
245 |
246 |
247 |
####### Structure-Data
# Database Entity Retrieval (Forward): for every model, plot the accuracy of
# each span of positions on ax3 (mean and standard deviation across sets).

total_len = 750  # number of positions on the x-axis
span_len = 50  # positions pooled into one plotted point
span_num = total_len // span_len

set_ids = ['set_' + str(i) for i in range(4)]

# Ground-truth records; they are paired with the predictions by line order.
with open('./VaLProbing-32K/database_forward_32k.jsonl', 'r', encoding='utf-8') as f:
    label_infos = []
    for line in tqdm(f.readlines()):
        info = json.loads(line)
        label_infos.append({
            'gt_label': info['label'],
            'gt_description': info['description'],
            'position_id': info['position_id'],
            'set_id': 'set_' + str(info['set_id']),
        })

with open('./VaLProbing-32K/database_forward_32k_skip_list.json', 'r', encoding='utf-8') as f:
    skip_list = json.load(f)
# Membership is tested once per sample, so use a set instead of scanning the list.
skip_ids = frozenset(skip_list)

for model_name, color_str, ecolor_str, linestyle in model_infos:
    # set id -> position id -> list of 0/1 correctness flags
    set_ids_position2acc = {
        set_id: {position_id: [] for position_id in range(total_len)}
        for set_id in set_ids
    }

    with open('./VaLProbing-32K/results/' + model_name + '/sample_database_forward_32k.jsonl', 'r', encoding='utf-8') as f:
        pred_infos = [json.loads(line) for line in f]

    for idx, (pred_info, label_info) in enumerate(tqdm(zip(pred_infos, label_infos))):

        if idx in skip_ids:
            continue

        # GPT-4 result files store one string under 'sample'; the other
        # models store a list under 'samples', of which the first is used.
        if 'gpt4' in model_name:
            pred = pred_info['sample']
        else:
            pred = pred_info['samples'][0]

        # Case-insensitive match: the prediction is correct when it contains
        # either the label or the description of the ground-truth record.
        pred_lower = pred.lower()
        gt_label = label_info['gt_label'].strip('.').lower()
        gt_description = label_info['gt_description'].strip('.').lower()
        hit = 1 if (gt_label in pred_lower or gt_description in pred_lower) else 0
        set_ids_position2acc[label_info['set_id']][label_info['position_id']].append(hit)

    # Pool the flags of all positions in a span into one accuracy per set.
    set_ids2span_acc_list = {}
    for set_id in set_ids:
        set_ids2span_acc_list[set_id] = []
        for i in range(span_num):
            span_start = span_len * i
            # range() is end-exclusive, so the end must be span_start + span_len
            # for the last position of the span to be included.
            span_end = span_start + span_len
            accs = []
            for position_id in range(span_start, span_end):
                accs += set_ids_position2acc[set_id][position_id]
            set_ids2span_acc_list[set_id].append(sum(accs) / len(accs))

    # Mean and standard deviation of each span across the sets.
    span_acc_list = []
    span_std_list = []
    for i in range(span_num):
        accs = [set_ids2span_acc_list[set_id][i] for set_id in set_ids]
        span_acc_list.append(sum(accs) / len(accs))
        span_std_list.append(np.std(np.array(accs)))

    x = list(range(span_num))

    legend_label = model_name
    ax3.errorbar(x, span_acc_list, yerr=span_std_list,
                 color=color_str, linewidth=3, marker='o', markersize=10, linestyle=linestyle,
                 ecolor=ecolor_str, elinewidth=3, capsize=6, label=legend_label)

    print(model_name, 'Statistics:')
    long_acc = sum(span_acc_list) / len(span_acc_list)
    print('long avg:', long_acc)
    print('max-min gap:', max(span_acc_list) - min(span_acc_list))
    print('\n')


# Each tick is labelled with the upper bound of its span (50, 100, ...).
x = list(range(span_num))
x_tickets = [str(span_len * i + span_len) for i in range(span_num)]

ax3.set_xticks(x)
ax3.set_xticklabels(x_tickets, fontsize=fontsize*0.5, rotation=45)
ax3.set_xlabel('Relative Positions in ' + str(total_len) + ' Entities', fontsize=fontsize*1.5)

ax3.set_yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax3.set_yticklabels([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], fontsize=fontsize)
# ax1.set_ylabel('Accuracy (%)')

ax3.set_title('Database Entity Retrieval (Forward)', fontsize=fontsize*1.5)

# ax3.legend(loc='lower left', fontsize=fontsize*0.8)
354 |
355 |
356 |
357 |
# Tighten the margins of the current figure, then write it to a PDF.
plt.gcf().subplots_adjust(left=0.05, right=0.97, bottom=0.1, top=0.95)
# plt.show()
# The context manager closes the PDF even if savefig() raises.
with PdfPages('./figures/probing_fig.pdf') as pp:
    pp.savefig(fig)
363 |
364 |
365 |
--------------------------------------------------------------------------------
/figures/probing_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/figures/probing_results.png
--------------------------------------------------------------------------------
/figures/probing_results_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/figures/probing_results_new.png
--------------------------------------------------------------------------------
/figures/real_world_long.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/figures/real_world_long.png
--------------------------------------------------------------------------------
/figures/short.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/figures/short.png
--------------------------------------------------------------------------------
/real_world_long/README.md:
--------------------------------------------------------------------------------
1 | ## Real-World Long-Context Tasks
2 |
3 | The following guidance will help you reproduce our results on real-world long-context tasks.
4 | The test prompts and evaluation scripts are modified from [LongBench](https://github.com/THUDM/LongBench).
5 |
6 | **Step 1: Extract Data and Inference with vLLM.**
7 |
8 | The test data in `./prompts/` have been formatted into the system template for FILM-7B.
9 | The filenames indicate the max output length for different tasks during inference, following the default settings in LongBench.
10 | ```bash
11 | # Extract Data
12 | cd ./prompts/
13 | unzip LongBench_output_32_64.zip
14 | unzip LongBench_output_128_512.zip
15 | cd ..
16 |
17 | # Inference
18 | export NCCL_IGNORE_DISABLED_P2P=1
19 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
20 | --testdata_file LongBench_output_32.jsonl \
21 | --testdata_folder ./prompts/ \
22 | --output_folder ./results/FILM-7B/ \
23 | --max_length 32 \
24 | --tensor_parallel_size 8
25 |
26 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
27 | --testdata_file LongBench_output_64.jsonl \
28 | --testdata_folder ./prompts/ \
29 | --output_folder ./results/FILM-7B/ \
30 | --max_length 64 \
31 | --tensor_parallel_size 8
32 |
33 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
34 | --testdata_file LongBench_output_128.jsonl \
35 | --testdata_folder ./prompts/ \
36 | --output_folder ./results/FILM-7B/ \
37 | --max_length 128 \
38 | --tensor_parallel_size 8
39 |
40 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
41 | --testdata_file LongBench_output_512.jsonl \
42 | --testdata_folder ./prompts/ \
43 | --output_folder ./results/FILM-7B/ \
44 | --max_length 512 \
45 | --tensor_parallel_size 8
46 | ```
47 |
48 | We provide our generation results in `./results/`, including FILM-7B, Mistral-7B-Instruct-v0.2, and GPT-4-Turbo.
49 |
50 | **Step 2: Evaluation.**
51 |
52 | Run `evaluate.py` to calculate evaluation metrics on different tasks.
53 | ```bash
54 | python evaluate.py
55 | ```
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/real_world_long/evaluate.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | import os
4 | import json
5 | import pdb
6 |
7 | from metrics import (
8 | qa_f1_score,
9 | rouge_score,
10 | )
11 |
# Question-answering tasks, scored with qa_f1_score.
_qa_datasets = [
    "narrativeqa", "qasper", "multifieldqa_en",
    "hotpotqa", "2wikimqa", "musique",
]

# Summarization tasks, scored with rouge_score.
_summarization_datasets = ["gov_report", "qmsum", "multi_news"]

# Every evaluated dataset, in reporting order.
datasets = _qa_datasets + _summarization_datasets

# Dataset name -> metric function used to score its predictions.
dataset2metric = dict.fromkeys(_qa_datasets, qa_f1_score)
dataset2metric.update(dict.fromkeys(_summarization_datasets, rouge_score))
29 |
def scorer(dataset, predictions, answers, all_classes):
    """Return the average best-reference score for *dataset*, as a percentage.

    Every prediction is scored against each of its reference answers with
    ``dataset2metric[dataset]`` and the highest score is kept (never below 0).

    Args:
        dataset: Dataset name, used to pick the metric.
        predictions: Predicted strings.
        answers: For each prediction, the list of its reference answers.
        all_classes: Forwarded to the metric as the ``all_classes`` keyword.

    Returns:
        ``100 * mean(best score)`` rounded to two decimals, or ``0.0`` when
        there are no predictions to score.
    """
    # Without this guard an empty dataset would divide by zero below.
    if not predictions:
        return 0.0

    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset == "samsum":
            # Only the first non-empty line of the generation is scored.
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)
40 |
# Models whose generations are stored under ./results/<model_name>/.
model_names = ['FILM-7B', 'Mistral-7B-Instruct-v0.2', 'gpt4-turbo']

# Label files (under ./prompts/) and prediction files (under
# ./results/<model_name>/); the two lists are paired by position.
label_filenames = [
    'LongBench_output_32.jsonl',
    'LongBench_output_64.jsonl',
    'LongBench_output_128.jsonl',
    'LongBench_output_512.jsonl',
]

pred_filenames = [
    'sample_LongBench_output_32.jsonl',
    'sample_LongBench_output_64.jsonl',
    'sample_LongBench_output_128.jsonl',
    'sample_LongBench_output_512.jsonl',
]

for model_name in model_names:
    print(model_name)

    # dataset name -> collected predictions, reference answers and classes
    dataset2infos = {
        dataset: {'predictions': [], 'answers': [], 'all_classes': None}
        for dataset in datasets
    }

    for pred_filename, label_filename in zip(pred_filenames, label_filenames):
        with open(os.path.join('results', model_name, pred_filename), 'r', encoding='utf-8') as f_read:
            pred_infos = [json.loads(line) for line in f_read]

        with open(os.path.join('prompts', label_filename), 'r', encoding='utf-8') as f_read:
            label_infos = [json.loads(line) for line in f_read]

        # Predictions and labels are paired by line, so the counts must match.
        # An explicit raise is used because `assert` is stripped under `python -O`.
        if len(pred_infos) != len(label_infos):
            raise ValueError(
                f'{pred_filename} has {len(pred_infos)} predictions but '
                f'{label_filename} has {len(label_infos)} labels'
            )

        for pred_info, label_info in zip(pred_infos, label_infos):
            # GPT-4 result files store one string under 'sample'; the other
            # models store a list under 'samples', of which the first is used.
            if 'gpt4' in model_name:
                pred = pred_info['sample']
            else:
                pred = pred_info['samples'][0]

            # A missing generation is left out of the score entirely.
            # NOTE(review): the previous code assigned pred = '' right before
            # this `continue`; confirm that skipping (rather than scoring an
            # empty string) is the intended behavior.
            if pred is None:
                continue

            answers = label_info['answers']
            dataset = label_info['dataset']

            dataset2infos[dataset]['predictions'].append(pred)
            dataset2infos[dataset]['answers'].append(answers)

            all_classes = label_info['all_classes']
            if all_classes:
                dataset2infos[dataset]['all_classes'] = all_classes

    for dataset in datasets:
        infos = dataset2infos[dataset]
        score = scorer(dataset, infos['predictions'], infos['answers'], infos['all_classes'])
        print(dataset, score)
99 |
100 |
--------------------------------------------------------------------------------
/real_world_long/metrics.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | import pdb
4 | import re
5 | import string
6 |
7 | from collections import Counter
8 | from rouge import Rouge
9 |
10 |
def normalize_answer(s):
    """Normalize *s* for token comparison.

    The steps run in this order: lowercase, drop every ASCII punctuation
    character, replace the standalone words "a", "an" and "the" with a
    space, then collapse all runs of whitespace into single spaces.
    """
    punctuation = set(string.punctuation)
    without_punc = "".join(ch for ch in s.lower() if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punc)
    return " ".join(without_articles.split())
28 |
29 |
def rouge_score(prediction, ground_truth, **kwargs):
    """Return the ROUGE-L F-score of *prediction* against *ground_truth*.

    Extra keyword arguments are accepted and ignored so that every metric can
    be called the same way. If the scorer raises, the pair is scored as 0.0.
    """
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except Exception:
        # Best-effort scoring: an unscorable pair counts as 0.0. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        return 0.0
    return scores["rouge-l"]["f"]
37 |
38 |
def f1_score(prediction, ground_truth, **kwargs):
    """Return the F1 overlap between two token sequences.

    Tokens are matched as multisets: a token repeated in both sequences
    counts as many times as it appears in the one where it is rarer.

    Args:
        prediction: Sequence of predicted tokens.
        ground_truth: Sequence of reference tokens.
        **kwargs: Accepted and ignored.

    Returns:
        The F1 score as a float; 0.0 when the sequences share no token
        (which includes either sequence being empty).
    """
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        # Return a float so the result type is the same on every path.
        return 0.0
    precision = num_same / len(prediction)
    recall = num_same / len(ground_truth)
    return (2 * precision * recall) / (precision + recall)
48 |
49 |
def qa_f1_score(prediction, ground_truth, **kwargs):
    """Return the token-level F1 between a QA prediction and its reference.

    Only the part of *prediction* before the first blank line is scored.
    Both strings are passed through ``normalize_answer`` and split on
    whitespace before being compared with ``f1_score``. Extra keyword
    arguments are accepted and ignored.
    """
    first_paragraph, _, _ = prediction.partition('\n\n')
    prediction_tokens = normalize_answer(first_paragraph).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    return f1_score(prediction_tokens, ground_truth_tokens)
59 |
60 |
61 |
--------------------------------------------------------------------------------
/real_world_long/prompts/LongBench_output_128_512.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/real_world_long/prompts/LongBench_output_128_512.zip
--------------------------------------------------------------------------------
/real_world_long/prompts/LongBench_output_32_64.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FILM/20ed4da261039c3b32ba2049ce6e4ba90f3cefbc/real_world_long/prompts/LongBench_output_32_64.zip
--------------------------------------------------------------------------------
/real_world_long/results/FILM-7B/sample_LongBench_output_128.jsonl:
--------------------------------------------------------------------------------
1 | {"samples": [" He is living with the Mulvilles."]}
2 | {"samples": [" Ann is already in love with another man."]}
3 | {"samples": [" On Atlas' mountain"]}
4 | {"samples": [" To propose a plan of escape."]}
5 | {"samples": [" Baron Henry didn't kill Otto because he was afraid of the Baron Conrad's reaction."]}
6 | {"samples": [" at least six hours"]}
7 | {"samples": [" Lisa"]}
8 | {"samples": [" Janosz"]}
9 | {"samples": [" American"]}
10 | {"samples": [" the lying scribe"]}
11 | {"samples": [" His 20th Century experience and understanding of military tactics and strategy."]}
12 | {"samples": [" \"grayish-yellowish stuff,\" \"number on a large disk of metal strapped round the left arm,\" \"smelling rather strongly of carbolic,\" \"quite hairless\""]}
13 | {"samples": [" fifty years"]}
14 | {"samples": [" Virginie"]}
15 | {"samples": [" Pierre Grassou"]}
16 | {"samples": [" Vigo's painting comes to life and the slime spreads over the painting."]}
17 | {"samples": [" Elder Childers argues that he is calm and sure of himself."]}
18 | {"samples": [" His 20th Century experience and understanding of military tactics and strategy."]}
19 | {"samples": [" A woman brushing her long hair before an oval-shaped mirror."]}
20 | {"samples": [" Three"]}
21 | {"samples": [" A family quarrel about money."]}
22 | {"samples": [" 2419 A.D."]}
23 | {"samples": [" To follow the harvest."]}
24 | {"samples": [" A miserable slave"]}
25 | {"samples": [" Lady Coxon is Ruth Anvoy's aunt."]}
26 | {"samples": [" News reporter"]}
27 | {"samples": [" He believes it would be wrong to betray the principles he has always upheld, including not doing evil or returning evil for evil."]}
28 | {"samples": [" Drexl is killed by Clarence."]}
29 | {"samples": [" She gave it up."]}
30 | {"samples": [" twisted into an insane rictus of fear"]}
31 | {"samples": [" Beerbohm is making fun of Soames's work."]}
32 | {"samples": [" one week"]}
33 | {"samples": [" Raval"]}
34 | {"samples": [" She attended to the _shews_ of things, and her opinions were such as the generality approved of."]}
35 | {"samples": [" Anderson's ability to work with chlorophage."]}
36 | {"samples": [" He was worried about Ruth because he was afraid she might reveal their relationship to the authorities."]}
37 | {"samples": [" Bill was shot by Benson."]}
38 | {"samples": [" To investigate the \"Cornish Horror\" case."]}
39 | {"samples": [" Vigo"]}
40 | {"samples": [" Mary is initially educated with the expectation of a large fortune."]}
41 | {"samples": [" Dana's apartment"]}
42 | {"samples": [" Sadako"]}
43 | {"samples": [" The Prison of Socrates."]}
44 | {"samples": [" Shizuko Yamamura committed suicide."]}
45 | {"samples": [" Seth Lazurus was accused of murder."]}
46 | {"samples": [" He wore a soft black hat of clerical kind, but of Bohemian intention, and a gray waterproof cape which, perhaps because it was waterproof, failed to be romantic."]}
47 | {"samples": [" Baron Henry of Trutz-Drachen"]}
48 | {"samples": [" Mary is initially educated with the expectation of a large fortune."]}
49 | {"samples": [" Baptist"]}
50 | {"samples": [" Miss Vane"]}
51 | {"samples": [" Dark ages, ignorance, superstition, cruelty, wickedness, gentleness, love, peace, justice, wisdom, and redemption."]}
52 | {"samples": [" Tomoko dies."]}
53 | {"samples": [" twisted into an insane rictus of fear"]}
54 | {"samples": [" The Sinsings"]}
55 | {"samples": [" Bill"]}
56 | {"samples": [" They sing the Ghostbusters theme song."]}
57 | {"samples": [" He draws the magnificent income from The Coxon Fund."]}
58 | {"samples": [" Falder"]}
59 | {"samples": [" Not specified in the context."]}
60 | {"samples": [" A check"]}
61 | {"samples": [" A sexless thing"]}
62 | {"samples": [" In the closet."]}
63 | {"samples": [" Sarah"]}
64 | {"samples": [" Eatonville"]}
65 | {"samples": [" Dr. Richards"]}
66 | {"samples": [" JEZZIE"]}
67 | {"samples": [" ship's doctor"]}
68 | {"samples": [" Buy a good house."]}
69 | {"samples": [" Her mother"]}
70 | {"samples": [" one week"]}
71 | {"samples": [" Virginie Vervelle"]}
72 | {"samples": [" She copies the tape and shows it to someone else."]}
73 | {"samples": [" A painter"]}
74 | {"samples": [" They agree to obey the laws and commands of the city."]}
75 | {"samples": [" He jumped."]}
76 | {"samples": [" copying the tape and showing it to someone else"]}
77 | {"samples": [" Brenda Treginnis"]}
78 | {"samples": [" They slept in the same room."]}
79 | {"samples": [" ramble about the garden, admire the flowers, play with the dogs"]}
80 | {"samples": [" security guard"]}
81 | {"samples": [" Radioactive gases"]}
82 | {"samples": [" Izu Pacific Land"]}
83 | {"samples": [" Dick's number in Hollywood."]}
84 | {"samples": [" 100 years"]}
85 | {"samples": [" Holmes upset the watering-pot."]}
86 | {"samples": [" An enemy"]}
87 | {"samples": [" HECTOR FROME"]}
88 | {"samples": [" Louis"]}
89 | {"samples": [" His failure to impress himself on his decade."]}
90 | {"samples": [" Eliza"]}
91 | {"samples": [" Bad Bloods"]}
92 | {"samples": [" In front of Cliff's trailer home."]}
93 | {"samples": [" Oscar is Dana's son."]}
94 | {"samples": [" The laws in the world below will receive him as an enemy."]}
95 | {"samples": [" Open the closet door."]}
96 | {"samples": [" Doctor Nordenfeld"]}
97 | {"samples": [" \"one of the most honest fellows on earth\""]}
98 | {"samples": [" Artists laugh at his work."]}
99 | {"samples": [" Lee Donowitz"]}
100 | {"samples": [" He failed to report himself this last four weeks."]}
101 | {"samples": [" Ruth"]}
102 | {"samples": [" Detroit"]}
103 | {"samples": [" \"My child, I have not always treated you with kindness--God forgive me! do you?\""]}
104 | {"samples": [" \"Are you prepared?\""]}
105 | {"samples": [" No one"]}
106 | {"samples": [" Mary is taught to dance."]}
107 | {"samples": [" A painter"]}
108 | {"samples": [" To fulfill her mother's dying wish."]}
109 | {"samples": [" Mortimer Tregennis"]}
110 | {"samples": [" Jacob first meets Michael Newman in a hotel room."]}
111 | {"samples": [" American"]}
112 | {"samples": [" Bad Bloods"]}
113 | {"samples": [" September 21st. Tuesday."]}
114 | {"samples": [" Ruth Anvoy"]}
115 | {"samples": [" They go to play for the white folks."]}
116 | {"samples": [" The dark or middle ages."]}
117 | {"samples": [" Ryuji is killed by Sadako."]}
118 | {"samples": [" He had one or two places, but he didn't keep them."]}
119 | {"samples": [" He is living with the Mulvilles."]}
120 | {"samples": [" In the closet."]}
121 | {"samples": [" Artists laugh at his work; his name is a term of contempt in the studios."]}
122 | {"samples": [" A cursed videotape."]}
123 | {"samples": [" Alabama was a stewardess."]}
124 | {"samples": [" A demonic presence."]}
125 | {"samples": [" \"your capacity for self-delusion\""]}
126 | {"samples": [" The agreement to obey the laws and the government."]}
127 | {"samples": [" \"one of the most honest fellows on earth\""]}
128 | {"samples": [" Sister Taylor"]}
129 | {"samples": [" Lock him up in the Mayor's barn until Monday."]}
130 | {"samples": [" her brother"]}
131 | {"samples": [" \"Otto of the Silver Hand\""]}
132 | {"samples": [" her brother"]}
133 | {"samples": [" Wyoming Valley, Pennsylvania"]}
134 | {"samples": [" Soames was a figment of my brain."]}
135 | {"samples": [" A family quarrel about money."]}
136 | {"samples": [" Altaira"]}
137 | {"samples": [" Pierre Grassou"]}
138 | {"samples": [" He discovered that she was not in bed."]}
139 | {"samples": [" \"Shoumon bakkari... boukon ga kuru zo.\""]}
140 | {"samples": [" A house in Vendome, known as the property of the late Madame de Merret."]}
141 | {"samples": [" They decided not to give the money to Saltram."]}
142 | {"samples": [" Miss Violet Ray"]}
143 | {"samples": [" Bill"]}
144 | {"samples": [" Tomoko Ouishi and Iwata"]}
145 | {"samples": [" First World War"]}
146 | {"samples": [" He discovers that his own paintings are in Vervelle's collection."]}
147 | {"samples": [" Vietnam field hospital"]}
148 | {"samples": [" Citizens and the law are compared to a father and a child."]}
149 | {"samples": [" The coffin"]}
150 | {"samples": [" 2419 A.D."]}
151 | {"samples": [" Baron Conrad is killed in a battle with his enemies."]}
152 | {"samples": [" Lee grabs the coffee pot off the table and flings hot coffee into Elliot's face."]}
153 | {"samples": [" Eatonville"]}
154 | {"samples": [" Madame de Merret"]}
155 | {"samples": [" Gravener was urging Anvoy to marry him."]}
156 | {"samples": [" By visiting the reading-room of the British Museum in 1997."]}
157 | {"samples": [" Apis"]}
158 | {"samples": [" Two"]}
159 | {"samples": [" Alabama Worley"]}
160 | {"samples": [" Bill"]}
161 | {"samples": [" Elie Magus"]}
162 | {"samples": [" \"Higher and Higher\""]}
163 | {"samples": [" Artists laugh at his work; that his name is a term of contempt in the studios; and that the feuilletons take no notice of his pictures."]}
164 | {"samples": [" Charles, her father's friend's son."]}
165 | {"samples": [" A house in Vendome, known as the property of the late Madame de Merret."]}
166 | {"samples": [" \"Higher and Higher\""]}
167 | {"samples": [" At about a hundred paces from Vendome, on the banks of the Loir."]}
168 | {"samples": [" Over Daisy Taylor"]}
169 | {"samples": [" Bruce Joel Rubin"]}
170 | {"samples": [" Baron Conrad's wound"]}
171 | {"samples": [" Strange panacea in a crystal bowl."]}
172 | {"samples": [" Ville d'Avray"]}
173 | {"samples": [" To rescue his kinsman, William of Roderburg, who was held captive by Baron Conrad."]}
174 | {"samples": [" Bill was shot by Benson."]}
175 | {"samples": [" Sister Thomas"]}
176 | {"samples": [" People were going to stare at him and follow him around and seem afraid of him."]}
177 | {"samples": [" Abby"]}
178 | {"samples": [" Governor of the state"]}
179 | {"samples": [" \"old way\" spelling"]}
180 | {"samples": [" It causes hallucinations and visions of demons."]}
181 | {"samples": [" The mother, her niece, and nephew."]}
182 | {"samples": [" In the bathroom."]}
183 | {"samples": [" Jacob's unit was involved in a massacre."]}
184 | {"samples": [" Leave a crack at the bottom."]}
185 | {"samples": [" On Atlas' mountain"]}
186 | {"samples": [" St. Michaelsburg"]}
187 | {"samples": [" An enemy"]}
188 | {"samples": [" French letters"]}
189 | {"samples": [" The naked beauty of the soul"]}
190 | {"samples": [" In a dark Chicago mill."]}
191 | {"samples": [" Miranda Hope"]}
192 | {"samples": [" Nine pounds"]}
193 | {"samples": [" Holmes upset the watering-pot."]}
194 | {"samples": [" Ryuji is killed by Sadako."]}
195 | {"samples": [" Eatonville"]}
196 | {"samples": [" Mortimer Tregennis"]}
197 | {"samples": [" charred ashes"]}
198 | {"samples": [" American Radioactive Gas Corporation"]}
199 | {"samples": [" Raval"]}
200 | {"samples": [" September 21st. Tuesday."]}
201 | {"samples": [" An expert manually inspected the text field within the tweets to label them as containing fake news, or not containing them."]}
202 | {"samples": [" Ghost-VLAD is an extension of the NetVLAD approach, which was proposed for face recognition by Y. Zhong. It adds Ghost clusters along with the NetVLAD clusters to map any noisy or irrelevant content into ghost clusters and are not included during the feature aggregation stage."]}
203 | {"samples": [" 68.8% to 71.8%"]}
204 | {"samples": [" Context tweets"]}
205 | {"samples": [" FoxNews, CNN, ESPN, New York Times, Time magazine, Huffington Post Weird News, The Guardian, Cartoon Network, Cooking Light, Home Cooking Adventure, Justin Bieber, Nickelodeon, Spongebob, Disney."]}
206 | {"samples": [" Yes"]}
207 | {"samples": [" TrueSkill"]}
208 | {"samples": [" CNN/DailyMail, NYT, XSum"]}
209 | {"samples": [" GM_KL performs better than other approaches on the benchmark word similarity and entailment datasets."]}
210 | {"samples": [" They start with the best performing model and add the best performing model that hasn't been tried yet, keeping it in the ensemble if it improves validation performance and discarding it otherwise."]}
211 | {"samples": [" Friends is from the scripts of the Friends TV sitcom and EmotionPush is from Facebook messenger chats."]}
212 | {"samples": [" English"]}
213 | {"samples": [" IMDb dataset"]}
214 | {"samples": [" The proposed system outperforms strong baseline systems."]}
215 | {"samples": [" Yes"]}
216 | {"samples": [" Twitter posts and news articles related to finance."]}
217 | {"samples": [" unanswerable"]}
218 | {"samples": [" RNN-based NMT and Transformer-NMT"]}
219 | {"samples": [" (1) a regularization term associated with neutral features; (2) the maximum entropy of class distribution regularization term; and (3) the KL divergence between reference and predicted class distribution."]}
220 | {"samples": [" 1) SVM with unigram, bigram, and trigram features, 2) SVM with average word embedding, 3) SVM with average transformed word embeddings, 4) CNN and Recurrent Convolutional Neural Networks, 5) SVM and deep learning models with comment information, 6) UTCNN without user information, 7) UTCNN without the LDA model, 8) UTCNN without comments."]}
221 | {"samples": [" several points"]}
222 | {"samples": [" By allowing attention mappings to dynamically adapt their curvature and sparsity, and by enabling different heads to learn different sparsity behaviors."]}
223 | {"samples": [" The baseline was a context-agnostic MT model."]}
224 | {"samples": [" XNLI test accuracy, Labeled Attachment Scores (LAS)"]}
225 | {"samples": [" ASR, MT and ST respectively"]}
226 | {"samples": [" Stylistic patterns and patterns related to situational disparity."]}
227 | {"samples": [" LSTM"]}
228 | {"samples": [" Yes"]}
229 | {"samples": [" Jasper architecture"]}
230 | {"samples": [" 22,880 users"]}
231 | {"samples": [" BPE perplexity, BLEU-1/4, ROUGE-L, Distinct-1/2, User matching accuracy (UMA), Mean Reciprocal Rank (MRR), recipe-level coherence scores, step entailment score."]}
232 | {"samples": [" \"completed symptoms\", \"to-do symptoms\", \"completed attributes\", \"to-do attributes\""]}
233 | {"samples": [" 5k abstracts"]}
234 | {"samples": [" Four machine translation tasks."]}
235 | {"samples": [" ELMo embeddings show the largest improvement over fastText embeddings."]}
236 | {"samples": [" unanswerable"]}
237 | {"samples": [" yes"]}
238 | {"samples": [" The Nguni languages and the Sotho languages."]}
239 | {"samples": [" 6-layers and 9-layers sMBR models"]}
240 | {"samples": [" 29,794 articles"]}
241 | {"samples": [" A group of 50 native people who were well-versed in both English and Tamil languages acted as annotators for the evaluation."]}
242 | {"samples": [" Yes"]}
243 | {"samples": [" Efficiency of a communication scheme is measured by the retention rate of tokens, and the accuracy is measured as the fraction of sentences generated by greedily decoding the model that exactly matches the target sentence."]}
244 | {"samples": [" Precision, Recall and F-measure"]}
245 | {"samples": [" The source domain is the existing domain with sufficient labeled data, and the target domain is the new domain with very few or no labeled data."]}
246 | {"samples": [" LSTM"]}
247 | {"samples": [" RNN, CNN, QRNN, Transformer, Highway network, Encoder Decoder architecture, Linear/Bi-linear Attention, Full Attention, Bidirectional attention flow, Dropout, Layer Norm, Batch Norm."]}
248 | {"samples": [" The multilingual pronunciation corpus collected by deri2016grapheme and the Carnegie Mellon Pronouncing Dictionary."]}
249 | {"samples": [" unanswerable"]}
250 | {"samples": [" English, Spanish, Finnish"]}
251 | {"samples": [" Named Entity Recognition, POS tagging, text classification, language modeling"]}
252 | {"samples": [" Yes"]}
253 | {"samples": [" yes"]}
254 | {"samples": [" They use the Meaning Extraction Method (MEM) to measure the usage of words related to people's core values."]}
255 | {"samples": [" Claims, premises, backing, rebuttal, and refutation."]}
256 | {"samples": [" n-grams"]}
257 | {"samples": [" 1,873 Twitter conversation threads, roughly 14k tweets"]}
258 | {"samples": [" The 12 languages covered are typologically diverse and include also under-resourced ones, such as Welsh and Kiswahili."]}
259 | {"samples": [" Wikipedia conversations dataset and Reddit CMV dataset."]}
260 | {"samples": [" unanswerable"]}
261 | {"samples": [" Sanity checks, BLEU scores, perplexity, character ratio, and similarity scores."]}
262 | {"samples": [" They combine the information from these sources using a feed-forward neural model."]}
263 | {"samples": [" 2.11 BLEU, 1.7 FKGL and 1.07 SARI."]}
264 | {"samples": [" 700 examples"]}
265 | {"samples": [" A tweet went viral if it was retweeted more than 1000 times."]}
266 | {"samples": [" LSTM-CRF"]}
267 | {"samples": [" crowdsourcing"]}
268 | {"samples": [" Logistic Regression and neural networks"]}
269 | {"samples": [" The benchmark dataset is a dataset built by Lee et al. using a social honeypot to attract social spammers' retweet. Its quality is not explicitly mentioned in the context."]}
270 | {"samples": [" LSTM decoder"]}
271 | {"samples": [" unanswerable"]}
272 | {"samples": [" The best performing model is the ensemble+ of (II and IV) from each of the folds 1-3, i.e., $|{\\mathcal {M}}|=6$ models, ranked at 3rd position."]}
273 | {"samples": [" (b3)"]}
274 | {"samples": [" '0.7033'"]}
275 | {"samples": [" Word embedding techniques such as word2vec"]}
276 | {"samples": [" They use a bilingual dictionary for transfer information from the assisting to the source language."]}
277 | {"samples": [" Yes"]}
278 | {"samples": [" Seven experts with legal training."]}
279 | {"samples": [" CNN-RNN based image-to-poem net and seq2seq model with parallel text corpus for painting embedding; sequence-to-sequence model with attention for language style transfer."]}
280 | {"samples": [" ToBERT"]}
281 | {"samples": [" yes"]}
282 | {"samples": [" Personal attack, racism, and sexism"]}
283 | {"samples": [" They propose extended middle context, a new context representation for CNNs for relation classification."]}
284 | {"samples": [" 4"]}
285 | {"samples": [" higher quality"]}
286 | {"samples": [" 65% of the speakers are men, speaking more than 75% of the time."]}
287 | {"samples": [" English-German dataset"]}
288 | {"samples": [" previous state-of-the-art models"]}
289 | {"samples": [" Logistic Regression (LR) and Multilayer Perceptron (MLP)"]}
290 | {"samples": [" BIBREF17 and BIBREF18"]}
291 | {"samples": [" SQuAD dataset"]}
292 | {"samples": [" Various approaches have been proposed for modelling urban regions and identifying points-of-interest and itineraries."]}
293 | {"samples": [" Yes"]}
294 | {"samples": [" CSAT dataset, 20 newsgroups, Fisher Phase 1 corpus"]}
295 | {"samples": [" IMDb movie review dataset"]}
296 | {"samples": [" Yes"]}
297 | {"samples": [" no"]}
298 | {"samples": [" INLINEFORM0 and INLINEFORM1 exists."]}
299 | {"samples": [" The full catalogue of features, their description, detailed annotation guideline as well as illustrating examples can be found in Appendix."]}
300 | {"samples": [" WikiSmall has 89,042 sentence pairs and WikiLarge has 296,402 sentence pairs."]}
301 | {"samples": [" Vanilla ST baseline, Pre-training baselines, Multi-task baselines, Many-to-many+pre-training, Triangle+pre-train."]}
302 | {"samples": [" English"]}
303 | {"samples": [" SVM, BiLSTM, CNN"]}
304 | {"samples": [" unanswerable"]}
305 | {"samples": [" GloVe, Edinburgh embeddings, Emoji embeddings"]}
306 | {"samples": [" They generated high-quality and specific recipes that align with historical user preferences."]}
307 | {"samples": [" A combination of rewards for irony accuracy, sentiment preservation and content preservation."]}
308 | {"samples": [" The generated English poem may not work well with Shakespeare style transfer as shown in Figure FIGREF12 for \"Starry Night\" with a low average content score."]}
309 | {"samples": [" Affective Text dataset, Fairy Tales dataset, and the ISEAR dataset."]}
310 | {"samples": [" The distribution results are not provided in the context."]}
311 | {"samples": [" From the Stanford Sentiment Analysis Dataset."]}
312 | {"samples": [" Persian and English"]}
313 | {"samples": [" The context of the corresponding text."]}
314 | {"samples": [" Random Forests (RF)"]}
315 | {"samples": [" unanswerable"]}
316 | {"samples": [" 110-hour German-English ST corpus"]}
317 | {"samples": [" The high-quality datasets the challenge organizers released."]}
318 | {"samples": [" BERT$_\\mathrm {BASE}$"]}
319 | {"samples": [" Yes"]}
320 | {"samples": [" unanswerable"]}
321 | {"samples": [" Competitive results"]}
322 | {"samples": [" INLINEFORM0 tagging scheme"]}
323 | {"samples": [" no"]}
324 | {"samples": [" \"make the model more robust and practical\""]}
325 | {"samples": [" InferSent, Universal Sentence Encoder"]}
326 | {"samples": [" +0.29 and +0.97"]}
327 | {"samples": [" Task 1: Quora Duplicate Question Pair Detection and Task 2: Ranking questions in Bing's People Also Ask"]}
328 | {"samples": [" syntactic tree-based models, other neural models"]}
329 | {"samples": [" Relation detection"]}
330 | {"samples": [" A name-based Nearest-Neighbor model (NN) and a simple Encoder-Decoder baseline with ingredient attention (Enc-Dec)."]}
331 | {"samples": [" Tagging descriptions with part-of-speech information, leveraging the structure of Flickr30K Entities, and creating a coreference graph."]}
332 | {"samples": [" French"]}
333 | {"samples": [" They experimented with the architecture without INLINEFORM2."]}
334 | {"samples": [" no"]}
335 | {"samples": [" Sumy package"]}
336 | {"samples": [" BIBREF0"]}
337 | {"samples": [" No"]}
338 | {"samples": [" DTA corpus"]}
339 | {"samples": [" Kannada, Hindi, Telugu, Malayalam, Bengali, and English."]}
340 | {"samples": [" Competitive performance"]}
341 | {"samples": [" significant improvement"]}
342 | {"samples": [" \"unanswerable\""]}
343 | {"samples": [" The ability of the model to detect some biases in the process of collecting or annotating datasets."]}
344 | {"samples": [" Yes"]}
345 | {"samples": [" 14 million words"]}
346 | {"samples": [" +0.58 for MRPC and +0.73 for QQP."]}
347 | {"samples": [" eye-tracking, self-paced reading, and ERP components"]}
348 | {"samples": [" 7 phonemic/syllabic and 4 words"]}
349 | {"samples": [" Pointer-Gen, Pointer-Gen+Pos, Pointer-Gen+Same-FT, Pointer-Gen+Pos-FT, Pointer-Gen+RL-ROUGE, Pointer-Gen+RL-SEN, Pointer-Gen+ARL-SEN."]}
350 | {"samples": [" Traditional machine learning classifiers and neural network based models."]}
351 | {"samples": [" A bi-directional language model and a uni-directional model."]}
352 | {"samples": [" Associated with each training example in proportion to $(1-p)$, and this weight dynamically changes as training proceeds."]}
353 | {"samples": [" Agents utilizing knowledge-graphs in addition to either enhanced exploration method far outperform the baseline A2C and KG-A2C."]}
354 | {"samples": [" Individual Bayesian models for each language."]}
355 | {"samples": [" Foreign words, in this case Spanish words, are also labelled as such."]}
356 | {"samples": [" A semicharacter architecture is a type of neural network architecture that processes a sentence of words with misspelled characters, predicting the correct words at each step."]}
357 | {"samples": [" 16 languages"]}
358 | {"samples": [" NCEL outperforms the state-of-the-art collective methods across five different datasets."]}
359 | {"samples": [" Yes"]}
360 | {"samples": [" The baseline used was the error detection system by Rei2016, trained using the same FCE dataset."]}
361 | {"samples": [" 2010 i2b2/VA"]}
362 | {"samples": [" It allows the decoder to generate context vectors and refine the summary word based on the source document and other words."]}
363 | {"samples": [" PPDB"]}
364 | {"samples": [" TF-IDF features"]}
365 | {"samples": [" Each tweet is annotated as no evidence of depression or evidence of depression with one or more depressive symptoms."]}
366 | {"samples": [" eight publicly available NER tasks"]}
367 | {"samples": [" The machine translation platform Apertium was used for the translation of the datasets."]}
368 | {"samples": [" multinomial Naive Bayes classifier"]}
369 | {"samples": [" A very simple logistic regression classifier with default parameters, where the input instances are represented with a single feature: the length of the sentence."]}
370 | {"samples": [" CRF"]}
371 | {"samples": [" The procedure proposed in BIBREF2 is used to label different outlets."]}
372 | {"samples": [" ancient Chinese history records in several dynasties (about 1000BC-200BC) and articles written by celebrities of that era."]}
373 | {"samples": [" English"]}
374 | {"samples": [" unanswerable"]}
375 | {"samples": [" three convolutional layers"]}
376 | {"samples": [" European network of nature protected sites Natura 2000 dataset"]}
377 | {"samples": [" NUBes-PHI and MEDDOCAN"]}
378 | {"samples": [" Unigrams and Pragmatic features"]}
379 | {"samples": [" Coverage, Avg. MCC, avg. +ve F1 score"]}
380 | {"samples": [" Yes"]}
381 | {"samples": [" Galatasaray and Fenerbah\u00e7e"]}
382 | {"samples": [" Additional experiments on the transformation from ironic sentences to non-ironic sentences."]}
383 | {"samples": [" Gaussian-masked directional multi-head attention is a variant of self-attention that captures representation of different directions to improve the ability of capturing the localness information and position information for the importance of adjacent characters."]}
384 | {"samples": [" Facebook"]}
385 | {"samples": [" 100 baseline features"]}
386 | {"samples": [" number of clusters, seed initialization"]}
387 | {"samples": [" Second (EI-Reg), second (EI-Oc), fourth (V-Reg) and fifth (V-Oc)"]}
388 | {"samples": [" The corpus consists of 53 documents, which contain an average number of 156.1 sentences per document, each with 19.55 tokens on average."]}
389 | {"samples": [" Yes"]}
390 | {"samples": [" Text categorization and sentiment classification."]}
391 | {"samples": [" Li and Roth BIBREF6 model"]}
392 | {"samples": [" The training sets of these versions of ELMo are larger."]}
393 | {"samples": [" 6946 sentences"]}
394 | {"samples": [" MLP, Eusboost, MWMOTE"]}
395 | {"samples": [" Yes"]}
396 | {"samples": [" Yes"]}
397 | {"samples": [" '0.6103'"]}
398 | {"samples": [" The Wall Street Journal (WSJ) portion of the Penn Treebank."]}
399 | {"samples": [" \"Related Work\""]}
400 | {"samples": [" SimpleQuestions and WebQSP"]}
401 |
--------------------------------------------------------------------------------
/real_world_long/results/FILM-7B/sample_LongBench_output_32.jsonl:
--------------------------------------------------------------------------------
1 | {"samples": [" Miller v. California"]}
2 | {"samples": [" Charles L. Clifford"]}
3 | {"samples": [" Lowell, Michigan"]}
4 | {"samples": [" Jupiter"]}
5 | {"samples": [" Chloe (2009 film)"]}
6 | {"samples": [" Parliament"]}
7 | {"samples": [" Kumbakonam"]}
8 | {"samples": [" Pleiospilos"]}
9 | {"samples": [" June 1, 2013"]}
10 | {"samples": [" The Rebirth"]}
11 | {"samples": [" October 13, 1980"]}
12 | {"samples": [" Adam Scott and Naomi Scott"]}
13 | {"samples": [" Mary Astor"]}
14 | {"samples": [" number five"]}
15 | {"samples": [" Macclesfield and Wirral"]}
16 | {"samples": [" 2000 Summer Olympics"]}
17 | {"samples": [" Claudio L\u00f3pez"]}
18 | {"samples": [" Ronald Reagan"]}
19 | {"samples": [" \"We Found Love\""]}
20 | {"samples": [" George Harrison"]}
21 | {"samples": [" Cartoon Cartoon Fridays"]}
22 | {"samples": [" Sydney, New South Wales, Australia"]}
23 | {"samples": [" Long Island"]}
24 | {"samples": [" PewDiePie"]}
25 | {"samples": [" It's Always Sunny in Philadelphia"]}
26 | {"samples": [" 424th"]}
27 | {"samples": [" Ribosomes"]}
28 | {"samples": [" Pistacia"]}
29 | {"samples": [" Band-e Amir"]}
30 | {"samples": [" University of Southern California Trojans"]}
31 | {"samples": [" 7.4 million residents"]}
32 | {"samples": [" A123 Systems"]}
33 | {"samples": [" 15,678"]}
34 | {"samples": [" Taoiseach"]}
35 | {"samples": [" Troy"]}
36 | {"samples": [" Mika H\u00e4kkinen"]}
37 | {"samples": [" Constantine"]}
38 | {"samples": [" Darci Kistler"]}
39 | {"samples": [" 3M"]}
40 | {"samples": [" Summer Magic"]}
41 | {"samples": [" Nobel Prize in Physics"]}
42 | {"samples": [" stop motion animation"]}
43 | {"samples": [" Eileen Atkins"]}
44 | {"samples": [" Iran"]}
45 | {"samples": [" Hakim Abdullah Jamal"]}
46 | {"samples": [" Keith Morris"]}
47 | {"samples": [" YIVO"]}
48 | {"samples": [" January 18, 1813"]}
49 | {"samples": [" Himalchuli"]}
50 | {"samples": [" 1867"]}
51 | {"samples": [" Pope John X"]}
52 | {"samples": [" Manchester United F.C."]}
53 | {"samples": [" Yes"]}
54 | {"samples": [" 2,098"]}
55 | {"samples": [" FIA Formula E Championship"]}
56 | {"samples": [" Yes"]}
57 | {"samples": [" Mississippi State"]}
58 | {"samples": [" Floyd Casey Stadium"]}
59 | {"samples": [" 2010"]}
60 | {"samples": [" Gaja in Sanskrit means elephant and 'Brishta' is the back or hip portion of a sitting elephant."]}
61 | {"samples": [" British"]}
62 | {"samples": [" Yes"]}
63 | {"samples": [" November 23, 2002"]}
64 | {"samples": [" Drea de Matteo"]}
65 | {"samples": [" British"]}
66 | {"samples": [" Vernon L. Smith"]}
67 | {"samples": [" Deftones"]}
68 | {"samples": [" 796"]}
69 | {"samples": [" India"]}
70 | {"samples": [" Yes"]}
71 | {"samples": [" 1939"]}
72 | {"samples": [" Philip K. Dick"]}
73 | {"samples": [" 821"]}
74 | {"samples": [" Capital Cities"]}
75 | {"samples": [" Morgan Llywelyn"]}
76 | {"samples": [" 14 points"]}
77 | {"samples": [" Big 12 Conference"]}
78 | {"samples": [" Brittany, Cornwall, Ireland, the Isle of Man, Scotland, and Wales."]}
79 | {"samples": [" American BNC connector"]}
80 | {"samples": [" Wicked Twister"]}
81 | {"samples": [" Pamela B. Green"]}
82 | {"samples": [" Man Haron Monis"]}
83 | {"samples": [" Hawaii"]}
84 | {"samples": [" 100m"]}
85 | {"samples": [" Due to the onset and progression of Alzheimer's disease."]}
86 | {"samples": [" Yes"]}
87 | {"samples": [" Leucippus"]}
88 | {"samples": [" Hollywood, Florida"]}
89 | {"samples": [" Skyscraper"]}
90 | {"samples": [" 1895"]}
91 | {"samples": [" Pac-12 Conference"]}
92 | {"samples": [" 7.00pm"]}
93 | {"samples": [" No"]}
94 | {"samples": [" writer"]}
95 | {"samples": [" No"]}
96 | {"samples": [" 25,000"]}
97 | {"samples": [" Indianapolis, Indiana"]}
98 | {"samples": [" 1891"]}
99 | {"samples": [" Bill McCutcheon"]}
100 | {"samples": [" Field Marshal John Standish Surtees Prendergast Vereker, 6th Viscount Gort"]}
101 | {"samples": [" Polk County"]}
102 | {"samples": [" James II"]}
103 | {"samples": [" \"Pinball Wizard\""]}
104 | {"samples": [" 2006"]}
105 | {"samples": [" Ovambo people"]}
106 | {"samples": [" American"]}
107 | {"samples": [" Michael Joseph Swango"]}
108 | {"samples": [" Juan Rulfo"]}
109 | {"samples": [" Merck & Co."]}
110 | {"samples": [" 1961"]}
111 | {"samples": [" U.S. representative for Montana's at-large congressional district"]}
112 | {"samples": [" Alice's Adventures in Wonderland"]}
113 | {"samples": [" Vienna"]}
114 | {"samples": [" Yes"]}
115 | {"samples": [" Albert Park"]}
116 | {"samples": [" filmmaker"]}
117 | {"samples": [" Jeffersontown"]}
118 | {"samples": [" John Musker, Ron Clements, Dave Michener, and Burny Mattinson"]}
119 | {"samples": [" WAMC"]}
120 | {"samples": [" Bassendean"]}
121 | {"samples": [" diving duck"]}
122 | {"samples": [" Mimosa"]}
123 | {"samples": [" Both triangles are associated with paranormal phenomena, including sightings of Bigfoot-like creatures, giant snakes, and thunderbirds."]}
124 | {"samples": [" FEMSA"]}
125 | {"samples": [" Ellie Kemper"]}
126 | {"samples": [" Yes"]}
127 | {"samples": [" Umina Beach"]}
128 | {"samples": [" Metro Manila"]}
129 | {"samples": [" NASA Astrobiology Institute"]}
130 | {"samples": [" Suining"]}
131 | {"samples": [" video game publisher"]}
132 | {"samples": [" Yes"]}
133 | {"samples": [" Russian physicist"]}
134 | {"samples": [" Elvis' Christmas Album"]}
135 | {"samples": [" Operation Iceberg"]}
136 | {"samples": [" Logar Province"]}
137 | {"samples": [" BBC Formula One coverage"]}
138 | {"samples": [" Plato"]}
139 | {"samples": [" Northern Ireland"]}
140 | {"samples": [" Joe Gooch"]}
141 | {"samples": [" Both are poets."]}
142 | {"samples": [" Myst\u00e8re"]}
143 | {"samples": [" Houston Oilers"]}
144 | {"samples": [" \"father of liberalism\""]}
145 | {"samples": [" 35 people"]}
146 | {"samples": [" Currer Bell"]}
147 | {"samples": [" No"]}
148 | {"samples": [" 2014"]}
149 | {"samples": [" Southern (Dolomitic) Alps"]}
150 | {"samples": [" 1970"]}
151 | {"samples": [" French forces"]}
152 | {"samples": [" University of Vienna"]}
153 | {"samples": [" Brian Stokes Mitchell"]}
154 | {"samples": [" Netflix"]}
155 | {"samples": [" No"]}
156 | {"samples": [" Louisville, Kentucky"]}
157 | {"samples": [" Sonic Mania"]}
158 | {"samples": [" Mark Donohue"]}
159 | {"samples": [" acting career"]}
160 | {"samples": [" Veyil (2006)"]}
161 | {"samples": [" Kentucky Derby"]}
162 | {"samples": [" Neighbours"]}
163 | {"samples": [" Bill Miner"]}
164 | {"samples": [" Manistee County"]}
165 | {"samples": [" 4,000 capacity"]}
166 | {"samples": [" Atlantic Ocean"]}
167 | {"samples": [" filmmaker"]}
168 | {"samples": [" Start"]}
169 | {"samples": [" Green and yellow"]}
170 | {"samples": [" Claire Fraser is a married World War II nurse who finds herself transported back to 1743 Scotland and falls in love with the Highland warrior"]}
171 | {"samples": [" The Henry Jackson Society"]}
172 | {"samples": [" City of Peace"]}
173 | {"samples": [" William Dieterle"]}
174 | {"samples": [" Michael Tippett"]}
175 | {"samples": [" Babylon"]}
176 | {"samples": [" Ten Walls"]}
177 | {"samples": [" How to Train Your Dragon"]}
178 | {"samples": [" 1978"]}
179 | {"samples": [" J\u0101nis Strazdi\u0146\u0161"]}
180 | {"samples": [" English physician, writer, progressive intellectual and social reformer"]}
181 | {"samples": [" Qin dynasty"]}
182 | {"samples": [" violinist"]}
183 | {"samples": [" Arizona State University"]}
184 | {"samples": [" Yes"]}
185 | {"samples": [" Switchfoot"]}
186 | {"samples": [" Yes"]}
187 | {"samples": [" Brothers Quay"]}
188 | {"samples": [" 72 feet (22 m)"]}
189 | {"samples": [" Sawin Millett is a Maine politician who served as the Commissioner of Administrative and Financial Services in the administration of Governor Paul LePage from 2"]}
190 | {"samples": [" Marcus Dent"]}
191 | {"samples": [" River Thames"]}
192 | {"samples": [" Allure"]}
193 | {"samples": [" Jaleel Ahmad White"]}
194 | {"samples": [" Lake Wallace"]}
195 | {"samples": [" Over 100 million records"]}
196 | {"samples": [" Totally Tom"]}
197 | {"samples": [" 1998"]}
198 | {"samples": [" William"]}
199 | {"samples": [" Jerry Bingham"]}
200 | {"samples": [" No"]}
201 | {"samples": [" Gyulafeh\u00e9rv\u00e1r, Transylvania"]}
202 | {"samples": [" Princess Elizabeth"]}
203 | {"samples": [" Op\u00e9ra National de Lyon"]}
204 | {"samples": [" Yes"]}
205 | {"samples": [" 1532"]}
206 | {"samples": [" Lewis Watson, 1st Earl of Rockingham"]}
207 | {"samples": [" 16 January 1373"]}
208 | {"samples": [" Alboin"]}
209 | {"samples": [" Abbey Theatre playwright"]}
210 | {"samples": [" Unusual Films"]}
211 | {"samples": [" The House Of The Seven Hawks"]}
212 | {"samples": [" Marie Of Brabant"]}
213 | {"samples": [" William Busac"]}
214 | {"samples": [" 14 August 1846"]}
215 | {"samples": [" American"]}
216 | {"samples": [" Maria Teresa, Grand Duchess of Luxembourg"]}
217 | {"samples": [" Indradhanura Chhai"]}
218 | {"samples": [" No"]}
219 | {"samples": [" Yes"]}
220 | {"samples": [" Madrid"]}
221 | {"samples": [" Beaulieu-sur-Loire"]}
222 | {"samples": [" Lee Strasberg Theatre and Film Institute"]}
223 | {"samples": [" The Soviet Story"]}
224 | {"samples": [" Cipriano Castro"]}
225 | {"samples": [" The Year Of The Rabbit"]}
226 | {"samples": [" Altu\u011f \u00c7elikbilek"]}
227 | {"samples": [" Jugband Blues"]}
228 | {"samples": [" No"]}
229 | {"samples": [" Rathold R\u00e1t\u00f3t"]}
230 | {"samples": [" Kathmandu Valley"]}
231 | {"samples": [" Arthur Acheson, 1st Earl of Gosford"]}
232 | {"samples": [" John Paul Getty Jr."]}
233 | {"samples": [" Goring-on-Thames, Oxfordshire"]}
234 | {"samples": [" Yes"]}
235 | {"samples": [" Wooden Crosses"]}
236 | {"samples": [" Above Rubies"]}
237 | {"samples": [" The Longshot"]}
238 | {"samples": [" Castlemaine, Victoria, Australia"]}
239 | {"samples": [" Paul De Scherff"]}
240 | {"samples": [" Y\u0131ld\u0131z Palace"]}
241 | {"samples": [" Melun"]}
242 | {"samples": [" Tex And The Lord Of The Deep"]}
243 | {"samples": [" Wales"]}
244 | {"samples": [" Lagu Kenangan"]}
245 | {"samples": [" Buenos Aires"]}
246 | {"samples": [" Magdalene Sibylle of Holstein-Gottorp"]}
247 | {"samples": [" Brunswick Cathedral"]}
248 | {"samples": [" Saig\u014d Takamori"]}
249 | {"samples": [" Bajo Otro Sol"]}
250 | {"samples": [" Marie Lafor\u00eat"]}
251 | {"samples": [" Sandra Nelson"]}
252 | {"samples": [" 16 June 1705"]}
253 | {"samples": [" Pamplona"]}
254 | {"samples": [" Miley Naa Miley Hum"]}
255 | {"samples": [" Space Probe Taurus"]}
256 | {"samples": [" Seven In The Sun"]}
257 | {"samples": [" Vacations In Majorca"]}
258 | {"samples": [" Changeland"]}
259 | {"samples": [" Peter Rosegger"]}
260 | {"samples": [" Vasily Karatygin"]}
261 | {"samples": [" Jessi Colter"]}
262 | {"samples": [" Yes"]}
263 | {"samples": [" Erich Haenisch"]}
264 | {"samples": [" American"]}
265 | {"samples": [" Yes"]}
266 | {"samples": [" Melody Of The World"]}
267 | {"samples": [" Seville"]}
268 | {"samples": [" Audrey Davis Levin"]}
269 | {"samples": [" The Great Man'S Lady"]}
270 | {"samples": [" Oklahoma City, Oklahoma"]}
271 | {"samples": [" Beaulieu-sur-Loire"]}
272 | {"samples": [" La Trinit\u00e9, a commune within the Nice metropolitan area."]}
273 | {"samples": [" Prince Albert of Prussia"]}
274 | {"samples": [" Yes"]}
275 | {"samples": [" Beaulieu-sur-Loire"]}
276 | {"samples": [" Sir Paul Gore, 1st Baronet"]}
277 | {"samples": [" Egypt"]}
278 | {"samples": [" Dayton, Ohio"]}
279 | {"samples": [" Fay Wray"]}
280 | {"samples": [" The Drover's Sweetheart"]}
281 | {"samples": [" Baraguru village in the Tumkur district, Karnataka state."]}
282 | {"samples": [" Det Sande Ansigt"]}
283 | {"samples": [" Il Gaucho"]}
284 | {"samples": [" Beaulieu-sur-Loire"]}
285 | {"samples": [" The Pyrammmid"]}
286 | {"samples": [" No"]}
287 | {"samples": [" Eric XIV was most likely murdered."]}
288 | {"samples": [" American"]}
289 | {"samples": [" Hawai\u02bbi"]}
290 | {"samples": [" X-Paroni"]}
291 | {"samples": [" August Underground's Penance"]}
292 | {"samples": [" Lee Kun-hee"]}
293 | {"samples": [" Charles I, Duke of Bourbon"]}
294 | {"samples": [" 27 June 1839"]}
295 | {"samples": [" Gura Humorului, Austro-Hungarian Empire (now Romania)"]}
296 | {"samples": [" Hadi Kalafate"]}
297 | {"samples": [" Charles Wheatstone"]}
298 | {"samples": [" Washington, D.C."]}
299 | {"samples": [" London Melody"]}
300 | {"samples": [" Fernando Fla\u00ednez"]}
301 | {"samples": [" Mira Sorvino"]}
302 | {"samples": [" No"]}
303 | {"samples": [" Denmark"]}
304 | {"samples": [" Not specified in the context."]}
305 | {"samples": [" Eleanor of Castile died in Harby near Lincoln."]}
306 | {"samples": [" Hong Kong Film Award for Best Director"]}
307 | {"samples": [" 3 September 1992"]}
308 | {"samples": [" Nathaniel McLenaghan"]}
309 | {"samples": [" Tisch School of the Arts"]}
310 | {"samples": [" Cuchillos De Fuego"]}
311 | {"samples": [" Moment Of Danger"]}
312 | {"samples": [" Rock Street Journal"]}
313 | {"samples": [" The Piper's Price"]}
314 | {"samples": [" No"]}
315 | {"samples": [" 13 March 1753"]}
316 | {"samples": [" True To The Navy"]}
317 | {"samples": [" Marshall, Indiana"]}
318 | {"samples": [" Alkohol"]}
319 | {"samples": [" Many Tanks Mr. Atkins"]}
320 | {"samples": [" No"]}
321 | {"samples": [" New York City"]}
322 | {"samples": [" Tiger In The Smoke"]}
323 | {"samples": [" Mumbai"]}
324 | {"samples": [" F The Prom"]}
325 | {"samples": [" Waiting For The Clouds"]}
326 | {"samples": [" Dhuen Ki Lakeer"]}
327 | {"samples": [" Perd\u00f3n, Viejita"]}
328 | {"samples": [" University of Wisconsin-Madison"]}
329 | {"samples": [" Dudley Russell"]}
330 | {"samples": [" Vytautas Strai\u017eys"]}
331 | {"samples": [" In 1992."]}
332 | {"samples": [" Eindhoven"]}
333 | {"samples": [" Women's Suffrage Journal"]}
334 | {"samples": [" Fairmont, West Virginia"]}
335 | {"samples": [" Mayor Muthanna"]}
336 | {"samples": [" Yes"]}
337 | {"samples": [" Marrakesh"]}
338 | {"samples": [" Beaulieu-sur-Loire"]}
339 | {"samples": [" Solemn Promise"]}
340 | {"samples": [" Wolf Warrior"]}
341 | {"samples": [" Tr\u1ecbnh C\u01b0\u01a1ng"]}
342 | {"samples": [" Marshall, Indiana"]}
343 | {"samples": [" South Central Los Angeles"]}
344 | {"samples": [" Winter Sleepers"]}
345 | {"samples": [" Adolf III of Berg"]}
346 | {"samples": [" No"]}
347 | {"samples": [" M\u00fclheim an der Ruhr"]}
348 | {"samples": [" Una Prostituta Al Servizio Del Pubblico E In Regola Con Le Leggi Dello Stato"]}
349 | {"samples": [" Yes"]}
350 | {"samples": [" Prince Bhanurangsi Savangwongse"]}
351 | {"samples": [" Elizabeth Heneage"]}
352 | {"samples": [" \u00c9ric Rohmer"]}
353 | {"samples": [" Newport, Monmouthshire, Wales"]}
354 | {"samples": [" Gordonsville, Virginia"]}
355 | {"samples": [" Mangalia, Romania"]}
356 | {"samples": [" F\u00e9lix Leclerc"]}
357 | {"samples": [" Georges I, Duke of Saxe-Meiningen"]}
358 | {"samples": [" Stephen I of Hungary"]}
359 | {"samples": [" Queen Yi Jiang"]}
360 | {"samples": [" 3 November 1867"]}
361 | {"samples": [" Richard Cooper"]}
362 | {"samples": [" 1813"]}
363 | {"samples": [" Season Of Strangers"]}
364 | {"samples": [" No"]}
365 | {"samples": [" Antoine Casavant"]}
366 | {"samples": [" Oxford"]}
367 | {"samples": [" Sir Hew Dalrymple, 3rd Baronet"]}
368 | {"samples": [" Khud-Daar"]}
369 | {"samples": [" Milla Jovovich"]}
370 | {"samples": [" In Naples"]}
371 | {"samples": [" \"Best Cutting Edge Film\" at the 2008 San Diego Film Festival, \"Audience Choice -- Feature-Length Narrative Film\""]}
372 | {"samples": [" German"]}
373 | {"samples": [" Station For Two"]}
374 | {"samples": [" Kronstadt"]}
375 | {"samples": [" 1234"]}
376 | {"samples": [" 25 June 1601"]}
377 | {"samples": [" The Wind's Fierce"]}
378 | {"samples": [" He committed suicide by drowning himself in his swimming pool."]}
379 | {"samples": [" Abu Dhabi"]}
380 | {"samples": [" Prenzlau"]}
381 | {"samples": [" Pozna\u0144"]}
382 | {"samples": [" Forl\u00ec"]}
383 | {"samples": [" Toronto, Ontario"]}
384 | {"samples": [" Forbidden Daughters"]}
385 | {"samples": [" No"]}
386 | {"samples": [" Bogdan \u021a\u0103ru\u0219"]}
387 | {"samples": [" Folgore Division"]}
388 | {"samples": [" Durango Valley Raiders"]}
389 | {"samples": [" Yes"]}
390 | {"samples": [" Vienna"]}
391 | {"samples": [" Johnny Ekstr\u00f6m"]}
392 | {"samples": [" Peru"]}
393 | {"samples": [" English"]}
394 | {"samples": [" Complications of Parkinson's disease."]}
395 | {"samples": [" Piers de Geneville"]}
396 | {"samples": [" Ali Dinar"]}
397 | {"samples": [" King Ferdinand I of the Two Sicilies"]}
398 | {"samples": [" Prince Of Arcadia"]}
399 | {"samples": [" Milan"]}
400 | {"samples": [" K\u00f6ln"]}
401 | {"samples": [" Salma Hayek"]}
402 | {"samples": [" The person who provided evidence to suggest the existence of the neutron was a participant of the 7th Solvay Conference."]}
403 | {"samples": [" Merseyside"]}
404 | {"samples": [" Angel Bojado"]}
405 | {"samples": [" Sherry Boucher"]}
406 | {"samples": [" 2008"]}
407 | {"samples": [" Ysabella"]}
408 | {"samples": [" 1713"]}
409 | {"samples": [" 1912"]}
410 | {"samples": [" ATS-6"]}
411 | {"samples": [" James Howard Meredith"]}
412 | {"samples": [" The Royal Swedish Academy of Sciences"]}
413 | {"samples": [" Tom Hood"]}
414 | {"samples": [" Jennifer Connelly"]}
415 | {"samples": [" Welbedacht Dam"]}
416 | {"samples": [" Rialto Bridge"]}
417 | {"samples": [" January 20, 2017"]}
418 | {"samples": [" Vassar College"]}
419 | {"samples": [" Malko Tarnovo Municipality"]}
420 | {"samples": [" Budapest"]}
421 | {"samples": [" University of Toronto"]}
422 | {"samples": [" 1999"]}
423 | {"samples": [" FDA"]}
424 | {"samples": [" Susilo Bambang Yudhoyono"]}
425 | {"samples": [" James Wolfe"]}
426 | {"samples": [" Ocala"]}
427 | {"samples": [" Kim Un-yong"]}
428 | {"samples": [" 1911"]}
429 | {"samples": [" Susilo Bambang Yudhoyono"]}
430 | {"samples": [" Philip Mountbatten, Duke of Edinburgh"]}
431 | {"samples": [" 1915"]}
432 | {"samples": [" Shell was in the process of trying to find a potential buyer for all of its operations in Finland and was doing similar market research concerning Swedish operations."]}
433 | {"samples": [" Keturah"]}
434 | {"samples": [" Da Nang, Vietnam"]}
435 | {"samples": [" Michael Bubl\u00e9"]}
436 | {"samples": [" BEC Recordings"]}
437 | {"samples": [" Gustav Vasa"]}
438 | {"samples": [" TML Entertainment"]}
439 | {"samples": [" Susilo Bambang Yudhoyono"]}
440 | {"samples": [" 16"]}
441 | {"samples": [" Appleton, Wisconsin"]}
442 | {"samples": [" Fort Davis"]}
443 | {"samples": [" Claudia Wells"]}
444 | {"samples": [" Job 19:20"]}
445 | {"samples": [" Bartoszyce"]}
446 | {"samples": [" 1671"]}
447 | {"samples": [" The Orange River"]}
448 | {"samples": [" Common Sense was an important work because it crystallized sentiment for independence in 1776, advocating a complete break with history and a solution"]}
449 | {"samples": [" Pearl River County"]}
450 | {"samples": [" The African Queen"]}
451 | {"samples": [" The African Queen"]}
452 | {"samples": [" Manchester City"]}
453 | {"samples": [" Golestan Province, Iran"]}
454 | {"samples": [" John D. Loudermilk"]}
455 | {"samples": [" New York University"]}
456 | {"samples": [" Khalid ibn Walid"]}
457 | {"samples": [" River Thames"]}
458 | {"samples": [" January 3, 2015"]}
459 | {"samples": [" Jason"]}
460 | {"samples": [" 2011"]}
461 | {"samples": [" P\u00e9cs"]}
462 | {"samples": [" Matthew Lawrence"]}
463 | {"samples": [" Matthew Lawrence"]}
464 | {"samples": [" 25 April 2010"]}
465 | {"samples": [" UNESCO"]}
466 | {"samples": [" The Egyptian pantheon"]}
467 | {"samples": [" Calvin Coolidge"]}
468 | {"samples": [" Claudia Wells"]}
469 | {"samples": [" Sire Records"]}
470 | {"samples": [" Ernst Mach"]}
471 | {"samples": [" 9.4 years"]}
472 | {"samples": [" Susilo Bambang Yudhoyono"]}
473 | {"samples": [" Georgetown"]}
474 | {"samples": [" American football"]}
475 | {"samples": [" Clan Gordon"]}
476 | {"samples": [" fleur-de-lis"]}
477 | {"samples": [" Greek mythology"]}
478 | {"samples": [" The College of Charleston"]}
479 | {"samples": [" Rowan County"]}
480 | {"samples": [" Clatskanie, Oregon"]}
481 | {"samples": [" Manhattan Project"]}
482 | {"samples": [" The story of the character from The Bourne Deception was based on the character Jason Bourne, created by Robert Ludlum."]}
483 | {"samples": [" Internet integration and ease of use"]}
484 | {"samples": [" Aaron Benward"]}
485 | {"samples": [" 1713"]}
486 | {"samples": [" 1999"]}
487 | {"samples": [" Johan Ludvig Heiberg"]}
488 | {"samples": [" Louise Sawyer"]}
489 | {"samples": [" NFL"]}
490 | {"samples": [" 2006"]}
491 | {"samples": [" Rosario Dawson"]}
492 | {"samples": [" 60-70%"]}
493 | {"samples": [" August 3, 1769"]}
494 | {"samples": [" Casa Loma"]}
495 | {"samples": [" Min Zhou and Carl L. Bankston III"]}
496 | {"samples": [" Hannah"]}
497 | {"samples": [" Stephen Harper"]}
498 | {"samples": [" 02:00 local time"]}
499 | {"samples": [" 1809"]}
500 | {"samples": [" Never"]}
501 | {"samples": [" 2005"]}
502 | {"samples": [" Oregon State Beavers"]}
503 | {"samples": [" Jennifer Connelly"]}
504 | {"samples": [" Rhett Butler"]}
505 | {"samples": [" Lucy Mack Smith"]}
506 | {"samples": [" Russell K. Paul"]}
507 | {"samples": [" Rowan County"]}
508 | {"samples": [" ATS-6"]}
509 | {"samples": [" Hunter River"]}
510 | {"samples": [" 1,362,359"]}
511 | {"samples": [" Toledo"]}
512 | {"samples": [" Susilo Bambang Yudhoyono"]}
513 | {"samples": [" Audrey"]}
514 | {"samples": [" Fleur-de-lis"]}
515 | {"samples": [" Tanzania"]}
516 | {"samples": [" Bruno Mars"]}
517 | {"samples": [" Stephen Harper"]}
518 | {"samples": [" Cape Verde"]}
519 | {"samples": [" Ben Affleck"]}
520 | {"samples": [" Dick Grayson (Nightwing)"]}
521 | {"samples": [" Hamilton"]}
522 | {"samples": [" Pristina"]}
523 | {"samples": [" Orange River"]}
524 | {"samples": [" Claudia Wells"]}
525 | {"samples": [" Tom Kenny"]}
526 | {"samples": [" 279\u2013272 BC"]}
527 | {"samples": [" reopening select previously profitable stores"]}
528 | {"samples": [" 29 August 1862"]}
529 | {"samples": [" Susilo Bambang Yudhoyono"]}
530 | {"samples": [" Brandi Carlile"]}
531 | {"samples": [" Ko Phi Phi Le"]}
532 | {"samples": [" American"]}
533 | {"samples": [" Kenton County"]}
534 | {"samples": [" Silver Bear for Best Actor"]}
535 | {"samples": [" 1 January 1986"]}
536 | {"samples": [" A&M Records"]}
537 | {"samples": [" January 6, 2015"]}
538 | {"samples": [" Sazerac Company"]}
539 | {"samples": [" Zhejiang"]}
540 | {"samples": [" Luke Bryan"]}
541 | {"samples": [" Casa Loma"]}
542 | {"samples": [" Natalie Wood"]}
543 | {"samples": [" Tunis"]}
544 | {"samples": [" The northern part of Erich Zakowski's birthplace, East Prussia, was divided between the Soviet republics of Russia and Lithuania"]}
545 | {"samples": [" Charles County"]}
546 | {"samples": [" January 3, 2015"]}
547 | {"samples": [" Longest unbeaten run: 36 matches (2007\u201309)"]}
548 | {"samples": [" Maria Bello"]}
549 | {"samples": [" Bobby J. Brown"]}
550 | {"samples": [" 1986"]}
551 | {"samples": [" 1992"]}
552 | {"samples": [" Rabbi Dovber Schneuri"]}
553 | {"samples": [" Josh Groban"]}
554 | {"samples": [" Bruce L. Davis"]}
555 | {"samples": [" 20 March 2005"]}
556 | {"samples": [" Francis Bacon"]}
557 | {"samples": [" The Drakensberg mountains in Lesotho"]}
558 | {"samples": [" San Marino national team"]}
559 | {"samples": [" Angers, France"]}
560 | {"samples": [" 2 million"]}
561 | {"samples": [" Jennifer Connelly"]}
562 | {"samples": [" DeKalb County"]}
563 | {"samples": [" three different relationships"]}
564 | {"samples": [" Notus, Idaho"]}
565 | {"samples": [" Christopher McDonald"]}
566 | {"samples": [" Soviet Union"]}
567 | {"samples": [" Niassa Province"]}
568 | {"samples": [" The Association for Computing Machinery (ACM)"]}
569 | {"samples": [" Falls County"]}
570 | {"samples": [" Nuevo Laredo, Mexico"]}
571 | {"samples": [" Sebastian Cabot"]}
572 | {"samples": [" Colin Firth"]}
573 | {"samples": [" Trey Parker"]}
574 | {"samples": [" Ko Phi Phi Le"]}
575 | {"samples": [" Billie Jean King Cup"]}
576 | {"samples": [" Susilo Bambang Yudhoyono"]}
577 | {"samples": [" Geoffrey Rush"]}
578 | {"samples": [" River Thames"]}
579 | {"samples": [" Tony Award and a Drama Desk Award for Outstanding Revival of a Play."]}
580 | {"samples": [" University of Glasgow"]}
581 | {"samples": [" Louis Chedid"]}
582 | {"samples": [" Pope John XXIII died of peritonitis caused by a perforated stomach."]}
583 | {"samples": [" Adam Smith"]}
584 | {"samples": [" Lech Wa\u0142\u0119sa"]}
585 | {"samples": [" Maasmond"]}
586 | {"samples": [" Luke Bryan"]}
587 | {"samples": [" City University of New York system"]}
588 | {"samples": [" Bia\u0142ystok"]}
589 | {"samples": [" James Howard Meredith"]}
590 | {"samples": [" Emily Osment"]}
591 | {"samples": [" Hollywood Records"]}
592 | {"samples": [" Cold Crush Records"]}
593 | {"samples": [" Emilia Galotti"]}
594 | {"samples": [" January 3, 2023"]}
595 | {"samples": [" 1572"]}
596 | {"samples": [" Rostov"]}
597 | {"samples": [" Al-Wakrah Sports Club\u2019s home"]}
598 | {"samples": [" 825"]}
599 | {"samples": [" $99,300"]}
600 | {"samples": [" Santa Clara County"]}
601 |
--------------------------------------------------------------------------------
/real_world_long/results/FILM-7B/sample_LongBench_output_64.jsonl:
--------------------------------------------------------------------------------
1 | {"samples": [" South West Ultras"]}
2 | {"samples": [" No"]}
3 | {"samples": [" Low temperature scanning tunneling microscopy and spectroscopy (STM/STS) were used to study the quantum dot structures in this research."]}
4 | {"samples": [" To treat life-threatening heartbeat irregularities."]}
5 | {"samples": [" The sloped sides form a conical section, which when cut with a plane not perpendicular to its axis, results in an elliptical shape for the top longeron. This is necessary for the fuselage to have a straight and true shape."]}
6 | {"samples": [" The proximity of superconductivity tends to suppress the Kondo effect if the superconducting energy gap becomes larger than the relevant Kondo temperature."]}
7 | {"samples": [" Electricity is used in lighting, heating, communication, and computation."]}
8 | {"samples": [" Vice Admiral"]}
9 | {"samples": [" 1\u20130"]}
10 | {"samples": [" Vitamins K3, K4, and K5."]}
11 | {"samples": [" Yes"]}
12 | {"samples": [" 3-D printing and software development."]}
13 | {"samples": [" 90 \u03bcg and 120 \u03bcg respectively."]}
14 | {"samples": [" The watt"]}
15 | {"samples": [" Flexibility"]}
16 | {"samples": [" Jacob C. Landau"]}
17 | {"samples": [" The main methodology used in the research is an unsupervised method based on the information bottleneck to capture both referential complexity and task-specific utility to adequately explore sparse social communication scenarios in multi-agent reinforcement learning (MARL)."]}
18 | {"samples": [" The function beta(r) is determined by using the Einstein equations and the vacuum Einstein equations."]}
19 | {"samples": [" 70-75 metres"]}
20 | {"samples": [" Genuine placebo"]}
21 | {"samples": [" Brisbane, Cleveland, Queensland, Australia"]}
22 | {"samples": [" L = 8 and L = 14"]}
23 | {"samples": [" Five"]}
24 | {"samples": [" Lasa, Gitastrophe, and Shadoks"]}
25 | {"samples": [" The court concluded that the method claims did not meet the second prong of the Bilski test because they did not require any visual depiction or subsequent display."]}
26 | {"samples": [" The learning rate increases with the transition probability."]}
27 | {"samples": [" Ultracold neutral plasmas that form via kinetic rate processes from state-selected Rydberg gases."]}
28 | {"samples": [" Wearable sensors"]}
29 | {"samples": [" Out of fairness."]}
30 | {"samples": [" \"I have seen the Lord\""]}
31 | {"samples": [" verifying other meta-information"]}
32 | {"samples": [" Simulated and real data"]}
33 | {"samples": [" To ensure a fair and true fuselage."]}
34 | {"samples": [" Provides cover for the war and allows supporters of the illegal war to point to it and insist/slur \"Things aren't so bad!\""]}
35 | {"samples": [" The time required to update the robot's belief does not increase with the complexity of the environment."]}
36 | {"samples": [" The Tevatron Collider Run II started in March 2002 and is expected to continue until the end of this decade."]}
37 | {"samples": [" Four"]}
38 | {"samples": [" A solution $u$ of $-\\Delta u = f(u)$ is stable if\n\\[ \\int f'(u) \\psi^2 \\le \\int | \\nabla \\psi|^2, \\qquad \\forall \\psi \\in C_c^2. \\]"]}
39 | {"samples": [" The normalized least mean square (NLMS) algorithm is engaged in the PLMS-PPIC method."]}
40 | {"samples": [" Del Bigtree and his team at ICAN"]}
41 | {"samples": [" C$_2$H"]}
42 | {"samples": [" 1964"]}
43 | {"samples": [" The maximum velocity is proportional to the amplitude of the blob or depletion."]}
44 | {"samples": [" A complex network of proteins linking extracellular signals with the actin cytoskeleton."]}
45 | {"samples": [" 14,520 attendees"]}
46 | {"samples": [" Quill now harms States to a degree far greater than could have been anticipated earlier."]}
47 | {"samples": [" Environmental fluctuation and uncertainty, task complexity, and the details of the task the artificial organisms are aiming to solve."]}
48 | {"samples": [" The completed fuselage sides bow up from the building surface, forming a \"banana\" shape."]}
49 | {"samples": [" January 1929"]}
50 | {"samples": [" 48V"]}
51 | {"samples": [" June 1, 1999"]}
52 | {"samples": [" Qaderi, Chishti, Nakshbandi, Suharwardi, and Madaari Orders."]}
53 | {"samples": [" privacy concerns"]}
54 | {"samples": [" Wallace"]}
55 | {"samples": [" 30,223"]}
56 | {"samples": [" 2013"]}
57 | {"samples": [" 23 September"]}
58 | {"samples": [" Long Term Capital Management (LTCM)"]}
59 | {"samples": [" The dynamical behavior of the anisotropic order parameter following a quench to the critical point is well described by the Gaussian theory for all the three lattice gas models studied. In the short-time regime, m \u223c t^(1/2)."]}
60 | {"samples": [" 2\u00d72 meters"]}
61 | {"samples": [" The PLM with decimation outperforms other methods, especially in sparse networks and under-sampling regimes."]}
62 | {"samples": [" Exegetical, theological, and homiletical."]}
63 | {"samples": [" February 15, 2010"]}
64 | {"samples": [" Spanish BERT"]}
65 | {"samples": [" The infall rate is 2-5 times smaller and the gas density is 2-5 times smaller in the magnetized model compared to non-magnetized accretion."]}
66 | {"samples": [" The police say they aren\u2019t paid enough to enforce the laws in the streets."]}
67 | {"samples": [" Peter Denning"]}
68 | {"samples": [" The conduction gap is strongly dependent on the direction of applied strain."]}
69 | {"samples": [" 7 March 2023"]}
70 | {"samples": [" V+, V0 and V-"]}
71 | {"samples": [" Ngotho loses his job and the family is forced to move."]}
72 | {"samples": [" Jivani Street 2 of the Malatia-Sebastia District, Yerevan."]}
73 | {"samples": [" NFPA and FPSA vastly outperform DSA and GMRES for all cases by orders of magnitude."]}
74 | {"samples": [" Technological limitations, resistance among archaeologists to expose data to scrutiny, loss of opportunities for analysis before others use it, loss of 'capital' of data, control over how data tables are presented, and the emphasis on creating new data through archaeological research."]}
75 | {"samples": [" Physics, biology, social sciences, finance, neuroscience."]}
76 | {"samples": [" 4.5$\\times$10$^{8}$\\,cm"]}
77 | {"samples": [" VC-10 Squadron"]}
78 | {"samples": [" The bigger the receptive field size, the more complete shapes we can reconstruct using DSP."]}
79 | {"samples": [" It is a band geometric quantity that measures the k-space curl of the interlayer BCP over the occupied states, and is a characteristic of layer-hybridized electronic states."]}
80 | {"samples": [" Yes"]}
81 | {"samples": [" Legacies of Losing in American Politics"]}
82 | {"samples": [" A media application may use a content-recognition module to determine the context of an event in a media asset."]}
83 | {"samples": [" \"Tissue-engineered bone formation with gene transfer and mesenchymal stem cells in a minimally invasive technique\""]}
84 | {"samples": [" FC Banants"]}
85 | {"samples": [" Fruit consumption may provide a protective effect for Hg exposure in Amazonian riparians."]}
86 | {"samples": [" The scoring engine generates a stream of content for the channel by querying new content items based on the channel category and at least one other channel attribute, retrieving candidate content items that include the channel category and the other channel attribute, and then generating a stream of content from the candidate content items for the channel."]}
87 | {"samples": [" 2013\u201314."]}
88 | {"samples": [" More than 120 novels"]}
89 | {"samples": [" Approximating the posterior distribution $p({\\bf w}_k|y_{1:k})$ with an isotropic Gaussian distribution."]}
90 | {"samples": [" Yes"]}
91 | {"samples": [" Anemia, bruising, nosebleeds and bleeding of the gums in both sexes, and heavy menstrual bleeding in women."]}
92 | {"samples": [" 172"]}
93 | {"samples": [" Mobile device management (MDM) is a system that supports centralized control of an entire fleet of mobile devices and mobile applications by applying and ensuring pre-defined configuration settings."]}
94 | {"samples": [" BERT and RoBERTa"]}
95 | {"samples": [" Hosting Subscriber may not use Broadjam's servers or Hosting Subscriber's Website as a source, intermediary, reply to address, or destination address for mail bombs, Internet packet flooding, packet corruption, denial of service, or any other abusive activities. Server hacking or"]}
96 | {"samples": [" The vacuum processing system is configured with a plurality of vacuum processing apparatus arranged in parallel, with all vacuum processing chambers arranged in a straight line."]}
97 | {"samples": [" 21"]}
98 | {"samples": [" 1425 $\\mu_{B}$"]}
99 | {"samples": [" No"]}
100 | {"samples": [" August 25"]}
101 | {"samples": [" Reduced computational complexity."]}
102 | {"samples": [" QuecPython \u793e\u533a"]}
103 | {"samples": [" Severe anemia, enlargement of the placenta, heart, liver, spleen, and adrenal glands, fluid collection throughout the body, growth retardation, and increased risk of complications during pregnancy and delivery."]}
104 | {"samples": [" October 2001"]}
105 | {"samples": [" Smartphones are more compact and power constrained, with a cellular modem as an absolute necessity. Tablets, on the other hand, are more akin to PCs both technically and economically, with a wider range of designs due to a greater power budget. Tablets primarily use Wi-Fi for connect"]}
106 | {"samples": [" How much spending to cut."]}
107 | {"samples": [" The Director of Town and Country Planning"]}
108 | {"samples": [" The framework captures the reduced-order dynamics by using a propagator in the latent space, which is a complex-valued representation. The latent variables are treated independently, enabling interpretable latent dynamics."]}
109 | {"samples": [" Keep deploying and harvesting your bases."]}
110 | {"samples": [" The electron correlation parameter, $\\Gamma_e$, is the ratio of the average unscreened electron-electron potential energy to the electron kinetic energy."]}
111 | {"samples": [" Pressing \u2018SKIP\u2019."]}
112 | {"samples": [" four years"]}
113 | {"samples": [" C-295"]}
114 | {"samples": [" Permanent yellow spot damage on the screen."]}
115 | {"samples": [" Since she smokes he wants to find his own place."]}
116 | {"samples": [" An ALPHABETICAL LIST OF THE NAMES and PLACES of ABODE of the MERCHANTS and PRINCIPAL TRADERS of the Cities of LONDON and WESTMINSTER, the Borough of SOUTHWARK, and their Env"]}
117 | {"samples": [" It becomes a bit less."]}
118 | {"samples": [" Appoint a blue ribbon commission to conduct the research and develop the management plan."]}
119 | {"samples": [" Power-law functions"]}
120 | {"samples": [" $f'\\left(x\\right) = \\frac{6x^2\\cos{\\left(x^2\\right)}+\\sin{\\left(x^2\\right)}}{3\\sqrt[3]{x^2}}$, if $x \\neq 0$ and $0$, if $"]}
121 | {"samples": [" John F. Kennedy Profiles in Courage Award"]}
122 | {"samples": [" Nonlinear system vibration problems"]}
123 | {"samples": [" Fuller's Ranch"]}
124 | {"samples": [" The normalized least mean square (NLMS) algorithm is engaged in the PLMS-PPIC method."]}
125 | {"samples": [" Accounting for path preference consistently improves performance compared with the goal-only baseline."]}
126 | {"samples": [" An open-shell (5OS) and a closed-shell (5para) state."]}
127 | {"samples": [" Nuclear liquid-gas transition in the strong coupling regime of lattice QCD."]}
128 | {"samples": [" \u03b3 h"]}
129 | {"samples": [" Sir Richard"]}
130 | {"samples": [" Thalassemias are classified according to the globin that is affected."]}
131 | {"samples": [" McPherson"]}
132 | {"samples": [" In the spring of 1870."]}
133 | {"samples": [" \u4f7f\u7528\u5b89\u88c5\u5305\u5b89\u88c5\u5373\u53ef\u3002"]}
134 | {"samples": [" Improves its performance."]}
135 | {"samples": [" C-GDBN"]}
136 | {"samples": [" Memory nanodots, nanochannels for spin injection."]}
137 | {"samples": [" 62"]}
138 | {"samples": [" 1766"]}
139 | {"samples": [" Iraq snapshot"]}
140 | {"samples": [" SNNs may serve as a shortcut to explore the biological visual cortex."]}
141 | {"samples": [" Deputy Prime Minister and Minister of Finance"]}
142 | {"samples": [" 85.61%"]}
143 | {"samples": [" The larger the specific-heat ratio, the faster the average motion of the bubbles."]}
144 | {"samples": [" In the U.S. state of Kansas."]}
145 | {"samples": [" O (t, L_{\\parallel} ; S_\\Delta) = L_{\\parallel}^{-\\beta/[\\nu(1+\\Delta)]} \\tilde f_O (t/L_{\\parallel}^{z/(1+\\Delta)} ; S_\\Delta)."]}
146 | {"samples": [" The relationships between catch per set and fishing behavior variables are different for different measures of CPUE."]}
147 | {"samples": [" Romance novels and women's fiction."]}
148 | {"samples": [" No"]}
149 | {"samples": [" URPC2017 and URPC2018"]}
150 | {"samples": [" The effects of Brazilian Jiu Jitsu, and psychotherapy on helping people with autism develop subjective awareness of others."]}
151 |
--------------------------------------------------------------------------------
/real_world_long/results/gpt4-turbo/sample_LongBench_output_128.jsonl:
--------------------------------------------------------------------------------
1 | {"sample": "He lives with the Mulvilles."}
2 | {"sample": "Ann's heart is preoccupied with another."}
3 | {"sample": "Atlas' mountain"}
4 | {"sample": "To persuade Socrates to escape from prison."}
5 | {"sample": null}
6 | {"sample": "at least six hours"}
7 | {"sample": "Lisa"}
8 | {"sample": "Janosz Poha"}
9 | {"sample": "American"}
10 | {"sample": "the lying scribe"}
11 | {"sample": "His knowledge of 20th-century military tactics and strategy."}
12 | {"sample": "Uniformed crowd, phonetic spelling, people looking very well cared for, hairless or shorn individuals, and a sense of fear towards Soames."}
13 | {"sample": "fifty years"}
14 | {"sample": "Their daughter Virginie."}
15 | {"sample": "Pierre Grassou"}
16 | {"sample": "It starts to vibrate as if shaken by an unseen hand."}
17 | {"sample": "Elder Childers argues that the town needs improvements, such as a jail, and criticizes Mayor Joe Clark for not making these changes."}
18 | {"sample": "His knowledge of 20th-century military tactics and strategy."}
19 | {"sample": "Her face is distorted."}
20 | {"sample": "Three"}
21 | {"sample": "a family quarrel about money"}
22 | {"sample": "2419 A.D."}
23 | {"sample": "To follow the harvest."}
24 | {"sample": "betraying one's parents"}
25 | {"sample": "Aunt"}
26 | {"sample": "Reporter"}
27 | {"sample": "Obedience to the laws and principles of justice."}
28 | {"sample": "Shotgun blast"}
29 | {"sample": "Destroyed it without reading it."}
30 | {"sample": "Their faces were twisted into an insane rictus of fear, mouth open, eyes wide and glassy."}
31 | {"sample": "Soames believed Beerbohm would make him seem imaginary in his writing."}
32 | {"sample": "One week"}
33 | {"sample": "Mia and Jof"}
34 | {"sample": "Reading novels."}
35 | {"sample": "There is no mention of a character named Anderson in the provided context."}
36 | {"sample": "He loved her and she was in distress."}
37 | {"sample": "Shot by Benson"}
38 | {"sample": "Holmes's health needed a rest."}
39 | {"sample": "Vigo the Carpathian"}
40 | {"sample": "Mary is initially educated by an old house-keeper who tells her stories, reads to her, and teaches her to read."}
41 | {"sample": "East 77th Street"}
42 | {"sample": "Sadako Yamamura"}
43 | {"sample": "The Prison of Socrates"}
44 | {"sample": "Shizuko Yamamura committed suicide."}
45 | {"sample": "There is no mention of Bennett Landsmann or Seth Lazurus in the provided story."}
46 | {"sample": "Wearing a soft black hat of clerical kind and a gray waterproof cape."}
47 | {"sample": "Baron Henry of Trutz-Drachen"}
48 | {"sample": "Superficial accomplishments without any taste for them."}
49 | {"sample": "Baptist"}
50 | {"sample": "Lady Augusta Fleming"}
51 | {"sample": "Good and evil, innocence and cruelty, the brutality of the Middle Ages, the power of love and gentleness."}
52 | {"sample": "Tomoko dies in her closet, with a look of horror on her face."}
53 | {"sample": "Their faces were twisted into an insane rictus of fear, mouth open, eyes wide and glassy."}
54 | {"sample": "The Sinsings"}
55 | {"sample": "Abby and Ursula"}
56 | {"sample": "Sing the Ghostbusters theme song."}
57 | {"sample": "Philosopher and lecturer"}
58 | {"sample": "William Falder"}
59 | {"sample": "There is no mention of a character named Laura or a medical school in the provided context."}
60 | {"sample": "A check."}
61 | {"sample": "A sexless creature with wings."}
62 | {"sample": "In the closet."}
63 | {"sample": null}
64 | {"sample": "Eatonville, Florida"}
65 | {"sample": "Dr. Richards and Mr. Mortimer Tregennis"}
66 | {"sample": null}
67 | {"sample": "There is no mention of a character named Landsmann in the provided story context."}
68 | {"sample": "Buy a good house."}
69 | {"sample": "Mrs. Abraham C. Mope"}
70 | {"sample": "One week"}
71 | {"sample": null}
72 | {"sample": "She copies the tape and shows it to someone else."}
73 | {"sample": "a monk"}
74 | {"sample": "Obey the laws of the state."}
75 | {"sample": "He jumped."}
76 | {"sample": "Copying the tape and showing it to someone else."}
77 | {"sample": "Brenda Tregennis"}
78 | {"sample": "Madame de Merret fell ill and Monsieur de Merret did not leave her room until she recovered."}
79 | {"sample": "Rambling about the garden and playing with the dogs."}
80 | {"sample": "ex-cop, at present a security guard"}
81 | {"sample": null}
82 | {"sample": "Izu Pacific Land"}
83 | {"sample": "A suitcase full of cocaine"}
84 | {"sample": "A hundred years"}
85 | {"sample": "Holmes upset the watering-pot."}
86 | {"sample": "An enemy of the laws and his country."}
87 | {"sample": "Hector Frome"}
88 | {"sample": "Dana Barrett"}
89 | {"sample": "His lack of recognition and failure as a writer."}
90 | {"sample": "Eliza"}
91 | {"sample": "Bad Bloods"}
92 | {"sample": "INT. BAR - NIGHT"}
93 | {"sample": "Oscar is Dana's baby."}
94 | {"sample": "He believes he will be judged as unjust and wronging himself, his friends, his country, and the laws."}
95 | {"sample": "Opened the closet door."}
96 | {"sample": "No one."}
97 | {"sample": "mediocre"}
98 | {"sample": "Artists laugh at his work."}
99 | {"sample": "Blue Lou Boyle"}
100 | {"sample": "He failed to report himself."}
101 | {"sample": "Ruth Honeywill"}
102 | {"sample": "There is no mention of Alabama having a son in the provided script."}
103 | {"sample": "Eliza requests Mary to forgive her."}
104 | {"sample": "Have you come for me?"}
105 | {"sample": "There is no mention of Lazarus or a murder trial in the provided context."}
106 | {"sample": "The duties of her station."}
107 | {"sample": "a monk"}
108 | {"sample": "Her father's will and her mother's desire."}
109 | {"sample": "Dr. Leon Sterndale"}
110 | {"sample": null}
111 | {"sample": "American"}
112 | {"sample": "The Han airship."}
113 | {"sample": "When she finds Ryuji's dead body."}
114 | {"sample": "Ruth Anvoy"}
115 | {"sample": "Jim is arrested and taken to the barn; Dave is knocked out and tended to by the villagers."}
116 | {"sample": "The dark or middle ages in Germany"}
117 | {"sample": "Reiko's husband, Ryuji, dies."}
118 | {"sample": "He had one or two places but didn't keep them."}
119 | {"sample": "He lives with the Mulvilles."}
120 | {"sample": "In the closet."}
121 | {"sample": "Artists laugh at his work."}
122 | {"sample": null}
123 | {"sample": null}
124 | {"sample": null}
125 | {"sample": null}
126 | {"sample": "The agreement with the laws and the state of Athens."}
127 | {"sample": "mediocre artist"}
128 | {"sample": "Elder Childers"}
129 | {"sample": "Jim was not found guilty; the trial was interrupted and never concluded."}
130 | {"sample": "Her son."}
131 | {"sample": "Wisdom and peacefulness"}
132 | {"sample": "Her dogs."}
133 | {"sample": "Wyoming Valley"}
134 | {"sample": "Imaginary character"}
135 | {"sample": "a family quarrel about money"}
136 | {"sample": "There is no mention of Dwyer or an in vitro clinic in the provided story."}
137 | {"sample": "Pierre Grassou"}
138 | {"sample": "Infidelity"}
139 | {"sample": "Copy the tape and show it to someone else."}
140 | {"sample": "An old isolated house in disrepair near Vendome."}
141 | {"sample": "Their engagement ended."}
142 | {"sample": "Miss Violet Ray"}
143 | {"sample": "Bill"}
144 | {"sample": "Tomoko and Yoichi"}
145 | {"sample": null}
146 | {"sample": "Grassou discovers that many of the paintings in Vervelle's art collection are actually his own works, sold to Elie Magus and passed off as masterpieces by famous artists."}
147 | {"sample": null}
148 | {"sample": "Children and parents."}
149 | {"sample": "The coffin"}
150 | {"sample": "2419 A.D."}
151 | {"sample": "Otto is mutilated by Baron Henry of Trutz-Drachen."}
152 | {"sample": "Lee throws hot coffee in Elliot's face."}
153 | {"sample": "Eatonville"}
154 | {"sample": "Madame de Merret"}
155 | {"sample": "Marry him."}
156 | {"sample": "By being projected into the reading-room of the British Museum a hundred years hence."}
157 | {"sample": "Apis"}
158 | {"sample": "Two people (Mrs. Porter and the vicar)."}
159 | {"sample": "Alabama Worley"}
160 | {"sample": "Bill kills Chuck with a spoke."}
161 | {"sample": "Elie Magus"}
162 | {"sample": "\"Auld Lang Syne\""}
163 | {"sample": "Artists laugh at his work."}
164 | {"sample": "Charles."}
165 | {"sample": "An old isolated house in disrepair near Vendome."}
166 | {"sample": "\"Auld Lang Syne\""}
167 | {"sample": "At about a hundred paces from Vendome, on the banks of the Loir."}
168 | {"sample": "Over Daisy Taylor's attention."}
169 | {"sample": null}
170 | {"sample": "Because she saw her husband, Baron Conrad, wounded and thought him dead."}
171 | {"sample": "Strange dreams on the brain"}
172 | {"sample": "Ville d'Avray"}
173 | {"sample": "Revenge for the caravan attack."}
174 | {"sample": "Shot by Benson"}
175 | {"sample": "Elder Simms"}
176 | {"sample": "He scared them and they avoided him."}
177 | {"sample": "Abby"}
178 | {"sample": "Governor of the state"}
179 | {"sample": "Proper names were still spelled in the old way."}
180 | {"sample": null}
181 | {"sample": "Henry and his family."}
182 | {"sample": "In his trailer home."}
183 | {"sample": null}
184 | {"sample": "Leave a crack at the bottom."}
185 | {"sample": "Atlas' mountain"}
186 | {"sample": "St. Michaelsburg"}
187 | {"sample": "A subverter of the laws"}
188 | {"sample": "Leon's romantic conquests"}
189 | {"sample": "The naked beauty of the soul."}
190 | {"sample": "Chicago mill"}
191 | {"sample": "Dr. Rudolf Staub"}
192 | {"sample": "Ninety pounds"}
193 | {"sample": "Holmes upset the watering-pot."}
194 | {"sample": "Reiko's husband, Ryuji, dies."}
195 | {"sample": "Eatonville"}
196 | {"sample": "Brenda Tregennis and Mortimer Tregennis"}
197 | {"sample": "Charred ashes of the overnight fire"}
198 | {"sample": "American Radioactive Gas Corporation"}
199 | {"sample": "Mia and Jof"}
200 | {"sample": "When she finds Ryuji's dead body."}
201 | {"sample": "The ground truth for fake news was established by manual annotation carried out by a single person."}
202 | {"sample": "The GhostVLAD approach is an extension of the NetVLAD pooling strategy that adds Ghost clusters to map noisy or irrelevant content and exclude them during the feature aggregation stage, improving the discriminative properties of the resulting features for tasks like language identification."}
203 | {"sample": "68.8% to 71.8%"}
204 | {"sample": "The use of context tweets as an additional feature."}
205 | {"sample": "FoxNews, CNN, ESPN, New York Times, Time magazine, Huffington Post Weird News, The Guardian, Cartoon Network, Cooking Light, Home Cooking Adventure, Justin Bieber, Nickelodeon, Spongebob, Disney."}
206 | {"sample": "Yes"}
207 | {"sample": "Extrinsic evaluation in the task of summary evaluation."}
208 | {"sample": "CNN/DailyMail, New York Times Annotated Corpus (NYT), and XSum."}
209 | {"sample": "Unanswerable."}
210 | {"sample": "The ensemble method works by starting with the best performing model according to validation performance, then iteratively adding the best performing model that had not been previously tried, keeping it if it improved validation performance and discarding it otherwise, until each model has been tried once, resulting in a greedy ensemble."}
211 | {"sample": "Friends TV sitcom scripts and Facebook messenger chats."}
212 | {"sample": "English"}
213 | {"sample": "IMDb dataset of movie reviews"}
214 | {"sample": "unanswerable"}
215 | {"sample": "Yes"}
216 | {"sample": "The datasets used are a set of thousand documents related to finance, consisting of 184,001 Twitter posts and 62,949 news articles."}
217 | {"sample": "Energy sector."}
218 | {"sample": "RNN-based NMT and Transformer-NMT"}
219 | {"sample": "(1) a regularization term associated with neutral features; (2) the maximum entropy of class distribution regularization term; (3) the KL divergence between reference and predicted class distribution."}
220 | {"sample": "1) SVM with unigram, bigram, and trigram features; 2) SVM with average word embedding; 3) SVM with average transformed word embeddings; 4) CNN; 5) Recurrent Convolutional Neural Networks (RCNN); 6) UTCNN without user information; 7) UTCNN without the LDA model; 8) UTCNN without comments."}
221 | {"sample": "unanswerable"}
222 | {"sample": "Our model improves interpretability by identifying both crisper examples of attention head behavior and novel behaviors thanks to the sparsity and adaptivity of our proposed model."}
223 | {"sample": "The baseline model is a Transformer base model used for initial sentence-level translations without context-aware corrections."}
224 | {"sample": "XNLI test accuracy and Labeled Attachment Scores (LAS) for dependency parsing."}
225 | {"sample": "MT dataset $\\mathcal {M}$"}
226 | {"sample": "Unanswerable."}
227 | {"sample": "LSTM encoder"}
228 | {"sample": "Yes"}
229 | {"sample": "unanswerable"}
230 | {"sample": "22,880 users"}
231 | {"sample": "Perplexity, user-ranking, BLEU-1/4, ROUGE-L, Distinct-1/2, user matching accuracy (UMA), Mean Reciprocal Rank (MRR), recipe-level coherence, and step entailment score."}
232 | {"sample": "No Answer"}
233 | {"sample": "unanswerable"}
234 | {"sample": "machine translation tasks"}
235 | {"sample": "unanswerable"}
236 | {"sample": "unanswerable"}
237 | {"sample": "No"}
238 | {"sample": "The Nguni languages (zul, xho, nbl, ssw) and the Sotho languages (nso, sot, tsn) are similar to each other."}
239 | {"sample": "They compared layer-wise trained models with Xavier initialization models, and 2-layers distilled models with 2-layers regular-trained models."}
240 | {"sample": "29,794 articles"}
241 | {"sample": "A group of 50 native people who were well-versed in both English and Tamil languages acted as annotators for the evaluation."}
242 | {"sample": "Yes"}
243 | {"sample": "Models are evaluated by the retention rate of tokens and the fraction of sentences that exactly match the target sentence."}
244 | {"sample": "Precision, Recall, F-measure, overall accuracy"}
245 | {"sample": "The source domain is the existing domain with sufficient labeled data, and the target domain is the new domain with very few or no labeled data."}
246 | {"sample": "LSTMs, RAN, QRNN, NAS"}
247 | {"sample": "Word/character embedding, RNN, CNN, QRNN, Transformer, Highway network, Encoder Decoder architecture, attention layers, regularization layers, and loss functions."}
248 | {"sample": "multilingual pronunciation corpus collected by deri2016grapheme from Wiktionary"}
249 | {"sample": "unanswerable"}
250 | {"sample": "English, Spanish, Finnish."}
251 | {"sample": "unanswerable"}
252 | {"sample": "Yes"}
253 | {"sample": "Yes"}
254 | {"sample": "By generating maps for word categories that reflect certain psycholinguistic or semantic properties using lexical resources like Roget or Linguistic Inquiry and Word Count."}
255 | {"sample": "claim, premise, backing, rebuttal, and refutation"}
256 | {"sample": "unanswerable"}
257 | {"sample": "1,873 Twitter conversation threads"}
258 | {"sample": "English, Mandarin Chinese, Yue Chinese, Spanish, Russian, French, Polish, Estonian, Finnish, Hebrew, Kiswahili, Welsh."}
259 | {"sample": "Wikipedia 'Conversations Gone Awry' dataset and Reddit CMV dataset."}
260 | {"sample": "unanswerable"}
261 | {"sample": "The quality of the data is empirically evaluated through various sanity checks, including computing sentence-level BLEU scores, manual inspection of translations, perplexity measurements using a language model, character ratio computations, and similarity scores based on LASER cross-lingual sentence embeddings."}
262 | {"sample": "The audio and text sequences are encoded using dual RNNs and then combined using a feed-forward neural model to predict the emotion class."}
263 | {"sample": "2.11 BLEU, 1.7 FKGL, and 1.07 SARI."}
264 | {"sample": "700 examples"}
265 | {"sample": "A tweet went viral if it was retweeted more than 1000 times."}
266 | {"sample": "unanswerable"}
267 | {"sample": "crowdsourcing"}
268 | {"sample": "Logistic Regression and deep learning model adapted from Bowman et al."}
269 | {"sample": "The benchmark dataset is the Social Honeypot dataset, and its quality is not explicitly stated as high or low in the context provided."}
270 | {"sample": "LSTM decoder"}
271 | {"sample": "unanswerable"}
272 | {"sample": "3rd position in FLC task."}
273 | {"sample": "The baseline was the M2M Transformer NMT model (b3)."}
274 | {"sample": "0.7033"}
275 | {"sample": "Second-order co-occurrence vectors and word embeddings."}
276 | {"sample": "Using a bilingual dictionary."}
277 | {"sample": "Yes"}
278 | {"sample": "Experts with legal training."}
279 | {"sample": "Painting embedding: 3 parallel CNNs (object CNN, sentiment CNN, and scene CNN) combined with a skip-thought model; Language style transfer: sequence-to-sequence models with attention and pointer networks."}
280 | {"sample": "The transformer layer works better."}
281 | {"sample": "Yes"}
282 | {"sample": "personal attack, racism, and sexism"}
283 | {"sample": "By splitting the sentence into three disjoint regions (left context, middle context, right context) and focusing on the extended middle context by using two combinations: (1) left context, left entity, and middle context; (2) middle context, right entity, and right context."}
284 | {"sample": "Four types (PER, LOC, ORG, and MISC)"}
285 | {"sample": "unanswerable"}
286 | {"sample": "Women represent 33.16% of the speakers and account for only 22.57% of the total speech time."}
287 | {"sample": "English-German dataset"}
288 | {"sample": "BIBREF2, BIBREF13, BIBREF4, BIBREF15, BIBREF10, BIBREF17, BIBREF18, BIBREF20, BIBREF23, BIBREF33, BIBREF34, BIBREF35, BIBREF36, BIBREF37."}
289 | {"sample": "Logistic Regression (LR) and Multilayer Perceptron (MLP)"}
290 | {"sample": "BIBREF23, BIBREF17, BIBREF18, BIBREF19, BIBREF24, BIBREF25, BIBREF26, TwitterNLP, CogComp-NLP, Stanford NLP NER, spaCy NER."}
291 | {"sample": "SQuAD dataset"}
292 | {"sample": "Modelling urban regions, identifying points-of-interest and itineraries, predicting environmental phenomena using bag-of-words representations derived from Flickr tags, and using vector space embeddings for modelling geographic locations."}
293 | {"sample": "Yes"}
294 | {"sample": "CSAT dataset, 20 newsgroups, and Fisher Phase 1 corpus."}
295 | {"sample": "IMDb movie review dataset"}
296 | {"sample": "Yes"}
297 | {"sample": "No."}
298 | {"sample": "The invertibility condition requires the neural projector to be invertible and its inverse to exist."}
299 | {"sample": "unanswerable"}
300 | {"sample": "WikiSmall: 89,042 sentence pairs; WikiLarge: 296,402 sentence pairs."}
301 | {"sample": "Vanilla ST baseline, encoder pre-training, decoder pre-training, encoder-decoder pre-training, one-to-many multi-task, many-to-one multi-task, many-to-many multi-task, many-to-many+pre-training, Triangle+pre-train."}
302 | {"sample": "unanswerable"}
303 | {"sample": "linear SVM, bidirectional Long Short-Term-Memory (BiLSTM) model, Convolutional Neural Network (CNN)"}
304 | {"sample": "unanswerable"}
305 | {"sample": "200-dimensional GloVe embeddings and Edinburgh embeddings."}
306 | {"sample": "Our personalized generative models can generate plausible, personalized, and coherent recipes preferred by human evaluators for consumption."}
307 | {"sample": "harmonic mean of irony reward and sentiment reward"}
308 | {"sample": "The generated English poem may not work well with Shakespeare style transfer if the style transfer dataset does not have similar words in the training set of sentences."}
309 | {"sample": "The Affective Text dataset, the Fairy Tales dataset, and the ISEAR dataset."}
310 | {"sample": "The distribution results showed statistically significant differences in the distribution of followers, no significant difference in retweets and favourites, a larger number of hashtags in viral fake news (though not statistically significant), a higher chance of fake news coming from unverified accounts, a statistically significant difference in the distribution of friends, a larger ratio of friends/followers for accounts spreading fake news, less frequent use of mentions in tweets containing fake news, no statistically significant difference in the presence of media elements, and more URLs in viral tweets containing fake news."}
311 | {"sample": "The dataset of hashtags is sourced from the Stanford Sentiment Analysis Dataset."}
312 | {"sample": "unanswerable"}
313 | {"sample": "The context of the corresponding text."}
314 | {"sample": "B1 and B2"}
315 | {"sample": "unanswerable"}
316 | {"sample": "unanswerable"}
317 | {"sample": "SemEval-2016 \u201cSentiment Analysis in Twitter\u201d task dataset."}
318 | {"sample": "small BERT"}
319 | {"sample": "Yes"}
320 | {"sample": "Yes"}
321 | {"sample": "Competitive or even state-of-the-art results for some of the emotion labels on existing, standard evaluation datasets."}
322 | {"sample": "{B-PUN, I-PUN, O}"}
323 | {"sample": "No"}
324 | {"sample": "Robustness of a model is defined as its ability to leverage prior knowledge without being misled by bias in the knowledge."}
325 | {"sample": "InferSent, Universal Sentence Encoder, average GloVe embeddings, and BiLSTM architecture."}
326 | {"sample": "English datasets: CoNLL2003 (+0.29), OntoNotes5.0 (+0.96); Chinese datasets: MSRA (+0.97), OntoNotes4.0 (+2.36)."}
327 | {"sample": "Task 1: Quora Duplicate Question Pair Detection and Task 2: Ranking questions in Bing's People Also Ask."}
328 | {"sample": "syntactic tree-based models, latent tree models, and non-tree models"}
329 | {"sample": "Relation detection."}
330 | {"sample": "Nearest-Neighbor model (NN) and Encoder-Decoder baseline with ingredient attention (Enc-Dec)."}
331 | {"sample": "Browser-based annotation tool, manual categorization, part-of-speech tagging, leveraging Flickr30K Entities dataset with coreference annotations and clustering."}
332 | {"sample": "French, Spanish, Italian, Portuguese, Arabic, Hebrew, German, and English."}
333 | {"sample": "CAS-LSTM, plain stacked LSTMs, models with different INLINEFORM0, models without INLINEFORM1, and models that integrate lower contexts via peephole connections."}
334 | {"sample": "Yes"}
335 | {"sample": "ILP-based summarization, Sumy package algorithms"}
336 | {"sample": "BIBREF7"}
337 | {"sample": "unanswerable"}
338 | {"sample": "DTA18 and DTA19"}
339 | {"sample": "Kannada, Hindi, Telugu, Malayalam, Bengali, and English."}
340 | {"sample": "Unanswerable"}
341 | {"sample": "ALOHA achieves a significant improvement on the target character language style retrieval task compared to the baseline open-domain chatbot models."}
342 | {"sample": "unanswerable"}
343 | {"sample": "The authors present evidence of biases in data annotation and collection by showing that many errors are due to biases from data collection and rules of annotation, and by manual inspection of mislabeled items, where tweets with specific language or geographic restriction are oversampled, resulting in high rates of misclassification."}
344 | {"sample": "Yes."}
345 | {"sample": "unanswerable"}
346 | {"sample": "+0.58 for MRPC and +0.73 for QQP."}
347 | {"sample": "BIBREF0"}
348 | {"sample": "unanswerable"}
349 | {"sample": "Pointer-Gen, Pointer-Gen+Pos, Pointer-Gen+Same-FT, Pointer-Gen+Pos-FT, Pointer-Gen+RL-ROUGE, Pointer-Gen+RL-SEN, Pointer-Gen+ARL-SEN."}
350 | {"sample": "Na\u00efve Bayes, Logistic Regression, Support Vector Machine, Random Forests, Gradient Boosted Trees, Convolutional Neural Networks, Recurrent Neural Networks."}
351 | {"sample": "bi-directional language model and uni-directional model"}
352 | {"sample": "Weights are dynamically adjusted in proportion to $(1-p)$, and this weight dynamically changes as training proceeds."}
353 | {"sample": "KG-A2C-chained and KG-A2C-Explore both pass the bottleneck of a score of 40, whereas A2C-Explore gets to the bottleneck but cannot surpass it. KG-A2C-chained is significantly more sample efficient and converges faster than KG-A2C-Explore."}
354 | {"sample": "Bayesian models for each language and crosslingual latent variables to incorporate soft role agreement between aligned constituents."}
355 | {"sample": "Annotations for noises and disfluencies including mispronunciations."}
356 | {"sample": "unanswerable"}
357 | {"sample": "Bulgarian, Croatian, Czech, Danish, English, French, German, Indonesian, Italian, Norwegian, Persian, Polish, Portuguese, Slovenian, Spanish, and Swedish."}
358 | {"sample": "NCEL consistently outperforms various baselines with a favorable generalization ability."}
359 | {"sample": "Yes"}
360 | {"sample": "AdaDelta BIBREF15"}
361 | {"sample": "2010 i2b2/VA"}
362 | {"sample": "Masking words in the decoder is helpful because it allows the refine decoder to generate more fluent and natural sequences by using the ability of the contextual language model to predict words given the complete context consistent with their pre-training processes."}
363 | {"sample": "unanswerable"}
364 | {"sample": "TF-IDF features"}
365 | {"sample": "Each tweet is annotated as no evidence of depression or evidence of depression, and if evidence of depression is present, it is further annotated with one or more depressive symptoms."}
366 | {"sample": "unanswerable"}
367 | {"sample": "The training data was translated using the machine translation platform Apertium."}
368 | {"sample": "multinomial Naive Bayes classifier"}
369 | {"sample": "The baseline for the SLC task was a logistic regression classifier with the length of the sentence as a feature, and for the FLC task, it was a random baseline that generates spans and selects one of the 18 techniques randomly."}
370 | {"sample": "CRF-based model, prior works that did not employ joint learning, and a \"pipeline\" method."}
371 | {"sample": "The political bias of different sources is included in the model by assigning a political bias label to different US outlets following the procedure described in BIBREF2."}
372 | {"sample": "The ancient Chinese dataset comes from 1.7K bilingual ancient-modern Chinese articles from the internet, including ancient Chinese history records in several dynasties (about 1000BC-200BC) and articles written by celebrities of that era."}
373 | {"sample": "English"}
374 | {"sample": "unanswerable"}
375 | {"sample": "Three convolutional layers."}
376 | {"sample": "European network of nature protected sites Natura 2000, CORINE land cover classes, SoilGrids, and the ScenicOrNot dataset."}
377 | {"sample": "NUBes-PHI and the MEDDOCAN corpus."}
378 | {"sample": "Unigrams, pragmatic features, stylistic patterns, and patterns related to situational disparity."}
379 | {"sample": "Coverage(C), Avg. MCC, and avg. +ve F1 score."}
380 | {"sample": "Yes"}
381 | {"sample": "Galatasaray (Target-1) and Fenerbah\u00e7e (Target-2)"}
382 | {"sample": "Automatic evaluations and human evaluations on the transformation from non-ironic sentences to ironic sentences, and additional experiments on the transformation from ironic sentences to non-ironic sentences."}
383 | {"sample": "Gaussian-masked directional multi-head attention works by applying a Gaussian weight matrix that emphasizes the localness relationship between characters based on their distances, adjusting the attention weights to favor adjacent characters, and using a triangular matrix mask for forward and backward encoders to focus on different weights."}
384 | {"sample": "Facebook status update messages."}
385 | {"sample": "The features extracted from the network's baseline CNN that directly aims to classify a sentence as sarcastic vs non-sarcastic."}
386 | {"sample": "The number of clusters (k) and the type of word vector models (skipgram, cbow, GloVe)."}
387 | {"sample": "unanswerable"}
388 | {"sample": "The corpus consists of 53 documents."}
389 | {"sample": "Unanswerable"}
390 | {"sample": "text categorization and sentiment classification"}
391 | {"sample": "CNN, term frequency models, and rule-based pattern matching."}
392 | {"sample": "The training sets for these versions of ELMo are much larger, with the Latvian corpus having about 280 million tokens compared to the ELMoForManyLangs' 20 million tokens."}
393 | {"sample": "6946 sentences"}
394 | {"sample": "MLP, Eusboost, MWMOTE"}
395 | {"sample": "Yes"}
396 | {"sample": "No"}
397 | {"sample": "0.6103"}
398 | {"sample": "Wall Street Journal (WSJ) portion of the Penn Treebank"}
399 | {"sample": "unanswerable"}
400 | {"sample": "SimpleQuestions and WebQSP"}
401 |
--------------------------------------------------------------------------------
/real_world_long/results/gpt4-turbo/sample_LongBench_output_32.jsonl:
--------------------------------------------------------------------------------
1 | {"sample": "Miller v. California"}
2 | {"sample": "Charles L. Clifford"}
3 | {"sample": "Norman, Oklahoma"}
4 | {"sample": "Sun"}
5 | {"sample": "on film"}
6 | {"sample": "Parliament"}
7 | {"sample": "Mayiladuthurai District"}
8 | {"sample": "Pleiospilos"}
9 | {"sample": "2013"}
10 | {"sample": "The Rebirth"}
11 | {"sample": "October 13, 1980"}
12 | {"sample": "Ted Turner"}
13 | {"sample": "Mary Astor"}
14 | {"sample": "number five"}
15 | {"sample": "Horton Plains National Park"}
16 | {"sample": "2000 Summer Olympics"}
17 | {"sample": "Claudio L\u00f3pez"}
18 | {"sample": "Ronald Reagan"}
19 | {"sample": "Talk That Talk"}
20 | {"sample": "George Harrison"}
21 | {"sample": "The Cartoon Cartoon Show"}
22 | {"sample": "Sydney, New South Wales, Australia"}
23 | {"sample": "Long Island"}
24 | {"sample": "PewDiePie"}
25 | {"sample": "It's Always Sunny in Philadelphia"}
26 | {"sample": "Blue Valley Northwest High School"}
27 | {"sample": "Ribosomes"}
28 | {"sample": "Dracula"}
29 | {"sample": "Band-e Amir National Park"}
30 | {"sample": "University of Southern California Trojans football team"}
31 | {"sample": "12,817"}
32 | {"sample": "A123 Systems"}
33 | {"sample": "15,678"}
34 | {"sample": "Taoiseach"}
35 | {"sample": "Troy"}
36 | {"sample": "Mika H\u00e4kkinen"}
37 | {"sample": "I Am Legend"}
38 | {"sample": "George Balanchine"}
39 | {"sample": "Xcel Energy"}
40 | {"sample": "Around the World in 80 Days"}
41 | {"sample": "Nobel Prize in Physics in 1944"}
42 | {"sample": "Rankin/Bass Animated Entertainment is best known for its seasonal television specials, usually done in stop motion animation."}
43 | {"sample": "Eileen Atkins"}
44 | {"sample": "Iran"}
45 | {"sample": "Hakim Jamal"}
46 | {"sample": "Keith Morris"}
47 | {"sample": "YIVO"}
48 | {"sample": "January 18, 1813"}
49 | {"sample": "Tongshanjiabu"}
50 | {"sample": "1867"}
51 | {"sample": "Pope John X"}
52 | {"sample": "Manchester United F.C."}
53 | {"sample": "Yes"}
54 | {"sample": "2,098"}
55 | {"sample": "Australian Supercars Championship"}
56 | {"sample": "No"}
57 | {"sample": "Mike Leach passed away on December 12, 2022, and therefore is not currently coaching."}
58 | {"sample": "Baylor, Floyd Casey Stadium"}
59 | {"sample": "2010"}
60 | {"sample": "Elephants are connected to Gajabrishta through the term \"Gaja,\" which means elephant in Sanskrit. \"Brishta\" refers to the"}
61 | {"sample": "British"}
62 | {"sample": "Yes"}
63 | {"sample": "November 23, 2002"}
64 | {"sample": "Drea de Matteo"}
65 | {"sample": "British"}
66 | {"sample": "Vernon L. Smith"}
67 | {"sample": "Deftones"}
68 | {"sample": "796"}
69 | {"sample": "India"}
70 | {"sample": "Yes"}
71 | {"sample": "May 5, 1939"}
72 | {"sample": "Philip K. Dick"}
73 | {"sample": "821"}
74 | {"sample": "Capital Cities"}
75 | {"sample": "Morgan Llywelyn"}
76 | {"sample": "14pts"}
77 | {"sample": "Big 12 Conference"}
78 | {"sample": "Brittany (Breizh), Cornwall (Kernow), Ireland (\u00c9ire), the Isle of Man (Mannin, or Ellan V"}
79 | {"sample": "coaxial cables"}
80 | {"sample": "Wicked Twister"}
81 | {"sample": "Lev Yilmaz"}
82 | {"sample": "Man Haron Monis"}
83 | {"sample": "Hawaii"}
84 | {"sample": "100 metres"}
85 | {"sample": "due to the onset and progression of Alzheimer's disease"}
86 | {"sample": "Yes"}
87 | {"sample": "Leucippus"}
88 | {"sample": "Hollywood"}
89 | {"sample": "Both are skyscrapers."}
90 | {"sample": "1895"}
91 | {"sample": "Pac-12 Conference"}
92 | {"sample": "7.00pm"}
93 | {"sample": "No."}
94 | {"sample": "Writer and poet"}
95 | {"sample": "No."}
96 | {"sample": "43,000 demonstrators"}
97 | {"sample": "Indianapolis, Indiana"}
98 | {"sample": "1909"}
99 | {"sample": "Bill McCutcheon"}
100 | {"sample": "Field Marshal John Standish Surtees Prendergast Vereker, 6th Viscount Gort"}
101 | {"sample": "Polk County"}
102 | {"sample": "James II"}
103 | {"sample": "\"Pinball Wizard\""}
104 | {"sample": "2006"}
105 | {"sample": "Yoruba"}
106 | {"sample": "American"}
107 | {"sample": "Michael Swango"}
108 | {"sample": "Juan Rulfo"}
109 | {"sample": "Merck & Co."}
110 | {"sample": "1961"}
111 | {"sample": "United States Secretary of the Interior"}
112 | {"sample": "Alice's Adventures in Wonderland"}
113 | {"sample": "Vienna"}
114 | {"sample": "Yes"}
115 | {"sample": "Albert Park"}
116 | {"sample": "filmmakers"}
117 | {"sample": "Jeffersontown"}
118 | {"sample": "John Musker, Ron Clements, Dave Michener, and Burny Mattinson"}
119 | {"sample": "WAMC"}
120 | {"sample": "Bassendean"}
121 | {"sample": "duck"}
122 | {"sample": "Mimosa"}
123 | {"sample": "Disappearances, UFO sightings, Bigfoot-like sightings, and animal mutilations."}
124 | {"sample": "Coca-Cola FEMSA"}
125 | {"sample": null}
126 | {"sample": "Yes"}
127 | {"sample": "Umina Beach"}
128 | {"sample": "Quezon City"}
129 | {"sample": "NASA Astrobiology Institute (NAI)"}
130 | {"sample": "Suining"}
131 | {"sample": "Video game."}
132 | {"sample": "No."}
133 | {"sample": "Russian physicist"}
134 | {"sample": "Elvis' Christmas Album"}
135 | {"sample": "Operation Iceberg"}
136 | {"sample": "Puli Alam"}
137 | {"sample": "BBC Formula One"}
138 | {"sample": "Plato"}
139 | {"sample": "Donaghadee, Northern Ireland"}
140 | {"sample": "Joe Gooch"}
141 | {"sample": "Both Lavinia Greenlaw and N\u00e2z\u0131m Hikmet are poets."}
142 | {"sample": "O"}
143 | {"sample": "West Lafayette, Indiana"}
144 | {"sample": "John Locke"}
145 | {"sample": "35 people"}
146 | {"sample": "Currer Bell"}
147 | {"sample": "No"}
148 | {"sample": "2011"}
149 | {"sample": "Cortina d'Ampezzo, Italy"}
150 | {"sample": "1975"}
151 | {"sample": "French forces"}
152 | {"sample": "Cornell University"}
153 | {"sample": "Brian Stokes Mitchell"}
154 | {"sample": "Netflix"}
155 | {"sample": "No"}
156 | {"sample": "Louisville, Kentucky"}
157 | {"sample": "Sonic Mania"}
158 | {"sample": "Mark Donohue"}
159 | {"sample": "Acting career"}
160 | {"sample": "Veyil"}
161 | {"sample": "Kentucky Derby"}
162 | {"sample": "Neighbours"}
163 | {"sample": "Bill Miner"}
164 | {"sample": "Chippewa County"}
165 | {"sample": "3,677"}
166 | {"sample": "Atlantic Ocean"}
167 | {"sample": "Film director"}
168 | {"sample": "Start"}
169 | {"sample": "Green and yellow"}
170 | {"sample": "Claire Randall"}
171 | {"sample": "The Jackson Laboratory for Genomic Medicine"}
172 | {"sample": "City of Peace"}
173 | {"sample": "Martin Scorsese"}
174 | {"sample": "Michael Tippett"}
175 | {"sample": "Babylon"}
176 | {"sample": "Ten Walls"}
177 | {"sample": "Dragons: Race to the Edge"}
178 | {"sample": "1978"}
179 | {"sample": "J\u0101nis Strazdi\u0146\u0161"}
180 | {"sample": "English authors"}
181 | {"sample": "Nanyue"}
182 | {"sample": "Baron of Holberg"}
183 | {"sample": "Arizona State University"}
184 | {"sample": "Yes"}
185 | {"sample": "Switchfoot"}
186 | {"sample": "No"}
187 | {"sample": "Jake Kasdan"}
188 | {"sample": "The South Branch of the Nashua River"}
189 | {"sample": "The Bangor Daily News is not specifically mentioned as talking about Sawin Millett in the given context."}
190 | {"sample": "Let's Dance for Comic Relief"}
191 | {"sample": "River Thames"}
192 | {"sample": "Allure"}
193 | {"sample": "Jaleel White"}
194 | {"sample": "Lake Wallace"}
195 | {"sample": "over 100 million records"}
196 | {"sample": "Two Episodes of Mash"}
197 | {"sample": "1998"}
198 | {"sample": "William"}
199 | {"sample": "Jerry Bingham"}
200 | {"sample": "No"}
201 | {"sample": "Gyulafeh\u00e9rv\u00e1r, Transylvania"}
202 | {"sample": "Elizabeth"}
203 | {"sample": "France"}
204 | {"sample": "No."}
205 | {"sample": "c.\u20091532"}
206 | {"sample": "Edward Watson, 2nd Baron Rockingham"}
207 | {"sample": "January 16, 1373"}
208 | {"sample": "Cunimund"}
209 | {"sample": "St Patrick's College, Dublin"}
210 | {"sample": "Bob Jones University"}
211 | {"sample": "The House Of The Seven Hawks"}
212 | {"sample": "Adelaide of Burgundy"}
213 | {"sample": "Nocher II"}
214 | {"sample": "24 November 1837"}
215 | {"sample": "United States"}
216 | {"sample": null}
217 | {"sample": "The Death of Black King"}
218 | {"sample": "No."}
219 | {"sample": "No"}
220 | {"sample": "Mysore"}
221 | {"sample": "The context does not provide information about the place of death of Randi Monsen's father, Fredrik Monsen."}
222 | {"sample": "Lee Strasberg Theatre and Film Institute"}
223 | {"sample": "Hell Up in Harlem"}
224 | {"sample": "Cipriano Castro"}
225 | {"sample": "Monster on the Campus"}
226 | {"sample": "Altu\u011f \u00c7elikbilek"}
227 | {"sample": "Jugband Blues"}
228 | {"sample": "Yes"}
229 | {"sample": "Rathold R\u00e1t\u00f3t"}
230 | {"sample": "Kathmandu Valley"}
231 | {"sample": "Archibald Acheson, 2nd Earl of Gosford"}
232 | {"sample": "John Paul Getty Jr."}
233 | {"sample": "Goring-on-Thames, Oxfordshire"}
234 | {"sample": "Yes"}
235 | {"sample": "Wooden Crosses"}
236 | {"sample": "Above Rubies"}
237 | {"sample": "The Longshot"}
238 | {"sample": "Castlemaine, Victoria, Australia"}
239 | {"sample": "Lyudmyla Olyanovska"}
240 | {"sample": "Y\u0131ld\u0131z Palace"}
241 | {"sample": "Aachen"}
242 | {"sample": "Henry Goes Arizona"}
243 | {"sample": "Wales"}
244 | {"sample": "All-American Co-Ed"}
245 | {"sample": "Buenos Aires"}
246 | {"sample": "Magdalene Sibylle of Holstein-Gottorp"}
247 | {"sample": "Stahleck Castle"}
248 | {"sample": "Rupilia Faustina"}
249 | {"sample": "Bajo otro sol"}
250 | {"sample": "Marie Lafor\u00eat"}
251 | {"sample": null}
252 | {"sample": "May 19, 1669"}
253 | {"sample": "Palencia"}
254 | {"sample": "Miley Naa Miley Hum"}
255 | {"sample": "Tom Mix in Arabia"}
256 | {"sample": "Daughter of the Jungle"}
257 | {"sample": "Dr. Socrates"}
258 | {"sample": "Changeland"}
259 | {"sample": "Ruel Redinger"}
260 | {"sample": "Vasily Karatygin"}
261 | {"sample": null}
262 | {"sample": "Yes"}
263 | {"sample": "William Pooley"}
264 | {"sample": "Swedish"}
265 | {"sample": "Yes"}
266 | {"sample": "Melody of the World"}
267 | {"sample": "Madrid"}
268 | {"sample": "Keku\u02bbiapoiwa II"}
269 | {"sample": "The Great Man's Lady"}
270 | {"sample": "Oklahoma City, Oklahoma"}
271 | {"sample": "Dubai"}
272 | {"sample": "La Trinit\u00e9, a commune within the Nice metropolitan area"}
273 | {"sample": "Paul Frederick, Grand Duke of Mecklenburg-Schwerin"}
274 | {"sample": "Yes"}
275 | {"sample": "Northampton"}
276 | {"sample": "Sir Paul Gore, 1st Baronet"}
277 | {"sample": "Egypt"}
278 | {"sample": "Dayton, Ohio"}
279 | {"sample": "There is no information provided about the spouse of the director of the film \"Ann Carver's Profession\" in the given context."}
280 | {"sample": "The Drover's Sweetheart"}
281 | {"sample": "Baraguru"}
282 | {"sample": "Det Sande Ansigt"}
283 | {"sample": "Il Gaucho"}
284 | {"sample": "Maria's husband, Boris I of Bulgaria, died on 2 May 907. However, the passages do not specify the exact place of his death."}
285 | {"sample": "The PyraMMMid"}
286 | {"sample": "No."}
287 | {"sample": "poisoning"}
288 | {"sample": "American"}
289 | {"sample": "Hawai\u02bbi"}
290 | {"sample": null}
291 | {"sample": "Vasantha Raagam"}
292 | {"sample": "Lee Kun-hee"}
293 | {"sample": "Charles I, Duke of Bourbon"}
294 | {"sample": "27 June 1839"}
295 | {"sample": "Gura Humorului, Austro-Hungarian Empire (now Romania)"}
296 | {"sample": "Fran\u00e7oise Hardy"}
297 | {"sample": "Charles Wheatstone"}
298 | {"sample": "Washington, D.C."}
299 | {"sample": "London Melody"}
300 | {"sample": "Fernando Fla\u00ednez"}
301 | {"sample": "Mira Sorvino, Michael Sorvino, Amanda Sorvino"}
302 | {"sample": "No"}
303 | {"sample": "Denmark"}
304 | {"sample": "Illness"}
305 | {"sample": "Harby near Lincoln"}
306 | {"sample": "Hong Kong Film Award for Best Director"}
307 | {"sample": "3 September 1992"}
308 | {"sample": "Nathaniel McLenaghan"}
309 | {"sample": "Tisch School of the Arts"}
310 | {"sample": "A Woman Has Killed"}
311 | {"sample": "The Ballad of Josie"}
312 | {"sample": "De AS"}
313 | {"sample": "The Piper's Price"}
314 | {"sample": "Yes"}
315 | {"sample": "13 March 1753"}
316 | {"sample": "No Trees in the Street"}
317 | {"sample": "Methala, near Perumbavoor in Kerala"}
318 | {"sample": "Alkohol"}
319 | {"sample": "Do Musafir"}
320 | {"sample": "No"}
321 | {"sample": "New York City"}
322 | {"sample": "Contragolpe"}
323 | {"sample": "Mumbai"}
324 | {"sample": "The Comedians of Comedy"}
325 | {"sample": "Waiting for the Clouds"}
326 | {"sample": "Dhuen Ki Lakeer"}
327 | {"sample": "Perd\u00f3n, viejita"}
328 | {"sample": "University of Wisconsin-Madison"}
329 | {"sample": "Dudley Russell"}
330 | {"sample": "Mirjam Polkunen"}
331 | {"sample": "Saint Petersburg"}
332 | {"sample": "Eindhoven"}
333 | {"sample": "Women's Suffrage Journal"}
334 | {"sample": "Fairmont, West Virginia"}
335 | {"sample": "Mayor Muthanna"}
336 | {"sample": "Yes"}
337 | {"sample": "Sijilmasa"}
338 | {"sample": "Beaulieu-sur-Loire"}
339 | {"sample": "Tarzan The Magnificent"}
340 | {"sample": "Wolf Warrior"}
341 | {"sample": "Tr\u1ecbnh C\u01b0\u01a1ng"}
342 | {"sample": "Gura Humorului, Austro-Hungarian Empire (now Romania)"}
343 | {"sample": "Ermias Joseph Asghedom was born on August 15, 1985, in the Crenshaw District of South Central Los Angeles."}
344 | {"sample": "Winter Sleepers"}
345 | {"sample": "Adolf III of Berg"}
346 | {"sample": "No"}
347 | {"sample": "M\u00fclheim an der Ruhr"}
348 | {"sample": null}
349 | {"sample": "Yes"}
350 | {"sample": "Bhanurangsi Savangwongse"}
351 | {"sample": "Margaret Clinton"}
352 | {"sample": "Scam"}
353 | {"sample": "Helsinki, Finland"}
354 | {"sample": "Orange County, Virginia"}
355 | {"sample": "Mangalia, Romania"}
356 | {"sample": "F\u00e9lix Leclerc"}
357 | {"sample": "Jacques of Savoy"}
358 | {"sample": "Viscount Ogawa Mataji"}
359 | {"sample": "Wang Si"}
360 | {"sample": "3 November 1867"}
361 | {"sample": "Dorothy Batley"}
362 | {"sample": "1 August 1813"}
363 | {"sample": "The Yellow Teddy Bears"}
364 | {"sample": "Yes"}
365 | {"sample": "Antoine Casavant"}
366 | {"sample": "Allahabad University"}
367 | {"sample": "Sir Hew Dalrymple, 3rd Baronet"}
368 | {"sample": "Murderer in the Fog"}
369 | {"sample": "Milla Jovovich"}
370 | {"sample": "Lecce"}
371 | {"sample": "\"Best Cutting Edge Film\" at the 2008 San Diego Film Festival, \"Audience Choice -- Feature-Length Narrative Film\" at the 2008 Cinema"}
372 | {"sample": "German"}
373 | {"sample": "Station for Two"}
374 | {"sample": "Inverkeithing, Fife, Scotland"}
375 | {"sample": "1234"}
376 | {"sample": "25 July 1666"}
377 | {"sample": "Special Delivery (1927 film)"}
378 | {"sample": null}
379 | {"sample": "Abu Dhabi"}
380 | {"sample": "Prenzlau"}
381 | {"sample": "Pozna\u0144"}
382 | {"sample": "The context does not provide information about the birthplace of Giorgio Ordelaffi, the father of Teobaldo II Ordelaffi."}
383 | {"sample": "Toronto, Ontario"}
384 | {"sample": "Forbidden Daughters"}
385 | {"sample": "Yes"}
386 | {"sample": "Robert Wulnikowski"}
387 | {"sample": "Sandflow"}
388 | {"sample": "From Corleone to Brooklyn"}
389 | {"sample": "Yes"}
390 | {"sample": "Vienna"}
391 | {"sample": "Johnny Ekstr\u00f6m"}
392 | {"sample": "Yes"}
393 | {"sample": "English"}
394 | {"sample": "complications of Parkinson's disease"}
395 | {"sample": "Piers de Geneville"}
396 | {"sample": "Albertus Seba"}
397 | {"sample": "Ferdinand I of the Two Sicilies"}
398 | {"sample": "Prince of Arcadia"}
399 | {"sample": "Milan"}
400 | {"sample": "K\u00f6ln"}
401 | {"sample": "Salma Hayek"}
402 | {"sample": "7th Solvay Conference"}
403 | {"sample": "Metropolitan Borough of Knowsley"}
404 | {"sample": "Sebastian Cabot"}
405 | {"sample": "Elizabeth Ashley"}
406 | {"sample": "16 February 2010"}
407 | {"sample": "Griffin O'Neal"}
408 | {"sample": "Three times"}
409 | {"sample": "1946"}
410 | {"sample": "ATS-6"}
411 | {"sample": "James Meredith"}
412 | {"sample": "University of Cambridge"}
413 | {"sample": "Tom Hood"}
414 | {"sample": "Jennifer Connelly"}
415 | {"sample": "Orange River"}
416 | {"sample": "Rialto Bridge"}
417 | {"sample": "January 3, 1947"}
418 | {"sample": "Clifton College; grain trade in London; Vassar College; The Juilliard School"}
419 | {"sample": "Malko Tarnovo Municipality"}
420 | {"sample": "Nicole and Natalie Albino's names."}
421 | {"sample": "University of Toronto"}
422 | {"sample": "1984"}
423 | {"sample": "Food and Drug Administration (FDA)"}
424 | {"sample": "Francisco Guterres"}
425 | {"sample": "Edward Pakenham"}
426 | {"sample": "Ocala"}
427 | {"sample": "Thomas Bach"}
428 | {"sample": "1964"}
429 | {"sample": "Jos\u00e9 Ramos-Horta"}
430 | {"sample": "There is currently no queen of England as the reigning monarch is King Charles III following the death of Queen Elizabeth II."}
431 | {"sample": "1039"}
432 | {"sample": "Finding a potential buyer for all of its operations in Finland and doing similar market research concerning Swedish operations."}
433 | {"sample": "Keturah"}
434 | {"sample": "South Central Coast"}
435 | {"sample": "Michael Bubl\u00e9"}
436 | {"sample": "BEC Recordings"}
437 | {"sample": "Gustav Vasa"}
438 | {"sample": "TML Entertainment"}
439 | {"sample": "Jos\u00e9 Ramos-Horta"}
440 | {"sample": "Three times."}
441 | {"sample": "Chilton"}
442 | {"sample": "Fort Davis"}
443 | {"sample": "Elisabeth Shue played the girlfriend of Michael J. Fox's character, Marty McFly. Michael J. Fox also played Marty McFly's daughter,"}
444 | {"sample": "Job 19:20"}
445 | {"sample": "Appleton"}
446 | {"sample": "Three times."}
447 | {"sample": "Orange River"}
448 | {"sample": "Common Sense was an important work because it crystallized sentiment for independence in 1776, contributing significantly to spreading the idea of republicanism, bolstering enthusiasm for"}
449 | {"sample": "Pearl River County"}
450 | {"sample": "The African Queen"}
451 | {"sample": "The African Queen"}
452 | {"sample": "Manchester City"}
453 | {"sample": "Golestan province, Iran"}
454 | {"sample": "John D. Loudermilk"}
455 | {"sample": "National Institute of Dramatic Art (NIDA)"}
456 | {"sample": null}
457 | {"sample": "River Thames"}
458 | {"sample": "January 3, 1947"}
459 | {"sample": "Neil Patrick Harris as Nightwing"}
460 | {"sample": "1989"}
461 | {"sample": "P\u00e9cs"}
462 | {"sample": "Matthew Lawrence"}
463 | {"sample": "Matthew Lawrence"}
464 | {"sample": "20 January 1906"}
465 | {"sample": "Richard Stallman"}
466 | {"sample": "Egyptian pantheon"}
467 | {"sample": "Calvin Coolidge was born on July 4, 1872."}
468 | {"sample": null}
469 | {"sample": "Sire Records"}
470 | {"sample": "Charles University in Prague"}
471 | {"sample": "10.75 years"}
472 | {"sample": "Xanana Gusm\u00e3o"}
473 | {"sample": "Georgetown"}
474 | {"sample": "National Football League (NFL)"}
475 | {"sample": "Alexander Lindsay, 4th Earl of Crawford"}
476 | {"sample": "Fleur-de-lis"}
477 | {"sample": "Greek mythology"}
478 | {"sample": "College of Charleston"}
479 | {"sample": "Cuyahoga County"}
480 | {"sample": "Clatskanie, Oregon"}
481 | {"sample": "Manhattan Project"}
482 | {"sample": "The Bourne Deception picks up where The Bourne Sanction left off."}
483 | {"sample": "Bill Gates was named the richest man in the world by Forbes' annual list of the world's billionaires. This was the 16th time that the founder of"}
484 | {"sample": "Aaron Benward"}
485 | {"sample": "There is no information provided in the given context about the occurrence of plague in the place where the composer of Bajazet was born."}
486 | {"sample": "1986"}
487 | {"sample": "Johanne Luise P\u00e4tges"}
488 | {"sample": null}
489 | {"sample": "NFL"}
490 | {"sample": "27 September 1879"}
491 | {"sample": "Amy Poehler"}
492 | {"sample": "60-70%"}
493 | {"sample": "August 3, 1769"}
494 | {"sample": "Casa Loma"}
495 | {"sample": "Min Zhou and Carl L. Bankston III"}
496 | {"sample": "Meg Ryan"}
497 | {"sample": "Andrew Scheer"}
498 | {"sample": "The context does not provide specific information about the local time at which the country where Fuser and Alberto meet the indigenous couple traveling to look for work changes their clocks"}
499 | {"sample": "1982"}
500 | {"sample": "20 January 1906"}
501 | {"sample": "2005"}
502 | {"sample": "Benny Beaver"}
503 | {"sample": "Jennifer Connelly"}
504 | {"sample": "Laurence Olivier"}
505 | {"sample": null}
506 | {"sample": "Jacek Majchrowski"}
507 | {"sample": "Cook County"}
508 | {"sample": "ATS-6"}
509 | {"sample": "Hunter River"}
510 | {"sample": "1,362,359"}
511 | {"sample": "St. Paul"}
512 | {"sample": "Susilo Bambang Yudhoyono"}
513 | {"sample": "There is no information provided in the context about the child of the male star of the 1921 film \"Money.\""}
514 | {"sample": "Fleur-de-lis"}
515 | {"sample": "Tanzania"}
516 | {"sample": "Josh Groban"}
517 | {"sample": "Marvin Kenneth Epp was a member of the Conservative Party of Canada."}
518 | {"sample": "Atlantic Ocean"}
519 | {"sample": "Ben Affleck played Fred O'Bannion in Dazed and Confused."}
520 | {"sample": "Nightwing"}
521 | {"sample": "Emile de Becque in South Pacific"}
522 | {"sample": "Belgrade"}
523 | {"sample": "The mountains south of Lesotho"}
524 | {"sample": "Jennifer Parker"}
525 | {"sample": "Tom Kenny"}
526 | {"sample": "280\u2013272 BC"}
527 | {"sample": "The owner of MV Miralda had not announced any specific actions in April 2010 according to the given context. The context provided does not mention any particular"}
528 | {"sample": "15 February 1942"}
529 | {"sample": "Jos\u00e9 Ramos-Horta"}
530 | {"sample": null}
531 | {"sample": "Ko Phi Phi Le"}
532 | {"sample": "Ilyasah Shabazz"}
533 | {"sample": "Kenton County"}
534 | {"sample": "Silver Bear for Best Actor"}
535 | {"sample": "1 January 1986"}
536 | {"sample": "A&M Records (Canada) and Koch Entertainment (Europe)"}
537 | {"sample": "2014"}
538 | {"sample": "Barton Distillery"}
539 | {"sample": "Zhejiang"}
540 | {"sample": "Karen Fairchild"}
541 | {"sample": "Casa Loma"}
542 | {"sample": "Lana Wood"}
543 | {"sample": "Tunis"}
544 | {"sample": "After the Second World War, Zakowski fled from Prussia with his mother and four siblings initially to Dortmund then Hamburg and finally the family settled in Niederz"}
545 | {"sample": "Charles County"}
546 | {"sample": "2015"}
547 | {"sample": "15 consecutive wins including friendlies"}
548 | {"sample": "Salma Hayek"}
549 | {"sample": "Bobby Brown plays Officer Bobby Brown."}
550 | {"sample": "1986"}
551 | {"sample": "1920"}
552 | {"sample": "Rabbi Menachem Mendel Schneersohn"}
553 | {"sample": "Josh Groban"}
554 | {"sample": "Bruce L. Davis"}
555 | {"sample": "20 March 2005"}
556 | {"sample": "Francis Bacon"}
557 | {"sample": "The Orange River"}
558 | {"sample": "Liverpool F.C."}
559 | {"sample": "Tunis, Tunisia"}
560 | {"sample": "The context does not provide information on the number of people in British Colonies related to the origin of the London broil cut."}
561 | {"sample": "Jennifer Connelly"}
562 | {"sample": "Gwinnett County"}
563 | {"sample": "three different relationships"}
564 | {"sample": "Minsk Region"}
565 | {"sample": "Bill Pullman"}
566 | {"sample": "Indonesia"}
567 | {"sample": "Cabo Delgado Province"}
568 | {"sample": "Association for Computing Machinery (ACM)"}
569 | {"sample": "Falls County"}
570 | {"sample": "Nuevo Laredo Municipality"}
571 | {"sample": "Sebastian Cabot"}
572 | {"sample": "Colin Firth"}
573 | {"sample": "France"}
574 | {"sample": "Ko Phi Phi Le"}
575 | {"sample": "Billie Jean King Cup"}
576 | {"sample": "Xanana Gusm\u00e3o"}
577 | {"sample": "Colin Firth"}
578 | {"sample": "River Thames"}
579 | {"sample": "Tony Award and a Drama Desk Award for Outstanding Revival of a Play."}
580 | {"sample": "University of Glasgow"}
581 | {"sample": "Louis Chedid"}
582 | {"sample": "To attend the conclave in Rome."}
583 | {"sample": "Book Trust"}
584 | {"sample": "M\u00e1ty\u00e1s Sz\u0171r\u00f6s"}
585 | {"sample": "Nieuwe Maas River"}
586 | {"sample": "Karen Fairchild"}
587 | {"sample": "City University of New York system"}
588 | {"sample": "Madison"}
589 | {"sample": "James Meredith"}
590 | {"sample": null}
591 | {"sample": "Hollywood Records"}
592 | {"sample": "Cold Crush Records"}
593 | {"sample": null}
594 | {"sample": "2010"}
595 | {"sample": "1572"}
596 | {"sample": "Yaroslavl Oblast"}
597 | {"sample": "Qatar Stars League"}
598 | {"sample": "The kingdom that Heptarchy was part of was not abolished; Heptarchy refers to the period in early English history (approximately the 5th\u201311"}
599 | {"sample": "$72,641"}
600 | {"sample": "Santa Clara County"}
601 |
--------------------------------------------------------------------------------
/real_world_long/results/gpt4-turbo/sample_LongBench_output_64.jsonl:
--------------------------------------------------------------------------------
1 | {"sample": "South West Ultras fan club"}
2 | {"sample": "No."}
3 | {"sample": "Low temperature scanning tunneling microscopy and spectroscopy (STM/STS)."}
4 | {"sample": "An ICD, or Implantable Cardioverter Defibrillator, is a surgically implanted electronic device that treats life-threatening heartbeat irregularities."}
5 | {"sample": "The sides of the fuselage are sloped (tumbled home) to form a conical section, which is necessary for the fuselage to have the correct aerodynamic shape and structural integrity. If the sides were not sloped, the section formed would be cylindrical, which would not match the design specifications of the aircraft"}
6 | {"sample": "The proximity of superconductivity tends to suppress the Kondo effect if the superconducting energy gap becomes larger than the relevant Kondo temperature. However, the non-local pairing induced by the superconducting proximity effect, which gives rise to the crossed Andreev reflections (CAR) exchange, can be the sole cause"}
7 | {"sample": "Electricity is used in everyday life for lighting, heating, cooling, telecommunications, computing, transportation (such as electric buses and trains, and electric cars), household appliances, industrial machinery, and entertainment systems. It is also used in medical equipment, sanitation, and infrastructure systems."}
8 | {"sample": "Vice Admiral"}
9 | {"sample": "15\u20133"}
10 | {"sample": "Vitamins K3, K4, and K5."}
11 | {"sample": "Yes, individual molecules of indeno[1,2-a]fluorene can switch between open-shell and closed-shell states."}
12 | {"sample": "3-D printing and software development."}
13 | {"sample": "The current daily AIs for vitamin K for adult women and men are 90 \u03bcg and 120 \u03bcg respectively."}
14 | {"sample": "watt"}
15 | {"sample": "The main advantage of a horizontal business model for mobile devices is flexibility; as costs drop and the market expands, it allows vendors to offer a wide range of phones based on radically different SoCs, avoiding competition with customers or suppliers."}
16 | {"sample": "Jacob C. Landau"}
17 | {"sample": "The main methodology used in the research is an unsupervised method based on the information bottleneck to capture both referential complexity and task-specific utility to adequately explore sparse social communication scenarios in multi-agent reinforcement learning (MARL). This includes the development of a natural-language-inspired lexicon of messages through emergent communication, alignment of"}
18 | {"sample": "The function beta(r) is determined by using the simplified form of the vacuum Einstein equations as applied to a test ball in free fall, leading to a differential equation for beta(r) that is solved using the method of separation of variables. The initial condition for infalling observers and the Newtonian limit at large distances from the"}
19 | {"sample": "70-75 metres"}
20 | {"sample": "Other toxic products."}
21 | {"sample": "Brisbane; Cleveland, Queensland, Australia"}
22 | {"sample": "L = 6, L = 8, and L = 14."}
23 | {"sample": "Five experiments."}
24 | {"sample": "Lasa, Gitastrophe, and Shadoks"}
25 | {"sample": "The court in In re Ferguson concluded that the method claims were neither tied to a particular machine or apparatus nor did they transform a particular article into a different state or thing."}
26 | {"sample": "As the rate of environmental transition increases, the learning rate initially rises, reaching a peak, and then begins to decline slowly, eventually reaching zero."}
27 | {"sample": "The study focuses on the subset of ultracold neutral plasmas that form via kinetic rate processes from state-selected Rydberg gases, and emphasizes in particular the distinctive dynamics found in the evolution of molecular ultracold plasmas."}
28 | {"sample": "Wearable sensors and ambient sensors."}
29 | {"sample": "The decision to base the water rates on usage was out of fairness."}
30 | {"sample": "Mary Magdalene went and announced to the disciples, \u201cI have seen the Lord\u201d; and she told them that he had said these things to her."}
31 | {"sample": "The future direction mentioned in the conclusion includes applying the results to non-Japanese EHRs, verifying other meta-information such as the patient's gender, age, race, religion, and used EHR system, developing a robust and flexible research infrastructure for large scale cross-sectional studies, and addressing the challenge of conducting human evaluations"}
32 | {"sample": "The authors used simulated data for the stationary experiment and real data of a wireless MISO channel acquired in a realistic indoor scenario for the tracking experiment."}
33 | {"sample": "The baseline in the layout procedure serves as a reference for the midpoint of the firewall for the developed (and true dimensioned) side panel."}
34 | {"sample": "The new Iraqi Body Count organization provides a count of civilian deaths."}
35 | {"sample": "The main advantage of the proposed method in terms of computation time is that the time required to update the robot's belief does not increase with the complexity of the environment, which makes the method suitable for online applications."}
36 | {"sample": "March 2002, end of this decade."}
37 | {"sample": "Four"}
38 | {"sample": "A solution $u$ of $-\\Delta u = f(u)$ in ${\\mathbb{R}}^N$ is stable provided\n\\[ \\int f'(u) \\psi^2 \\le \\int | \\nabla \\psi|^2, \\qquad \\forall \\psi \\in C"}
39 | {"sample": "The normalized least mean square (NLMS) algorithm."}
40 | {"sample": "Del Bigtree and his team at ICAN"}
41 | {"sample": "C$_2$H"}
42 | {"sample": "1964"}
43 | {"sample": "The relationship between the maximum velocity (\\(V\\)) and the amplitude of the blob or depletion (\\(\\triangle n\\)) is initially linear for small amplitudes (\\(|\\triangle n/n_0| < 1\\)), and transitions to a square root dependence for larger amplitudes in incompressible flows"}
44 | {"sample": "Dendritic spines are rich in actin."}
45 | {"sample": "14,520 attendees"}
46 | {"sample": "Justice Kennedy acknowledged that the Internet has caused far-reaching systemic and structural changes in the economy and therefore \"Quill now harms States to a degree far greater than could have been anticipated earlier.\" He concluded with the wish that \"the legal system should find an appropriate case for this Court to reexamine Quill and Bellas"}
47 | {"sample": "Environmental fluctuation and uncertainty, the details of the task the artificial organisms are aiming to solve, and the interaction of learned and static network connectivity."}
48 | {"sample": "The problem encountered when building the fuselage sides is that when the completed fuselage sides are laid into position to form the fuselage box section, the once straight longerons bow up from the building surface, forming an unsatisfactory \"banana\" shape. This is especially problematic when using preformed fiberglass parts, as this"}
49 | {"sample": "January 1929"}
50 | {"sample": "48V"}
51 | {"sample": "June 1, 1999"}
52 | {"sample": "Mufti-e-Azam-e-Hind received Khilafat in the Qaderi, Chishti, Nakshbandi, Suharwardi, and Madaari Orders."}
53 | {"sample": "Privacy concerns, skepticism about the program's effectiveness, and the potential cost to taxpayers."}
54 | {"sample": "Wallace"}
55 | {"sample": "30,223"}
56 | {"sample": "2013"}
57 | {"sample": "23 September 2017"}
58 | {"sample": "Long Term Capital Management (LTCM)"}
59 | {"sample": "$m \\sim t^{1/2}$"}
60 | {"sample": "2 meters by 2 meters"}
61 | {"sample": "The PLM with decimation method outperforms the PLM with $l_2$ regularization and a mean-field-based method, particularly in problems with sparse interaction networks. It provides a more precise estimate of the total number of bonds and results in a lower reconstruction error, indicating a higher quality of network reconstruction. The"}
62 | {"sample": "Exegetical, Theological, Homiletical"}
63 | {"sample": "February 15, 2010"}
64 | {"sample": "Spanish BERT base"}
65 | {"sample": "The infall rate is 2-5 times smaller and the gas density is 2-5 times smaller in the region close to the black hole compared to non-magnetized accretion."}
66 | {"sample": "Toby Schindelbeck has observed that issues like Corporate Personhood and the \u201csingle-use\u201d plastic bag ban have drawn fairly small crowds \u2013 he estimates 25 \u2013 30 people, and the city has acted on these issues with only that small fraction of the population in support."}
67 | {"sample": "Peter Denning"}
68 | {"sample": "The conduction gap is strongly dependent on the direction of applied strain. It has peaks at certain strain directions and is zero at others. For tensile strain, the conduction gap peaks occur at strain directions of 0 or 90 degrees, while it is zero at approximately 47 and 133 degrees. For compress"}
69 | {"sample": "7 March 2023"}
70 | {"sample": "V\u2212, V0, V+"}
71 | {"sample": null}
72 | {"sample": "Jivani Street 2, Malatia-Sebastia District, Yerevan."}
73 | {"sample": "NFPA and FPSA both vastly outperformed GMRES and DSA in runtime and iterations for all cases by orders of magnitude."}
74 | {"sample": "Technological limitations, resistance amongst archaeologists to making data available due to concerns about scrutiny and lost opportunities for analysis, loss of 'capital' of data, control over data presentation, emphasis on the creation of new data, and informal data sharing practices relying on personal networks."}
75 | {"sample": "Physics, biology, social sciences, finance, and neuroscience."}
76 | {"sample": "4.5$\\times$10$^{8}$\\,cm"}
77 | {"sample": "VC-10 Squadron"}
78 | {"sample": "The bigger the receptive field size, the more successful the algorithm is in filling in the gaps."}
79 | {"sample": "The interlayer Berry connection polarizability (BCP) is significant because it arises from layer hybridization of electrons by the twisted interlayer coupling and is the band geometric origin of the crossed nonlinear dynamical Hall effect in twisted bilayers. It measures the momentum space curl of the interlayer BCP over occupied states,"}
80 | {"sample": "Yes"}
81 | {"sample": "Legacies of Losing in American Politics, with Nicole Mellow (University of Chicago Press, Fall 2017)"}
82 | {"sample": "A media application determines the context of an event by using a content-recognition module or algorithm to analyze data associated with the event, such as spoken words, actions, facial gestures, and the relationships between them. It may also use object recognition techniques, speech recognition techniques, and cross-reference the data with databases indicating different contexts"}
83 | {"sample": "Gene transfer to hemophilia A mice via oral delivery of FVIII-chitosan nanoparticles"}
84 | {"sample": "FC Banants"}
85 | {"sample": "The conclusion of the study was that fruit consumption may provide a protective effect for mercury exposure in Amazonian riparians. Prevention strategies that seek to maintain fish consumption while reducing mercury exposure in fish-eating communities should be pursued."}
86 | {"sample": "The scoring engine queries new content items based on the channel category and at least one other channel attribute. It retrieves candidate content items that include the channel category and the other channel attribute. The scoring engine then compares the candidate content items to a model to determine their relevance and interest to the user, scores them, and generates a"}
87 | {"sample": "2013\u20132014"}
88 | {"sample": "More than 120 novels"}
89 | {"sample": "The proposed approach in the research paper is a probabilistic interpretation of the least-mean-square (LMS) filter, which results in an adaptable step-size LMS algorithm that performs well in both stationary and tracking scenarios. The approach approximates the posterior distribution of the parameter vector with an isotropic Gaussian distribution, leading to a"}
90 | {"sample": "Yes"}
91 | {"sample": "Anemia, bruising, nosebleeds, bleeding of the gums, and heavy menstrual bleeding in women."}
92 | {"sample": "172"}
93 | {"sample": "Mobile device management (MDM) systems are usually referred to as systems that support centralized control of an entire fleet of mobile devices (smartphones and tablets) and mobile applications by applying and ensuring pre-defined configuration settings."}
94 | {"sample": "BERT, RoBERTa, ELECTRA, GPT-2, and XLM-RoBERTa."}
95 | {"sample": "Broadjam's servers may not be used for illegal activities, excessive overloading, as a source or intermediary for mail bombs, Internet packet flooding, packet corruption, denial of service, or any other abusive activities. Server hacking or security breaches are strictly prohibited. Hosting Subscriber's Website cannot be used as an anonymous gateway, and"}
96 | {"sample": "The vacuum processing system is configured with multiple vacuum processing apparatus arranged in parallel."}
97 | {"sample": "21"}
98 | {"sample": "1425 $\\mu_{B}$"}
99 | {"sample": "Nothing can give a solid proof of the existence of heaven and hell, yet, nothing can disprove it either."}
100 | {"sample": "BC will not leave Boston."}
101 | {"sample": "The benefits of using binary variables in the SLAS formulation include reduced computational complexity, faster solution finding, and the ability to transform integer decision variables into a format that simplifies the optimization problem, leading to improved computational efficiency."}
102 | {"sample": "Users can go for troubleshooting and support to the QuecPython community at https://forumschinese.quectel.com/c/function-subjects/quectpython/43, contact online support via QQ group 445121768, or visit the official website documentation at https://python.quectel.com/doc/"}
103 | {"sample": "Severe anemia that begins even before birth, most affected babies do not survive to be born or die shortly after birth."}
104 | {"sample": "October 2001"}
105 | {"sample": "Smartphones are more compact and power constrained, with SoCs limited to around 1W due to battery and thermal dissipation constraints, and they require a cellular modem, often integrated into the SoC for cost-sensitive models. Tablets, on the other hand, have a higher power budget for SoCs, up to"}
106 | {"sample": "The sticking points are about how much spending to cut and over lightning rod issues like regulating greenhouse gases and abortion."}
107 | {"sample": "The Director of Town and Country Planning appointed under the act is responsible for carrying out the functions assigned under the act."}
108 | {"sample": "The framework captures the reduced-order dynamics by employing a propagator in the latent space to capture the dynamics, using complex valued latent variables treated independently, and solving a continuous ODE in the complex plane for each dimension of the latent variable."}
109 | {"sample": "Keep deploying and harvesting your bases to earn XP (experience points), proving your loyalty to your faction."}
110 | {"sample": "\\begin{equation}\n\\Gamma_e = \\frac{e^2}{4\\pi \\epsilon_0 a_{ws}}\\frac{1}{k_B T_e}\n\\end{equation}"}
111 | {"sample": "By pressing \u2018SKIP\u2019."}
112 | {"sample": "6 years"}
113 | {"sample": "C-295"}
114 | {"sample": "Permanent yellow spot damage on the screen."}
115 | {"sample": "Craig wants to find his own place because the lady he is renting a room from smokes, and he wants to avoid that."}
116 | {"sample": "An ALPHABETICAL LIST OF THE NAMES and PLACES of ABODE of the MERCHANTS and PRINCIPAL TRADERS of the Cities of LONDON and WESTMINSTER, the Borough of SOUTHWARK, and their Environs, with the Number affixed to each House."}
117 | {"sample": "At dawn or dusk, the high resolution becomes a bit less on what we focus on so that what's off to the left or right can be better noted in the dim light."}
118 | {"sample": "The group's request to the Connecticut DEEP Commissioner is to appoint a blue ribbon commission to conduct research and develop a management plan for Whalebone Cove, and either deny or defer approval on any applications for new docks in the Cove until the management plan can be developed and implemented."}
119 | {"sample": "Power-law functions"}
120 | {"sample": "The correct expression for the derivative of the function $f\\left(x\\right)$ is:\n\\[\nf'\\left(x\\right) = \n\\begin{cases} \n\\frac{6x^2\\cos{\\left(x^2\\right)}+\\sin{\\left(x^2\\right)}}"}
121 | {"sample": "John F. Kennedy Profiles in Courage Award"}
122 | {"sample": "The paper aims to solve the problem of computing transient responses of nonlinear oscillators under arbitrary irregular excitations based on a combination of a pole-residue operation and Volterra series."}
123 | {"sample": "Fuller's Ranch"}
124 | {"sample": "normalized least mean square (NLMS) algorithm"}
125 | {"sample": "Accounting for path preference in the robot's belief update allows for a more gradual decrease in entropy over the goal distribution, avoiding overconfidence in incorrect predictions about the user's preferences, which leads to improved system performance."}
126 | {"sample": "An open-shell \u03c0-diradical state and a closed-shell state with a para-quinodimethane moiety."}
127 | {"sample": "The main focus of the research paper is to determine the nuclear liquid-gas transition in the strong coupling regime of lattice QCD with staggered quarks, specifically the first order transition at low temperatures as a function of the quark mass and the inverse gauge coupling \u03b2, and to discuss the nuclear interactions as a function of"}
128 | {"sample": "\u03b3_h = 1.5"}
129 | {"sample": "William Rokeby"}
130 | {"sample": "Thalassemias are classified according to the globin that is affected, hence the names alpha thalassemia and beta thalassemia."}
131 | {"sample": "McPherson"}
132 | {"sample": "1870"}
133 | {"sample": "1. \u4f7f\u7528\u5b89\u88c5\u5305\u5b89\u88c5\u5373\u53ef\u3002\n2. \u6253\u5f00\u7a0b\u5e8f\u540e\uff0c\u4f1a\u663e\u793a\u4ee5\u4e0b\u754c\u9762\uff0c\u7528\u6237\u53ef\u4ee5\u67e5\u770b\u8d26\u6237\uff08\u9ed8\u8ba4\u521b\u5efa10\u4e2a\u8d26\u6237\uff09\u3001\u533a\u5757\u3001\u4ea4\u6613\u548c\u65e5\u5fd7\u3002\n3. \u70b9\u51fb\u201c\u8bbe\u7f6e\u201d\uff0c\u7528\u6237\u8fd8\u53ef\u4ee5\u8bbe\u7f6e\u7ed1\u5b9a"}
134 | {"sample": "Improves its performance."}
135 | {"sample": "C-GDBN (Coupled Generalized Dynamic Bayesian Network)"}
136 | {"sample": "Potential applications of ferromagnetic semiconductors include spin injection into non-magnetic semiconductors, electrical manipulation of carrier-induced magnetism in magnetic semiconductors, and the fabrication of nanodevices such as memory nanodots or nanochannels for spin injection."}
137 | {"sample": "62"}
138 | {"sample": "1766"}
139 | {"sample": "The main topic of the text is the situation in Iraq, including the formation of a new government, the plight of Iraqi Christians and refugees, the celebration of Christmas by US service members and Iraqi Christians under threat, and various opinions and reports on the political and social conditions in Iraq."}
140 | {"sample": "The potential of SNNs in modeling the visual system lies in their ability to achieve higher neural representation similarity scores compared to CNNs with the same depth and almost the same architectures, and their more biologically plausible mechanisms of encoding information with spikes and capturing temporal features, which could provide a better understanding of the functional hierarchy and"}
141 | {"sample": "Deputy Prime Minister and Minister of Finance"}
142 | {"sample": "85.61%"}
143 | {"sample": "The specific-heat ratio affects the average motion of the bubble such that the smaller the specific-heat ratio, the slower the average motion of the bubble. In the shock compression stage, the specific-heat ratio contributes little to the average motion, but after the shock wave passes through the bubble, a larger specific-heat"}
144 | {"sample": "Kansas"}
145 | {"sample": "O (t, L_{\\parallel} ; S_\\Delta) = L_{\\parallel}^{-\\beta/[\\nu(1+\\Delta)]} \\tilde f_O (t/L_{\\parallel}^{z/(1+\\Delta)} ; S_\\Delta).\\quad"}
146 | {"sample": "The relationships between catch per set and fishing behavior variables differ for different measures of CPUE in that there is a positive relationship between unstandardized CPUE (individuals per set) and number of hooks, number of sets, and engine power. However, when CPUE is standardized by hook number (individuals per"}
147 | {"sample": "Romance novels and women's fiction"}
148 | {"sample": "No."}
149 | {"sample": "URPC2017, URPC2018, URPC2019, URPC2020$_{ZJ}$, URPC2020$_{DL}$, UDD"}
150 | {"sample": "The research opportunity mentioned is studying the effects of Brazilian Jiu Jitsu and psychotherapy on helping people with autism develop subjective awareness of others. The study requires participants to be between 7-21 years of age with a diagnosis of Autism Spectrum Disorder, enroll in an approved Jiu Jitsu Academy and attend at least two"}
151 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.38.1
2 | vllm==0.3.2
3 | ray==2.11.0
4 |
--------------------------------------------------------------------------------
/short_tasks/README.md:
--------------------------------------------------------------------------------
1 | ## Short-Context Tasks
2 |
3 | The following guidance will help you reproduce our results on short-context tasks.
4 |
5 | ### Results on GSM8K, MATH and CSQA
6 |
7 | We leverage the evaluation data scripts in [LEMA](https://github.com/microsoft/LEMA/).
8 | The few-shot examples for GSM8K and MATH are chosen from their respective training sets according to input similarity.
9 |
10 | **Step 1: Inference with vLLM.**
11 |
12 | The test data in `./prompts/` have been formatted into the system template for FILM-7B.
13 | ```bash
14 | # Extract Data: nothing to extract here, the prompts in ./prompts/ are already plain .jsonl files
15 |
16 | # Inference
17 | export NCCL_IGNORE_DISABLED_P2P=1
18 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
19 | --testdata_file gsm8k_8shot.jsonl \
20 | --testdata_folder ./prompts/ \
21 | --output_folder ./results/FILM-7B/ \
22 | --max_length 2048 \
23 | --tensor_parallel_size 8
24 |
25 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
26 | --testdata_file math_4shot.jsonl \
27 | --testdata_folder ./prompts/ \
28 | --output_folder ./results/FILM-7B/ \
29 | --max_length 2048 \
30 | --tensor_parallel_size 8
31 |
32 | python ../vllm_inference/vllm_inference.py --model_path In2Training/FILM-7B \
33 | --testdata_file csqa_0shot.jsonl \
34 | --testdata_folder ./prompts/ \
35 | --output_folder ./results/FILM-7B/ \
36 | --max_length 128 \
37 | --tensor_parallel_size 8
38 | ```
39 |
40 | We provide our generation results in `./results/`, including FILM-7B and Mistral-7B-Instruct-v0.2.
41 |
42 | **Step 2: Evaluation.**
43 |
44 | Run `evaluation.py` to calculate evaluation metrics on different tasks.
45 | ```bash
46 | python evaluation.py
47 | ```
48 |
49 |
50 | ### Results on Other Tasks
51 |
52 | We use [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate on MMLU, BoolQ, RACE-H, ARC-C, and HellaSwag.
53 | The results may vary slightly across different versions of lm_eval.
54 |
55 |
--------------------------------------------------------------------------------
/short_tasks/evaluation.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | import json
4 | import pdb
5 |
6 | from utils import get_final_result_gsm8k, get_final_result_math, get_csqa_match
7 |
8 |
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of being passed to ``json.loads``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def check_label_result(label_result, task):
    """Raise ``ValueError`` when no usable reference answer was extracted.

    Mirrors the original sanity checks (not ``None``, not ``''``, not ``0``)
    but uses an explicit exception so the check survives ``python -O``.
    """
    if label_result is None or label_result == '' or label_result == 0:
        raise ValueError(
            'could not extract a reference answer for task {task}: {result!r}'.format(
                task=task, result=label_result))


def score_example(task, label_info, pred_info):
    """Score one prediction against its reference for the given task.

    Args:
        task: Task name; must contain 'gsm8k', 'math' or 'csqa'.
        label_info: Record from the prompt file ('completion' for GSM8K/MATH,
            'answer' and 'candidates' for CSQA).
        pred_info: Record from the result file; the first entry of 'samples'
            is the prediction that gets scored.

    Returns:
        1 or 0 for GSM8K/MATH, or whatever ``get_csqa_match`` returns for CSQA.

    Raises:
        ValueError: If the task is unknown or the reference is unusable.
    """
    if 'gsm8k' in task:
        pred_result = get_final_result_gsm8k(pred_info['samples'][0])
        label_result = get_final_result_gsm8k(label_info['completion'])
        check_label_result(label_result, task)
        return 1 if pred_result == label_result else 0

    if 'math' in task:
        pred_result = get_final_result_math(pred_info['samples'][0])
        label_result = get_final_result_math(label_info['completion'])
        # Validate the reference BEFORE using it in the membership test below.
        check_label_result(label_result, task)
        # Thousands separators in the prediction are ignored unless the
        # reference itself contains a comma.
        # NOTE(review): get_final_result_math lives in utils; the isinstance
        # guard assumes it may return a non-string when nothing can be
        # extracted, which is then simply scored as wrong - confirm.
        if isinstance(pred_result, str) and ',' not in label_result:
            pred_result = pred_result.replace(',', '')
        return 1 if pred_result == label_result else 0

    if 'csqa' in task:
        label = label_info['answer']
        candidates = label_info['candidates']
        if label not in candidates:
            raise ValueError(
                'CSQA answer {label!r} is not among the candidates'.format(label=label))
        return get_csqa_match(pred_info['samples'][0], label, candidates)

    # Fail loudly instead of dropping into an interactive debugger, which
    # would hang any non-interactive run.
    raise ValueError('unknown task: {task!r}'.format(task=task))


def mean_accuracy(scores):
    """Return the mean of ``scores``, or 0.0 when there are no scores."""
    if not scores:
        return 0.0
    return sum(scores) / len(scores)


if __name__ == "__main__":

    models = ['FILM-7B', 'Mistral-7B-Instruct-v0.2']
    tasks = ['gsm8k_8shot', 'math_4shot', 'csqa_0shot']

    for model in models:
        print(model)
        for task in tasks:
            print(task)

            file_label = './prompts/{task}.jsonl'.format(task=task)
            file_pred = './results/{model}/sample_{task}.jsonl'.format(model=model, task=task)

            label_infos = load_jsonl(file_label)
            pred_infos = load_jsonl(file_pred)

            # zip() would silently truncate, so a length mismatch must be fatal.
            if len(label_infos) != len(pred_infos):
                raise ValueError(
                    '{n_label} labels in {file_label} but {n_pred} predictions in {file_pred}'.format(
                        n_label=len(label_infos), file_label=file_label,
                        n_pred=len(pred_infos), file_pred=file_pred))

            acc_list = [
                score_example(task, label_info, pred_info)
                for label_info, pred_info in zip(label_infos, pred_infos)
            ]

            print('acc:', mean_accuracy(acc_list))
81 |
82 |
--------------------------------------------------------------------------------
/short_tasks/utils.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | import string
4 | import pdb
5 |
6 |
# Maps an answer letter to the position of that option in the candidate list.
index_rechange = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5
}


def get_csqa_match(pred, label, candidates):
    """Score a multiple-choice prediction against the reference answer.

    The selected option is the first character of ``pred`` that is an option
    letter (a key of ``index_rechange``) AND indexes an existing entry of
    ``candidates``. Letters beyond the end of ``candidates`` are skipped, so a
    stray capital such as the 'F' in "For ..." no longer raises IndexError on
    a five-option question. If no usable letter is found, option 'A' is
    assumed.

    Args:
        pred: Generated text to scan for an option letter.
        label: Reference answer text (one of ``candidates``).
        candidates: Answer texts, ordered as options A, B, C, ...

    Returns:
        1 if the selected candidate equals ``label``, otherwise 0.
    """
    select_index = None
    for char in pred:
        option = index_rechange.get(char)
        if option is not None and option < len(candidates):
            select_index = option
            break
    if select_index is None:
        # No usable option letter in the prediction: fall back to 'A'.
        select_index = index_rechange['A']

    return 1 if candidates[select_index] == label else 0
31 |
32 |
33 | def _fix_fracs(string):
34 | substrs = string.split("\\frac")
35 | new_str = substrs[0]
36 | if len(substrs) > 1:
37 | substrs = substrs[1:]
38 | for substr in substrs:
39 | new_str += "\\frac"
40 | if substr[0] == "{":
41 | new_str += substr
42 | else:
43 | try:
44 | assert len(substr) >= 2
45 | except:
46 | return string
47 | a = substr[0]
48 | b = substr[1]
49 | if b != "{":
50 | if len(substr) > 2:
51 | post_substr = substr[2:]
52 | new_str += "{" + a + "}{" + b + "}" + post_substr
53 | else:
54 | new_str += "{" + a + "}{" + b + "}"
55 | else:
56 | if len(substr) > 2:
57 | post_substr = substr[2:]
58 | new_str += "{" + a + "}" + b + post_substr
59 | else:
60 | new_str += "{" + a + "}" + b
61 | string = new_str
62 | return string
63 |
64 |
65 | def _fix_a_slash_b(string):
66 | if len(string.split("/")) != 2:
67 | return string
68 | a = string.split("/")[0]
69 | b = string.split("/")[1]
70 | try:
71 | a = int(a)
72 | b = int(b)
73 | assert string == "{}/{}".format(a, b)
74 | new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
75 | return new_string
76 | except:
77 | return string
78 |
79 |
80 | def _remove_right_units(string):
81 | # "\\text{ " only ever occurs (at least in the val set) when describing units
82 | if "\\text{ " in string:
83 | splits = string.split("\\text{ ")
84 | assert len(splits) == 2
85 | return splits[0]
86 | else:
87 | return string
88 |
89 |
90 | def _fix_sqrt(string):
91 | if "\\sqrt" not in string:
92 | return string
93 | splits = string.split("\\sqrt")
94 | new_string = splits[0]
95 | for split in splits[1:]:
96 | if split[0] != "{":
97 | a = split[0]
98 | new_substr = "\\sqrt{" + a + "}" + split[1:]
99 | else:
100 | new_substr = "\\sqrt" + split
101 | new_string += new_substr
102 | return new_string
103 |
104 |
def _strip_string(string):
    """Normalize a LaTeX answer string so equivalent answers compare equal.

    The steps run in a fixed order: drop line breaks and formatting-only
    commands, unify fraction commands, remove degree/dollar/percent markers
    and trailing units, add leading zeros to bare decimals, drop a short
    "x = " prefix, brace shorthand sqrt/frac arguments, remove spaces, and
    canonicalize "0.5" and plain "a/b" to a frac expression.

    Args:
        string: Raw answer string.

    Returns:
        The normalized string (possibly empty).
    """
    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right)
    string = _remove_right_units(string)

    # remove percentage
    # (A second replace of "\%" used to follow; that literal is an invalid
    # escape sequence and denotes the same two characters as "\\%", so it was
    # redundant and has been dropped.)
    string = string.replace("\\%", "")

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    parts = string.split("=")
    if len(parts) == 2 and len(parts[0]) <= 2:
        string = parts[1]

    # fix sqrt3 --> sqrt{3}
    string = _fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = _fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = _fix_a_slash_b(string)

    return string
173 |
def remove_boxed(s):
    r"""Return the contents of a ``\boxed{...}`` wrapper.

    Args:
        s: A string expected to look like ``\boxed{<content>}``. May be None
            or any other value when upstream extraction failed.

    Returns:
        ``<content>``, or None when ``s`` is not a string that starts with
        ``\boxed{`` and ends with ``}``.
    """
    left = "\\boxed{"
    # Explicit checks instead of assert + bare except: asserts are stripped
    # under `python -O`, which would let malformed input be sliced.
    if not isinstance(s, str):
        return None
    if not s.startswith(left) or not s.endswith("}"):
        return None
    return s[len(left):-1]
182 |
def last_boxed_only_string(string):
    """Return the last '\\boxed' expression in ``string``, wrapper included.

    '\\fbox' is searched only when no '\\boxed' is present. Starting at the
    command, braces are counted and the span ends at the closing brace that
    brings the count back to zero. Returns None when neither command occurs
    or no such closing brace is found.
    """
    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None

    depth = 0
    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return string[start:pos + 1]
    return None
209 |
210 |
def get_final_result_math(completion):
    """Extract the final answer from a math-task completion and normalize it.

    The answer is taken from the text after the last 'The answer is ' marker
    when that marker is present, otherwise from the last boxed expression
    found by last_boxed_only_string. The extracted text is then normalized
    with _strip_string.

    Args:
        completion: Generated (or reference) solution text.

    Returns:
        The normalized answer string, or '' when no answer could be extracted
        or normalization raised.
    """
    # Keep only the text before the first 'Q:' / 'Question:'
    # (presumably a follow-on question generated after the answer).
    if 'Q:' in completion:
        completion = completion.split('Q:')[0].strip()
    if 'Question:' in completion:
        completion = completion.split('Question:')[0].strip()
    # Fold the colon variant into the plain marker so one check covers both.
    if 'The answer is: ' in completion:
        completion = completion.replace('The answer is: ', 'The answer is ')


    if 'The answer is ' not in completion:
        # No marker: fall back to the last boxed expression. Both helpers
        # return None on failure, which is handled below.
        result = last_boxed_only_string(completion)
        result = remove_boxed(result)
    else:
        # Text after the last occurrence of the marker.
        result = completion.split('The answer is ')[-1].strip()

        if result == "":
            return ""

        # Drop one trailing period.
        if result[-1] == '.':
            result = result[:-1]

        # Drop dollar signs.
        result = result.replace('$', '').strip()

        # The stated answer may itself be boxed; unwrap it.
        if 'boxed{' in result:
            result = last_boxed_only_string(result)
            result = remove_boxed(result)


    if result is None:
        return ''

    if result == '':
        return ''

    # _strip_string and its helpers can raise on unexpected input
    # (e.g. a failed assertion); treat that as "no answer".
    try:
        result = _strip_string(result)
    except:
        return ''

    assert result is not None

    return result
253 |
254 |
def get_final_result_gsm8k(completion):
    """Extract the numeric final answer from a GSM8K-style completion.

    The answer is the first space-delimited token after the last
    'the answer is ' marker. Currency symbols, thousands separators, letters,
    quotes and a trailing percent sign are removed, and a simple arithmetic
    expression is evaluated.

    Args:
        completion: Generated (or reference) solution text. May be empty.

    Returns:
        The answer as a float, or 0.0 when no answer can be extracted or
        parsed. Empty completions, and completions made up only of '.' and
        newlines, return 0.0 (they previously raised IndexError).
    """
    # Keep only the text before a following question block.
    if '\n\nQ: ' in completion:
        completion = completion.split('\n\nQ: ')[0]
    if '\n\nQuestion: ' in completion:
        completion = completion.split('\n\nQuestion: ')[0]
    if 'The answer is: ' in completion:
        completion = completion.replace('The answer is: ', 'The answer is ')

    # Strip trailing periods/newlines. rstrip is safe on an empty string,
    # unlike indexing completion[-1] in a loop.
    completion = completion.rstrip('.\n')

    if 'The answer is ' not in completion:
        return 0.

    result = completion.lower().split('the answer is ')[-1].split(' ')[0]

    if len(result) == 0:
        return 0.

    if result[-1] == '.':
        result = result[:-1]

    if '£' in result:
        result = result.replace('£', '')
    if '€' in result:
        result = result.replace('€', '')

    if len(result) == 0:
        return 0.

    if result[-1] == '.':
        result = result[:-1]
    # Thousands separators: "1,200" -> "1200".
    result = result.replace(',', '')

    # For "a=b" or calculator-style "<<...>>b" keep only the final value.
    if '=' in result:
        result = result.split('=')[-1]
        result = result.strip()

    if '>>' in result:
        result = result.split('>>')[-1]
        result = result.strip()

    # Drop ASCII letters (e.g. attached unit suffixes).
    result = ''.join(
        char for char in result.lower() if char not in string.ascii_lowercase
    )

    if ':' in result:
        result = result.split(':')[0]

    for char in ['$', '"']:
        result = result.replace(char, '')

    if '%' in result:
        result = result.strip()
        # Only a trailing percent sign is accepted; the number is returned
        # as written (not divided by 100).
        if result[-1] != '%':
            return 0.
        result = result[:-1]

    if len(result) == 0:
        return 0.

    if result[-1] in ['/']:
        result = result[:-1]

    result = result.replace(' ', '')

    try:
        if any(op in result for op in '+-*/'):
            # NOTE(security): eval() runs on model-generated text. ASCII
            # letters were removed above, but this is still not a sandbox
            # (e.g. "9**9**9**9" can hang) -- consider a restricted
            # arithmetic parser if completions are untrusted.
            result = eval(result)
        result = float(result)
    except Exception:
        # Unparseable answers count as "no answer".
        result = 0.

    return result
--------------------------------------------------------------------------------
/vllm_inference/vllm_inference.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license
2 |
3 | from typing import Optional
4 | import logging, os, json
5 | from vllm import LLM, SamplingParams
6 | import ray
7 | from ray_on_aml.core import Ray_On_AML
8 | import argparse
9 |
10 |
def inference(testdata_folder, testdata_file, output_folder, output_file, model_path, tensor_parallel_size, max_length, trust_remote_code):
    """Generate one completion per prompt with vLLM and write them as JSONL.

    Reads the 'prompt' field from every line of
    ``testdata_folder/testdata_file``, generates in batches of 128 prompts,
    and writes one ``{"samples": [<text>]}`` object per prompt, in input
    order, to ``output_folder/output_file``.

    Args:
        testdata_folder: Directory containing the prompt file.
        testdata_file: JSONL file name; each line must have a 'prompt' key.
        output_folder: Directory for the output file (must already exist).
        output_file: Output JSONL file name.
        model_path: Model identifier or path passed to ``vllm.LLM``.
        tensor_parallel_size: Passed to ``vllm.LLM``.
        max_length: Used as ``max_tokens`` in the sampling parameters.
        trust_remote_code: Passed to ``vllm.LLM``.

    Raises:
        AssertionError: If the prompt file contains no lines.
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # log args
    logger.info(f"test file: {testdata_file}")
    logger.info(f"output file: {output_file}")
    logger.info(f"tensor_parallel_size: {tensor_parallel_size}")

    with open(os.path.join(testdata_folder, testdata_file), 'r', encoding='utf-8') as f_read:
        test_prompts = [json.loads(line)['prompt'] for line in f_read]
    total_lines = len(test_prompts)
    logger.info(f"Total lines: {total_lines}")
    assert len(test_prompts) != 0

    llm = LLM(model=model_path,
              tensor_parallel_size=tensor_parallel_size,
              trust_remote_code=trust_remote_code,
              max_num_batched_tokens=800000,
              gpu_memory_utilization=0.9)

    sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=max_length)

    batch_size = 128

    current_lines = 0
    all_outputs = []

    # Step through the prompts by batch start offset. The previous
    # `total_lines // batch_size + 1` batch count produced an empty final
    # batch whenever total_lines was an exact multiple of batch_size.
    for start in range(0, total_lines, batch_size):
        prompt_batch = test_prompts[start:start + batch_size]
        results = llm.generate(prompt_batch, sampling_params)
        # Count what was actually processed so the final log line does not
        # overshoot total_lines.
        current_lines += len(prompt_batch)
        logger.info(f"{current_lines} in {total_lines} examples.")
        all_outputs.extend({'samples': [result.outputs[0].text]} for result in results)

    with open(os.path.join(output_folder, output_file), "w", encoding='utf-8') as f:
        for output in all_outputs:
            f.write(json.dumps(output) + '\n')
54 |
55 |
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a command-line boolean such as 'True', 'false', '1' or 'no'."""
        # argparse's type=bool calls bool() on the raw string, so any
        # non-empty value -- including "False" -- would be parsed as True.
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)

    parser = argparse.ArgumentParser(description='args for running vllm')
    parser.add_argument('--testdata_folder', type=str, required=True)
    parser.add_argument('--testdata_file', type=str, required=True)
    parser.add_argument('--output_folder', type=str, required=True)
    parser.add_argument('--output_file', type=str, default=None, required=False)
    parser.add_argument('--model_path', type=str, required=True)
    parser.add_argument('--max_length', type=int, default=128, required=False)
    parser.add_argument('--tensor_parallel_size', type=int, default=8, required=False)
    parser.add_argument('--trust_remote_code', type=_str2bool, default=True, required=False)
    args = parser.parse_args()

    # Default output name: the input file name prefixed with 'sample_'.
    if args.output_file is None:
        output_file = 'sample_' + args.testdata_file
    else:
        output_file = args.output_file

    # Create the output directory (and any missing parents). exist_ok covers
    # the already-exists case without a bare except that would also hide
    # real failures such as permission errors.
    os.makedirs(args.output_folder, exist_ok=True)

    inference(testdata_folder=args.testdata_folder,
              testdata_file=args.testdata_file,
              output_folder=args.output_folder,
              output_file=output_file,
              model_path=args.model_path,
              max_length=args.max_length,
              tensor_parallel_size=args.tensor_parallel_size,
              trust_remote_code=args.trust_remote_code)
88 |
--------------------------------------------------------------------------------