├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
└── src
    ├── reina.py
    └── run_summarization.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Ignore Visual Studio temporary files, build results, and
  2 | ## files generated by popular Visual Studio add-ons.
  3 | ##
  4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
  5 | 
  6 | # User-specific files
  7 | *.rsuser
  8 | *.suo
  9 | *.user
 10 | *.userosscache
 11 | *.sln.docstates
 12 | 
 13 | # User-specific files (MonoDevelop/Xamarin Studio)
 14 | *.userprefs
 15 | 
 16 | # Mono auto generated files
 17 | mono_crash.*
 18 | 
 19 | # Build results
 20 | [Dd]ebug/
 21 | [Dd]ebugPublic/
 22 | [Rr]elease/
 23 | [Rr]eleases/
 24 | x64/
 25 | x86/
 26 | [Aa][Rr][Mm]/
 27 | [Aa][Rr][Mm]64/
 28 | bld/
 29 | [Bb]in/
 30 | [Oo]bj/
 31 | [Ll]og/
 32 | [Ll]ogs/
 33 | 
 34 | # Visual Studio 2015/2017 cache/options directory
 35 | .vs/
 36 | # Uncomment if you have tasks that create the project's static files in wwwroot
 37 | #wwwroot/
 38 | 
 39 | # Visual Studio 2017 auto generated files
 40 | Generated\ Files/
 41 | 
 42 | # MSTest test Results
 43 | [Tt]est[Rr]esult*/
 44 | [Bb]uild[Ll]og.*
 45 | 
 46 | # NUnit
 47 | *.VisualState.xml
 48 | TestResult.xml
 49 | nunit-*.xml
 50 | 
 51 | # Build Results of an ATL Project
 52 | [Dd]ebugPS/
 53 | [Rr]eleasePS/
 54 | dlldata.c
 55 | 
 56 | # Benchmark Results
 57 | BenchmarkDotNet.Artifacts/
 58 | 
 59 | # .NET Core
 60 | project.lock.json
 61 | project.fragment.lock.json
 62 | artifacts/
 63 | 
 64 | # StyleCop
 65 | StyleCopReport.xml
 66 | 
 67 | # Files built by Visual Studio
 68 | *_i.c
 69 | *_p.c
 70 | *_h.h
 71 | *.ilk
 72 | *.meta
 73 | *.obj
 74 | *.iobj
 75 | *.pch
 76 | *.pdb
 77 | *.ipdb
 78 | *.pgc
 79 | *.pgd
 80 | *.rsp
 81 | *.sbr
 82 | *.tlb
 83 | *.tli
 84 | *.tlh
 85 | *.tmp
 86 | *.tmp_proj
 87 | *_wpftmp.csproj
 88 | *.log
 89 | *.vspscc
 90 | *.vssscc
 91 | .builds
 92 | *.pidb
 93 | *.svclog
 94 | *.scc
 95 | 
 96 | # Chutzpah Test files
 97 | _Chutzpah*
 98 | 
 99 | # Visual C++ cache files
100 | ipch/
101 | *.aps
102 | *.ncb
103 | *.opendb
104 | *.opensdf
105 | *.sdf
106 | *.cachefile
107 | *.VC.db
108 | *.VC.VC.opendb
109 | 
110 | # Visual Studio profiler
111 | *.psess
112 | *.vsp
113 | *.vspx
114 | *.sap
115 | 
116 | # Visual Studio Trace Files
117 | *.e2e
118 | 
119 | # TFS 2012 Local Workspace
120 | $tf/
121 | 
122 | # Guidance Automation Toolkit
123 | *.gpState
124 | 
125 | # ReSharper is a .NET coding add-in
126 | _ReSharper*/
127 | *.[Rr]e[Ss]harper
128 | *.DotSettings.user
129 | 
130 | # TeamCity is a build add-in
131 | _TeamCity*
132 | 
133 | # DotCover is a Code Coverage Tool
134 | *.dotCover
135 | 
136 | # AxoCover is a Code Coverage Tool
137 | .axoCover/*
138 | !.axoCover/settings.json
139 | 
140 | # Visual Studio code coverage results
141 | *.coverage
142 | *.coveragexml
143 | 
144 | # NCrunch
145 | _NCrunch_*
146 | .*crunch*.local.xml
147 | nCrunchTemp_*
148 | 
149 | # MightyMoose
150 | *.mm.*
151 | AutoTest.Net/
152 | 
153 | # Web workbench (sass)
154 | .sass-cache/
155 | 
156 | # Installshield output folder
157 | [Ee]xpress/
158 | 
159 | # DocProject is a documentation generator add-in
160 | DocProject/buildhelp/
161 | DocProject/Help/*.HxT
162 | DocProject/Help/*.HxC
163 | DocProject/Help/*.hhc
164 | DocProject/Help/*.hhk
165 | DocProject/Help/*.hhp
166 | DocProject/Help/Html2
167 | DocProject/Help/html
168 | 
169 | # Click-Once directory
170 | publish/
171 | 
172 | # Publish Web Output
173 | *.[Pp]ublish.xml
174 | *.azurePubxml
175 | # Note: Comment the next line if you want to checkin your web deploy settings,
176 | # but database connection strings (with potential passwords) will be unencrypted
177 | *.pubxml
178 | *.publishproj
179 | 
180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
181 | # checkin your Azure Web App publish settings, but sensitive information contained
182 | # in these scripts will be unencrypted
183 | PublishScripts/
184 | 
185 | # NuGet Packages
186 | *.nupkg
187 | # NuGet Symbol Packages
188 | *.snupkg
189 | # The packages folder can be ignored because of Package Restore
190 | **/[Pp]ackages/*
191 | # except build/, which is used as an MSBuild target.
192 | !**/[Pp]ackages/build/
193 | # Uncomment if necessary however generally it will be regenerated when needed
194 | #!**/[Pp]ackages/repositories.config
195 | # NuGet v3's project.json files produces more ignorable files
196 | *.nuget.props
197 | *.nuget.targets
198 | 
199 | # Microsoft Azure Build Output
200 | csx/
201 | *.build.csdef
202 | 
203 | # Microsoft Azure Emulator
204 | ecf/
205 | rcf/
206 | 
207 | # Windows Store app package directories and files
208 | AppPackages/
209 | BundleArtifacts/
210 | Package.StoreAssociation.xml
211 | _pkginfo.txt
212 | *.appx
213 | *.appxbundle
214 | *.appxupload
215 | 
216 | # Visual Studio cache files
217 | # files ending in .cache can be ignored
218 | *.[Cc]ache
219 | # but keep track of directories ending in .cache
220 | !?*.[Cc]ache/
221 | 
222 | # Others
223 | ClientBin/
224 | ~$*
225 | *~
226 | *.dbmdl
227 | *.dbproj.schemaview
228 | *.jfm
229 | *.pfx
230 | *.publishsettings
231 | orleans.codegen.cs
232 | 
233 | # Including strong name files can present a security risk
234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
235 | #*.snk
236 | 
237 | # Since there are multiple workflows, uncomment next line to ignore bower_components
238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
239 | #bower_components/
240 | 
241 | # RIA/Silverlight projects
242 | Generated_Code/
243 | 
244 | # Backup & report files from converting an old project file
245 | # to a newer Visual Studio version. Backup files are not needed,
246 | # because we have git ;-)
247 | _UpgradeReport_Files/
248 | Backup*/
249 | UpgradeLog*.XML
250 | UpgradeLog*.htm
251 | ServiceFabricBackup/
252 | *.rptproj.bak
253 | 
254 | # SQL Server files
255 | *.mdf
256 | *.ldf
257 | *.ndf
258 | 
259 | # Business Intelligence projects
260 | *.rdl.data
261 | *.bim.layout
262 | *.bim_*.settings
263 | *.rptproj.rsuser
264 | *- [Bb]ackup.rdl
265 | *- [Bb]ackup ([0-9]).rdl
266 | *- [Bb]ackup ([0-9][0-9]).rdl
267 | 
268 | # Microsoft Fakes
269 | FakesAssemblies/
270 | 
271 | # GhostDoc plugin setting file
272 | *.GhostDoc.xml
273 | 
274 | # Node.js Tools for Visual Studio
275 | .ntvs_analysis.dat
276 | node_modules/
277 | 
278 | # Visual Studio 6 build log
279 | *.plg
280 | 
281 | # Visual Studio 6 workspace options file
282 | *.opt
283 | 
284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
285 | *.vbw
286 | 
287 | # Visual Studio LightSwitch build output
288 | **/*.HTMLClient/GeneratedArtifacts
289 | **/*.DesktopClient/GeneratedArtifacts
290 | **/*.DesktopClient/ModelManifest.xml
291 | **/*.Server/GeneratedArtifacts
292 | **/*.Server/ModelManifest.xml
293 | _Pvt_Extensions
294 | 
295 | # Paket dependency manager
296 | .paket/paket.exe
297 | paket-files/
298 | 
299 | # FAKE - F# Make
300 | .fake/
301 | 
302 | # CodeRush personal settings
303 | .cr/personal
304 | 
305 | # Python Tools for Visual Studio (PTVS)
306 | __pycache__/
307 | *.pyc
308 | 
309 | # Cake - Uncomment if you are using it
310 | # tools/**
311 | # !tools/packages.config
312 | 
313 | # Tabs Studio
314 | *.tss
315 | 
316 | # Telerik's JustMock configuration file
317 | *.jmconfig
318 | 
319 | # BizTalk build output
320 | *.btp.cs
321 | *.btm.cs
322 | *.odx.cs
323 | *.xsd.cs
324 | 
325 | # OpenCover UI analysis results
326 | OpenCover/
327 | 
328 | # Azure Stream Analytics local run output
329 | ASALocalRun/
330 | 
331 | # MSBuild Binary and Structured Log
332 | *.binlog
333 | 
334 | # NVidia Nsight GPU debugger configuration file
335 | *.nvuser
336 | 
337 | # MFractors (Xamarin productivity tool) working folder
338 | .mfractor/
339 | 
340 | # Local History for Visual Studio
341 | .localhistory/
342 | 
343 | # BeatPulse healthcheck temp database
344 | healthchecksdb
345 | 
346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
347 | MigrationBackup/
348 | 
349 | # Ionide (cross platform F# VS Code tools) working folder
350 | .ionide/
351 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # REINA
 2 | Implementation of the following paper:
 3 | ## Training Data is More Valuable than You Think: A Simple and Effective Method by Retrieving from Training Data (https://arxiv.org/abs/2203.08773)
 4 | Shuohang Wang (shuowa at microsoft.com), Yichong Xu, Yuwei Fang, Yang Liu, Siqi Sun, Ruochen Xu, Chenguang Zhu, Michael Zeng
 5 | 
 6 | 
 7 | 
 8 | Accept to ACL2022 main conference
 9 | 
10 | ### Usage 1
11 | After cloning the repo, run the following code with docker to reproduce REINA on XSum dataset. REINA is interaged into the model trainig code.  Please set model name to google/pegasus-large or facebook/bart-large or facebook/bart-base, etc. By default, the job is run on 8 GPUs. Please tuning "--gradient_accumulation_steps" if use less GPUs. More --reina_workers is prefered to speed up REINA process. 40 workers will task around 15 minutes. 
12 | ```
13 | docker run --gpus all -it --rm --shm-size 10g -w /home/reina/src -v ${PWD}/REINA:/home/reina shuohang/pytorch:reina /bin/bash -c "export HF_DATASETS_CACHE=/home/reina/data; export TRANSFORMERS_CACHE=/home/reina/cache; python -m torch.distributed.launch --nproc_per_node=8 run_summarization.py --report_to none  --save_strategy epoch --model_name_or_path google/pegasus-large --dataset_name xsum  --do_train   --do_eval --do_predict  --per_device_train_batch_size=2 --gradient_accumulation_steps 2 --per_device_eval_batch_size=4 --predict_with_generate --output_dir /home/reina/output --overwrite_output_dir --text_column document --summary_column summary  --num_train_epochs 3 --logging_strategy epoch --evaluation_strategy epoch --load_best_model_at_end --max_target_length 64 --val_max_target_length 64 --learning_rate 0.00005 --reina --reina_workers 40"
14 | ```
15 | 
16 | ### Usage 2
17 | In this section, the REINA and model training are splitted in two steps. The first step will save REINA data into files and then run seq2seq model for summarization.
18 | ```
19 | docker run --gpus all -it --rm --shm-size 10g -w /home/reina/src -v ${PWD}/REINA:/home/reina shuohang/pytorch:reina /bin/bash -c "export HF_DATASETS_CACHE=/home/reina/data; python reina.py --dataname xsum --reina_workers 10 --key_column document --value_column summary"
20 | docker run --gpus all -it --rm --shm-size 10g -w /home/reina/src -v ${PWD}/REINA:/home/reina shuohang/pytorch:reina /bin/bash -c "export HF_DATASETS_CACHE=/home/reina/data; export TRANSFORMERS_CACHE=/home/reina/cache; python -m torch.distributed.launch --nproc_per_node=8 run_summarization.py --report_to none  --save_strategy epoch --model_name_or_path google/pegasus-large  --do_train   --do_eval --do_predict  --per_device_train_batch_size=2 --gradient_accumulation_steps 2 --per_device_eval_batch_size=4 --predict_with_generate --output_dir /home/reina/output --overwrite_output_dir --text_column document --summary_column summary  --num_train_epochs 3 --logging_strategy epoch --evaluation_strategy epoch --load_best_model_at_end --max_target_length 64 --val_max_target_length 64 --learning_rate 0.00005  --train_file /home/reina/data/reina/xsum/train.json --validation_file /home/reina/data/reina/xsum/validation.json --test_file /home/reina/data/reina/xsum/test.json"
21 | ```
22 | 
23 | ### Related project
24 | REINA is integrated into the project of Human Parity on CommonsenseQA
25 | 
26 | https://github.com/microsoft/KEAR
27 | 
28 | 
29 | ## Contributing
30 | 
31 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
32 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
33 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
34 | 
35 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
36 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
37 | provided by the bot. You will only need to do this once across all repos using our CLA.
38 | 
39 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
40 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
41 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
42 | 
43 | ## Trademarks
44 | 
45 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 
46 | trademarks or logos is subject to and must follow 
47 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
48 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
49 | Any use of third-party trademarks or logos are subject to those third-party's policies.
50 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 


--------------------------------------------------------------------------------
/src/reina.py:
--------------------------------------------------------------------------------
  1 | import sys, os, lucene, threading, time
  2 | import math
  3 | from multiprocessing import Pool
  4 | import shutil
  5 | 
  6 | from datetime import datetime
  7 | 
  8 | from org.apache.lucene import analysis, document, index, queryparser, search, store, util
  9 | from java.nio.file import Paths
 10 | from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
 11 | from org.apache.lucene.analysis.standard import StandardAnalyzer
 12 | from org.apache.lucene.document import Document, Field, FieldType
 13 | from org.apache.lucene.index import \
 14 |     FieldInfo, IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader
 15 | from org.apache.lucene.store import SimpleFSDirectory, MMapDirectory
 16 | from org.apache.lucene.store import RAMDirectory
 17 | from org.apache.lucene.search.similarities import BM25Similarity, TFIDFSimilarity
 18 | import random
 19 | 
 20 | import json
 21 | import string
 22 | import glob
 23 | import bz2
 24 | import gzip
 25 | import sys
 26 | from tqdm import tqdm
 27 | from nltk import sent_tokenize
 28 | from nltk import word_tokenize as tokenize
 29 | from nltk.corpus import stopwords
 30 | from collections import defaultdict
 31 | from datasets import Dataset
 32 | 
 33 | stops_en = set(stopwords.words('english'))
 34 | exclude = set(string.punctuation)
 35 | 
 36 | def remove_punc(text):
 37 |     return ''.join(ch for ch in text if ch not in exclude)
 38 | 
 39 | def word_tokenize(text, lowercase=True):
 40 |     words = tokenize(text)
 41 |     outputs = []
 42 |     for token in words:
 43 |         if token not in stops_en and token not in exclude:
 44 |             outputs.append( remove_punc(token) )
 45 | 
 46 |     return ' '.join(outputs[:600])
 47 | 
 48 | class MyMemLucene():
 49 | 
 50 |     def __init__(self):
 51 | 
 52 |         lucene.initVM()
 53 |         # # # lucene # # #
 54 |         self.t1 = FieldType()
 55 |         self.t1.setStored(True)
 56 |         self.t1.setTokenized(False)
 57 |         self.t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
 58 | 
 59 |         self.t2 = FieldType()
 60 |         self.t2.setStored(True)
 61 |         self.t2.setTokenized(True)
 62 |         self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
 63 | 
 64 |         self.t3 = FieldType()
 65 |         self.t3.setStored(True)
 66 | 
 67 |         self.analyzer = StandardAnalyzer()
 68 | 
 69 | 
 70 |     def built_RAM(self, data, key, value):
 71 |         self.index_directory = RAMDirectory()
 72 |         config = IndexWriterConfig( self.analyzer )
 73 |         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
 74 |         iwriter = IndexWriter(self.index_directory, config)
 75 | 
 76 |         print('Building REINA index ...')
 77 |         qbar = tqdm(total=len(data[key]))
 78 | 
 79 |         for instance_key, instance_value in zip(data[key], data[value]):
 80 |             doc = Document()
 81 |             doc.add(Field(key, instance_key, self.t2))
 82 |             doc.add(Field(value, instance_value, self.t2))
 83 | 
 84 |             try:
 85 |                 iwriter.addDocument(doc)
 86 |             except:
 87 |                 print(instance_value)
 88 |                 continue
 89 |             qbar.update(1)
 90 |         qbar.close()
 91 |         iwriter.close()
 92 | 
 93 |     def retrieve_RAM(self, lines, docs_num, key, value):
 94 | 
 95 |         ireader = DirectoryReader.open(self.index_directory)
 96 |         isearcher = search.IndexSearcher(ireader)
 97 |         isearcher.setSimilarity(BM25Similarity())
 98 | 
 99 |         parser = queryparser.classic.QueryParser( key, self.analyzer)
100 | 
101 |         output_all = []
102 |         for question in lines:
103 |             try:
104 |                 query = parser.parse(question)
105 |             except:
106 |                 try:
107 |                     query = parser.parse(word_tokenize(question))
108 |                 except:
109 |                     output_all.append(question)
110 |                     continue
111 | 
112 |     
113 |             hits = isearcher.search(query, max(20, docs_num) ).scoreDocs
114 |             output = []
115 |             for hit in hits:
116 |                 hitDoc = isearcher.doc(hit.doc)
117 |                 try:
118 |                     if hitDoc[key] == question: continue
119 |                     output.append( hitDoc[value] )
120 |                     
121 |                 except:
122 |                     continue
123 | 
124 |             instance = ' '.join( question.split(' ')[:600] )   + ' ' + ' '.join(output[:docs_num])
125 |             output_all.append(instance)
126 | 
127 |         return output_all
128 |         
129 | class MultiprocessingEncoder(object):
130 |     def __init__(self, args):
131 |         self.args = args
132 | 
133 |     def initializer(self):
134 |         global mylc
135 |         mylc = MyMemLucene()
136 |         mylc.built_RAM( self.args['index_data'] , self.args['key'], self.args['value'] )
137 | 
138 | 
139 |     def retrieve_lines(self, lines):
140 |         output = mylc.retrieve_RAM( lines, 5, self.args['key'], self.args['value'] )
141 |         return output
142 | 
143 | 
144 | def reina_apply(raw_datasets, key, value, num_proc):
145 |     
146 |     index_data_list = raw_datasets['train']
147 |     query_data_dict = {k:v for k, v in raw_datasets.items()}
148 |     datasets_new = defaultdict(dict)
149 | 
150 |     retriever = MultiprocessingEncoder({'index_data': index_data_list, 'key': key, 'value': value})
151 |     pool = Pool(num_proc, initializer=retriever.initializer)
152 |     
153 | 
154 |     for set_name, query_data in query_data_dict.items():
155 |         print(set_name)
156 |         lines = [  k  for k in query_data[key] ]
157 |         datasets_new[set_name][value] = [ v for v in query_data[value] ]
158 | 
159 |         encoded_lines = pool.imap(retriever.retrieve_lines, zip(*[lines]), 100)
160 |         print('REINA start ...')
161 |         lines_reina = []
162 |         qbar = tqdm(total=len(query_data[key]))
163 |         key_id = 0
164 |         for line_id, lines_ir in enumerate(encoded_lines):
165 |             for line in lines_ir:
166 |                 lines_reina.append(line)
167 |                 key_id += 1
168 |             qbar.update(len(lines_ir))
169 |             
170 |         datasets_new[set_name][key] = lines_reina
171 | 
172 |         qbar.close()
173 |         datasets_new[set_name] = Dataset.from_dict(datasets_new[set_name])
174 |     return datasets_new
175 | 
176 | def reina(raw_datasets, key, value, use_cache, num_proc=10):
177 | 
178 |     import torch
179 |     import pickle
180 |     
181 |     reina_path = os.getenv("HF_DATASETS_CACHE",os.path.join(os.path.expanduser('~'), '.cache/huggingface/datasets/'))
182 |     reina_path = os.path.join(reina_path, 'reina')
183 |     reina_dataset_path = os.path.join(reina_path, 'reina_dataset.pkl')
184 |     
185 |     if torch.cuda.current_device() == 0:
186 |         print('REINA path for cache: ' + reina_dataset_path)
187 |         print('Please remove it if data modified!')
188 | 
189 |     if not use_cache and torch.cuda.current_device() == 0:
190 |         datasets_new = reina_apply(raw_datasets, key, value, num_proc)
191 | 
192 |         if not os.path.isdir(reina_path):
193 |             os.makedirs(reina_path)
194 |         with open(reina_dataset_path, 'wb') as fpw:
195 |             pickle.dump(datasets_new, fpw)
196 |      
197 |     torch.distributed.barrier()
198 |     with open(reina_dataset_path, 'rb') as fpr:
199 |         datasets_new = pickle.load(fpr)
200 | 
201 |     return datasets_new
202 | 
203 | def reina_offline(data_name, data_path, key, value, num_proc):
204 |     from datasets import load_dataset
205 |     datasets = load_dataset(data_name)
206 |     if not os.path.isdir(data_path):
207 |         os.makedirs(data_path)
208 |     print(datasets)
209 | 
210 |     datasets_new = reina_apply(datasets, key, value, num_proc)
211 |     for set_name in ['validation', 'test', 'train']:
212 |         if set_name not in datasets_new: continue
213 | 
214 |         print('REINA for ' + set_name)
215 |         with open(os.path.join(data_path, set_name + '.json'), 'w', encoding='utf8') as fpw:
216 |             data_num = len(datasets_new[set_name][key])
217 |             for data_id, data in enumerate(datasets_new[set_name]):
218 |                 fpw.write(json.dumps({key: data[key], value: data[value]}) + '\n')
219 |             fpw.close()
220 | 
221 | if __name__ == "__main__":
222 |     import argparse
223 | 
224 |     parser = argparse.ArgumentParser(description='Process some integers.')
225 |     parser.add_argument('--dataname', type=str, default='xsum',
226 |                         help='dataset name, such as xsum')
227 |     parser.add_argument('--key_column', type=str, default='document',
228 |                         help='REINA key')
229 |     parser.add_argument('--value_column', type=str, default='summary',
230 |                         help='REINA value')
231 |     parser.add_argument('--reina_workers', type=int, default=10,
232 |                         help='REINA workers')
233 | 
234 |     args = parser.parse_args()
235 | 
236 |     reina_path = os.getenv("HF_DATASETS_CACHE",os.path.join(os.path.expanduser('~'), '.cache/huggingface/datasets/'))
237 |     reina_path = os.path.join(reina_path, 'reina', args.dataname)
238 |     
239 |     reina_offline(args.dataname, reina_path, args.key_column, args.value_column, args.reina_workers)
240 | 
241 | 


--------------------------------------------------------------------------------
/src/run_summarization.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # Copyright 2021 The HuggingFace Team. All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """
 17 | Fine-tuning the library models for sequence to sequence.
 18 | """
 19 | # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
 20 | 
 21 | import logging
 22 | import os
 23 | import sys
 24 | from dataclasses import dataclass, field
 25 | from typing import Optional
 26 | 
 27 | import datasets
 28 | import nltk  # Here to have a nice missing dependency error message early on
 29 | import numpy as np
 30 | from datasets import load_dataset, load_metric
 31 | 
 32 | import torch
 33 | 
 34 | import tensorflow as tf
 35 | from reina import reina
 36 | # Set tf.enable_eager_execution() if using TF 1.x.
 37 | 
 38 | 
 39 | 
 40 | import transformers
 41 | from filelock import FileLock
 42 | from transformers import (
 43 |     AutoConfig,
 44 |     AutoModelForSeq2SeqLM,
 45 |     AutoTokenizer,
 46 |     DataCollatorForSeq2Seq,
 47 |     HfArgumentParser,
 48 |     Seq2SeqTrainer,
 49 |     Seq2SeqTrainingArguments,
 50 |     set_seed,
 51 | )
 52 | from transformers.file_utils import is_offline_mode
 53 | from transformers.trainer_utils import get_last_checkpoint
 54 | from transformers.utils import check_min_version
 55 | from transformers.utils.versions import require_version
 56 | 
 57 | 
 58 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 59 | check_min_version("4.9.0")
 60 | 
 61 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
 62 | 
 63 | logger = logging.getLogger(__name__)
 64 | 
 65 | try:
 66 |     nltk.data.find("tokenizers/punkt")
 67 | except (LookupError, OSError):
 68 |     if is_offline_mode():
 69 |         raise LookupError(
 70 |             "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
 71 |         )
 72 |     with FileLock(".lock") as lock:
 73 |         nltk.download("punkt", quiet=True)
 74 | 
 75 | 
 76 | @dataclass
 77 | class ModelArguments:
 78 |     """
 79 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
 80 |     """
 81 | 
 82 |     model_name_or_path: str = field(
 83 |         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
 84 |     )
 85 |     config_name: Optional[str] = field(
 86 |         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
 87 |     )
 88 |     tokenizer_name: Optional[str] = field(
 89 |         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
 90 |     )
 91 |     cache_dir: Optional[str] = field(
 92 |         default=None,
 93 |         metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
 94 |     )
 95 |     use_fast_tokenizer: bool = field(
 96 |         default=True,
 97 |         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
 98 |     )
 99 |     model_revision: str = field(
100 |         default="main",
101 |         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
102 |     )
103 |     use_auth_token: bool = field(
104 |         default=False,
105 |         metadata={
106 |             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
107 |             "with private models)."
108 |         },
109 |     )
110 | 
111 | 
112 | @dataclass
113 | class DataTrainingArguments:
114 |     """
115 |     Arguments pertaining to what data we are going to input our model for training and eval.
116 |     """
117 | 
118 |     dataset_name: Optional[str] = field(
119 |         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
120 |     )
121 |     dataset_config_name: Optional[str] = field(
122 |         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
123 |     )
124 |     text_column: Optional[str] = field(
125 |         default=None,
126 |         metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
127 |     )
128 |     summary_column: Optional[str] = field(
129 |         default=None,
130 |         metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
131 |     )
132 |     train_file: Optional[str] = field(
133 |         default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
134 |     )
135 |     validation_file: Optional[str] = field(
136 |         default=None,
137 |         metadata={
138 |             "help": "An optional input evaluation data file to evaluate the metrics (rouge) on "
139 |             "(a jsonlines or csv file)."
140 |         },
141 |     )
142 |     test_file: Optional[str] = field(
143 |         default=None,
144 |         metadata={
145 |             "help": "An optional input test data file to evaluate the metrics (rouge) on " "(a jsonlines or csv file)."
146 |         },
147 |     )
148 |     overwrite_cache: bool = field(
149 |         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
150 |     )
151 |     preprocessing_num_workers: Optional[int] = field(
152 |         default=None,
153 |         metadata={"help": "The number of processes to use for the preprocessing."},
154 |     )
155 |     max_source_length: Optional[int] = field(
156 |         default=1024,
157 |         metadata={
158 |             "help": "The maximum total input sequence length after tokenization. Sequences longer "
159 |             "than this will be truncated, sequences shorter will be padded."
160 |         },
161 |     )
162 |     max_target_length: Optional[int] = field(
163 |         default=128,
164 |         metadata={
165 |             "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
166 |             "than this will be truncated, sequences shorter will be padded."
167 |         },
168 |     )
169 |     val_max_target_length: Optional[int] = field(
170 |         default=None,
171 |         metadata={
172 |             "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
173 |             "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
174 |             "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
175 |             "during ``evaluate`` and ``predict``."
176 |         },
177 |     )
178 |     pad_to_max_length: bool = field(
179 |         default=False,
180 |         metadata={
181 |             "help": "Whether to pad all samples to model maximum sentence length. "
182 |             "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
183 |             "efficient on GPU but very bad for TPU."
184 |         },
185 |     )
186 |     max_train_samples: Optional[int] = field(
187 |         default=None,
188 |         metadata={
189 |             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
190 |             "value if set."
191 |         },
192 |     )
193 |     max_eval_samples: Optional[int] = field(
194 |         default=None,
195 |         metadata={
196 |             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
197 |             "value if set."
198 |         },
199 |     )
200 |     max_predict_samples: Optional[int] = field(
201 |         default=None,
202 |         metadata={
203 |             "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
204 |             "value if set."
205 |         },
206 |     )
207 |     num_beams: Optional[int] = field(
208 |         default=None,
209 |         metadata={
210 |             "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
211 |             "which is used during ``evaluate`` and ``predict``."
212 |         },
213 |     )
214 |     ignore_pad_token_for_loss: bool = field(
215 |         default=True,
216 |         metadata={
217 |             "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
218 |         },
219 |     )
220 |     source_prefix: Optional[str] = field(
221 |         default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
222 |     )
223 |     reina: bool = field(
224 |         default=False,
225 |         metadata={
226 |             "help": "Whether to use REINA."
227 |         },
228 |     )
229 |     reina_workers: Optional[int] = field(
230 |         default=10,
231 |         metadata={
232 |             "help": "Number of workers for retrieval"
233 |         },
234 |     )
235 |     reina_use_cache: bool = field(
236 |         default=False,
237 |         metadata={
238 |             "help": "Whether to use reina cached data. If first-round on a new data, set false"
239 |         },
240 |     )
241 | 
242 | 
243 |     def __post_init__(self):
244 |         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
245 |             raise ValueError("Need either a dataset name or a training/validation file.")
246 |         else:
247 |             if self.train_file is not None:
248 |                 extension = self.train_file.split(".")[-1]
249 |                 assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
250 |             if self.validation_file is not None:
251 |                 extension = self.validation_file.split(".")[-1]
252 |                 assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
253 |         if self.val_max_target_length is None:
254 |             self.val_max_target_length = self.max_target_length
255 | 
256 | 
257 | summarization_name_mapping = {
258 |     "amazon_reviews_multi": ("review_body", "review_title"),
259 |     "big_patent": ("description", "abstract"),
260 |     "cnn_dailymail": ("article", "highlights"),
261 |     "orange_sum": ("text", "summary"),
262 |     "pn_summary": ("article", "summary"),
263 |     "psc": ("extract_text", "summary_text"),
264 |     "samsum": ("dialogue", "summary"),
265 |     "thaisum": ("body", "summary"),
266 |     "xglue": ("news_body", "news_title"),
267 |     "xsum": ("document", "summary"),
268 |     "wiki_summary": ("article", "highlights"),
269 | }
270 | 
271 | 
272 | def main():
273 |     # See all possible arguments in src/transformers/training_args.py
274 |     # or by passing the --help flag to this script.
275 |     # We now keep distinct sets of args, for a cleaner separation of concerns.
276 | 
277 |     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
278 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
279 |         # If we pass only one argument to the script and it's the path to a json file,
280 |         # let's parse it to get our arguments.
281 |         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
282 |     else:
283 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
284 | 
285 |     # Setup logging
286 |     logging.basicConfig(
287 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
288 |         datefmt="%m/%d/%Y %H:%M:%S",
289 |         handlers=[logging.StreamHandler(sys.stdout)],
290 |     )
291 |     log_level = training_args.get_process_log_level()
292 |     logger.setLevel(log_level)
293 |     datasets.utils.logging.set_verbosity(log_level)
294 |     transformers.utils.logging.set_verbosity(log_level)
295 |     transformers.utils.logging.enable_default_handler()
296 |     transformers.utils.logging.enable_explicit_format()
297 | 
298 |     # Log on each process the small summary:
299 |     logger.warning(
300 |         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
301 |         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
302 |     )
303 |     logger.info(f"Training/evaluation parameters {training_args}")
304 | 
305 |     if data_args.source_prefix is None and model_args.model_name_or_path in [
306 |         "t5-small",
307 |         "t5-base",
308 |         "t5-large",
309 |         "t5-3b",
310 |         "t5-11b",
311 |     ]:
312 |         logger.warning(
313 |             "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
314 |             "`--source_prefix 'summarize: ' `"
315 |         )
316 | 
317 |     # Detecting last checkpoint.
318 |     last_checkpoint = None
319 |     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
320 |         last_checkpoint = get_last_checkpoint(training_args.output_dir)
321 |         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
322 |             raise ValueError(
323 |                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
324 |                 "Use --overwrite_output_dir to overcome."
325 |             )
326 |         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
327 |             logger.info(
328 |                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
329 |                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
330 |             )
331 | 
332 |     # Set seed before initializing model.
333 |     set_seed(training_args.seed)
334 | 
335 |     # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
336 |     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
337 |     # (the dataset will be downloaded automatically from the datasets Hub).
338 |     #
339 |     # For CSV/JSON files this script will use the first column for the full texts and the second column for the
340 |     # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
341 |     #
342 |     # In distributed training, the load_dataset function guarantee that only one local process can concurrently
343 |     # download the dataset.
344 |     if data_args.dataset_name is not None:
345 |         # Downloading and loading a dataset from the hub.
346 |         raw_datasets = load_dataset(
347 |             data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
348 |         )
349 |     else:
350 |         data_files = {}
351 |         if data_args.train_file is not None:
352 |             data_files["train"] = data_args.train_file
353 |             extension = data_args.train_file.split(".")[-1]
354 |         if data_args.validation_file is not None:
355 |             data_files["validation"] = data_args.validation_file
356 |             extension = data_args.validation_file.split(".")[-1]
357 |         if data_args.test_file is not None:
358 |             data_files["test"] = data_args.test_file
359 |             extension = data_args.test_file.split(".")[-1]
360 |         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
361 |     if data_args.reina:
362 |         raw_datasets = reina(raw_datasets, data_args.text_column, data_args.summary_column, data_args.reina_use_cache, data_args.reina_workers)
363 |     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
364 |     # https://huggingface.co/docs/datasets/loading_datasets.html.
365 | 
366 |     # Load pretrained model and tokenizer
367 |     #
368 |     # Distributed training:
369 |     # The .from_pretrained methods guarantee that only one local process can concurrently
370 |     # download model & vocab.
371 |     config = AutoConfig.from_pretrained(
372 |         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
373 |         cache_dir=model_args.cache_dir,
374 |         revision=model_args.model_revision,
375 |         use_auth_token=True if model_args.use_auth_token else None,
376 |     )
377 |     tokenizer = AutoTokenizer.from_pretrained(
378 |         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
379 |         cache_dir=model_args.cache_dir,
380 |         use_fast=model_args.use_fast_tokenizer,
381 |         revision=model_args.model_revision,
382 |         use_auth_token=True if model_args.use_auth_token else None,
383 |     )
384 |     model = AutoModelForSeq2SeqLM.from_pretrained(
385 |         model_args.model_name_or_path,
386 |         from_tf=bool(".ckpt" in model_args.model_name_or_path),
387 |         config=config,
388 |         cache_dir=model_args.cache_dir,
389 |         revision=model_args.model_revision,
390 |         use_auth_token=True if model_args.use_auth_token else None,
391 |     )
392 | 
393 |     model.resize_token_embeddings(len(tokenizer))
394 | 
395 |     if model.config.decoder_start_token_id is None:
396 |         raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
397 | 
398 |     prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
399 | 
400 |     # Preprocessing the datasets.
401 |     # We need to tokenize inputs and targets.
402 |     if training_args.do_train:
403 |         column_names = raw_datasets["train"].column_names
404 |     elif training_args.do_eval:
405 |         column_names = raw_datasets["validation"].column_names
406 |     elif training_args.do_predict:
407 |         column_names = raw_datasets["test"].column_names
408 |     else:
409 |         logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
410 |         return
411 | 
412 |     # Get the column names for input/target.
413 |     dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
414 |     if data_args.text_column is None:
415 |         text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
416 |     else:
417 |         text_column = data_args.text_column
418 |         if text_column not in column_names:
419 |             raise ValueError(
420 |                 f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
421 |             )
422 |     if data_args.summary_column is None:
423 |         summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
424 |     else:
425 |         summary_column = data_args.summary_column
426 |         if summary_column not in column_names:
427 |             raise ValueError(
428 |                 f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
429 |             )
430 | 
431 |     # Temporarily set max_target_length for training.
432 |     max_target_length = data_args.max_target_length
433 |     padding = "max_length" if data_args.pad_to_max_length else False
434 | 
435 |     if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
436 |         logger.warning(
437 |             "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
438 |             f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
439 |         )
440 | 
441 |     def preprocess_function(examples):
442 |         inputs = examples[text_column]
443 |         targets = examples[summary_column]
444 |         inputs = [prefix + inp for inp in inputs]
445 |         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
446 | 
447 |         # Setup the tokenizer for targets
448 |         with tokenizer.as_target_tokenizer():
449 |             labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
450 | 
451 |         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
452 |         # padding in the loss.
453 |         if padding == "max_length" and data_args.ignore_pad_token_for_loss:
454 |             labels["input_ids"] = [
455 |                 [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
456 |             ]
457 | 
458 |         model_inputs["labels"] = labels["input_ids"]
459 |         return model_inputs
460 | 
461 |     if training_args.do_train:
462 |         if "train" not in raw_datasets:
463 |             raise ValueError("--do_train requires a train dataset")
464 |         train_dataset = raw_datasets["train"]
465 |         if data_args.max_train_samples is not None:
466 |             train_dataset = train_dataset.select(range(data_args.max_train_samples))
467 |         with training_args.main_process_first(desc="train dataset map pre-processing"):
468 |             train_dataset = train_dataset.map(
469 |                 preprocess_function,
470 |                 batched=True,
471 |                 num_proc=data_args.preprocessing_num_workers,
472 |                 remove_columns=column_names,
473 |                 load_from_cache_file=not data_args.overwrite_cache,
474 |                 desc="Running tokenizer on train dataset",
475 |             )
476 | 
477 |     if training_args.do_eval:
478 |         max_target_length = data_args.val_max_target_length
479 |         if "validation" not in raw_datasets:
480 |             raise ValueError("--do_eval requires a validation dataset")
481 |         eval_dataset = raw_datasets["validation"]
482 |         if data_args.max_eval_samples is not None:
483 |             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
484 |         with training_args.main_process_first(desc="validation dataset map pre-processing"):
485 |             eval_dataset = eval_dataset.map(
486 |                 preprocess_function,
487 |                 batched=True,
488 |                 num_proc=data_args.preprocessing_num_workers,
489 |                 remove_columns=column_names,
490 |                 load_from_cache_file=not data_args.overwrite_cache,
491 |                 desc="Running tokenizer on validation dataset",
492 |             )
493 | 
494 |     if training_args.do_predict:
495 |         max_target_length = data_args.val_max_target_length
496 |         if "test" not in raw_datasets:
497 |             raise ValueError("--do_predict requires a test dataset")
498 |         predict_dataset = raw_datasets["test"]
499 |         if data_args.max_predict_samples is not None:
500 |             predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
501 |         with training_args.main_process_first(desc="prediction dataset map pre-processing"):
502 |             predict_dataset = predict_dataset.map(
503 |                 preprocess_function,
504 |                 batched=True,
505 |                 num_proc=data_args.preprocessing_num_workers,
506 |                 remove_columns=column_names,
507 |                 load_from_cache_file=not data_args.overwrite_cache,
508 |                 desc="Running tokenizer on prediction dataset",
509 |             )
510 | 
511 |     # Data collator
512 |     label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
513 |     data_collator = DataCollatorForSeq2Seq(
514 |         tokenizer,
515 |         model=model,
516 |         label_pad_token_id=label_pad_token_id,
517 |         pad_to_multiple_of=8 if training_args.fp16 else None,
518 |     )
519 | 
520 |     # Metric
521 |     metric = load_metric("rouge")
522 |     metric_sacrebleu = load_metric("sacrebleu")
523 | 
524 |     def postprocess_text(preds, labels):
525 |         preds = [pred.strip() for pred in preds]
526 |         labels = [label.strip() for label in labels]
527 | 
528 |         # rougeLSum expects newline after each sentence
529 |         preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
530 |         labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
531 | 
532 |         return preds, labels
533 | 
534 |     def compute_metrics(eval_preds):
535 |         preds, labels = eval_preds
536 |         if isinstance(preds, tuple):
537 |             preds = preds[0]
538 |         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
539 |         if data_args.ignore_pad_token_for_loss:
540 |             # Replace -100 in the labels as we can't decode them.
541 |             labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
542 |         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
543 | 
544 |         # Some simple post-processing
545 |         decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
546 | 
547 |         result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
548 |         # Extract a few results from ROUGE
549 |         result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
550 | 
551 | 
552 |         result_sblue = metric_sacrebleu.compute(predictions=decoded_preds, references=[[c] for c in decoded_labels])
553 |         result["bleu"] = result_sblue["score"]
554 | 
555 |         prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
556 |         result["gen_len"] = np.mean(prediction_lens)
557 |         result = {k: round(v, 4) for k, v in result.items()}
558 |         return result
559 | 
560 |     # Initialize our Trainer
561 |     trainer = Seq2SeqTrainer(
562 |         model=model,
563 |         args=training_args,
564 |         train_dataset=train_dataset if training_args.do_train else None,
565 |         eval_dataset=eval_dataset if training_args.do_eval else None,
566 |         tokenizer=tokenizer,
567 |         data_collator=data_collator,
568 |         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
569 |     )
570 | 
571 |     # Training
572 |     if training_args.do_train:
573 |         checkpoint = None
574 |         if training_args.resume_from_checkpoint is not None:
575 |             checkpoint = training_args.resume_from_checkpoint
576 |         elif last_checkpoint is not None:
577 |             checkpoint = last_checkpoint
578 |         train_result = trainer.train(resume_from_checkpoint=checkpoint)
579 |         trainer.save_model()  # Saves the tokenizer too for easy upload
580 | 
581 |         metrics = train_result.metrics
582 |         max_train_samples = (
583 |             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
584 |         )
585 |         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
586 | 
587 |         trainer.log_metrics("train", metrics)
588 |         trainer.save_metrics("train", metrics)
589 |         trainer.save_state()
590 | 
591 |     # Evaluation
592 |     results = {}
593 |     if training_args.do_eval:
594 |         logger.info("*** Evaluate ***")
595 | 
596 |         metrics = trainer.evaluate(
597 |             max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
598 |         )
599 |         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
600 |         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
601 | 
602 |         trainer.log_metrics("eval", metrics)
603 |         trainer.save_metrics("eval", metrics)
604 | 
605 |     if training_args.do_predict:
606 |         logger.info("*** Predict ***")
607 | 
608 |         predict_results = trainer.predict(
609 |             predict_dataset,
610 |             metric_key_prefix="predict",
611 |             max_length=data_args.val_max_target_length,
612 |             num_beams=data_args.num_beams,
613 |         )
614 |         metrics = predict_results.metrics
615 |         max_predict_samples = (
616 |             data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
617 |         )
618 |         metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
619 | 
620 |         trainer.log_metrics("predict", metrics)
621 |         trainer.save_metrics("predict", metrics)
622 | 
623 |         if trainer.is_world_process_zero():
624 |             if training_args.predict_with_generate:
625 |                 predictions = tokenizer.batch_decode(
626 |                     predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
627 |                 )
628 |                 predictions = [pred.strip() for pred in predictions]
629 |                 output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
630 |                 with open(output_prediction_file, "w") as writer:
631 |                     writer.write("\n".join(predictions))
632 | 
633 |     if training_args.push_to_hub:
634 |         kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
635 |         if data_args.dataset_name is not None:
636 |             kwargs["dataset_tags"] = data_args.dataset_name
637 |             if data_args.dataset_config_name is not None:
638 |                 kwargs["dataset_args"] = data_args.dataset_config_name
639 |                 kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
640 |             else:
641 |                 kwargs["dataset"] = data_args.dataset_name
642 | 
643 |         trainer.push_to_hub(**kwargs)
644 | 
645 |     return results
646 | 
647 | 
648 | def _mp_fn(index):
649 |     # For xla_spawn (TPUs)
650 |     main()
651 | 
652 | 
653 | if __name__ == "__main__":
654 |     main()
655 | 


--------------------------------------------------------------------------------