├── .gitignore ├── LICENSE ├── Paragraph2Action ├── README.md ├── data │ ├── few_data │ │ ├── few_example_data │ │ │ ├── src-test.txt │ │ │ ├── src-train.txt │ │ │ ├── src-valid.txt │ │ │ ├── tgt-test.txt │ │ │ ├── tgt-train.txt │ │ │ └── tgt-valid.txt │ │ ├── few_test │ │ │ └── test_10.csv │ │ ├── few_train │ │ │ └── train_80.csv │ │ └── few_valid │ │ │ └── valid_10.csv │ ├── hand_annotated │ │ └── README.md │ └── processed_data │ │ └── README.md ├── evaluate_for_paragraph2action.ipynb ├── finetune_bart_or_t5_for_paragraph2action.py ├── finetune_chatgpt_for_paragraph2action.ipynb ├── finetune_llms_full_for_paragraph2action.py ├── finetune_llms_peft_for_paragraph2action.py ├── prompt_chatgpt_for_paragraph2action.ipynb ├── results │ └── README.md ├── vllm_inference_full_finetuned_llms.ipynb └── vllm_inference_peft_finetuned_llms.ipynb ├── Paragraph2Comound ├── README.md ├── data │ ├── test │ │ ├── test_1000.csv │ │ └── test_10000.csv │ ├── train │ │ ├── trial_1 │ │ │ ├── train_10.csv │ │ │ ├── train_100.csv │ │ │ ├── train_1000.csv │ │ │ └── train_10000.csv │ │ ├── trial_2 │ │ │ ├── train_10.csv │ │ │ ├── train_100.csv │ │ │ ├── train_1000.csv │ │ │ └── train_10000.csv │ │ └── trial_3 │ │ │ ├── train_10.csv │ │ │ ├── train_100.csv │ │ │ ├── train_1000.csv │ │ │ └── train_10000.csv │ └── valid │ │ └── valid_1000.csv ├── evaluate_for_paragraph2compound.ipynb ├── finetune_bart_or_t5_for_paragraph2compound.py ├── finetune_llms_full_for_paragraph2compound.py ├── finetune_llms_peft_for_paragraph2compound.py ├── results │ └── README.md ├── vllm_inference_full_finetuned_llms.ipynb └── vllm_inference_peft_finetuned_llms.ipynb ├── Paragraph2MOFInfo ├── README.md ├── data │ ├── data_for_bart_and_t5 │ │ ├── test_329_11_tasks.csv │ │ └── train_329_11_tasks.csv │ ├── data_for_llms │ │ ├── test_329.csv │ │ └── train_329.csv │ └── raw_data │ │ ├── MOF_annotated_data.json │ │ └── MOF_processed_dict.csv ├── evaluate_llms_paragraph2MOFInfo_all.ipynb ├── evaluate_llms_paragraph2MOFInfo_split_multiple_reactions.ipynb ├── evaluate_llms_paragraph2MOFInfo_split_single_reaction.ipynb ├── finetune_bart_or_t5_for_paragraph2MOFInfo.py ├── finetune_llms_full_for_paragraph2MOFInfo.py ├── finetune_llms_peft_for_paragraph2MOFInfo.py ├── results │ └── README.md ├── vllm_inference_full_finetuned_llms.ipynb └── vllm_inference_peft_finetuned_llms.ipynb ├── Paragraph2NMR ├── README.md ├── data │ ├── data_for_bart_or_t5 │ │ ├── test_300_one_column_lstrip_add_space.csv │ │ └── train_200_one_column_lstrip_add_space.csv │ └── data_for_llms │ │ ├── test │ │ └── test_300.csv │ │ └── train │ │ ├── train_100_data_in_200.csv │ │ ├── train_200_data_in_300.csv │ │ ├── train_25_data_in_50.csv │ │ ├── train_300.csv │ │ └── train_50_data_in_100.csv ├── evaluate_bart_or_t5_Paragraph2NMR.ipynb ├── evaluate_llms_Paragraph2NMR.ipynb ├── finetune_bart_or_t5_for_paragraph2NMR.py ├── finetune_llms_full_for_paragraph2NMR.py ├── finetune_llms_peft_for_paragraph2NMR.py ├── results │ └── README.md ├── vllm_inference_full_finetuned_llms.ipynb └── vllm_inference_peft_finetuned_llms.ipynb ├── Paragraph2RXNRole ├── Paragraph2Prod │ ├── README.md │ ├── data │ │ └── prod │ │ │ ├── test.csv │ │ │ ├── train.csv │ │ │ └── valid.csv │ ├── evaluate_Paragraph2Prod.ipynb │ ├── finetune_bart_or_t5_for_paragraph2prod.py │ ├── finetune_llms_full_for_paragraph2prod.py │ ├── finetune_llms_peft_for_paragraph2prod.py │ ├── results │ │ └── README.md │ ├── vllm_inference_full_finetuned_llms.ipynb │ └── vllm_inference_peft_finetuned_llms.ipynb ├── Paragraph2Role │ ├── 
README.md │ ├── data │ │ └── role │ │ │ ├── test.csv │ │ │ ├── train.csv │ │ │ └── valid.csv │ ├── evaluate_Paragraph2Role.ipynb │ ├── finetune_bart_or_t5_for_paragraph2role.py │ ├── finetune_llms_full_for_paragraph2role.py │ ├── finetune_llms_peft_for_paragraph2role.py │ ├── results │ │ └── README.md │ ├── vllm_inference_full_finetuned_llms.ipynb │ └── vllm_inference_peft_finetuned_llms.ipynb └── README.md ├── README.md └── demo ├── fine-tuning_chatgpt_on_25_paragraph2NMR_data.ipynb ├── test_300.csv └── train_25.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Paragraph2Compound 100000 Dataset (too large) 2 | Paragraph2Comound/data/train/trial_1/train_100000.csv 3 | Paragraph2Comound/data/train/trial_2/train_100000.csv 4 | Paragraph2Comound/data/train/trial_3/train_100000.csv 5 | 6 | # Paragraph2Action Whole Dataset (pistachio license needed) 7 | Paragraph2Action/data/hand_annotated/*.txt 8 | Paragraph2Action/data/processed_data/*.csv 9 | Paragraph2Action/data/processed_data/*.jsonl 10 | 11 | # Saved Models 12 | */saved_models 13 | */*/saved_models 14 | 15 | # Predicted results 16 | */results/*.csv 17 | */results/*.ipynb 18 | */results/*.png 19 | */results/*.svg 20 | */results/*.txt 21 | */results/predictions/* 22 | */*/results/*.csv 23 | */*/results/*.ipynb 24 | */*/results/*.png 25 | */*/results/*.svg 26 | */*/results/*.txt 27 | */*/results/predictions/* 28 | 29 | # Test Files 30 | */*wqg* 31 | */*zw* 32 | */*copy* 33 | */*/*wqg* 34 | */*/*zw* 35 | */*/*copy* 36 | */*log* 37 | */*/*log* 38 | */*/*svg* 39 | */*/*/*svg* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 zw-SIMM 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Paragraph2Action/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Data for Paragraph2Action 2 | The ```data/few_example_data``` of Paragraph2Action dataset is a subset of https://github.com/rxn4chemistry/paragraph2actions/tree/main/test_data 3 | 4 | The whole paragraph2action dataset ```data/hand_annotated``` is available upon request (with pistachio license). 5 | 6 | The processed dataset is in ```data/processed_data```. 7 | 8 | ## 2. 
Methods for Paragraph2Action 9 | 10 | ### Prompt Engineering ChatGPT (GPT-4, GPT-3.5-Turbo) 11 | 12 | See ```prompt_chatgpt_for_paragraph2action.ipynb``` 13 | 14 | ### Fine-tuning ChatGPT (GPT-3.5-Turbo) 15 | 16 | See ```finetune_chatgpt_for_paragraph2action.ipynb``` 17 | 18 | ### Full Parameter Fine-tuning Open-source Large Language Models (Mistral, Llama3, Llama2) 19 | 20 | Training code is in ```finetune_llms_full_for_paragraph2action.py``` 21 | 22 | Inference code is in ```vllm_inference_full_finetuned_llms.ipynb``` 23 | 24 | ### Parameter Efficient Fine-tuning (PEFT) Open-source Large Language Models (Mistral, Llama3, Llama2) 25 | 26 | Training code is in ```finetune_llms_peft_for_paragraph2action.py``` 27 | 28 | Inference code is in ```vllm_inference_peft_finetuned_llms.ipynb``` 29 | 30 | ### Fine-tuning Language Models (T5, Bart) 31 | 32 | Training code is in ```finetune_bart_or_t5_for_paragraph2action.py``` 33 | 34 | 35 | ## 3. Evaluating the results of Paragraph2Action 36 | 37 | All predictions will be saved in ```results/predictions``` 38 | 39 | Evaluation code is in ```evaluate_for_paragraph2action.ipynb``` 40 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_example_data/src-test.txt: -------------------------------------------------------------------------------- 1 | The reaction mixture is allowed to warm to room temperature and stirred overnight. 2 | The combined organic layers were washed with brine, dried over sodium sulfate, filtered, and concentrated to yield 2.98 g of N-{2-[4-amino-7-[(6-aminohexyl)oxy]-2-(ethoxymethyl)-1H-imidazo[4,5-c]quinolin-1-yl]-1,1-dimethylethyl}-N-isopropylurea as a dark orange solid. 3 | 3-Amino-2,4-dichloro-benzamide (2.00 g, 9.8 mmol) in THF (45 mL) was added dropwise to LiAlH4 (1 M in THF, 24.4 mL) in THF (45 mL). 4 | A further drop of methanesulfonyl chloride was added and the mixture continued to stir at RT for 30 min. 5 | The reaction mixture was cooled to −80° C., and a solution of tert-butyl 6-[(cyclopropylmethoxy)methyl]-6-hydroxy-1,4-oxazepane-4-carboxylate (Preparation 80, 50 g, 0.22 mol, 1 eq) in THF was added. 6 | After few hours, reaction mixture can be diluted with water and neutralize with hydrochloric acid, filter and recrystallize to give title compound. 7 | The reaction mixture was allowed to reach room temperature and stirred over night. 8 | After the reaction is completed it was cooled to room temperature and extracted with ethyl acetate. 9 | The resulting precipitate was collected by filtration, washed with water and ether and dried for 4 hours at 60° C. under high vacuum to give 7-benzyloxy-3,4-dihydroquinazolin-4-one (7.02 g, 63%). 10 | After concentration, the residue was diluted with dichloromethane (200 mL/mmol), washed with water (62 mL/mmol x 3). 11 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_example_data/src-valid.txt: -------------------------------------------------------------------------------- 1 | The reaction mixture was heated to 60 °C to give a homogeneous solution that was maintained at 60 °C for 16 hours at which time the starting material was completely consumed. 2 | To the residue were added ethyl acetate (3 ml) and heptane (3 ml) to precipitate a solid. 
3 | To a stirred solution of 5-(4-bromophenyl)-1-[3-fluoro-4-(methylsulfonyl)phenyl]-3-(trifluoromethyl)-1H-pyrazole (0.15 g, 0.324 mmol) in DME (3.9 mL) was added furan -3-boronic acid (0.044 g, 0.389 mmol), bis(triphenylphosphine) palladium( II)chloride (0.025 g, 0.04 mmol) and saturated NaHCO3 solution (1.3 mL) at room temperature under nitrogen. 4 | Yield 1.5 g (50.34%). 5 | Appl. (1992), GB 2253848 A1 was reacted with [2-methyl-5-(4-trifluoromethoxy-phenyl)-2H-pyrazol-3-yl]-methanol (example 11 d]) in the presence of N,N,N',N'-tetramethyl azodicarboxamide and tributylphosphine to give [rac]-2-{6-[2-methyl-5-(4-trifluoromethoxy-phenyl)-2H-pyrazol-3-ylmethoxy]-indol-1-yl}-propionic acid-ethyl ester as colorless oil. 6 | Potassium hydroxide (3.65 g, 65.1 mmol) is added all at once to a suspension of 4-bromo-3-methyl-1-(4-methylbenzenesulphonyl)-1H-pyrrole-2-carbonitrile (4.66 g, 13.7 mmol) in methanol (95 mL) cooled using an ice bath. 7 | The reaction mixture was stirred for 3 hours, and then 300 ml of water was added. 8 | The reaction was allowed to stir at room temperature for 92 hours and the mixture was poured into 2M aqueous hydrochloric acid (85ml) then diluted with water (170ml). 9 | Without further workup, the mixture is purified directly by preparative HPLC [Method 10]. 10 | The reaction vessel is sealed and the solution is left to stir for 20 h. EtOAc (50 mL) is added. 11 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_example_data/tgt-test.txt: -------------------------------------------------------------------------------- 1 | STIR for overnight at room temperature. 2 | COLLECTLAYER organic; WASH with brine; DRYSOLUTION over sodium sulfate; FILTER keep filtrate; CONCENTRATE; YIELD N-{2-[4-amino-7-[(6-aminohexyl)oxy]-2-(ethoxymethyl)-1H-imidazo[4,5-c]quinolin-1-yl]-1,1-dimethylethyl}-N-isopropylurea (2.98 g). 3 | ADD LiAlH4 ‌(1 M in THF) (24.4 mL); ADD THF (45 mL); MAKESOLUTION with 3-Amino-2,4-dichloro-benzamide (2.00 g, 9.8 mmol) and THF (45 mL); ADD SLN dropwise. 4 | ADD methanesulfonyl chloride (1 drop); STIR for 30 min at RT. 5 | SETTEMPERATURE −80° C; MAKESOLUTION with tert-butyl 6-[(cyclopropylmethoxy)methyl]-6-hydroxy-1,4-oxazepane-4-carboxylate (50 g, 0.22 mol, 1 eq) and THF; ADD SLN. 6 | WAIT for few hours; ADD water; PH with hydrochloric acid to pH neutral; FILTER; RECRYSTALLIZE from unknown; YIELD title compound. 7 | STIR for over night at room temperature. 8 | SETTEMPERATURE room temperature; EXTRACT with ethyl acetate. 9 | FILTER keep precipitate; WASH with water; WASH with ether; DRYSOLID for 4 hours at 60° C under vacuum; YIELD 7-benzyloxy-3,4-dihydroquinazolin-4-one (7.02 g, 63%). 10 | CONCENTRATE; ADD dichloromethane (200 mL); WASH with water (62 mL) 3 x. 11 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_example_data/tgt-train.txt: -------------------------------------------------------------------------------- 1 | PHASESEPARATION. 2 | WAIT for three days at RT; ADD ethyl acetate; WASH with water. 3 | ADD 3-Bromo-2-fluoroaniline (10 g, 52.63 mmol); ADD DCM (100 mL) under nitrogen. 4 | SETTEMPERATURE rt; CONCENTRATE. 5 | PURIFY; YIELD benzyl 3-(formamido(7-((2-(trimethylsilyl)ethoxy)methyl)-7H-pyrrolo[2,3-d]pyrimidin-4-yl)methyl)piperidine-1-carboxylate (2.2 g, 69%). 6 | OTHERLANGUAGE. 7 | CONCENTRATE; ADD MeOH; PURIFY. 
8 | MAKESOLUTION with methyl 1H-indazole-6-carboxylate (865 mg, 4.91 mmol) and N,N-dimethylformamide (12 mL); ADD SLN; ADD potassium hydroxide (840 mg, 3.05 mmol); ADD iodine (1.54 g, 5.9 mmol). 9 | PURIFY; COLLECTLAYER organic; PH with K2CO3 (0.5 g) to pH basic; EXTRACT with DCM (50 ml) 2 x; DRYSOLUTION over unknown; CONCENTRATE; YIELD yellow solid (101 mg, 31%). 10 | STIR for overnight at 90°C. 11 | ADD water; EXTRACT with EA 3 x; COLLECTLAYER organic; WASH with brine; DRYSOLUTION over anhydrous Na2SO4. 12 | NOACTION. 13 | SETTEMPERATURE room temperature; CONCENTRATE. 14 | ADD Crushed ice (40 g); PH with 40% aqueous KOH solution to pH 12. 15 | STIR for 10 min at 0 °C; STIR for 20 min at RT. 16 | FILTER keep filtrate; CONCENTRATE; PURIFY. 17 | WAIT for unknown at below 30°. 18 | MAKESOLUTION with 4-chloro-3-(2,3-dichloro-4-methoxybenzyl)-5-difluoromethoxy-1-methyl-1H-pyrazole (3.37 g, 9 mmol) and dichloromethane (45 ml); ADD SLN; SETTEMPERATURE (−78)° C; ADD 1-molar solution of boron tribromide in dichloromethane (18.1 ml, 18 mmol). 19 | ADD dimethylsulfoxide (15 ml); ADD Ethyl 7,8,9-trifluoro-1-hydroxymethyl-5-oxo-5H-thiazolo[3,2-a]quinoline-4-carboxylate (0.50 g); ADD triethylamine (0.40 g); STIR for 1 hour and 30 minutes at 75°C. 20 | PURIFY; PH with PL-HCO3 resin to pH neutral. 21 | NOACTION. 22 | MAKESOLUTION with obtained residue (40 mg) and DCM (2.0 ml); ADD SLN; ADD trifluoroacetic acid (500 μl) at 0 °C; STIR for 2 hours at same temperature. 23 | FOLLOWOTHERPROCEDURE. 24 | COLLECTLAYER aqueous; EXTRACT with dichloromethane 3 x; COLLECTLAYER organic; WASH with saturated aqueous sodium bicarbonate; WASH with brine. 25 | ADD (7-Fluoro-2-oxo-1,5-naphthyridin-1(2H)-yl)acetaldehyde methyl hemiacetal (200 mg, 0.8396 mmol); ADD 1,1-dimethylethyl[(3S,4R)-3-hydroxy-4-piperidinyl]carbamate (192 mg, 1 eq); ADD chloroform (10 ml); ADD MeOH (0.5 ml); STIR for 2 h under argon. 26 | REFLUX for 5 minutes; SETTEMPERATURE room temperature; CONCENTRATE. 27 | ADD water; PH with 1.0 N HCl aqueous solution to pH 3 to 4; EXTRACT with EtOAc. 28 | SETTEMPERATURE 0° C; WAIT for unknown; FILTER keep precipitate. 29 | STIR for 20 min at room temperature; ADD water; EXTRACT with dichloromethane. 30 | CONCENTRATE; PURIFY; YIELD compound 162. 31 | MAKESOLUTION with (6,7-difluoro-3-methyl-1,2-benzisoxazol-5-yl)methanol (200 mg, 1.0 mmol) and CH2Cl2 (10 mL); ADD SLN; ADD NMO (235 mg, 2.0 mmol) at 0 °C; ADD TPAP (35 mg, 0.1 mmol) at 0 °C; STIR for 2 hours at room temperature. 32 | MAKESOLUTION with 5-amino-2-methoxy-4-nitrobenzoic acid (5.55 g) and benzene (40 ml); ADD SLN; ADD thionyl chloride (40 ml); REFLUX for 4 hours. 33 | NOACTION. 34 | ADD DCM (1-2 mL); ADD Et2O. 35 | STIR for 30 minutes at 25° C; PH with 6M NaOH (0.135 L) to pH 9. 36 | MAKESOLUTION with (pyridin-2-yl)methanol (0.724 ml) and dichloromethane (40 ml); ADD SLN; ADD Triphenylphosphine (2.95 g) at 0 °C; ADD carbon tetrabromide (3.73 g) at 0 °C; STIR for 1.5 hr at same temperature. 37 | INVALIDACTION. 38 | CONCENTRATE; PURIFY; YIELD 2-(5-(3,5-dichlorophenyl)-5-(trifluoromethyl)-4,5-dihydroisoxazol-3-yl)-N-(2-oxo-2-(2,2,2-trifluoroethylamino)ethyl)thieno[2,3-b]pyridine-5-carboxamide (50 mg, 34.78%). 39 | SETTEMPERATURE ambient temperature; ADD H2O (250 mL). 40 | ADD 4-(1-{2-fluoro-1-[(3S)-pyrrolidin-3-yl]ethyl}-1H-pyrazol-4-yl)-7-{[2-(trimethylsilyl)ethoxy]methyl}-7H-pyrrolo[2,3-d]pyrimidine (25 mg, 0.058 mmol); ADD DIPEA (2.0E1 μL, 0.12 mmol); ADD NMP (0.2 mL). 
41 | STIR for 1.5 hours; MAKESOLUTION with 3-amino-4-{[3-(dimethylamino)propyl]amino}-N-methylbenzamide (1.653 g) and chloroform (9 ml); ADD SLN. 42 | INVALIDACTION. 43 | INVALIDACTION. 44 | STIR for 1 hour at room temperature; ADD methanolic HCl. 45 | STIR for 30 min at 0°C; SETTEMPERATURE ambient temperature. 46 | COLLECTLAYER aqueous; PH with 1N HCl (8 mL) to pH 3. 47 | SETTEMPERATURE 0 °C; MAKESOLUTION with tert-Butyl 4-oxopiperidine-1-carboxylate (4.98 g) and dichloromethane (20 mL); STIR for 1 hour at room temperature. 48 | STIR for 1 hr at room temperature; PH with 1N hydrochloric acid to pH neutral; EXTRACT with ethyl acetate. 49 | NOACTION. 50 | ADD THF (64 ml); ADD methanol (24 ml); ADD compound obtained in Example 149-1 (396 mg, 0.696 mmol); ADD 2.5% potassium carbonate aqueous solution (36 ml); STIR for 6 hours at room temperature. 51 | STIR for 15 min at 70° C; SETTEMPERATURE r.t.. 52 | SETTEMPERATURE 110°C. 53 | FOLLOWOTHERPROCEDURE. 54 | MAKESOLUTION with methyl ester of 4-methyl-3-cyclohexene-1-carboxylic acid (100 g) and dry tetrahydrofuran (20 ml); ADD SLN at 0 °C. 55 | NOACTION. 56 | WAIT for overnight. 57 | SETTEMPERATURE room temperature; STIR for overnight at room temperature. 58 | ADD Water; PH with 2N hydrochloric acid to pH neutral; EXTRACT with ethyl acetate. 59 | FOLLOWOTHERPROCEDURE. 60 | ADD ice water; WASH with diethyl ether; PH with 5 M aqueous solution of sodium hydroxide to pH neutral; EXTRACT with ethyl acetate 2 x. 61 | MAKESOLUTION with 2-[(3,6-dichloro-4-pyridazinyl)oxy]ethanol (15.46 g, 0.0703 mol) and dry 1,4-dioxane (1.2 L); ADD SLN; ADD lithium hydride (2.3 g, 0.28 mol); STIR for 1 hour at room temperature under argon; STIR for overnight at 110° C. 62 | ADD 2M dimethyl amine solution in THF (5 mL); ADD 2,8-Dimethyl-5-((2-(pyridin-4-yl)oxiran-2-yl)methyl)-2,3,4,5-tetrahydro-1H-pyrido[4,3-b]indole (100 mg, 0.3 mmol); STIR for overnight at 60° C. 63 | STIR for 48 hours at room temperature. 64 | MAKESOLUTION with 6-(3-fluoro-4-methoxyphenyl)-5-(4-methoxyphenyl)-2H-pyridazin-3-one (150 mg, 0.46 mmol) and N,N-dimethylformamide (1.5 ml); ADD SLN; ADD potassium carbonate (317.6 mg); ADD ethyl iodide (179.2 mg); STIR for 3 hours at 70° C. 65 | FOLLOWOTHERPROCEDURE. 66 | STIR for 4 hrs at −70° C; SETTEMPERATURE room temperature. 67 | PH with NaOH solution to pH 9; PHASESEPARATION. 68 | FOLLOWOTHERPROCEDURE. 69 | NOACTION. 70 | STIR for 60 minutes at same temperature; SETTEMPERATURE room temperature. 71 | COLLECTLAYER aqueous; EXTRACT with ethyl acetate (100 ml); COLLECTLAYER organic; WASH with water (100 ml) 3 x; WASH with saturated aqueous sodium chloride (100 ml); DRYSOLUTION over MgSO4; FILTER keep filtrate; CONCENTRATE; YIELD oily residue (3.9 g). 72 | NOACTION. 73 | WAIT for 20 minutes. 74 | ADD 3-allyloxy-4-methoxynitrobenzene over 30 minute; STIR for 3 hours at 95° C. 75 | YIELD title product (20.64 g, 84%). 76 | ADD water (200 ml); PH with KOH pellets to pH neutral; EXTRACT with CH2Cl2 (100 ml) 3 x. 77 | SETTEMPERATURE room temperature; FILTER keep precipitate; WASH with cold triethyl orthoformate; WASH with heptane; DRYSOLID under vacuum. 78 | PH with 1N HCl to pH ˜2. 79 | STIR for overnight at room temperature; QUENCH with saturated aqueous NH4Cl solution. 80 | ADD 3-Cyclobutyl-7-(piperidin-4-yloxy)-2,3,4,5-tetrahydro-1H-benzo[d]azepine (150 mg, 0.5 mmol); ADD dichloromethane (5 ml); ADD diethylaminomethyl polystyrene ‌(3.2 mmol/g) (625 mg, 2 mmol). 
81 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_example_data/tgt-valid.txt: -------------------------------------------------------------------------------- 1 | SETTEMPERATURE 60 °C; STIR for 16 hours at 60 °C. 2 | ADD ethyl acetate (3 ml); ADD heptane (3 ml). 3 | MAKESOLUTION with 5-(4-bromophenyl)-1-[3-fluoro-4-(methylsulfonyl)phenyl]-3-(trifluoromethyl)-1H-pyrazole (0.15 g, 0.324 mmol) and DME (3.9 mL); ADD SLN; ADD furan -3-boronic acid (0.044 g, 0.389 mmol) at room temperature under nitrogen; ADD bis(triphenylphosphine) palladium( II)chloride (0.025 g, 0.04 mmol) at room temperature under nitrogen; ADD saturated NaHCO3 solution (1.3 mL) at room temperature under nitrogen. 4 | NOACTION. 5 | FOLLOWOTHERPROCEDURE. 6 | MAKESOLUTION with 4-bromo-3-methyl-1-(4-methylbenzenesulphonyl)-1H-pyrrole-2-carbonitrile (4.66 g, 13.7 mmol) and methanol (95 mL); ADD SLN; ADD Potassium hydroxide (3.65 g, 65.1 mmol) at 0 °C. 7 | STIR for 3 hours; ADD water (300 ml). 8 | STIR for 92 hours at room temperature; ADD 2M aqueous hydrochloric acid (85ml); ADD water (170ml). 9 | PURIFY. 10 | STIR for 20 h; ADD EtOAc (50 mL). 11 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_test/test_10.csv: -------------------------------------------------------------------------------- 1 | Paragraph,Action 2 | The reaction mixture is allowed to warm to room temperature and stirred overnight.,STIR for overnight at room temperature. 3 | "The combined organic layers were washed with brine, dried over sodium sulfate, filtered, and concentrated to yield 2.98 g of N-{2-[4-amino-7-[(6-aminohexyl)oxy]-2-(ethoxymethyl)-1H-imidazo[4,5-c]quinolin-1-yl]-1,1-dimethylethyl}-N-isopropylurea as a dark orange solid.","COLLECTLAYER organic; WASH with brine; DRYSOLUTION over sodium sulfate; FILTER keep filtrate; CONCENTRATE; YIELD N-{2-[4-amino-7-[(6-aminohexyl)oxy]-2-(ethoxymethyl)-1H-imidazo[4,5-c]quinolin-1-yl]-1,1-dimethylethyl}-N-isopropylurea (2.98 g)." 4 | "3-Amino-2,4-dichloro-benzamide (2.00 g, 9.8 mmol) in THF (45 mL) was added dropwise to LiAlH4 (1 M in THF, 24.4 mL) in THF (45 mL).","ADD LiAlH4 ‌(1 M in THF) (24.4 mL); ADD THF (45 mL); MAKESOLUTION with 3-Amino-2,4-dichloro-benzamide (2.00 g, 9.8 mmol) and THF (45 mL); ADD SLN dropwise." 5 | A further drop of methanesulfonyl chloride was added and the mixture continued to stir at RT for 30 min.,ADD methanesulfonyl chloride (1 drop); STIR for 30 min at RT. 6 | "The reaction mixture was cooled to −80° C., and a solution of tert-butyl 6-[(cyclopropylmethoxy)methyl]-6-hydroxy-1,4-oxazepane-4-carboxylate (Preparation 80, 50 g, 0.22 mol, 1 eq) in THF was added.","SETTEMPERATURE −80° C; MAKESOLUTION with tert-butyl 6-[(cyclopropylmethoxy)methyl]-6-hydroxy-1,4-oxazepane-4-carboxylate (50 g, 0.22 mol, 1 eq) and THF; ADD SLN." 7 | "After few hours, reaction mixture can be diluted with water and neutralize with hydrochloric acid, filter and recrystallize to give title compound.",WAIT for few hours; ADD water; PH with hydrochloric acid to pH neutral; FILTER; RECRYSTALLIZE from unknown; YIELD title compound. 8 | The reaction mixture was allowed to reach room temperature and stirred over night.,STIR for over night at room temperature. 9 | After the reaction is completed it was cooled to room temperature and extracted with ethyl acetate.,SETTEMPERATURE room temperature; EXTRACT with ethyl acetate. 
10 | "The resulting precipitate was collected by filtration, washed with water and ether and dried for 4 hours at 60° C. under high vacuum to give 7-benzyloxy-3,4-dihydroquinazolin-4-one (7.02 g, 63%).","FILTER keep precipitate; WASH with water; WASH with ether; DRYSOLID for 4 hours at 60° C under vacuum; YIELD 7-benzyloxy-3,4-dihydroquinazolin-4-one (7.02 g, 63%)." 11 | "After concentration, the residue was diluted with dichloromethane (200 mL/mmol), washed with water (62 mL/mmol x 3).",CONCENTRATE; ADD dichloromethane (200 mL); WASH with water (62 mL) 3 x. 12 | -------------------------------------------------------------------------------- /Paragraph2Action/data/few_data/few_valid/valid_10.csv: -------------------------------------------------------------------------------- 1 | Paragraph,Action 2 | The reaction mixture was heated to 60 °C to give a homogeneous solution that was maintained at 60 °C for 16 hours at which time the starting material was completely consumed.,SETTEMPERATURE 60 °C; STIR for 16 hours at 60 °C. 3 | To the residue were added ethyl acetate (3 ml) and heptane (3 ml) to precipitate a solid.,ADD ethyl acetate (3 ml); ADD heptane (3 ml). 4 | "To a stirred solution of 5-(4-bromophenyl)-1-[3-fluoro-4-(methylsulfonyl)phenyl]-3-(trifluoromethyl)-1H-pyrazole (0.15 g, 0.324 mmol) in DME (3.9 mL) was added furan -3-boronic acid (0.044 g, 0.389 mmol), bis(triphenylphosphine) palladium( II)chloride (0.025 g, 0.04 mmol) and saturated NaHCO3 solution (1.3 mL) at room temperature under nitrogen.","MAKESOLUTION with 5-(4-bromophenyl)-1-[3-fluoro-4-(methylsulfonyl)phenyl]-3-(trifluoromethyl)-1H-pyrazole (0.15 g, 0.324 mmol) and DME (3.9 mL); ADD SLN; ADD furan -3-boronic acid (0.044 g, 0.389 mmol) at room temperature under nitrogen; ADD bis(triphenylphosphine) palladium( II)chloride (0.025 g, 0.04 mmol) at room temperature under nitrogen; ADD saturated NaHCO3 solution (1.3 mL) at room temperature under nitrogen." 5 | Yield 1.5 g (50.34%).,NOACTION. 6 | "Appl. (1992), GB 2253848 A1 was reacted with [2-methyl-5-(4-trifluoromethoxy-phenyl)-2H-pyrazol-3-yl]-methanol (example 11 d]) in the presence of N,N,N',N'-tetramethyl azodicarboxamide and tributylphosphine to give [rac]-2-{6-[2-methyl-5-(4-trifluoromethoxy-phenyl)-2H-pyrazol-3-ylmethoxy]-indol-1-yl}-propionic acid-ethyl ester as colorless oil.",FOLLOWOTHERPROCEDURE. 7 | "Potassium hydroxide (3.65 g, 65.1 mmol) is added all at once to a suspension of 4-bromo-3-methyl-1-(4-methylbenzenesulphonyl)-1H-pyrrole-2-carbonitrile (4.66 g, 13.7 mmol) in methanol (95 mL) cooled using an ice bath.","MAKESOLUTION with 4-bromo-3-methyl-1-(4-methylbenzenesulphonyl)-1H-pyrrole-2-carbonitrile (4.66 g, 13.7 mmol) and methanol (95 mL); ADD SLN; ADD Potassium hydroxide (3.65 g, 65.1 mmol) at 0 °C." 8 | "The reaction mixture was stirred for 3 hours, and then 300 ml of water was added.",STIR for 3 hours; ADD water (300 ml). 9 | The reaction was allowed to stir at room temperature for 92 hours and the mixture was poured into 2M aqueous hydrochloric acid (85ml) then diluted with water (170ml).,STIR for 92 hours at room temperature; ADD 2M aqueous hydrochloric acid (85ml); ADD water (170ml). 10 | "Without further workup, the mixture is purified directly by preparative HPLC [Method 10].",PURIFY. 11 | The reaction vessel is sealed and the solution is left to stir for 20 h. EtOAc (50 mL) is added.,STIR for 20 h; ADD EtOAc (50 mL). 
12 | -------------------------------------------------------------------------------- /Paragraph2Action/data/hand_annotated/README.md: -------------------------------------------------------------------------------- 1 | The whole ```hand_annotated``` paragraph2action dataset is available upon request (pistachio license needed), sourced from https://github.com/rxn4chemistry/paragraph2actions/tree/main/test_data -------------------------------------------------------------------------------- /Paragraph2Action/data/processed_data/README.md: -------------------------------------------------------------------------------- 1 | The whole ```hand_annotated``` paragraph2action dataset is available upon request (pistachio license needed), sourced from https://github.com/rxn4chemistry/paragraph2actions/tree/main/test_data -------------------------------------------------------------------------------- /Paragraph2Action/evaluate_for_paragraph2action.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Functions of calculating Metrics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# The code of calculating metrics are from:\n", 17 | "# https://github.com/rxn4chemistry/paragraph2actions/blob/main/src/paragraph2actions/analysis.py\n", 18 | "# https://github.com/rxn4chemistry/paragraph2actions/blob/main/src/paragraph2actions/scripts/calculate_metrics.py\n", 19 | "\n", 20 | "import os\n", 21 | "import pandas as pd\n", 22 | "from typing import Tuple, Sequence, List, Optional, Iterable, Any, Iterator, Callable\n", 23 | "import textdistance\n", 24 | "from nltk.translate.bleu_score import corpus_bleu\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "\n", 27 | "def all_identical(sequence: Sequence[Any]) -> bool:\n", 28 | " return all(s == sequence[0] for s in sequence)\n", 29 | "\n", 30 | "\n", 31 | "def highlight_differences(source_sentences: List[str], translations: Sequence[List[str]]) -> None:\n", 32 | " \"\"\"\n", 33 | " Will highlight sentences that are translated differently by different models.\n", 34 | "\n", 35 | " Args:\n", 36 | " source_sentences: Sentences to translate (length: L)\n", 37 | " translations: Multiple lists of translations, depending on the number of translation models (size: n_models x L)\n", 38 | " \"\"\"\n", 39 | " assert all(len(l) == len(source_sentences) for l in translations)\n", 40 | "\n", 41 | " for i, sentence in enumerate(source_sentences):\n", 42 | " sentence_translations = [t[i] for t in translations]\n", 43 | "\n", 44 | " if not all_identical(sentence_translations):\n", 45 | " print(f'Sample {i}: {sentence}')\n", 46 | " for model_no, s in enumerate(sentence_translations, 1):\n", 47 | " print(f'{model_no}) {s}')\n", 48 | " print()\n", 49 | "\n", 50 | "\n", 51 | "def full_sentence_accuracy(truth: List[str], pred: List[str]) -> float:\n", 52 | " \"\"\"\n", 53 | " Calculate the number of exact matches.\n", 54 | " \"\"\"\n", 55 | " assert len(truth) == len(pred)\n", 56 | "\n", 57 | " correct_count = sum(int(t == p) for t, p in zip(truth, pred))\n", 58 | " return correct_count / len(truth)\n", 59 | "\n", 60 | "def modified_bleu(truth: List[str], pred: List[str]) -> float:\n", 61 | " \"\"\"\n", 62 | " Calculates the BLEU score of a translation, with a small modification in order not to penalize sentences\n", 63 | " with less than 4 words.\n", 64 | "\n", 
65 | " Returns:\n", 66 | " value between 0 and 1.\n", 67 | " \"\"\"\n", 68 | " \n", 69 | " references = [sentence.split() for sentence in truth]\n", 70 | " candidates = [sentence.split() for sentence in pred]\n", 71 | "\n", 72 | " # BLEU penalizes sentences with only one word. Even correct translations get a score of zero.\n", 73 | " references = [r + max(0, 4 - len(r)) * [''] for r in references]\n", 74 | " candidates = [c + max(0, 4 - len(c)) * [''] for c in candidates]\n", 75 | "\n", 76 | " # references must have a larger depth because it supports multiple choices\n", 77 | " refs = [[r] for r in references]\n", 78 | " return corpus_bleu(refs, candidates)\n", 79 | "\n", 80 | "\n", 81 | "def original_bleu(truth: List[str], pred: List[str]) -> float:\n", 82 | " \"\"\"\n", 83 | " Calculates the BLEU score of a translation, with the original function from nltk.\n", 84 | "\n", 85 | " Returns:\n", 86 | " value between 0 and 1.\n", 87 | " \"\"\"\n", 88 | " references = [sentence.split() for sentence in truth]\n", 89 | " candidates = [sentence.split() for sentence in pred]\n", 90 | "\n", 91 | " # references must have a larger depth because it supports multiple choices\n", 92 | " refs = [[r] for r in references]\n", 93 | " return corpus_bleu(refs, candidates)\n", 94 | "\n", 95 | "\n", 96 | "def bleu2(truth, pred):\n", 97 | " references = [sentence.split() for sentence in truth]\n", 98 | " candidates = [sentence.split() for sentence in pred]\n", 99 | " refs = [[r] for r in references]\n", 100 | " bleu2 = corpus_bleu(refs, candidates, weights=(.5, .5))\n", 101 | " return bleu2\n", 102 | "\n", 103 | "\n", 104 | "def levenshtein_similarity(truth: List[str], pred: List[str]) -> float:\n", 105 | " assert len(truth) == len(pred)\n", 106 | " scores = (textdistance.levenshtein.normalized_similarity(t, p) for t, p in zip(truth, pred))\n", 107 | " return sum(scores) / len(truth)\n", 108 | "\n", 109 | "\n", 110 | "def partial_accuracy(truth: List[str], pred: List[str], threshold: float) -> float:\n", 111 | " \"\"\"\n", 112 | " Calculates the accuracy from the fraction of sentences that have a similarity to the\n", 113 | " ground truth higher than a given threshold.\n", 114 | "\n", 115 | " For threshold == 1.0, this function is equivalent to full_sentence_accuracy.\n", 116 | "\n", 117 | " Args:\n", 118 | " truth: ground truth action sequences\n", 119 | " pred: predicted truth action sequences\n", 120 | " threshold: threshold above which to consider it as a partial match, between 0 and 1\n", 121 | " \"\"\"\n", 122 | " assert len(truth) == len(pred)\n", 123 | " match_count = sum(\n", 124 | " 1 for t, p in zip(truth, pred)\n", 125 | " if textdistance.levenshtein.normalized_similarity(t, p) >= threshold\n", 126 | " )\n", 127 | " return match_count / len(truth)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Calculating Metrics" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 2, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Modified BLEU, pr: 0.864221426194361\n", 147 | "BLEU-2, pr: 0.8859320523310793\n", 148 | "Levenshtein, pr: 0.8991550985984681\n", 149 | "100% accuracy, pr: 0.6903409090909091\n", 150 | "90% accuracy, pr: 0.78125\n", 151 | "75% accuracy, pr: 0.8693181818181818\n", 152 | "\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "\"\"\"Calculate metrics for predictions generated by one or several translation models\"\"\"\n", 158 | 
"\n", 159 | "# load predictions and ground truth\n", 160 | "df = pd.read_csv(\"results/predictions/finetuned_gpt3.5_hand_annotated_train_augmented_unique_5_epoch.csv\")#.fillna(\"0\")\n", 161 | "\n", 162 | "ground_truth = list(df['Actual Text'])\n", 163 | "prediction = list(df['Generated Text']) \n", 164 | "\n", 165 | "# evaluations\n", 166 | "print('Modified BLEU, pr:', modified_bleu(ground_truth, prediction))\n", 167 | "print('BLEU-2, pr:', bleu2(ground_truth, prediction))\n", 168 | "print('Levenshtein, pr:', levenshtein_similarity(ground_truth, prediction))\n", 169 | "print('100% accuracy, pr:', partial_accuracy(ground_truth, prediction, 1.0))\n", 170 | "print('90% accuracy, pr:', partial_accuracy(ground_truth, prediction, 0.9))\n", 171 | "print('75% accuracy, pr:', partial_accuracy(ground_truth, prediction, 0.75))\n", 172 | "print()" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "know", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.10.13" 193 | }, 194 | "orig_nbformat": 4 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 2 198 | } 199 | -------------------------------------------------------------------------------- /Paragraph2Action/finetune_bart_or_t5_for_paragraph2action.py: -------------------------------------------------------------------------------- 1 | # Finetuning Bart or T5 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import logging 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback 10 | from datasets import load_metric, Dataset, DatasetDict 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Configuration 15 | class CFG: 16 | # Model Configuration 17 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 18 | num_train_epochs = 50 19 | save_total_limit = 50 20 | batch_size = 8 21 | learning_rate = 1e-5 22 | max_input_length = 512 23 | max_target_length = 512 24 | weight_decay = 0.01 25 | save_strategy = "epoch" 26 | evaluation_strategy = "epoch" 27 | interval_eval_epoch = 1 # the number of interval epochs to evaluate (inference) 28 | model_name = "bart-base" # "bart-base" or "t5-base" 29 | pretrained_dir = "/home/zhangwei/pretrained_models/" # Path of the pretrained model downloaded from Hugging Face 30 | saved_models_dir = f"saved_models/{model_name}/hand_annotated_train_augmented_unique_{model_name}_lr_{learning_rate}_epoch_{num_train_epochs}_bs_{batch_size}_intervel_{interval_eval_epoch}/" 31 | output_dir = f"results/predictions/{saved_models_dir}" 32 | 33 | # Data Configuration 34 | train_file = "data/processed_data/hand_annotated_train_augmented_unique.csv" # "hand_annotated_train.csv" or "hand_annotated_train_augmented_unique.csv" 35 | test_file = "data/processed_data/hand_annotated_test.csv" 36 | source_text_column = "paragraphs" 37 | target_text_column = "actions" 38 | 39 | def load_data(): 40 | train_df = pd.read_csv(CFG.train_file) 41 | test_df = pd.read_csv(CFG.test_file) 42 | train_dataset = Dataset.from_dict(train_df.astype(str)) 43 | test_dataset = Dataset.from_dict(test_df.astype(str)) 
44 | datasets = DatasetDict({"train": train_dataset, "test": test_dataset}) 45 | print(datasets) 46 | return datasets 47 | 48 | def tokenize_and_encode(tokenizer, datasets): 49 | def tokenize_function(examples): 50 | model_inputs = tokenizer(examples[CFG.source_text_column], max_length=CFG.max_input_length, truncation=True) 51 | model_labels = tokenizer(examples[CFG.target_text_column], max_length=CFG.max_target_length, truncation=True) 52 | model_inputs["labels"] = model_labels["input_ids"] 53 | return model_inputs 54 | return datasets.map(tokenize_function, batched=True) 55 | 56 | def logging_config(): 57 | logging.info("Configuration Details:") 58 | for attr in dir(CFG): 59 | # Filter out private attributes and methods 60 | if not attr.startswith("__") and not callable(getattr(CFG, attr)): 61 | logging.info(f"{attr}: {getattr(CFG, attr)}") 62 | 63 | # Custom Callback 64 | class CustomCallback(TrainerCallback): 65 | def __init__(self, trainer) -> None: 66 | super().__init__() 67 | self._trainer = trainer 68 | 69 | def on_log(self, args, state, control, logs=None, **kwargs): 70 | if 'loss' in logs: 71 | training_loss = logs['loss'] 72 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 73 | 74 | if 'eval_loss' in state.log_history[-1]: 75 | eval_loss = state.log_history[-1]['eval_loss'] 76 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 77 | 78 | def on_epoch_end(self, args, state, control, **kwargs): 79 | logging.info("Saving inference results for test_set...") 80 | output = self._trainer.predict(self._trainer.eval_dataset) 81 | epoch = int(state.epoch) 82 | 83 | if epoch % CFG.interval_eval_epoch == 0 : 84 | # Decode generated summaries into text 85 | decoded_ids = output.predictions 86 | 87 | # Replace -100 in the labels as we can't decode them 88 | decoded_ids = np.where(decoded_ids != -100, decoded_ids, tokenizer.pad_token_id) 89 | decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=True) 90 | paragraphs = [i[CFG.source_text_column] for i in self._trainer.eval_dataset] 91 | ground_truth = [i[CFG.target_text_column] for i in self._trainer.eval_dataset] 92 | prediction = [decoded_text for decoded_text in decoded_texts] 93 | 94 | # Save predictions to csv 95 | predicted_df = pd.DataFrame() 96 | predicted_df['Paragraph'] = paragraphs 97 | predicted_df['Generated Text'] = prediction 98 | predicted_df['Actual Text'] = ground_truth 99 | predicted_df.to_csv(f"{CFG.output_dir}/epoch_{epoch}.csv", index = None) 100 | 101 | def main(): 102 | # mkdir needed folders 103 | if not os.path.exists(CFG.saved_models_dir): 104 | os.makedirs(CFG.saved_models_dir) 105 | if not os.path.exists(CFG.output_dir): 106 | os.makedirs(CFG.output_dir) 107 | 108 | # Setup logging 109 | logging.basicConfig(filename = CFG.saved_models_dir+'/training.log', level = logging.INFO) 110 | logging_config() 111 | 112 | # Loading Tokenizer and Model 113 | print("Loading Tokenizer and Model ...") 114 | logging.info(f"[Device]: {CFG.device}...") 115 | logging.info(f"[Model]: Loading {CFG.model_name}...") 116 | global tokenizer 117 | tokenizer = AutoTokenizer.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}") 118 | model = AutoModelForSeq2SeqLM.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}").to(CFG.device) 119 | 120 | # Loading Data 121 | print("Loading Data ...") 122 | datasets = load_data() 123 | 124 | # Preparing Data 125 | print("Preparing Data ...") 126 | 
logging.info(f"[Dataset]:\n{datasets}") 127 | tokenized_datasets = tokenize_and_encode(tokenizer, datasets) 128 | data_collator = DataCollatorForSeq2Seq(tokenizer, model = model) 129 | 130 | # Training Arguments 131 | args = Seq2SeqTrainingArguments( 132 | output_dir = CFG.saved_models_dir, 133 | logging_dir = CFG.saved_models_dir + "logs/", 134 | evaluation_strategy = CFG.evaluation_strategy, 135 | learning_rate = CFG.learning_rate, 136 | per_device_train_batch_size = CFG.batch_size, 137 | per_device_eval_batch_size = CFG.batch_size, 138 | weight_decay = CFG.weight_decay, 139 | generation_max_length = CFG.max_target_length, 140 | save_strategy = CFG.save_strategy, 141 | num_train_epochs = CFG.num_train_epochs, 142 | save_total_limit = CFG.save_total_limit, 143 | predict_with_generate = True, 144 | logging_steps = len(tokenized_datasets["train"]) // CFG.batch_size, 145 | push_to_hub = False, 146 | report_to = "tensorboard") 147 | 148 | # Trainer 149 | trainer = Seq2SeqTrainer( 150 | model, 151 | args, 152 | train_dataset = tokenized_datasets["train"], 153 | eval_dataset = tokenized_datasets["test"], 154 | data_collator = data_collator, 155 | tokenizer = tokenizer) 156 | 157 | # Training and logging 158 | print("Training ...") 159 | trainer.add_callback(CustomCallback(trainer)) 160 | trainer.train() 161 | 162 | if __name__ == "__main__": 163 | main() -------------------------------------------------------------------------------- /Paragraph2Action/finetune_llms_full_for_paragraph2action.py: -------------------------------------------------------------------------------- 1 | import os 2 | # Setup environment (full fine-tuning 7b needs 160GB Memory) 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" 4 | 5 | import torch 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_df = pd.read_csv("data/processed_data/hand_annotated_train_augmented_unique.csv") # "hand_annotated_train.csv" or "hand_annotated_train_augmented_unique.csv" 16 | test_df = pd.read_csv("data/processed_data/hand_annotated_test.csv") 17 | source_text = "paragraphs" 18 | target_text = "actions" 19 | instruction = f'{source_text}2{target_text}: ' 20 | 21 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 22 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 23 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 24 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 25 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 26 | print(dataset) 27 | 28 | # TrainingArguments parameters 29 | num_train_epochs = 20 30 | save_steps = 0 # Save checkpoint every X updates steps 31 | logging_steps = 25 # Log every X updates steps 32 | 33 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 34 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 35 | 36 | per_device_train_batch_size = 4 # Batch size per GPU for training 37 | per_device_eval_batch_size = 4 # Batch size per GPU for evaluation 38 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 39 | gradient_checkpointing = True # Enable gradient checkpointing 40 | 41 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 42 | learning_rate = 1e-6 # Initial learning rate (AdamW optimizer, 1e-5 or 5e-6 or 1e-4) 43 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 44 | 45 | optim = "paged_adamw_32bit" # Optimizer to use 46 | lr_scheduler_type = "cosine" # Learning rate schedule 47 | 48 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 49 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 50 | 51 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 52 | 53 | # SFT parameters 54 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 55 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 56 | device_map = "auto" # Load the entire model on the GPU 0, or "auto" 57 | 58 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
59 | model_name = "/home/zhangwei/pretrained_models/Mistral-7B-Instruct-v0.2" # Path of the pretrained model downloaded from Hugging Face 60 | new_model_dir = f"saved_models/Mistral-7B-Instruct-v0.2/train_{len(train_df)}_lr{learning_rate}_bs{per_device_train_batch_size}" # Fine-tuned model name 61 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 62 | 63 | # Load base model 64 | model = AutoModelForCausalLM.from_pretrained( 65 | pretrained_model_name_or_path = model_name, 66 | torch_dtype = torch.bfloat16, 67 | device_map = device_map 68 | ) 69 | 70 | # Load tokenizer 71 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 72 | tokenizer.pad_token = tokenizer.eos_token 73 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 74 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 75 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 76 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 77 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 78 | print("------------vocab_size is------------", tokenizer.vocab_size) 79 | print("------------vocab_size is------------", len(tokenizer)) 80 | 81 | # Set training parameters 82 | training_arguments = TrainingArguments( 83 | output_dir = output_dir, 84 | logging_dir = output_dir + "/logs/", 85 | evaluation_strategy = "epoch", 86 | save_strategy = "epoch", 87 | num_train_epochs = num_train_epochs, 88 | save_total_limit = num_train_epochs, 89 | per_device_train_batch_size = per_device_train_batch_size, 90 | gradient_accumulation_steps = gradient_accumulation_steps, 91 | # optim = optim, 92 | #save_steps=save_steps, 93 | logging_steps = logging_steps, 94 | learning_rate = learning_rate, 95 | weight_decay = weight_decay, 96 | fp16 = fp16, 97 | bf16 = bf16, 98 | max_grad_norm = max_grad_norm, 99 | max_steps = max_steps, 100 | warmup_ratio = warmup_ratio, 101 | group_by_length = group_by_length, 102 | lr_scheduler_type = lr_scheduler_type, 103 | report_to = "tensorboard" 104 | ) 105 | 106 | # Set supervised fine-tuning parameters 107 | trainer = SFTTrainer( 108 | model = model, 109 | train_dataset = dataset['train'], 110 | eval_dataset = dataset["test"], 111 | dataset_text_field = "text", 112 | max_seq_length = max_seq_length, 113 | tokenizer = tokenizer, 114 | args = training_arguments, 115 | packing = packing, 116 | ) 117 | 118 | class SaveBestModelCallback(TrainerCallback): 119 | def __init__(self): 120 | super().__init__() 121 | self.best_eval_loss = float('inf') 122 | self.best_model_checkpoint = None 123 | 124 | def on_log(self, args, state, control, logs=None, **kwargs): 125 | # Check if training_loss is in the logs and print it 126 | if 'loss' in logs: 127 | training_loss = logs['loss'] 128 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 129 | 130 | def on_evaluate(self, args, state, control, **kwargs): 131 | # Check if eval_loss is in the logs 132 | if 'eval_loss' in state.log_history[-1]: 133 | eval_loss = state.log_history[-1]['eval_loss'] 134 | # Print current eval_loss with epoch and step 135 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 136 | 137 | if eval_loss < self.best_eval_loss: 138 | self.best_eval_loss = eval_loss 139 | # Save the best model 140 | self.best_model_checkpoint = state.global_step 141 | 
trainer.save_model(f"{args.output_dir}/best_model") 142 | # Print loss of Best Model 143 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 144 | 145 | # Create an instance of the callback 146 | save_best_model_callback = SaveBestModelCallback() 147 | 148 | # Training and logging 149 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 150 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 151 | logging.info(f"""[Model]: Loading {model_name}...\n""") 152 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 153 | 154 | # Add the callback to the trainer 155 | trainer.add_callback(save_best_model_callback) 156 | 157 | # Train model 158 | trainer.train() 159 | 160 | # Save trained model 161 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2Action/finetune_llms_peft_for_paragraph2action.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 3 | import torch 4 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 5 | 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_df = pd.read_csv("data/processed_data/hand_annotated_train_augmented_unique.csv") # "hand_annotated_train.csv" or "hand_annotated_train_augmented_unique.csv" 16 | test_df = pd.read_csv("data/processed_data/hand_annotated_test.csv") 17 | source_text = "paragraphs" 18 | target_text = "actions" 19 | instruction = f'{source_text}2{target_text}: ' 20 | 21 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 22 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 23 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 24 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 25 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 26 | print(dataset) 27 | 28 | ################################################################################ 29 | # Parameters Setting 30 | ################################################################################ 31 | # QLoRA parameters 32 | lora_r = 64 # LoRA attention dimension (8, 16, 64, larger is better) 33 | lora_alpha = 128 # Alpha parameter for LoRA scaling (lora_r*2) 34 | lora_dropout = 0.1 # Dropout probability for LoRA layers 35 | 36 | # bitsandbytes parameters 37 | use_4bit = True # Activate 4-bit precision base model loading 38 | bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models 39 | bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4) 40 | use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization) 41 | 42 | # TrainingArguments parameters 43 | num_train_epochs = 20 44 | save_steps = 0 # Save checkpoint every X updates steps 45 | logging_steps = 25 # Log every X updates steps 46 | 47 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 48 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 49 | 50 | per_device_train_batch_size = 2 # Batch size per GPU for training 51 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 52 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 53 | gradient_checkpointing = True # Enable gradient checkpointing 54 | 55 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 56 | learning_rate = 1e-4 # Initial learning rate (AdamW optimizer) 57 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 58 | 59 | optim = "paged_adamw_32bit" # Optimizer to use 60 | lr_scheduler_type = "cosine" # Learning rate schedule 61 | 62 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 63 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 64 | 65 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 66 | 67 | # SFT parameters 68 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 69 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 70 | device_map = {"": 0} # Load the entire model on the GPU 0, or "auto" 71 | 72 | # Model Version 73 | model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf" # Path of the pretrained model downloaded from Hugging Face (llama-2-13b-chat-hf or Mistral-7B-Instruct-v0.2) 74 | new_model_dir = f"saved_models/llama2_13b_chat_qlora/train_{len(train_df)}_lora_r{lora_r}_lr{learning_rate}" # Fine-tuned model name 75 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 76 | 77 | ################################################################################ 78 | # Train 79 | ################################################################################ 80 | # Load tokenizer and model with QLoRA configuration 81 | compute_dtype = getattr(torch, bnb_4bit_compute_dtype) 82 | bnb_config = BitsAndBytesConfig( 83 | load_in_4bit=use_4bit, 84 | bnb_4bit_quant_type=bnb_4bit_quant_type, 85 | bnb_4bit_compute_dtype=compute_dtype, 86 | 
bnb_4bit_use_double_quant=use_nested_quant, 87 | ) 88 | 89 | # Check GPU compatibility with bfloat16 90 | if compute_dtype == torch.float16 and use_4bit: 91 | major, _ = torch.cuda.get_device_capability() 92 | if major >= 8: 93 | print("=" * 80) 94 | print("Your GPU supports bfloat16: accelerate training with bf16=True") 95 | print("=" * 80) 96 | 97 | # Load base model 98 | model = AutoModelForCausalLM.from_pretrained( 99 | pretrained_model_name_or_path = model_name, 100 | quantization_config = bnb_config, 101 | device_map = device_map 102 | ) 103 | model.config.use_cache = False 104 | model.config.pretraining_tp = 1 105 | 106 | # Load LLaMA tokenizer 107 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 108 | tokenizer.pad_token = tokenizer.eos_token 109 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 110 | 111 | # Load LoRA configuration 112 | peft_config = LoraConfig( 113 | lora_alpha=lora_alpha, 114 | lora_dropout=lora_dropout, 115 | r=lora_r, 116 | bias="none", 117 | task_type="CAUSAL_LM", 118 | ) 119 | 120 | # Set training parameters 121 | training_arguments = TrainingArguments( 122 | output_dir=output_dir, 123 | logging_dir = output_dir + "/logs/", 124 | evaluation_strategy = "epoch", 125 | save_strategy = "epoch", 126 | num_train_epochs = num_train_epochs, 127 | save_total_limit = num_train_epochs, 128 | per_device_train_batch_size = per_device_train_batch_size, 129 | gradient_accumulation_steps = gradient_accumulation_steps, 130 | optim = optim, 131 | #save_steps=save_steps, 132 | logging_steps = logging_steps, 133 | learning_rate = learning_rate, 134 | weight_decay = weight_decay, 135 | fp16 = fp16, 136 | bf16 = bf16, 137 | max_grad_norm = max_grad_norm, 138 | max_steps = max_steps, 139 | warmup_ratio = warmup_ratio, 140 | group_by_length = group_by_length, 141 | lr_scheduler_type = lr_scheduler_type, 142 | report_to = "tensorboard" 143 | ) 144 | 145 | # Set supervised fine-tuning parameters 146 | trainer = SFTTrainer( 147 | model = model, 148 | train_dataset = dataset['train'], 149 | eval_dataset = dataset["test"], 150 | peft_config = peft_config, 151 | dataset_text_field ="text", 152 | max_seq_length = max_seq_length, 153 | tokenizer = tokenizer, 154 | args = training_arguments, 155 | packing = packing, 156 | ) 157 | 158 | class SaveBestModelCallback(TrainerCallback): 159 | def __init__(self): 160 | super().__init__() 161 | self.best_eval_loss = float('inf') 162 | self.best_model_checkpoint = None 163 | 164 | def on_log(self, args, state, control, logs=None, **kwargs): 165 | # Check if training_loss is in the logs and print it 166 | if 'loss' in logs: 167 | training_loss = logs['loss'] 168 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 169 | 170 | def on_evaluate(self, args, state, control, **kwargs): 171 | # Check if eval_loss is in the logs 172 | if 'eval_loss' in state.log_history[-1]: 173 | eval_loss = state.log_history[-1]['eval_loss'] 174 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") # Print current eval_loss with epoch and step 175 | 176 | if eval_loss < self.best_eval_loss: 177 | self.best_eval_loss = eval_loss 178 | # Save the best model 179 | self.best_model_checkpoint = state.global_step 180 | trainer.save_model(f"{args.output_dir}/best_model") 181 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") # Print loss of Best Model 182 | 183 | # 
Create an instance of the callback 184 | save_best_model_callback = SaveBestModelCallback() 185 | 186 | # Training and logging 187 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 188 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 189 | logging.info(f"""[Model]: Loading {model_name}...\n""") 190 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 191 | 192 | # Add the callback to the trainer 193 | trainer.add_callback(save_best_model_callback) 194 | 195 | # Train model 196 | trainer.train() 197 | 198 | # Save trained model 199 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2Action/results/README.md: -------------------------------------------------------------------------------- 1 | Save the generated outputs by different models. -------------------------------------------------------------------------------- /Paragraph2Comound/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Data for Paragraph2Comound 2 | 3 | All Data are in ```data```. 4 | 5 | For training data, we randomly sample 3 times from 10000 to 1000, 100, 10: 6 | - ```/data/train/trial_1``` 7 | - ```/data/train/trial_2``` 8 | - ```/data/train/trial_3``` 9 | 10 | ## 2. Methods for Paragraph2Comound 11 | 12 | ### Prompt Engineering ChatGPT (GPT-4, GPT-3.5-Turbo) 13 | 14 | See in ```prompt_chatgpt_for_paragraph2compound.ipynb``` 15 | 16 | ### Fine-tuning ChatGPT (GPT-3.5-Turbo) 17 | 18 | See in ```finetune_chatgpt_for_paragraph2compound.ipynb``` 19 | 20 | ### Full Parameter Fine-tuning Open-source Large Language Models (Mistral, Llama3, Llama2) 21 | 22 | Training Code in ```finetune_llms_full_for_paragraph2compound.py``` 23 | 24 | Inferencing Code in ```vllm_inference_full_finetuned_llms.ipynb``` 25 | 26 | ### Parameter Efficient Fine-tuning (PEFT) Open-source Large Language Models (Mistral, Llama3, Llama2) 27 | 28 | Training Code in ```finetune_llms_peft_for_paragraph2compound.py``` 29 | 30 | Inferencing Code in ```vllm_inference_peft_finetuned_llms.ipynb``` 31 | 32 | ### Fine-tuning Language Models (T5, Bart) 33 | 34 | See in ```finetune_bart_or_t5_for_paragraph2compound.py``` 35 | 36 | ## 3. Evaluating the results of Paragraph2Comound 37 | 38 | All predictions will be saved in ```results/predictions``` 39 | 40 | Evaluating codes are in ```evaluate_for_paragraph2compound.ipynb``` -------------------------------------------------------------------------------- /Paragraph2Comound/data/train/trial_1/train_10.csv: -------------------------------------------------------------------------------- 1 | documentId,Paragraph,Compound 2 | US20140128390A1,"Into a scintillation vial were placed 6-(4-fluorophenyl)-4-[(3R)-3-methylpiperazin-1-yl]-7H-pyrrolo[2,3-d]pyrimidine (5) (80 mg, 0.26 mmol), 1-[(1S)-1-isocyanatoethyl]-3-methoxy-benzene (70 mg, 0.4 mmol) and DMF (3 mL). N,N-diisopropylethylamine (0.1 ml, 0.58 mmol) was added and the reaction was stirred at room temperature for 5 hours. The mixture was placed on silica and purified with silica gel chromatography eluting with a gradient of ethyl acetate:hexanes (40-100%). 1HNMR and MS were consistent with the structure of the desired product. 
MS (ESI) [M+H+]+=489.55 [M−H]−=487.1.","6-(4-fluorophenyl)-4-[(3R)-3-methylpiperazin-1-yl]-7H-pyrrolo[2,3-d]pyrimidine | 1-[(1S)-1-isocyanatoethyl]-3-methoxy-benzene | DMF | N,N-diisopropylethylamine | silica gel | ethyl acetate" 3 | US20060211697A1,"To a suspension of 2-(4-bromo-2-methoxy-phenoxy)-phenol (2.500 g, 7.50 mmol) and Cs2CO3 (3.665 g, 11.25 mmol) in acetonitrile was added dropwise methyl iodide (0.93 mL, 15.00 mmol) via syringe. After being stirred at room temperature for 2 days, the mixture was poured into water and extracted with DCM (30 mL×2). The combined layer was washed with water, dried over sodium sulfate, filtered and evaporated. The residue was purified by flash chromatography on silica column eluted with 15% of ethyl acetate (AcOEt) in hexane. The final product (1.925 g, 83%) is a white solid. M.P.: 58-60° C.",2-(4-bromo-2-methoxy-phenoxy)-phenol | Cs2CO3 | acetonitrile | methyl iodide | water | DCM | water | sodium sulfate | silica | ethyl acetate | hexane 4 | US06613804B2,"N-(4-bromo-3-methyl-5-isoxazolyl)-2-(3,4,5-trimethoxybenzyl)-benzo[b]thiophene-3-sulfonamide was prepared in the same manner as described in Example 41. Reaction of 4-bromo-3-methyl-5-aminoisoxazole (0.55 mmoles, 97 mg), NaH (1.4 mmoles, 55 mg), and 2-(3,4,5-trimethoxylbenzyl)-benzo[b]thiophene-3-sulfonyl chloride (0.66 mmoles, 0.27 g) in THF (2 ml) yielded, after flash chromatography using 50% ethyl acetate/hexanes and recrystallization from chloroform and hexanes, 94 mg of a tan solid, m.p. 154-156° C.","N-(4-bromo-3-methyl-5-isoxazolyl)-2-(3,4,5-trimethoxybenzyl)-benzo[b]thiophene-3-sulfonamide | 4-bromo-3-methyl-5-aminoisoxazole | NaH | 2-(3,4,5-trimethoxylbenzyl)-benzo[b]thiophene-3-sulfonyl chloride | THF | ethyl acetate/hexanes | hexanes | chloroform | tan solid" 5 | US20110136764A1,"A mixture of 3-amino-6-bromopyridine-2-carboxylic acid ethyl ester-(Compound 232G, 0.31 g, 1.3 mmol), 1,4-dioxaspiro[4,5]dec-7-ene-8-boronic acid pinacol ester (0.406 g, 1.52 mmol) and [1,1′-Bis(diphenylphosphino)ferrocene]dichloropalladium(11), complex with dichloromethane (1:1) (0.10 g, 0.12 mmol) in a 35 mL sealable microwave tube was taken up in 1,4-Dioxane (4.9 mL). This mixture was treated with a solution of Potassium carbonate (0.52 g, 3.8 mmol) in H2O (1 mL), flushed with nitrogen, sealed and irradiated in a microwave reactor for 30 minutes at 100° C. The mixture was poured into water and extracted twice with EtOAc. The combined organic layers were washed with brine, dried over Na2SO4 filtered and concentrated to a brown residue. This was taken up in DCM, treated with silica and concentrated. The resulting silica plug was loaded on to a sample cartridge and purified on an ISCO Combiflash system (0-50% EtOAc/Heptane) to isolate the desired product as 247 mg of an oil which crystallized upon standing. 1H NMR (400 MHz, DMSO-d6) d 1.30 (t, J=7.20 Hz, 3H), 1.77 (t, J=6.44 Hz, 2H), 2.32-2.40 (m, 2H), 2.60 (td, J=6.44, 1.52 Hz, 2H), 3.91 (s, 4H), 4.28 (q, J=7.07 Hz, 2H), 6.25 (t, J=4.04 Hz, 1H), 6.65 (s, 2H), 7.17 (d, J=8.84 Hz, 1H), 7.52 (d, J=8.84 Hz, 1H). MS(ES+): m/z=305.12 (100) [M+1]. 
HPLC: tR=3.50 min (ZQ3, polar—5 min).","3-amino-6-bromopyridine-2-carboxylic acid ethyl ester | Compound 232G | 1,4-dioxaspiro[4,5]dec-7-ene-8-boronic acid pinacol ester | [1,1′-Bis(diphenylphosphino)ferrocene]dichloropalladium(11) | dichloromethane | 1,4-Dioxane | Potassium carbonate | H2O | nitrogen | water | EtOAc | brine | Na2SO4 | silica | EtOAc/Heptane | desired product | oil" 6 | US20160222004A1,"To a stirred, room temperature solution of 5-iodo-4′-methyl-biphenyl-3-carboxylic acid (2-methoxy-1-methyl-ethyl)-amide (1.0 g, 2.4 mmol)) in 3 ml anhydrous DMF were added LiCl (520 mg, 5 eq), Pd2(dba)3 (18.34 mg, 1.3% eq), DIPEA (0.8545 ml, 2 eq) and acetic anhydride (1.1636 ml, 5 eq). The reaction mixture was heated by microwave irridiation to 150° C. for 1 hour, then cooled and diluted with EtOAc. The combined organic layers were washed with water, brine, dried (Na2SO4), filtered, and concentrated in vacuo. The residue was purified by flash column chromatography on silica gel with hexanes ethyl acetate 8:1 to 2:1), giving 5-acetyl-4′-methyl-biphenyl-3-carboxylic acid (2-methoxy-1-methyl-ethyl)-amide (538 mg, 75%). MS (M+H)=326.",5-iodo-4′-methyl-biphenyl-3-carboxylic acid (2-methoxy-1-methyl-ethyl)-amide | DMF | LiCl | Pd2(dba)3 | DIPEA | acetic anhydride | EtOAc | water | brine | Na2SO4 | silica gel | hexanes ethyl acetate | 5-acetyl-4′-methyl-biphenyl-3-carboxylic acid (2-methoxy-1-methyl-ethyl)-amide 7 | US08610345B2,"A mixed solution of 10 g of 2-bromonitrobenzene, 8.2 g of 4-methylphenylboronic acid, 25.4 g of tripotassium phosphate, 3.9 g of tetrabutylammonium bromide, 270 mg of palladium acetate and 150 ml of dimethylformamide was heated and stirred under a nitrogen gas stream at 130° C. for 3 hours. The solution was cooled to room temperature and 100 ml of water was poured into the solution, followed by extraction with 150 ml of ethyl acetate. The organic layer was washed twice with 100 ml of water, dried over magnesium sulfate and then evaporated. The concentrate was purified by silica gel column chromatography and then vacuum-dried to obtain 8.6 g of 2-(4-methylphenyl)nitrobenzene.",2-bromonitrobenzene | 4-methylphenylboronic acid | tripotassium phosphate | tetrabutylammonium bromide | palladium acetate | dimethylformamide | nitrogen | water | ethyl acetate | water | magnesium sulfate | silica gel | 2-(4-methylphenyl)nitrobenzene 8 | US20100087424A1,"To a solution of 7-fluoro-2-methoxy-8-quinolinol (0.23 g, 1.2 mmol) in THF (10 ml) was added NBS (0.23 g, 1.3 mmol) and the solution stirred for 1.5 h, then concentrated. Chromatography over silica (20 g SPE, eluting with 1:1 dichloromethane/hexane) provided the title compound as an off-white solid (0.28 g, 86%).",7-fluoro-2-methoxy-8-quinolinol | THF | NBS | silica | title compound 9 | US20150329524A1,"To a 500 mL round bottom flask containing the crude (S)-4-(4-bromo-2-fluorophenyl)-1-methyl-3-((1-propionylpyrrolidin-3-yl)methyl)-1H-1,2,4-triazol-5(4H)-one (2.95 g), was added bis(pinacolato)diboron (2.00 g, 7.88 mmol), potassium acetate (3.00 g, 30.6 mmol), PdCl2(dppf)-CH2Cl2 adduct (0.50 g, 0.61 mmol), and 1,4-dioxane (30 mL). A condenser was attached and the solution was stirred at 100° C. for 1 h. Analysis of an aliquot by LCMS indicated the starting material had been consumed and the desired boronic acid intermediate was present. The reaction mixture was cooled to room temperature, and the flask was charged with 7-bromo-3-methylquinoline (1.6 g, 7.2 mmol), and 2M aq potassium carbonate (15.00 mL). 
The reaction mixture was stirred at 100° C. for 1 h and then was cooled to room temperature. The solution was adjusted to pH=7 with the addition of 1N aq HCl. The desired product was extracted with ethyl acetate (2×200 mL). The combined organic layers were dried over sodium sulfate and approximately 1 g of Silicycle Si-thiol resin, filtered, and concentrated in vacuo. Purification of the residue by silica gel chromatography (0-10% methanol:dichloromethane) and then by reverse phase HPLC (35% acetonitrile/65% 0.3M aq ammonium formate) afforded the title compound as a white solid (2.3 g, 64% yield over two steps). MS(ES)+ m/e 474.4 [M+H]+.","(S)-4-(4-bromo-2-fluorophenyl)-1-methyl-3-((1-propionylpyrrolidin-3-yl)methyl)-1H-1,2,4-triazol-5(4H)-one | bis(pinacolato)diboron | potassium acetate | PdCl2(dppf)-CH2Cl2 | 1,4-dioxane | 7-bromo-3-methylquinoline | potassium carbonate | HCl | desired product | ethyl acetate | sodium sulfate | Si-thiol | silica gel | methanol | dichloromethane | acetonitrile | title compound" 10 | US09409893B2,"The free base of 2-(aminomethyl)-4-(tert-butyl)-6-(6-(trifluoromethyl)pyridin-3-yl)phenol hydrochloride (Example 64), was prepared by partitioning the hydrochloride salt (560 mg, 1.55 mmol) between ethyl acetate and a saturated aqueous sodium bicarbonate solution. The organic phase was separated, dried (Na2SO4), filtered and the solvent removed under reduced pressure. The residue was redissolved in a solution of tert-butyl (2-oxopropyl)carbamate in ethanol (5 mL) and stirred for three days. Sodium borohydride (290 mg, 7.75 mmol) was added portionwise and the reaction mixture stirred at room temperature for one hour. The reaction was quenched with saturated aqueous ammonium chloride solution (1 mL) and the reaction mixture partitioned between ethyl acetate and a saturated aqueous sodium bicarbonate solution. The organic phase was separated, dried (Na2SO4), filtered and the solvent was removed under reduced pressure. The residue was purified using flash chromatography on silica eluting with a solvent gradient of 0-100% ethyl acetate in hexanes to give tert-butyl (2-((5-(tert-butyl)-2-hydroxy-3-(6-(trifluoromethyl)pyridin-3-yl)benzyl)amino)propyl)carbamate (500 mg, 67%) as a white solid. The carbamate (500 mg, 1.04 mmol) was dissolved in methylene chloride (20 mL) and trifluoroacetic acid (10 mL) added and the reaction was stirred for 16 h. The solvent was removed under reduced pressure, the residue azeotroped twice with toluene (20 mL) and then redissolved in a 1:1 mixture of methylene chloride and trifluoracetic acid (40 mL) and stirred at room temperature for four hours. The solvent was removed under reduced pressure, the residue azeotroped twice with toluene (20 mL) and the residue partitioned between ethyl acetate and saturated aqueous sodium bicarbonate solution. The organic phase was separated, dried (Na2SO4), filtered and the solvent removed under pressure. The residue was purified using flash chromatography on silica eluting with a solvent gradient of 0 to 100% of (89% methylene chloride, 10% methanol and 1% ammonium hydroxide) in methylene chloride. 
The purified residue was dissolved in a 1.25 M hydrochloric acid solution in ethanol, the solution stood for 10 minutes, and then the solvent removed under reduced pressure and the residue triturated with diethyl ether to give 2-(((1-aminopropan-2-yl)amino)methyl)-4-(tert-butyl)-6-(6-(trifluoromethyl)pyridin-3-yl)phenol dihydrochloride (150 mg, 32% yield) as a white solid.",2-(aminomethyl)-4-(tert-butyl)-6-(6-(trifluoromethyl)pyridin-3-yl)phenol hydrochloride | hydrochloride salt | ethyl acetate | sodium bicarbonate | Na2SO4 | tert-butyl (2-oxopropyl)carbamate | ethanol | Sodium borohydride | ammonium chloride | ethyl acetate | Na2SO4 | tert-butyl (2-((5-(tert-butyl)-2-hydroxy-3-(6-(trifluoromethyl)pyridin-3-yl)benzyl)amino)propyl)carbamate 11 | US08987473B2,"To an aqueous solution (2 mL) of zinc powder (1.2 g) was added mercury chloride (124 mg) at room temperature, and then the reaction mixture was stirred at room temperature for 30 minutes. To the reaction mixture were added the compound [59] obtained in Example 59 (73 mg), toluene (1 mL), water (1 mL) and concentrated hydrochloric acid (1 mL) at room temperature, and then the reaction mixture was heated at reflux for 4 hours. After cooling to room temperature, the reaction mixture was quenched with water, and extracted with chloroform. The obtained organic layer was dried over anhydrous sodium sulfate, filtered, and the filtrate was concentrated under reduced pressure. The obtained residue was purified by silica gel column chromatography to give the titled compound (35 mg) as a yellow solid.",zinc | mercury chloride | toluene | water | hydrochloric acid | water | chloroform | sodium sulfate | silica gel | titled compound 12 | -------------------------------------------------------------------------------- /Paragraph2Comound/data/train/trial_2/train_10.csv: -------------------------------------------------------------------------------- 1 | documentId,Paragraph,Compound 2 | US20150005279A1,"To a solution of 5-bromopyridin-2-ol (4 g, 23 mmol) in THF (200 mL) at about 0° C. was added NaH (0.83 g, 34.7 mmol) in portions. The reaction mixture was stirred at rt for about 15 min followed by addition of iodomethane (9.8 g, 69 mmol). The mixture was stirred overnight at rt. After the completion of the reaction (TLC monitoring), the reaction mixture was cooled to about 0° C., water was added, extracted with EtOAc (100 mL×2). The organic layer was washed with brine, dried with anhydrous Na2SO4, filtered and concentrated under reduced pressure to provide 5-bromo-1-methylpyridin-2-(1H)-one (3 g, 69%): 1H NMR (MeOD) δ 7.87 (s, 1H), 7.58-7.55 (m, 1H), 6.47 (d, J=9.6 Hz, 1H), 3.53 (s, 3H).",5-bromopyridin-2-ol | THF | NaH | iodomethane | water | EtOAc | brine | Na2SO4 | 5-bromo-1-methylpyridin-2-(1H)-one 3 | US20090286790A1,"A solution of 5-nitro-3-(trifluoromethyl)benzoic acid (2.35 g, 10 mmol) and H2SO4 (53 μL, 1 mmol) in methanol (30 mL) is allowed to warm to reflux and stir for 20 hours. The mixture is cooled to room temperature, then the mixture is concentrated under reduced pressure. The obtained residue is neutralized with saturated aq. NaHCO3. The mixture is extracted with CH2Cl2. 
The combined organic layer is washed with brine, dried over Na2SO4, filtrated, and concentrated under reduced pressure to give 3-nitro-5-trifluoromethyl-benzoic acid methyl ester (2.15 g, 86%), which is used in the next step without further purification.",5-nitro-3-(trifluoromethyl)benzoic acid | H2SO4 | methanol | CH2Cl2 | brine | Na2SO4 | 3-nitro-5-trifluoromethyl-benzoic acid methyl ester 4 | US20120295903A1,"To a solution of crude 10-(2-(benzylamino)ethyl)-7,8-dimethylbenzo[g]-pteridine-2,4(3H,10H)-dione (7.53 mmol) in MeOH (200 mL) is added di-tert-butyl dicarbonate (5.2 g, 23.8 mmol) and Et3N (4 mL). The reaction was concentrated under reduced pressure and purified via silica gel chromatography (ISCO) (100% DCM to 10% MeOH/DCM) over 1 h to obtain desired product (1.85 g, 54%) as a brown solid.","10-(2-(benzylamino)ethyl)-7,8-dimethylbenzo[g]-pteridine-2,4(3H,10H)-dione | MeOH | di-tert-butyl dicarbonate | Et3N | silica gel | DCM | MeOH/DCM | desired product" 5 | US07026335B2,"To a cooled (−78° C.) solution of 4-(R)-methyl-5-(S)-phenyl-oxazolidin-2-one, 31, (600 mg, 3.39 mmol) in anhydrous tetrahydrofuran (20 mL) is added n-butyl lithium (2.5 mL, 1.6M solution in hexanes, 4.07 mmol). The resulting solution is stirred at −78° C. for ninety minutes and then 3-(4-chloro-phenyl)-propionyl chloride (894 mg, 4.41 mmol) is slowly added. The solution is warmed to room temperature for thirty minutes and then the solvents removed under reduced pressure. The crude product is purified over silica (20:80 ethyl acetate:hexanes, Rf˜0.3) to afford 1.07 g (92% yield) of the desired compound as a colorless solid. 1H NMR (CDCl3 300 MHz) δ 0.91 (d, J=6.6 Hz, 3H), 3.01 (t, J=7.8 Hz, 2H), 3.18–3.40 (m, 2H), 4.77 (m, 1H), 5.67 (d, J=7.2 Hz, 1H), 7.18–7.48 (m, 9H). MS (ESI) m/z 344 (M+H+)",4-(R)-methyl-5-(S)-phenyl-oxazolidin-2-one | tetrahydrofuran | n-butyl lithium | 3-(4-chloro-phenyl)-propionyl chloride | crude product | silica | desired compound 6 | US09149475B2,"In a 100 mL round bottom flask to a solution of 2-chloro-5-fluoro-N-(2,2,6,6,-tetramethylpiperidin-4-yl)pyrimidin-4-amine (Compound 15; 0.18 g, 0.63 mmol) in 10 mL iPrOH, 4-cyclopropyl-3-(1H-tetrazol-1-yl)benzeneamine (0.18 g, 0.88 mmol) and TosOH (0.095 g, 0.50 mmol) were added. The reaction mixture was heated at 100° C. for 3 hours. LCMS analysis indicated the complete consumption of the mono-SNAr product and appearance of Compound III-118. The reaction mixture was then cooled to room temperature and volatiles were removed under reduced pressure. The crude reaction mixture was purified by column chromatography using DCM/2 M NH3 in MeOH to get tan solid that was further purified by tituration using DCM/hexanes.","2-chloro-5-fluoro-N-(2,2,6,6,-tetramethylpiperidin-4-yl)pyrimidin-4-amine | Compound 15 | iPrOH | 4-cyclopropyl-3-(1H-tetrazol-1-yl)benzeneamine | TosOH" 7 | US20150072969A1,"Ethyl 2-(3-(4-((2′,6′-dimethyl-4′-((tetrahydrofuran-2-yl)methoxy)-[1,1′-biphenyl]-3-yl)methoxy)phenyl)oxetan-3-yl)acetate (compound of Step 1b of Example 46, 135 mg, 0.254 mM) was dissolved in a mixture of THF (4 ml) and MeOH (1 ml) and aqueous LiOH monohydrate (1018 μl, 1.526 mM) was added to the reaction mixture. The reaction mixture was stirred for 6 h and quenched with saturated NH4Cl. 
The mixture was extracted with ethyl acetate and the organic layer was washed with brine, dried over Na2SO4 and concentrated to afford the title compound 2-(3-(4-((2′,6′-dimethyl-4′-((tetrahydrofuran-2-yl)methoxy)-[1,1′-biphenyl]-3-yl)methoxy)phenyl)oxetan-3-yl)acetic acid (80 mg) as a white solid. Yield: 62.6%; 1H NMR (DMSO-d6, 300 MHz): δ 12.8 (s, 1H), 7.44-7.40 (m, 2H), 7.21-7.18 (m, 3H), 7.07 (d, J=7.2 Hz, 1H), 6.96 (d, J=8.7 Hz, 2H), 6.69 (s, 2H), 5.14 (s, 2H), 4.74 (s, 4H), 4.15-4.10 (m, 1H), 3.95-3.93 (m, 2H), 3.80-3.67 (m, 2H), 3.50-3.32 (m, 2H), 3.01 (s, 2H), 2.09-2.05 (m, s, 1.91, 7H), 1.80-1.74 (m, 1H); MS: m/z 525 (M+Na).","Ethyl 2-(3-(4-((2′,6′-dimethyl-4′-((tetrahydrofuran-2-yl)methoxy)-[1,1′-biphenyl]-3-yl)methoxy)phenyl)oxetan-3-yl)acetate | compound | THF | MeOH | LiOH monohydrate | NH4Cl | ethyl acetate | brine | Na2SO4 | title compound | 2-(3-(4-((2′,6′-dimethyl-4′-((tetrahydrofuran-2-yl)methoxy)-[1,1′-biphenyl]-3-yl)methoxy)phenyl)oxetan-3-yl)acetic acid" 8 | US06825222B2,"A solution of 2-{4-[2-(2-biphenyl-3-yl-5-methyloxazol-4-yl)ethoxy]-3-butylphenoxy}-2-methylpropionic acid ethyl ester (0.57 mmol) in ethanol (10 mL) was treated with 2.5 N aqueous NaOH (1.1 mL), and heated at 55° C. for 2 h. The reaction was cooled to ambient temperature and concentrated down to near dryness. The residue was then diluted with ethyl acetate (40 mL) and water (20 mL) and acidified to pH=1 with 1N aqueous HCl. The organic layer was washed with brine (20 mL), dried (Na2SO4) and concentrated in vacuo to give 2-{4-[2-(2-biphenyl-4-yl-5-methyloxazol-4-yl)ethoxy]-3-butylphenoxy}-2-methylpropionic acid. 1H NMR (400 MHz, CDCl3) δ 8.2 (d, J=1.5 Hz, 1H), 7.95 (d, J=7.8 Hz, 1H), 7.66-7.63 (m, 3H), 7.52-7.35 (m, 4H), 6.73 (d, J=2.9 Hz, 3H), 4.20 (t, J=6.1 Hz, 2H), 3.02 (t, J=6.1 Hz, 2H), 2.52 (t, J=7.6 Hz, 2H), 2.40 (s, 3H), 1.51-1.42 (m, 8H), 1.29 (quintet, J=7.2 Hz, 2H), 0.86 (t, J=7.1 Hz, 3H); MS (ES) m/e 514 (M+1).",2-{4-[2-(2-biphenyl-3-yl-5-methyloxazol-4-yl)ethoxy]-3-butylphenoxy}-2-methylpropionic acid ethyl ester | ethanol | NaOH | ethyl acetate | water | brine | Na2SO4 | 2-{4-[2-(2-biphenyl-4-yl-5-methyloxazol-4-yl)ethoxy]-3-butylphenoxy}-2-methylpropionic acid 9 | US06548526B2,"A mixture of (4-cyano-3-mercapto-isothiazol-5-yl)-carbamic acid phenyl ester (0.70 g, 2.5 mmol), 2,6-di-tert-butyl-4-methylphenol (BHT) (one crystal) and concentrated sulfuric acid (3 mL) was stirred for 18 hours at room temperature. The mixture was diluted with ice water, extracted 3× with ethyl acetate, and the combined organic layers were dried over Na2SO4, filtered and concentrated in vacuo. The residue was dissolved in 10 mL of ethanol at 0° C. and was treated with 0.096 g (2.5 mmol) of NaBH4. After stirring for 30 minutes, the mixture was acidified with 1 M HCl, extracted into ethyl acetate, dried over Na2SO4, filtered and concentrated in vacuo, affording 0.60 g (81%) of (4-carbamoyl-3-mercapto-isothiazol-5-yl)-carbamic acid phenyl ester as a yellow solid. 
1H NMR (400 MHz, acetone-d6) δ 13.0 (s, 1H), 11.0-10.9 (bs, 1H), 10.3 (s, 1H), 7.47 (t, 2H, J=6.8 Hz), 7.37-7.30 (m, 4H) ppm; MS (APCl, m/z): 296 [M+H]−.","(4-cyano-3-mercapto-isothiazol-5-yl)-carbamic acid phenyl ester | 2,6-di-tert-butyl-4-methylphenol | sulfuric acid | ice water | ethyl acetate | Na2SO4 | ethanol | NaBH4 | HCl | ethyl acetate | Na2SO4 | (4-carbamoyl-3-mercapto-isothiazol-5-yl)-carbamic acid phenyl ester" 10 | US09029295B2,"47% ethyl glyoxylate in toluene (polymer type) (467.84 g, 2.15 mol) was dissolved in THF (620 ml), to the solution was slowly added methylhydrazine (103 g, 1.02 eq) under ice-cooled. After the addition was complete, the mixture was stirred at 0° C. for 30 minutes and at room temperature for 16 hours. The reaction solution was concentrated under reduced pressure. The residue was dissolved in toluene (400 ml), and concentrated under reduced pressure. The resulting solid was washed with tert-butyl methyl ether (200 ml), and allowed to stand at 0° C. for 30 minutes. Then, the mixture was filtered, and washed with cooled tert-butyl methyl ether (100 ml) to give 223.91 g of Compound (XVI-1) (yield: 78.8%) as solid.",ethyl glyoxylate | toluene | THF | methylhydrazine | toluene | tert-butyl methyl ether | tert-butyl methyl ether | Compound 11 | US04419524,"24.7 g. of 2,2-dimethyl-3-(2,2-dichlorovinyl)-cyclopropane-1-carboxylic acid ethyl ester (purity: 96%, cis)trans ratio: 40:60), 6.7 g. of a 30% aqueous potassium hydroxide solution and 0.05 g. of polyoxyethylenesorbitane monooleate are stirred at 70° C. for 60 minutes. Stirring is continued with a further 6-g. portion of the potassium hydroxide solution for 60 minutes and with an additional 6-g. portion for 120 minutes. The solution is diluted with 100 ml. of water and 40 ml. fraction is distilled off to remove volatile, non-hydrolyzed substances. To the residual aqueous solution 80 cm3 of hexane is added followed by the addition of 12 g. of a 36% aqueous hydrochloric acid solution at 60° C. with stirring. The reaction mixture is then further manufactured as described in Example 1. 19.8 g. (95%) of 2,2-dimethyl-3-(2,2-dichlorovinyl)cyclopropane-1-carboxylic acid are obtained. Cis/trans ratio: 40:60.","2,2-dimethyl-3-(2,2-dichlorovinyl)-cyclopropane-1-carboxylic acid ethyl ester | potassium hydroxide | monooleate | 6-g | potassium hydroxide | 6-g | water | hexane | hydrochloric acid | 2,2-dimethyl-3-(2,2-dichlorovinyl)cyclopropane-1-carboxylic acid" 12 | -------------------------------------------------------------------------------- /Paragraph2Comound/data/train/trial_3/train_10.csv: -------------------------------------------------------------------------------- 1 | documentId,Paragraph,Compound 2 | US04370343,"To a stirred solution of 0.3 g (0.001 mole) of 2-tetradecylthioglycidic acid in 10 ml of anhydrous tetrahydrofuran (THF) at 0° C. (ice water bath) is added 1.01 g (0.001 mole) of triethylamine in a small amount of THF. The mixture is stirred at 0° C. for about 30 minutes. To the thus-formed triethylammonium 2-tetradecylthioglycidate is added 0.108 g (0.001 mole) of ethyl chloroformate in a small amount of THF and the mixture is stirred at about 0° C. (ice water bath) for about 3 hours to prepare the corresponding mixed anhydride (a ppt. of Et3NHCl is observed). A stoichiometric excess of ethylamine in THF is then added and the mixture stirred at room temperature for 16 hours. The THF solvent is concentrated to approximately 1/4 volume, water is added and the mixture extracted with ether. 
After drying the ether extract (Na2SO4), the solvent is removed in vacuo giving the desired product, N-ethyl-2-tetradecylthioglycidamide, in good yield.",2-tetradecylthioglycidic acid | tetrahydrofuran | THF | ice water | water | triethylamine | THF | triethylammonium 2-tetradecylthioglycidate | ethyl chloroformate | THF | ice water | anhydride | ethylamine | THF | ether | ether | Na2SO4 | desired product | N-ethyl-2-tetradecylthioglycidamide 3 | US07659279B2,"Glycolic acid (22 mg) and O-(7-azabenzotriazol-1-yl)-N,N,N′,N′-tetramethyluronium hexafluorophosphate (HATU, 110 mg) were added sequentially, each in one portion, to a stirred solution of N-(3-chloro-4-fluorophenyl)-7-ethoxy-5-[(2R)-pyrrolidin-2-ylmethoxy]-quinazolin-4-amine (81 mg) and N,N-diisopropylethylamine (68 μl) in DCM (2.5 ml) at room temperature. The reaction mixture was stirred for 16 hours and then concentrated in vacuo to leave a brown residue. The residue was taken up in a 7:2:1 mixture of DMSO:acetonitrile:water, filtered and then purified by reverse phase HPLC. The fractions containing the required product were combined, basified by the addition of potassium carbonate and then extracted with DCM. The organic layer was dried (MgSO4) and then concentrated in vacuo to give the title compound as a white solid (36 mg, 39%); NMR Spectrum (DMSO-d6 at 373 K) 9.80 (br s, 1H), 8.40 (s, 1H), 8.15 (m, 1H), 7.65 (m, 1H), 7.33 (m, 1H), 6.80 (m, 2H), 4.60 (m, 1H), 4.50 (m, 1H), 4.30-4.10 (m, 4H), 4.0 (m, 2H), 3.50-3.40 (m, 2H), 2.10-1.85 (m, 4H), 1.40 (t, 3H); Mass spectrum MH+ 475.","Glycolic acid | O-(7-azabenzotriazol-1-yl)-N,N,N′,N′-tetramethyluronium hexafluorophosphate | N-(3-chloro-4-fluorophenyl)-7-ethoxy-5-[(2R)-pyrrolidin-2-ylmethoxy]-quinazolin-4-amine | N,N-diisopropylethylamine | DCM | DMSO | acetonitrile | water | required product | potassium carbonate | DCM | MgSO4 | title compound" 4 | US07947728B1,"[6-(4′,5′-Difluoro-2′-methoxy-biphenyl-4-yloxymethyl)-indazol-1-yl]-acetic acid ethyl ester (0.095 g, 0.204 mmol) and lithium hydroxide hydrate (10 mg, 0.245 mmol) in 5 mL THF/1 mL H2O was stirred at RT for 6 hrs. The reaction was distributed between EtOAc and H2O. The water layer was made acidic to pH 3 with 1N HCl and the organic layer was separated and concentrated in vacuo. The crude product was purified by trituration with Et2O to yield [6-(4′,5′-difluoro-2′-methoxy-biphenyl-4-yloxymethyl)-indazol-1-yl]-acetic acid as a white solid. LC-MS (ES) calculated for C23H18F2N2O4, 424.41; found m/z 425 [M+H]+.","[6-(4′,5′-Difluoro-2′-methoxy-biphenyl-4-yloxymethyl)-indazol-1-yl]-acetic acid ethyl ester | lithium hydroxide hydrate | THF | H2O | EtOAc | water | HCl | crude product | Et2O | [6-(4′,5′-difluoro-2′-methoxy-biphenyl-4-yloxymethyl)-indazol-1-yl]-acetic acid" 5 | US20090275567A1,"4-(2-Methyl-3-propan-2-yl-imidazol-4-yl)-N-(4-piperidinyl)pyrimidin-2-amine (Example 2, 70 mg, 0.23 mmol), 2-bromoethylbenzene (65 mg, 0.35 mmol) and TEA (0.097 ml, 0.7 mmol) in DMF (2 ml) were heated at 50° C. for 65 hrs. The temperature was increased to 90° C. for 2 hrs then the solvents were evaporated. The resultant material was dissolved in DCM and chromatographed on silica eluting with a shallow gradient of 0-10% MeOH/DCM. Fractions containing product were combined and evaporated to a gum, ether was added, re-evaporated and dried under vacuum, to give the title compound as a white solid (33 mg, 35%). 
NMR (400.132 MHz, CDCl3) 1.59 (m, 8H), 2.08 (m, 2H), 2.20 (m, 2H), 2.57 (s, 3H), 2.62 (m, 2H), 2.82 (m, 2H), 2.97 (m, 2H), 3.86 (m, 1H), 4.99 (m, 1H), 5.63 (m, 1H), 6.72 (d, 1H), 7.21 (m, 3H), 7.29 (m, 3H), 8.20 (d, 1H); MH+ 405.",4-(2-Methyl-3-propan-2-yl-imidazol-4-yl)-N-(4-piperidinyl)pyrimidin-2-amine | 2-bromoethylbenzene | TEA | DMF | resultant material | DCM | silica | MeOH/DCM | ether | title compound 6 | US08957080B2,"To a sealed tube was added 3-[(2S)-2-[[4-amino-3-(2-fluoro-4-phenoxy-phenyl)pyrazolo[3,4-d]pyrimidin-1-yl]methyl]pyrrolidin-1-yl]-3-oxo-propanenitrile (900 mg, 1.91 mmol), ethanol (12 mL), piperidine (0.23 mL, 2.29 mmol) and 2-methyl-2-morpholino-propanal (0.49 mL, 2.86 mmol). The tube was sealed and heated to 105° C. for 24 hrs. The mixture was then cooled, concentrated and then dissolved in ethyl acetate (100 mL) and washed with 5% citric acid (100 ml) and then brine. The organic layer was dried (MgSO4), filtered and concentrated. The crude material was purified by Isolera (column size 100 g. Solvent system 4%-8% MeOH/EtOAc) to obtain 245 mg (21% yield) of the title compound. MS (pos. ion) m/z: 611 (M+1).","3-[(2S)-2-[[4-amino-3-(2-fluoro-4-phenoxy-phenyl)pyrazolo[3,4-d]pyrimidin-1-yl]methyl]pyrrolidin-1-yl]-3-oxo-propanenitrile | ethanol | piperidine | 2-methyl-2-morpholino-propanal | ethyl acetate | citric acid | MgSO4 | crude material | MeOH/EtOAc | title compound" 7 | US09126945B2,"55.0 g (296 mmol) of methyl 2-chloro-5-aminobenzoate and 49.1 g (356 mmol) of potassium carbonate are suspended in 500 ml of acetonitrile p.a. 22.1 ml (356 mmol) of methyl iodide are added dropwise to the reaction mixture. The suspension is then boiled under reflux for 3 hours. After cooling, the reaction mixture is filtered. The filtrate is diluted with water. The aqueous phase is extracted twice with ethyl acetate. The combined organic phases are dried over sodium sulphate and filtered. The solvent is removed under reduced pressure on a rotary evaporator. The crude product is purified by column chromatography. This gives 30.0 g (51%) of methyl 2-chloro-5-(methylamino)benzoate.",methyl 2-chloro-5-aminobenzoate | potassium carbonate | acetonitrile | methyl iodide | water | ethyl acetate | sodium sulphate | crude product 8 | US04806555,"A mixture of 35.1 g. (0.15 mole) of 1-(o-chlorophenoxy)-2-hydroxybutyl chloride, 32.6 g. (0.2 mole) of N-phenylpiperazine and 400 ml. of isopropanol was refluxed for 48 hr. The reaction mixture was allowed to stand in a refrigerator overnight and filtered. The filtrate was treated with ethereal hydrogen chloride and the salt precipitated by the addition of ether. The white crystalline solid which formed was dissolved in 0.1 mole of hydrochloric acid and then neutralized with sodium hydroxide producing a crystalline precipitate. This was recrystallized with isopropanol yielding 36 g of the free base of the product which melted at 100°-101.5° C.",1-(o-chlorophenoxy)-2-hydroxybutyl chloride | N-phenylpiperazine | isopropanol | hydrogen chloride | ether | hydrochloric acid | sodium hydroxide | isopropanol 9 | US08114888B2,"A mixture of 4-Nitro-2,6-lutidine 1-oxide (21, 5.1 g, 30 mmol), palladized charcoal (10% Pd, 1 g) and acetic acid (2 ml) in methanol (200 ml) is hydrogenated under pressure (40 psi) over 10 hours using a hydrogenation apparatus. The reaction is followed with LC-MS. After filtration, the filtrate is concentrated. 
The remaining oil is further dried by lyophilization, yielding 4.5 g of 4-amino-2,6-lutidine (22) containing approximately 15% acetic acid. 1H NMR (300 MHz, CDCl3) δ 2.3 (s, 6H), 7.2 (s, 2H). MS, m/z 123 (M+1), 243 (2M−1).","4-Nitro-2,6-lutidine 1-oxide | charcoal | acetic acid | methanol | 4-amino-2,6-lutidine" 10 | US08178538B2,The product was obtained starting from 3-((E)-3-Dimethylamino-acryloyl)-1-[3-(piperidine-1-sulfonyl)-phenyl]-1H-pyridazin-4-one (A-26) and 2-fluoro-phenylhydrazine according to the method described for example 91. MS: M=480.1 (M+H)+,3-((E)-3-Dimethylamino-acryloyl)-1-[3-(piperidine-1-sulfonyl)-phenyl]-1H-pyridazin-4-one | 2-fluoro-phenylhydrazine 11 | US07981874B2,"To a solution of methyl 4-({[2-[(tert-butoxycarbonyl)amino]-5-(2-thienyl)phenyl]amino}carbonyl)benzoate (3.20 g, 7.07 mmol) in 1:1:2 H2O:MeOH:THF (80 mL) was added LiOH (1.20 g, 50.1 mmol). The reaction was stirred at room temperature for 18 h, diluted with EtOAc, washed with 2 N HCl, dried (Na2SO4), and evaporated to give the desired product as a red solid. MS: cal'd 461 (MNa+), exp 461 (MNa+).",methyl 4-({[2-[(tert-butoxycarbonyl)amino]-5-(2-thienyl)phenyl]amino}carbonyl)benzoate | H2O | MeOH | THF | LiOH | EtOAc | HCl | Na2SO4 | desired product 12 | -------------------------------------------------------------------------------- /Paragraph2Comound/finetune_bart_or_t5_for_paragraph2compound.py: -------------------------------------------------------------------------------- 1 | # Finetuning Bart or T5 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import logging 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback 10 | from datasets import load_metric, Dataset, DatasetDict 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Configuration 15 | class CFG: 16 | # Model Configuration 17 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 18 | num_train_epochs = 50 19 | save_total_limit = 50 20 | batch_size = 8 21 | learning_rate = 1e-5 22 | max_input_length = 512 23 | max_target_length = 512 24 | weight_decay = 0.01 25 | save_strategy = "epoch" 26 | evaluation_strategy = "epoch" 27 | interval_eval_epoch = 1 # the number of interval epochs to evaluate (inference) 28 | model_name = "bart-base" # "bart-base" or "t5-base" 29 | pretrained_dir = "/home/zhangwei/pretrained_models/" # Path of the pretrained model downloaded from Hugging Face 30 | 31 | # Trail Set 32 | num_of_train_trail = 1 33 | num_of_train_data = 10000 34 | saved_models_dir = f"saved_models/{model_name}/trail_{num_of_train_trail}_train_{num_of_train_data}_{model_name}_lr_{learning_rate}_epoch_{num_train_epochs}_bs_{batch_size}_intervel_{interval_eval_epoch}/" 35 | output_dir = f"results/predictions/{saved_models_dir}" 36 | 37 | # Data Configuration 38 | train_file = f"data/train/trial_{num_of_train_trail}/train_{num_of_train_data}.csv" 39 | test_file = "data/test/test_1000.csv" 40 | source_text_column = "Paragraph" 41 | target_text_column = "Compound" 42 | 43 | def load_data(): 44 | train_df = pd.read_csv(CFG.train_file) 45 | test_df = pd.read_csv(CFG.test_file) 46 | train_dataset = Dataset.from_dict(train_df.astype(str)) 47 | test_dataset = Dataset.from_dict(test_df.astype(str)) 48 | datasets = DatasetDict({"train": train_dataset, "test": test_dataset}) 49 | print(datasets) 50 | return datasets 51 | 52 | def 
tokenize_and_encode(tokenizer, datasets): 53 | def tokenize_function(examples): 54 | model_inputs = tokenizer(examples[CFG.source_text_column], max_length=CFG.max_input_length, truncation=True) 55 | model_labels = tokenizer(examples[CFG.target_text_column], max_length=CFG.max_target_length, truncation=True) 56 | model_inputs["labels"] = model_labels["input_ids"] 57 | return model_inputs 58 | return datasets.map(tokenize_function, batched=True) 59 | 60 | def logging_config(): 61 | logging.info("Configuration Details:") 62 | for attr in dir(CFG): 63 | # Filter out private attributes and methods 64 | if not attr.startswith("__") and not callable(getattr(CFG, attr)): 65 | logging.info(f"{attr}: {getattr(CFG, attr)}") 66 | 67 | # Custom Callback 68 | class CustomCallback(TrainerCallback): 69 | def __init__(self, trainer) -> None: 70 | super().__init__() 71 | self._trainer = trainer 72 | 73 | def on_log(self, args, state, control, logs=None, **kwargs): 74 | if 'loss' in logs: 75 | training_loss = logs['loss'] 76 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 77 | 78 | if 'eval_loss' in state.log_history[-1]: 79 | eval_loss = state.log_history[-1]['eval_loss'] 80 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 81 | 82 | def on_epoch_end(self, args, state, control, **kwargs): 83 | logging.info("Saving inference results for test_set...") 84 | output = self._trainer.predict(self._trainer.eval_dataset) 85 | epoch = int(state.epoch) 86 | 87 | if epoch % CFG.interval_eval_epoch == 0 : 88 | # Decode generated summaries into text 89 | decoded_ids = output.predictions 90 | 91 | # Replace -100 in the labels as we can't decode them 92 | decoded_ids = np.where(decoded_ids != -100, decoded_ids, tokenizer.pad_token_id) 93 | decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=True) 94 | paragraphs = [i[CFG.source_text_column] for i in self._trainer.eval_dataset] 95 | ground_truth = [i[CFG.target_text_column] for i in self._trainer.eval_dataset] 96 | prediction = [decoded_text for decoded_text in decoded_texts] 97 | 98 | # Save predictions to csv 99 | predicted_df = pd.DataFrame() 100 | predicted_df['Paragraph'] = paragraphs 101 | predicted_df['Generated Text'] = prediction 102 | predicted_df['Actual Text'] = ground_truth 103 | predicted_df.to_csv(f"{CFG.output_dir}/epoch_{epoch}.csv", index = None) 104 | 105 | def main(): 106 | # mkdir needed folders 107 | if not os.path.exists(CFG.saved_models_dir): 108 | os.makedirs(CFG.saved_models_dir) 109 | if not os.path.exists(CFG.output_dir): 110 | os.makedirs(CFG.output_dir) 111 | 112 | # Setup logging 113 | logging.basicConfig(filename = CFG.saved_models_dir+'/training.log', level = logging.INFO) 114 | logging_config() 115 | 116 | # Loading Tokenizer and Model 117 | print("Loading Tokenizer and Model ...") 118 | logging.info(f"[Device]: {CFG.device}...") 119 | logging.info(f"[Model]: Loading {CFG.model_name}...") 120 | global tokenizer 121 | tokenizer = AutoTokenizer.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}") 122 | model = AutoModelForSeq2SeqLM.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}").to(CFG.device) 123 | 124 | # Loading Data 125 | print("Loading Data ...") 126 | datasets = load_data() 127 | 128 | # Preparing Data 129 | print("Preparing Data ...") 130 | logging.info(f"[Dataset]:\n{datasets}") 131 | tokenized_datasets = tokenize_and_encode(tokenizer, datasets) 132 | data_collator = 
DataCollatorForSeq2Seq(tokenizer, model = model) 133 | 134 | # Training Arguments 135 | args = Seq2SeqTrainingArguments( 136 | output_dir = CFG.saved_models_dir, 137 | logging_dir = CFG.saved_models_dir + "logs/", 138 | evaluation_strategy = CFG.evaluation_strategy, 139 | learning_rate = CFG.learning_rate, 140 | per_device_train_batch_size = CFG.batch_size, 141 | per_device_eval_batch_size = CFG.batch_size, 142 | weight_decay = CFG.weight_decay, 143 | generation_max_length = CFG.max_target_length, 144 | save_strategy = CFG.save_strategy, 145 | num_train_epochs = CFG.num_train_epochs, 146 | save_total_limit = CFG.save_total_limit, 147 | predict_with_generate = True, 148 | logging_steps = len(tokenized_datasets["train"]) // CFG.batch_size, 149 | push_to_hub = False, 150 | report_to = "tensorboard") 151 | 152 | # Trainer 153 | trainer = Seq2SeqTrainer( 154 | model, 155 | args, 156 | train_dataset = tokenized_datasets["train"], 157 | eval_dataset = tokenized_datasets["test"], 158 | data_collator = data_collator, 159 | tokenizer = tokenizer) 160 | 161 | # Training and logging 162 | print("Training ...") 163 | trainer.add_callback(CustomCallback(trainer)) 164 | trainer.train() 165 | 166 | if __name__ == "__main__": 167 | main() -------------------------------------------------------------------------------- /Paragraph2Comound/finetune_llms_full_for_paragraph2compound.py: -------------------------------------------------------------------------------- 1 | import os 2 | # Setup environment (full fine-tuning 7b needs 160GB Memory) 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" 4 | 5 | import torch 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | num_of_train_trial = 3 16 | num_of_train_data = 10000 17 | train_df = pd.read_csv(f"data/train/trial_{num_of_train_trial}/train_{num_of_train_data}.csv") 18 | test_df = pd.read_csv("data/test/test_1000.csv") 19 | 20 | source_text = "Paragraph" 21 | target_text = "Compound" 22 | instruction = f'{source_text}2{target_text}: ' 23 | # insturction = '''You are an assistant that formats chemical paragraphs to compoulease extract all compound names in the paragraph, the compound names should be split in " | ".''' 24 | 25 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 26 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 27 | 28 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 29 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 30 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 31 | print(dataset) 32 | 33 | # TrainingArguments parameters 34 | num_train_epochs = 20 35 | save_steps = 0 # Save checkpoint every X updates steps 36 | logging_steps = 25 # Log every X updates steps 37 | 38 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 39 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 40 | 41 | per_device_train_batch_size = 2 # Batch size per GPU for training 42 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 43 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 44 | gradient_checkpointing = True # Enable gradient checkpointing 45 | 46 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 47 | learning_rate = 5e-6 # Initial learning rate (AdamW optimizer, 1e-5 or 5e-6 or 1e-4) 48 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 49 | 50 | optim = "paged_adamw_32bit" # Optimizer to use 51 | lr_scheduler_type = "cosine" # Learning rate schedule 52 | 53 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 54 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 55 | 56 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 57 | 58 | # SFT parameters 59 | max_seq_length = 2048 # Maximum sequence length to use (default 1024) 60 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 61 | device_map = "auto" # Load the entire model on the GPU 0, or "auto" 62 | 63 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
64 | model_name = "/home/zhangwei/pretrained_models/Meta-Llama-3-8B-Instruct" # Path of the pretrained model downloaded from Hugging Face 65 | new_model_dir = f"saved_models/Meta-Llama-3-8B-Instruct/trial_{num_of_train_trial}_train_{len(train_df)}_lr{learning_rate}_bs{per_device_train_batch_size}" # Fine-tuned model name 66 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 67 | 68 | # Load base model 69 | model = AutoModelForCausalLM.from_pretrained( 70 | pretrained_model_name_or_path = model_name, 71 | torch_dtype = torch.bfloat16, 72 | device_map = device_map 73 | ) 74 | 75 | # Load tokenizer 76 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 77 | tokenizer.pad_token = tokenizer.eos_token 78 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 79 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 80 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 81 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 82 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 83 | print("------------vocab_size is------------", tokenizer.vocab_size) 84 | print("------------vocab_size is------------", len(tokenizer)) 85 | 86 | # Set training parameters 87 | training_arguments = TrainingArguments( 88 | output_dir = output_dir, 89 | logging_dir = output_dir + "/logs/", 90 | # evaluation_strategy = "epoch", 91 | save_strategy = "epoch", 92 | num_train_epochs = num_train_epochs, 93 | save_total_limit = num_train_epochs, 94 | per_device_train_batch_size = per_device_train_batch_size, 95 | gradient_accumulation_steps = gradient_accumulation_steps, 96 | # optim = optim, 97 | # save_steps=save_steps, 98 | logging_steps = logging_steps, 99 | learning_rate = learning_rate, 100 | weight_decay = weight_decay, 101 | fp16 = fp16, 102 | bf16 = bf16, 103 | max_grad_norm = max_grad_norm, 104 | max_steps = max_steps, 105 | warmup_ratio = warmup_ratio, 106 | group_by_length = group_by_length, 107 | lr_scheduler_type = lr_scheduler_type, 108 | report_to = "tensorboard" 109 | ) 110 | 111 | # Set supervised fine-tuning parameters 112 | trainer = SFTTrainer( 113 | model = model, 114 | train_dataset = dataset['train'], 115 | eval_dataset = dataset["test"], 116 | dataset_text_field = "text", 117 | max_seq_length = max_seq_length, 118 | tokenizer = tokenizer, 119 | args = training_arguments, 120 | packing = packing, 121 | ) 122 | 123 | class SaveBestModelCallback(TrainerCallback): 124 | def __init__(self): 125 | super().__init__() 126 | self.best_eval_loss = float('inf') 127 | self.best_model_checkpoint = None 128 | 129 | def on_log(self, args, state, control, logs=None, **kwargs): 130 | # Check if training_loss is in the logs and print it 131 | if 'loss' in logs: 132 | training_loss = logs['loss'] 133 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 134 | 135 | def on_evaluate(self, args, state, control, **kwargs): 136 | # Check if eval_loss is in the logs 137 | if 'eval_loss' in state.log_history[-1]: 138 | eval_loss = state.log_history[-1]['eval_loss'] 139 | # Print current eval_loss with epoch and step 140 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 141 | 142 | if eval_loss < self.best_eval_loss: 143 | self.best_eval_loss = eval_loss 144 | # Save the best model 145 | 
self.best_model_checkpoint = state.global_step 146 | trainer.save_model(f"{args.output_dir}/best_model") 147 | # Print loss of Best Model 148 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 149 | 150 | # Create an instance of the callback 151 | save_best_model_callback = SaveBestModelCallback() 152 | 153 | # Training and logging 154 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 155 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 156 | logging.info(f"""[Model]: Loading {model_name}...\n""") 157 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 158 | 159 | # Add the callback to the trainer 160 | trainer.add_callback(save_best_model_callback) 161 | 162 | # Train model 163 | trainer.train() 164 | 165 | # Save trained model 166 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2Comound/finetune_llms_peft_for_paragraph2compound.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 3 | import torch 4 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 5 | 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | num_of_train_trail = 1 16 | num_of_train_data = 10000 17 | train_df = pd.read_csv(f"data/train/trial_{num_of_train_trail}/train_{num_of_train_data}.csv") 18 | test_df = pd.read_csv("data/test/test_1000.csv") 19 | 20 | source_text = "Paragraph" 21 | target_text = "Compound" 22 | instruction = f'{source_text}2{target_text}: ' 23 | # insturction = '''You are an assistant that formats chemical paragraphs to compounds.Please extract all compound names in the paragraph, the compound names should be split in " | ".''' 24 | 25 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 26 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 27 | 28 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 29 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 30 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 31 | print(dataset) 32 | 33 | ################################################################################ 34 | # Parameters Setting 35 | ################################################################################ 36 | # QLoRA parameters 37 | lora_r = 64 # LoRA attention dimension (8, 16, 64, larger is better) 38 | lora_alpha = 128 # Alpha parameter for LoRA scaling (lora_r*2) 39 | lora_dropout = 0.1 # Dropout probability for LoRA layers 40 | 41 | # bitsandbytes parameters 42 | use_4bit = True # Activate 4-bit precision base model loading 43 | bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models 44 | bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4) 45 | use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization) 46 | 47 | # TrainingArguments parameters 48 | num_train_epochs = 20 49 | save_steps = 0 # Save checkpoint every X updates steps 50 | logging_steps = 25 # Log every X updates steps 51 | 52 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 53 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 54 | 55 | per_device_train_batch_size = 2 # Batch size per GPU for training 56 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 57 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 58 | gradient_checkpointing = True # Enable gradient checkpointing 59 | 60 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 61 | learning_rate = 1e-5 # Initial learning rate (AdamW optimizer) 62 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 63 | 64 | optim = "paged_adamw_32bit" # Optimizer to use 65 | lr_scheduler_type = "cosine" # Learning rate schedule 66 | 67 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 68 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 69 | 70 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 71 | 72 | # SFT parameters 73 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 74 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 75 | device_map = {"": 0} # Load the entire model on the GPU 0, or "auto" 76 | 77 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
78 | model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf" # Path of the pretrained model downloaded from Hugging Face 79 | new_model_dir = f"saved_models/llama2_13b_chat_qlora/train_{len(train_df)}_lora_r{lora_r}_lr{learning_rate}" # Fine-tuned model name 80 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 81 | 82 | ################################################################################ 83 | # Train 84 | ################################################################################ 85 | # Load tokenizer and model with QLoRA configuration 86 | compute_dtype = getattr(torch, bnb_4bit_compute_dtype) 87 | bnb_config = BitsAndBytesConfig( 88 | load_in_4bit=use_4bit, 89 | bnb_4bit_quant_type=bnb_4bit_quant_type, 90 | bnb_4bit_compute_dtype=compute_dtype, 91 | bnb_4bit_use_double_quant=use_nested_quant, 92 | ) 93 | 94 | # Check GPU compatibility with bfloat16 95 | if compute_dtype == torch.float16 and use_4bit: 96 | major, _ = torch.cuda.get_device_capability() 97 | if major >= 8: 98 | print("=" * 80) 99 | print("Your GPU supports bfloat16: accelerate training with bf16=True") 100 | print("=" * 80) 101 | 102 | # Load base model 103 | model = AutoModelForCausalLM.from_pretrained( 104 | pretrained_model_name_or_path = model_name, 105 | quantization_config = bnb_config, 106 | device_map = device_map 107 | ) 108 | model.config.use_cache = False 109 | model.config.pretraining_tp = 1 110 | 111 | # Load LLaMA tokenizer 112 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 113 | tokenizer.pad_token = tokenizer.eos_token 114 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 115 | 116 | # Load LoRA configuration 117 | peft_config = LoraConfig( 118 | lora_alpha=lora_alpha, 119 | lora_dropout=lora_dropout, 120 | r=lora_r, 121 | bias="none", 122 | task_type="CAUSAL_LM", 123 | ) 124 | 125 | # Set training parameters 126 | training_arguments = TrainingArguments( 127 | output_dir=output_dir, 128 | logging_dir = output_dir + "/logs/", 129 | evaluation_strategy = "epoch", 130 | save_strategy = "epoch", 131 | num_train_epochs = num_train_epochs, 132 | save_total_limit = num_train_epochs, 133 | per_device_train_batch_size = per_device_train_batch_size, 134 | gradient_accumulation_steps = gradient_accumulation_steps, 135 | optim = optim, 136 | #save_steps=save_steps, 137 | logging_steps = logging_steps, 138 | learning_rate = learning_rate, 139 | weight_decay = weight_decay, 140 | fp16 = fp16, 141 | bf16 = bf16, 142 | max_grad_norm = max_grad_norm, 143 | max_steps = max_steps, 144 | warmup_ratio = warmup_ratio, 145 | group_by_length = group_by_length, 146 | lr_scheduler_type = lr_scheduler_type, 147 | report_to = "tensorboard" 148 | ) 149 | 150 | # Set supervised fine-tuning parameters 151 | trainer = SFTTrainer( 152 | model = model, 153 | train_dataset = dataset['train'], 154 | eval_dataset = dataset["test"], 155 | peft_config = peft_config, 156 | dataset_text_field ="text", 157 | max_seq_length = max_seq_length, 158 | tokenizer = tokenizer, 159 | args = training_arguments, 160 | packing = packing, 161 | ) 162 | 163 | class SaveBestModelCallback(TrainerCallback): 164 | def __init__(self): 165 | super().__init__() 166 | self.best_eval_loss = float('inf') 167 | self.best_model_checkpoint = None 168 | 169 | def on_log(self, args, state, control, logs=None, **kwargs): 170 | # Check if training_loss is in the logs and print it 171 | if 'loss' in logs: 172 | training_loss = 
logs['loss'] 173 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 174 | 175 | def on_evaluate(self, args, state, control, **kwargs): 176 | # Check if eval_loss is in the logs 177 | if 'eval_loss' in state.log_history[-1]: 178 | eval_loss = state.log_history[-1]['eval_loss'] 179 | # Print current eval_loss with epoch and step 180 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 181 | 182 | if eval_loss < self.best_eval_loss: 183 | self.best_eval_loss = eval_loss 184 | # Save the best model 185 | self.best_model_checkpoint = state.global_step 186 | trainer.save_model(f"{args.output_dir}/best_model") 187 | # Print loss of Best Model 188 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 189 | 190 | # Create an instance of the callback 191 | save_best_model_callback = SaveBestModelCallback() 192 | 193 | # Training and logging 194 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 195 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 196 | logging.info(f"""[Model]: Loading {model_name}...\n""") 197 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 198 | 199 | # Add the callback to the trainer 200 | trainer.add_callback(save_best_model_callback) 201 | 202 | # Train model 203 | trainer.train() 204 | 205 | # Save trained model 206 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2Comound/results/README.md: -------------------------------------------------------------------------------- 1 | Save the generated outputs by different models. -------------------------------------------------------------------------------- /Paragraph2MOFInfo/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Data for Paragraph2MOFInfo 2 | 3 | - Data for fine-tuning LLMs are in ```data/data_for_llms```. 4 | 5 | - Data for fine-tuning bart or T5 are in ```data/data_for_bart_or_t5```. 6 | 7 | (preprocessed beacuse the 512 length limitation, split into 11 prefix tasks ) 8 | 9 | ## 2. Methods for Paragraph2MOFInfo 10 | 11 | ### Prompt Engineering ChatGPT (GPT-4, GPT-3.5-Turbo) 12 | 13 | See in ```prompt_chatgpt_for_paragraph2MOFInfo.ipynb``` 14 | 15 | ### Fine-tuning ChatGPT (GPT-3.5-Turbo) 16 | 17 | See in ```finetune_chatgpt_for_paragraph2MOFInfo.ipynb``` 18 | 19 | ### Full Parameter Fine-tuning Open-source Large Language Models (Mistral, Llama3, Llama2) 20 | 21 | Training Code in ```finetune_llms_full_for_paragraph2MOFInfo.py``` 22 | 23 | Inferencing Code in ```vllm_inference_full_finetuned_llms.ipynb``` 24 | 25 | ### Parameter Efficient Fine-tuning (PEFT) Open-source Large Language Models (Mistral, Llama3, Llama2) 26 | 27 | Training Code in ```finetune_llms_peft_for_paragraph2MOFInfo.py`` 28 | 29 | Inferencing Code in ```vllm_inference_peft_finetuned_llms.ipynb``` 30 | 31 | ### Fine-tuning Language Models (T5, Bart) 32 | 33 | See in ```finetune_bart_or_t5_for_paragraph2MOFInfo.py``` 34 | 35 | ## 3. 
Evaluating the results of Paragraph2MOFInfo 36 | 37 | All predictions will be saved in ```results/predictions``` 38 | 39 | Evalutating codes for LLMs (ChatGPT, Llama, Mistral) are in ```evaluate_llms_paragraph2MOFInfo_all.ipynb``` 40 | 41 | Evalutating codes for LM (Bart or T5) are in ```evaluate_bart_or_t5_Paragraph2NMR.ipynb``` -------------------------------------------------------------------------------- /Paragraph2MOFInfo/finetune_bart_or_t5_for_paragraph2MOFInfo.py: -------------------------------------------------------------------------------- 1 | # Finetuning Bart or T5 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import logging 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback 10 | from datasets import load_metric, Dataset, DatasetDict 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Configuration 15 | class CFG: 16 | # Model Configuration 17 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 18 | num_train_epochs = 50 19 | save_total_limit = 50 20 | batch_size = 8 21 | learning_rate = 1e-5 22 | max_input_length = 1024 23 | max_target_length = 1024 24 | weight_decay = 0.01 25 | save_strategy = "epoch" 26 | evaluation_strategy = "epoch" 27 | interval_eval_epoch = 1 # the number of interval epochs to evaluate (inference) 28 | model_name = "t5-base" # "bart-base" or "t5-base" 29 | pretrained_dir = "/home/zhangwei/pretrained_models/" # Path of the pretrained model downloaded from Hugging Face 30 | saved_models_dir = f"saved_models/{model_name}/{model_name}_lr_{learning_rate}_epoch_{num_train_epochs}_bs_{batch_size}_intervel_{interval_eval_epoch}/" 31 | output_dir = f"results/predictions/{saved_models_dir}" 32 | 33 | # Data Configuration 34 | train_file = "data/data_for_bart_and_t5/train_329_11_tasks.csv" 35 | test_file = "data/data_for_bart_and_t5/test_329_11_tasks.csv" 36 | source_text_column = "paragraph" 37 | target_text_column = "output" 38 | 39 | def load_data(): 40 | train_df = pd.read_csv(CFG.train_file) 41 | test_df = pd.read_csv(CFG.test_file) 42 | train_dataset = Dataset.from_dict(train_df.astype(str)) 43 | test_dataset = Dataset.from_dict(test_df.astype(str)) 44 | datasets = DatasetDict({"train": train_dataset, "test": test_dataset}) 45 | print(datasets) 46 | return datasets 47 | 48 | def tokenize_and_encode(tokenizer, datasets): 49 | def tokenize_function(examples): 50 | model_inputs = tokenizer(examples[CFG.source_text_column], max_length=CFG.max_input_length, truncation=True) 51 | model_labels = tokenizer(examples[CFG.target_text_column], max_length=CFG.max_target_length, truncation=True) 52 | model_inputs["labels"] = model_labels["input_ids"] 53 | return model_inputs 54 | return datasets.map(tokenize_function, batched=True) 55 | 56 | def logging_config(): 57 | logging.info("Configuration Details:") 58 | for attr in dir(CFG): 59 | # Filter out private attributes and methods 60 | if not attr.startswith("__") and not callable(getattr(CFG, attr)): 61 | logging.info(f"{attr}: {getattr(CFG, attr)}") 62 | 63 | # Custom Callback 64 | class CustomCallback(TrainerCallback): 65 | def __init__(self, trainer) -> None: 66 | super().__init__() 67 | self._trainer = trainer 68 | 69 | def on_log(self, args, state, control, logs=None, **kwargs): 70 | if 'loss' in logs: 71 | training_loss = logs['loss'] 72 | logging.info(f"Epoch: {int(state.epoch)}, 
Step: {state.global_step}, Current training_loss: {training_loss}") 73 | 74 | if 'eval_loss' in state.log_history[-1]: 75 | eval_loss = state.log_history[-1]['eval_loss'] 76 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 77 | 78 | def on_epoch_end(self, args, state, control, **kwargs): 79 | logging.info("Saving inference results for test_set...") 80 | output = self._trainer.predict(self._trainer.eval_dataset) 81 | epoch = int(state.epoch) 82 | 83 | if epoch % CFG.interval_eval_epoch == 0 : 84 | # Decode generated summaries into text 85 | decoded_ids = output.predictions 86 | 87 | # Replace -100 in the labels as we can't decode them 88 | decoded_ids = np.where(decoded_ids != -100, decoded_ids, tokenizer.pad_token_id) 89 | decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=True) 90 | paragraphs = [i[CFG.source_text_column] for i in self._trainer.eval_dataset] 91 | ground_truth = [i[CFG.target_text_column] for i in self._trainer.eval_dataset] 92 | prediction = [decoded_text for decoded_text in decoded_texts] 93 | 94 | # Save predictions to csv 95 | predicted_df = pd.DataFrame() 96 | predicted_df['Paragraph'] = paragraphs 97 | predicted_df['Generated Text'] = prediction 98 | predicted_df['Actual Text'] = ground_truth 99 | predicted_df.to_csv(f"{CFG.output_dir}/epoch_{epoch}.csv", index = None) 100 | 101 | def main(): 102 | # mkdir needed folders 103 | if not os.path.exists(CFG.saved_models_dir): 104 | os.makedirs(CFG.saved_models_dir) 105 | if not os.path.exists(CFG.output_dir): 106 | os.makedirs(CFG.output_dir) 107 | 108 | # Setup logging 109 | logging.basicConfig(filename = CFG.saved_models_dir+'/training.log', level = logging.INFO) 110 | logging_config() 111 | 112 | # Loading Tokenizer and Model 113 | print("Loading Tokenizer and Model ...") 114 | logging.info(f"[Device]: {CFG.device}...") 115 | logging.info(f"[Model]: Loading {CFG.model_name}...") 116 | global tokenizer 117 | tokenizer = AutoTokenizer.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}") 118 | model = AutoModelForSeq2SeqLM.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}").to(CFG.device) 119 | 120 | # Loading Data 121 | print("Loading Data ...") 122 | datasets = load_data() 123 | 124 | # Preparing Data 125 | print("Preparing Data ...") 126 | logging.info(f"[Dataset]:\n{datasets}") 127 | tokenized_datasets = tokenize_and_encode(tokenizer, datasets) 128 | data_collator = DataCollatorForSeq2Seq(tokenizer, model = model) 129 | 130 | # Training Arguments 131 | args = Seq2SeqTrainingArguments( 132 | output_dir = CFG.saved_models_dir, 133 | logging_dir = CFG.saved_models_dir + "logs/", 134 | evaluation_strategy = CFG.evaluation_strategy, 135 | learning_rate = CFG.learning_rate, 136 | per_device_train_batch_size = CFG.batch_size, 137 | per_device_eval_batch_size = CFG.batch_size, 138 | weight_decay = CFG.weight_decay, 139 | generation_max_length = CFG.max_target_length, 140 | save_strategy = CFG.save_strategy, 141 | num_train_epochs = CFG.num_train_epochs, 142 | save_total_limit = CFG.save_total_limit, 143 | predict_with_generate = True, 144 | logging_steps = len(tokenized_datasets["train"]) // CFG.batch_size, 145 | push_to_hub = False, 146 | report_to = "tensorboard") 147 | 148 | # Trainer 149 | trainer = Seq2SeqTrainer( 150 | model, 151 | args, 152 | train_dataset = tokenized_datasets["train"], 153 | eval_dataset = tokenized_datasets["test"], 154 | data_collator = data_collator, 155 | tokenizer = tokenizer) 156 | 157 | # Training and logging 
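# NOTE (illustrative, hedged; not part of the original script): the CustomCallback above
# already dumps per-epoch test-set predictions to f"{CFG.output_dir}/epoch_{epoch}.csv",
# so no separate inference script is needed for Bart/T5. If a run is interrupted,
# Hugging Face's Seq2SeqTrainer can resume from the latest checkpoint saved in
# CFG.saved_models_dir, e.g.:
#   trainer.train(resume_from_checkpoint=True)   # optional; not used in the original run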
158 | print("Training ...") 159 | trainer.add_callback(CustomCallback(trainer)) 160 | trainer.train() 161 | 162 | if __name__ == "__main__": 163 | main() -------------------------------------------------------------------------------- /Paragraph2MOFInfo/finetune_llms_full_for_paragraph2MOFInfo.py: -------------------------------------------------------------------------------- 1 | import os 2 | # Setup environment (full fine-tuning 7b needs 160GB Memory) 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7" 4 | 5 | import torch 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_file = "data/data_for_llms/train_329.csv" 16 | test_file = "data/data_for_llms/test_329.csv" 17 | train_df = pd.read_csv(train_file, encoding='utf-8') 18 | test_df = pd.read_csv(test_file, encoding='utf-8') 19 | 20 | source_text = "Paragraph" 21 | target_text = "MOFInfo" 22 | 23 | instruction = f'{source_text}2{target_text}: ' 24 | instruction = '''Extract the MOF synthesis information from the Paragraph.''' 25 | 26 | train_df['text'] = f'[INST] {instruction} ' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 27 | test_df['text'] = f'[INST] {instruction} ' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! " 28 | 29 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 30 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 31 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 32 | print(dataset) 33 | 34 | # TrainingArguments parameters 35 | num_train_epochs = 20 36 | save_steps = 0 # Save checkpoint every X updates steps 37 | logging_steps = 25 # Log every X updates steps 38 | 39 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 40 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 41 | 42 | per_device_train_batch_size = 2 # Batch size per GPU for training 43 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 44 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 45 | gradient_checkpointing = True # Enable gradient checkpointing 46 | 47 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 48 | learning_rate = 5e-6 # Initial learning rate (AdamW optimizer, 1e-5 or 5e-6 or 1e-4) 49 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 50 | 51 | optim = "paged_adamw_32bit" # Optimizer to use 52 | lr_scheduler_type = "cosine" # Learning rate schedule 53 | 54 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 55 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 56 | 57 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 58 | 59 | # SFT parameters 60 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 61 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 62 | device_map = "auto" # Load the entire model on the GPU 0, or "auto" 63 | 64 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
65 | model_name = "../../pretrained_models/Mistral-7B-Instruct-v0.2" # Path of the pretrained model downloaded from Hugging Face 66 | new_model_dir = f"saved_models/Mistral-7B-Instruct-v0.2/train_{len(train_df)}_lr{learning_rate}_bs{per_device_train_batch_size}" # Fine-tuned model name 67 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 68 | 69 | # Load base model 70 | model = AutoModelForCausalLM.from_pretrained( 71 | pretrained_model_name_or_path = model_name, 72 | torch_dtype = torch.bfloat16, 73 | device_map = device_map 74 | ) 75 | 76 | # Load tokenizer 77 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 78 | tokenizer.pad_token = tokenizer.eos_token 79 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 80 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 81 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 82 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 83 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 84 | print("------------vocab_size is------------", tokenizer.vocab_size) 85 | print("------------vocab_size is------------", len(tokenizer)) 86 | 87 | # Set training parameters 88 | training_arguments = TrainingArguments( 89 | output_dir = output_dir, 90 | logging_dir = output_dir + "/logs/", 91 | # evaluation_strategy = "epoch", 92 | save_strategy = "epoch", 93 | num_train_epochs = num_train_epochs, 94 | save_total_limit = num_train_epochs, 95 | per_device_train_batch_size = per_device_train_batch_size, 96 | gradient_accumulation_steps = gradient_accumulation_steps, 97 | # optim = optim, 98 | #save_steps=save_steps, 99 | logging_steps = logging_steps, 100 | learning_rate = learning_rate, 101 | weight_decay = weight_decay, 102 | fp16 = fp16, 103 | bf16 = bf16, 104 | max_grad_norm = max_grad_norm, 105 | max_steps = max_steps, 106 | warmup_ratio = warmup_ratio, 107 | group_by_length = group_by_length, 108 | lr_scheduler_type = lr_scheduler_type, 109 | report_to = "tensorboard", 110 | ) 111 | 112 | # Set supervised fine-tuning parameters 113 | trainer = SFTTrainer( 114 | model = model, 115 | train_dataset = dataset['train'], 116 | eval_dataset = dataset["test"], 117 | dataset_text_field = "text", 118 | max_seq_length = max_seq_length, 119 | tokenizer = tokenizer, 120 | args = training_arguments, 121 | packing = packing, 122 | ) 123 | 124 | class SaveBestModelCallback(TrainerCallback): 125 | def __init__(self): 126 | super().__init__() 127 | self.best_eval_loss = float('inf') 128 | self.best_model_checkpoint = None 129 | 130 | def on_log(self, args, state, control, logs=None, **kwargs): 131 | # Check if training_loss is in the logs and print it 132 | if 'loss' in logs: 133 | training_loss = logs['loss'] 134 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 135 | 136 | # def on_evaluate(self, args, state, control, **kwargs): 137 | # # Check if eval_loss is in the logs 138 | # if 'eval_loss' in state.log_history[-1]: 139 | # eval_loss = state.log_history[-1]['eval_loss'] 140 | # # Print current eval_loss with epoch and step 141 | # logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 142 | 143 | # if eval_loss < self.best_eval_loss: 144 | # self.best_eval_loss = eval_loss 145 | # # Save the best model 146 | # self.best_model_checkpoint = 
state.global_step 147 | # trainer.save_model(f"{args.output_dir}/best_model") 148 | # # Print loss of Best Model 149 | # logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 150 | 151 | # Create an instance of the callback 152 | save_best_model_callback = SaveBestModelCallback() 153 | 154 | # Training and logging 155 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 156 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 157 | logging.info(f"""[Model]: Loading {model_name}...\n""") 158 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 159 | 160 | # Add the callback to the trainer 161 | trainer.add_callback(save_best_model_callback) 162 | 163 | # Train model 164 | trainer.train() 165 | 166 | # Save trained model 167 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2MOFInfo/finetune_llms_peft_for_paragraph2MOFInfo.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | import torch 4 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 5 | 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_file = "data/data_for_llms/train_329.csv" 16 | test_file = "data/data_for_llms/test_329.csv" 17 | train_df = pd.read_csv(train_file, encoding='utf-8') 18 | test_df = pd.read_csv(test_file, encoding='utf-8') 19 | 20 | source_text = "Paragraph" 21 | target_text = "MOFInfo" 22 | 23 | instruction = f'{source_text}2{target_text}: ' 24 | instruction = '''Extract the MOF synthesis information from the Paragraph.''' 25 | 26 | train_df['text'] = f'[INST] {instruction} ' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 27 | test_df['text'] = f'[INST] {instruction} ' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 28 | 29 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 30 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 31 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 32 | print(dataset) 33 | 34 | ################################################################################ 35 | # Parameters Setting 36 | ################################################################################ 37 | # QLoRA parameters 38 | lora_r = 64 # LoRA attention dimension (8, 16, 64, larger is better) 39 | lora_alpha = 128 # Alpha parameter for LoRA scaling (lora_r*2) 40 | lora_dropout = 0.1 # Dropout probability for LoRA layers 41 | 42 | # bitsandbytes parameters 43 | use_4bit = True # Activate 4-bit precision base model loading 44 | bnb_4bit_compute_dtype = "bfloat16" # Compute dtype for 4-bit base models ("bfloat16" or "float16") 45 | bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4) 46 | use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization) 47 | 48 | # TrainingArguments parameters 49 | num_train_epochs = 20 50 | save_steps = 0 # Save checkpoint every X updates steps 51 | logging_steps = 25 # Log every X updates steps 52 | 53 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 54 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 55 | 56 | per_device_train_batch_size = 2 # Batch size per GPU for training 57 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 58 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 59 | gradient_checkpointing = True # Enable gradient checkpointing 60 | 61 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 62 | learning_rate = 1e-5 # Initial learning rate (AdamW optimizer) 63 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 64 | 65 | optim = "paged_adamw_32bit" # Optimizer to use 66 | lr_scheduler_type = "cosine" # Learning rate schedule 67 | 68 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 69 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 70 | 71 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 72 | 73 | # SFT parameters 74 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 75 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 76 | device_map = {"": 0} # Load the entire model on the GPU 0, or "auto" 77 | 78 | # Model Version 79 | model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf" # Path of the pretrained model downloaded from Hugging Face (llama-2-13b-chat-hf or Mistral-7B-Instruct-v0.2) 80 | new_model_dir = f"saved_models/llama2_13b_chat_qlora/train_{len(train_df)}_lora_r{lora_r}_lr{learning_rate}" # Fine-tuned model name 81 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 82 | 83 | ################################################################################ 84 | # Train 85 | ################################################################################ 86 | # Load tokenizer and model with QLoRA configuration 87 | compute_dtype = getattr(torch, bnb_4bit_compute_dtype) 88 | print(compute_dtype) 89 | bnb_config = BitsAndBytesConfig( 90 | load_in_4bit=use_4bit, 91 | bnb_4bit_quant_type=bnb_4bit_quant_type, 92 | 
bnb_4bit_compute_dtype=compute_dtype, 93 | bnb_4bit_use_double_quant=use_nested_quant, 94 | ) 95 | 96 | # Check GPU compatibility with bfloat16 97 | if compute_dtype == torch.float16 and use_4bit: 98 | major, _ = torch.cuda.get_device_capability() 99 | if major >= 8: 100 | print("=" * 80) 101 | print("Your GPU supports bfloat16: accelerate training with bf16=True") 102 | print("=" * 80) 103 | 104 | # Load base model 105 | model = AutoModelForCausalLM.from_pretrained( 106 | pretrained_model_name_or_path = model_name, 107 | quantization_config = bnb_config, 108 | device_map = device_map 109 | ) 110 | model.config.use_cache = False 111 | model.config.pretraining_tp = 1 112 | 113 | # Load LLaMA tokenizer 114 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 115 | tokenizer.pad_token = tokenizer.eos_token 116 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 117 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 118 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 119 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 120 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 121 | print("------------vocab_size is------------", tokenizer.vocab_size) 122 | print("------------vocab_size is------------", len(tokenizer)) 123 | 124 | # Load LoRA configuration 125 | peft_config = LoraConfig( 126 | lora_alpha=lora_alpha, 127 | lora_dropout=lora_dropout, 128 | r=lora_r, 129 | bias="none", 130 | task_type="CAUSAL_LM", 131 | ) 132 | 133 | # Set training parameters 134 | training_arguments = TrainingArguments( 135 | output_dir=output_dir, 136 | logging_dir = output_dir + "/logs/", 137 | evaluation_strategy = "epoch", 138 | save_strategy = "epoch", 139 | num_train_epochs = num_train_epochs, 140 | save_total_limit = num_train_epochs, 141 | per_device_train_batch_size = per_device_train_batch_size, 142 | gradient_accumulation_steps = gradient_accumulation_steps, 143 | # optim = optim, 144 | #save_steps=save_steps, 145 | logging_steps = logging_steps, 146 | learning_rate = learning_rate, 147 | weight_decay = weight_decay, 148 | fp16 = fp16, 149 | bf16 = bf16, 150 | max_grad_norm = max_grad_norm, 151 | max_steps = max_steps, 152 | warmup_ratio = warmup_ratio, 153 | group_by_length = group_by_length, 154 | lr_scheduler_type = lr_scheduler_type, 155 | report_to = "tensorboard" 156 | ) 157 | 158 | # Set supervised fine-tuning parameters 159 | trainer = SFTTrainer( 160 | model = model, 161 | train_dataset = dataset['train'], 162 | eval_dataset = dataset["test"], 163 | peft_config = peft_config, 164 | dataset_text_field ="text", 165 | max_seq_length = max_seq_length, 166 | tokenizer = tokenizer, 167 | args = training_arguments, 168 | packing = packing, 169 | ) 170 | 171 | class SaveBestModelCallback(TrainerCallback): 172 | def __init__(self): 173 | super().__init__() 174 | self.best_eval_loss = float('inf') 175 | self.best_model_checkpoint = None 176 | 177 | def on_log(self, args, state, control, logs=None, **kwargs): 178 | # Check if training_loss is in the logs and print it 179 | if 'loss' in logs: 180 | training_loss = logs['loss'] 181 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 182 | 183 | def on_evaluate(self, args, state, control, **kwargs): 184 | # Check if eval_loss is in the logs 185 | if 'eval_loss' in state.log_history[-1]: 186 | eval_loss = 
state.log_history[-1]['eval_loss'] 187 | # Print current eval_loss with epoch and step 188 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 189 | 190 | if eval_loss < self.best_eval_loss: 191 | self.best_eval_loss = eval_loss 192 | # Save the best model 193 | self.best_model_checkpoint = state.global_step 194 | trainer.save_model(f"{args.output_dir}/best_model") 195 | # Print loss of Best Model 196 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 197 | 198 | # Create an instance of the callback 199 | save_best_model_callback = SaveBestModelCallback() 200 | 201 | # Training and logging 202 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 203 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 204 | logging.info(f"""[Model]: Loading {model_name}...\n""") 205 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 206 | 207 | # Add the callback to the trainer 208 | trainer.add_callback(save_best_model_callback) 209 | 210 | # Train model 211 | trainer.train() 212 | 213 | # Save trained model 214 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2MOFInfo/results/README.md: -------------------------------------------------------------------------------- 1 | Save the generated outputs by different models. -------------------------------------------------------------------------------- /Paragraph2NMR/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Data for Paragraph2NMR 2 | 3 | - Data for fine-tuning LLMs are in ```data/data_for_llms```. 4 | 5 | - Data for fine-tuning bart or T5 are in ```data/data_for_bart_or_t5```. (preprocessed beacuse the 512 length limitation, split into 7 prefix tasks ) 6 | 7 | ## 2. Methods for Paragraph2NMR 8 | 9 | ### Prompt Engineering ChatGPT (GPT-4, GPT-3.5-Turbo) 10 | 11 | See in ```prompt_chatgpt_for_paragraph2NMR.ipynb``` 12 | 13 | ### Fine-tuning ChatGPT (GPT-3.5-Turbo) 14 | 15 | See in ```finetune_chatgpt_for_paragraph2NMR.ipynb``` 16 | 17 | ### Full Parameter Fine-tuning Open-source Large Language Models (Llama3, Llama2, Mistral) 18 | 19 | Training Code in ```finetune_llms_full_for_paragraph2NMR.py``` 20 | 21 | Inferencing Code in ```vllm_inference_full_finetuned_llms.ipynb``` 22 | 23 | ### Parameter Efficient Fine-tuning (PEFT) Open-source Large Language Models (Llama3, Llama2, Mistral) 24 | 25 | Training Code in ```finetune_llms_peft_for_paragraph2NMR.py``` 26 | 27 | Inferencing Code in ```vllm_inference_peft_finetuned_llms.ipynb``` 28 | 29 | ### Fine-tuning Language Models (T5, Bart) 30 | 31 | See in ```finetune_bart_or_t5_for_paragraph2NMR.py``` 32 | 33 | ## 3. 
Evaluating the results of Paragraph2NMR 34 | 35 | All predictions will be saved in ```results/predictions``` 36 | 37 | Evalutating codes for LLMs (ChatGPT, Llama, Mistral) are in ```evaluate_llms_Paragraph2NMR.ipynb``` 38 | 39 | Evalutating codes for LM (Bart or T5) are in ```evaluate_bart_or_t5_Paragraph2NMR.ipynb``` 40 | -------------------------------------------------------------------------------- /Paragraph2NMR/finetune_bart_or_t5_for_paragraph2NMR.py: -------------------------------------------------------------------------------- 1 | # Finetuning Bart or T5 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "5" 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import logging 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback 10 | from datasets import load_metric, Dataset, DatasetDict 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Configuration 15 | class CFG: 16 | # Model Configuration 17 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 18 | num_train_epochs = 50 19 | save_total_limit = 50 20 | batch_size = 8 21 | learning_rate = 1e-5 22 | max_input_length = 512 23 | max_target_length = 512 24 | weight_decay = 0.01 25 | save_strategy = "epoch" 26 | evaluation_strategy = "epoch" 27 | interval_eval_epoch = 1 # the number of interval epochs to evaluate (inference) 28 | model_name = "t5-base" # "bart-base" or "t5-base" 29 | pretrained_dir = "/home/zhangwei/pretrained_models/" # Path of the pretrained model downloaded from Hugging Face 30 | saved_models_dir = f"saved_models/{model_name}/train_200_{model_name}_lr_{learning_rate}_epoch_{num_train_epochs}_bs_{batch_size}_intervel_{interval_eval_epoch}/" 31 | output_dir = f"results/predictions/{saved_models_dir}" 32 | 33 | # Data Configuration 34 | train_file = "data/data_for_bart_or_t5/train_200_one_column_lstrip_add_space.csv" 35 | test_file = "data/data_for_bart_or_t5/test_300_one_column_lstrip_add_space.csv" 36 | source_text_column = "Paragraph" 37 | target_text_column = "output" 38 | 39 | def load_data(): 40 | train_df = pd.read_csv(CFG.train_file) 41 | test_df = pd.read_csv(CFG.test_file) 42 | train_dataset = Dataset.from_dict(train_df.astype(str)) 43 | test_dataset = Dataset.from_dict(test_df.astype(str)) 44 | datasets = DatasetDict({"train": train_dataset, "test": test_dataset}) 45 | print(datasets) 46 | return datasets 47 | 48 | def tokenize_and_encode(tokenizer, datasets): 49 | def tokenize_function(examples): 50 | model_inputs = tokenizer(examples[CFG.source_text_column], max_length=CFG.max_input_length, truncation=True) 51 | model_labels = tokenizer(examples[CFG.target_text_column], max_length=CFG.max_target_length, truncation=True) 52 | model_inputs["labels"] = model_labels["input_ids"] 53 | return model_inputs 54 | return datasets.map(tokenize_function, batched=True) 55 | 56 | def logging_config(): 57 | logging.info("Configuration Details:") 58 | for attr in dir(CFG): 59 | # Filter out private attributes and methods 60 | if not attr.startswith("__") and not callable(getattr(CFG, attr)): 61 | logging.info(f"{attr}: {getattr(CFG, attr)}") 62 | 63 | # Custom Callback 64 | class CustomCallback(TrainerCallback): 65 | def __init__(self, trainer) -> None: 66 | super().__init__() 67 | self._trainer = trainer 68 | 69 | def on_log(self, args, state, control, logs=None, **kwargs): 70 | if 'loss' in logs: 71 | training_loss = logs['loss'] 72 | 
logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 73 | 74 | if 'eval_loss' in state.log_history[-1]: 75 | eval_loss = state.log_history[-1]['eval_loss'] 76 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 77 | 78 | def on_epoch_end(self, args, state, control, **kwargs): 79 | logging.info("Saving inference results for test_set...") 80 | output = self._trainer.predict(self._trainer.eval_dataset) 81 | epoch = int(state.epoch) 82 | 83 | if epoch % CFG.interval_eval_epoch == 0 : 84 | # Decode generated summaries into text 85 | decoded_ids = output.predictions 86 | 87 | # Replace -100 in the labels as we can't decode them 88 | decoded_ids = np.where(decoded_ids != -100, decoded_ids, tokenizer.pad_token_id) 89 | decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=True) 90 | paragraphs = [i[CFG.source_text_column] for i in self._trainer.eval_dataset] 91 | ground_truth = [i[CFG.target_text_column] for i in self._trainer.eval_dataset] 92 | prediction = [decoded_text for decoded_text in decoded_texts] 93 | 94 | # Save predictions to csv 95 | predicted_df = pd.DataFrame() 96 | predicted_df['Paragraph'] = paragraphs 97 | predicted_df['Generated Text'] = prediction 98 | predicted_df['Actual Text'] = ground_truth 99 | predicted_df.to_csv(f"{CFG.output_dir}/epoch_{epoch}.csv", index = None) 100 | 101 | def main(): 102 | # mkdir needed folders 103 | if not os.path.exists(CFG.saved_models_dir): 104 | os.makedirs(CFG.saved_models_dir) 105 | if not os.path.exists(CFG.output_dir): 106 | os.makedirs(CFG.output_dir) 107 | 108 | # Setup logging 109 | logging.basicConfig(filename = CFG.saved_models_dir+'/training.log', level = logging.INFO) 110 | logging_config() 111 | 112 | # Loading Tokenizer and Model 113 | print("Loading Tokenizer and Model ...") 114 | logging.info(f"[Device]: {CFG.device}...") 115 | logging.info(f"[Model]: Loading {CFG.model_name}...") 116 | global tokenizer 117 | tokenizer = AutoTokenizer.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}") 118 | model = AutoModelForSeq2SeqLM.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}").to(CFG.device) 119 | 120 | # Loading Data 121 | print("Loading Data ...") 122 | datasets = load_data() 123 | 124 | # Preparing Data 125 | print("Preparing Data ...") 126 | logging.info(f"[Dataset]:\n{datasets}") 127 | tokenized_datasets = tokenize_and_encode(tokenizer, datasets) 128 | data_collator = DataCollatorForSeq2Seq(tokenizer, model = model) 129 | 130 | # Training Arguments 131 | args = Seq2SeqTrainingArguments( 132 | output_dir = CFG.saved_models_dir, 133 | logging_dir = CFG.saved_models_dir + "logs/", 134 | evaluation_strategy = CFG.evaluation_strategy, 135 | learning_rate = CFG.learning_rate, 136 | per_device_train_batch_size = CFG.batch_size, 137 | per_device_eval_batch_size = CFG.batch_size, 138 | weight_decay = CFG.weight_decay, 139 | generation_max_length = CFG.max_target_length, 140 | save_strategy = CFG.save_strategy, 141 | num_train_epochs = CFG.num_train_epochs, 142 | save_total_limit = CFG.save_total_limit, 143 | predict_with_generate = True, 144 | logging_steps = len(tokenized_datasets["train"]) // CFG.batch_size, 145 | push_to_hub = False, 146 | report_to = "tensorboard") 147 | 148 | # Trainer 149 | trainer = Seq2SeqTrainer( 150 | model, 151 | args, 152 | train_dataset = tokenized_datasets["train"], 153 | eval_dataset = tokenized_datasets["test"], 154 | data_collator = data_collator, 155 | tokenizer = 
tokenizer) 156 | 157 | # Training and logging 158 | print("Training ...") 159 | trainer.add_callback(CustomCallback(trainer)) 160 | trainer.train() 161 | 162 | if __name__ == "__main__": 163 | main() -------------------------------------------------------------------------------- /Paragraph2NMR/finetune_llms_full_for_paragraph2NMR.py: -------------------------------------------------------------------------------- 1 | import os 2 | # Setup environment (full fine-tuning 7b needs 160GB Memory) 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" 4 | 5 | import torch 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_file = "data/data_for_llms/train/train_200_data_in_300.csv" 16 | test_file = "data/data_for_llms/test/test_300.csv" 17 | train_df = pd.read_csv(train_file, encoding='utf-8') 18 | test_df = pd.read_csv(test_file, encoding='utf-8') 19 | 20 | def create_assistant_message(row): 21 | return f"""{{\"IUPAC\":\"{row['IUPAC']}\",\"1H NMR text\":\"{row['1H NMR text']}\",\"1H NMR conditions\":\"{row['1H NMR conditions']}\",\"1H NMR data\":\"{row['1H NMR data']}\",\"13C NMR text\":\"{row['13C NMR text']}\",\"13C NMR conditions\":\"{row['13C NMR conditions']}\",\"13C NMR data\":\"{row['13C NMR data']}\"}}""" 22 | 23 | train_df['NMRInfo'] = train_df.apply(create_assistant_message, axis=1) 24 | test_df['NMRInfo'] = test_df.apply(create_assistant_message, axis=1) 25 | 26 | source_text = "Paragraph" 27 | target_text = "NMRInfo" 28 | instruction = f'{source_text}2{target_text}: ' 29 | instruction = '''Extract the NMR information from the Paragraph: ''' 30 | # instruction = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The contents in the NMR data are only numbers, such as "13C NMR data": "174.0, 157.7", "1H NMR data": "174.0, 157.7". All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.''' 31 | 32 | # Old prompt template (for Mistral) 33 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 34 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 35 | 36 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 37 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 38 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 39 | print(dataset) 40 | 41 | # TrainingArguments parameters 42 | num_train_epochs = 20 43 | save_steps = 0 # Save checkpoint every X updates steps 44 | logging_steps = 25 # Log every X updates steps 45 | 46 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 47 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 48 | 49 | per_device_train_batch_size = 2 # Batch size per GPU for training 50 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 51 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 52 | gradient_checkpointing = True # Enable gradient checkpointing 53 | 54 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 55 | learning_rate = 5e-6 # Initial learning rate (AdamW optimizer, 1e-5 or 5e-6 or 1e-4) 56 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 57 | 58 | optim = "paged_adamw_32bit" # Optimizer to use 59 | lr_scheduler_type = "cosine" # Learning rate schedule 60 | 61 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 62 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 63 | 64 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 65 | 66 | # SFT parameters 67 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 68 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 69 | device_map = "auto" # Load the entire model on the GPU 0, or "auto" 70 | 71 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
72 | model_name = "/home/zhangwei/pretrained_models/Mistral-7B-Instruct-v0.2" # Path of the pretrained model downloaded from Hugging Face 73 | new_model_dir = f"saved_models/Mistral-7B-Instruct-v0.2/train_{len(train_df)}_without_prompt_lr{learning_rate}_bs{per_device_train_batch_size}" # Fine-tuned model name 74 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 75 | 76 | # Load base model 77 | model = AutoModelForCausalLM.from_pretrained( 78 | pretrained_model_name_or_path = model_name, 79 | torch_dtype = torch.bfloat16, 80 | device_map = device_map 81 | ) 82 | 83 | # Load tokenizer 84 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 85 | tokenizer.pad_token = tokenizer.eos_token 86 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 87 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 88 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 89 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 90 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 91 | print("------------vocab_size is------------", tokenizer.vocab_size) 92 | print("------------vocab_size is------------", len(tokenizer)) 93 | 94 | # Set training parameters 95 | training_arguments = TrainingArguments( 96 | output_dir = output_dir, 97 | logging_dir = output_dir + "/logs/", 98 | # evaluation_strategy = "epoch", 99 | save_strategy = "epoch", 100 | num_train_epochs = num_train_epochs, 101 | save_total_limit = num_train_epochs, 102 | per_device_train_batch_size = per_device_train_batch_size, 103 | gradient_accumulation_steps = gradient_accumulation_steps, 104 | # optim = optim, 105 | #save_steps=save_steps, 106 | logging_steps = logging_steps, 107 | learning_rate = learning_rate, 108 | weight_decay = weight_decay, 109 | fp16 = fp16, 110 | bf16 = bf16, 111 | max_grad_norm = max_grad_norm, 112 | max_steps = max_steps, 113 | warmup_ratio = warmup_ratio, 114 | group_by_length = group_by_length, 115 | lr_scheduler_type = lr_scheduler_type, 116 | report_to = "tensorboard", 117 | ) 118 | 119 | # Set supervised fine-tuning parameters 120 | trainer = SFTTrainer( 121 | model = model, 122 | train_dataset = dataset['train'], 123 | eval_dataset = dataset["test"], 124 | dataset_text_field = "text", 125 | max_seq_length = max_seq_length, 126 | tokenizer = tokenizer, 127 | args = training_arguments, 128 | packing = packing, 129 | ) 130 | 131 | class SaveBestModelCallback(TrainerCallback): 132 | def __init__(self): 133 | super().__init__() 134 | self.best_eval_loss = float('inf') 135 | self.best_model_checkpoint = None 136 | 137 | def on_log(self, args, state, control, logs=None, **kwargs): 138 | # Check if training_loss is in the logs and print it 139 | if 'loss' in logs: 140 | training_loss = logs['loss'] 141 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 142 | 143 | # def on_evaluate(self, args, state, control, **kwargs): 144 | # # Check if eval_loss is in the logs 145 | # if 'eval_loss' in state.log_history[-1]: 146 | # eval_loss = state.log_history[-1]['eval_loss'] 147 | # # Print current eval_loss with epoch and step 148 | # logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 149 | 150 | # if eval_loss < self.best_eval_loss: 151 | # self.best_eval_loss = eval_loss 152 | # # Save the best model 153 | # 
self.best_model_checkpoint = state.global_step 154 | # trainer.save_model(f"{args.output_dir}/best_model") 155 | # # Print loss of Best Model 156 | # logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 157 | 158 | # Create an instance of the callback 159 | save_best_model_callback = SaveBestModelCallback() 160 | 161 | # Training and logging 162 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 163 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 164 | logging.info(f"""[Model]: Loading {model_name}...\n""") 165 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 166 | 167 | # Add the callback to the trainer 168 | trainer.add_callback(save_best_model_callback) 169 | 170 | # Train model 171 | trainer.train() 172 | 173 | # Save trained model 174 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2NMR/results/README.md: -------------------------------------------------------------------------------- 1 | Save the generated outputs by different models. -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Prod/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Data for Paragraph2Prod 2 | 3 | All Data are in ```data/prod```. 4 | 5 | ## 2. Methods for Paragraph2Comound 6 | 7 | ### Prompt Engineering ChatGPT (GPT-4, GPT-3.5-Turbo) 8 | 9 | See in ```prompt_chatgpt_for_paragraph2prod.ipynb``` 10 | 11 | ### Fine-tuning ChatGPT (GPT-3.5-Turbo) 12 | 13 | See in ```finetune_chatgpt_for_paragraph2prod.ipynb``` 14 | 15 | ### Full Parameter Fine-tuning Open-source Large Language Models (Mistral, Llama3, Llama2) 16 | 17 | Training Code in ```finetune_llms_full_for_paragraph2prod.py``` 18 | 19 | Inferencing Code in ```vllm_inference_full_finetuned_llms.ipynb``` 20 | 21 | ### Parameter Efficient Fine-tuning (PEFT) Open-source Large Language Models (Mistral, Llama3, Llama2) 22 | 23 | Training Code in ```finetune_llms_peft_for_paragraph2prod.py``` 24 | 25 | Inferencing Code in ```vllm_inference_peft_finetuned_llms.ipynb``` 26 | 27 | ### Fine-tuning Language Models (T5, Bart) 28 | 29 | See in ```finetune_bart_or_t5_for_paragraph2prod.py``` 30 | 31 | ## 3. 
Evaluating the results of Paragraph2Comound 32 | 33 | All predictions will be saved in ```results/predictions``` 34 | 35 | Evaluating codes are in ```evaluate_Paragraph2Prod.ipynb``` -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Prod/finetune_bart_or_t5_for_paragraph2prod.py: -------------------------------------------------------------------------------- 1 | # Finetuning Bart or T5 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import logging 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback 10 | from datasets import load_metric, Dataset, DatasetDict 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Configuration 15 | class CFG: 16 | # Model Configuration 17 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 18 | num_train_epochs = 50 19 | save_total_limit = 50 20 | batch_size = 8 21 | learning_rate = 1e-5 # 1e-4 or 1e-5 22 | max_input_length = 512 23 | max_target_length = 512 24 | weight_decay = 0.01 25 | save_strategy = "epoch" 26 | evaluation_strategy = "epoch" 27 | interval_eval_epoch = 1 # the number of interval epochs to evaluate (inference) 28 | model_name = "bart-base" # "bart-base" or "t5-base" 29 | task_name = "paragraph2prod" 30 | pretrained_dir = "/home/zhangwei/pretrained_models/" # Path of the pretrained model downloaded from Hugging Face 31 | saved_models_dir = f"saved_models/{model_name}/{model_name}_lr_{learning_rate}_epoch_{num_train_epochs}_bs_{batch_size}_intervel_{interval_eval_epoch}/" 32 | output_dir = f"results/predictions/{saved_models_dir}" 33 | 34 | # Data Configuration 35 | train_file = "data/prod/train.csv" 36 | test_file = "data/prod/test.csv" 37 | source_text_column = "input" 38 | target_text_column = "output" 39 | 40 | def load_data(): 41 | train_df = pd.read_csv(CFG.train_file) 42 | test_df = pd.read_csv(CFG.test_file) 43 | train_dataset = Dataset.from_dict(train_df.astype(str)) 44 | test_dataset = Dataset.from_dict(test_df.astype(str)) 45 | datasets = DatasetDict({"train": train_dataset, "test": test_dataset}) 46 | print(datasets) 47 | return datasets 48 | 49 | def tokenize_and_encode(tokenizer, datasets): 50 | def tokenize_function(examples): 51 | model_inputs = tokenizer(examples[CFG.source_text_column], max_length=CFG.max_input_length, truncation=True) 52 | model_labels = tokenizer(examples[CFG.target_text_column], max_length=CFG.max_target_length, truncation=True) 53 | model_inputs["labels"] = model_labels["input_ids"] 54 | return model_inputs 55 | return datasets.map(tokenize_function, batched=True) 56 | 57 | def logging_config(): 58 | logging.info("Configuration Details:") 59 | for attr in dir(CFG): 60 | # Filter out private attributes and methods 61 | if not attr.startswith("__") and not callable(getattr(CFG, attr)): 62 | logging.info(f"{attr}: {getattr(CFG, attr)}") 63 | 64 | # Custom Callback 65 | class CustomCallback(TrainerCallback): 66 | def __init__(self, trainer) -> None: 67 | super().__init__() 68 | self._trainer = trainer 69 | 70 | def on_log(self, args, state, control, logs=None, **kwargs): 71 | if 'loss' in logs: 72 | training_loss = logs['loss'] 73 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 74 | 75 | if 'eval_loss' in state.log_history[-1]: 76 | eval_loss = 
state.log_history[-1]['eval_loss'] 77 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 78 | 79 | def on_epoch_end(self, args, state, control, **kwargs): 80 | logging.info("Saving inference results for test_set...") 81 | output = self._trainer.predict(self._trainer.eval_dataset) 82 | epoch = int(state.epoch) 83 | 84 | if epoch % CFG.interval_eval_epoch == 0 : 85 | # Decode generated summaries into text 86 | decoded_ids = output.predictions 87 | 88 | # Replace -100 in the labels as we can't decode them 89 | decoded_ids = np.where(decoded_ids != -100, decoded_ids, tokenizer.pad_token_id) 90 | decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=True) 91 | paragraphs = [i[CFG.source_text_column] for i in self._trainer.eval_dataset] 92 | ground_truth = [i[CFG.target_text_column] for i in self._trainer.eval_dataset] 93 | ground_label = [i['bio_label'] for i in self._trainer.eval_dataset] 94 | prediction = [decoded_text for decoded_text in decoded_texts] 95 | 96 | # Save predictions to csv 97 | predicted_df = pd.DataFrame() 98 | predicted_df['Paragraph'] = paragraphs 99 | predicted_df['Generated Text'] = prediction 100 | predicted_df['Actual Text'] = ground_truth 101 | predicted_df['BIO Label'] = ground_label 102 | predicted_df.to_csv(f"{CFG.output_dir}/epoch_{epoch}.csv", index = None) 103 | 104 | def main(): 105 | # mkdir needed folders 106 | if not os.path.exists(CFG.saved_models_dir): 107 | os.makedirs(CFG.saved_models_dir) 108 | if not os.path.exists(CFG.output_dir): 109 | os.makedirs(CFG.output_dir) 110 | 111 | # Setup logging 112 | logging.basicConfig(filename = CFG.saved_models_dir+'/training.log', level = logging.INFO) 113 | logging_config() 114 | 115 | # Loading Tokenizer and Model 116 | print("Loading Tokenizer and Model ...") 117 | logging.info(f"[Device]: {CFG.device}...") 118 | logging.info(f"[Model]: Loading {CFG.model_name}...") 119 | global tokenizer 120 | tokenizer = AutoTokenizer.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}") 121 | model = AutoModelForSeq2SeqLM.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}").to(CFG.device) 122 | 123 | # Loading Data 124 | print("Loading Data ...") 125 | datasets = load_data() 126 | 127 | # Preparing Data 128 | print("Preparing Data ...") 129 | logging.info(f"[Dataset]:\n{datasets}") 130 | tokenized_datasets = tokenize_and_encode(tokenizer, datasets) 131 | data_collator = DataCollatorForSeq2Seq(tokenizer, model = model) 132 | 133 | # Training Arguments 134 | args = Seq2SeqTrainingArguments( 135 | output_dir = CFG.saved_models_dir, 136 | logging_dir = CFG.saved_models_dir + "logs/", 137 | evaluation_strategy = CFG.evaluation_strategy, 138 | learning_rate = CFG.learning_rate, 139 | per_device_train_batch_size = CFG.batch_size, 140 | per_device_eval_batch_size = CFG.batch_size, 141 | weight_decay = CFG.weight_decay, 142 | generation_max_length = CFG.max_target_length, 143 | save_strategy = CFG.save_strategy, 144 | num_train_epochs = CFG.num_train_epochs, 145 | save_total_limit = CFG.save_total_limit, 146 | predict_with_generate = True, 147 | logging_steps = len(tokenized_datasets["train"]) // CFG.batch_size, 148 | push_to_hub = False, 149 | report_to = "tensorboard") 150 | 151 | # Trainer 152 | trainer = Seq2SeqTrainer( 153 | model, 154 | args, 155 | train_dataset = tokenized_datasets["train"], 156 | eval_dataset = tokenized_datasets["test"], 157 | data_collator = data_collator, 158 | tokenizer = tokenizer) 159 | 160 | # Training and logging 161 | 
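# NOTE (illustrative, hedged; not part of the original script): report_to = "tensorboard"
# below writes event files under CFG.saved_models_dir + "logs/"; they can be inspected with,
# for example:
#   tensorboard --logdir saved_models/bart-base   # path indicative only
# Per-epoch predictions (including the 'BIO Label' column, presumably consumed by
# evaluate_Paragraph2Prod.ipynb) are written by the CustomCallback to CFG.output_dir.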
print("Training ...") 162 | trainer.add_callback(CustomCallback(trainer)) 163 | trainer.train() 164 | 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Prod/finetune_llms_full_for_paragraph2prod.py: -------------------------------------------------------------------------------- 1 | import os 2 | # Setup environment (full fine-tuning 7b needs 160GB Memory) 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7" 4 | 5 | import torch 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_df = pd.read_csv("data/prod/train.csv") 16 | test_df = pd.read_csv("data/prod/test.csv") 17 | source_text = "input" 18 | target_text = "output" 19 | instruction = f"{source_text}2{target_text}: " 20 | instruction = "annotate the products in the paragraph. " 21 | 22 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 23 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! " 24 | 25 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 26 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 27 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 28 | print(dataset) 29 | 30 | # TrainingArguments parameters 31 | num_train_epochs = 20 32 | save_steps = 0 # Save checkpoint every X updates steps 33 | logging_steps = 25 # Log every X updates steps 34 | 35 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 36 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 37 | 38 | per_device_train_batch_size = 2 # Batch size per GPU for training 39 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 40 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 41 | gradient_checkpointing = True # Enable gradient checkpointing 42 | 43 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 44 | learning_rate = 5e-6 # Initial learning rate (AdamW optimizer, 1e-5 or 5e-6 or 1e-4) 45 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 46 | 47 | optim = "paged_adamw_32bit" # Optimizer to use 48 | lr_scheduler_type = "cosine" # Learning rate schedule 49 | 50 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 51 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 52 | 53 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 54 | 55 | # SFT parameters 56 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 57 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 58 | device_map = "auto" # Load the entire model on the GPU 0, or "auto" 59 | 60 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
61 | model_name = "/home/zhangwei/pretrained_models/Meta-Llama-3-8B-Instruct" # Path of the pretrained model downloaded from Hugging Face 62 | new_model_dir = f"saved_models/Meta-Llama-3-8B-Instruct/train_{len(train_df)}_lr{learning_rate}_bs{per_device_train_batch_size}" # Fine-tuned model name 63 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 64 | 65 | # Load base model 66 | model = AutoModelForCausalLM.from_pretrained( 67 | pretrained_model_name_or_path = model_name, 68 | torch_dtype = torch.bfloat16, 69 | device_map = device_map 70 | ) 71 | 72 | # Load tokenizer 73 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 74 | tokenizer.pad_token = tokenizer.eos_token 75 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 76 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 77 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 78 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 79 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 80 | print("------------vocab_size is------------", tokenizer.vocab_size) 81 | print("------------vocab_size is------------", len(tokenizer)) 82 | 83 | # Set training parameters 84 | training_arguments = TrainingArguments( 85 | output_dir = output_dir, 86 | logging_dir = output_dir + "/logs/", 87 | evaluation_strategy = "epoch", 88 | save_strategy = "epoch", 89 | num_train_epochs = num_train_epochs, 90 | save_total_limit = num_train_epochs, 91 | per_device_train_batch_size = per_device_train_batch_size, 92 | gradient_accumulation_steps = gradient_accumulation_steps, 93 | # optim = optim, 94 | #save_steps=save_steps, 95 | logging_steps = logging_steps, 96 | learning_rate = learning_rate, 97 | weight_decay = weight_decay, 98 | fp16 = fp16, 99 | bf16 = bf16, 100 | max_grad_norm = max_grad_norm, 101 | max_steps = max_steps, 102 | warmup_ratio = warmup_ratio, 103 | group_by_length = group_by_length, 104 | lr_scheduler_type = lr_scheduler_type, 105 | report_to = "tensorboard" 106 | ) 107 | 108 | # Set supervised fine-tuning parameters 109 | trainer = SFTTrainer( 110 | model = model, 111 | train_dataset = dataset['train'], 112 | eval_dataset = dataset["test"], 113 | dataset_text_field = "text", 114 | max_seq_length = max_seq_length, 115 | tokenizer = tokenizer, 116 | args = training_arguments, 117 | packing = packing, 118 | ) 119 | 120 | class SaveBestModelCallback(TrainerCallback): 121 | def __init__(self): 122 | super().__init__() 123 | self.best_eval_loss = float('inf') 124 | self.best_model_checkpoint = None 125 | 126 | def on_log(self, args, state, control, logs=None, **kwargs): 127 | # Check if training_loss is in the logs and print it 128 | if 'loss' in logs: 129 | training_loss = logs['loss'] 130 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 131 | 132 | def on_evaluate(self, args, state, control, **kwargs): 133 | # Check if eval_loss is in the logs 134 | if 'eval_loss' in state.log_history[-1]: 135 | eval_loss = state.log_history[-1]['eval_loss'] 136 | # Print current eval_loss with epoch and step 137 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 138 | 139 | if eval_loss < self.best_eval_loss: 140 | self.best_eval_loss = eval_loss 141 | # Save the best model 142 | self.best_model_checkpoint = state.global_step 143 | 
trainer.save_model(f"{args.output_dir}/best_model") 144 | # Print loss of Best Model 145 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 146 | 147 | # Create an instance of the callback 148 | save_best_model_callback = SaveBestModelCallback() 149 | 150 | # Training and logging 151 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 152 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 153 | logging.info(f"""[Model]: Loading {model_name}...\n""") 154 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 155 | 156 | # Add the callback to the trainer 157 | trainer.add_callback(save_best_model_callback) 158 | 159 | # Train model 160 | trainer.train() 161 | 162 | # Save trained model 163 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Prod/finetune_llms_peft_for_paragraph2prod.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 3 | import torch 4 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 5 | 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_df = pd.read_csv("data/prod/train.csv") 16 | test_df = pd.read_csv("data/prod/test.csv") 17 | source_text = "input" 18 | target_text = "output" 19 | instruction = f"{source_text}2{target_text}: " 20 | instruction = "annotate the products in the paragraph. " 21 | 22 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 23 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 24 | 25 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 26 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 27 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 28 | print(dataset) 29 | 30 | ################################################################################ 31 | # Parameters Setting 32 | ################################################################################ 33 | # QLoRA parameters 34 | lora_r = 64 # LoRA attention dimension (8, 16, 64, larger is better) 35 | lora_alpha = 128 # Alpha parameter for LoRA scaling (lora_r*2) 36 | lora_dropout = 0.1 # Dropout probability for LoRA layers 37 | 38 | # bitsandbytes parameters 39 | use_4bit = True # Activate 4-bit precision base model loading 40 | bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models 41 | bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4) 42 | use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization) 43 | 44 | # TrainingArguments parameters 45 | num_train_epochs = 20 46 | save_steps = 0 # Save checkpoint every X updates steps 47 | logging_steps = 25 # Log every X updates steps 48 | 49 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 50 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 51 | 52 | per_device_train_batch_size = 2 # Batch size per GPU for training 53 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 54 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 55 | gradient_checkpointing = True # Enable gradient checkpointing 56 | 57 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 58 | learning_rate = 1e-5 # Initial learning rate (AdamW optimizer) 59 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 60 | 61 | optim = "paged_adamw_32bit" # Optimizer to use 62 | lr_scheduler_type = "cosine" # Learning rate schedule 63 | 64 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 65 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 66 | 67 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 68 | 69 | # SFT parameters 70 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 71 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 72 | device_map = {"": 0} # Load the entire model on the GPU 0, or "auto" 73 | 74 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
75 | model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf" # Path of the pretrained model downloaded from Hugging Face 76 | new_model_dir = f"saved_models/llama2_13b_chat_qlora/train_{len(train_df)}_lora_r{lora_r}_lr{learning_rate}" # Fine-tuned model name 77 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 78 | 79 | ################################################################################ 80 | # Train 81 | ################################################################################ 82 | # Load tokenizer and model with QLoRA configuration 83 | compute_dtype = getattr(torch, bnb_4bit_compute_dtype) 84 | bnb_config = BitsAndBytesConfig( 85 | load_in_4bit=use_4bit, 86 | bnb_4bit_quant_type=bnb_4bit_quant_type, 87 | bnb_4bit_compute_dtype=compute_dtype, 88 | bnb_4bit_use_double_quant=use_nested_quant, 89 | ) 90 | 91 | # Check GPU compatibility with bfloat16 92 | if compute_dtype == torch.float16 and use_4bit: 93 | major, _ = torch.cuda.get_device_capability() 94 | if major >= 8: 95 | print("=" * 80) 96 | print("Your GPU supports bfloat16: accelerate training with bf16=True") 97 | print("=" * 80) 98 | 99 | # Load base model 100 | model = AutoModelForCausalLM.from_pretrained( 101 | pretrained_model_name_or_path = model_name, 102 | quantization_config = bnb_config, 103 | device_map = device_map 104 | ) 105 | model.config.use_cache = False 106 | model.config.pretraining_tp = 1 107 | 108 | # Load LLaMA tokenizer 109 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 110 | tokenizer.pad_token = tokenizer.eos_token 111 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 112 | 113 | # Load LoRA configuration 114 | peft_config = LoraConfig( 115 | lora_alpha=lora_alpha, 116 | lora_dropout=lora_dropout, 117 | r=lora_r, 118 | bias="none", 119 | task_type="CAUSAL_LM", 120 | ) 121 | 122 | # Set training parameters 123 | training_arguments = TrainingArguments( 124 | output_dir=output_dir, 125 | logging_dir = output_dir + "/logs/", 126 | evaluation_strategy = "epoch", 127 | save_strategy = "epoch", 128 | num_train_epochs = num_train_epochs, 129 | save_total_limit = num_train_epochs, 130 | per_device_train_batch_size = per_device_train_batch_size, 131 | gradient_accumulation_steps = gradient_accumulation_steps, 132 | optim = optim, 133 | #save_steps=save_steps, 134 | logging_steps = logging_steps, 135 | learning_rate = learning_rate, 136 | weight_decay = weight_decay, 137 | fp16 = fp16, 138 | bf16 = bf16, 139 | max_grad_norm = max_grad_norm, 140 | max_steps = max_steps, 141 | warmup_ratio = warmup_ratio, 142 | group_by_length = group_by_length, 143 | lr_scheduler_type = lr_scheduler_type, 144 | report_to = "tensorboard" 145 | ) 146 | 147 | # Set supervised fine-tuning parameters 148 | trainer = SFTTrainer( 149 | model = model, 150 | train_dataset = dataset['train'], 151 | eval_dataset = dataset["test"], 152 | peft_config = peft_config, 153 | dataset_text_field ="text", 154 | max_seq_length = max_seq_length, 155 | tokenizer = tokenizer, 156 | args = training_arguments, 157 | packing = packing, 158 | ) 159 | 160 | class SaveBestModelCallback(TrainerCallback): 161 | def __init__(self): 162 | super().__init__() 163 | self.best_eval_loss = float('inf') 164 | self.best_model_checkpoint = None 165 | 166 | def on_log(self, args, state, control, logs=None, **kwargs): 167 | # Check if training_loss is in the logs and print it 168 | if 'loss' in logs: 169 | training_loss = 
logs['loss'] 170 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 171 | 172 | def on_evaluate(self, args, state, control, **kwargs): 173 | # Check if eval_loss is in the logs 174 | if 'eval_loss' in state.log_history[-1]: 175 | eval_loss = state.log_history[-1]['eval_loss'] 176 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") # Print current eval_loss with epoch and step 177 | 178 | if eval_loss < self.best_eval_loss: 179 | self.best_eval_loss = eval_loss 180 | # Save the best model 181 | self.best_model_checkpoint = state.global_step 182 | trainer.save_model(f"{args.output_dir}/best_model") 183 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") # Print loss of Best Model 184 | 185 | # Create an instance of the callback 186 | save_best_model_callback = SaveBestModelCallback() 187 | 188 | # Training and logging 189 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 190 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 191 | logging.info(f"""[Model]: Loading {model_name}...\n""") 192 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 193 | 194 | # Add the callback to the trainer 195 | trainer.add_callback(save_best_model_callback) 196 | 197 | # Train model 198 | trainer.train() 199 | 200 | # Save trained model 201 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Prod/results/README.md: -------------------------------------------------------------------------------- 1 | Save the generated outputs by different models. -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Prod/vllm_inference_full_finetuned_llms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Import Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import torch\n", 18 | "import pandas as pd\n", 19 | "from vllm import LLM, SamplingParams\n", 20 | "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments\n", 21 | "from peft import LoraConfig, PeftModel\n", 22 | "\n", 23 | "# Setup environment\n", 24 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Load Fine-Tuned Model" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "new_model_name = \"saved_models/Mistral-7B-Instruct-v0.2/train_6163_lr5e-06_bs2/checkpoint-3082\" # Fine-tuned model name\n", 41 | "sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens = 3072, stop = ['!!!'])\n", 42 | "llm = LLM(model = new_model_name, tensor_parallel_size=1)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "### Preprocess Data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "['[INST] annotate the products in the paragraph. 
The additional ring which arises from an intramolecular HDA reaction may be useful for the synthesis of triquinanes or other polycyclic compounds. [/INST]',\n", 61 | " '[INST] annotate the products in the paragraph. The decrease in entropy associated with tethering the two reactive components suggests that the reaction would be significantly more facile than the intermolecular reaction.25 However , this potential rate enhancement is com- promised by the dramatic decrease in rate associated with intermolecular cycloadditions with substituted norbomadienes as described previously. [/INST]',\n", 62 | " '[INST] annotate the products in the paragraph. There were no reported examples of successful intramolecular HDA reactions in the literature prior to 1992.5d,26 In an intramolecular reaction , there are two possible modes of [ 2n + 227 + 2271 cycloaddition which have to be considered ( Scheme 3 ). [/INST]',\n", 63 | " '[INST] annotate the products in the paragraph. The dienophile in the tether can cyclize on Ca- Cb-CC to give a cycloadduct of type I or it can cyclize on Cd- Ce-Cf to give a type I1 cycloadduct. [/INST]',\n", 64 | " '[INST] annotate the products in the paragraph. Molecular models and MM2 calculations indicate that both products would be stable . [/INST]']" 65 | ] 66 | }, 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "# Data Loading and Preprocessing\n", 74 | "test_df = pd.read_csv(\"data/prod/test.csv\")\n", 75 | "source_text = \"input\"\n", 76 | "target_text = \"output\"\n", 77 | "instruction = f\"{source_text}2{target_text}: \"\n", 78 | "instruction = \"annotate the products in the paragraph. \"\n", 79 | "\n", 80 | "test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + \" [/INST]\"\n", 81 | "prompts = list(test_df['text'])\n", 82 | "prompts[:5]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Inference" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Generate texts from the prompts. 
\n", 99 | "# The output is a list of RequestOutput objects that contain the prompt, generated text, and other information.\n", 100 | "outputs = llm.generate(prompts, sampling_params)\n", 101 | "predictions = []\n", 102 | "\n", 103 | "# Print the outputs.\n", 104 | "for output in outputs:\n", 105 | " prompt = output.prompt\n", 106 | " generated_text = output.outputs[0].text\n", 107 | " print(f\"Prompt: {prompt},\\nGenerated text: {generated_text!r}\")\n", 108 | " predictions.append(generated_text.strip())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Save the Predictions" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "pred_df = pd.DataFrame()\n", 125 | "pred_df['Generated Text'] = predictions\n", 126 | "pred_df['Actual Text'] = test_df[target_text]\n", 127 | "pred_df['Paragraph'] = test_df[source_text]\n", 128 | "pred_df['BIO Label'] = test_df['bio_label']\n", 129 | "pred_df.to_csv(f\"results/predictions/prediction_of_{new_model_name.replace('/', '-')}.csv\", index = None)\n", 130 | "pred_df" 131 | ] 132 | } 133 | ], 134 | "metadata": { 135 | "kernelspec": { 136 | "display_name": "llm", 137 | "language": "python", 138 | "name": "python3" 139 | }, 140 | "language_info": { 141 | "codemirror_mode": { 142 | "name": "ipython", 143 | "version": 3 144 | }, 145 | "file_extension": ".py", 146 | "mimetype": "text/x-python", 147 | "name": "python", 148 | "nbconvert_exporter": "python", 149 | "pygments_lexer": "ipython3", 150 | "version": "3.10.13" 151 | } 152 | }, 153 | "nbformat": 4, 154 | "nbformat_minor": 2 155 | } 156 | -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Role/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Data for Paragraph2Role 2 | 3 | See in ```data/role``` 4 | 5 | ## 2. Methods for Paragraph2Role 6 | 7 | ### Prompt Engineering ChatGPT (GPT-4, GPT-3.5-Turbo) 8 | 9 | See in ```prompt_chatgpt_for_paragraph2role.ipynb``` 10 | 11 | ### Fine-tuning ChatGPT (GPT-3.5-Turbo) 12 | 13 | See in ```finetune_chatgpt_for_paragraph2role.ipynb``` 14 | 15 | ### Full Parameter Fine-tuning Open-source Large Language Models (Llama3, Llama2, Mistral) 16 | 17 | Training Code in ```finetune_llms_full_for_paragraph2role.py``` 18 | 19 | Inferencing Code in ```vllm_inference_full_finetuned_llms.ipynb``` 20 | 21 | ### Parameter Efficient Fine-tuning (PEFT) Open-source Large Language Models (Llama3, Llama2, Mistral) 22 | 23 | Training Code in ```finetune_llms_peft_for_paragraph2role.py``` 24 | 25 | Inferencing Code in ```vllm_inference_peft_finetuned_llms.ipynb``` 26 | 27 | ### Fine-tuning Language Models (T5, Bart) 28 | 29 | See in ```finetune_bart_or_t5_for_paragraph2role.py``` 30 | 31 | ## 3. 
Evaluating the results of Paragraph2Role 32 | 33 | All predictions will be saved in ```results/predictions``` 34 | 35 | Evaluating codes are in ```evaluate_Paragraph2Role.ipynb``` -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Role/finetune_bart_or_t5_for_paragraph2role.py: -------------------------------------------------------------------------------- 1 | # Finetuning Bart or T5 2 | import os 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 4 | import pandas as pd 5 | import numpy as np 6 | import torch 7 | import logging 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, TrainerCallback 10 | from datasets import load_metric, Dataset, DatasetDict 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | # Configuration 15 | class CFG: 16 | # Model Configuration 17 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 18 | num_train_epochs = 50 19 | save_total_limit = 50 20 | batch_size = 8 21 | learning_rate = 1e-5 # 1e-4 or 1e-5 22 | max_input_length = 512 23 | max_target_length = 512 24 | weight_decay = 0.01 25 | save_strategy = "epoch" 26 | evaluation_strategy = "epoch" 27 | interval_eval_epoch = 1 # the number of interval epochs to evaluate (inference) 28 | model_name = "bart-base" # "bart-base" or "t5-base" 29 | task_name = "paragraph2role" 30 | pretrained_dir = "/home/zhangwei/pretrained_models/" # Path of the pretrained model downloaded from Hugging Face 31 | saved_models_dir = f"saved_models/{model_name}/{model_name}_lr_{learning_rate}_epoch_{num_train_epochs}_bs_{batch_size}_intervel_{interval_eval_epoch}/" 32 | output_dir = f"results/predictions/{saved_models_dir}" 33 | 34 | # Data Configuration 35 | train_file = "data/role/train.csv" 36 | test_file = "data/role/test.csv" 37 | source_text_column = "input" 38 | target_text_column = "output" 39 | 40 | def load_data(): 41 | train_df = pd.read_csv(CFG.train_file) 42 | test_df = pd.read_csv(CFG.test_file) 43 | train_dataset = Dataset.from_dict(train_df.astype(str)) 44 | test_dataset = Dataset.from_dict(test_df.astype(str)) 45 | datasets = DatasetDict({"train": train_dataset, "test": test_dataset}) 46 | print(datasets) 47 | return datasets 48 | 49 | def tokenize_and_encode(tokenizer, datasets): 50 | def tokenize_function(examples): 51 | model_inputs = tokenizer(examples[CFG.source_text_column], max_length=CFG.max_input_length, truncation=True) 52 | model_labels = tokenizer(examples[CFG.target_text_column], max_length=CFG.max_target_length, truncation=True) 53 | model_inputs["labels"] = model_labels["input_ids"] 54 | return model_inputs 55 | return datasets.map(tokenize_function, batched=True) 56 | 57 | def logging_config(): 58 | logging.info("Configuration Details:") 59 | for attr in dir(CFG): 60 | # Filter out private attributes and methods 61 | if not attr.startswith("__") and not callable(getattr(CFG, attr)): 62 | logging.info(f"{attr}: {getattr(CFG, attr)}") 63 | 64 | # Custom Callback 65 | class CustomCallback(TrainerCallback): 66 | def __init__(self, trainer) -> None: 67 | super().__init__() 68 | self._trainer = trainer 69 | 70 | def on_log(self, args, state, control, logs=None, **kwargs): 71 | if 'loss' in logs: 72 | training_loss = logs['loss'] 73 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 74 | 75 | if 'eval_loss' in state.log_history[-1]: 76 | eval_loss = 
state.log_history[-1]['eval_loss'] 77 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 78 | 79 | def on_epoch_end(self, args, state, control, **kwargs): 80 | logging.info("Saving inference results for test_set...") 81 | output = self._trainer.predict(self._trainer.eval_dataset) 82 | epoch = int(state.epoch) 83 | 84 | if epoch % CFG.interval_eval_epoch == 0 : 85 | # Decode generated summaries into text 86 | decoded_ids = output.predictions 87 | 88 | # Replace -100 in the labels as we can't decode them 89 | decoded_ids = np.where(decoded_ids != -100, decoded_ids, tokenizer.pad_token_id) 90 | decoded_texts = tokenizer.batch_decode(decoded_ids, skip_special_tokens=True) 91 | paragraphs = [i[CFG.source_text_column] for i in self._trainer.eval_dataset] 92 | ground_truth = [i[CFG.target_text_column] for i in self._trainer.eval_dataset] 93 | ground_label = [i['bio_label'] for i in self._trainer.eval_dataset] 94 | prediction = [decoded_text for decoded_text in decoded_texts] 95 | 96 | # Save predictions to csv 97 | predicted_df = pd.DataFrame() 98 | predicted_df['Paragraph'] = paragraphs 99 | predicted_df['Generated Text'] = prediction 100 | predicted_df['Actual Text'] = ground_truth 101 | predicted_df['BIO Label'] = ground_label 102 | predicted_df.to_csv(f"{CFG.output_dir}/epoch_{epoch}.csv", index = None) 103 | 104 | def main(): 105 | # mkdir needed folders 106 | if not os.path.exists(CFG.saved_models_dir): 107 | os.makedirs(CFG.saved_models_dir) 108 | if not os.path.exists(CFG.output_dir): 109 | os.makedirs(CFG.output_dir) 110 | 111 | # Setup logging 112 | logging.basicConfig(filename = CFG.saved_models_dir+'/training.log', level = logging.INFO) 113 | logging_config() 114 | 115 | # Loading Tokenizer and Model 116 | print("Loading Tokenizer and Model ...") 117 | logging.info(f"[Device]: {CFG.device}...") 118 | logging.info(f"[Model]: Loading {CFG.model_name}...") 119 | global tokenizer 120 | tokenizer = AutoTokenizer.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}") 121 | model = AutoModelForSeq2SeqLM.from_pretrained(f"{CFG.pretrained_dir}/{CFG.model_name}").to(CFG.device) 122 | 123 | # Loading Data 124 | print("Loading Data ...") 125 | datasets = load_data() 126 | 127 | # Preparing Data 128 | print("Preparing Data ...") 129 | logging.info(f"[Dataset]:\n{datasets}") 130 | tokenized_datasets = tokenize_and_encode(tokenizer, datasets) 131 | data_collator = DataCollatorForSeq2Seq(tokenizer, model = model) 132 | 133 | # Training Arguments 134 | args = Seq2SeqTrainingArguments( 135 | output_dir = CFG.saved_models_dir, 136 | logging_dir = CFG.saved_models_dir + "logs/", 137 | evaluation_strategy = CFG.evaluation_strategy, 138 | learning_rate = CFG.learning_rate, 139 | per_device_train_batch_size = CFG.batch_size, 140 | per_device_eval_batch_size = CFG.batch_size, 141 | weight_decay = CFG.weight_decay, 142 | generation_max_length = CFG.max_target_length, 143 | save_strategy = CFG.save_strategy, 144 | num_train_epochs = CFG.num_train_epochs, 145 | save_total_limit = CFG.save_total_limit, 146 | predict_with_generate = True, 147 | logging_steps = len(tokenized_datasets["train"]) // CFG.batch_size, 148 | push_to_hub = False, 149 | report_to = "tensorboard") 150 | 151 | # Trainer 152 | trainer = Seq2SeqTrainer( 153 | model, 154 | args, 155 | train_dataset = tokenized_datasets["train"], 156 | eval_dataset = tokenized_datasets["test"], 157 | data_collator = data_collator, 158 | tokenizer = tokenizer) 159 | 160 | # Training and logging 161 | 
print("Training ...") 162 | trainer.add_callback(CustomCallback(trainer)) 163 | trainer.train() 164 | 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Role/finetune_llms_full_for_paragraph2role.py: -------------------------------------------------------------------------------- 1 | import os 2 | # Setup environment (full fine-tuning 7b needs 160GB Memory) 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7" 4 | 5 | import torch 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_df = pd.read_csv("data/role/train.csv") 16 | test_df = pd.read_csv("data/role/test.csv") 17 | source_text = "input" 18 | target_text = "output" 19 | instruction = f"{source_text}2{target_text}: " 20 | instruction = "annotate the reaction roles in the paragraph. " 21 | 22 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 23 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! " 24 | 25 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 26 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 27 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 28 | print(dataset) 29 | 30 | # TrainingArguments parameters 31 | num_train_epochs = 20 32 | save_steps = 0 # Save checkpoint every X updates steps 33 | logging_steps = 25 # Log every X updates steps 34 | 35 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 36 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 37 | 38 | per_device_train_batch_size = 2 # Batch size per GPU for training 39 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 40 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 41 | gradient_checkpointing = True # Enable gradient checkpointing 42 | 43 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 44 | learning_rate = 5e-6 # Initial learning rate (AdamW optimizer, 1e-5 or 5e-6 or 1e-4) 45 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 46 | 47 | optim = "paged_adamw_32bit" # Optimizer to use 48 | lr_scheduler_type = "cosine" # Learning rate schedule 49 | 50 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 51 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 52 | 53 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 54 | 55 | # SFT parameters 56 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 57 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 58 | device_map = "auto" # Load the entire model on the GPU 0, or "auto" 59 | 60 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
61 | model_name = "/home/zhangwei/pretrained_models/Mistral-7B-Instruct-v0.2" # Path of the pretrained model downloaded from Hugging Face 62 | new_model_dir = f"saved_models/Mistral-7B-Instruct-v0.2/train_{len(train_df)}_lr{learning_rate}_bs{per_device_train_batch_size}" # Fine-tuned model name 63 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 64 | 65 | # Load base model 66 | model = AutoModelForCausalLM.from_pretrained( 67 | pretrained_model_name_or_path = model_name, 68 | torch_dtype = torch.bfloat16, 69 | device_map = device_map 70 | ) 71 | 72 | # Load tokenizer 73 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 74 | tokenizer.pad_token = tokenizer.eos_token 75 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 76 | print("------------tokenizer.eos_token--------------", tokenizer.eos_token) 77 | print("------------tokenizer.unk_token--------------", tokenizer.unk_token) 78 | print("------------tokenizer.bos_token--------------", tokenizer.bos_token) 79 | print("------------tokenizer.pad_token--------------", tokenizer.pad_token) 80 | print("------------vocab_size is------------", tokenizer.vocab_size) 81 | print("------------vocab_size is------------", len(tokenizer)) 82 | 83 | # Set training parameters 84 | training_arguments = TrainingArguments( 85 | output_dir = output_dir, 86 | logging_dir = output_dir + "/logs/", 87 | evaluation_strategy = "epoch", 88 | save_strategy = "epoch", 89 | num_train_epochs = num_train_epochs, 90 | save_total_limit = num_train_epochs, 91 | per_device_train_batch_size = per_device_train_batch_size, 92 | gradient_accumulation_steps = gradient_accumulation_steps, 93 | # optim = optim, 94 | #save_steps=save_steps, 95 | logging_steps = logging_steps, 96 | learning_rate = learning_rate, 97 | weight_decay = weight_decay, 98 | fp16 = fp16, 99 | bf16 = bf16, 100 | max_grad_norm = max_grad_norm, 101 | max_steps = max_steps, 102 | warmup_ratio = warmup_ratio, 103 | group_by_length = group_by_length, 104 | lr_scheduler_type = lr_scheduler_type, 105 | report_to = "tensorboard" 106 | ) 107 | 108 | # Set supervised fine-tuning parameters 109 | trainer = SFTTrainer( 110 | model = model, 111 | train_dataset = dataset['train'], 112 | eval_dataset = dataset["test"], 113 | dataset_text_field = "text", 114 | max_seq_length = max_seq_length, 115 | tokenizer = tokenizer, 116 | args = training_arguments, 117 | packing = packing, 118 | ) 119 | 120 | class SaveBestModelCallback(TrainerCallback): 121 | def __init__(self): 122 | super().__init__() 123 | self.best_eval_loss = float('inf') 124 | self.best_model_checkpoint = None 125 | 126 | def on_log(self, args, state, control, logs=None, **kwargs): 127 | # Check if training_loss is in the logs and print it 128 | if 'loss' in logs: 129 | training_loss = logs['loss'] 130 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 131 | 132 | def on_evaluate(self, args, state, control, **kwargs): 133 | # Check if eval_loss is in the logs 134 | if 'eval_loss' in state.log_history[-1]: 135 | eval_loss = state.log_history[-1]['eval_loss'] 136 | # Print current eval_loss with epoch and step 137 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") 138 | 139 | if eval_loss < self.best_eval_loss: 140 | self.best_eval_loss = eval_loss 141 | # Save the best model 142 | self.best_model_checkpoint = state.global_step 143 | 
trainer.save_model(f"{args.output_dir}/best_model") 144 | # Print loss of Best Model 145 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") 146 | 147 | # Create an instance of the callback 148 | save_best_model_callback = SaveBestModelCallback() 149 | 150 | # Training and logging 151 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 152 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 153 | logging.info(f"""[Model]: Loading {model_name}...\n""") 154 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 155 | 156 | # Add the callback to the trainer 157 | trainer.add_callback(save_best_model_callback) 158 | 159 | # Train model 160 | trainer.train() 161 | 162 | # Save trained model 163 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Role/finetune_llms_peft_for_paragraph2role.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 3 | import torch 4 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 5 | 6 | import pandas as pd 7 | from datasets import load_dataset, Dataset, DatasetDict 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments 9 | import logging 10 | from peft import LoraConfig, PeftModel 11 | from trl import SFTTrainer 12 | from transformers import TrainerCallback 13 | 14 | # Data Loading and Preprocessing 15 | train_df = pd.read_csv("data/role/train.csv") 16 | test_df = pd.read_csv("data/role/test.csv") 17 | source_text = "input" 18 | target_text = "output" 19 | instruction = f"{source_text}2{target_text}: " 20 | instruction = "annotate the reaction roles in the paragraph. " 21 | 22 | train_df['text'] = f'[INST] {instruction}' + train_df[source_text] + " [/INST] " + train_df[target_text] + "!!! " 23 | test_df['text'] = f'[INST] {instruction}' + test_df[source_text] + " [/INST] " + test_df[target_text] + "!!! 
" 24 | 25 | train_dataset = Dataset.from_dict(train_df[['text']].astype(str)) 26 | test_dataset = Dataset.from_dict(test_df[['text']].astype(str)) 27 | dataset = DatasetDict({"train": train_dataset, "test": test_dataset}) 28 | print(dataset) 29 | 30 | ################################################################################ 31 | # Parameters Setting 32 | ################################################################################ 33 | # QLoRA parameters 34 | lora_r = 64 # LoRA attention dimension (8, 16, 64, larger is better) 35 | lora_alpha = 128 # Alpha parameter for LoRA scaling (lora_r*2) 36 | lora_dropout = 0.1 # Dropout probability for LoRA layers 37 | 38 | # bitsandbytes parameters 39 | use_4bit = True # Activate 4-bit precision base model loading 40 | bnb_4bit_compute_dtype = "float16" # Compute dtype for 4-bit base models 41 | bnb_4bit_quant_type = "nf4" # Quantization type (fp4 or nf4) 42 | use_nested_quant = False # Activate nested quantization for 4-bit base models (double quantization) 43 | 44 | # TrainingArguments parameters 45 | num_train_epochs = 20 46 | save_steps = 0 # Save checkpoint every X updates steps 47 | logging_steps = 25 # Log every X updates steps 48 | 49 | fp16 = False # Enable fp16/bf16 training (set fp16 to True with an V100) 50 | bf16 = True # Enable fp16/bf16 training (set bf16 to True with an A100) 51 | 52 | per_device_train_batch_size = 2 # Batch size per GPU for training 53 | per_device_eval_batch_size = 2 # Batch size per GPU for evaluation 54 | gradient_accumulation_steps = 1 # Number of update steps to accumulate the gradients for 55 | gradient_checkpointing = True # Enable gradient checkpointing 56 | 57 | max_grad_norm = 0.3 # Maximum gradient normal (gradient clipping) 58 | learning_rate = 1e-5 # Initial learning rate (AdamW optimizer) 59 | weight_decay = 0.001 # Weight decay to apply to all layers except bias/LayerNorm weights 60 | 61 | optim = "paged_adamw_32bit" # Optimizer to use 62 | lr_scheduler_type = "cosine" # Learning rate schedule 63 | 64 | max_steps = -1 # Number of training steps (overrides num_train_epochs) 65 | warmup_ratio = 0.03 # Ratio of steps for a linear warmup (from 0 to learning rate) 66 | 67 | group_by_length = True # Group sequences into batches with same length (Saves memory and speeds up training considerably) 68 | 69 | # SFT parameters 70 | max_seq_length = 4096 # Maximum sequence length to use (default 1024) 71 | packing = False # Pack multiple short examples in the same input sequence to increase efficiency 72 | device_map = {"": 0} # Load the entire model on the GPU 0, or "auto" 73 | 74 | # Model Version (Meta-Llama-3-8B-Instruct, Mistral-7B-Instruct-v0.2, llama-2-13b-chat-hf ...) 
75 | model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf" # Path of the pretrained model downloaded from Hugging Face 76 | new_model_dir = f"saved_models/llama-2-13b-chat-hf_qlora/train_{len(train_df)}_lora_r{lora_r}_lr{learning_rate}" # Fine-tuned model name 77 | output_dir = new_model_dir # Output directory where the model predictions and checkpoints will be stored 78 | 79 | ################################################################################ 80 | # Train 81 | ################################################################################ 82 | # Load tokenizer and model with QLoRA configuration 83 | compute_dtype = getattr(torch, bnb_4bit_compute_dtype) 84 | bnb_config = BitsAndBytesConfig( 85 | load_in_4bit=use_4bit, 86 | bnb_4bit_quant_type=bnb_4bit_quant_type, 87 | bnb_4bit_compute_dtype=compute_dtype, 88 | bnb_4bit_use_double_quant=use_nested_quant, 89 | ) 90 | 91 | # Check GPU compatibility with bfloat16 92 | if compute_dtype == torch.float16 and use_4bit: 93 | major, _ = torch.cuda.get_device_capability() 94 | if major >= 8: 95 | print("=" * 80) 96 | print("Your GPU supports bfloat16: accelerate training with bf16=True") 97 | print("=" * 80) 98 | 99 | # Load base model 100 | model = AutoModelForCausalLM.from_pretrained( 101 | pretrained_model_name_or_path = model_name, 102 | quantization_config = bnb_config, 103 | device_map = device_map 104 | ) 105 | model.config.use_cache = False 106 | model.config.pretraining_tp = 1 107 | 108 | # Load LLaMA tokenizer 109 | tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) 110 | tokenizer.pad_token = tokenizer.eos_token 111 | tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training 112 | 113 | # Load LoRA configuration 114 | peft_config = LoraConfig( 115 | lora_alpha=lora_alpha, 116 | lora_dropout=lora_dropout, 117 | r=lora_r, 118 | bias="none", 119 | task_type="CAUSAL_LM", 120 | ) 121 | 122 | # Set training parameters 123 | training_arguments = TrainingArguments( 124 | output_dir=output_dir, 125 | logging_dir = output_dir + "/logs/", 126 | evaluation_strategy = "epoch", 127 | save_strategy = "epoch", 128 | num_train_epochs = num_train_epochs, 129 | save_total_limit = num_train_epochs, 130 | per_device_train_batch_size = per_device_train_batch_size, 131 | gradient_accumulation_steps = gradient_accumulation_steps, 132 | optim = optim, 133 | #save_steps=save_steps, 134 | logging_steps = logging_steps, 135 | learning_rate = learning_rate, 136 | weight_decay = weight_decay, 137 | fp16 = fp16, 138 | bf16 = bf16, 139 | max_grad_norm = max_grad_norm, 140 | max_steps = max_steps, 141 | warmup_ratio = warmup_ratio, 142 | group_by_length = group_by_length, 143 | lr_scheduler_type = lr_scheduler_type, 144 | report_to = "tensorboard" 145 | ) 146 | 147 | # Set supervised fine-tuning parameters 148 | trainer = SFTTrainer( 149 | model = model, 150 | train_dataset = dataset['train'], 151 | eval_dataset = dataset["test"], 152 | peft_config = peft_config, 153 | dataset_text_field ="text", 154 | max_seq_length = max_seq_length, 155 | tokenizer = tokenizer, 156 | args = training_arguments, 157 | packing = packing, 158 | ) 159 | 160 | class SaveBestModelCallback(TrainerCallback): 161 | def __init__(self): 162 | super().__init__() 163 | self.best_eval_loss = float('inf') 164 | self.best_model_checkpoint = None 165 | 166 | def on_log(self, args, state, control, logs=None, **kwargs): 167 | # Check if training_loss is in the logs and print it 168 | if 'loss' in logs: 169 | training_loss = 
logs['loss'] 170 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current training_loss: {training_loss}") 171 | 172 | def on_evaluate(self, args, state, control, **kwargs): 173 | # Check if eval_loss is in the logs 174 | if 'eval_loss' in state.log_history[-1]: 175 | eval_loss = state.log_history[-1]['eval_loss'] 176 | logging.info(f"Epoch: {int(state.epoch)}, Step: {state.global_step}, Current eval_loss: {eval_loss}") # Print current eval_loss with epoch and step 177 | 178 | if eval_loss < self.best_eval_loss: 179 | self.best_eval_loss = eval_loss 180 | # Save the best model 181 | self.best_model_checkpoint = state.global_step 182 | trainer.save_model(f"{args.output_dir}/best_model") 183 | logging.info(f"New best model saved at step {state.global_step} with eval_loss: {eval_loss}") # Print loss of Best Model 184 | 185 | # Create an instance of the callback 186 | save_best_model_callback = SaveBestModelCallback() 187 | 188 | # Training and logging 189 | logging.basicConfig(filename=output_dir+'/training.log', level=logging.INFO) 190 | logging.info(f"""[Device]: cuda:{os.environ["CUDA_VISIBLE_DEVICES"]}...\n""") 191 | logging.info(f"""[Model]: Loading {model_name}...\n""") 192 | logging.info(f"""[Outputdir]: Loading {output_dir}...\n""") 193 | 194 | # Add the callback to the trainer 195 | trainer.add_callback(save_best_model_callback) 196 | 197 | # Train model 198 | trainer.train() 199 | 200 | # Save trained model 201 | trainer.model.save_pretrained(new_model_dir) -------------------------------------------------------------------------------- /Paragraph2RXNRole/Paragraph2Role/results/README.md: -------------------------------------------------------------------------------- 1 | Save the generated outputs by different models. -------------------------------------------------------------------------------- /Paragraph2RXNRole/README.md: -------------------------------------------------------------------------------- 1 | The two subtasks are based on the JCIM 2022 paper: [Automated Chemical Reaction Extraction from Scientific Literature](https://pubs.acs.org/doi/pdf/10.1021/acs.jcim.1c00284) 2 | 3 | ### [SubTask 1. Paragraph2Prod](Paragraph2Prod) 4 | 5 | ### [SubTask 2. Paragraph2Role](Paragraph2Role) 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SFTLLMs_for_ChemText_Mining 2 | 3 | ## Download 4 | ```bash 5 | git clone https://github.com/zw-SIMM/SFTLLMs_for_chemtext_mining 6 | cd SFTLLMs_for_ChemText_Mining 7 | ``` 8 | 9 | ## 🖊 Datasets and Codes 10 | 11 | Preprocessed data, fine-tuning code, and README workflows have been placed in the corresponding folders: 12 | 13 | - ```Paragraph2Comound/``` 14 | 15 | - ```Paragraph2RXNRole/prod/``` and ```Paragraph2RXNRole/role/``` 16 | 17 | - ```Paragraph2MOFInfo/``` 18 | 19 | - ```Paragraph2NMR/``` 20 | 21 | - ```Paragraph2Action/``` (the dataset is derived from the pistachio dataset, which is available upon request.) 22 | 23 | ## 💿Fine-tuning ChatGPT (GPT-3.5-Turbo) and Prompt-Engineering GPT-4 24 | 25 | ### Environment (OS: Windows or Linux) 26 | 27 | ```bash 28 | pip install openai 29 | pip install pandas 30 | ``` 31 | Note: The fine-tuning code changed slightly when the openai package was updated to v1.0.0+. 32 | 33 | Here, we provide the latest code. 34 | 35 | ### Implementation 36 | 37 | Specific scripts for each task are in the corresponding folders. 
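For quick reference, the sketch below illustrates the general shape of the openai v1.0.0+ fine-tuning interface mentioned in the Environment section above. It is a minimal illustration, not the repository's exact notebook code; the JSONL file name, the base model id, and the prompt are placeholders.

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# 1. Upload a chat-format JSONL training file (placeholder name)
train_file = client.files.create(file=open("train.jsonl", "rb"), purpose="fine-tune")

# 2. Launch a fine-tuning job on GPT-3.5-Turbo
job = client.fine_tuning.jobs.create(training_file=train_file.id, model="gpt-3.5-turbo")

# 3. Check the job status; once finished, job.fine_tuned_model holds the new model name
job = client.fine_tuning.jobs.retrieve(job.id)
print(job.status, job.fine_tuned_model)

# 4. Run inference with the fine-tuned model
response = client.chat.completions.create(
    model=job.fine_tuned_model,
    messages=[{"role": "user", "content": "Extract the compound names from the paragraph: ..."}],
    temperature=0,
)
print(response.choices[0].message.content)
```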
38 | 39 | All notebooks for fine-tuning and prompt engineering GPTs (GPT-4, GPT-3.5), as well as the evaluation notebooks, have been released for each task! 40 | 41 | ### Demo of Fine-tuning ChatGPT on a small dataset 42 | 43 | Here, we provide an example notebook of fine-tuning ChatGPT on 25 Paragraph2NMR data points in ```demo/fine-tuning_chatgpt_on_25_paragraph2NMR_data.ipynb```, including: 44 | 45 | - Preprocessing 46 | - Training 47 | - Inferencing 48 | - Evaluating 49 | 50 | ## 📀Fine-tuning Open-source Language Models (Mistral, Llama3, Bart, T5) 51 | 52 | ### Environment (Linux) 53 | ```bash 54 | mamba create -n llm python=3.10 55 | mamba activate llm 56 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pandas numpy ipywidgets tqdm 57 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple torch==2.1.2 transformers==4.38.2 datasets tiktoken wandb==0.11 openpyxl 58 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple peft==0.8.0 accelerate bitsandbytes safetensors jsonlines 59 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple vllm==0.3.1 60 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple trl==0.7 61 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tensorboardX tensorboard 62 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple textdistance nltk matplotlib seaborn seqeval 63 | pip install -i https://pypi.tuna.tsinghua.edu.cn/simple modelscope 64 | ``` 65 | 66 | ### Pretrained Model Downloads 67 | 68 | Open-source pretrained models (Llama3, Llama2, Mistral, Bart, T5) can be downloaded from [huggingface](https://huggingface.co/models) or [modelscope](https://www.modelscope.cn/models). 69 | 70 | Here is an example script for downloading pretrained models from ModelScope on a Linux server: 71 | ```python 72 | from modelscope import snapshot_download 73 | model_dir = snapshot_download("LLM-Research/Meta-Llama-3-8B-Instruct", revision='master', cache_dir='/home/pretrained_models') 74 | model_dir = snapshot_download('AI-ModelScope/Mistral-7B-Instruct-v0.2', revision='master', cache_dir='/home/pretrained_models') 75 | ``` 76 | 77 | ### Fine-tuning 78 | 79 | The code and tutorials for fine-tuning language models (ChatGPT, Llama3, Llama2, Mistral, Bart, T5) on each task are in the corresponding folders. 80 | --------------------------------------------------------------------------------
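As a complement to the ModelScope example in the README above, the same pretrained checkpoints can also be fetched from the Hugging Face Hub. The snippet below is a minimal sketch and not part of the original repository; the repo id and local directory are illustrative, and gated models (e.g., Llama) require prior access approval and `huggingface-cli login`.

```python
from huggingface_hub import snapshot_download

# Download a pretrained checkpoint from the Hugging Face Hub into a local folder
# (repo_id and local_dir are placeholders; adjust to the model and path you use).
snapshot_download(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    local_dir="/home/pretrained_models/Mistral-7B-Instruct-v0.2",
)
```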