├── figs └── scores.png ├── requirements.txt ├── setup.sh ├── sequence_classification ├── sample_inputs │ ├── single_sequence │ │ ├── tsv │ │ │ ├── sample_test_without_labels.tsv │ │ │ ├── test.tsv │ │ │ ├── validation.tsv │ │ │ └── train.tsv │ │ ├── csv │ │ │ ├── sample_test_without_labels.csv │ │ │ ├── test.csv │ │ │ ├── validation.csv │ │ │ └── train.csv │ │ └── jsonl │ │ │ ├── sample_test_without_labels.jsonl │ │ │ ├── test.jsonl │ │ │ ├── validation.jsonl │ │ │ └── train.jsonl │ └── double_sequence │ │ ├── tsv │ │ ├── validation.tsv │ │ ├── train.tsv │ │ ├── sample_test_without_label.tsv │ │ └── test.tsv │ │ ├── csv │ │ ├── validation.csv │ │ ├── train.csv │ │ ├── sample_test_without_label.csv │ │ └── test.csv │ │ └── jsonl │ │ ├── validation.jsonl │ │ ├── train.jsonl │ │ ├── sample_test_without_label.jsonl │ │ └── test.jsonl ├── evaluate.sh ├── trainer.sh ├── README.md └── sequence_classification.py ├── token_classification ├── sample_inputs │ ├── sample_test_without_tags.jsonl │ ├── test.jsonl │ ├── validation.jsonl │ └── train.jsonl ├── evaluate.sh ├── README.md ├── trainer.sh └── token_classification.py ├── question_answering ├── evaluate.sh ├── README.md ├── trainer.sh ├── utils.py └── question_answering.py ├── .gitignore └── README.md /figs/scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/banglabert/HEAD/figs/scores.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | datasets==1.11.0 4 | seqeval==1.2.2 5 | git+https://github.com/csebuetnlp/normalizer -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git clone https://github.com/huggingface/transformers.git 4 | cd transformers/ 5 | git checkout 7a26307e3186926373cf9129248c209ab869148b 6 | pip install --upgrade ./ 7 | cd ../ 8 | 9 | pip install --upgrade -r requirements.txt -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/sample_test_without_labels.tsv: -------------------------------------------------------------------------------- 1 | sentence1 2 | সবার জন্য উন্মুক্ত। 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার 4 | ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/test.tsv: -------------------------------------------------------------------------------- 1 | sentence1 label 2 | সবার জন্য উন্মুক্ত। pos 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার neg 4 | ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ pos 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/sample_test_without_labels.csv: -------------------------------------------------------------------------------- 1 | sentence1 2 | সবার জন্য উন্মুক্ত। 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার 4 | "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ" 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা 6 | 
-------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/test.csv: -------------------------------------------------------------------------------- 1 | sentence1,label 2 | সবার জন্য উন্মুক্ত।,pos 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার,neg 4 | "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ",pos 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা,neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/sample_test_without_labels.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "সবার জন্য উন্মুক্ত।"} 2 | {"sentence1": "কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার"} 3 | {"sentence1": "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ"} 4 | {"sentence1": "আমরা জানি আপনারা কি চান। বাকরুদ্ধতা"} 5 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/test.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "সবার জন্য উন্মুক্ত।", "label": "pos"} 2 | {"sentence1": "কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার", "label": "neg"} 3 | {"sentence1": "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ", "label": "pos"} 4 | {"sentence1": "আমরা জানি আপনারা কি চান। বাকরুদ্ধতা", "label": "neg"} 5 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/validation.tsv: -------------------------------------------------------------------------------- 1 | sentence1 label 2 | লোক দেখানো সম্মান দেখিয়ে লাভ নাই।মিডিয়ার সামনে অনেকেই সমাজ সেবক,কিন্তু ভিতরে সদরঘাট। neg 3 | খেলার মাধ্যমে ধর্মিয় উৎসব কে অবমাননা।।।।। neg 4 | মালুদের কারবার দেখলে হাসি পায় neg 5 | অন্যায্য একটি সিদ্ধান্ত সমূলে কাটা পড়ায় ভালো ই হয়েছে। বেসরকারি বিশ্ববিদ্যালয়ে ভর্তি ফি নির্দিষ্ট করে দিলে এবং তা নিয়মিত মনিটরিং করলে আরও ভালো হবে! 
pos 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/validation.csv: -------------------------------------------------------------------------------- 1 | sentence1,label 2 | "লোক দেখানো সম্মান দেখিয়ে লাভ নাই।মিডিয়ার সামনে অনেকেই সমাজ সেবক,কিন্তু ভিতরে সদরঘাট।",neg 3 | খেলার মাধ্যমে ধর্মিয় উৎসব কে অবমাননা।।।।।,neg 4 | মালুদের কারবার দেখলে হাসি পায়,neg 5 | অন্যায্য একটি সিদ্ধান্ত সমূলে কাটা পড়ায় ভালো ই হয়েছে। বেসরকারি বিশ্ববিদ্যালয়ে ভর্তি ফি নির্দিষ্ট করে দিলে এবং তা নিয়মিত মনিটরিং করলে আরও ভালো হবে!,pos 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/train.tsv: -------------------------------------------------------------------------------- 1 | sentence1 label 2 | যেই মাদারির পোলারা এই কাজটি করেছে, সেই সালারা অবৈধ জারপ সন্তান ছারা আর কিছুই না। neg 3 | ভারতের কুখ্যাত ষড়যন্ত্রের মুখোশ উন্মোচন হলো neg 4 | আমার প্রছন্দের একাদশ ১/তামিম ২/সৌম্য ৩/রিয়াদ ৪/সাকিব ৫/মুশফিক ৬/মোসাদ্দেক ৭/সাব্বির ৮/নাসির ৯/মাশরাফি ১০/তাসকিন ১১/রুবেল কেমন হলো আমার একাদশ পছন্দ হলে লাইক দিবেন এবং ভুল হলে তা কমেন্ট করে জানাবেন। pos 5 | মুসা কপা‌লে কি অা‌ছে জা‌নিনা neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/train.csv: -------------------------------------------------------------------------------- 1 | sentence1,label 2 | "যেই মাদারির পোলারা এই কাজটি করেছে, সেই সালারা অবৈধ জারপ সন্তান ছারা আর কিছুই না।",neg 3 | ভারতের কুখ্যাত ষড়যন্ত্রের মুখোশ উন্মোচন হলো,neg 4 | আমার প্রছন্দের একাদশ ১/তামিম ২/সৌম্য ৩/রিয়াদ ৪/সাকিব ৫/মুশফিক ৬/মোসাদ্দেক ৭/সাব্বির ৮/নাসির ৯/মাশরাফি ১০/তাসকিন ১১/রুবেল কেমন হলো আমার একাদশ পছন্দ হলে লাইক দিবেন এবং ভুল হলে তা কমেন্ট করে জানাবেন।,pos 5 | মুসা কপা‌লে কি অা‌ছে জা‌নিনা,neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/validation.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "লোক দেখানো সম্মান দেখিয়ে লাভ নাই।মিডিয়ার সামনে অনেকেই সমাজ সেবক,কিন্তু ভিতরে সদরঘাট।", "label": "neg"} 2 | {"sentence1": "খেলার মাধ্যমে ধর্মিয় উৎসব কে অবমাননা।।।।।", "label": "neg"} 3 | {"sentence1": "মালুদের কারবার দেখলে হাসি পায়", "label": "neg"} 4 | {"sentence1": "অন্যায্য একটি সিদ্ধান্ত সমূলে কাটা পড়ায় ভালো ই হয়েছে। বেসরকারি বিশ্ববিদ্যালয়ে ভর্তি ফি নির্দিষ্ট করে দিলে এবং তা নিয়মিত মনিটরিং করলে আরও ভালো হবে!", "label": "pos"} 5 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/train.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "যেই মাদারির পোলারা এই কাজটি করেছে, সেই সালারা অবৈধ জারপ সন্তান ছারা আর কিছুই না।", "label": "neg"} 2 | {"sentence1": "ভারতের কুখ্যাত ষড়যন্ত্রের মুখোশ উন্মোচন হলো", "label": "neg"} 3 | {"sentence1": "আমার প্রছন্দের একাদশ ১/তামিম ২/সৌম্য ৩/রিয়াদ ৪/সাকিব ৫/মুশফিক ৬/মোসাদ্দেক ৭/সাব্বির ৮/নাসির ৯/মাশরাফি ১০/তাসকিন ১১/রুবেল কেমন হলো আমার একাদশ পছন্দ হলে লাইক দিবেন এবং ভুল হলে তা কমেন্ট করে জানাবেন।", "label": "pos"} 4 | {"sentence1": "মুসা কপা‌লে কি অা‌ছে জা‌নিনা", "label": "neg"} 5 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/sample_test_without_tags.jsonl: 
-------------------------------------------------------------------------------- 1 | {"tokens": ["৫%", "তার", "চাইতে", "পশ্চিমোরে", "এর", "সাক্ষরতার", "হার", "কম"]} 2 | {"tokens": ["গত", "২০১৫", "সালের", "৫", "আগস্ট", "সকাল", "সাড়ে", "৮টার", "দিকে", "ভাড়া", "বাসায়", "আনিছুর", "রহমান", "ধারালো", "বটি", "দিয়ে", "কুপিয়ে", "স্ত্রী", "মৌসুমিকে", "হত্যা", "করে"]} 3 | {"tokens": ["জেলা", "ক্রীড়া", "সংস্থার", "সাধারণ", "সম্পাদক", "ওবায়দুর", "রহমান", "খান"]} 4 | {"tokens": ["আমার", "ছবি", "তুলে", "আর", "কী", "হবে"]} 5 | {"tokens": ["ডেভিড", "ওলিয়ারি", "জন্ম", "মে", "২", "২০০৮", "লন্ডনে", "একজন", "আয়ারল্যান্ডীয়", "পেশাদার", "ফুটবল", "খেলোয়াড়", "এবং", "ম্যানেজার"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/validation.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 label 2 | আর সে বললো, মা, আমি বাসায়। স্কুলের বাস তাকে নামিয়ে দেওয়ার সঙ্গে সঙ্গে তিনি তার মাকে ফোন করেছিলেন। neutral 3 | আর সে বললো, মা, আমি বাসায়। সে কোন কথা বলেনি। contradiction 4 | আর সে বললো, মা, আমি বাসায়। সে তার মাকে বলেছিল যে সে বাড়ি ফিরেছে। entailment 5 | আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল। আমি কখনো ওয়াশিংটনে যাইনি তাই যখন আমাকে সেখানে কার্যভার দেওয়া হয়, তখন আমি সেই জায়গা খুঁজে বের করার চেষ্টা করতে গিয়ে হেরে যাই। neutral 6 | আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল। আমি ঠিক জানতাম যে, ওয়াশিংটনের দিকে এগিয়ে যাওয়ার সময় আমাকে কী করতে হবে। contradiction 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/validation.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | "আর সে বললো, মা, আমি বাসায়।",স্কুলের বাস তাকে নামিয়ে দেওয়ার সঙ্গে সঙ্গে তিনি তার মাকে ফোন করেছিলেন।,neutral 3 | "আর সে বললো, মা, আমি বাসায়।",সে কোন কথা বলেনি।,contradiction 4 | "আর সে বললো, মা, আমি বাসায়।",সে তার মাকে বলেছিল যে সে বাড়ি ফিরেছে।,entailment 5 | "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।","আমি কখনো ওয়াশিংটনে যাইনি তাই যখন আমাকে সেখানে কার্যভার দেওয়া হয়, তখন আমি সেই জায়গা খুঁজে বের করার চেষ্টা করতে গিয়ে হেরে যাই।",neutral 6 | "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।","আমি ঠিক জানতাম যে, ওয়াশিংটনের দিকে এগিয়ে যাওয়ার সময় আমাকে কী করতে হবে।",contradiction 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/train.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | ধারণাগতভাবে ক্রিম স্কিমিং এর দুটি মৌলিক মাত্রা রয়েছে - পণ্য এবং ভূগোল।,পণ্য এবং ভূগোল হল ক্রিমের স্কিমিং কাজ।,neutral 3 | আপনি জানেন এই মৌসুমে আর আমার মনে হয় আপনার পর্যায়ে আপনি তাদেরকে পরবর্তী ধাপে হারিয়ে ফেলেছেন যদি তারা সিদ্ধান্ত নেন যে অভিভাবক দলকে মনে করতে যে ব্রেভস ট্রিপল এ এর একজন লোককে স্মরণ করার সিদ্ধান্ত নেন তাহলে একজন ডাবল এ লোক তার জায়গায় যায় আর একজন এ লোক তার জায়গায় যায়,মানুষ যদি স্মরণ করতে পারে তাহলে আপনি নীচের স্তরে হারিয়ে যাবেন।,entailment 4 | আমাদের মধ্যে একজন আপনার নির্দেশগুলো পুঙ্খানুপুঙ্খভাবে পালন করবে।,আমার দলের একজন সদস্য তোমার আদেশ পালন করবে খুবই নির্ভুলভাবে।,entailment 5 | আপনি কিভাবে জানলেন? 
এই সব তাদের তথ্য আবার।,এই তথ্য তাদের।,entailment 6 | হ্যাঁ আমি তোমাকে বলছি যদি তুমি কিছু টেনিস জুতার দাম দাও... ...তাহলে আমি বুঝতে পারছি কেন তারা ১০০ ডলারের রেঞ্জে উঠে যাচ্ছে,টেনিস জুতার দাম অনেক।,neutral 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/train.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 label 2 | ধারণাগতভাবে ক্রিম স্কিমিং এর দুটি মৌলিক মাত্রা রয়েছে - পণ্য এবং ভূগোল। পণ্য এবং ভূগোল হল ক্রিমের স্কিমিং কাজ। neutral 3 | আপনি জানেন এই মৌসুমে আর আমার মনে হয় আপনার পর্যায়ে আপনি তাদেরকে পরবর্তী ধাপে হারিয়ে ফেলেছেন যদি তারা সিদ্ধান্ত নেন যে অভিভাবক দলকে মনে করতে যে ব্রেভস ট্রিপল এ এর একজন লোককে স্মরণ করার সিদ্ধান্ত নেন তাহলে একজন ডাবল এ লোক তার জায়গায় যায় আর একজন এ লোক তার জায়গায় যায় মানুষ যদি স্মরণ করতে পারে তাহলে আপনি নীচের স্তরে হারিয়ে যাবেন। entailment 4 | আমাদের মধ্যে একজন আপনার নির্দেশগুলো পুঙ্খানুপুঙ্খভাবে পালন করবে। আমার দলের একজন সদস্য তোমার আদেশ পালন করবে খুবই নির্ভুলভাবে। entailment 5 | আপনি কিভাবে জানলেন? এই সব তাদের তথ্য আবার। এই তথ্য তাদের। entailment 6 | হ্যাঁ আমি তোমাকে বলছি যদি তুমি কিছু টেনিস জুতার দাম দাও... ...তাহলে আমি বুঝতে পারছি কেন তারা ১০০ ডলারের রেঞ্জে উঠে যাচ্ছে টেনিস জুতার দাম অনেক। neutral 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/validation.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "স্কুলের বাস তাকে নামিয়ে দেওয়ার সঙ্গে সঙ্গে তিনি তার মাকে ফোন করেছিলেন।", "label": "neutral"} 2 | {"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "সে কোন কথা বলেনি।", "label": "contradiction"} 3 | {"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "সে তার মাকে বলেছিল যে সে বাড়ি ফিরেছে।", "label": "entailment"} 4 | {"sentence1": "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।", "sentence2": "আমি কখনো ওয়াশিংটনে যাইনি তাই যখন আমাকে সেখানে কার্যভার দেওয়া হয়, তখন আমি সেই জায়গা খুঁজে বের করার চেষ্টা করতে গিয়ে হেরে যাই।", "label": "neutral"} 5 | {"sentence1": "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।", "sentence2": "আমি ঠিক জানতাম যে, ওয়াশিংটনের দিকে এগিয়ে যাওয়ার সময় আমাকে কী করতে হবে।", "label": "contradiction"} 6 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/test.jsonl: -------------------------------------------------------------------------------- 1 | {"tokens": ["৫%", "তার", "চাইতে", "পশ্চিমোরে", "এর", "সাক্ষরতার", "হার", "কম"], "tags": ["O", "O", "O", "B-LOC", "O", "O", "O", "O"]} 2 | {"tokens": ["গত", "২০১৫", "সালের", "৫", "আগস্ট", "সকাল", "সাড়ে", "৮টার", "দিকে", "ভাড়া", "বাসায়", "আনিছুর", "রহমান", "ধারালো", "বটি", "দিয়ে", "কুপিয়ে", "স্ত্রী", "মৌসুমিকে", "হত্যা", "করে"], "tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "B-OBJ", "O", "O", "B-PER", "B-PER", "O", "O"]} 3 | {"tokens": ["জেলা", "ক্রীড়া", "সংস্থার", "সাধারণ", "সম্পাদক", "ওবায়দুর", "রহমান", "খান"], "tags": ["B-LOC", "B-PER", "I-PER", "I-PER", "I-PER", "B-PER", "I-PER", "I-PER"]} 4 | {"tokens": ["আমার", "ছবি", "তুলে", "আর", "কী", "হবে"], "tags": ["B-PER", "O", "O", "O", "O", "O"]} 5 | {"tokens": ["ডেভিড", "ওলিয়ারি", "জন্ম", "মে", "২", "২০০৮", "লন্ডনে", "একজন", "আয়ারল্যান্ডীয়", "পেশাদার", 
"ফুটবল", "খেলোয়াড়", "এবং", "ম্যানেজার"], "tags": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/sample_test_without_label.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 2 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি তার সাথে আবার কথা বলিনি। 3 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম। 4 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমাদের খুব ভালো কথা হয়েছিল। 5 | এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না। 6 | এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি। 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/sample_test_without_label.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2 2 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমি তার সাথে আবার কথা বলিনি। 3 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।","আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।" 4 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমাদের খুব ভালো কথা হয়েছিল। 5 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।","আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।" 6 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।",আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি। 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/test.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 label 2 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি তার সাথে আবার কথা বলিনি। contradiction 3 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম। entailment 4 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমাদের খুব ভালো কথা হয়েছিল। neutral 5 | এবং আমি ভেবেছিলাম এটা একটা 
বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না। neutral 6 | এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি। entailment 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/train.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "ধারণাগতভাবে ক্রিম স্কিমিং এর দুটি মৌলিক মাত্রা রয়েছে - পণ্য এবং ভূগোল।", "sentence2": "পণ্য এবং ভূগোল হল ক্রিমের স্কিমিং কাজ।", "label": "neutral"} 2 | {"sentence1": "আপনি জানেন এই মৌসুমে আর আমার মনে হয় আপনার পর্যায়ে আপনি তাদেরকে পরবর্তী ধাপে হারিয়ে ফেলেছেন যদি তারা সিদ্ধান্ত নেন যে অভিভাবক দলকে মনে করতে যে ব্রেভস ট্রিপল এ এর একজন লোককে স্মরণ করার সিদ্ধান্ত নেন তাহলে একজন ডাবল এ লোক তার জায়গায় যায় আর একজন এ লোক তার জায়গায় যায়", "sentence2": "মানুষ যদি স্মরণ করতে পারে তাহলে আপনি নীচের স্তরে হারিয়ে যাবেন।", "label": "entailment"} 3 | {"sentence1": "আমাদের মধ্যে একজন আপনার নির্দেশগুলো পুঙ্খানুপুঙ্খভাবে পালন করবে।", "sentence2": "আমার দলের একজন সদস্য তোমার আদেশ পালন করবে খুবই নির্ভুলভাবে।", "label": "entailment"} 4 | {"sentence1": "আপনি কিভাবে জানলেন? এই সব তাদের তথ্য আবার।", "sentence2": "এই তথ্য তাদের।", "label": "entailment"} 5 | {"sentence1": "হ্যাঁ আমি তোমাকে বলছি যদি তুমি কিছু টেনিস জুতার দাম দাও... ...তাহলে আমি বুঝতে পারছি কেন তারা ১০০ ডলারের রেঞ্জে উঠে যাচ্ছে", "sentence2": "টেনিস জুতার দাম অনেক।", "label": "neutral"} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/test.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমি তার সাথে আবার কথা বলিনি।,contradiction 3 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।","আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।",entailment 4 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমাদের খুব ভালো কথা হয়েছিল।,neutral 5 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।","আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।",neutral 6 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।",আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি।,entailment 7 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/validation.jsonl: -------------------------------------------------------------------------------- 1 | {"tokens": ["২৮", "সেপ্টেম্বর", "দুবাইতে", "অনুষ্ঠিত", "হবে", "এই", "১৫", "সেপ্টেম্বর", "শ্রীলঙ্কার", "বিপক্ষে", "এবং", "২০", "সেপ্টেম্বর", "আফগানিস্তানের", "বিপক্ষে", "খেলবে"], "tags": ["O", "O", "B-LOC", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "B-LOC", 
"O", "O"]} 2 | {"tokens": ["তিনি", "বলছিলেন", "এখানে", "উন্নতি", "করতে", "হলে", "কোয়ালিটি", "খেলোয়াড়দের", "সংখ্যাটাও", "বাড়াতে", "হবে"], "tags": ["B-PER", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O"]} 3 | {"tokens": ["ঢাকায়", "যেতে", "হলে", "তাদেরকে", "সকাল", "৯", "টার", "জোয়ার", "আসার", "পর", "হাতিয়ার", "উদ্দেশ্য", "যাত্রা", "করতে", "হয়"], "tags": ["B-LOC", "O", "O", "B-PER", "O", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O"]} 4 | {"tokens": ["প্রতিটি", "বুথে", "ভোটসংখ্যা", "অনুযায়ী", "ব্যালট", "পেপার", "ও", "বক্স", "বুঝে", "নিতে", "হবে"], "tags": ["O", "O", "O", "O", "B-OBJ", "I-OBJ", "O", "B-OBJ", "O", "O", "O"]} 5 | {"tokens": ["বাংলার", "ইতিহাস", "বলতে", "অধুনা", "বাংলাদেশ", "ও", "পশ্চিমবঙ্গের", "বিগত", "চার", "সহস্রাব্দের", "ইতিহাসকে", "বোঝায়"], "tags": ["B-LOC", "O", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/sample_test_without_label.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি তার সাথে আবার কথা বলিনি।"} 2 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।"} 3 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমাদের খুব ভালো কথা হয়েছিল।"} 4 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।"} 5 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি।"} 6 | -------------------------------------------------------------------------------- /token_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # misc. 
settings 4 | export seed=1234 5 | 6 | # model settings 7 | export model_name= 8 | 9 | # input settings 10 | # exactly one of `dataset_dir` or the test 11 | # dataset file needs to be provided 12 | input_settings=( 13 | # "--dataset_dir sample_inputs/" 14 | "--test_file sample_inputs/sample_test_without_tags.jsonl" 15 | ) 16 | 17 | # output settings 18 | export output_dir="outputs/" 19 | 20 | # batch / sequence sizes 21 | export PER_DEVICE_EVAL_BATCH_SIZE=8 22 | export MAX_SEQUENCE_LENGTH=512 23 | 24 | # optional arguments 25 | optional_arguments=( 26 | "--cache_dir cache_dir/" 27 | "--overwrite_cache" 28 | ) 29 | 30 | # optional for logging 31 | # export WANDB_PROJECT="Token_classification_finetuning" 32 | # export WANDB_WATCH=false 33 | # export WANDB_MODE="dryrun" 34 | export WANDB_DISABLED=true 35 | 36 | python ./token_classification.py \ 37 | --model_name_or_path $model_name \ 38 | --output_dir $output_dir \ 39 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 40 | --max_seq_length $MAX_SEQUENCE_LENGTH \ 41 | --seed $seed --overwrite_output_dir --do_predict \ 42 | $(echo -n ${input_settings[@]}) \ 43 | $(echo ${optional_arguments[@]}) 44 | -------------------------------------------------------------------------------- /sequence_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # misc. settings 4 | export seed=1234 5 | 6 | # model settings 7 | export model_name= 8 | 9 | # input settings 10 | # exactly one of `dataset_dir` or the test 11 | # dataset file needs to be provided 12 | input_settings=( 13 | # "--dataset_dir sample_inputs/single_sequence/jsonl/" 14 | "--test_file sample_inputs/single_sequence/jsonl/sample_test_without_labels.jsonl" 15 | ) 16 | 17 | # output settings 18 | export output_dir="outputs/" 19 | 20 | # batch sizes 21 | export PER_DEVICE_EVAL_BATCH_SIZE=8 22 | 23 | # optional arguments 24 | optional_arguments=( 25 | "--cache_dir cache_dir/" 26 | "--overwrite_cache" 27 | ) 28 | 29 | # optional for logging 30 | # export WANDB_PROJECT="Sequence_classification_finetuning" 31 | # export WANDB_WATCH=false 32 | # export WANDB_MODE="dryrun" 33 | export WANDB_DISABLED=true 34 | 35 | python ./sequence_classification.py \ 36 | --model_name_or_path $model_name \ 37 | --output_dir $output_dir \ 38 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 39 | --seed $seed --overwrite_output_dir --do_predict \ 40 | $(echo -n ${input_settings[@]}) \ 41 | $(echo ${optional_arguments[@]}) 42 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/train.jsonl: -------------------------------------------------------------------------------- 1 | {"tokens": ["ত্রাণ", "ও", "সমাজকল্যাণ", "সম্পাদক", "সুজিত", "রায়", "নন্দী", "প্রমুখ", "সংবাদ", "সম্মেলনে", "উপস্থিত", "ছিলেন"], "tags": ["O", "O", "O", "B-PER", "B-PER", "I-PER", "I-PER", "O", "O", "O", "O", "O"]} 2 | {"tokens": ["পরিকল্পনা", "অনুযায়ী", "তারা", "বাসায়", "ঢুকে", "দুই", "অতিথিকে", "নগ্ন", "করে", "তাদের", "মাঝখানে", "এক", "ছাত্রীকে", "বসিয়ে", "ছবি", "তোলেন"], "tags": ["O", "O", "B-PER", "B-OBJ", "O", "O", "B-PER", "O", "O", "B-PER", "O", "O", "B-PER", "O", "B-OBJ", "O"]} 3 | {"tokens": ["এ", "ছাড়া", "শুরু", "থেকে", "স্থানীয়", "সরকারের", "গুরুত্বপূর্ণ", "সিটি", "নির্বাচন", "গভীর", "পর্যবেক্ষণে", "রেখেছে", "ইউরোপিয়ান", "ইউনিয়ন", "ইইউ"], "tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]} 4 | {"tokens": ["তিনি", "বলছিলেন", "সবচেয়ে", "বড়", "কথা", "উনি", "খুব", "ভালো", "মানুষ",
"ছিলেন"], "tags": ["B-PER", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O"]} 5 | {"tokens": ["এসব", "দোকান", "ও", "স্ট্যান্ড", "থেকে", "প্রতি", "মাসে", "বড়", "অঙ্কের", "টাকা", "সরকার", "দলীয়", "শ্রমিক", "সংগঠনের", "কিছু", "নেতা", "আদায়", "করেন", "বলে", "জানান", "স্থানীয়রা"], "tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "O", "B-PER"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/test.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি তার সাথে আবার কথা বলিনি।", "label": "contradiction"} 2 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।", "label": "entailment"} 3 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমাদের খুব ভালো কথা হয়েছিল।", "label": "neutral"} 4 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।", "label": "neutral"} 5 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি।", "label": "entailment"} 6 | -------------------------------------------------------------------------------- /question_answering/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # model settings 4 | export model_name= 5 | 6 | # qa specific settings 7 | export doc_stride=256 8 | export n_best_size=30 9 | export max_answer_length=30 10 | # misc. 
settings 11 | export seed=1234 12 | 13 | # input settings 14 | # exactly one of `dataset_dir` or the (train / validation) 15 | # dataset files needs to be provided 16 | input_settings=( 17 | "--dataset_dir sample_inputs/" 18 | # "--train_file sample_inputs/train.json" 19 | # "--validation_file sample_inputs/validation.json" 20 | ) 21 | 22 | # output settings 23 | export output_dir="outputs/" 24 | 25 | # batch / sequence sizes 26 | export PER_DEVICE_EVAL_BATCH_SIZE=16 27 | export MAX_SEQUENCE_LENGTH=512 28 | 29 | # optional arguments 30 | optional_arguments=( 31 | "--allow_null_ans" 32 | "--null_score_diff_threshold 0.0" 33 | "--overwrite_cache" 34 | "--cache_dir cache_dir/" 35 | "--fp16" 36 | "--fp16_backend auto" 37 | ) 38 | 39 | # optional for logging 40 | # export WANDB_PROJECT="Question_answering_finetuning" 41 | # export WANDB_WATCH=false 42 | # export WANDB_MODE="dryrun" 43 | export WANDB_DISABLED=true 44 | 45 | python ./question_answering.py \ 46 | --model_name_or_path $model_name \ 47 | --output_dir $output_dir \ 48 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 49 | --max_seq_length $MAX_SEQUENCE_LENGTH \ 50 | --doc_stride $doc_stride --n_best_size $n_best_size --max_answer_length $max_answer_length \ 51 | --seed $seed --overwrite_output_dir --do_predict \ 52 | $(echo -n ${input_settings[@]}) \ 53 | $(echo ${optional_arguments[@]}) 54 | 55 | -------------------------------------------------------------------------------- /question_answering/README.md: -------------------------------------------------------------------------------- 1 | ## Data format 2 | 3 | The finetuning script supports only `json` as the input file format. The input file structure should be the same as that of standard QA datasets like [SQuAD v2.0](https://rajpurkar.github.io/SQuAD-explorer/). 4 | 5 | ## Training & Evaluation 6 | 7 | To see a list of all available options, do `python question_answering.py -h`. There are two ways to provide input data files to the script: 8 | 9 | * with the flag `--dataset_dir <path/to/data/directory>`, where `<path/to/data/directory>` points to the directory containing files with the prefixes `train`, `validation` and `test`. 10 | * with the flags `--train_file <path/to/train/file>` / `--validation_file <path/to/validation/file>` / `--test_file <path/to/test/file>`. 11 | 12 | For the following commands, we are going to use `--dataset_dir` to provide the input files.
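Before running the commands below, it may help to see the expected file structure. The sketch that follows is a minimal SQuAD-v2.0-style `train.json`; the title, context, question, and answer strings are placeholder values, not taken from the released datasets:

```json
{
  "version": "v2.0",
  "data": [
    {
      "title": "<article title>",
      "paragraphs": [
        {
          "context": "<passage containing the answer>",
          "qas": [
            {
              "id": "<unique question id>",
              "question": "<question text>",
              "answers": [{"text": "<answer span from the context>", "answer_start": 0}],
              "is_impossible": false
            }
          ]
        }
      ]
    }
  ]
}
```

As in SQuAD v2.0, unanswerable questions use the same layout with an empty `answers` list and `"is_impossible": true`.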
13 | 14 | 15 | ### Finetuning 16 | For finetuning on a single GPU, a minimal example is as follows: 17 | 18 | ```bash 19 | $ python ./question_answering.py \ 20 | --model_name_or_path "csebuetnlp/banglabert" \ 21 | --dataset_dir "sample_inputs/" \ 22 | --output_dir "outputs/" \ 23 | --learning_rate=2e-5 \ 24 | --warmup_ratio 0.1 \ 25 | --gradient_accumulation_steps 2 \ 26 | --weight_decay 0.1 \ 27 | --lr_scheduler_type "linear" \ 28 | --per_device_train_batch_size=16 \ 29 | --per_device_eval_batch_size=16 \ 30 | --max_seq_length 512 \ 31 | --logging_strategy "epoch" \ 32 | --save_strategy "epoch" \ 33 | --evaluation_strategy "epoch" \ 34 | --num_train_epochs=3 \ 35 | --do_train --do_eval 36 | ``` 37 | For a detailed example, refer to **[trainer.sh](trainer.sh).** 38 | 39 | 40 | ### Evaluation 41 | * To calculate metrics on the test set / run inference on raw data, use the following snippet: 42 | 43 | ```bash 44 | $ python ./question_answering.py \ 45 | --model_name_or_path <path/to/trained/model> \ 46 | --dataset_dir "sample_inputs/" \ 47 | --output_dir "outputs/" \ 48 | --per_device_eval_batch_size=16 \ 49 | --overwrite_output_dir \ 50 | --do_predict 51 | ``` 52 | For a detailed example, refer to **[evaluate.sh](evaluate.sh).** -------------------------------------------------------------------------------- /token_classification/README.md: -------------------------------------------------------------------------------- 1 | ## Data format 2 | 3 | The finetuning script supports only `jsonl` (one JSON object per line) as the input file format. By default, the script expects the following key names: 4 | 5 | * `tokens` - List of input tokens 6 | * `tags` - Classification labels / tags for each token 7 | 8 | 9 | You can specify custom key names using the flags `--tokens_key <key name>` and `--tags_key <key name>` to `token_classification.py`. To view sample input files, see the files **[here](sample_inputs/).** 10 | 11 | ## Training & Evaluation 12 | 13 | To see a list of all available options, do `python token_classification.py -h`. There are two ways to provide input data files to the script: 14 | 15 | * with the flag `--dataset_dir <path/to/data/directory>`, where `<path/to/data/directory>` points to the directory containing files with the prefixes `train`, `validation` and `test`. 16 | * with the flags `--train_file <path/to/train/file>` / `--validation_file <path/to/validation/file>` / `--test_file <path/to/test/file>`. 17 | 18 | For the following commands, we are going to use `--dataset_dir` to provide the input files.
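As a reference for the `jsonl` format described above, each input line pairs a `tokens` list with a `tags` list of equal length. The line below is copied from the bundled [sample inputs](sample_inputs/); unlabeled test files simply omit the `tags` key (see `sample_test_without_tags.jsonl`):

```json
{"tokens": ["জেলা", "ক্রীড়া", "সংস্থার", "সাধারণ", "সম্পাদক", "ওবায়দুর", "রহমান", "খান"], "tags": ["B-LOC", "B-PER", "I-PER", "I-PER", "I-PER", "B-PER", "I-PER", "I-PER"]}
```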
19 | 20 | 21 | ### Finetuning 22 | For finetuning on a single GPU, a minimal example is as follows: 23 | 24 | ```bash 25 | $ python ./token_classification.py \ 26 | --model_name_or_path "csebuetnlp/banglabert" \ 27 | --dataset_dir "sample_inputs/" \ 28 | --output_dir "outputs/" \ 29 | --learning_rate=2e-5 \ 30 | --warmup_ratio 0.1 \ 31 | --gradient_accumulation_steps 2 \ 32 | --weight_decay 0.1 \ 33 | --lr_scheduler_type "linear" \ 34 | --per_device_train_batch_size=16 \ 35 | --per_device_eval_batch_size=16 \ 36 | --max_seq_length 512 \ 37 | --logging_strategy "epoch" \ 38 | --save_strategy "epoch" \ 39 | --evaluation_strategy "epoch" \ 40 | --num_train_epochs=3 \ 41 | --do_train --do_eval 42 | ``` 43 | For a detailed example, refer to **[trainer.sh](trainer.sh).** 44 | 45 | 46 | ### Evaluation 47 | * To calculate metrics on the test set / run inference on raw data, use the following snippet: 48 | 49 | ```bash 50 | $ python ./token_classification.py \ 51 | --model_name_or_path <path/to/trained/model> \ 52 | --dataset_dir "sample_inputs/" \ 53 | --output_dir "outputs/" \ 54 | --per_device_eval_batch_size=16 \ 55 | --overwrite_output_dir \ 56 | --do_predict 57 | ``` 58 | For a detailed example, refer to **[evaluate.sh](evaluate.sh).** -------------------------------------------------------------------------------- /token_classification/trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # training settings 4 | export num_train_epochs=3 5 | export save_strategy="epoch" 6 | export logging_strategy="epoch" 7 | 8 | # validation settings 9 | export evaluation_strategy="epoch" 10 | 11 | # model settings 12 | export model_name="csebuetnlp/banglabert" 13 | 14 | # optimization settings 15 | export learning_rate=2e-5 16 | export warmup_ratio=0.1 17 | export gradient_accumulation_steps=2 18 | export weight_decay=0.01 19 | export lr_scheduler_type="linear" 20 | 21 | # misc.
settings 22 | export seed=1234 23 | 24 | # input settings 25 | # exactly one of `dataset_dir` or the (train / validation) 26 | # dataset files need to be provided 27 | input_settings=( 28 | "--dataset_dir sample_inputs/" 29 | # "--train_file sample_inputs/train.jsonl" 30 | # "--validation_file sample_inputs/validation.jsonl" 31 | ) 32 | 33 | # output settings 34 | export output_dir="outputs/" 35 | 36 | # batch / sequence sizes 37 | export PER_DEVICE_TRAIN_BATCH_SIZE=16 38 | export PER_DEVICE_EVAL_BATCH_SIZE=16 39 | export MAX_SEQUENCE_LENGTH=512 40 | 41 | # optional arguments 42 | optional_arguments=( 43 | "--metric_for_best_model weighted_avg_f1" 44 | "--greater_is_better true" # this should be commented out if the reverse is required 45 | "--load_best_model_at_end" 46 | "--logging_first_step" 47 | "--overwrite_cache" 48 | "--cache_dir cache_dir/" 49 | "--fp16" 50 | "--fp16_backend auto" 51 | ) 52 | 53 | # optional for logging 54 | # export WANDB_PROJECT="Token_classification_finetuning" 55 | # export WANDB_WATCH=false 56 | # export WANDB_MODE="dryrun" 57 | export WANDB_DISABLED=true 58 | 59 | python ./token_classification.py \ 60 | --model_name_or_path $model_name \ 61 | --output_dir $output_dir \ 62 | --learning_rate=$learning_rate --warmup_ratio $warmup_ratio --gradient_accumulation_steps $gradient_accumulation_steps \ 63 | --weight_decay $weight_decay --lr_scheduler_type $lr_scheduler_type \ 64 | --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 65 | --max_seq_length $MAX_SEQUENCE_LENGTH --logging_strategy $logging_strategy \ 66 | --seed $seed --overwrite_output_dir \ 67 | --num_train_epochs=$num_train_epochs --save_strategy $save_strategy \ 68 | --evaluation_strategy $evaluation_strategy --do_train --do_eval \ 69 | $(echo -n ${input_settings[@]}) \ 70 | $(echo ${optional_arguments[@]}) 71 | 72 | -------------------------------------------------------------------------------- /sequence_classification/trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # training settings 4 | export num_train_epochs=3 5 | export save_strategy="epoch" 6 | export logging_strategy="epoch" 7 | 8 | # validation settings 9 | export evaluation_strategy="epoch" 10 | 11 | # model settings 12 | export model_name="csebuetnlp/banglabert" 13 | 14 | # optimization settings 15 | export learning_rate=2e-5 16 | export warmup_ratio=0.1 17 | export gradient_accumulation_steps=2 18 | export weight_decay=0.01 19 | export lr_scheduler_type="linear" 20 | 21 | # misc. 
settings 22 | export seed=1234 23 | 24 | # input settings 25 | # exactly one of `dataset_dir` or the (train / validation) 26 | # dataset files need to be provided 27 | input_settings=( 28 | "--dataset_dir sample_inputs/single_sequence/jsonl" 29 | # "--train_file sample_inputs/single_sequence/jsonl/train.jsonl" 30 | # "--validation_file sample_inputs/single_sequence/jsonl/validation.jsonl" 31 | ) 32 | 33 | 34 | # output settings 35 | export output_dir="outputs/" 36 | 37 | # batch / sequence sizes 38 | export PER_DEVICE_TRAIN_BATCH_SIZE=16 39 | export PER_DEVICE_EVAL_BATCH_SIZE=16 40 | export MAX_SEQUENCE_LENGTH=512 41 | 42 | # optional arguments 43 | optional_arguments=( 44 | "--metric_for_best_model accuracy" 45 | "--greater_is_better true" # this should be commented out if the reverse is required 46 | "--load_best_model_at_end" 47 | "--logging_first_step" 48 | "--overwrite_cache" 49 | "--cache_dir cache_dir/" 50 | "--fp16" 51 | "--fp16_backend auto" 52 | "--do_predict" 53 | ) 54 | 55 | # optional for logging 56 | # export WANDB_PROJECT="Sequence_classification_finetuning" 57 | # export WANDB_WATCH=false 58 | # export WANDB_MODE="dryrun" 59 | export WANDB_DISABLED=true 60 | 61 | python ./sequence_classification.py \ 62 | --model_name_or_path $model_name \ 63 | --output_dir $output_dir \ 64 | --learning_rate=$learning_rate --warmup_ratio $warmup_ratio --gradient_accumulation_steps $gradient_accumulation_steps \ 65 | --weight_decay $weight_decay --lr_scheduler_type $lr_scheduler_type \ 66 | --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 67 | --max_seq_length $MAX_SEQUENCE_LENGTH --logging_strategy $logging_strategy \ 68 | --seed $seed --overwrite_output_dir \ 69 | --num_train_epochs=$num_train_epochs --save_strategy $save_strategy \ 70 | --evaluation_strategy $evaluation_strategy --do_train --do_eval \ 71 | $(echo -n ${input_settings[@]}) \ 72 | $(echo ${optional_arguments[@]}) 73 | 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # tests and logs 12 | tests/fixtures/cached_*_text.txt 13 | logs/ 14 | lightning_logs/ 15 | lang_code_data/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | 124 | # vscode 125 | .vs 126 | .vscode 127 | 128 | # Pycharm 129 | .idea 130 | 131 | # TF code 132 | tensorflow_code 133 | 134 | # Models 135 | proc_data 136 | 137 | # examples 138 | runs 139 | /runs_old 140 | /wandb 141 | /examples/runs 142 | /examples/**/*.args 143 | /examples/rag/sweep 144 | 145 | # data 146 | /data 147 | serialization_dir 148 | 149 | # emacs 150 | *.*~ 151 | debug.env 152 | 153 | # vim 154 | .*.swp 155 | 156 | #ctags 157 | tags 158 | 159 | # pre-commit 160 | .pre-commit* 161 | 162 | # .lock 163 | *.lock -------------------------------------------------------------------------------- /question_answering/trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # training settings 4 | export num_train_epochs=10 5 | export save_strategy="epoch" 6 | export logging_strategy="epoch" 7 | 8 | # validation settings 9 | export evaluation_strategy="epoch" 10 | 11 | # model settings 12 | export model_name="csebuetnlp/banglabert" 13 | 14 | # optimization settings 15 | export learning_rate=2e-5 16 | export warmup_ratio=0.1 17 | export gradient_accumulation_steps=16 18 | export weight_decay=0.01 19 | export lr_scheduler_type="linear" 20 | 21 | # qa specific settings 22 | export doc_stride=256 23 | export n_best_size=30 24 | export max_answer_length=30 25 | 26 | # misc. 
settings 27 | export seed=1234 28 | 29 | # input settings 30 | # exactly one of `dataset_dir` or the (train / validation) 31 | # dataset files needs to be provided 32 | input_settings=( 33 | "--dataset_dir sample_inputs/" 34 | # "--train_file sample_inputs/train.json" 35 | # "--validation_file sample_inputs/validation.json" 36 | ) 37 | 38 | # output settings 39 | export output_dir="outputs/" 40 | 41 | # batch / sequence sizes 42 | export PER_DEVICE_TRAIN_BATCH_SIZE=2 43 | export PER_DEVICE_EVAL_BATCH_SIZE=2 44 | export MAX_SEQUENCE_LENGTH=512 45 | 46 | # optional arguments 47 | optional_arguments=( 48 | "--allow_null_ans" 49 | "--null_score_diff_threshold 0.0" 50 | "--metric_for_best_model f1" 51 | "--greater_is_better true" # this should be commented out if the reverse is required 52 | "--load_best_model_at_end" 53 | "--logging_first_step" 54 | "--overwrite_cache" 55 | "--cache_dir cache_dir/" 56 | "--fp16" 57 | "--fp16_backend auto" 58 | "--do_predict" 59 | ) 60 | 61 | # optional for logging 62 | # export WANDB_PROJECT="Question_answering_finetuning" 63 | # export WANDB_WATCH=false 64 | # export WANDB_MODE="dryrun" 65 | export WANDB_DISABLED=true 66 | 67 | python ./question_answering.py \ 68 | --model_name_or_path $model_name \ 69 | --output_dir $output_dir \ 70 | --learning_rate=$learning_rate --warmup_ratio $warmup_ratio --gradient_accumulation_steps $gradient_accumulation_steps \ 71 | --weight_decay $weight_decay --lr_scheduler_type $lr_scheduler_type \ 72 | --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 73 | --max_seq_length $MAX_SEQUENCE_LENGTH --logging_strategy $logging_strategy \ 74 | --doc_stride $doc_stride --n_best_size $n_best_size --max_answer_length $max_answer_length \ 75 | --seed $seed --overwrite_output_dir \ 76 | --num_train_epochs=$num_train_epochs --save_strategy $save_strategy \ 77 | --evaluation_strategy $evaluation_strategy --do_train --do_eval \ 78 | $(echo -n ${input_settings[@]}) \ 79 | $(echo ${optional_arguments[@]}) 80 | 81 | -------------------------------------------------------------------------------- /sequence_classification/README.md: -------------------------------------------------------------------------------- 1 | ## Data format 2 | 3 | The finetuning script supports the following input file formats: `csv`, `tsv` and `jsonl` (one JSON object per line). By default, the script expects the following column names (for `tsv`, `csv`) / key names (for `jsonl`): 4 | 5 | * For single sequence classification: 6 | * `sentence1` - Input sequence 7 | * `label` - Classification label (Optional for `test` files) 8 | 9 | * For double sequence classification: 10 | * `sentence1` - First input sequence 11 | * `sentence2` - Second input sequence 12 | * `label` - Classification label (Optional for `test` files) 13 | 14 | You can specify custom column / key names using the flags `--sentence1_key <key name>`, `--sentence2_key <key name>` and `--label_key <key name>` to `sequence_classification.py`. To view sample input files for all supported formats, see the files **[here](sample_inputs/)**; a couple of `jsonl` example lines are also shown below. 15 | 16 | ## Training & Evaluation 17 | 18 | To see a list of all available options, do `python sequence_classification.py -h`. There are two ways to provide input data files to the script: 19 | 20 | * with the flag `--dataset_dir <path/to/data/directory>`, where `<path/to/data/directory>` points to the directory containing files with the prefixes `train`, `validation` and `test`. 21 | * with the flags `--train_file <path/to/train/file>` / `--validation_file <path/to/validation/file>` / `--test_file <path/to/test/file>`.
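For reference, here is what a single line of the bundled `jsonl` sample inputs looks like in each mode (both lines are copied verbatim from [sample_inputs/](sample_inputs/); the `label` key is omitted in unlabeled test files):

```json
{"sentence1": "সবার জন্য উন্মুক্ত।", "label": "pos"}
{"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "সে কোন কথা বলেনি।", "label": "contradiction"}
```

The first line follows the single-sequence schema; the second adds `sentence2` for double-sequence tasks such as NLI.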
22 | 23 | For the following commands, we are going to use `--dataset_dir` to provide the input files. 24 | 25 | 26 | ### Finetuning 27 | For finetuning on a single GPU, a minimal example is as follows: 28 | 29 | ```bash 30 | $ python ./sequence_classification.py \ 31 | --model_name_or_path "csebuetnlp/banglabert" \ 32 | --dataset_dir "sample_inputs/single_sequence/jsonl" \ 33 | --output_dir "outputs/" \ 34 | --learning_rate=2e-5 \ 35 | --warmup_ratio 0.1 \ 36 | --gradient_accumulation_steps 2 \ 37 | --weight_decay 0.1 \ 38 | --lr_scheduler_type "linear" \ 39 | --per_device_train_batch_size=16 \ 40 | --per_device_eval_batch_size=16 \ 41 | --max_seq_length 512 \ 42 | --logging_strategy "epoch" \ 43 | --save_strategy "epoch" \ 44 | --evaluation_strategy "epoch" \ 45 | --num_train_epochs=3 \ 46 | --do_train --do_eval 47 | ``` 48 | For a detailed example, refer to **[trainer.sh](trainer.sh).** 49 | 50 | 51 | ### Evaluation 52 | * To calculate metrics on the test set / run inference on raw data, use the following snippet: 53 | 54 | ```bash 55 | $ python ./sequence_classification.py \ 56 | --model_name_or_path <path/to/trained/model> \ 57 | --dataset_dir "sample_inputs/single_sequence/jsonl" \ 58 | --output_dir "outputs/" \ 59 | --per_device_eval_batch_size=16 \ 60 | --overwrite_output_dir \ 61 | --do_predict 62 | ``` 63 | For a detailed example, refer to **[evaluate.sh](evaluate.sh).** -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BanglaBERT 2 | 3 | This repository contains the official release of the model **"BanglaBERT"** and associated downstream fine-tuning code and datasets introduced in the paper titled [**"BanglaBERT: Language Model Pretraining and Benchmarks for 4 | Low-Resource Language Understanding Evaluation in Bangla"**](https://aclanthology.org/2022.findings-naacl.98/) published in *Findings of the Association for Computational Linguistics: NAACL 2022*. 5 | 6 | ## Updates 7 | * We have released [BanglaBERT (small)](https://huggingface.co/csebuetnlp/banglabert_small). It can be fine-tuned with as little as 4 GB VRAM! 8 | * We have released a large variant of BanglaBERT! Have a look [here](https://huggingface.co/csebuetnlp/banglabert_large). 9 | * The Bangla2B+ pretraining corpus is now available upon request! See [here](#datasets). 10 | 11 | ## Table of Contents 12 | 13 | - [BanglaBERT](#banglabert) 14 | - [Table of Contents](#table-of-contents) 15 | - [Models](#models) 16 | - [Datasets](#datasets) 17 | - [Setup](#setup) 18 | - [Training & Evaluation](#training--evaluation) 19 | - [Benchmarks](#benchmarks) 20 | - [Acknowledgements](#acknowledgements) 21 | - [License](#license) 22 | - [Citation](#citation) 23 | 24 | ## Models 25 | 26 | The pretrained model checkpoints are available at the [Huggingface model hub](https://huggingface.co/csebuetnlp). 27 | 28 | - [**BanglaBERT**](https://huggingface.co/csebuetnlp/banglabert) 29 | - [**BanglishBERT**](https://huggingface.co/csebuetnlp/banglishbert) 30 | - [**BanglaBERT (small)**](https://huggingface.co/csebuetnlp/banglabert_small) 31 | - [**BanglaBERT (large)**](https://huggingface.co/csebuetnlp/banglabert_large) 32 | 33 | To use these models for the supported downstream tasks in this repository, see **[Training & Evaluation](#training--evaluation).** 34 | 35 | ***Note:*** These models were pretrained using a ***specific normalization pipeline*** available **[here](https://github.com/csebuetnlp/normalizer)**.
All finetuning scripts in this repository use this normalization by default. If you need to adapt the pretrained model for a different task, make sure ***the text units are normalized using this pipeline before tokenizing*** to get the best results. A basic example is available at the **[model page](https://huggingface.co/csebuetnlp/banglabert).** 36 | 37 | ## Datasets 38 | 39 | We are also releasing the Bangla Natural Language Inference (NLI) and Bangla Question Answering (QA) datasets introduced in the paper. 40 | - [**NLI**](https://huggingface.co/datasets/csebuetnlp/xnli_bn) 41 | - [**QA**](https://huggingface.co/datasets/csebuetnlp/squad_bn) 42 | 43 | Please fill out this [**Google Form**](https://forms.gle/qiEW8f7i6Bw3FmmQA) to request access to the Bangla2B+ pretraining corpus. 44 | 45 | ## Setup 46 | 47 | For installing the necessary requirements, use the following bash snippet: 48 | ```bash 49 | $ git clone https://github.com/csebuetnlp/banglabert 50 | $ cd banglabert/ 51 | $ conda create python==3.7.9 pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.0 cudatoolkit=10.2 -c pytorch -p ./env 52 | $ conda activate ./env # or source activate ./env (for older versions of anaconda) 53 | $ bash setup.sh 54 | ``` 55 | * Use the newly created environment for running the scripts in this repository. 56 | 57 | ## Training & Evaluation 58 | 59 | To use the pretrained models for finetuning / inference on different downstream tasks, see the following sections: 60 | 61 | - **[Sequence Classification](sequence_classification/).** 62 | - For single sequence classification such as 63 | - Document classification 64 | - Sentiment classification 65 | - Emotion classification etc. 66 | - For double sequence classification such as 67 | - Natural Language Inference (NLI) 68 | - Paraphrase detection etc. 69 | - **[Token Classification](token_classification/).** 70 | - For token tagging / classification tasks such as 71 | - Named Entity Recognition (NER) 72 | - Parts of Speech Tagging (PoS) etc.
73 | - **[Question Answering](question_answering/).** 74 | - For tasks such as, 75 | - Extractive Question Answering 76 | - Open-domain Question Answering 77 | 78 | 79 | ## Benchmarks 80 | 81 | * Zero-shot cross-lingual transfer-learning 82 | 83 | | Model | Params | SC (macro-F1) | NLI (accuracy) | NER (micro-F1) | QA (EM/F1) | BangLUE score | 84 | |----------------|-----------|-----------|-----------|-----------|-----------|-----------| 85 | |[mBERT](https://huggingface.co/bert-base-multilingual-cased) | 180M | 27.05 | 62.22 | 39.27 | 59.01/64.18 | 50.35 | 86 | |[XLM-R (base)](https://huggingface.co/xlm-roberta-base) | 270M | 42.03 | 72.18 | 45.37 | 55.03/61.83 | 55.29 | 87 | |[XLM-R (large)](https://huggingface.co/xlm-roberta-large) | 550M | 49.49 | 78.13 | 56.48 | 71.13/77.70 | 66.59 | 88 | |[BanglishBERT](https://huggingface.co/csebuetnlp/banglishbert) | 110M | 48.39 | 75.26 | 55.56 | 72.87/78.63 | 66.14 | 89 | 90 | * Supervised fine-tuning 91 | 92 | | Model | Params | SC (macro-F1) | NLI (accuracy) | NER (micro-F1) | QA (EM/F1) | BangLUE score | 93 | |----------------|-----------|-----------|-----------|-----------|-----------|-----------| 94 | |[mBERT](https://huggingface.co/bert-base-multilingual-cased) | 180M | 67.59 | 75.13 | 68.97 | 67.12/72.64 | 70.29 | 95 | |[XLM-R (base)](https://huggingface.co/xlm-roberta-base) | 270M | 69.54 | 78.46 | 73.32 | 68.09/74.27 | 72.82 | 96 | |[XLM-R (large)](https://huggingface.co/xlm-roberta-large) | 550M | 70.97 | 82.40 | 78.39 | 73.15/79.06 | 76.79 | 97 | |[sahajBERT](https://huggingface.co/neuropark/sahajBERT) | 18M | 71.12 | 76.92 | 70.94 | 65.48/70.69 | 71.03 | 98 | |[BanglishBERT](https://huggingface.co/csebuetnlp/banglishbert) | 110M | 70.61 | 80.95 | 76.28 | 72.43/78.40 | *75.73* | 99 | |[BanglaBERT (small)](https://huggingface.co/csebuetnlp/banglabert_small) | 13M | 69.29 | 76.75 | 73.41 | 63.30/69.65 | *70.38* | 100 | |[BanglaBERT](https://huggingface.co/csebuetnlp/banglabert) | 110M | 72.89 | 82.80 | 77.78 | 72.63/79.34 | *77.09* | 101 | |[BanglaBERT (large)](https://huggingface.co/csebuetnlp/banglabert_large) | 335M | 71.94 | 83.41 | 79.20 | 76.10/81.50 | **78.43** | 102 | 103 | 104 | The benchmarking datasets are as follows: 105 | * **SC:** **[Sentiment Classification](https://aclanthology.org/2021.findings-emnlp.278)** 106 | * **NER:** **[Named Entity Recognition](https://multiconer.github.io/competition)** 107 | * **NLI:** **[Natural Language Inference](#datasets)** 108 | * **QA:** **[Question Answering](#datasets)** 109 | 110 | ## Acknowledgements 111 | 112 | We would like to thank [Intelligent Machines](https://bd.linkedin.com/company/intelligentmachines) and [Google TFRC Program](https://sites.research.google/trc/) for providing cloud support for pretraining the models. 113 | 114 | 115 | ## License 116 | Contents of this repository are restricted to non-commercial research purposes only under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). 
117 |
118 | Creative Commons License
119 |
120 | ## Citation
121 | If you use any of the datasets, models, or code modules, please cite the following paper:
122 | ```
123 | @inproceedings{bhattacharjee-etal-2022-banglabert,
124 | title = "{B}angla{BERT}: Language Model Pretraining and Benchmarks for Low-Resource Language Understanding Evaluation in {B}angla",
125 | author = "Bhattacharjee, Abhik and
126 | Hasan, Tahmid and
127 | Ahmad, Wasi and
128 | Mubasshir, Kazi Samin and
129 | Islam, Md Saiful and
130 | Iqbal, Anindya and
131 | Rahman, M. Sohel and
132 | Shahriyar, Rifat",
133 | booktitle = "Findings of the Association for Computational Linguistics: NAACL 2022",
134 | month = jul,
135 | year = "2022",
136 | address = "Seattle, United States",
137 | publisher = "Association for Computational Linguistics",
138 | url = "https://aclanthology.org/2022.findings-naacl.98",
139 | pages = "1318--1327",
140 | abstract = "In this work, we introduce BanglaBERT, a BERT-based Natural Language Understanding (NLU) model pretrained in Bangla, a widely spoken yet low-resource language in the NLP literature. To pretrain BanglaBERT, we collect 27.5 GB of Bangla pretraining data (dubbed {`}Bangla2B+{'}) by crawling 110 popular Bangla sites. We introduce two downstream task datasets on natural language inference and question answering and benchmark on four diverse NLU tasks covering text classification, sequence labeling, and span prediction. In the process, we bring them under the first-ever Bangla Language Understanding Benchmark (BLUB). BanglaBERT achieves state-of-the-art results outperforming multilingual and monolingual models. We are making the models, datasets, and a leaderboard publicly available at \url{https://github.com/csebuetnlp/banglabert} to advance Bangla NLP.",
141 | }
142 | ```
143 | -------------------------------------------------------------------------------- /sequence_classification/sequence_classification.py: --------------------------------------------------------------------------------
1 | # Adapted from huggingface transformers classification scripts
2 |
3 | import logging
4 | import os
5 | import random
6 | import sys
7 | from dataclasses import dataclass, field
8 | from typing import Optional
9 |
10 | import glob
11 |
12 | import datasets
13 | import numpy as np
14 | from datasets import load_metric
15 | from datasets.io.json import JsonDatasetReader
16 | from datasets.io.csv import CsvDatasetReader
17 |
18 | import transformers
19 | from transformers import (
20 | AutoConfig,
21 | AutoModelForSequenceClassification,
22 | AutoTokenizer,
23 | DataCollatorWithPadding,
24 | EvalPrediction,
25 | HfArgumentParser,
26 | PretrainedConfig,
27 | Trainer,
28 | TrainingArguments,
29 | default_data_collator,
30 | set_seed,
31 | )
32 | from transformers.trainer_utils import get_last_checkpoint
33 | from transformers.utils import check_min_version
34 | from transformers.utils.versions import require_version
35 | from normalizer import normalize
36 |
37 | EXT2CONFIG = {
38 | "csv" : (CsvDatasetReader, {}),
39 | "tsv" : (CsvDatasetReader, {"sep": "\t"}),
40 | "jsonl": (JsonDatasetReader, {}),
41 | "json": (JsonDatasetReader, {})
42 | }
43 |
44 | logger = logging.getLogger(__name__)
45 |
46 |
47 | @dataclass
48 | class DataTrainingArguments:
49 |
50 | dataset_dir: Optional[str] = field(
51 | default=None, metadata={
52 | "help": "Path to the directory containing the data files. (.csv / .tsv / .jsonl)"
53 | "File datatypes will be identified with their prefix names as follows: "
54 | "`train`- Training file(s) e.g. `train.csv`/ `train_part1.csv` etc. "
55 | "`validation`- Evaluation file(s) e.g. `validation.csv`/ `validation_part1.csv` etc. "
56 | "`test`- Test file(s) e.g. `test.csv`/ `test_part1.csv` etc. "
57 | "All files must have the same extension."
58 | }
59 | )
60 | max_seq_length: int = field(
61 | default=512,
62 | metadata={
63 | "help": "The maximum total input sequence length after tokenization. Sequences longer "
64 | "than this will be truncated, sequences shorter will be padded."
65 | },
66 | )
67 | overwrite_cache: bool = field(
68 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
69 | )
70 | pad_to_max_length: bool = field(
71 | default=False,
72 | metadata={
73 | "help": "Whether to pad all samples to `max_seq_length`. "
74 | "If False, will pad the samples dynamically when batching to the maximum length in the batch."
75 | },
76 | )
77 | max_train_samples: Optional[int] = field(
78 | default=None,
79 | metadata={
80 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
81 | "value if set."
82 | },
83 | )
84 | max_eval_samples: Optional[int] = field(
85 | default=None,
86 | metadata={
87 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
88 | "value if set."
89 | },
90 | )
91 | max_predict_samples: Optional[int] = field(
92 | default=None,
93 | metadata={
94 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
95 | "value if set."
96 | },
97 | )
98 | train_file: Optional[str] = field(
99 | default=None, metadata={"help": "A csv / tsv / jsonl file containing the training data."}
100 | )
101 | validation_file: Optional[str] = field(
102 | default=None, metadata={"help": "A csv / tsv / jsonl file containing the validation data."}
103 | )
104 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv / tsv / jsonl file containing the test data."})
105 | do_normalize: Optional[bool] = field(default=True, metadata={"help": "Normalize text before feeding to the model."})
106 | unicode_norm: Optional[str] = field(default="NFKC", metadata={"help": "Type of unicode normalization"})
107 | remove_punct: Optional[bool] = field(
108 | default=False, metadata={
109 | "help": "Remove punctuation during normalization. To replace with a custom token or apply selective replacement, "
110 | "use this repo (https://github.com/abhik1505040/normalizer) before feeding the data to the script."
111 | })
112 | remove_emoji: Optional[bool] = field(
113 | default=False, metadata={
114 | "help": "Remove emojis during normalization. To replace with a custom token or apply selective replacement, "
115 | "use this repo (https://github.com/abhik1505040/normalizer) before feeding the data to the script."
116 | })
117 | remove_urls: Optional[bool] = field(
118 | default=False, metadata={
119 | "help": "Remove urls during normalization. To replace with a custom token or apply selective replacement, "
120 | "use this repo (https://github.com/abhik1505040/normalizer) before feeding the data to the script."
121 | })
122 | sentence1_key: Optional[str] = field(
123 | default="sentence1", metadata={"help": "Key / column name in the input file corresponding to the first input sequence"}
124 | )
125 | sentence2_key: Optional[str] = field(
126 | default="sentence2", metadata={"help": "Key / column name in the input file corresponding to the second input sequence"}
127 | )
128 | label_key: Optional[str] = field(
129 | default="label", metadata={"help": "Key / column name in the input file corresponding to the classification label"}
130 | )
131 |
132 | def __post_init__(self):
133 | if self.train_file is not None and self.validation_file is not None:
134 | train_extension = self.train_file.split(".")[-1]
135 | assert train_extension in ["csv", "jsonl", "tsv", "json"], "`train_file` should be a csv / tsv / jsonl / json file."
136 | validation_extension = self.validation_file.split(".")[-1]
137 | assert (
138 | validation_extension == train_extension
139 | ), "`validation_file` should have the same extension (csv / tsv / jsonl / json) as `train_file`."
140 |
141 |
142 |
143 | @dataclass
144 | class ModelArguments:
145 |
146 | model_name_or_path: str = field(
147 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
148 | )
149 | cache_dir: Optional[str] = field(
150 | default=None,
151 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
152 | )
153 |
154 |
155 | def main():
156 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
157 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
158 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
159 | else:
160 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
161 |
162 | # Setup logging
163 | logging.basicConfig(
164 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
165 | datefmt="%m/%d/%Y %H:%M:%S",
166 | handlers=[logging.StreamHandler(sys.stdout)],
167 | )
168 |
169 | log_level = training_args.get_process_log_level()
170 | logger.setLevel(log_level)
171 | datasets.utils.logging.set_verbosity(log_level)
172 | transformers.utils.logging.set_verbosity(log_level)
173 | transformers.utils.logging.enable_default_handler()
174 | transformers.utils.logging.enable_explicit_format()
175 |
176 | # Log a short summary on each process:
177 | logger.warning(
178 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
179 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
180 | )
181 | logger.info(f"Training/evaluation parameters {training_args}")
182 |
183 | # Detecting last checkpoint.
184 | last_checkpoint = None
185 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
186 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
187 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
188 | raise ValueError(
189 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
190 | "Use --overwrite_output_dir to overcome."
191 | )
192 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
193 | logger.info(
194 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
195 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
196 | )
197 |
198 | # Set seed before initializing model.
199 | set_seed(training_args.seed)
200 | has_ext = lambda path: len(os.path.basename(path).split(".")) > 1
201 | get_ext = lambda path: os.path.basename(path).split(".")[-1]
202 |
203 | if data_args.dataset_dir is not None:
204 | data_files = {}
205 | all_files = glob.glob(
206 | os.path.join(
207 | data_args.dataset_dir,
208 | "*"
209 | )
210 | )
211 | all_exts = [get_ext(k) for k in all_files if has_ext(k)]
212 | if not all_exts:
213 | raise ValueError("The `dataset_dir` doesn't have any valid file.")
214 |
215 | selected_ext = max(set(all_exts), key=all_exts.count)
216 | for search_prefix in ["train", "validation", "test"]:
217 | found_files = glob.glob(
218 | os.path.join(
219 | data_args.dataset_dir,
220 | search_prefix + "*" + selected_ext
221 | )
222 | )
223 | if not found_files:
224 | continue
225 |
226 | data_files[search_prefix] = found_files
227 |
228 | else:
229 | data_files = {
230 | "train": data_args.train_file,
231 | "validation": data_args.validation_file,
232 | "test": data_args.test_file
233 | }
234 |
235 | data_files = {k: v for k, v in data_files.items() if v is not None}
236 |
237 | if not data_files:
238 | raise ValueError("No valid input file found.")
239 |
240 | selected_ext = get_ext(list(data_files.values())[0])
241 |
242 |
243 | dataset_configs = EXT2CONFIG[selected_ext]
244 | raw_datasets = dataset_configs[0](
245 | data_files,
246 | **dataset_configs[1]
247 | ).read()
248 |
249 | for data_type, ds in raw_datasets.items():
250 | assert data_args.sentence1_key in ds.features, f"Input files don't have the `{data_args.sentence1_key}` key"
251 | if data_type != "test":
252 | assert data_args.label_key in ds.features, f"Input files don't have the `{data_args.label_key}` key"
253 |
254 | ignored_columns = set(ds.column_names) - set([data_args.sentence1_key, data_args.sentence2_key, data_args.label_key])
255 | raw_datasets[data_type] = ds.remove_columns(ignored_columns)
256 |
257 |
258 | config = AutoConfig.from_pretrained(
259 | model_args.model_name_or_path,
260 | cache_dir=model_args.cache_dir,
261 | )
262 |
263 | label_to_id = config.label2id if config.task_specific_params and config.task_specific_params.get("finetuned", False) else None
264 | if label_to_id is None:
265 | label_list = raw_datasets["train"].unique(data_args.label_key)
266 | label_list.sort()
267 | num_labels = len(label_list)
268 | label_to_id = {v: i for i, v in enumerate(label_list)}
269 | config.label2id = label_to_id
270 | config.id2label = {id: label for label, id in config.label2id.items()}
271 | config.task_specific_params = {"finetuned": True}
272 | else:
273 | label_list = list(label_to_id.keys())
274 | num_labels = len(label_list)
275 |
276 | tokenizer = AutoTokenizer.from_pretrained(
277 | model_args.model_name_or_path,
278 | cache_dir=model_args.cache_dir,
279 | use_fast=False
280 | )
281 | model = AutoModelForSequenceClassification.from_pretrained(
282 | model_args.model_name_or_path,
283 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
284 | config=config,
285 | cache_dir=model_args.cache_dir
286 | )
287 |
288 |
289 | # Padding strategy
290 | if data_args.pad_to_max_length:
291 | padding = "max_length"
292 | else:
293 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch
294 | padding = False
295 |
296 | if data_args.max_seq_length > tokenizer.model_max_length:
297 | logger.warning(
298 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
299 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
300 | )
301 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
302 |
303 | if data_args.do_normalize:
304 | normalization_kwargs = {
305 | "unicode_norm": data_args.unicode_norm,
306 | "punct_replacement": " " if data_args.remove_punct else None,
307 | "url_replacement": " " if data_args.remove_urls else None,
308 | "emoji_replacement": " " if data_args.remove_emoji else None
309 | }
310 |
311 | def normalize_example(example):
312 | l = example[data_args.sentence1_key]
313 | example[data_args.sentence1_key] = normalize(l, **normalization_kwargs)
314 |
315 | if data_args.sentence2_key in example:
316 | l = example[data_args.sentence2_key]
317 | example[data_args.sentence2_key] = normalize(l, **normalization_kwargs)
318 |
319 | return example
320 |
321 | raw_datasets = raw_datasets.map(
322 | normalize_example,
323 | desc="Running normalization on dataset",
324 | load_from_cache_file=not data_args.overwrite_cache
325 | )
326 |
327 |
328 | def preprocess_function(examples):
329 | # Tokenize the texts
330 | args = (
331 | (examples[data_args.sentence1_key],) if data_args.sentence2_key not in examples else (examples[data_args.sentence1_key], examples[data_args.sentence2_key])
332 | )
333 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
334 |
335 | if label_to_id is not None and data_args.label_key in examples:
336 | result["label"] = [label_to_id[l] for l in examples[data_args.label_key]]
337 |
338 | return result
339 |
340 | with training_args.main_process_first(desc="dataset map pre-processing"):
341 | raw_datasets = raw_datasets.map(
342 | preprocess_function,
343 | batched=True,
344 | load_from_cache_file=not data_args.overwrite_cache,
345 | desc="Running tokenizer on dataset",
346 | )
347 | if training_args.do_train:
348 | if "train" not in raw_datasets:
349 | raise ValueError("--do_train requires a train dataset")
350 | train_dataset = raw_datasets["train"]
351 | if data_args.max_train_samples is not None:
352 | train_dataset = train_dataset.select(range(data_args.max_train_samples))
353 |
354 | if training_args.do_eval:
355 | if "validation" not in raw_datasets:
356 | raise ValueError("--do_eval requires a validation dataset")
357 | eval_dataset = raw_datasets["validation"]
358 | if data_args.max_eval_samples is not None:
359 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
360 |
361 | if training_args.do_predict or data_args.test_file is not None:
362 | if "test" not in raw_datasets:
363 | raise ValueError("--do_predict requires a test dataset")
364 | predict_dataset = raw_datasets["test"]
365 | if data_args.max_predict_samples is not None:
366 | predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
367 |
368 | # Log a few random samples from the training set:
369 | if training_args.do_train:
370 | for index in random.sample(range(len(train_dataset)), 3):
371 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
372 |
373 |
374 | metric_names = [
375 | "accuracy",
376 | "precision",
377 | "recall",
378 | "f1"
379 | ]
380 | required_metrics = [load_metric(k) for k in metric_names]
381 | average_required = metric_names[1:]
382 |
383 | def compute_metrics(p: EvalPrediction):
384 | results = {}
385 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
386 | preds = np.argmax(preds, axis=1)
387 |
388 | for m in required_metrics:
389 | kwargs = {"average": "macro"} if m.name in average_required else {}
390 | r = m.compute(
391 | predictions=preds,
392 | references=p.label_ids,
393 | **kwargs
394 | )
395 | for k, v in r.items():
396 | results[k] = v
397 |
398 | return results
399 |
400 | if data_args.pad_to_max_length:
401 | data_collator = default_data_collator
402 | elif training_args.fp16:
403 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
404 | else:
405 | data_collator = None
406 |
407 | # Initialize our Trainer
408 | trainer = Trainer(
409 | model=model,
410 | args=training_args,
411 | train_dataset=train_dataset if training_args.do_train else None,
412 | eval_dataset=eval_dataset if training_args.do_eval else None,
413 | compute_metrics=compute_metrics,
414 | tokenizer=tokenizer,
415 | data_collator=data_collator,
416 | )
417 |
418 | # Training
419 | if training_args.do_train:
420 | checkpoint = None
421 | if training_args.resume_from_checkpoint is not None:
422 | checkpoint = training_args.resume_from_checkpoint
423 | elif last_checkpoint is not None:
424 | checkpoint = last_checkpoint
425 | train_result = trainer.train(resume_from_checkpoint=checkpoint)
426 | metrics = train_result.metrics
427 | max_train_samples = (
428 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
429 | )
430 | metrics["train_samples"] = min(max_train_samples, len(train_dataset))
431 |
432 | trainer.save_model()
433 |
434 | trainer.log_metrics("train", metrics)
435 | trainer.save_metrics("train", metrics)
436 | trainer.save_state()
437 |
438 | # Evaluation
439 | if training_args.do_eval:
440 | logger.info("*** Evaluate ***")
441 |
442 | metrics = trainer.evaluate(eval_dataset=eval_dataset)
443 |
444 | max_eval_samples = (
445 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
446 | )
447 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
448 |
449 | trainer.log_metrics("eval", metrics)
450 | trainer.save_metrics("eval", metrics)
451 |
452 | if training_args.do_predict:
453 | logger.info("*** Predict ***")
454 |
455 | predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
456 | predictions = np.argmax(predictions, axis=1)
457 |
458 | trainer.log_metrics("predict", metrics)
459 | trainer.save_metrics("predict", metrics)
460 |
461 | output_predict_file = os.path.join(training_args.output_dir, "predictions.txt")
462 | if trainer.is_world_process_zero():
463 | with open(output_predict_file, "w") as writer:
464 | logger.info("***** Predict results *****")
465 | writer.write("index\tprediction\n")
466 | for index, item in enumerate(predictions):
467 | item = label_list[item]
468 | writer.write(f"{index}\t{item}\n")
469 |
470 |
471 |
472 | def _mp_fn(index):
473 | # For xla_spawn (TPUs)
474 | main()
475 |
476 |
477 | if __name__ == "__main__":
478 | main()
479 | -------------------------------------------------------------------------------- /token_classification/token_classification.py: --------------------------------------------------------------------------------
1 | # Adapted from huggingface transformers classification scripts
2 |
3 | import logging
4 | import os
5 | import random
6 | import sys
7 | from dataclasses import dataclass, field
8 | from typing import Optional
9 | from seqeval.metrics import classification_report, accuracy_score
10 |
11 | import glob
12 |
13 | import datasets
14 | import numpy as np
15 | from datasets import ClassLabel, load_metric
16 | from datasets.io.json import JsonDatasetReader
17 |
18 | import transformers
19 | from transformers import (
20 | AutoConfig,
21 | AutoModelForTokenClassification,
22 | AutoTokenizer,
23 | DataCollatorForTokenClassification,
24 | EvalPrediction,
25 | HfArgumentParser,
26 | PretrainedConfig,
27 | Trainer,
28 | TrainingArguments,
29 | default_data_collator,
30 | set_seed,
31 | )
32 | from transformers.trainer_utils import get_last_checkpoint
33 | from transformers.utils import check_min_version
34 | from transformers.utils.versions import require_version
35 | from normalizer import normalize
36 |
37 | EXT2CONFIG = {
38 | "jsonl": (JsonDatasetReader, {}),
39 | "json": (JsonDatasetReader, {})
40 | }
41 |
42 | logger = logging.getLogger(__name__)
43 |
44 |
45 | @dataclass
46 | class DataTrainingArguments:
47 |
48 | dataset_dir: Optional[str] = field(
49 | default=None, metadata={
50 | "help": "Path to the directory containing the data files. (.jsonl)"
51 | "File datatypes will be identified with their prefix names as follows: "
52 | "`train`- Training file(s) e.g. `train.jsonl`/ `train_part1.jsonl` etc. "
53 | "`validation`- Evaluation file(s) e.g. `validation.jsonl`/ `validation_part1.jsonl` etc. "
54 | "`test`- Test file(s) e.g. `test.jsonl`/ `test_part1.jsonl` etc. "
55 | "All files must have the same extension."
56 | }
57 | )
58 | max_seq_length: int = field(
59 | default=512,
60 | metadata={
61 | "help": "The maximum total input sequence length after tokenization. Sequences longer "
62 | "than this will be truncated, sequences shorter will be padded."
63 | },
64 | )
65 | overwrite_cache: bool = field(
66 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
67 | )
68 | pad_to_max_length: bool = field(
69 | default=False,
70 | metadata={
71 | "help": "Whether to pad all samples to `max_seq_length`. "
72 | "If False, will pad the samples dynamically when batching to the maximum length in the batch."
73 | },
74 | )
75 | max_train_samples: Optional[int] = field(
76 | default=None,
77 | metadata={
78 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
79 | "value if set."
80 | },
81 | )
82 | max_eval_samples: Optional[int] = field(
83 | default=None,
84 | metadata={
85 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
86 | "value if set."
87 | },
88 | )
89 | max_predict_samples: Optional[int] = field(
90 | default=None,
91 | metadata={
92 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
93 | "value if set."
94 | },
95 | )
96 | train_file: Optional[str] = field(
97 | default=None, metadata={"help": "A jsonl / json file containing the training data."}
98 | )
99 | validation_file: Optional[str] = field(
100 | default=None, metadata={"help": "A jsonl / json file containing the validation data."}
101 | )
102 | test_file: Optional[str] = field(default=None, metadata={"help": "A jsonl / json file containing the test data."})
103 | do_normalize: Optional[bool] = field(default=True, metadata={"help": "Normalize text before feeding to the model."})
104 | label_all_tokens: bool = field(
105 | default=False,
106 | metadata={
107 | "help": "Whether to put the label for one word on all tokens generated by that word or just on the "
108 | "first one (in which case the other tokens will have a padding index)."
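# Note: -100 is the index that PyTorch's cross-entropy loss ignores by default, so
# sub-word tokens that are not assigned their own label do not contribute to the loss.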
109 | },
110 | )
111 | tokens_key: Optional[str] = field(
112 | default="tokens", metadata={"help": "Key name in the input file corresponding to the tokens."}
113 | )
114 | tags_key: Optional[str] = field(
115 | default="tags", metadata={"help": "Key name in the input file corresponding to the token labels/tags."}
116 | )
117 |
118 | def __post_init__(self):
119 | if self.train_file is not None and self.validation_file is not None:
120 | train_extension = self.train_file.split(".")[-1]
121 | assert train_extension in ["jsonl", "json"], "`train_file` should be a jsonl / json file."
122 | validation_extension = self.validation_file.split(".")[-1]
123 | assert (
124 | validation_extension == train_extension
125 | ), "`validation_file` should have the same extension as `train_file`."
126 |
127 |
128 |
129 | @dataclass
130 | class ModelArguments:
131 |
132 | model_name_or_path: str = field(
133 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
134 | )
135 | cache_dir: Optional[str] = field(
136 | default=None,
137 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
138 | )
139 |
140 |
141 | def main():
142 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
143 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
144 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
145 | else:
146 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
147 |
148 | # Setup logging
149 | logging.basicConfig(
150 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
151 | datefmt="%m/%d/%Y %H:%M:%S",
152 | handlers=[logging.StreamHandler(sys.stdout)],
153 | )
154 |
155 | log_level = training_args.get_process_log_level()
156 | logger.setLevel(log_level)
157 | datasets.utils.logging.set_verbosity(log_level)
158 | transformers.utils.logging.set_verbosity(log_level)
159 | transformers.utils.logging.enable_default_handler()
160 | transformers.utils.logging.enable_explicit_format()
161 |
162 | # Log a short summary on each process:
163 | logger.warning(
164 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
165 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
166 | )
167 | logger.info(f"Training/evaluation parameters {training_args}")
168 |
169 | # Detecting last checkpoint.
170 | last_checkpoint = None
171 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
172 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
173 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
174 | raise ValueError(
175 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
176 | "Use --overwrite_output_dir to overcome."
177 | )
178 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
179 | logger.info(
180 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
181 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
182 | )
183 |
184 | # Set seed before initializing model.
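# (transformers' `set_seed` seeds the Python, NumPy and PyTorch RNGs, so repeated runs
# with the same seed are reproducible.)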
185 | set_seed(training_args.seed)
186 | has_ext = lambda path: len(os.path.basename(path).split(".")) > 1
187 | get_ext = lambda path: os.path.basename(path).split(".")[-1]
188 |
189 | if data_args.dataset_dir is not None:
190 | data_files = {}
191 | all_files = glob.glob(
192 | os.path.join(
193 | data_args.dataset_dir,
194 | "*"
195 | )
196 | )
197 | all_exts = [get_ext(k) for k in all_files if has_ext(k)]
198 | if not all_exts:
199 | raise ValueError("The `dataset_dir` doesn't have any valid file.")
200 |
201 | selected_ext = max(set(all_exts), key=all_exts.count)
202 | for search_prefix in ["train", "validation", "test"]:
203 | found_files = glob.glob(
204 | os.path.join(
205 | data_args.dataset_dir,
206 | search_prefix + "*" + selected_ext
207 | )
208 | )
209 | if not found_files:
210 | continue
211 |
212 | data_files[search_prefix] = found_files
213 |
214 | else:
215 | data_files = {
216 | "train": data_args.train_file,
217 | "validation": data_args.validation_file,
218 | "test": data_args.test_file
219 | }
220 |
221 | data_files = {k: v for k, v in data_files.items() if v is not None}
222 |
223 | if not data_files:
224 | raise ValueError("No valid input file found.")
225 |
226 | selected_ext = get_ext(list(data_files.values())[0])
227 |
228 |
229 | dataset_configs = EXT2CONFIG[selected_ext]
230 | raw_datasets = dataset_configs[0](
231 | data_files,
232 | **dataset_configs[1]
233 | ).read()
234 |
235 | for data_type, ds in raw_datasets.items():
236 | assert data_args.tokens_key in ds.features, f"Input files don't have the `{data_args.tokens_key}` key"
237 | if data_type != "test":
238 | assert data_args.tags_key in ds.features, f"Input files don't have the `{data_args.tags_key}` key"
239 |
240 | ignored_columns = set(ds.column_names) - set([data_args.tokens_key, data_args.tags_key])
241 | raw_datasets[data_type] = ds.remove_columns(ignored_columns)
242 |
243 | config = AutoConfig.from_pretrained(
244 | model_args.model_name_or_path,
245 | cache_dir=model_args.cache_dir,
246 | )
247 |
248 | label_to_id = config.label2id if config.task_specific_params and config.task_specific_params.get("finetuned", False) else None
249 | if label_to_id is None:
250 | def get_label_list(labels):
251 | unique_labels = set()
252 | for label in labels:
253 | unique_labels = unique_labels | set(label)
254 | label_list = list(unique_labels)
255 | label_list.sort()
256 | return label_list
257 |
258 | label_list = get_label_list(raw_datasets["train"][data_args.tags_key])
259 | num_labels = len(label_list)
260 | label_to_id = {v: i for i, v in enumerate(label_list)}
261 | config.label2id = label_to_id
262 | config.id2label = {id: label for label, id in config.label2id.items()}
263 | config.task_specific_params = {"finetuned": True}
264 | else:
265 | label_list = list(label_to_id.keys())
266 | num_labels = len(label_list)
267 |
268 | tokenizer_kwargs = {"add_prefix_space": True} if config.model_type in {"gpt2", "roberta"} else {}
269 | tokenizer = AutoTokenizer.from_pretrained(
270 | model_args.model_name_or_path,
271 | cache_dir=model_args.cache_dir,
272 | use_fast=True,
273 | **tokenizer_kwargs
274 | )
275 | model = AutoModelForTokenClassification.from_pretrained(
276 | model_args.model_name_or_path,
277 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
278 | config=config,
279 | cache_dir=model_args.cache_dir
280 | )
281 |
282 |
283 | # Padding strategy
284 | if data_args.pad_to_max_length:
285 | padding = "max_length"
286 | else:
287 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch
288 | padding = False
289 |
290 | if data_args.max_seq_length > tokenizer.model_max_length:
291 | logger.warning(
292 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
293 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
294 | )
295 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
296 |
297 | if data_args.do_normalize:
298 | def normalize_example(example):
299 | for i, token in enumerate(example[data_args.tokens_key]):
300 | normalized_token = normalize(token)
301 | if len(normalized_token) > 0:
302 | example[data_args.tokens_key][i] = normalized_token
303 |
304 | return example
305 |
306 | raw_datasets = raw_datasets.map(
307 | normalize_example,
308 | desc="Running normalization on dataset",
309 | load_from_cache_file=not data_args.overwrite_cache
310 | )
311 |
312 | # Tokenize all texts and align the labels with them.
313 | def tokenize_and_align_labels(examples):
314 |
315 | tokenized_inputs = tokenizer(
316 | examples[data_args.tokens_key],
317 | padding=padding,
318 | truncation=True,
319 | max_length=max_seq_length,
320 | is_split_into_words=True,
321 | )
322 | labels = []
323 | for i, label in enumerate(examples[data_args.tags_key]):
324 | word_ids = tokenized_inputs.word_ids(batch_index=i)
325 | previous_word_idx = None
326 | label_ids = []
327 | for word_idx in word_ids:
328 | if word_idx is None:
329 | label_ids.append(-100)
330 | elif word_idx != previous_word_idx:
331 | label_ids.append(label_to_id[label[word_idx]])
332 | else:
333 | label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
334 | previous_word_idx = word_idx
335 |
336 | labels.append(label_ids)
337 | tokenized_inputs["labels"] = labels
338 |
339 | return tokenized_inputs
340 |
341 | with training_args.main_process_first(desc="dataset map pre-processing"):
342 | raw_datasets = raw_datasets.map(
343 | tokenize_and_align_labels,
344 | batched=True,
345 | load_from_cache_file=not data_args.overwrite_cache,
346 | desc="Running tokenizer on dataset",
347 | )
348 | if training_args.do_train:
349 | if "train" not in raw_datasets:
350 | raise ValueError("--do_train requires a train dataset")
351 | train_dataset = raw_datasets["train"]
352 | if data_args.max_train_samples is not None:
353 | train_dataset = train_dataset.select(range(data_args.max_train_samples))
354 |
355 | if training_args.do_eval:
356 | if "validation" not in raw_datasets:
357 | raise ValueError("--do_eval requires a validation dataset")
358 | eval_dataset = raw_datasets["validation"]
359 | if data_args.max_eval_samples is not None:
360 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
361 |
362 | if training_args.do_predict or data_args.test_file is not None:
363 | if "test" not in raw_datasets:
364 | raise ValueError("--do_predict requires a test dataset")
365 | predict_dataset = raw_datasets["test"]
366 | if data_args.max_predict_samples is not None:
367 | predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
368 |
369 | # Log a few random samples from the training set:
370 | if training_args.do_train:
371 | for index in random.sample(range(len(train_dataset)), 3):
372 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
373 |
374 |
375 | def compute_metrics(p: EvalPrediction):
376 | predictions, labels = p
377 | predictions = np.argmax(predictions, axis=2)
378 |
379 | true_predictions = [
380 |
[label_list[p] for (p, l) in zip(prediction, label) if l != -100] 381 | for prediction, label in zip(predictions, labels) 382 | ] 383 | true_labels = [ 384 | [label_list[l] for (p, l) in zip(prediction, label) if l != -100] 385 | for prediction, label in zip(predictions, labels) 386 | ] 387 | 388 | report = classification_report( 389 | y_true=true_labels, 390 | y_pred=true_predictions, 391 | output_dict=True 392 | ) 393 | 394 | scores = { 395 | type_name: { 396 | "precision": score["precision"], 397 | "recall": score["recall"], 398 | "f1": score["f1-score"], 399 | "number": score["support"], 400 | } 401 | for type_name, score in report.items() 402 | } 403 | scores["overall_accuracy"] = accuracy_score(y_true=true_labels, y_pred=true_predictions) 404 | 405 | final_results = {} 406 | for key, value in scores.items(): 407 | if isinstance(value, dict): 408 | for n, v in value.items(): 409 | key = key.replace(" ", "_") 410 | n = n.replace(" ", "_") 411 | final_results[f"{key}_{n}"] = v 412 | else: 413 | final_results[key] = value 414 | return final_results 415 | 416 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 417 | 418 | # Initialize our Trainer 419 | trainer = Trainer( 420 | model=model, 421 | args=training_args, 422 | train_dataset=train_dataset if training_args.do_train else None, 423 | eval_dataset=eval_dataset if training_args.do_eval else None, 424 | compute_metrics=compute_metrics, 425 | tokenizer=tokenizer, 426 | data_collator=data_collator, 427 | ) 428 | 429 | # Training 430 | if training_args.do_train: 431 | checkpoint = None 432 | if training_args.resume_from_checkpoint is not None: 433 | checkpoint = training_args.resume_from_checkpoint 434 | elif last_checkpoint is not None: 435 | checkpoint = last_checkpoint 436 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 437 | metrics = train_result.metrics 438 | max_train_samples = ( 439 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 440 | ) 441 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 442 | 443 | trainer.save_model() 444 | 445 | trainer.log_metrics("train", metrics) 446 | trainer.save_metrics("train", metrics) 447 | trainer.save_state() 448 | 449 | # Evaluation 450 | if training_args.do_eval: 451 | logger.info("*** Evaluate ***") 452 | 453 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 454 | 455 | max_eval_samples = ( 456 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 457 | ) 458 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 459 | 460 | trainer.log_metrics("eval", metrics) 461 | trainer.save_metrics("eval", metrics) 462 | 463 | if training_args.do_predict: 464 | logger.info("*** Predict ***") 465 | 466 | predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") 467 | predictions = np.argmax(predictions, axis=2) 468 | 469 | # Remove ignored index (special tokens) 470 | true_predictions = [ 471 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 472 | for prediction, label in zip(predictions, labels) 473 | ] 474 | 475 | trainer.log_metrics("predict", metrics) 476 | trainer.save_metrics("predict", metrics) 477 | 478 | # Save predictions 479 | output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt") 480 | if trainer.is_world_process_zero(): 481 | with open(output_predictions_file, "w") as writer: 482 | for prediction in 
true_predictions: 483 | writer.write(" ".join(prediction) + "\n") 484 | 485 | 486 | 487 | def _mp_fn(index): 488 | # For xla_spawn (TPUs) 489 | main() 490 | 491 | 492 | if __name__ == "__main__": 493 | main() 494 | -------------------------------------------------------------------------------- /question_answering/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datasets 4 | from typing import Optional 5 | from datasets.io.abc import AbstractDatasetReader 6 | from datasets.utils.typing import NestedDataStructureLike, PathLike 7 | from datasets import Features, NamedSplit 8 | from datasets.tasks import QuestionAnsweringExtractive 9 | import collections 10 | import logging 11 | from typing import Optional, Tuple 12 | import numpy as np 13 | from tqdm.auto import tqdm 14 | from transformers import Trainer, is_torch_tpu_available 15 | from transformers.trainer_utils import PredictionOutput 16 | 17 | if is_torch_tpu_available(): 18 | import torch_xla.core.xla_model as xm 19 | import torch_xla.debug.metrics as met 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | 26 | class QADatasetBuilder(datasets.GeneratorBasedBuilder): 27 | 28 | def _info(self): 29 | return datasets.DatasetInfo( 30 | features=datasets.Features( 31 | { 32 | "id": datasets.Value("string"), 33 | "title": datasets.Value("string"), 34 | "context": datasets.Value("string"), 35 | "question": datasets.Value("string"), 36 | "answers": datasets.features.Sequence( 37 | { 38 | "text": datasets.Value("string"), 39 | "answer_start": datasets.Value("int32"), 40 | } 41 | ), 42 | } 43 | ), 44 | supervised_keys=None, 45 | task_templates=[ 46 | QuestionAnsweringExtractive( 47 | question_column="question", context_column="context", answers_column="answers" 48 | ) 49 | ], 50 | ) 51 | 52 | def _split_generators(self, dl_manager): 53 | if not self.config.data_files: 54 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 55 | data_files = dl_manager.download_and_extract(self.config.data_files) 56 | if isinstance(data_files, (str, list, tuple)): 57 | files = data_files 58 | if isinstance(files, str): 59 | files = [files] 60 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] 61 | splits = [] 62 | for split_name, files in data_files.items(): 63 | if isinstance(files, str): 64 | files = [files] 65 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 66 | return splits 67 | 68 | def _generate_examples(self, files): 69 | for filepath in files: 70 | with open(filepath, encoding="utf-8") as f: 71 | squad = json.load(f) 72 | for example in squad["data"]: 73 | title = example.get("title", "") 74 | for paragraph in example["paragraphs"]: 75 | context = paragraph["context"] 76 | for qa in paragraph["qas"]: 77 | question = qa["question"] 78 | id_ = qa["id"] 79 | 80 | answer_starts = [answer["answer_start"] for answer in qa["answers"]] 81 | answers = [answer["text"] for answer in qa["answers"]] 82 | 83 | yield id_, { 84 | "title": title, 85 | "context": context, 86 | "question": question, 87 | "id": id_, 88 | "answers": { 89 | "answer_start": answer_starts, 90 | "text": answers, 91 | }, 92 | } 93 | 94 | class QADatasetReader(AbstractDatasetReader): 95 | 96 | def __init__( 97 | self, 98 | path_or_paths: NestedDataStructureLike[PathLike], 99 | split: Optional[NamedSplit] = None, 100 | features: Optional[Features] = None, 101 | cache_dir: 
str = None,
102 | keep_in_memory: bool = False,
103 | **kwargs,
104 | ):
105 | super().__init__(
106 | path_or_paths, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
107 | )
108 | path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}
109 | self.builder = QADatasetBuilder(
110 | cache_dir=cache_dir,
111 | data_files=path_or_paths,
112 | **kwargs,
113 | )
114 |
115 | def read(self):
116 | download_config = None
117 | download_mode = None
118 | ignore_verifications = True
119 | try_from_hf_gcs = False
120 | use_auth_token = None
121 | base_path = None
122 |
123 | self.builder.download_and_prepare(
124 | download_config=download_config,
125 | download_mode=download_mode,
126 | ignore_verifications=ignore_verifications,
127 | try_from_hf_gcs=try_from_hf_gcs,
128 | base_path=base_path,
129 | use_auth_token=use_auth_token,
130 | )
131 |
132 | dataset = self.builder.as_dataset(
133 | split=self.split, ignore_verifications=ignore_verifications, in_memory=self.keep_in_memory
134 | )
135 | return dataset
136 |
137 | def find_all_indices(pattern_str, source_str, overlapping=True):
138 | index = source_str.find(pattern_str)
139 | while index != -1:
140 | yield index
141 | index = source_str.find(
142 | pattern_str,
143 | index + (1 if overlapping else len(pattern_str))
144 | )
145 |
146 | def postprocess_qa_predictions(
147 | examples,
148 | features,
149 | predictions: Tuple[np.ndarray, np.ndarray],
150 | allow_null_ans: bool = False,
151 | n_best_size: int = 20,
152 | max_answer_length: int = 30,
153 | null_score_diff_threshold: float = 0.0,
154 | output_dir: Optional[str] = None,
155 | prefix: Optional[str] = None,
156 | log_level: Optional[int] = logging.WARNING,
157 | ):
158 | """
159 | Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
160 | original contexts. This is the base postprocessing function for models that only return start and end logits.
161 |
162 | Args:
163 | examples: The non-preprocessed dataset (see the main script for more information).
164 | features: The processed dataset (see the main script for more information).
165 | predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
166 | The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
167 | first dimension must match the number of elements of :obj:`features`.
168 | allow_null_ans (:obj:`bool`, `optional`, defaults to :obj:`False`):
169 | Whether or not the underlying dataset contains examples with no answers.
170 | n_best_size (:obj:`int`, `optional`, defaults to 20):
171 | The total number of n-best predictions to generate when looking for an answer.
172 | max_answer_length (:obj:`int`, `optional`, defaults to 30):
173 | The maximum length of an answer that can be generated. This is needed because the start and end predictions
174 | are not conditioned on one another.
175 | null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
176 | The threshold used to select the null answer: if the best answer has a score that is less than the score of
177 | the null answer minus this threshold, the null answer is selected for this example (note that the score of
178 | the null answer for an example giving several features is the minimum of the scores for the null answer on
179 | each feature: all features must be aligned on the fact they `want` to predict a null answer).
180 |
181 | Only useful when :obj:`allow_null_ans` is :obj:`True`.
182 | output_dir (:obj:`str`, `optional`):
183 | If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
184 | :obj:`allow_null_ans=True`, the dictionary of the scores differences between best and null
185 | answers, are saved in `output_dir`.
186 | prefix (:obj:`str`, `optional`):
187 | If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
188 | log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
189 | ``logging`` log level (e.g., ``logging.WARNING``)
190 | """
191 | assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
192 | all_start_logits, all_end_logits = predictions
193 |
194 | assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features."
195 |
196 | # Build a map from each example to its corresponding features.
197 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
198 | features_per_example = collections.defaultdict(list)
199 | for i, feature in enumerate(features):
200 | features_per_example[example_id_to_index[feature["example_id"]]].append(i)
201 |
202 | # The dictionaries we have to fill.
203 | all_predictions = collections.OrderedDict()
204 | all_nbest_json = collections.OrderedDict()
205 | if allow_null_ans:
206 | scores_diff_json = collections.OrderedDict()
207 |
208 | # Logging.
209 | logger.setLevel(log_level)
210 | logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
211 |
212 | # Let's loop over all the examples!
213 | for example_index, example in enumerate(tqdm(examples)):
214 | # Those are the indices of the features associated to the current example.
215 | feature_indices = features_per_example[example_index]
216 |
217 | min_null_prediction = None
218 | prelim_predictions = []
219 |
220 | # Looping through all the features associated to the current example.
221 | for feature_index in feature_indices:
222 | # We grab the predictions of the model for this feature.
223 | start_logits = all_start_logits[feature_index]
224 | end_logits = all_end_logits[feature_index]
225 | # This is what will allow us to map some of the positions in our logits to spans of text in the original
226 | # context.
227 | offset_mapping = features[feature_index]["offset_mapping"]
228 | # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
229 | # available in the current feature.
230 | token_is_max_context = features[feature_index].get("token_is_max_context", None)
231 |
232 | # Update minimum null prediction.
233 | feature_null_score = start_logits[0] + end_logits[0]
234 | if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
235 | min_null_prediction = {
236 | "offsets": (0, 0),
237 | "score": feature_null_score,
238 | "start_logit": start_logits[0],
239 | "end_logit": end_logits[0],
240 | }
241 |
242 | # Go through all possibilities for the `n_best_size` greatest start and end logits.
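# Note: np.argsort sorts ascending, so the reversed slice [-1 : -n_best_size - 1 : -1] below
# yields the indices of the `n_best_size` largest logits, best first.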
243 | start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
244 | end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
245 | for start_index in start_indexes:
246 | for end_index in end_indexes:
247 | # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
248 | # to part of the input_ids that are not in the context.
249 | if (
250 | start_index >= len(offset_mapping)
251 | or end_index >= len(offset_mapping)
252 | or offset_mapping[start_index] is None
253 | or offset_mapping[end_index] is None
254 | ):
255 | continue
256 | # Don't consider answers with a length that is either < 0 or > max_answer_length.
257 | if end_index < start_index or end_index - start_index + 1 > max_answer_length:
258 | continue
259 | # Don't consider answers that don't have the maximum context available (if such information is
260 | # provided).
261 | if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
262 | continue
263 | prelim_predictions.append(
264 | {
265 | "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
266 | "score": start_logits[start_index] + end_logits[end_index],
267 | "start_logit": start_logits[start_index],
268 | "end_logit": end_logits[end_index],
269 | }
270 | )
271 | if allow_null_ans:
272 | # Add the minimum null prediction
273 | prelim_predictions.append(min_null_prediction)
274 | null_score = min_null_prediction["score"]
275 |
276 | # Only keep the best `n_best_size` predictions.
277 | predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
278 |
279 | # Add back the minimum null prediction if it was removed because of its low score.
280 | if allow_null_ans and not any(p["offsets"] == (0, 0) for p in predictions):
281 | predictions.append(min_null_prediction)
282 |
283 | # Use the offsets to gather the answer text in the original context.
284 | context = example["context"]
285 | for pred in predictions:
286 | offsets = pred.pop("offsets")
287 | pred["text"] = context[offsets[0] : offsets[1]]
288 |
289 | # In the very rare edge case where we do not have a single non-null prediction, we create a fake prediction to avoid
290 | # failure.
291 | if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
292 | predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
293 |
294 | # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
295 | # the LogSumExp trick).
296 | scores = np.array([pred.pop("score") for pred in predictions])
297 | exp_scores = np.exp(scores - np.max(scores))
298 | probs = exp_scores / exp_scores.sum()
299 |
300 | # Include the probabilities in our predictions.
301 | for prob, pred in zip(probs, predictions):
302 | pred["probability"] = prob
303 |
304 | # Pick the best prediction. If the null answer is not possible, this is easy.
305 | if not allow_null_ans:
306 | all_predictions[example["id"]] = predictions[0]["text"]
307 | else:
308 | # Otherwise we first need to find the best non-empty prediction.
309 | i = 0
310 | while predictions[i]["text"] == "":
311 | i += 1
312 | best_non_null_pred = predictions[i]
313 |
314 | # Then we compare to the null prediction using the threshold.
315 | score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
316 | scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable.
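# A score_diff above the threshold means the null answer outscored the best span by a wide
# enough margin, so we output an empty string for this example.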
317 | if score_diff > null_score_diff_threshold: 318 | all_predictions[example["id"]] = "" 319 | else: 320 | all_predictions[example["id"]] = best_non_null_pred["text"] 321 | 322 | # Make `predictions` JSON-serializable by casting np.float back to float. 323 | all_nbest_json[example["id"]] = [ 324 | {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} 325 | for pred in predictions 326 | ] 327 | 328 | # If we have an output_dir, let's save all those dicts. 329 | if output_dir is not None: 330 | assert os.path.isdir(output_dir), f"{output_dir} is not a directory." 331 | 332 | prediction_file = os.path.join( 333 | output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" 334 | ) 335 | nbest_file = os.path.join( 336 | output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" 337 | ) 338 | if allow_null_ans: 339 | null_odds_file = os.path.join( 340 | output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" 341 | ) 342 | 343 | logger.info(f"Saving predictions to {prediction_file}.") 344 | with open(prediction_file, "w") as writer: 345 | writer.write(json.dumps(all_predictions, ensure_ascii=False, indent=4) + "\n") 346 | logger.info(f"Saving nbest_preds to {nbest_file}.") 347 | with open(nbest_file, "w") as writer: 348 | writer.write(json.dumps(all_nbest_json, ensure_ascii=False, indent=4) + "\n") 349 | if allow_null_ans: 350 | logger.info(f"Saving null_odds to {null_odds_file}.") 351 | with open(null_odds_file, "w") as writer: 352 | writer.write(json.dumps(scores_diff_json, ensure_ascii=False, indent=4) + "\n") 353 | 354 | return all_predictions 355 | 356 | 357 | class QuestionAnsweringTrainer(Trainer): 358 | def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): 359 | super().__init__(*args, **kwargs) 360 | self.eval_examples = eval_examples 361 | self.post_process_function = post_process_function 362 | 363 | def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): 364 | eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset 365 | eval_dataloader = self.get_eval_dataloader(eval_dataset) 366 | eval_examples = self.eval_examples if eval_examples is None else eval_examples 367 | 368 | # Temporarily disable metric computation, we will do it in the loop here. 
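# (The raw start/end logits must first be post-processed into answer strings, so metrics are
# computed after the evaluation loop returns rather than inside it.)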
369 | compute_metrics = self.compute_metrics 370 | self.compute_metrics = None 371 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 372 | try: 373 | output = eval_loop( 374 | eval_dataloader, 375 | description="Evaluation", 376 | # No point gathering the predictions if there are no metrics, otherwise we defer to 377 | # self.args.prediction_loss_only 378 | prediction_loss_only=True if compute_metrics is None else None, 379 | ignore_keys=ignore_keys, 380 | ) 381 | finally: 382 | self.compute_metrics = compute_metrics 383 | 384 | if self.post_process_function is not None and self.compute_metrics is not None: 385 | eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) 386 | metrics = self.compute_metrics(eval_preds) 387 | 388 | # Prefix all keys with metric_key_prefix + '_' 389 | for key in list(metrics.keys()): 390 | if not key.startswith(f"{metric_key_prefix}_"): 391 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 392 | 393 | self.log(metrics) 394 | else: 395 | metrics = {} 396 | 397 | if self.args.tpu_metrics_debug or self.args.debug: 398 | # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 399 | xm.master_print(met.metrics_report()) 400 | 401 | self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) 402 | return metrics 403 | 404 | def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): 405 | predict_dataloader = self.get_test_dataloader(predict_dataset) 406 | 407 | # Temporarily disable metric computation, we will do it in the loop here. 408 | compute_metrics = self.compute_metrics 409 | self.compute_metrics = None 410 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 411 | try: 412 | output = eval_loop( 413 | predict_dataloader, 414 | description="Prediction", 415 | # No point gathering the predictions if there are no metrics, otherwise we defer to 416 | # self.args.prediction_loss_only 417 | prediction_loss_only=True if compute_metrics is None else None, 418 | ignore_keys=ignore_keys, 419 | ) 420 | finally: 421 | self.compute_metrics = compute_metrics 422 | 423 | if self.post_process_function is None or self.compute_metrics is None: 424 | return output 425 | 426 | predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") 427 | metrics = self.compute_metrics(predictions) 428 | 429 | # Prefix all keys with metric_key_prefix + '_' 430 | for key in list(metrics.keys()): 431 | if not key.startswith(f"{metric_key_prefix}_"): 432 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 433 | 434 | return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) 435 | -------------------------------------------------------------------------------- /question_answering/question_answering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import glob 4 | import random 5 | import sys 6 | from dataclasses import dataclass, field 7 | from typing import Optional 8 | 9 | import datasets 10 | from datasets import load_dataset, load_metric 11 | 12 | import transformers 13 | from transformers import ( 14 | AutoConfig, 15 | AutoModelForQuestionAnswering, 16 | AutoTokenizer, 17 | DataCollatorWithPadding, 18 | EvalPrediction, 19 | HfArgumentParser, 20 | 
PreTrainedTokenizerFast, 21 | Trainer, 22 | TrainingArguments, 23 | default_data_collator, 24 | set_seed, 25 | ) 26 | from transformers.trainer_utils import get_last_checkpoint 27 | from transformers.utils import check_min_version 28 | from transformers.utils.versions import require_version 29 | from normalizer import normalize 30 | from utils import ( 31 | QADatasetReader, 32 | find_all_indices, 33 | postprocess_qa_predictions, 34 | QuestionAnsweringTrainer 35 | ) 36 | 37 | EXT2CONFIG = { 38 | "json": (QADatasetReader, {}) 39 | } 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | 44 | @dataclass 45 | class DataTrainingArguments: 46 | 47 | dataset_dir: Optional[str] = field( 48 | default=None, metadata={ 49 | "help": "Path to the directory containing the data files (.json). " 50 | "Dataset splits will be identified by their file name prefixes as follows: " 51 | "`train`- Training file(s) e.g. `train.json`/ `train_part1.json` etc. " 52 | "`validation`- Evaluation file(s) e.g. `validation.json`/ `validation_part1.json` etc. " 53 | "`test`- Test file(s) e.g. `test.json`/ `test_part1.json` etc. " 54 | "All files must have the same extension." 55 | } 56 | ) 57 | 58 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a json file)."}) 59 | validation_file: Optional[str] = field( 60 | default=None, 61 | metadata={"help": "An optional input evaluation data file to evaluate the model on (a json file)."}, 62 | ) 63 | test_file: Optional[str] = field( 64 | default=None, 65 | metadata={"help": "An optional input test data file to evaluate the model on (a json file)."}, 66 | ) 67 | overwrite_cache: bool = field( 68 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 69 | ) 70 | preprocessing_num_workers: Optional[int] = field( 71 | default=None, 72 | metadata={"help": "The number of processes to use for the preprocessing."}, 73 | ) 74 | max_seq_length: int = field( 75 | default=384, 76 | metadata={ 77 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 78 | "than this will be truncated, sequences shorter will be padded." 79 | }, 80 | ) 81 | pad_to_max_length: bool = field( 82 | default=True, 83 | metadata={ 84 | "help": "Whether to pad all samples to `max_seq_length`. " 85 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 86 | "be faster on GPU but will be slower on TPU)." 87 | }, 88 | ) 89 | max_train_samples: Optional[int] = field( 90 | default=None, 91 | metadata={ 92 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 93 | "value if set." 94 | }, 95 | ) 96 | max_eval_samples: Optional[int] = field( 97 | default=None, 98 | metadata={ 99 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 100 | "value if set." 101 | }, 102 | ) 103 | max_predict_samples: Optional[int] = field( 104 | default=None, 105 | metadata={ 106 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 107 | "value if set."
108 | }, 109 | ) 110 | allow_null_ans: bool = field( 111 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 112 | ) 113 | null_score_diff_threshold: float = field( 114 | default=0.0, 115 | metadata={ 116 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 117 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 118 | "Only useful when `allow_null_ans=True`." 119 | }, 120 | ) 121 | doc_stride: int = field( 122 | default=128, 123 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 124 | ) 125 | n_best_size: int = field( 126 | default=20, 127 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 128 | ) 129 | max_answer_length: int = field( 130 | default=30, 131 | metadata={ 132 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 133 | "and end predictions are not conditioned on one another." 134 | }, 135 | ) 136 | do_normalize: Optional[bool] = field(default=True, metadata={"help": "Normalize text before feeding to the model."}) 137 | unicode_norm: Optional[str] = field(default="NFKC", metadata={"help": "Type of unicode normalization"}) 138 | 139 | 140 | def __post_init__(self): 141 | if self.train_file is not None and self.validation_file is not None: 142 | train_extension = self.train_file.split(".")[-1] 143 | assert train_extension == "json", "`train_file` should be a json file (the only extension registered in `EXT2CONFIG`)." 144 | validation_extension = self.validation_file.split(".")[-1] 145 | assert ( 146 | validation_extension == train_extension 147 | ), "`validation_file` should have the same extension (json) as `train_file`."
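# Illustrative example of the `dataset_dir` convention documented above (file names are hypothetical):
# a directory containing train.json, train_part1.json, validation.json and test.json is resolved by the
# discovery logic in main() to
#     {"train": ["train.json", "train_part1.json"], "validation": ["validation.json"], "test": ["test.json"]},
# with the most frequent extension in the directory ("json") applied to every split.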
148 | 149 | 150 | @dataclass 151 | class ModelArguments: 152 | 153 | model_name_or_path: str = field( 154 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 155 | ) 156 | cache_dir: Optional[str] = field( 157 | default=None, 158 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 159 | ) 160 | 161 | 162 | def main(): 163 | 164 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 165 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 166 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 167 | else: 168 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 169 | 170 | # Setup logging 171 | logging.basicConfig( 172 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 173 | datefmt="%m/%d/%Y %H:%M:%S", 174 | handlers=[logging.StreamHandler(sys.stdout)], 175 | ) 176 | 177 | log_level = training_args.get_process_log_level() 178 | logger.setLevel(log_level) 179 | datasets.utils.logging.set_verbosity(log_level) 180 | transformers.utils.logging.set_verbosity(log_level) 181 | transformers.utils.logging.enable_default_handler() 182 | transformers.utils.logging.enable_explicit_format() 183 | 184 | logger.warning( 185 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " 186 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 187 | ) 188 | logger.info(f"Training/evaluation parameters {training_args}") 189 | 190 | last_checkpoint = None 191 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 192 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 193 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 194 | raise ValueError( 195 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 196 | "Use --overwrite_output_dir to overcome this." 197 | ) 198 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 199 | logger.info( 200 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 201 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 202 | ) 203 | 204 | # Set seed before initializing model.
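# (set_seed seeds Python's `random`, NumPy and PyTorch in one call; exact run-to-run reproducibility can
# still depend on non-deterministic CUDA kernels and dataloader worker scheduling.)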
205 | set_seed(training_args.seed) 206 | 207 | has_ext = lambda path: len(os.path.basename(path).split(".")) > 1 208 | get_ext = lambda path: os.path.basename(path).split(".")[-1] 209 | 210 | if data_args.dataset_dir is not None: 211 | data_files = {} 212 | all_files = glob.glob( 213 | os.path.join( 214 | data_args.dataset_dir, 215 | "*" 216 | ) 217 | ) 218 | all_exts = [get_ext(k) for k in all_files if has_ext(k)] 219 | if not all_exts: 220 | raise ValueError("The `dataset_dir` doesn't have any valid file.") 221 | 222 | selected_ext = max(set(all_exts), key=all_exts.count) 223 | for search_prefix in ["train", "validation", "test"]: 224 | found_files = glob.glob( 225 | os.path.join( 226 | data_args.dataset_dir, 227 | search_prefix + "*" + selected_ext 228 | ) 229 | ) 230 | if not found_files: 231 | continue 232 | 233 | data_files[search_prefix] = found_files 234 | 235 | else: 236 | data_files = { 237 | "train": data_args.train_file, 238 | "validation": data_args.validation_file, 239 | "test": data_args.test_file 240 | } 241 | 242 | data_files = {k: v for k, v in data_files.items() if v is not None} 243 | 244 | if not data_files: 245 | raise ValueError("No valid input file found.") 246 | 247 | selected_ext = get_ext(list(data_files.values())[0]) 248 | 249 | 250 | dataset_configs = EXT2CONFIG[selected_ext] 251 | raw_datasets = dataset_configs[0]( 252 | data_files, 253 | **dataset_configs[1] 254 | ).read() 255 | 256 | config = AutoConfig.from_pretrained( 257 | model_args.model_name_or_path, 258 | cache_dir=model_args.cache_dir 259 | ) 260 | 261 | tokenizer_kwargs = {"add_prefix_space": True} if config.model_type in {"gpt2", "roberta"} else {} 262 | tokenizer = AutoTokenizer.from_pretrained( 263 | model_args.model_name_or_path, 264 | cache_dir=model_args.cache_dir, 265 | use_fast=True, 266 | **tokenizer_kwargs 267 | ) 268 | model = AutoModelForQuestionAnswering.from_pretrained( 269 | model_args.model_name_or_path, 270 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 271 | config=config, 272 | cache_dir=model_args.cache_dir 273 | ) 274 | 275 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 276 | raise ValueError( 277 | "This script only works for models that have a fast tokenizer. Check out the big table of models "
278 | "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " 279 | "requirement" 280 | ) 281 | 282 | if training_args.do_train: 283 | column_names = raw_datasets["train"].column_names 284 | elif training_args.do_eval: 285 | column_names = raw_datasets["validation"].column_names 286 | else: 287 | column_names = raw_datasets["test"].column_names 288 | 289 | question_column_name = "question" if "question" in column_names else column_names[0] 290 | context_column_name = "context" if "context" in column_names else column_names[1] 291 | answer_column_name = "answers" if "answers" in column_names else column_names[2] 292 | 293 | 294 | if data_args.do_normalize: 295 | normalization_kwargs = { 296 | "unicode_norm": data_args.unicode_norm, 297 | } 298 | required_column_names = [ 299 | question_column_name, 300 | context_column_name, 301 | answer_column_name 302 | ] 303 | 304 | def normalize_example(example): 305 | required_row_values = [example[k] for k in required_column_names if k in example] 306 | question, context = required_row_values[:2] 307 | example[question_column_name] = normalize(question, **normalization_kwargs) 308 | example[context_column_name] = normalize(context, **normalization_kwargs) 309 | 310 | if len(required_row_values) == 3: 311 | answer = required_row_values[2] 312 | for i, ans in enumerate(answer["text"]): 313 | prev_position = answer["answer_start"][i] 314 | answer["text"][i] = normalize(ans, **normalization_kwargs) 315 | 316 | replace_index = -1 317 | for j, pos in enumerate(find_all_indices(ans, context)): 318 | replace_index = j 319 | if pos == prev_position: 320 | break 321 | 322 | if replace_index != -1: 323 | index_iterator = find_all_indices( 324 | answer["text"][i], 325 | example[context_column_name] 326 | ) 327 | for j, pos in enumerate(index_iterator): 328 | if j == replace_index: 329 | answer["answer_start"][i] = pos 330 | assert answer["text"][i] == example[context_column_name][pos: pos + len(answer["text"][i])] 331 | break 332 | 333 | example[answer_column_name] = answer 334 | 335 | return example 336 | 337 | raw_datasets = raw_datasets.map( 338 | normalize_example, 339 | desc="Running normalization on dataset", 340 | load_from_cache_file=not data_args.overwrite_cache 341 | ) 342 | 343 | pad_on_right = tokenizer.padding_side == "right" 344 | 345 | if data_args.max_seq_length > tokenizer.model_max_length: 346 | logger.warning( 347 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 348 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
349 | ) 350 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 351 | 352 | 353 | 354 | def prepare_train_features(examples): 355 | tokenized_examples = tokenizer( 356 | examples[question_column_name if pad_on_right else context_column_name], 357 | examples[context_column_name if pad_on_right else question_column_name], 358 | truncation="only_second" if pad_on_right else "only_first", 359 | max_length=max_seq_length, 360 | stride=data_args.doc_stride, 361 | return_overflowing_tokens=True, 362 | return_offsets_mapping=True, 363 | padding="max_length" if data_args.pad_to_max_length else False, 364 | ) 365 | 366 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 367 | offset_mapping = tokenized_examples.pop("offset_mapping") 368 | 369 | tokenized_examples["start_positions"] = [] 370 | tokenized_examples["end_positions"] = [] 371 | 372 | for i, offsets in enumerate(offset_mapping): 373 | input_ids = tokenized_examples["input_ids"][i] 374 | cls_index = input_ids.index(tokenizer.cls_token_id) 375 | sequence_ids = tokenized_examples.sequence_ids(i) 376 | 377 | sample_index = sample_mapping[i] 378 | answers = examples[answer_column_name][sample_index] 379 | if len(answers["answer_start"]) == 0: 380 | tokenized_examples["start_positions"].append(cls_index) 381 | tokenized_examples["end_positions"].append(cls_index) 382 | else: 383 | start_char = answers["answer_start"][0] 384 | end_char = start_char + len(answers["text"][0]) 385 | 386 | token_start_index = 0 387 | while sequence_ids[token_start_index] != (1 if pad_on_right else 0): 388 | token_start_index += 1 389 | 390 | token_end_index = len(input_ids) - 1 391 | while sequence_ids[token_end_index] != (1 if pad_on_right else 0): 392 | token_end_index -= 1 393 | 394 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 395 | tokenized_examples["start_positions"].append(cls_index) 396 | tokenized_examples["end_positions"].append(cls_index) 397 | else: 398 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 399 | token_start_index += 1 400 | tokenized_examples["start_positions"].append(token_start_index - 1) 401 | while offsets[token_end_index][1] >= end_char: 402 | token_end_index -= 1 403 | tokenized_examples["end_positions"].append(token_end_index + 1) 404 | 405 | return tokenized_examples 406 | 407 | if training_args.do_train: 408 | if "train" not in raw_datasets: 409 | raise ValueError("--do_train requires a train dataset") 410 | train_dataset = raw_datasets["train"] 411 | if data_args.max_train_samples is not None: 412 | answerable_indices = [i for i, data in enumerate(train_dataset) 413 | if data['answers']['text']] 414 | unanswerable_indices = [i for i, data in enumerate(train_dataset) 415 | if not data['answers']['text']] 416 | 417 | if ( 418 | len(answerable_indices) >= data_args.max_train_samples // 2 and 419 | len(unanswerable_indices) >= data_args.max_train_samples // 2 420 | ): 421 | selected_answerable_indices = answerable_indices[:data_args.max_train_samples // 2] 422 | selected_unanswerable_indices = unanswerable_indices[:data_args.max_train_samples - len(selected_answerable_indices)] 423 | train_dataset = train_dataset.select( 424 | selected_answerable_indices + 425 | selected_unanswerable_indices 426 | ) 427 | else: 428 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 429 | 430 | with training_args.main_process_first(desc="train dataset map pre-processing"): 431 | train_dataset 
= train_dataset.map( 432 | prepare_train_features, 433 | batched=True, 434 | num_proc=data_args.preprocessing_num_workers, 435 | remove_columns=column_names, 436 | load_from_cache_file=not data_args.overwrite_cache, 437 | desc="Running tokenizer on train dataset", 438 | ) 439 | if data_args.max_train_samples is not None: 440 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 441 | 442 | def prepare_validation_features(examples): 443 | 444 | tokenized_examples = tokenizer( 445 | examples[question_column_name if pad_on_right else context_column_name], 446 | examples[context_column_name if pad_on_right else question_column_name], 447 | truncation="only_second" if pad_on_right else "only_first", 448 | max_length=max_seq_length, 449 | stride=data_args.doc_stride, 450 | return_overflowing_tokens=True, 451 | return_offsets_mapping=True, 452 | padding="max_length" if data_args.pad_to_max_length else False, 453 | ) 454 | 455 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 456 | 457 | tokenized_examples["example_id"] = [] 458 | 459 | for i in range(len(tokenized_examples["input_ids"])): 460 | sequence_ids = tokenized_examples.sequence_ids(i) 461 | context_index = 1 if pad_on_right else 0 462 | 463 | sample_index = sample_mapping[i] 464 | tokenized_examples["example_id"].append(examples["id"][sample_index]) 465 | 466 | tokenized_examples["offset_mapping"][i] = [ 467 | (o if sequence_ids[k] == context_index else None) 468 | for k, o in enumerate(tokenized_examples["offset_mapping"][i]) 469 | ] 470 | 471 | return tokenized_examples 472 | 473 | if training_args.do_eval: 474 | if "validation" not in raw_datasets: 475 | raise ValueError("--do_eval requires a validation dataset") 476 | eval_examples = raw_datasets["validation"] 477 | if data_args.max_eval_samples is not None: 478 | eval_examples = eval_examples.select(range(data_args.max_eval_samples)) 479 | 480 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 481 | eval_dataset = eval_examples.map( 482 | prepare_validation_features, 483 | batched=True, 484 | num_proc=data_args.preprocessing_num_workers, 485 | remove_columns=column_names, 486 | load_from_cache_file=not data_args.overwrite_cache, 487 | desc="Running tokenizer on validation dataset", 488 | ) 489 | if data_args.max_eval_samples is not None: 490 | # During Feature creation dataset samples might increase, we will select required samples again 491 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 492 | 493 | if training_args.do_predict: 494 | if "test" not in raw_datasets: 495 | raise ValueError("--do_predict requires a test dataset") 496 | predict_examples = raw_datasets["test"] 497 | if data_args.max_predict_samples is not None: 498 | predict_examples = predict_examples.select(range(data_args.max_predict_samples)) 499 | 500 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 501 | predict_dataset = predict_examples.map( 502 | prepare_validation_features, 503 | batched=True, 504 | num_proc=data_args.preprocessing_num_workers, 505 | remove_columns=column_names, 506 | load_from_cache_file=not data_args.overwrite_cache, 507 | desc="Running tokenizer on prediction dataset", 508 | ) 509 | if data_args.max_predict_samples is not None: 510 | predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) 511 | 512 | data_collator = ( 513 | default_data_collator 514 | if data_args.pad_to_max_length 515 | else DataCollatorWithPadding(tokenizer, 
pad_to_multiple_of=8 if training_args.fp16 else None) 516 | ) 517 | 518 | # Log a few random samples from the training set: 519 | if training_args.do_train: 520 | for index in random.sample(range(len(train_dataset)), 3): 521 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 522 | 523 | def post_processing_function(examples, features, predictions, stage="eval"): 524 | predictions = postprocess_qa_predictions( 525 | examples=examples, 526 | features=features, 527 | predictions=predictions, 528 | allow_null_ans=data_args.allow_null_ans, 529 | n_best_size=data_args.n_best_size, 530 | max_answer_length=data_args.max_answer_length, 531 | null_score_diff_threshold=data_args.null_score_diff_threshold, 532 | output_dir=training_args.output_dir, 533 | log_level=log_level, 534 | prefix=stage, 535 | ) 536 | 537 | if data_args.allow_null_ans: 538 | formatted_predictions = [ 539 | {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() 540 | ] 541 | else: 542 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 543 | 544 | references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] 545 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 546 | 547 | metric = load_metric("squad_v2" if data_args.allow_null_ans else "squad") 548 | 549 | def compute_metrics(p: EvalPrediction): 550 | return metric.compute(predictions=p.predictions, references=p.label_ids) 551 | 552 | # Initialize our Trainer 553 | trainer = QuestionAnsweringTrainer( 554 | model=model, 555 | args=training_args, 556 | train_dataset=train_dataset if training_args.do_train else None, 557 | eval_dataset=eval_dataset if training_args.do_eval else None, 558 | eval_examples=eval_examples if training_args.do_eval else None, 559 | tokenizer=tokenizer, 560 | data_collator=data_collator, 561 | post_process_function=post_processing_function, 562 | compute_metrics=compute_metrics, 563 | ) 564 | 565 | # Training 566 | if training_args.do_train: 567 | checkpoint = None 568 | if training_args.resume_from_checkpoint is not None: 569 | checkpoint = training_args.resume_from_checkpoint 570 | elif last_checkpoint is not None: 571 | checkpoint = last_checkpoint 572 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 573 | trainer.save_model() # Saves the tokenizer too for easy upload 574 | 575 | metrics = train_result.metrics 576 | max_train_samples = ( 577 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 578 | ) 579 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 580 | 581 | trainer.log_metrics("train", metrics) 582 | trainer.save_metrics("train", metrics) 583 | trainer.save_state() 584 | 585 | # Evaluation 586 | if training_args.do_eval: 587 | logger.info("*** Evaluate ***") 588 | metrics = trainer.evaluate() 589 | 590 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 591 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 592 | 593 | trainer.log_metrics("eval", metrics) 594 | trainer.save_metrics("eval", metrics) 595 | 596 | # Prediction 597 | if training_args.do_predict: 598 | logger.info("*** Predict ***") 599 | results = trainer.predict(predict_dataset, predict_examples) 600 | metrics = results.metrics 601 | 602 | max_predict_samples = ( 603 | data_args.max_predict_samples if data_args.max_predict_samples is not None else 
len(predict_dataset) 604 | ) 605 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 606 | 607 | trainer.log_metrics("predict", metrics) 608 | trainer.save_metrics("predict", metrics) 609 | 610 | 611 | def _mp_fn(index): 612 | # For xla_spawn (TPUs) 613 | main() 614 | 615 | 616 | if __name__ == "__main__": 617 | main() 618 | --------------------------------------------------------------------------------
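A minimal invocation sketch for `question_answering.py`. The model identifier, paths, and flag values below are illustrative assumptions, not the repository's `trainer.sh` or `evaluate.sh`; each flag maps to a field of `ModelArguments`, `DataTrainingArguments`, or `transformers.TrainingArguments` defined above.

#!/bin/bash
# Hypothetical example run (paths and values are placeholders).
python ./question_answering/question_answering.py \
    --model_name_or_path csebuetnlp/banglabert \
    --dataset_dir ./data/ \
    --output_dir ./outputs/ \
    --max_seq_length 384 \
    --doc_stride 128 \
    --allow_null_ans \
    --do_train --do_eval --do_predict \
    --overwrite_output_dir

Alternatively, passing a single `.json` file as the only command-line argument makes `HfArgumentParser` read every argument from that file (the `len(sys.argv) == 2` branch in `main()`).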