├── figs └── scores.png ├── requirements.txt ├── setup.sh ├── sequence_classification ├── sample_inputs │ ├── single_sequence │ │ ├── tsv │ │ │ ├── sample_test_without_labels.tsv │ │ │ ├── test.tsv │ │ │ ├── validation.tsv │ │ │ └── train.tsv │ │ ├── csv │ │ │ ├── sample_test_without_labels.csv │ │ │ ├── test.csv │ │ │ ├── validation.csv │ │ │ └── train.csv │ │ └── jsonl │ │ │ ├── sample_test_without_labels.jsonl │ │ │ ├── test.jsonl │ │ │ ├── validation.jsonl │ │ │ └── train.jsonl │ └── double_sequence │ │ ├── tsv │ │ ├── validation.tsv │ │ ├── train.tsv │ │ ├── sample_test_without_label.tsv │ │ └── test.tsv │ │ ├── csv │ │ ├── validation.csv │ │ ├── train.csv │ │ ├── sample_test_without_label.csv │ │ └── test.csv │ │ └── jsonl │ │ ├── validation.jsonl │ │ ├── train.jsonl │ │ ├── sample_test_without_label.jsonl │ │ └── test.jsonl ├── evaluate.sh ├── trainer.sh ├── README.md └── sequence_classification.py ├── token_classification ├── sample_inputs │ ├── sample_test_without_tags.jsonl │ ├── test.jsonl │ ├── validation.jsonl │ └── train.jsonl ├── evaluate.sh ├── README.md ├── trainer.sh └── token_classification.py ├── question_answering ├── evaluate.sh ├── README.md ├── trainer.sh ├── utils.py └── question_answering.py ├── .gitignore └── README.md /figs/scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/banglabert/HEAD/figs/scores.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | datasets==1.11.0 4 | seqeval==1.2.2 5 | git+https://github.com/csebuetnlp/normalizer -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git clone https://github.com/huggingface/transformers.git 4 | cd transformers/ 5 | git checkout 7a26307e3186926373cf9129248c209ab869148b 6 | pip install --upgrade ./ 7 | cd ../ 8 | 9 | pip install --upgrade -r requirements.txt -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/sample_test_without_labels.tsv: -------------------------------------------------------------------------------- 1 | sentence1 2 | সবার জন্য উন্মুক্ত। 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার 4 | ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/test.tsv: -------------------------------------------------------------------------------- 1 | sentence1 label 2 | সবার জন্য উন্মুক্ত। pos 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার neg 4 | ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ pos 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/sample_test_without_labels.csv: -------------------------------------------------------------------------------- 1 | sentence1 2 | সবার জন্য উন্মুক্ত। 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার 4 | "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ" 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা 6 | 
-------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/test.csv: -------------------------------------------------------------------------------- 1 | sentence1,label 2 | সবার জন্য উন্মুক্ত।,pos 3 | কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার,neg 4 | "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ",pos 5 | আমরা জানি আপনারা কি চান। বাকরুদ্ধতা,neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/sample_test_without_labels.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "সবার জন্য উন্মুক্ত।"} 2 | {"sentence1": "কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার"} 3 | {"sentence1": "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ"} 4 | {"sentence1": "আমরা জানি আপনারা কি চান। বাকরুদ্ধতা"} 5 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/test.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "সবার জন্য উন্মুক্ত।", "label": "pos"} 2 | {"sentence1": "কাফের এর সাথে তর্ক করে ঈমান নষ্ট করার কি দরকার", "label": "neg"} 3 | {"sentence1": "ভাই ভালোবাসা। যখন দেখি কেউ নাই, তখন দেখলাম আপনার লেখা আছে। ধণ্যবাদ", "label": "pos"} 4 | {"sentence1": "আমরা জানি আপনারা কি চান। বাকরুদ্ধতা", "label": "neg"} 5 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/validation.tsv: -------------------------------------------------------------------------------- 1 | sentence1 label 2 | লোক দেখানো সম্মান দেখিয়ে লাভ নাই।মিডিয়ার সামনে অনেকেই সমাজ সেবক,কিন্তু ভিতরে সদরঘাট। neg 3 | খেলার মাধ্যমে ধর্মিয় উৎসব কে অবমাননা।।।।। neg 4 | মালুদের কারবার দেখলে হাসি পায় neg 5 | অন্যায্য একটি সিদ্ধান্ত সমূলে কাটা পড়ায় ভালো ই হয়েছে। বেসরকারি বিশ্ববিদ্যালয়ে ভর্তি ফি নির্দিষ্ট করে দিলে এবং তা নিয়মিত মনিটরিং করলে আরও ভালো হবে! 
pos 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/validation.csv: -------------------------------------------------------------------------------- 1 | sentence1,label 2 | "লোক দেখানো সম্মান দেখিয়ে লাভ নাই।মিডিয়ার সামনে অনেকেই সমাজ সেবক,কিন্তু ভিতরে সদরঘাট।",neg 3 | খেলার মাধ্যমে ধর্মিয় উৎসব কে অবমাননা।।।।।,neg 4 | মালুদের কারবার দেখলে হাসি পায়,neg 5 | অন্যায্য একটি সিদ্ধান্ত সমূলে কাটা পড়ায় ভালো ই হয়েছে। বেসরকারি বিশ্ববিদ্যালয়ে ভর্তি ফি নির্দিষ্ট করে দিলে এবং তা নিয়মিত মনিটরিং করলে আরও ভালো হবে!,pos 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/tsv/train.tsv: -------------------------------------------------------------------------------- 1 | sentence1 label 2 | যেই মাদারির পোলারা এই কাজটি করেছে, সেই সালারা অবৈধ জারপ সন্তান ছারা আর কিছুই না। neg 3 | ভারতের কুখ্যাত ষড়যন্ত্রের মুখোশ উন্মোচন হলো neg 4 | আমার প্রছন্দের একাদশ ১/তামিম ২/সৌম্য ৩/রিয়াদ ৪/সাকিব ৫/মুশফিক ৬/মোসাদ্দেক ৭/সাব্বির ৮/নাসির ৯/মাশরাফি ১০/তাসকিন ১১/রুবেল কেমন হলো আমার একাদশ পছন্দ হলে লাইক দিবেন এবং ভুল হলে তা কমেন্ট করে জানাবেন। pos 5 | মুসা কপা‌লে কি অা‌ছে জা‌নিনা neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/csv/train.csv: -------------------------------------------------------------------------------- 1 | sentence1,label 2 | "যেই মাদারির পোলারা এই কাজটি করেছে, সেই সালারা অবৈধ জারপ সন্তান ছারা আর কিছুই না।",neg 3 | ভারতের কুখ্যাত ষড়যন্ত্রের মুখোশ উন্মোচন হলো,neg 4 | আমার প্রছন্দের একাদশ ১/তামিম ২/সৌম্য ৩/রিয়াদ ৪/সাকিব ৫/মুশফিক ৬/মোসাদ্দেক ৭/সাব্বির ৮/নাসির ৯/মাশরাফি ১০/তাসকিন ১১/রুবেল কেমন হলো আমার একাদশ পছন্দ হলে লাইক দিবেন এবং ভুল হলে তা কমেন্ট করে জানাবেন।,pos 5 | মুসা কপা‌লে কি অা‌ছে জা‌নিনা,neg 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/validation.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "লোক দেখানো সম্মান দেখিয়ে লাভ নাই।মিডিয়ার সামনে অনেকেই সমাজ সেবক,কিন্তু ভিতরে সদরঘাট।", "label": "neg"} 2 | {"sentence1": "খেলার মাধ্যমে ধর্মিয় উৎসব কে অবমাননা।।।।।", "label": "neg"} 3 | {"sentence1": "মালুদের কারবার দেখলে হাসি পায়", "label": "neg"} 4 | {"sentence1": "অন্যায্য একটি সিদ্ধান্ত সমূলে কাটা পড়ায় ভালো ই হয়েছে। বেসরকারি বিশ্ববিদ্যালয়ে ভর্তি ফি নির্দিষ্ট করে দিলে এবং তা নিয়মিত মনিটরিং করলে আরও ভালো হবে!", "label": "pos"} 5 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/single_sequence/jsonl/train.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "যেই মাদারির পোলারা এই কাজটি করেছে, সেই সালারা অবৈধ জারপ সন্তান ছারা আর কিছুই না।", "label": "neg"} 2 | {"sentence1": "ভারতের কুখ্যাত ষড়যন্ত্রের মুখোশ উন্মোচন হলো", "label": "neg"} 3 | {"sentence1": "আমার প্রছন্দের একাদশ ১/তামিম ২/সৌম্য ৩/রিয়াদ ৪/সাকিব ৫/মুশফিক ৬/মোসাদ্দেক ৭/সাব্বির ৮/নাসির ৯/মাশরাফি ১০/তাসকিন ১১/রুবেল কেমন হলো আমার একাদশ পছন্দ হলে লাইক দিবেন এবং ভুল হলে তা কমেন্ট করে জানাবেন।", "label": "pos"} 4 | {"sentence1": "মুসা কপা‌লে কি অা‌ছে জা‌নিনা", "label": "neg"} 5 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/sample_test_without_tags.jsonl: 
-------------------------------------------------------------------------------- 1 | {"tokens": ["৫%", "তার", "চাইতে", "পশ্চিমোরে", "এর", "সাক্ষরতার", "হার", "কম"]} 2 | {"tokens": ["গত", "২০১৫", "সালের", "৫", "আগস্ট", "সকাল", "সাড়ে", "৮টার", "দিকে", "ভাড়া", "বাসায়", "আনিছুর", "রহমান", "ধারালো", "বটি", "দিয়ে", "কুপিয়ে", "স্ত্রী", "মৌসুমিকে", "হত্যা", "করে"]} 3 | {"tokens": ["জেলা", "ক্রীড়া", "সংস্থার", "সাধারণ", "সম্পাদক", "ওবায়দুর", "রহমান", "খান"]} 4 | {"tokens": ["আমার", "ছবি", "তুলে", "আর", "কী", "হবে"]} 5 | {"tokens": ["ডেভিড", "ওলিয়ারি", "জন্ম", "মে", "২", "২০০৮", "লন্ডনে", "একজন", "আয়ারল্যান্ডীয়", "পেশাদার", "ফুটবল", "খেলোয়াড়", "এবং", "ম্যানেজার"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/validation.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 label 2 | আর সে বললো, মা, আমি বাসায়। স্কুলের বাস তাকে নামিয়ে দেওয়ার সঙ্গে সঙ্গে তিনি তার মাকে ফোন করেছিলেন। neutral 3 | আর সে বললো, মা, আমি বাসায়। সে কোন কথা বলেনি। contradiction 4 | আর সে বললো, মা, আমি বাসায়। সে তার মাকে বলেছিল যে সে বাড়ি ফিরেছে। entailment 5 | আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল। আমি কখনো ওয়াশিংটনে যাইনি তাই যখন আমাকে সেখানে কার্যভার দেওয়া হয়, তখন আমি সেই জায়গা খুঁজে বের করার চেষ্টা করতে গিয়ে হেরে যাই। neutral 6 | আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল। আমি ঠিক জানতাম যে, ওয়াশিংটনের দিকে এগিয়ে যাওয়ার সময় আমাকে কী করতে হবে। contradiction 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/validation.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | "আর সে বললো, মা, আমি বাসায়।",স্কুলের বাস তাকে নামিয়ে দেওয়ার সঙ্গে সঙ্গে তিনি তার মাকে ফোন করেছিলেন।,neutral 3 | "আর সে বললো, মা, আমি বাসায়।",সে কোন কথা বলেনি।,contradiction 4 | "আর সে বললো, মা, আমি বাসায়।",সে তার মাকে বলেছিল যে সে বাড়ি ফিরেছে।,entailment 5 | "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।","আমি কখনো ওয়াশিংটনে যাইনি তাই যখন আমাকে সেখানে কার্যভার দেওয়া হয়, তখন আমি সেই জায়গা খুঁজে বের করার চেষ্টা করতে গিয়ে হেরে যাই।",neutral 6 | "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।","আমি ঠিক জানতাম যে, ওয়াশিংটনের দিকে এগিয়ে যাওয়ার সময় আমাকে কী করতে হবে।",contradiction 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/train.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | ধারণাগতভাবে ক্রিম স্কিমিং এর দুটি মৌলিক মাত্রা রয়েছে - পণ্য এবং ভূগোল।,পণ্য এবং ভূগোল হল ক্রিমের স্কিমিং কাজ।,neutral 3 | আপনি জানেন এই মৌসুমে আর আমার মনে হয় আপনার পর্যায়ে আপনি তাদেরকে পরবর্তী ধাপে হারিয়ে ফেলেছেন যদি তারা সিদ্ধান্ত নেন যে অভিভাবক দলকে মনে করতে যে ব্রেভস ট্রিপল এ এর একজন লোককে স্মরণ করার সিদ্ধান্ত নেন তাহলে একজন ডাবল এ লোক তার জায়গায় যায় আর একজন এ লোক তার জায়গায় যায়,মানুষ যদি স্মরণ করতে পারে তাহলে আপনি নীচের স্তরে হারিয়ে যাবেন।,entailment 4 | আমাদের মধ্যে একজন আপনার নির্দেশগুলো পুঙ্খানুপুঙ্খভাবে পালন করবে।,আমার দলের একজন সদস্য তোমার আদেশ পালন করবে খুবই নির্ভুলভাবে।,entailment 5 | আপনি কিভাবে জানলেন? 
এই সব তাদের তথ্য আবার।,এই তথ্য তাদের।,entailment 6 | হ্যাঁ আমি তোমাকে বলছি যদি তুমি কিছু টেনিস জুতার দাম দাও... ...তাহলে আমি বুঝতে পারছি কেন তারা ১০০ ডলারের রেঞ্জে উঠে যাচ্ছে,টেনিস জুতার দাম অনেক।,neutral 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/train.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 label 2 | ধারণাগতভাবে ক্রিম স্কিমিং এর দুটি মৌলিক মাত্রা রয়েছে - পণ্য এবং ভূগোল। পণ্য এবং ভূগোল হল ক্রিমের স্কিমিং কাজ। neutral 3 | আপনি জানেন এই মৌসুমে আর আমার মনে হয় আপনার পর্যায়ে আপনি তাদেরকে পরবর্তী ধাপে হারিয়ে ফেলেছেন যদি তারা সিদ্ধান্ত নেন যে অভিভাবক দলকে মনে করতে যে ব্রেভস ট্রিপল এ এর একজন লোককে স্মরণ করার সিদ্ধান্ত নেন তাহলে একজন ডাবল এ লোক তার জায়গায় যায় আর একজন এ লোক তার জায়গায় যায় মানুষ যদি স্মরণ করতে পারে তাহলে আপনি নীচের স্তরে হারিয়ে যাবেন। entailment 4 | আমাদের মধ্যে একজন আপনার নির্দেশগুলো পুঙ্খানুপুঙ্খভাবে পালন করবে। আমার দলের একজন সদস্য তোমার আদেশ পালন করবে খুবই নির্ভুলভাবে। entailment 5 | আপনি কিভাবে জানলেন? এই সব তাদের তথ্য আবার। এই তথ্য তাদের। entailment 6 | হ্যাঁ আমি তোমাকে বলছি যদি তুমি কিছু টেনিস জুতার দাম দাও... ...তাহলে আমি বুঝতে পারছি কেন তারা ১০০ ডলারের রেঞ্জে উঠে যাচ্ছে টেনিস জুতার দাম অনেক। neutral 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/validation.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "স্কুলের বাস তাকে নামিয়ে দেওয়ার সঙ্গে সঙ্গে তিনি তার মাকে ফোন করেছিলেন।", "label": "neutral"} 2 | {"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "সে কোন কথা বলেনি।", "label": "contradiction"} 3 | {"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "সে তার মাকে বলেছিল যে সে বাড়ি ফিরেছে।", "label": "entailment"} 4 | {"sentence1": "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।", "sentence2": "আমি কখনো ওয়াশিংটনে যাইনি তাই যখন আমাকে সেখানে কার্যভার দেওয়া হয়, তখন আমি সেই জায়গা খুঁজে বের করার চেষ্টা করতে গিয়ে হেরে যাই।", "label": "neutral"} 5 | {"sentence1": "আমি জানতাম না আমি কি জন্য যাচ্ছি বা কিছু, তাই ওয়াশিংটনের একটি নির্দিষ্ট স্থানে রিপোর্ট করার ছিল।", "sentence2": "আমি ঠিক জানতাম যে, ওয়াশিংটনের দিকে এগিয়ে যাওয়ার সময় আমাকে কী করতে হবে।", "label": "contradiction"} 6 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/test.jsonl: -------------------------------------------------------------------------------- 1 | {"tokens": ["৫%", "তার", "চাইতে", "পশ্চিমোরে", "এর", "সাক্ষরতার", "হার", "কম"], "tags": ["O", "O", "O", "B-LOC", "O", "O", "O", "O"]} 2 | {"tokens": ["গত", "২০১৫", "সালের", "৫", "আগস্ট", "সকাল", "সাড়ে", "৮টার", "দিকে", "ভাড়া", "বাসায়", "আনিছুর", "রহমান", "ধারালো", "বটি", "দিয়ে", "কুপিয়ে", "স্ত্রী", "মৌসুমিকে", "হত্যা", "করে"], "tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "I-PER", "O", "B-OBJ", "O", "O", "B-PER", "B-PER", "O", "O"]} 3 | {"tokens": ["জেলা", "ক্রীড়া", "সংস্থার", "সাধারণ", "সম্পাদক", "ওবায়দুর", "রহমান", "খান"], "tags": ["B-LOC", "B-PER", "I-PER", "I-PER", "I-PER", "B-PER", "I-PER", "I-PER"]} 4 | {"tokens": ["আমার", "ছবি", "তুলে", "আর", "কী", "হবে"], "tags": ["B-PER", "O", "O", "O", "O", "O"]} 5 | {"tokens": ["ডেভিড", "ওলিয়ারি", "জন্ম", "মে", "২", "২০০৮", "লন্ডনে", "একজন", "আয়ারল্যান্ডীয়", "পেশাদার", 
"ফুটবল", "খেলোয়াড়", "এবং", "ম্যানেজার"], "tags": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/sample_test_without_label.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 2 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি তার সাথে আবার কথা বলিনি। 3 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম। 4 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমাদের খুব ভালো কথা হয়েছিল। 5 | এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না। 6 | এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি। 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/sample_test_without_label.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2 2 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমি তার সাথে আবার কথা বলিনি। 3 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।","আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।" 4 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমাদের খুব ভালো কথা হয়েছিল। 5 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।","আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।" 6 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।",আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি। 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/tsv/test.tsv: -------------------------------------------------------------------------------- 1 | sentence1 sentence2 label 2 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি তার সাথে আবার কথা বলিনি। contradiction 3 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম। entailment 4 | আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম। আমাদের খুব ভালো কথা হয়েছিল। neutral 5 | এবং আমি ভেবেছিলাম এটা একটা 
বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না। neutral 6 | এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র। আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি। entailment 7 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/train.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "ধারণাগতভাবে ক্রিম স্কিমিং এর দুটি মৌলিক মাত্রা রয়েছে - পণ্য এবং ভূগোল।", "sentence2": "পণ্য এবং ভূগোল হল ক্রিমের স্কিমিং কাজ।", "label": "neutral"} 2 | {"sentence1": "আপনি জানেন এই মৌসুমে আর আমার মনে হয় আপনার পর্যায়ে আপনি তাদেরকে পরবর্তী ধাপে হারিয়ে ফেলেছেন যদি তারা সিদ্ধান্ত নেন যে অভিভাবক দলকে মনে করতে যে ব্রেভস ট্রিপল এ এর একজন লোককে স্মরণ করার সিদ্ধান্ত নেন তাহলে একজন ডাবল এ লোক তার জায়গায় যায় আর একজন এ লোক তার জায়গায় যায়", "sentence2": "মানুষ যদি স্মরণ করতে পারে তাহলে আপনি নীচের স্তরে হারিয়ে যাবেন।", "label": "entailment"} 3 | {"sentence1": "আমাদের মধ্যে একজন আপনার নির্দেশগুলো পুঙ্খানুপুঙ্খভাবে পালন করবে।", "sentence2": "আমার দলের একজন সদস্য তোমার আদেশ পালন করবে খুবই নির্ভুলভাবে।", "label": "entailment"} 4 | {"sentence1": "আপনি কিভাবে জানলেন? এই সব তাদের তথ্য আবার।", "sentence2": "এই তথ্য তাদের।", "label": "entailment"} 5 | {"sentence1": "হ্যাঁ আমি তোমাকে বলছি যদি তুমি কিছু টেনিস জুতার দাম দাও... ...তাহলে আমি বুঝতে পারছি কেন তারা ১০০ ডলারের রেঞ্জে উঠে যাচ্ছে", "sentence2": "টেনিস জুতার দাম অনেক।", "label": "neutral"} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/csv/test.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমি তার সাথে আবার কথা বলিনি।,contradiction 3 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।","আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।",entailment 4 | "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।",আমাদের খুব ভালো কথা হয়েছিল।,neutral 5 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।","আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।",neutral 6 | "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।",আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি।,entailment 7 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/validation.jsonl: -------------------------------------------------------------------------------- 1 | {"tokens": ["২৮", "সেপ্টেম্বর", "দুবাইতে", "অনুষ্ঠিত", "হবে", "এই", "১৫", "সেপ্টেম্বর", "শ্রীলঙ্কার", "বিপক্ষে", "এবং", "২০", "সেপ্টেম্বর", "আফগানিস্তানের", "বিপক্ষে", "খেলবে"], "tags": ["O", "O", "B-LOC", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "B-LOC", 
"O", "O"]} 2 | {"tokens": ["তিনি", "বলছিলেন", "এখানে", "উন্নতি", "করতে", "হলে", "কোয়ালিটি", "খেলোয়াড়দের", "সংখ্যাটাও", "বাড়াতে", "হবে"], "tags": ["B-PER", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O"]} 3 | {"tokens": ["ঢাকায়", "যেতে", "হলে", "তাদেরকে", "সকাল", "৯", "টার", "জোয়ার", "আসার", "পর", "হাতিয়ার", "উদ্দেশ্য", "যাত্রা", "করতে", "হয়"], "tags": ["B-LOC", "O", "O", "B-PER", "O", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O"]} 4 | {"tokens": ["প্রতিটি", "বুথে", "ভোটসংখ্যা", "অনুযায়ী", "ব্যালট", "পেপার", "ও", "বক্স", "বুঝে", "নিতে", "হবে"], "tags": ["O", "O", "O", "O", "B-OBJ", "I-OBJ", "O", "B-OBJ", "O", "O", "O"]} 5 | {"tokens": ["বাংলার", "ইতিহাস", "বলতে", "অধুনা", "বাংলাদেশ", "ও", "পশ্চিমবঙ্গের", "বিগত", "চার", "সহস্রাব্দের", "ইতিহাসকে", "বোঝায়"], "tags": ["B-LOC", "O", "O", "O", "B-LOC", "O", "B-LOC", "O", "O", "O", "O", "O"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/sample_test_without_label.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি তার সাথে আবার কথা বলিনি।"} 2 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।"} 3 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমাদের খুব ভালো কথা হয়েছিল।"} 4 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।"} 5 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি।"} 6 | -------------------------------------------------------------------------------- /token_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # misc. 
settings 4 | export seed=1234 5 | 6 | # model settings 7 | export model_name= 8 | 9 | # input settings 10 | # exactly one of `dataset_dir` or the test 11 | # dataset file needs to be provided 12 | input_settings=( 13 | # "--dataset_dir sample_inputs/" 14 | "--test_file sample_inputs/sample_test_without_tags.jsonl" 15 | ) 16 | 17 | # output settings 18 | export output_dir="outputs/" 19 | 20 | # batch / sequence sizes 21 | export PER_DEVICE_EVAL_BATCH_SIZE=8 22 | export MAX_SEQUENCE_LENGTH=512 23 | 24 | # optional arguments 25 | optional_arguments=( 26 | "--cache_dir cache_dir/" 27 | "--overwrite_cache" 28 | ) 29 | 30 | # optional for logging 31 | # export WANDB_PROJECT="Token_classification_finetuning" 32 | # export WANDB_WATCH=false 33 | # export WANDB_MODE="dryrun" 34 | export WANDB_DISABLED=true 35 | 36 | python ./token_classification.py \ 37 | --model_name_or_path $model_name \ 38 | --output_dir $output_dir \ 39 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 40 | --max_seq_length $MAX_SEQUENCE_LENGTH \ 41 | --seed $seed --overwrite_output_dir --do_predict \ 42 | $(echo -n ${input_settings[@]}) \ 43 | $(echo ${optional_arguments[@]}) 44 | -------------------------------------------------------------------------------- /sequence_classification/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # misc. settings 4 | export seed=1234 5 | 6 | # model settings 7 | export model_name= 8 | 9 | # input settings 10 | # exactly one of `dataset_dir` or the test 11 | # dataset file needs to be provided 12 | input_settings=( 13 | # "--dataset_dir sample_inputs/single_sequence/jsonl/" 14 | "--test_file sample_inputs/single_sequence/jsonl/sample_test_without_labels.jsonl" 15 | ) 16 | 17 | # output settings 18 | export output_dir="outputs/" 19 | 20 | # batch sizes 21 | export PER_DEVICE_EVAL_BATCH_SIZE=8 22 | 23 | # optional arguments 24 | optional_arguments=( 25 | "--cache_dir cache_dir/" 26 | "--overwrite_cache" 27 | ) 28 | 29 | # optional for logging 30 | # export WANDB_PROJECT="Sequence_classification_finetuning" 31 | # export WANDB_WATCH=false 32 | # export WANDB_MODE="dryrun" 33 | export WANDB_DISABLED=true 34 | 35 | python ./sequence_classification.py \ 36 | --model_name_or_path $model_name \ 37 | --output_dir $output_dir \ 38 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 39 | --seed $seed --overwrite_output_dir --do_predict \ 40 | $(echo -n ${input_settings[@]}) \ 41 | $(echo ${optional_arguments[@]}) 42 | -------------------------------------------------------------------------------- /token_classification/sample_inputs/train.jsonl: -------------------------------------------------------------------------------- 1 | {"tokens": ["ত্রাণ", "ও", "সমাজকল্যাণ", "সম্পাদক", "সুজিত", "রায়", "নন্দী", "প্রমুখ", "সংবাদ", "সম্মেলনে", "উপস্থিত", "ছিলেন"], "tags": ["O", "O", "O", "B-PER", "B-PER", "I-PER", "I-PER", "O", "O", "O", "O", "O"]} 2 | {"tokens": ["পরিকল্পনা", "অনুযায়ী", "তারা", "বাসায়", "ঢুকে", "দুই", "অতিথিকে", "নগ্ন", "করে", "তাদের", "মাঝখানে", "এক", "ছাত্রীকে", "বসিয়ে", "ছবি", "তোলেন"], "tags": ["O", "O", "B-PER", "B-OBJ", "O", "O", "B-PER", "O", "O", "B-PER", "O", "O", "B-PER", "O", "B-OBJ", "O"]} 3 | {"tokens": ["এ", "ছাড়া", "শুরু", "থেকে", "স্থানীয়", "সরকারের", "গুরুত্বপূর্ণ", "সিটি", "নির্বাচন", "গভীর", "পর্যবেক্ষণে", "রেখেছে", "ইউরোপিয়ান", "ইউনিয়ন", "ইইউ"], "tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]} 4 | {"tokens": ["তিনি", "বলছিলেন", "সবচেয়ে", "বড়", "কথা", "উনি", "খুব", "ভালো", "মানুষ",
"ছিলেন"], "tags": ["B-PER", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O"]} 5 | {"tokens": ["এসব", "দোকান", "ও", "স্ট্যান্ড", "থেকে", "প্রতি", "মাসে", "বড়", "অঙ্কের", "টাকা", "সরকার", "দলীয়", "শ্রমিক", "সংগঠনের", "কিছু", "নেতা", "আদায়", "করেন", "বলে", "জানান", "স্থানীয়রা"], "tags": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B-PER", "O", "O", "O", "O", "B-PER"]} 6 | -------------------------------------------------------------------------------- /sequence_classification/sample_inputs/double_sequence/jsonl/test.jsonl: -------------------------------------------------------------------------------- 1 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি তার সাথে আবার কথা বলিনি।", "label": "contradiction"} 2 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমি এতটাই ভেঙে পড়েছিলাম যে, আমি আবারও তার সঙ্গে কথা বলতে শুরু করেছিলাম।", "label": "entailment"} 3 | {"sentence1": "আসলে, আমি এমনকি এই বিষয়ে চিন্তাও করিনি, কিন্তু আমি খুবই হতাশ হয়ে পড়েছিলাম এবং শেষ পর্যন্ত তার সঙ্গে আবার কথা বলতে শুরু করেছিলাম।", "sentence2": "আমাদের খুব ভালো কথা হয়েছিল।", "label": "neutral"} 4 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমি জানতাম না যে, সেই দিন শুধুমাত্র আমিই ক্ষেত্রে উপস্থিত থাকব না।", "label": "neutral"} 5 | {"sentence1": "এবং আমি ভেবেছিলাম এটা একটা বিশেষ সুযোগ, এবং এখনো, এটা এখনও, আমি মাত্র নয় দুই-দুই এক্স-ও যা ছিল আমার এএফএফসি বিমান বাহিনীর ক্যারিয়ার ক্ষেত্র।", "sentence2": "আমার মনে হয়েছিল যে এএফএফসি বিমান বাহিনীর ক্যারিয়ারের ক্ষেত্রে আমিই একমাত্র ব্যক্তি।", "label": "entailment"} 6 | -------------------------------------------------------------------------------- /question_answering/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # model settings 4 | export model_name= 5 | 6 | # qa specific settings 7 | export doc_stride=256 8 | export n_best_size=30 9 | export max_answer_length=30 10 | # misc. 
settings 11 | export seed=1234 12 | 13 | # input settings 14 | # exactly one of `dataset_dir` or the (train / validation) 15 | # dataset files needs to be provided 16 | input_settings=( 17 | "--dataset_dir sample_inputs/" 18 | # "--train_file sample_inputs/train.json" 19 | # "--validation_file sample_inputs/validation.json" 20 | ) 21 | 22 | # output settings 23 | export output_dir="outputs/" 24 | 25 | # batch / sequence sizes 26 | export PER_DEVICE_EVAL_BATCH_SIZE=16 27 | export MAX_SEQUENCE_LENGTH=512 28 | 29 | # optional arguments 30 | optional_arguments=( 31 | "--allow_null_ans" 32 | "--null_score_diff_threshold 0.0" 33 | "--overwrite_cache" 34 | "--cache_dir cache_dir/" 35 | "--fp16" 36 | "--fp16_backend auto" 37 | ) 38 | 39 | # optional for logging 40 | # export WANDB_PROJECT="Question_answering_finetuning" 41 | # export WANDB_WATCH=false 42 | # export WANDB_MODE="dryrun" 43 | export WANDB_DISABLED=true 44 | 45 | python ./question_answering.py \ 46 | --model_name_or_path $model_name \ 47 | --output_dir $output_dir \ 48 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 49 | --max_seq_length $MAX_SEQUENCE_LENGTH \ 50 | --doc_stride $doc_stride --n_best_size $n_best_size --max_answer_length $max_answer_length \ 51 | --seed $seed --overwrite_output_dir --do_predict \ 52 | $(echo -n ${input_settings[@]}) \ 53 | $(echo ${optional_arguments[@]}) 54 | 55 | -------------------------------------------------------------------------------- /question_answering/README.md: -------------------------------------------------------------------------------- 1 | ## Data format 2 | 3 | The finetuning script supports only `json` as the input file format. The input file structure should be the same as that of standard QA datasets like [SQuAD v2.0](https://rajpurkar.github.io/SQuAD-explorer/). 4 | 5 | ## Training & Evaluation 6 | 7 | To see a list of all available options, do `python question_answering.py -h`. There are two ways to provide input data files to the script: 8 | 9 | * with the flag `--dataset_dir <path/to/data/directory>`, where `<path/to/data/directory>` points to the directory containing files with the prefixes `train`, `validation` and `test`. 10 | * with the flags `--train_file <path/to/train/file>` / `--validation_file <path/to/validation/file>` / `--test_file <path/to/test/file>`. 11 | 12 | For the following commands, we are going to use `--dataset_dir` to provide the input files.
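Before running the commands below, it may help to see the expected file structure. The sketch that follows is a minimal SQuAD-v2.0-style `train.json`; the title, context, question, and answer strings are placeholder values, not taken from the released datasets:

```json
{
  "version": "v2.0",
  "data": [
    {
      "title": "<article title>",
      "paragraphs": [
        {
          "context": "<passage containing the answer>",
          "qas": [
            {
              "id": "<unique question id>",
              "question": "<question text>",
              "answers": [{"text": "<answer span from the context>", "answer_start": 0}],
              "is_impossible": false
            }
          ]
        }
      ]
    }
  ]
}
```

As in SQuAD v2.0, unanswerable questions use the same layout with an empty `answers` list and `"is_impossible": true`.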
13 | 14 | 15 | ### Finetuning 16 | For finetuning on a single GPU, a minimal example is as follows: 17 | 18 | ```bash 19 | $ python ./question_answering.py \ 20 | --model_name_or_path "csebuetnlp/banglabert" \ 21 | --dataset_dir "sample_inputs/" \ 22 | --output_dir "outputs/" \ 23 | --learning_rate=2e-5 \ 24 | --warmup_ratio 0.1 \ 25 | --gradient_accumulation_steps 2 \ 26 | --weight_decay 0.1 \ 27 | --lr_scheduler_type "linear" \ 28 | --per_device_train_batch_size=16 \ 29 | --per_device_eval_batch_size=16 \ 30 | --max_seq_length 512 \ 31 | --logging_strategy "epoch" \ 32 | --save_strategy "epoch" \ 33 | --evaluation_strategy "epoch" \ 34 | --num_train_epochs=3 \ 35 | --do_train --do_eval 36 | ``` 37 | For a detailed example, refer to **[trainer.sh](trainer.sh).** 38 | 39 | 40 | ### Evaluation 41 | * To calculate metrics on the test set / run inference on raw data, use the following snippet: 42 | 43 | ```bash 44 | $ python ./question_answering.py \ 45 | --model_name_or_path <path/to/trained/model> \ 46 | --dataset_dir "sample_inputs/" \ 47 | --output_dir "outputs/" \ 48 | --per_device_eval_batch_size=16 \ 49 | --overwrite_output_dir \ 50 | --do_predict 51 | ``` 52 | For a detailed example, refer to **[evaluate.sh](evaluate.sh).** -------------------------------------------------------------------------------- /token_classification/README.md: -------------------------------------------------------------------------------- 1 | ## Data format 2 | 3 | The finetuning script supports only `jsonl` (one JSON object per line) as the input file format. By default, the script expects the following key names: 4 | 5 | * `tokens` - List of input tokens 6 | * `tags` - Classification labels / tags for each token 7 | 8 | 9 | You can specify custom key names using the flags `--tokens_key <key name>` and `--tags_key <key name>` to `token_classification.py`. To view sample input files, see the files **[here](sample_inputs/).** 10 | 11 | ## Training & Evaluation 12 | 13 | To see a list of all available options, do `python token_classification.py -h`. There are two ways to provide input data files to the script: 14 | 15 | * with the flag `--dataset_dir <path/to/data/directory>`, where `<path/to/data/directory>` points to the directory containing files with the prefixes `train`, `validation` and `test`. 16 | * with the flags `--train_file <path/to/train/file>` / `--validation_file <path/to/validation/file>` / `--test_file <path/to/test/file>`. 17 | 18 | For the following commands, we are going to use `--dataset_dir` to provide the input files.
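As a reference for the `jsonl` format described above, each input line pairs a `tokens` list with a `tags` list of equal length. The line below is copied from the bundled [sample inputs](sample_inputs/); unlabeled test files simply omit the `tags` key (see `sample_test_without_tags.jsonl`):

```json
{"tokens": ["জেলা", "ক্রীড়া", "সংস্থার", "সাধারণ", "সম্পাদক", "ওবায়দুর", "রহমান", "খান"], "tags": ["B-LOC", "B-PER", "I-PER", "I-PER", "I-PER", "B-PER", "I-PER", "I-PER"]}
```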
19 | 20 | 21 | ### Finetuning 22 | For finetuning on a single GPU, a minimal example is as follows: 23 | 24 | ```bash 25 | $ python ./token_classification.py \ 26 | --model_name_or_path "csebuetnlp/banglabert" \ 27 | --dataset_dir "sample_inputs/" \ 28 | --output_dir "outputs/" \ 29 | --learning_rate=2e-5 \ 30 | --warmup_ratio 0.1 \ 31 | --gradient_accumulation_steps 2 \ 32 | --weight_decay 0.1 \ 33 | --lr_scheduler_type "linear" \ 34 | --per_device_train_batch_size=16 \ 35 | --per_device_eval_batch_size=16 \ 36 | --max_seq_length 512 \ 37 | --logging_strategy "epoch" \ 38 | --save_strategy "epoch" \ 39 | --evaluation_strategy "epoch" \ 40 | --num_train_epochs=3 \ 41 | --do_train --do_eval 42 | ``` 43 | For a detailed example, refer to **[trainer.sh](trainer.sh).** 44 | 45 | 46 | ### Evaluation 47 | * To calculate metrics on the test set / run inference on raw data, use the following snippet: 48 | 49 | ```bash 50 | $ python ./token_classification.py \ 51 | --model_name_or_path <path/to/trained/model> \ 52 | --dataset_dir "sample_inputs/" \ 53 | --output_dir "outputs/" \ 54 | --per_device_eval_batch_size=16 \ 55 | --overwrite_output_dir \ 56 | --do_predict 57 | ``` 58 | For a detailed example, refer to **[evaluate.sh](evaluate.sh).** -------------------------------------------------------------------------------- /token_classification/trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # training settings 4 | export num_train_epochs=3 5 | export save_strategy="epoch" 6 | export logging_strategy="epoch" 7 | 8 | # validation settings 9 | export evaluation_strategy="epoch" 10 | 11 | # model settings 12 | export model_name="csebuetnlp/banglabert" 13 | 14 | # optimization settings 15 | export learning_rate=2e-5 16 | export warmup_ratio=0.1 17 | export gradient_accumulation_steps=2 18 | export weight_decay=0.01 19 | export lr_scheduler_type="linear" 20 | 21 | # misc.
settings 22 | export seed=1234 23 | 24 | # input settings 25 | # exactly one of `dataset_dir` or the (train / validation) 26 | # dataset files need to be provided 27 | input_settings=( 28 | "--dataset_dir sample_inputs/" 29 | # "--train_file sample_inputs/train.jsonl" 30 | # "--validation_file sample_inputs/validation.jsonl" 31 | ) 32 | 33 | # output settings 34 | export output_dir="outputs/" 35 | 36 | # batch / sequence sizes 37 | export PER_DEVICE_TRAIN_BATCH_SIZE=16 38 | export PER_DEVICE_EVAL_BATCH_SIZE=16 39 | export MAX_SEQUENCE_LENGTH=512 40 | 41 | # optional arguments 42 | optional_arguments=( 43 | "--metric_for_best_model weighted_avg_f1" 44 | "--greater_is_better true" # this should be commented out if the reverse is required 45 | "--load_best_model_at_end" 46 | "--logging_first_step" 47 | "--overwrite_cache" 48 | "--cache_dir cache_dir/" 49 | "--fp16" 50 | "--fp16_backend auto" 51 | ) 52 | 53 | # optional for logging 54 | # export WANDB_PROJECT="Token_classification_finetuning" 55 | # export WANDB_WATCH=false 56 | # export WANDB_MODE="dryrun" 57 | export WANDB_DISABLED=true 58 | 59 | python ./token_classification.py \ 60 | --model_name_or_path $model_name \ 61 | --output_dir $output_dir \ 62 | --learning_rate=$learning_rate --warmup_ratio $warmup_ratio --gradient_accumulation_steps $gradient_accumulation_steps \ 63 | --weight_decay $weight_decay --lr_scheduler_type $lr_scheduler_type \ 64 | --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 65 | --max_seq_length $MAX_SEQUENCE_LENGTH --logging_strategy $logging_strategy \ 66 | --seed $seed --overwrite_output_dir \ 67 | --num_train_epochs=$num_train_epochs --save_strategy $save_strategy \ 68 | --evaluation_strategy $evaluation_strategy --do_train --do_eval \ 69 | $(echo -n ${input_settings[@]}) \ 70 | $(echo ${optional_arguments[@]}) 71 | 72 | -------------------------------------------------------------------------------- /sequence_classification/trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # training settings 4 | export num_train_epochs=3 5 | export save_strategy="epoch" 6 | export logging_strategy="epoch" 7 | 8 | # validation settings 9 | export evaluation_strategy="epoch" 10 | 11 | # model settings 12 | export model_name="csebuetnlp/banglabert" 13 | 14 | # optimization settings 15 | export learning_rate=2e-5 16 | export warmup_ratio=0.1 17 | export gradient_accumulation_steps=2 18 | export weight_decay=0.01 19 | export lr_scheduler_type="linear" 20 | 21 | # misc. 
settings 22 | export seed=1234 23 | 24 | # input settings 25 | # exactly one of `dataset_dir` or the (train / validation) 26 | # dataset files need to be provided 27 | input_settings=( 28 | "--dataset_dir sample_inputs/single_sequence/jsonl" 29 | # "--train_file sample_inputs/single_sequence/jsonl/train.jsonl" 30 | # "--validation_file sample_inputs/single_sequence/jsonl/validation.jsonl" 31 | ) 32 | 33 | 34 | # output settings 35 | export output_dir="outputs/" 36 | 37 | # batch / sequence sizes 38 | export PER_DEVICE_TRAIN_BATCH_SIZE=16 39 | export PER_DEVICE_EVAL_BATCH_SIZE=16 40 | export MAX_SEQUENCE_LENGTH=512 41 | 42 | # optional arguments 43 | optional_arguments=( 44 | "--metric_for_best_model accuracy" 45 | "--greater_is_better true" # this should be commented out if the reverse is required 46 | "--load_best_model_at_end" 47 | "--logging_first_step" 48 | "--overwrite_cache" 49 | "--cache_dir cache_dir/" 50 | "--fp16" 51 | "--fp16_backend auto" 52 | "--do_predict" 53 | ) 54 | 55 | # optional for logging 56 | # export WANDB_PROJECT="Sequence_classification_finetuning" 57 | # export WANDB_WATCH=false 58 | # export WANDB_MODE="dryrun" 59 | export WANDB_DISABLED=true 60 | 61 | python ./sequence_classification.py \ 62 | --model_name_or_path $model_name \ 63 | --output_dir $output_dir \ 64 | --learning_rate=$learning_rate --warmup_ratio $warmup_ratio --gradient_accumulation_steps $gradient_accumulation_steps \ 65 | --weight_decay $weight_decay --lr_scheduler_type $lr_scheduler_type \ 66 | --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 67 | --max_seq_length $MAX_SEQUENCE_LENGTH --logging_strategy $logging_strategy \ 68 | --seed $seed --overwrite_output_dir \ 69 | --num_train_epochs=$num_train_epochs --save_strategy $save_strategy \ 70 | --evaluation_strategy $evaluation_strategy --do_train --do_eval \ 71 | $(echo -n ${input_settings[@]}) \ 72 | $(echo ${optional_arguments[@]}) 73 | 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # tests and logs 12 | tests/fixtures/cached_*_text.txt 13 | logs/ 14 | lightning_logs/ 15 | lang_code_data/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | 124 | # vscode 125 | .vs 126 | .vscode 127 | 128 | # Pycharm 129 | .idea 130 | 131 | # TF code 132 | tensorflow_code 133 | 134 | # Models 135 | proc_data 136 | 137 | # examples 138 | runs 139 | /runs_old 140 | /wandb 141 | /examples/runs 142 | /examples/**/*.args 143 | /examples/rag/sweep 144 | 145 | # data 146 | /data 147 | serialization_dir 148 | 149 | # emacs 150 | *.*~ 151 | debug.env 152 | 153 | # vim 154 | .*.swp 155 | 156 | #ctags 157 | tags 158 | 159 | # pre-commit 160 | .pre-commit* 161 | 162 | # .lock 163 | *.lock -------------------------------------------------------------------------------- /question_answering/trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # training settings 4 | export num_train_epochs=10 5 | export save_strategy="epoch" 6 | export logging_strategy="epoch" 7 | 8 | # validation settings 9 | export evaluation_strategy="epoch" 10 | 11 | # model settings 12 | export model_name="csebuetnlp/banglabert" 13 | 14 | # optimization settings 15 | export learning_rate=2e-5 16 | export warmup_ratio=0.1 17 | export gradient_accumulation_steps=16 18 | export weight_decay=0.01 19 | export lr_scheduler_type="linear" 20 | 21 | # qa specific settings 22 | export doc_stride=256 23 | export n_best_size=30 24 | export max_answer_length=30 25 | 26 | # misc. 
settings 27 | export seed=1234 28 | 29 | # input settings 30 | # exactly one of `dataset_dir` or the (train / validation) 31 | # dataset files needs to be provided 32 | input_settings=( 33 | "--dataset_dir sample_inputs/" 34 | # "--train_file sample_inputs/train.json" 35 | # "--validation_file sample_inputs/validation.json" 36 | ) 37 | 38 | # output settings 39 | export output_dir="outputs/" 40 | 41 | # batch / sequence sizes 42 | export PER_DEVICE_TRAIN_BATCH_SIZE=2 43 | export PER_DEVICE_EVAL_BATCH_SIZE=2 44 | export MAX_SEQUENCE_LENGTH=512 45 | 46 | # optional arguments 47 | optional_arguments=( 48 | "--allow_null_ans" 49 | "--null_score_diff_threshold 0.0" 50 | "--metric_for_best_model f1" 51 | "--greater_is_better true" # this should be commented out if the reverse is required 52 | "--load_best_model_at_end" 53 | "--logging_first_step" 54 | "--overwrite_cache" 55 | "--cache_dir cache_dir/" 56 | "--fp16" 57 | "--fp16_backend auto" 58 | "--do_predict" 59 | ) 60 | 61 | # optional for logging 62 | # export WANDB_PROJECT="Question_answering_finetuning" 63 | # export WANDB_WATCH=false 64 | # export WANDB_MODE="dryrun" 65 | export WANDB_DISABLED=true 66 | 67 | python ./question_answering.py \ 68 | --model_name_or_path $model_name \ 69 | --output_dir $output_dir \ 70 | --learning_rate=$learning_rate --warmup_ratio $warmup_ratio --gradient_accumulation_steps $gradient_accumulation_steps \ 71 | --weight_decay $weight_decay --lr_scheduler_type $lr_scheduler_type \ 72 | --per_device_train_batch_size=$PER_DEVICE_TRAIN_BATCH_SIZE --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 73 | --max_seq_length $MAX_SEQUENCE_LENGTH --logging_strategy $logging_strategy \ 74 | --doc_stride $doc_stride --n_best_size $n_best_size --max_answer_length $max_answer_length \ 75 | --seed $seed --overwrite_output_dir \ 76 | --num_train_epochs=$num_train_epochs --save_strategy $save_strategy \ 77 | --evaluation_strategy $evaluation_strategy --do_train --do_eval \ 78 | $(echo -n ${input_settings[@]}) \ 79 | $(echo ${optional_arguments[@]}) 80 | 81 | -------------------------------------------------------------------------------- /sequence_classification/README.md: -------------------------------------------------------------------------------- 1 | ## Data format 2 | 3 | The finetuning script supports the following input file formats: `csv`, `tsv` and `jsonl` (one JSON object per line). By default, the script expects the following column names (for `tsv`, `csv`) / key names (for `jsonl`): 4 | 5 | * For single sequence classification: 6 | * `sentence1` - Input sequence 7 | * `label` - Classification label (Optional for `test` files) 8 | 9 | * For double sequence classification: 10 | * `sentence1` - First input sequence 11 | * `sentence2` - Second input sequence 12 | * `label` - Classification label (Optional for `test` files) 13 | 14 | You can specify custom column / key names using the flags `--sentence1_key <key name>`, `--sentence2_key <key name>` and `--label_key <key name>` to `sequence_classification.py`. To view sample input files for all supported formats, see the files **[here](sample_inputs/)**; a couple of `jsonl` example lines are also shown below. 15 | 16 | ## Training & Evaluation 17 | 18 | To see a list of all available options, do `python sequence_classification.py -h`. There are two ways to provide input data files to the script: 19 | 20 | * with the flag `--dataset_dir <path/to/data/directory>`, where `<path/to/data/directory>` points to the directory containing files with the prefixes `train`, `validation` and `test`. 21 | * with the flags `--train_file <path/to/train/file>` / `--validation_file <path/to/validation/file>` / `--test_file <path/to/test/file>`.
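For reference, here is what a single line of the bundled `jsonl` sample inputs looks like in each mode (both lines are copied verbatim from [sample_inputs/](sample_inputs/); the `label` key is omitted in unlabeled test files):

```json
{"sentence1": "সবার জন্য উন্মুক্ত।", "label": "pos"}
{"sentence1": "আর সে বললো, মা, আমি বাসায়।", "sentence2": "সে কোন কথা বলেনি।", "label": "contradiction"}
```

The first line follows the single-sequence schema; the second adds `sentence2` for double-sequence tasks such as NLI.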
22 | 23 | For the following commands, we are going to use `--dataset_dir` to provide the input files. 24 | 25 | 26 | ### Finetuning 27 | For finetuning on a single GPU, a minimal example is as follows: 28 | 29 | ```bash 30 | $ python ./sequence_classification.py \ 31 | --model_name_or_path "csebuetnlp/banglabert" \ 32 | --dataset_dir "sample_inputs/single_sequence/jsonl" \ 33 | --output_dir "outputs/" \ 34 | --learning_rate=2e-5 \ 35 | --warmup_ratio 0.1 \ 36 | --gradient_accumulation_steps 2 \ 37 | --weight_decay 0.1 \ 38 | --lr_scheduler_type "linear" \ 39 | --per_device_train_batch_size=16 \ 40 | --per_device_eval_batch_size=16 \ 41 | --max_seq_length 512 \ 42 | --logging_strategy "epoch" \ 43 | --save_strategy "epoch" \ 44 | --evaluation_strategy "epoch" \ 45 | --num_train_epochs=3 \ 46 | --do_train --do_eval 47 | ``` 48 | For a detailed example, refer to **[trainer.sh](trainer.sh).** 49 | 50 | 51 | ### Evaluation 52 | * To calculate metrics on the test set / run inference on raw data, use the following snippet: 53 | 54 | ```bash 55 | $ python ./sequence_classification.py \ 56 | --model_name_or_path <path/to/trained/model> \ 57 | --dataset_dir "sample_inputs/single_sequence/jsonl" \ 58 | --output_dir "outputs/" \ 59 | --per_device_eval_batch_size=16 \ 60 | --overwrite_output_dir \ 61 | --do_predict 62 | ``` 63 | For a detailed example, refer to **[evaluate.sh](evaluate.sh).** -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BanglaBERT 2 | 3 | This repository contains the official release of the model **"BanglaBERT"** and associated downstream fine-tuning code and datasets introduced in the paper titled [**"BanglaBERT: Language Model Pretraining and Benchmarks for 4 | Low-Resource Language Understanding Evaluation in Bangla"**](https://aclanthology.org/2022.findings-naacl.98/) published in *Findings of the Association for Computational Linguistics: NAACL 2022*. 5 | 6 | ## Updates 7 | * We have released [BanglaBERT (small)](https://huggingface.co/csebuetnlp/banglabert_small). It can be fine-tuned with as little as 4 GB VRAM! 8 | * We have released a large variant of BanglaBERT! Have a look [here](https://huggingface.co/csebuetnlp/banglabert_large). 9 | * The Bangla2B+ pretraining corpus is now available upon request! See [here](#datasets). 10 | 11 | ## Table of Contents 12 | 13 | - [BanglaBERT](#banglabert) 14 | - [Table of Contents](#table-of-contents) 15 | - [Models](#models) 16 | - [Datasets](#datasets) 17 | - [Setup](#setup) 18 | - [Training & Evaluation](#training--evaluation) 19 | - [Benchmarks](#benchmarks) 20 | - [Acknowledgements](#acknowledgements) 21 | - [License](#license) 22 | - [Citation](#citation) 23 | 24 | ## Models 25 | 26 | The pretrained model checkpoints are available at the [Huggingface model hub](https://huggingface.co/csebuetnlp). 27 | 28 | - [**BanglaBERT**](https://huggingface.co/csebuetnlp/banglabert) 29 | - [**BanglishBERT**](https://huggingface.co/csebuetnlp/banglishbert) 30 | - [**BanglaBERT (small)**](https://huggingface.co/csebuetnlp/banglabert_small) 31 | - [**BanglaBERT (large)**](https://huggingface.co/csebuetnlp/banglabert_large) 32 | 33 | To use these models for the supported downstream tasks in this repository, see **[Training & Evaluation](#training--evaluation).** 34 | 35 | ***Note:*** These models were pretrained using a ***specific normalization pipeline*** available **[here](https://github.com/csebuetnlp/normalizer)**.
All finetuning scripts in this repository use this normalization by default. If you need to adapt the pretrained model for a different task, make sure ***the text units are normalized using this pipeline before tokenizing*** to get the best results. A basic example is available at the **[model page](https://huggingface.co/csebuetnlp/banglabert).** 36 | 37 | ## Datasets 38 | 39 | We are also releasing the Bangla Natural Language Inference (NLI) and Bangla Question Answering (QA) datasets introduced in the paper. 40 | - [**NLI**](https://huggingface.co/datasets/csebuetnlp/xnli_bn) 41 | - [**QA**](https://huggingface.co/datasets/csebuetnlp/squad_bn) 42 | 43 | Please fill out this [**Google Form**](https://forms.gle/qiEW8f7i6Bw3FmmQA) to request access to the Bangla2B+ pretraining corpus. 44 | 45 | ## Setup 46 | 47 | For installing the necessary requirements, use the following bash snippet: 48 | ```bash 49 | $ git clone https://github.com/csebuetnlp/banglabert 50 | $ cd banglabert/ 51 | $ conda create python==3.7.9 pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.0 cudatoolkit=10.2 -c pytorch -p ./env 52 | $ conda activate ./env # or source activate ./env (for older versions of anaconda) 53 | $ bash setup.sh 54 | ``` 55 | * Use the newly created environment for running the scripts in this repository. 56 | 57 | ## Training & Evaluation 58 | 59 | To use the pretrained models for finetuning / inference on different downstream tasks, see the following sections: 60 | 61 | - **[Sequence Classification](sequence_classification/).** 62 | - For single sequence classification such as 63 | - Document classification 64 | - Sentiment classification 65 | - Emotion classification etc. 66 | - For double sequence classification such as 67 | - Natural Language Inference (NLI) 68 | - Paraphrase detection etc. 69 | - **[Token Classification](token_classification/).** 70 | - For token tagging / classification tasks such as 71 | - Named Entity Recognition (NER) 72 | - Parts of Speech Tagging (PoS) etc.
73 | - **[Question Answering](question_answering/).** 74 | - For tasks such as, 75 | - Extractive Question Answering 76 | - Open-domain Question Answering 77 | 78 | 79 | ## Benchmarks 80 | 81 | * Zero-shot cross-lingual transfer-learning 82 | 83 | | Model | Params | SC (macro-F1) | NLI (accuracy) | NER (micro-F1) | QA (EM/F1) | BangLUE score | 84 | |----------------|-----------|-----------|-----------|-----------|-----------|-----------| 85 | |[mBERT](https://huggingface.co/bert-base-multilingual-cased) | 180M | 27.05 | 62.22 | 39.27 | 59.01/64.18 | 50.35 | 86 | |[XLM-R (base)](https://huggingface.co/xlm-roberta-base) | 270M | 42.03 | 72.18 | 45.37 | 55.03/61.83 | 55.29 | 87 | |[XLM-R (large)](https://huggingface.co/xlm-roberta-large) | 550M | 49.49 | 78.13 | 56.48 | 71.13/77.70 | 66.59 | 88 | |[BanglishBERT](https://huggingface.co/csebuetnlp/banglishbert) | 110M | 48.39 | 75.26 | 55.56 | 72.87/78.63 | 66.14 | 89 | 90 | * Supervised fine-tuning 91 | 92 | | Model | Params | SC (macro-F1) | NLI (accuracy) | NER (micro-F1) | QA (EM/F1) | BangLUE score | 93 | |----------------|-----------|-----------|-----------|-----------|-----------|-----------| 94 | |[mBERT](https://huggingface.co/bert-base-multilingual-cased) | 180M | 67.59 | 75.13 | 68.97 | 67.12/72.64 | 70.29 | 95 | |[XLM-R (base)](https://huggingface.co/xlm-roberta-base) | 270M | 69.54 | 78.46 | 73.32 | 68.09/74.27 | 72.82 | 96 | |[XLM-R (large)](https://huggingface.co/xlm-roberta-large) | 550M | 70.97 | 82.40 | 78.39 | 73.15/79.06 | 76.79 | 97 | |[sahajBERT](https://huggingface.co/neuropark/sahajBERT) | 18M | 71.12 | 76.92 | 70.94 | 65.48/70.69 | 71.03 | 98 | |[BanglishBERT](https://huggingface.co/csebuetnlp/banglishbert) | 110M | 70.61 | 80.95 | 76.28 | 72.43/78.40 | *75.73* | 99 | |[BanglaBERT (small)](https://huggingface.co/csebuetnlp/banglabert_small) | 13M | 69.29 | 76.75 | 73.41 | 63.30/69.65 | *70.38* | 100 | |[BanglaBERT](https://huggingface.co/csebuetnlp/banglabert) | 110M | 72.89 | 82.80 | 77.78 | 72.63/79.34 | *77.09* | 101 | |[BanglaBERT (large)](https://huggingface.co/csebuetnlp/banglabert_large) | 335M | 71.94 | 83.41 | 79.20 | 76.10/81.50 | **78.43** | 102 | 103 | 104 | The benchmarking datasets are as follows: 105 | * **SC:** **[Sentiment Classification](https://aclanthology.org/2021.findings-emnlp.278)** 106 | * **NER:** **[Named Entity Recognition](https://multiconer.github.io/competition)** 107 | * **NLI:** **[Natural Language Inference](#datasets)** 108 | * **QA:** **[Question Answering](#datasets)** 109 | 110 | ## Acknowledgements 111 | 112 | We would like to thank [Intelligent Machines](https://bd.linkedin.com/company/intelligentmachines) and [Google TFRC Program](https://sites.research.google/trc/) for providing cloud support for pretraining the models. 113 | 114 | 115 | ## License 116 | Contents of this repository are restricted to non-commercial research purposes only under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). 
117 |
118 | Creative Commons License
119 |
120 | ## Citation
121 | If you use any of the datasets, models, or code modules, please cite the following paper:
122 | ```
123 | @inproceedings{bhattacharjee-etal-2022-banglabert,
124 | title = "{B}angla{BERT}: Language Model Pretraining and Benchmarks for Low-Resource Language Understanding Evaluation in {B}angla",
125 | author = "Bhattacharjee, Abhik and
126 | Hasan, Tahmid and
127 | Ahmad, Wasi and
128 | Mubasshir, Kazi Samin and
129 | Islam, Md Saiful and
130 | Iqbal, Anindya and
131 | Rahman, M. Sohel and
132 | Shahriyar, Rifat",
133 | booktitle = "Findings of the Association for Computational Linguistics: NAACL 2022",
134 | month = jul,
135 | year = "2022",
136 | address = "Seattle, United States",
137 | publisher = "Association for Computational Linguistics",
138 | url = "https://aclanthology.org/2022.findings-naacl.98",
139 | pages = "1318--1327",
140 | abstract = "In this work, we introduce BanglaBERT, a BERT-based Natural Language Understanding (NLU) model pretrained in Bangla, a widely spoken yet low-resource language in the NLP literature. To pretrain BanglaBERT, we collect 27.5 GB of Bangla pretraining data (dubbed {`}Bangla2B+{'}) by crawling 110 popular Bangla sites. We introduce two downstream task datasets on natural language inference and question answering and benchmark on four diverse NLU tasks covering text classification, sequence labeling, and span prediction. In the process, we bring them under the first-ever Bangla Language Understanding Benchmark (BLUB). BanglaBERT achieves state-of-the-art results outperforming multilingual and monolingual models. We are making the models, datasets, and a leaderboard publicly available at \url{https://github.com/csebuetnlp/banglabert} to advance Bangla NLP.",
141 | }
142 | ```
143 | -------------------------------------------------------------------------------- /sequence_classification/sequence_classification.py: --------------------------------------------------------------------------------
1 | # Adapted from huggingface transformers classification scripts
2 |
3 | import logging
4 | import os
5 | import random
6 | import sys
7 | from dataclasses import dataclass, field
8 | from typing import Optional
9 |
10 | import glob
11 |
12 | import datasets
13 | import numpy as np
14 | from datasets import load_metric
15 | from datasets.io.json import JsonDatasetReader
16 | from datasets.io.csv import CsvDatasetReader
17 |
18 | import transformers
19 | from transformers import (
20 | AutoConfig,
21 | AutoModelForSequenceClassification,
22 | AutoTokenizer,
23 | DataCollatorWithPadding,
24 | EvalPrediction,
25 | HfArgumentParser,
26 | PretrainedConfig,
27 | Trainer,
28 | TrainingArguments,
29 | default_data_collator,
30 | set_seed,
31 | )
32 | from transformers.trainer_utils import get_last_checkpoint
33 | from transformers.utils import check_min_version
34 | from transformers.utils.versions import require_version
35 | from normalizer import normalize
36 |
37 | EXT2CONFIG = {
38 | "csv" : (CsvDatasetReader, {}),
39 | "tsv" : (CsvDatasetReader, {"sep": "\t"}),
40 | "jsonl": (JsonDatasetReader, {}),
41 | "json": (JsonDatasetReader, {})
42 | }
43 |
44 | logger = logging.getLogger(__name__)
45 |
46 |
47 | @dataclass
48 | class DataTrainingArguments:
49 |
50 | dataset_dir: Optional[str] = field(
51 | default=None, metadata={
52 | "help": "Path to the directory containing the data files. (.csv / .tsv / .jsonl)"
53 | "File datatypes will be identified with their prefix names as follows: "
54 | "`train`- Training file(s) e.g. `train.csv`/ `train_part1.csv` etc. "
55 | "`validation`- Evaluation file(s) e.g. `validation.csv`/ `validation_part1.csv` etc. "
56 | "`test`- Test file(s) e.g. `test.csv`/ `test_part1.csv` etc. "
57 | "All files must have the same extension."
58 | }
59 | )
60 | max_seq_length: int = field(
61 | default=512,
62 | metadata={
63 | "help": "The maximum total input sequence length after tokenization. Sequences longer "
64 | "than this will be truncated, sequences shorter will be padded."
65 | },
66 | )
67 | overwrite_cache: bool = field(
68 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
69 | )
70 | pad_to_max_length: bool = field(
71 | default=False,
72 | metadata={
73 | "help": "Whether to pad all samples to `max_seq_length`. "
74 | "If False, will pad the samples dynamically when batching to the maximum length in the batch."
75 | },
76 | )
77 | max_train_samples: Optional[int] = field(
78 | default=None,
79 | metadata={
80 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
81 | "value if set."
82 | },
83 | )
84 | max_eval_samples: Optional[int] = field(
85 | default=None,
86 | metadata={
87 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
88 | "value if set."
89 | },
90 | )
91 | max_predict_samples: Optional[int] = field(
92 | default=None,
93 | metadata={
94 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
95 | "value if set."
96 | },
97 | )
98 | train_file: Optional[str] = field(
99 | default=None, metadata={"help": "A csv / tsv / jsonl file containing the training data."}
100 | )
101 | validation_file: Optional[str] = field(
102 | default=None, metadata={"help": "A csv / tsv / jsonl file containing the validation data."}
103 | )
104 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv / tsv / jsonl file containing the test data."})
105 | do_normalize: Optional[bool] = field(default=True, metadata={"help": "Normalize text before feeding to the model."})
106 | unicode_norm: Optional[str] = field(default="NFKC", metadata={"help": "Type of unicode normalization"})
107 | remove_punct: Optional[bool] = field(
108 | default=False, metadata={
109 | "help": "Remove punctuation during normalization. To replace with a custom token or apply selective replacement, "
110 | "use this repo (https://github.com/abhik1505040/normalizer) before feeding the data to the script."
111 | })
112 | remove_emoji: Optional[bool] = field(
113 | default=False, metadata={
114 | "help": "Remove emojis during normalization. To replace with a custom token or apply selective replacement, "
115 | "use this repo (https://github.com/abhik1505040/normalizer) before feeding the data to the script."
116 | })
117 | remove_urls: Optional[bool] = field(
118 | default=False, metadata={
119 | "help": "Remove urls during normalization. To replace with a custom token or apply selective replacement, "
120 | "use this repo (https://github.com/abhik1505040/normalizer) before feeding the data to the script."
121 | })
122 | sentence1_key: Optional[str] = field(
123 | default="sentence1", metadata={"help": "Key / column name in the input file corresponding to the first input sequence"}
124 | )
125 | sentence2_key: Optional[str] = field(
126 | default="sentence2", metadata={"help": "Key / column name in the input file corresponding to the second input sequence"}
127 | )
128 | label_key: Optional[str] = field(
129 | default="label", metadata={"help": "Key / column name in the input file corresponding to the classification label"}
130 | )
131 |
132 | def __post_init__(self):
133 | if self.train_file is not None and self.validation_file is not None:
134 | train_extension = self.train_file.split(".")[-1]
135 | assert train_extension in ["csv", "jsonl", "tsv", "json"], "`train_file` should be a csv / tsv / jsonl / json file."
136 | validation_extension = self.validation_file.split(".")[-1]
137 | assert (
138 | validation_extension == train_extension
139 | ), "`validation_file` should have the same extension (csv / tsv / jsonl / json) as `train_file`."
140 |
141 |
142 |
143 | @dataclass
144 | class ModelArguments:
145 |
146 | model_name_or_path: str = field(
147 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
148 | )
149 | cache_dir: Optional[str] = field(
150 | default=None,
151 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
152 | )
153 |
154 |
155 | def main():
156 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
157 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
158 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
159 | else:
160 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
161 |
162 | # Setup logging
163 | logging.basicConfig(
164 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
165 | datefmt="%m/%d/%Y %H:%M:%S",
166 | handlers=[logging.StreamHandler(sys.stdout)],
167 | )
168 |
169 | log_level = training_args.get_process_log_level()
170 | logger.setLevel(log_level)
171 | datasets.utils.logging.set_verbosity(log_level)
172 | transformers.utils.logging.set_verbosity(log_level)
173 | transformers.utils.logging.enable_default_handler()
174 | transformers.utils.logging.enable_explicit_format()
175 |
176 | # Log a short summary on each process:
177 | logger.warning(
178 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
179 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
180 | )
181 | logger.info(f"Training/evaluation parameters {training_args}")
182 |
183 | # Detecting last checkpoint.
184 | last_checkpoint = None
185 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
186 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
187 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
188 | raise ValueError(
189 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
190 | "Use --overwrite_output_dir to overcome."
191 | )
192 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
193 | logger.info(
194 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
195 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
196 | )
197 |
198 | # Set seed before initializing model.
199 | set_seed(training_args.seed)
200 | has_ext = lambda path: len(os.path.basename(path).split(".")) > 1
201 | get_ext = lambda path: os.path.basename(path).split(".")[-1]
202 |
203 | if data_args.dataset_dir is not None:
204 | data_files = {}
205 | all_files = glob.glob(
206 | os.path.join(
207 | data_args.dataset_dir,
208 | "*"
209 | )
210 | )
211 | all_exts = [get_ext(k) for k in all_files if has_ext(k)]
212 | if not all_exts:
213 | raise ValueError("The `dataset_dir` doesn't have any valid file.")
214 |
215 | selected_ext = max(set(all_exts), key=all_exts.count)
216 | for search_prefix in ["train", "validation", "test"]:
217 | found_files = glob.glob(
218 | os.path.join(
219 | data_args.dataset_dir,
220 | search_prefix + "*" + selected_ext
221 | )
222 | )
223 | if not found_files:
224 | continue
225 |
226 | data_files[search_prefix] = found_files
227 |
228 | else:
229 | data_files = {
230 | "train": data_args.train_file,
231 | "validation": data_args.validation_file,
232 | "test": data_args.test_file
233 | }
234 |
235 | data_files = {k: v for k, v in data_files.items() if v is not None}
236 |
237 | if not data_files:
238 | raise ValueError("No valid input file found.")
239 |
240 | selected_ext = get_ext(list(data_files.values())[0])
241 |
242 |
243 | dataset_configs = EXT2CONFIG[selected_ext]
244 | raw_datasets = dataset_configs[0](
245 | data_files,
246 | **dataset_configs[1]
247 | ).read()
248 |
249 | for data_type, ds in raw_datasets.items():
250 | assert data_args.sentence1_key in ds.features, f"Input files don't have the `{data_args.sentence1_key}` key"
251 | if data_type != "test":
252 | assert data_args.label_key in ds.features, f"Input files don't have the `{data_args.label_key}` key"
253 |
254 | ignored_columns = set(ds.column_names) - set([data_args.sentence1_key, data_args.sentence2_key, data_args.label_key])
255 | raw_datasets[data_type] = ds.remove_columns(ignored_columns)
256 |
257 |
258 | config = AutoConfig.from_pretrained(
259 | model_args.model_name_or_path,
260 | cache_dir=model_args.cache_dir,
261 | )
262 |
263 | label_to_id = config.label2id if config.task_specific_params and config.task_specific_params.get("finetuned", False) else None
264 | if label_to_id is None:
265 | label_list = raw_datasets["train"].unique(data_args.label_key)
266 | label_list.sort()
267 | num_labels = len(label_list)
268 | label_to_id = {v: i for i, v in enumerate(label_list)}
269 | config.label2id = label_to_id
270 | config.id2label = {id: label for label, id in config.label2id.items()}
271 | config.task_specific_params = {"finetuned": True}
272 | else:
273 | label_list = list(label_to_id.keys())
274 | num_labels = len(label_list)
275 |
276 | tokenizer = AutoTokenizer.from_pretrained(
277 | model_args.model_name_or_path,
278 | cache_dir=model_args.cache_dir,
279 | use_fast=False
280 | )
281 | model = AutoModelForSequenceClassification.from_pretrained(
282 | model_args.model_name_or_path,
283 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
284 | config=config,
285 | cache_dir=model_args.cache_dir
286 | )
287 |
288 |
289 | # Padding strategy
290 | if data_args.pad_to_max_length:
291 | padding = "max_length"
292 | else:
293 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch
294 | padding = False
295 |
296 | if data_args.max_seq_length > tokenizer.model_max_length:
297 | logger.warning(
298 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
299 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
300 | )
301 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
302 |
303 | if data_args.do_normalize:
304 | normalization_kwargs = {
305 | "unicode_norm": data_args.unicode_norm,
306 | "punct_replacement": " " if data_args.remove_punct else None,
307 | "url_replacement": " " if data_args.remove_urls else None,
308 | "emoji_replacement": " " if data_args.remove_emoji else None
309 | }
310 |
311 | def normalize_example(example):
312 | l = example[data_args.sentence1_key]
313 | example[data_args.sentence1_key] = normalize(l, **normalization_kwargs)
314 |
315 | if data_args.sentence2_key in example:
316 | l = example[data_args.sentence2_key]
317 | example[data_args.sentence2_key] = normalize(l, **normalization_kwargs)
318 |
319 | return example
320 |
321 | raw_datasets = raw_datasets.map(
322 | normalize_example,
323 | desc="Running normalization on dataset",
324 | load_from_cache_file=not data_args.overwrite_cache
325 | )
326 |
327 |
328 | def preprocess_function(examples):
329 | # Tokenize the texts
330 | args = (
331 | (examples[data_args.sentence1_key],) if data_args.sentence2_key not in examples else (examples[data_args.sentence1_key], examples[data_args.sentence2_key])
332 | )
333 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
334 |
335 | if label_to_id is not None and data_args.label_key in examples:
336 | result["label"] = [label_to_id[l] for l in examples[data_args.label_key]]
337 |
338 | return result
339 |
340 | with training_args.main_process_first(desc="dataset map pre-processing"):
341 | raw_datasets = raw_datasets.map(
342 | preprocess_function,
343 | batched=True,
344 | load_from_cache_file=not data_args.overwrite_cache,
345 | desc="Running tokenizer on dataset",
346 | )
347 | if training_args.do_train:
348 | if "train" not in raw_datasets:
349 | raise ValueError("--do_train requires a train dataset")
350 | train_dataset = raw_datasets["train"]
351 | if data_args.max_train_samples is not None:
352 | train_dataset = train_dataset.select(range(data_args.max_train_samples))
353 |
354 | if training_args.do_eval:
355 | if "validation" not in raw_datasets:
356 | raise ValueError("--do_eval requires a validation dataset")
357 | eval_dataset = raw_datasets["validation"]
358 | if data_args.max_eval_samples is not None:
359 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
360 |
361 | if training_args.do_predict or data_args.test_file is not None:
362 | if "test" not in raw_datasets:
363 | raise ValueError("--do_predict requires a test dataset")
364 | predict_dataset = raw_datasets["test"]
365 | if data_args.max_predict_samples is not None:
366 | predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
367 |
368 | # Log a few random samples from the training set:
369 | if training_args.do_train:
370 | for index in random.sample(range(len(train_dataset)), 3):
371 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
372 |
373 |
374 | metric_names = [
375 | "accuracy",
376 | "precision",
377 | "recall",
378 | "f1"
379 | ]
380 | required_metrics = [load_metric(k) for k in metric_names]
381 | average_required = metric_names[1:]
382 |
383 | def compute_metrics(p: EvalPrediction):
384 | results = {}
385 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
386 | preds = np.argmax(preds, axis=1)
387 |
388 | for m in required_metrics:
389 | kwargs = {"average": "macro"} if m.name in average_required else {}
390 | r = m.compute(
391 | predictions=preds,
392 | references=p.label_ids,
393 | **kwargs
394 | )
395 | for k, v in r.items():
396 | results[k] = v
397 |
398 | return results
399 |
400 | if data_args.pad_to_max_length:
401 | data_collator = default_data_collator
402 | elif training_args.fp16:
403 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
404 | else:
405 | data_collator = None
406 |
407 | # Initialize our Trainer
408 | trainer = Trainer(
409 | model=model,
410 | args=training_args,
411 | train_dataset=train_dataset if training_args.do_train else None,
412 | eval_dataset=eval_dataset if training_args.do_eval else None,
413 | compute_metrics=compute_metrics,
414 | tokenizer=tokenizer,
415 | data_collator=data_collator,
416 | )
417 |
418 | # Training
419 | if training_args.do_train:
420 | checkpoint = None
421 | if training_args.resume_from_checkpoint is not None:
422 | checkpoint = training_args.resume_from_checkpoint
423 | elif last_checkpoint is not None:
424 | checkpoint = last_checkpoint
425 | train_result = trainer.train(resume_from_checkpoint=checkpoint)
426 | metrics = train_result.metrics
427 | max_train_samples = (
428 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
429 | )
430 | metrics["train_samples"] = min(max_train_samples, len(train_dataset))
431 |
432 | trainer.save_model()
433 |
434 | trainer.log_metrics("train", metrics)
435 | trainer.save_metrics("train", metrics)
436 | trainer.save_state()
437 |
438 | # Evaluation
439 | if training_args.do_eval:
440 | logger.info("*** Evaluate ***")
441 |
442 | metrics = trainer.evaluate(eval_dataset=eval_dataset)
443 |
444 | max_eval_samples = (
445 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
446 | )
447 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
448 |
449 | trainer.log_metrics("eval", metrics)
450 | trainer.save_metrics("eval", metrics)
451 |
452 | if training_args.do_predict:
453 | logger.info("*** Predict ***")
454 |
455 | predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
456 | predictions = np.argmax(predictions, axis=1)
457 |
458 | trainer.log_metrics("predict", metrics)
459 | trainer.save_metrics("predict", metrics)
460 |
461 | output_predict_file = os.path.join(training_args.output_dir, "predictions.txt")
462 | if trainer.is_world_process_zero():
463 | with open(output_predict_file, "w") as writer:
464 | logger.info("***** Predict results *****")
465 | writer.write("index\tprediction\n")
466 | for index, item in enumerate(predictions):
467 | item = label_list[item]
468 | writer.write(f"{index}\t{item}\n")
469 |
470 |
471 |
472 | def _mp_fn(index):
473 | # For xla_spawn (TPUs)
474 | main()
475 |
476 |
477 | if __name__ == "__main__":
478 | main()
479 | -------------------------------------------------------------------------------- /token_classification/token_classification.py: --------------------------------------------------------------------------------
1 | # Adapted from huggingface transformers classification scripts
2 |
3 | import logging
4 | import os
5 | import random
6 | import sys
7 | from dataclasses import dataclass, field
8 | from typing import Optional
9 | from seqeval.metrics import classification_report, accuracy_score
10 |
11 | import glob
12 |
13 | import datasets
14 | import numpy as np
15 | from datasets import ClassLabel, load_metric
16 | from datasets.io.json import JsonDatasetReader
17 |
18 | import transformers
19 | from transformers import (
20 | AutoConfig,
21 | AutoModelForTokenClassification,
22 | AutoTokenizer,
23 | DataCollatorForTokenClassification,
24 | EvalPrediction,
25 | HfArgumentParser,
26 | PretrainedConfig,
27 | Trainer,
28 | TrainingArguments,
29 | default_data_collator,
30 | set_seed,
31 | )
32 | from transformers.trainer_utils import get_last_checkpoint
33 | from transformers.utils import check_min_version
34 | from transformers.utils.versions import require_version
35 | from normalizer import normalize
36 |
37 | EXT2CONFIG = {
38 | "jsonl": (JsonDatasetReader, {}),
39 | "json": (JsonDatasetReader, {})
40 | }
41 |
42 | logger = logging.getLogger(__name__)
43 |
44 |
45 | @dataclass
46 | class DataTrainingArguments:
47 |
48 | dataset_dir: Optional[str] = field(
49 | default=None, metadata={
50 | "help": "Path to the directory containing the data files. (.jsonl)"
51 | "File datatypes will be identified with their prefix names as follows: "
52 | "`train`- Training file(s) e.g. `train.jsonl`/ `train_part1.jsonl` etc. "
53 | "`validation`- Evaluation file(s) e.g. `validation.jsonl`/ `validation_part1.jsonl` etc. "
54 | "`test`- Test file(s) e.g. `test.jsonl`/ `test_part1.jsonl` etc. "
55 | "All files must have the same extension."
56 | }
57 | )
58 | max_seq_length: int = field(
59 | default=512,
60 | metadata={
61 | "help": "The maximum total input sequence length after tokenization. Sequences longer "
62 | "than this will be truncated, sequences shorter will be padded."
63 | },
64 | )
65 | overwrite_cache: bool = field(
66 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
67 | )
68 | pad_to_max_length: bool = field(
69 | default=False,
70 | metadata={
71 | "help": "Whether to pad all samples to `max_seq_length`. "
72 | "If False, will pad the samples dynamically when batching to the maximum length in the batch."
73 | },
74 | )
75 | max_train_samples: Optional[int] = field(
76 | default=None,
77 | metadata={
78 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
79 | "value if set."
80 | },
81 | )
82 | max_eval_samples: Optional[int] = field(
83 | default=None,
84 | metadata={
85 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
86 | "value if set."
87 | },
88 | )
89 | max_predict_samples: Optional[int] = field(
90 | default=None,
91 | metadata={
92 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
93 | "value if set."
94 | },
95 | )
96 | train_file: Optional[str] = field(
97 | default=None, metadata={"help": "A jsonl / json file containing the training data."}
98 | )
99 | validation_file: Optional[str] = field(
100 | default=None, metadata={"help": "A jsonl / json file containing the validation data."}
101 | )
102 | test_file: Optional[str] = field(default=None, metadata={"help": "A jsonl / json file containing the test data."})
103 | do_normalize: Optional[bool] = field(default=True, metadata={"help": "Normalize text before feeding to the model."})
104 | label_all_tokens: bool = field(
105 | default=False,
106 | metadata={
107 | "help": "Whether to put the label for one word on all tokens generated by that word or just on the "
108 | "first one (in which case the other tokens will have a padding index)."
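# Note: -100 is the index that PyTorch's cross-entropy loss ignores by default, so
# sub-word tokens that are not assigned their own label do not contribute to the loss.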
109 | },
110 | )
111 | tokens_key: Optional[str] = field(
112 | default="tokens", metadata={"help": "Key name in the input file corresponding to the tokens."}
113 | )
114 | tags_key: Optional[str] = field(
115 | default="tags", metadata={"help": "Key name in the input file corresponding to the token labels/tags."}
116 | )
117 |
118 | def __post_init__(self):
119 | if self.train_file is not None and self.validation_file is not None:
120 | train_extension = self.train_file.split(".")[-1]
121 | assert train_extension in ["jsonl", "json"], "`train_file` should be a jsonl / json file."
122 | validation_extension = self.validation_file.split(".")[-1]
123 | assert (
124 | validation_extension == train_extension
125 | ), "`validation_file` should have the same extension as `train_file`."
126 |
127 |
128 |
129 | @dataclass
130 | class ModelArguments:
131 |
132 | model_name_or_path: str = field(
133 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
134 | )
135 | cache_dir: Optional[str] = field(
136 | default=None,
137 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
138 | )
139 |
140 |
141 | def main():
142 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
143 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
144 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
145 | else:
146 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
147 |
148 | # Setup logging
149 | logging.basicConfig(
150 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
151 | datefmt="%m/%d/%Y %H:%M:%S",
152 | handlers=[logging.StreamHandler(sys.stdout)],
153 | )
154 |
155 | log_level = training_args.get_process_log_level()
156 | logger.setLevel(log_level)
157 | datasets.utils.logging.set_verbosity(log_level)
158 | transformers.utils.logging.set_verbosity(log_level)
159 | transformers.utils.logging.enable_default_handler()
160 | transformers.utils.logging.enable_explicit_format()
161 |
162 | # Log a short summary on each process:
163 | logger.warning(
164 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
165 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
166 | )
167 | logger.info(f"Training/evaluation parameters {training_args}")
168 |
169 | # Detecting last checkpoint.
170 | last_checkpoint = None
171 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
172 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
173 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
174 | raise ValueError(
175 | f"Output directory ({training_args.output_dir}) already exists and is not empty. "
176 | "Use --overwrite_output_dir to overcome."
177 | )
178 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
179 | logger.info(
180 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
181 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
182 | )
183 |
184 | # Set seed before initializing model.
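# (transformers' `set_seed` seeds the Python, NumPy and PyTorch RNGs, so repeated runs
# with the same seed are reproducible.)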
185 | set_seed(training_args.seed)
186 | has_ext = lambda path: len(os.path.basename(path).split(".")) > 1
187 | get_ext = lambda path: os.path.basename(path).split(".")[-1]
188 |
189 | if data_args.dataset_dir is not None:
190 | data_files = {}
191 | all_files = glob.glob(
192 | os.path.join(
193 | data_args.dataset_dir,
194 | "*"
195 | )
196 | )
197 | all_exts = [get_ext(k) for k in all_files if has_ext(k)]
198 | if not all_exts:
199 | raise ValueError("The `dataset_dir` doesn't have any valid file.")
200 |
201 | selected_ext = max(set(all_exts), key=all_exts.count)
202 | for search_prefix in ["train", "validation", "test"]:
203 | found_files = glob.glob(
204 | os.path.join(
205 | data_args.dataset_dir,
206 | search_prefix + "*" + selected_ext
207 | )
208 | )
209 | if not found_files:
210 | continue
211 |
212 | data_files[search_prefix] = found_files
213 |
214 | else:
215 | data_files = {
216 | "train": data_args.train_file,
217 | "validation": data_args.validation_file,
218 | "test": data_args.test_file
219 | }
220 |
221 | data_files = {k: v for k, v in data_files.items() if v is not None}
222 |
223 | if not data_files:
224 | raise ValueError("No valid input file found.")
225 |
226 | selected_ext = get_ext(list(data_files.values())[0])
227 |
228 |
229 | dataset_configs = EXT2CONFIG[selected_ext]
230 | raw_datasets = dataset_configs[0](
231 | data_files,
232 | **dataset_configs[1]
233 | ).read()
234 |
235 | for data_type, ds in raw_datasets.items():
236 | assert data_args.tokens_key in ds.features, f"Input files don't have the `{data_args.tokens_key}` key"
237 | if data_type != "test":
238 | assert data_args.tags_key in ds.features, f"Input files don't have the `{data_args.tags_key}` key"
239 |
240 | ignored_columns = set(ds.column_names) - set([data_args.tokens_key, data_args.tags_key])
241 | raw_datasets[data_type] = ds.remove_columns(ignored_columns)
242 |
243 | config = AutoConfig.from_pretrained(
244 | model_args.model_name_or_path,
245 | cache_dir=model_args.cache_dir,
246 | )
247 |
248 | label_to_id = config.label2id if config.task_specific_params and config.task_specific_params.get("finetuned", False) else None
249 | if label_to_id is None:
250 | def get_label_list(labels):
251 | unique_labels = set()
252 | for label in labels:
253 | unique_labels = unique_labels | set(label)
254 | label_list = list(unique_labels)
255 | label_list.sort()
256 | return label_list
257 |
258 | label_list = get_label_list(raw_datasets["train"][data_args.tags_key])
259 | num_labels = len(label_list)
260 | label_to_id = {v: i for i, v in enumerate(label_list)}
261 | config.label2id = label_to_id
262 | config.id2label = {id: label for label, id in config.label2id.items()}
263 | config.task_specific_params = {"finetuned": True}
264 | else:
265 | label_list = list(label_to_id.keys())
266 | num_labels = len(label_list)
267 |
268 | tokenizer_kwargs = {"add_prefix_space": True} if config.model_type in {"gpt2", "roberta"} else {}
269 | tokenizer = AutoTokenizer.from_pretrained(
270 | model_args.model_name_or_path,
271 | cache_dir=model_args.cache_dir,
272 | use_fast=True,
273 | **tokenizer_kwargs
274 | )
275 | model = AutoModelForTokenClassification.from_pretrained(
276 | model_args.model_name_or_path,
277 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
278 | config=config,
279 | cache_dir=model_args.cache_dir
280 | )
281 |
282 |
283 | # Padding strategy
284 | if data_args.pad_to_max_length:
285 | padding = "max_length"
286 | else:
287 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch
288 | padding = False
289 |
290 | if data_args.max_seq_length > tokenizer.model_max_length:
291 | logger.warning(
292 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
293 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
294 | )
295 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
296 |
297 | if data_args.do_normalize:
298 | def normalize_example(example):
299 | for i, token in enumerate(example[data_args.tokens_key]):
300 | normalized_token = normalize(token)
301 | if len(normalized_token) > 0:
302 | example[data_args.tokens_key][i] = normalized_token
303 |
304 | return example
305 |
306 | raw_datasets = raw_datasets.map(
307 | normalize_example,
308 | desc="Running normalization on dataset",
309 | load_from_cache_file=not data_args.overwrite_cache
310 | )
311 |
312 | # Tokenize all texts and align the labels with them.
313 | def tokenize_and_align_labels(examples):
314 |
315 | tokenized_inputs = tokenizer(
316 | examples[data_args.tokens_key],
317 | padding=padding,
318 | truncation=True,
319 | max_length=max_seq_length,
320 | is_split_into_words=True,
321 | )
322 | labels = []
323 | for i, label in enumerate(examples[data_args.tags_key]):
324 | word_ids = tokenized_inputs.word_ids(batch_index=i)
325 | previous_word_idx = None
326 | label_ids = []
327 | for word_idx in word_ids:
328 | if word_idx is None:
329 | label_ids.append(-100)
330 | elif word_idx != previous_word_idx:
331 | label_ids.append(label_to_id[label[word_idx]])
332 | else:
333 | label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
334 | previous_word_idx = word_idx
335 |
336 | labels.append(label_ids)
337 | tokenized_inputs["labels"] = labels
338 |
339 | return tokenized_inputs
340 |
341 | with training_args.main_process_first(desc="dataset map pre-processing"):
342 | raw_datasets = raw_datasets.map(
343 | tokenize_and_align_labels,
344 | batched=True,
345 | load_from_cache_file=not data_args.overwrite_cache,
346 | desc="Running tokenizer on dataset",
347 | )
348 | if training_args.do_train:
349 | if "train" not in raw_datasets:
350 | raise ValueError("--do_train requires a train dataset")
351 | train_dataset = raw_datasets["train"]
352 | if data_args.max_train_samples is not None:
353 | train_dataset = train_dataset.select(range(data_args.max_train_samples))
354 |
355 | if training_args.do_eval:
356 | if "validation" not in raw_datasets:
357 | raise ValueError("--do_eval requires a validation dataset")
358 | eval_dataset = raw_datasets["validation"]
359 | if data_args.max_eval_samples is not None:
360 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
361 |
362 | if training_args.do_predict or data_args.test_file is not None:
363 | if "test" not in raw_datasets:
364 | raise ValueError("--do_predict requires a test dataset")
365 | predict_dataset = raw_datasets["test"]
366 | if data_args.max_predict_samples is not None:
367 | predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
368 |
369 | # Log a few random samples from the training set:
370 | if training_args.do_train:
371 | for index in random.sample(range(len(train_dataset)), 3):
372 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
373 |
374 |
375 | def compute_metrics(p: EvalPrediction):
376 | predictions, labels = p
377 | predictions = np.argmax(predictions, axis=2)
378 |
379 | true_predictions = [
380 |
[label_list[p] for (p, l) in zip(prediction, label) if l != -100] 381 | for prediction, label in zip(predictions, labels) 382 | ] 383 | true_labels = [ 384 | [label_list[l] for (p, l) in zip(prediction, label) if l != -100] 385 | for prediction, label in zip(predictions, labels) 386 | ] 387 | 388 | report = classification_report( 389 | y_true=true_labels, 390 | y_pred=true_predictions, 391 | output_dict=True 392 | ) 393 | 394 | scores = { 395 | type_name: { 396 | "precision": score["precision"], 397 | "recall": score["recall"], 398 | "f1": score["f1-score"], 399 | "number": score["support"], 400 | } 401 | for type_name, score in report.items() 402 | } 403 | scores["overall_accuracy"] = accuracy_score(y_true=true_labels, y_pred=true_predictions) 404 | 405 | final_results = {} 406 | for key, value in scores.items(): 407 | if isinstance(value, dict): 408 | for n, v in value.items(): 409 | key = key.replace(" ", "_") 410 | n = n.replace(" ", "_") 411 | final_results[f"{key}_{n}"] = v 412 | else: 413 | final_results[key] = value 414 | return final_results 415 | 416 | data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) 417 | 418 | # Initialize our Trainer 419 | trainer = Trainer( 420 | model=model, 421 | args=training_args, 422 | train_dataset=train_dataset if training_args.do_train else None, 423 | eval_dataset=eval_dataset if training_args.do_eval else None, 424 | compute_metrics=compute_metrics, 425 | tokenizer=tokenizer, 426 | data_collator=data_collator, 427 | ) 428 | 429 | # Training 430 | if training_args.do_train: 431 | checkpoint = None 432 | if training_args.resume_from_checkpoint is not None: 433 | checkpoint = training_args.resume_from_checkpoint 434 | elif last_checkpoint is not None: 435 | checkpoint = last_checkpoint 436 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 437 | metrics = train_result.metrics 438 | max_train_samples = ( 439 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 440 | ) 441 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 442 | 443 | trainer.save_model() 444 | 445 | trainer.log_metrics("train", metrics) 446 | trainer.save_metrics("train", metrics) 447 | trainer.save_state() 448 | 449 | # Evaluation 450 | if training_args.do_eval: 451 | logger.info("*** Evaluate ***") 452 | 453 | metrics = trainer.evaluate(eval_dataset=eval_dataset) 454 | 455 | max_eval_samples = ( 456 | data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 457 | ) 458 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 459 | 460 | trainer.log_metrics("eval", metrics) 461 | trainer.save_metrics("eval", metrics) 462 | 463 | if training_args.do_predict: 464 | logger.info("*** Predict ***") 465 | 466 | predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict") 467 | predictions = np.argmax(predictions, axis=2) 468 | 469 | # Remove ignored index (special tokens) 470 | true_predictions = [ 471 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 472 | for prediction, label in zip(predictions, labels) 473 | ] 474 | 475 | trainer.log_metrics("predict", metrics) 476 | trainer.save_metrics("predict", metrics) 477 | 478 | # Save predictions 479 | output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt") 480 | if trainer.is_world_process_zero(): 481 | with open(output_predictions_file, "w") as writer: 482 | for prediction in 
true_predictions: 483 | writer.write(" ".join(prediction) + "\n") 484 | 485 | 486 | 487 | def _mp_fn(index): 488 | # For xla_spawn (TPUs) 489 | main() 490 | 491 | 492 | if __name__ == "__main__": 493 | main() 494 | -------------------------------------------------------------------------------- /question_answering/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import datasets 4 | from typing import Optional 5 | from datasets.io.abc import AbstractDatasetReader 6 | from datasets.utils.typing import NestedDataStructureLike, PathLike 7 | from datasets import Features, NamedSplit 8 | from datasets.tasks import QuestionAnsweringExtractive 9 | import collections 10 | import logging 11 | from typing import Optional, Tuple 12 | import numpy as np 13 | from tqdm.auto import tqdm 14 | from transformers import Trainer, is_torch_tpu_available 15 | from transformers.trainer_utils import PredictionOutput 16 | 17 | if is_torch_tpu_available(): 18 | import torch_xla.core.xla_model as xm 19 | import torch_xla.debug.metrics as met 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | 26 | class QADatasetBuilder(datasets.GeneratorBasedBuilder): 27 | 28 | def _info(self): 29 | return datasets.DatasetInfo( 30 | features=datasets.Features( 31 | { 32 | "id": datasets.Value("string"), 33 | "title": datasets.Value("string"), 34 | "context": datasets.Value("string"), 35 | "question": datasets.Value("string"), 36 | "answers": datasets.features.Sequence( 37 | { 38 | "text": datasets.Value("string"), 39 | "answer_start": datasets.Value("int32"), 40 | } 41 | ), 42 | } 43 | ), 44 | supervised_keys=None, 45 | task_templates=[ 46 | QuestionAnsweringExtractive( 47 | question_column="question", context_column="context", answers_column="answers" 48 | ) 49 | ], 50 | ) 51 | 52 | def _split_generators(self, dl_manager): 53 | if not self.config.data_files: 54 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") 55 | data_files = dl_manager.download_and_extract(self.config.data_files) 56 | if isinstance(data_files, (str, list, tuple)): 57 | files = data_files 58 | if isinstance(files, str): 59 | files = [files] 60 | return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files})] 61 | splits = [] 62 | for split_name, files in data_files.items(): 63 | if isinstance(files, str): 64 | files = [files] 65 | splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) 66 | return splits 67 | 68 | def _generate_examples(self, files): 69 | for filepath in files: 70 | with open(filepath, encoding="utf-8") as f: 71 | squad = json.load(f) 72 | for example in squad["data"]: 73 | title = example.get("title", "") 74 | for paragraph in example["paragraphs"]: 75 | context = paragraph["context"] 76 | for qa in paragraph["qas"]: 77 | question = qa["question"] 78 | id_ = qa["id"] 79 | 80 | answer_starts = [answer["answer_start"] for answer in qa["answers"]] 81 | answers = [answer["text"] for answer in qa["answers"]] 82 | 83 | yield id_, { 84 | "title": title, 85 | "context": context, 86 | "question": question, 87 | "id": id_, 88 | "answers": { 89 | "answer_start": answer_starts, 90 | "text": answers, 91 | }, 92 | } 93 | 94 | class QADatasetReader(AbstractDatasetReader): 95 | 96 | def __init__( 97 | self, 98 | path_or_paths: NestedDataStructureLike[PathLike], 99 | split: Optional[NamedSplit] = None, 100 | features: Optional[Features] = None, 101 | cache_dir: 
str = None,
102 | keep_in_memory: bool = False,
103 | **kwargs,
104 | ):
105 | super().__init__(
106 | path_or_paths, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
107 | )
108 | path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}
109 | self.builder = QADatasetBuilder(
110 | cache_dir=cache_dir,
111 | data_files=path_or_paths,
112 | **kwargs,
113 | )
114 |
115 | def read(self):
116 | download_config = None
117 | download_mode = None
118 | ignore_verifications = True
119 | try_from_hf_gcs = False
120 | use_auth_token = None
121 | base_path = None
122 |
123 | self.builder.download_and_prepare(
124 | download_config=download_config,
125 | download_mode=download_mode,
126 | ignore_verifications=ignore_verifications,
127 | try_from_hf_gcs=try_from_hf_gcs,
128 | base_path=base_path,
129 | use_auth_token=use_auth_token,
130 | )
131 |
132 | dataset = self.builder.as_dataset(
133 | split=self.split, ignore_verifications=ignore_verifications, in_memory=self.keep_in_memory
134 | )
135 | return dataset
136 |
137 | def find_all_indices(pattern_str, source_str, overlapping=True):
138 | index = source_str.find(pattern_str)
139 | while index != -1:
140 | yield index
141 | index = source_str.find(
142 | pattern_str,
143 | index + (1 if overlapping else len(pattern_str))
144 | )
145 |
146 | def postprocess_qa_predictions(
147 | examples,
148 | features,
149 | predictions: Tuple[np.ndarray, np.ndarray],
150 | allow_null_ans: bool = False,
151 | n_best_size: int = 20,
152 | max_answer_length: int = 30,
153 | null_score_diff_threshold: float = 0.0,
154 | output_dir: Optional[str] = None,
155 | prefix: Optional[str] = None,
156 | log_level: Optional[int] = logging.WARNING,
157 | ):
158 | """
159 | Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
160 | original contexts. This is the base postprocessing function for models that only return start and end logits.
161 |
162 | Args:
163 | examples: The non-preprocessed dataset (see the main script for more information).
164 | features: The processed dataset (see the main script for more information).
165 | predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
166 | The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
167 | first dimension must match the number of elements of :obj:`features`.
168 | allow_null_ans (:obj:`bool`, `optional`, defaults to :obj:`False`):
169 | Whether or not the underlying dataset contains examples with no answers.
170 | n_best_size (:obj:`int`, `optional`, defaults to 20):
171 | The total number of n-best predictions to generate when looking for an answer.
172 | max_answer_length (:obj:`int`, `optional`, defaults to 30):
173 | The maximum length of an answer that can be generated. This is needed because the start and end predictions
174 | are not conditioned on one another.
175 | null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
176 | The threshold used to select the null answer: if the best answer has a score that is less than the score of
177 | the null answer minus this threshold, the null answer is selected for this example (note that the score of
178 | the null answer for an example giving several features is the minimum of the scores for the null answer on
179 | each feature: all features must be aligned on the fact they `want` to predict a null answer).
180 |
181 | Only useful when :obj:`allow_null_ans` is :obj:`True`.
182 | output_dir (:obj:`str`, `optional`):
183 | If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
184 | :obj:`allow_null_ans=True`, the dictionary of the scores differences between best and null
185 | answers, are saved in `output_dir`.
186 | prefix (:obj:`str`, `optional`):
187 | If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
188 | log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
189 | ``logging`` log level (e.g., ``logging.WARNING``)
190 | """
191 | assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
192 | all_start_logits, all_end_logits = predictions
193 |
194 | assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features."
195 |
196 | # Build a map from each example to its corresponding features.
197 | example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
198 | features_per_example = collections.defaultdict(list)
199 | for i, feature in enumerate(features):
200 | features_per_example[example_id_to_index[feature["example_id"]]].append(i)
201 |
202 | # The dictionaries we have to fill.
203 | all_predictions = collections.OrderedDict()
204 | all_nbest_json = collections.OrderedDict()
205 | if allow_null_ans:
206 | scores_diff_json = collections.OrderedDict()
207 |
208 | # Logging.
209 | logger.setLevel(log_level)
210 | logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
211 |
212 | # Let's loop over all the examples!
213 | for example_index, example in enumerate(tqdm(examples)):
214 | # Those are the indices of the features associated to the current example.
215 | feature_indices = features_per_example[example_index]
216 |
217 | min_null_prediction = None
218 | prelim_predictions = []
219 |
220 | # Looping through all the features associated to the current example.
221 | for feature_index in feature_indices:
222 | # We grab the predictions of the model for this feature.
223 | start_logits = all_start_logits[feature_index]
224 | end_logits = all_end_logits[feature_index]
225 | # This is what will allow us to map some of the positions in our logits to spans of text in the original
226 | # context.
227 | offset_mapping = features[feature_index]["offset_mapping"]
228 | # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
229 | # available in the current feature.
230 | token_is_max_context = features[feature_index].get("token_is_max_context", None)
231 |
232 | # Update minimum null prediction.
233 | feature_null_score = start_logits[0] + end_logits[0]
234 | if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
235 | min_null_prediction = {
236 | "offsets": (0, 0),
237 | "score": feature_null_score,
238 | "start_logit": start_logits[0],
239 | "end_logit": end_logits[0],
240 | }
241 |
242 | # Go through all possibilities for the `n_best_size` greatest start and end logits.
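# Note: np.argsort sorts ascending, so the reversed slice [-1 : -n_best_size - 1 : -1] below
# yields the indices of the `n_best_size` largest logits, best first.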
243 | start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
244 | end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
245 | for start_index in start_indexes:
246 | for end_index in end_indexes:
247 | # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
248 | # to part of the input_ids that are not in the context.
249 | if (
250 | start_index >= len(offset_mapping)
251 | or end_index >= len(offset_mapping)
252 | or offset_mapping[start_index] is None
253 | or offset_mapping[end_index] is None
254 | ):
255 | continue
256 | # Don't consider answers with a length that is either < 0 or > max_answer_length.
257 | if end_index < start_index or end_index - start_index + 1 > max_answer_length:
258 | continue
259 | # Don't consider answers that don't have the maximum context available (if such information is
260 | # provided).
261 | if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
262 | continue
263 | prelim_predictions.append(
264 | {
265 | "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
266 | "score": start_logits[start_index] + end_logits[end_index],
267 | "start_logit": start_logits[start_index],
268 | "end_logit": end_logits[end_index],
269 | }
270 | )
271 | if allow_null_ans:
272 | # Add the minimum null prediction
273 | prelim_predictions.append(min_null_prediction)
274 | null_score = min_null_prediction["score"]
275 |
276 | # Only keep the best `n_best_size` predictions.
277 | predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
278 |
279 | # Add back the minimum null prediction if it was removed because of its low score.
280 | if allow_null_ans and not any(p["offsets"] == (0, 0) for p in predictions):
281 | predictions.append(min_null_prediction)
282 |
283 | # Use the offsets to gather the answer text in the original context.
284 | context = example["context"]
285 | for pred in predictions:
286 | offsets = pred.pop("offsets")
287 | pred["text"] = context[offsets[0] : offsets[1]]
288 |
289 | # In the very rare edge case where we do not have a single non-null prediction, we create a fake prediction to avoid
290 | # failure.
291 | if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
292 | predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
293 |
294 | # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
295 | # the LogSumExp trick).
296 | scores = np.array([pred.pop("score") for pred in predictions])
297 | exp_scores = np.exp(scores - np.max(scores))
298 | probs = exp_scores / exp_scores.sum()
299 |
300 | # Include the probabilities in our predictions.
301 | for prob, pred in zip(probs, predictions):
302 | pred["probability"] = prob
303 |
304 | # Pick the best prediction. If the null answer is not possible, this is easy.
305 | if not allow_null_ans:
306 | all_predictions[example["id"]] = predictions[0]["text"]
307 | else:
308 | # Otherwise we first need to find the best non-empty prediction.
309 | i = 0
310 | while predictions[i]["text"] == "":
311 | i += 1
312 | best_non_null_pred = predictions[i]
313 |
314 | # Then we compare to the null prediction using the threshold.
315 | score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
316 | scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable.
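# A score_diff above the threshold means the null answer outscored the best span by a wide
# enough margin, so we output an empty string for this example.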
317 | if score_diff > null_score_diff_threshold: 318 | all_predictions[example["id"]] = "" 319 | else: 320 | all_predictions[example["id"]] = best_non_null_pred["text"] 321 | 322 | # Make `predictions` JSON-serializable by casting np.float back to float. 323 | all_nbest_json[example["id"]] = [ 324 | {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} 325 | for pred in predictions 326 | ] 327 | 328 | # If we have an output_dir, let's save all those dicts. 329 | if output_dir is not None: 330 | assert os.path.isdir(output_dir), f"{output_dir} is not a directory." 331 | 332 | prediction_file = os.path.join( 333 | output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" 334 | ) 335 | nbest_file = os.path.join( 336 | output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" 337 | ) 338 | if allow_null_ans: 339 | null_odds_file = os.path.join( 340 | output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" 341 | ) 342 | 343 | logger.info(f"Saving predictions to {prediction_file}.") 344 | with open(prediction_file, "w") as writer: 345 | writer.write(json.dumps(all_predictions, ensure_ascii=False, indent=4) + "\n") 346 | logger.info(f"Saving nbest_preds to {nbest_file}.") 347 | with open(nbest_file, "w") as writer: 348 | writer.write(json.dumps(all_nbest_json, ensure_ascii=False, indent=4) + "\n") 349 | if allow_null_ans: 350 | logger.info(f"Saving null_odds to {null_odds_file}.") 351 | with open(null_odds_file, "w") as writer: 352 | writer.write(json.dumps(scores_diff_json, ensure_ascii=False, indent=4) + "\n") 353 | 354 | return all_predictions 355 | 356 | 357 | class QuestionAnsweringTrainer(Trainer): 358 | def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): 359 | super().__init__(*args, **kwargs) 360 | self.eval_examples = eval_examples 361 | self.post_process_function = post_process_function 362 | 363 | def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): 364 | eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset 365 | eval_dataloader = self.get_eval_dataloader(eval_dataset) 366 | eval_examples = self.eval_examples if eval_examples is None else eval_examples 367 | 368 | # Temporarily disable metric computation, we will do it in the loop here. 
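# (The raw start/end logits must first be post-processed into answer strings, so metrics are
# computed after the evaluation loop returns rather than inside it.)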
369 | compute_metrics = self.compute_metrics 370 | self.compute_metrics = None 371 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 372 | try: 373 | output = eval_loop( 374 | eval_dataloader, 375 | description="Evaluation", 376 | # No point gathering the predictions if there are no metrics, otherwise we defer to 377 | # self.args.prediction_loss_only 378 | prediction_loss_only=True if compute_metrics is None else None, 379 | ignore_keys=ignore_keys, 380 | ) 381 | finally: 382 | self.compute_metrics = compute_metrics 383 | 384 | if self.post_process_function is not None and self.compute_metrics is not None: 385 | eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) 386 | metrics = self.compute_metrics(eval_preds) 387 | 388 | # Prefix all keys with metric_key_prefix + '_' 389 | for key in list(metrics.keys()): 390 | if not key.startswith(f"{metric_key_prefix}_"): 391 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 392 | 393 | self.log(metrics) 394 | else: 395 | metrics = {} 396 | 397 | if self.args.tpu_metrics_debug or self.args.debug: 398 | # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 399 | xm.master_print(met.metrics_report()) 400 | 401 | self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) 402 | return metrics 403 | 404 | def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): 405 | predict_dataloader = self.get_test_dataloader(predict_dataset) 406 | 407 | # Temporarily disable metric computation, we will do it in the loop here. 408 | compute_metrics = self.compute_metrics 409 | self.compute_metrics = None 410 | eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop 411 | try: 412 | output = eval_loop( 413 | predict_dataloader, 414 | description="Prediction", 415 | # No point gathering the predictions if there are no metrics, otherwise we defer to 416 | # self.args.prediction_loss_only 417 | prediction_loss_only=True if compute_metrics is None else None, 418 | ignore_keys=ignore_keys, 419 | ) 420 | finally: 421 | self.compute_metrics = compute_metrics 422 | 423 | if self.post_process_function is None or self.compute_metrics is None: 424 | return output 425 | 426 | predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") 427 | metrics = self.compute_metrics(predictions) 428 | 429 | # Prefix all keys with metric_key_prefix + '_' 430 | for key in list(metrics.keys()): 431 | if not key.startswith(f"{metric_key_prefix}_"): 432 | metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) 433 | 434 | return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) 435 | -------------------------------------------------------------------------------- /question_answering/question_answering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import glob 4 | import random 5 | import sys 6 | from dataclasses import dataclass, field 7 | from typing import Optional 8 | 9 | import datasets 10 | from datasets import load_dataset, load_metric 11 | 12 | import transformers 13 | from transformers import ( 14 | AutoConfig, 15 | AutoModelForQuestionAnswering, 16 | AutoTokenizer, 17 | DataCollatorWithPadding, 18 | EvalPrediction, 19 | HfArgumentParser, 20 | 
PreTrainedTokenizerFast, 21 | Trainer, 22 | TrainingArguments, 23 | default_data_collator, 24 | set_seed, 25 | ) 26 | from transformers.trainer_utils import get_last_checkpoint 27 | from transformers.utils import check_min_version 28 | from transformers.utils.versions import require_version 29 | from normalizer import normalize 30 | from utils import ( 31 | QADatasetReader, 32 | find_all_indices, 33 | postprocess_qa_predictions, 34 | QuestionAnsweringTrainer 35 | ) 36 | 37 | EXT2CONFIG = { 38 | "json": (QADatasetReader, {}) 39 | } 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | 44 | @dataclass 45 | class DataTrainingArguments: 46 | 47 | dataset_dir: Optional[str] = field( 48 | default=None, metadata={ 49 | "help": "Path to the directory containing the data files (.json). " 50 | "Dataset splits will be identified by their file name prefixes as follows: " 51 | "`train`- Training file(s) e.g. `train.json`/ `train_part1.json` etc. " 52 | "`validation`- Evaluation file(s) e.g. `validation.json`/ `validation_part1.json` etc. " 53 | "`test`- Test file(s) e.g. `test.json`/ `test_part1.json` etc. " 54 | "All files must have the same extension." 55 | } 56 | ) 57 | 58 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a json file)."}) 59 | validation_file: Optional[str] = field( 60 | default=None, 61 | metadata={"help": "An optional input evaluation data file to evaluate the model on (a json file)."}, 62 | ) 63 | test_file: Optional[str] = field( 64 | default=None, 65 | metadata={"help": "An optional input test data file to evaluate the model on (a json file)."}, 66 | ) 67 | overwrite_cache: bool = field( 68 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 69 | ) 70 | preprocessing_num_workers: Optional[int] = field( 71 | default=None, 72 | metadata={"help": "The number of processes to use for the preprocessing."}, 73 | ) 74 | max_seq_length: int = field( 75 | default=384, 76 | metadata={ 77 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 78 | "than this will be truncated, sequences shorter will be padded." 79 | }, 80 | ) 81 | pad_to_max_length: bool = field( 82 | default=True, 83 | metadata={ 84 | "help": "Whether to pad all samples to `max_seq_length`. " 85 | "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " 86 | "be faster on GPU but will be slower on TPU)." 87 | }, 88 | ) 89 | max_train_samples: Optional[int] = field( 90 | default=None, 91 | metadata={ 92 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 93 | "value if set." 94 | }, 95 | ) 96 | max_eval_samples: Optional[int] = field( 97 | default=None, 98 | metadata={ 99 | "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 100 | "value if set." 101 | }, 102 | ) 103 | max_predict_samples: Optional[int] = field( 104 | default=None, 105 | metadata={ 106 | "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " 107 | "value if set."
108 | }, 109 | ) 110 | allow_null_ans: bool = field( 111 | default=False, metadata={"help": "If true, some of the examples do not have an answer."} 112 | ) 113 | null_score_diff_threshold: float = field( 114 | default=0.0, 115 | metadata={ 116 | "help": "The threshold used to select the null answer: if the best answer has a score that is less than " 117 | "the score of the null answer minus this threshold, the null answer is selected for this example. " 118 | "Only useful when `allow_null_ans=True`." 119 | }, 120 | ) 121 | doc_stride: int = field( 122 | default=128, 123 | metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, 124 | ) 125 | n_best_size: int = field( 126 | default=20, 127 | metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, 128 | ) 129 | max_answer_length: int = field( 130 | default=30, 131 | metadata={ 132 | "help": "The maximum length of an answer that can be generated. This is needed because the start " 133 | "and end predictions are not conditioned on one another." 134 | }, 135 | ) 136 | do_normalize: Optional[bool] = field(default=True, metadata={"help": "Normalize text before feeding to the model."}) 137 | unicode_norm: Optional[str] = field(default="NFKC", metadata={"help": "Type of unicode normalization"}) 138 | 139 | 140 | def __post_init__(self): 141 | if self.train_file is not None and self.validation_file is not None: 142 | train_extension = self.train_file.split(".")[-1] 143 | assert train_extension == "json", "`train_file` should be a json file (the only extension registered in `EXT2CONFIG`)." 144 | validation_extension = self.validation_file.split(".")[-1] 145 | assert ( 146 | validation_extension == train_extension 147 | ), "`validation_file` should have the same extension (json) as `train_file`."
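# Illustrative example of the `dataset_dir` convention documented above (file names are hypothetical):
# a directory containing train.json, train_part1.json, validation.json and test.json is resolved by the
# discovery logic in main() to
#     {"train": ["train.json", "train_part1.json"], "validation": ["validation.json"], "test": ["test.json"]},
# with the most frequent extension in the directory ("json") applied to every split.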
148 | 149 | 150 | @dataclass 151 | class ModelArguments: 152 | 153 | model_name_or_path: str = field( 154 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 155 | ) 156 | cache_dir: Optional[str] = field( 157 | default=None, 158 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 159 | ) 160 | 161 | 162 | def main(): 163 | 164 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 165 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 166 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 167 | else: 168 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 169 | 170 | # Setup logging 171 | logging.basicConfig( 172 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 173 | datefmt="%m/%d/%Y %H:%M:%S", 174 | handlers=[logging.StreamHandler(sys.stdout)], 175 | ) 176 | 177 | log_level = training_args.get_process_log_level() 178 | logger.setLevel(log_level) 179 | datasets.utils.logging.set_verbosity(log_level) 180 | transformers.utils.logging.set_verbosity(log_level) 181 | transformers.utils.logging.enable_default_handler() 182 | transformers.utils.logging.enable_explicit_format() 183 | 184 | logger.warning( 185 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " 186 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 187 | ) 188 | logger.info(f"Training/evaluation parameters {training_args}") 189 | 190 | last_checkpoint = None 191 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 192 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 193 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 194 | raise ValueError( 195 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 196 | "Use --overwrite_output_dir to overcome this." 197 | ) 198 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 199 | logger.info( 200 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 201 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 202 | ) 203 | 204 | # Set seed before initializing model.
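# (set_seed seeds Python's `random`, NumPy and PyTorch in one call; exact run-to-run reproducibility can
# still depend on non-deterministic CUDA kernels and dataloader worker scheduling.)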
205 | set_seed(training_args.seed) 206 | 207 | has_ext = lambda path: len(os.path.basename(path).split(".")) > 1 208 | get_ext = lambda path: os.path.basename(path).split(".")[-1] 209 | 210 | if data_args.dataset_dir is not None: 211 | data_files = {} 212 | all_files = glob.glob( 213 | os.path.join( 214 | data_args.dataset_dir, 215 | "*" 216 | ) 217 | ) 218 | all_exts = [get_ext(k) for k in all_files if has_ext(k)] 219 | if not all_exts: 220 | raise ValueError("The `dataset_dir` doesn't have any valid file.") 221 | 222 | selected_ext = max(set(all_exts), key=all_exts.count) 223 | for search_prefix in ["train", "validation", "test"]: 224 | found_files = glob.glob( 225 | os.path.join( 226 | data_args.dataset_dir, 227 | search_prefix + "*" + selected_ext 228 | ) 229 | ) 230 | if not found_files: 231 | continue 232 | 233 | data_files[search_prefix] = found_files 234 | 235 | else: 236 | data_files = { 237 | "train": data_args.train_file, 238 | "validation": data_args.validation_file, 239 | "test": data_args.test_file 240 | } 241 | 242 | data_files = {k: v for k, v in data_files.items() if v is not None} 243 | 244 | if not data_files: 245 | raise ValueError("No valid input file found.") 246 | 247 | selected_ext = get_ext(list(data_files.values())[0]) 248 | 249 | 250 | dataset_configs = EXT2CONFIG[selected_ext] 251 | raw_datasets = dataset_configs[0]( 252 | data_files, 253 | **dataset_configs[1] 254 | ).read() 255 | 256 | config = AutoConfig.from_pretrained( 257 | model_args.model_name_or_path, 258 | cache_dir=model_args.cache_dir 259 | ) 260 | 261 | tokenizer_kwargs = {"add_prefix_space": True} if config.model_type in {"gpt2", "roberta"} else {} 262 | tokenizer = AutoTokenizer.from_pretrained( 263 | model_args.model_name_or_path, 264 | cache_dir=model_args.cache_dir, 265 | use_fast=True, 266 | **tokenizer_kwargs 267 | ) 268 | model = AutoModelForQuestionAnswering.from_pretrained( 269 | model_args.model_name_or_path, 270 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 271 | config=config, 272 | cache_dir=model_args.cache_dir 273 | ) 274 | 275 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 276 | raise ValueError( 277 | "This script only works for models that have a fast tokenizer. Check out the big table of models "
278 | "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " 279 | "requirement" 280 | ) 281 | 282 | if training_args.do_train: 283 | column_names = raw_datasets["train"].column_names 284 | elif training_args.do_eval: 285 | column_names = raw_datasets["validation"].column_names 286 | else: 287 | column_names = raw_datasets["test"].column_names 288 | 289 | question_column_name = "question" if "question" in column_names else column_names[0] 290 | context_column_name = "context" if "context" in column_names else column_names[1] 291 | answer_column_name = "answers" if "answers" in column_names else column_names[2] 292 | 293 | 294 | if data_args.do_normalize: 295 | normalization_kwargs = { 296 | "unicode_norm": data_args.unicode_norm, 297 | } 298 | required_column_names = [ 299 | question_column_name, 300 | context_column_name, 301 | answer_column_name 302 | ] 303 | 304 | def normalize_example(example): 305 | required_row_values = [example[k] for k in required_column_names if k in example] 306 | question, context = required_row_values[:2] 307 | example[question_column_name] = normalize(question, **normalization_kwargs) 308 | example[context_column_name] = normalize(context, **normalization_kwargs) 309 | 310 | if len(required_row_values) == 3: 311 | answer = required_row_values[2] 312 | for i, ans in enumerate(answer["text"]): 313 | prev_position = answer["answer_start"][i] 314 | answer["text"][i] = normalize(ans, **normalization_kwargs) 315 | 316 | replace_index = -1 317 | for j, pos in enumerate(find_all_indices(ans, context)): 318 | replace_index = j 319 | if pos == prev_position: 320 | break 321 | 322 | if replace_index != -1: 323 | index_iterator = find_all_indices( 324 | answer["text"][i], 325 | example[context_column_name] 326 | ) 327 | for j, pos in enumerate(index_iterator): 328 | if j == replace_index: 329 | answer["answer_start"][i] = pos 330 | assert answer["text"][i] == example[context_column_name][pos: pos + len(answer["text"][i])] 331 | break 332 | 333 | example[answer_column_name] = answer 334 | 335 | return example 336 | 337 | raw_datasets = raw_datasets.map( 338 | normalize_example, 339 | desc="Running normalization on dataset", 340 | load_from_cache_file=not data_args.overwrite_cache 341 | ) 342 | 343 | pad_on_right = tokenizer.padding_side == "right" 344 | 345 | if data_args.max_seq_length > tokenizer.model_max_length: 346 | logger.warning( 347 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the " 348 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
349 | ) 350 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 351 | 352 | 353 | 354 | def prepare_train_features(examples): 355 | tokenized_examples = tokenizer( 356 | examples[question_column_name if pad_on_right else context_column_name], 357 | examples[context_column_name if pad_on_right else question_column_name], 358 | truncation="only_second" if pad_on_right else "only_first", 359 | max_length=max_seq_length, 360 | stride=data_args.doc_stride, 361 | return_overflowing_tokens=True, 362 | return_offsets_mapping=True, 363 | padding="max_length" if data_args.pad_to_max_length else False, 364 | ) 365 | 366 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 367 | offset_mapping = tokenized_examples.pop("offset_mapping") 368 | 369 | tokenized_examples["start_positions"] = [] 370 | tokenized_examples["end_positions"] = [] 371 | 372 | for i, offsets in enumerate(offset_mapping): 373 | input_ids = tokenized_examples["input_ids"][i] 374 | cls_index = input_ids.index(tokenizer.cls_token_id) 375 | sequence_ids = tokenized_examples.sequence_ids(i) 376 | 377 | sample_index = sample_mapping[i] 378 | answers = examples[answer_column_name][sample_index] 379 | if len(answers["answer_start"]) == 0: 380 | tokenized_examples["start_positions"].append(cls_index) 381 | tokenized_examples["end_positions"].append(cls_index) 382 | else: 383 | start_char = answers["answer_start"][0] 384 | end_char = start_char + len(answers["text"][0]) 385 | 386 | token_start_index = 0 387 | while sequence_ids[token_start_index] != (1 if pad_on_right else 0): 388 | token_start_index += 1 389 | 390 | token_end_index = len(input_ids) - 1 391 | while sequence_ids[token_end_index] != (1 if pad_on_right else 0): 392 | token_end_index -= 1 393 | 394 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 395 | tokenized_examples["start_positions"].append(cls_index) 396 | tokenized_examples["end_positions"].append(cls_index) 397 | else: 398 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 399 | token_start_index += 1 400 | tokenized_examples["start_positions"].append(token_start_index - 1) 401 | while offsets[token_end_index][1] >= end_char: 402 | token_end_index -= 1 403 | tokenized_examples["end_positions"].append(token_end_index + 1) 404 | 405 | return tokenized_examples 406 | 407 | if training_args.do_train: 408 | if "train" not in raw_datasets: 409 | raise ValueError("--do_train requires a train dataset") 410 | train_dataset = raw_datasets["train"] 411 | if data_args.max_train_samples is not None: 412 | answerable_indices = [i for i, data in enumerate(train_dataset) 413 | if data['answers']['text']] 414 | unanswerable_indices = [i for i, data in enumerate(train_dataset) 415 | if not data['answers']['text']] 416 | 417 | if ( 418 | len(answerable_indices) >= data_args.max_train_samples // 2 and 419 | len(unanswerable_indices) >= data_args.max_train_samples // 2 420 | ): 421 | selected_answerable_indices = answerable_indices[:data_args.max_train_samples // 2] 422 | selected_unanswerable_indices = unanswerable_indices[:data_args.max_train_samples - len(selected_answerable_indices)] 423 | train_dataset = train_dataset.select( 424 | selected_answerable_indices + 425 | selected_unanswerable_indices 426 | ) 427 | else: 428 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 429 | 430 | with training_args.main_process_first(desc="train dataset map pre-processing"): 431 | train_dataset 
= train_dataset.map( 432 | prepare_train_features, 433 | batched=True, 434 | num_proc=data_args.preprocessing_num_workers, 435 | remove_columns=column_names, 436 | load_from_cache_file=not data_args.overwrite_cache, 437 | desc="Running tokenizer on train dataset", 438 | ) 439 | if data_args.max_train_samples is not None: 440 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 441 | 442 | def prepare_validation_features(examples): 443 | 444 | tokenized_examples = tokenizer( 445 | examples[question_column_name if pad_on_right else context_column_name], 446 | examples[context_column_name if pad_on_right else question_column_name], 447 | truncation="only_second" if pad_on_right else "only_first", 448 | max_length=max_seq_length, 449 | stride=data_args.doc_stride, 450 | return_overflowing_tokens=True, 451 | return_offsets_mapping=True, 452 | padding="max_length" if data_args.pad_to_max_length else False, 453 | ) 454 | 455 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 456 | 457 | tokenized_examples["example_id"] = [] 458 | 459 | for i in range(len(tokenized_examples["input_ids"])): 460 | sequence_ids = tokenized_examples.sequence_ids(i) 461 | context_index = 1 if pad_on_right else 0 462 | 463 | sample_index = sample_mapping[i] 464 | tokenized_examples["example_id"].append(examples["id"][sample_index]) 465 | 466 | tokenized_examples["offset_mapping"][i] = [ 467 | (o if sequence_ids[k] == context_index else None) 468 | for k, o in enumerate(tokenized_examples["offset_mapping"][i]) 469 | ] 470 | 471 | return tokenized_examples 472 | 473 | if training_args.do_eval: 474 | if "validation" not in raw_datasets: 475 | raise ValueError("--do_eval requires a validation dataset") 476 | eval_examples = raw_datasets["validation"] 477 | if data_args.max_eval_samples is not None: 478 | eval_examples = eval_examples.select(range(data_args.max_eval_samples)) 479 | 480 | with training_args.main_process_first(desc="validation dataset map pre-processing"): 481 | eval_dataset = eval_examples.map( 482 | prepare_validation_features, 483 | batched=True, 484 | num_proc=data_args.preprocessing_num_workers, 485 | remove_columns=column_names, 486 | load_from_cache_file=not data_args.overwrite_cache, 487 | desc="Running tokenizer on validation dataset", 488 | ) 489 | if data_args.max_eval_samples is not None: 490 | # During Feature creation dataset samples might increase, we will select required samples again 491 | eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) 492 | 493 | if training_args.do_predict: 494 | if "test" not in raw_datasets: 495 | raise ValueError("--do_predict requires a test dataset") 496 | predict_examples = raw_datasets["test"] 497 | if data_args.max_predict_samples is not None: 498 | predict_examples = predict_examples.select(range(data_args.max_predict_samples)) 499 | 500 | with training_args.main_process_first(desc="prediction dataset map pre-processing"): 501 | predict_dataset = predict_examples.map( 502 | prepare_validation_features, 503 | batched=True, 504 | num_proc=data_args.preprocessing_num_workers, 505 | remove_columns=column_names, 506 | load_from_cache_file=not data_args.overwrite_cache, 507 | desc="Running tokenizer on prediction dataset", 508 | ) 509 | if data_args.max_predict_samples is not None: 510 | predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) 511 | 512 | data_collator = ( 513 | default_data_collator 514 | if data_args.pad_to_max_length 515 | else DataCollatorWithPadding(tokenizer, 
pad_to_multiple_of=8 if training_args.fp16 else None) 516 | ) 517 | 518 | # Log a few random samples from the training set: 519 | if training_args.do_train: 520 | for index in random.sample(range(len(train_dataset)), 3): 521 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 522 | 523 | def post_processing_function(examples, features, predictions, stage="eval"): 524 | predictions = postprocess_qa_predictions( 525 | examples=examples, 526 | features=features, 527 | predictions=predictions, 528 | allow_null_ans=data_args.allow_null_ans, 529 | n_best_size=data_args.n_best_size, 530 | max_answer_length=data_args.max_answer_length, 531 | null_score_diff_threshold=data_args.null_score_diff_threshold, 532 | output_dir=training_args.output_dir, 533 | log_level=log_level, 534 | prefix=stage, 535 | ) 536 | 537 | if data_args.allow_null_ans: 538 | formatted_predictions = [ 539 | {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() 540 | ] 541 | else: 542 | formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] 543 | 544 | references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] 545 | return EvalPrediction(predictions=formatted_predictions, label_ids=references) 546 | 547 | metric = load_metric("squad_v2" if data_args.allow_null_ans else "squad") 548 | 549 | def compute_metrics(p: EvalPrediction): 550 | return metric.compute(predictions=p.predictions, references=p.label_ids) 551 | 552 | # Initialize our Trainer 553 | trainer = QuestionAnsweringTrainer( 554 | model=model, 555 | args=training_args, 556 | train_dataset=train_dataset if training_args.do_train else None, 557 | eval_dataset=eval_dataset if training_args.do_eval else None, 558 | eval_examples=eval_examples if training_args.do_eval else None, 559 | tokenizer=tokenizer, 560 | data_collator=data_collator, 561 | post_process_function=post_processing_function, 562 | compute_metrics=compute_metrics, 563 | ) 564 | 565 | # Training 566 | if training_args.do_train: 567 | checkpoint = None 568 | if training_args.resume_from_checkpoint is not None: 569 | checkpoint = training_args.resume_from_checkpoint 570 | elif last_checkpoint is not None: 571 | checkpoint = last_checkpoint 572 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 573 | trainer.save_model() # Saves the tokenizer too for easy upload 574 | 575 | metrics = train_result.metrics 576 | max_train_samples = ( 577 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 578 | ) 579 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 580 | 581 | trainer.log_metrics("train", metrics) 582 | trainer.save_metrics("train", metrics) 583 | trainer.save_state() 584 | 585 | # Evaluation 586 | if training_args.do_eval: 587 | logger.info("*** Evaluate ***") 588 | metrics = trainer.evaluate() 589 | 590 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 591 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 592 | 593 | trainer.log_metrics("eval", metrics) 594 | trainer.save_metrics("eval", metrics) 595 | 596 | # Prediction 597 | if training_args.do_predict: 598 | logger.info("*** Predict ***") 599 | results = trainer.predict(predict_dataset, predict_examples) 600 | metrics = results.metrics 601 | 602 | max_predict_samples = ( 603 | data_args.max_predict_samples if data_args.max_predict_samples is not None else 
len(predict_dataset) 604 | ) 605 | metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) 606 | 607 | trainer.log_metrics("predict", metrics) 608 | trainer.save_metrics("predict", metrics) 609 | 610 | 611 | def _mp_fn(index): 612 | # For xla_spawn (TPUs) 613 | main() 614 | 615 | 616 | if __name__ == "__main__": 617 | main() 618 | --------------------------------------------------------------------------------
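A minimal invocation sketch for `question_answering.py`. The model identifier, paths, and flag values below are illustrative assumptions, not the repository's `trainer.sh` or `evaluate.sh`; each flag maps to a field of `ModelArguments`, `DataTrainingArguments`, or `transformers.TrainingArguments` defined above.

#!/bin/bash
# Hypothetical example run (paths and values are placeholders).
python ./question_answering/question_answering.py \
    --model_name_or_path csebuetnlp/banglabert \
    --dataset_dir ./data/ \
    --output_dir ./outputs/ \
    --max_seq_length 384 \
    --doc_stride 128 \
    --allow_null_ans \
    --do_train --do_eval --do_predict \
    --overwrite_output_dir

Alternatively, passing a single `.json` file as the only command-line argument makes `HfArgumentParser` read every argument from that file (the `len(sys.argv) == 2` branch in `main()`).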