├── images ├── coliee3.drawio.png ├── length_fullA_hist.png └── length_train_A_hist.png ├── requirements.txt ├── scripts ├── run_predict.sh ├── run_train.sh ├── run_finetune_bert.sh └── legal_text_retrieval.ipynb ├── Dockerfile ├── LICENSE ├── src ├── stopwords_tfidf_generator.py ├── tfidf_classifier.py ├── custom_rank_bm25.py ├── data_generator.py ├── infer.py ├── run_glue.py └── utils.py ├── .gitignore └── README.md /images/coliee3.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phuongnm94/legal_text_retrieval/HEAD/images/coliee3.drawio.png -------------------------------------------------------------------------------- /images/length_fullA_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phuongnm94/legal_text_retrieval/HEAD/images/length_fullA_hist.png -------------------------------------------------------------------------------- /images/length_train_A_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phuongnm94/legal_text_retrieval/HEAD/images/length_train_A_hist.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | nltk 3 | gensim 4 | pandas 5 | fugashi 6 | mecab-python3 7 | unidic-lite 8 | transformers==4.3.2 9 | openpyxl 10 | torch 11 | xlwt 12 | datasets 13 | sentencepiece 14 | protobuf 15 | rank_bm25 16 | seaborn 17 | vncorenlp -------------------------------------------------------------------------------- /scripts/run_predict.sh: -------------------------------------------------------------------------------- 1 | 2 | # this command will not run if do not mount private test - and infer on public test 3 | cp /data/private_test_question.json /app/data/zac2021-ltr-data/public_test_question.json 4 | 5 | # this command will run infer the test file and get the output 6 | cd /app && \ 7 | python3 src/infer.py && \ 8 | cp data/result_prediction.json /result/submission.csv 9 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM nvidia/cuda:10.2-base 2 | FROM nvidia/cuda:11.0-base 3 | 4 | 5 | CMD nvidia-smi 6 | 7 | WORKDIR /app 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | 10 | #set up environment 11 | RUN apt update && apt install -y --no-install-recommends \ 12 | tzdata \ 13 | default-jdk \ 14 | git \ 15 | build-essential \ 16 | python3-dev \ 17 | python3-pip \ 18 | python3-setuptools \ 19 | unzip 20 | 21 | RUN pip3 -q install pip --upgrade 22 | 23 | # install python environments 24 | COPY requirements.txt /app/requirements.txt 25 | RUN pip3 install -r /app/requirements.txt 26 | 27 | RUN git clone https://github.com/vncorenlp/VnCoreNLP vncorenlp_data 28 | 29 | #copies the applicaiton from local path to container path 30 | COPY . 
/app 31 | 32 | RUN cd /app/data && unzip zac2021-ltr-data.zip 33 | 34 | CMD ["bash", "/app/scripts/run_train.sh", "&&", "bash", "/app/scripts/run_predict.sh"] -------------------------------------------------------------------------------- /scripts/run_train.sh: -------------------------------------------------------------------------------- 1 | cd /app && mkdir settings 2 | cd /app && mkdir data 3 | USER=root 4 | 5 | # generate data 6 | cd /app && mkdir data/zalo-tfidfbm25150-full 7 | cd /app && python3 src/data_generator.py --path_folder_base data/zac2021-ltr-data/ --test_file public_test_question.json --topk 150 --type_data task3 --tok --path_output_dir data/zalo-tfidfbm25150-full 8 | 9 | 10 | # train phoBERT 11 | cd /app/scripts && bash run_finetune_bert.sh $USER vinai/phobert-base ../ data/zalo-tfidfbm25150-full Tfbm150E5-full 5 12 | 13 | 14 | # generate data 15 | cd /app && mkdir data/zalo-tfidfngrbm25150-notok-full 16 | cd /app && python3 src/data_generator.py --path_folder_base data/zac2021-ltr-data/ --test_file public_test_question.json --topk 150 --type_data task3 --path_output_dir data/zalo-tfidfngrbm25150-notok-full 17 | 18 | # train NLPHust 19 | cd /app/scripts && bash run_finetune_bert.sh $USER NlpHUST/electra-base-vn ../ data/zalo-tfidfngrbm25150-notok-full NlpHTfbmngr150E5-notok-full 5 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nguyễn Minh Phương 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
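For reference, the two-stage pipeline wired together by the `Dockerfile` and `scripts/run_train.sh` above (pair generation with the TF-IDF + BM25 retriever, then fine-tuning) can also be driven from Python. A minimal sketch, assuming the container layout used by those scripts; the arguments simply mirror the PhoBERT branch of `run_train.sh`:

```python
# Illustrative sketch only: replays the PhoBERT branch of scripts/run_train.sh.
import subprocess

def run(cmd, cwd="/app"):
    # Run one pipeline step and stop on the first failure.
    subprocess.run(cmd, cwd=cwd, check=True)

# 1) generate train/dev/test pairs with the TF-IDF + BM25 retriever (top 150 articles)
run(["python3", "src/data_generator.py",
     "--path_folder_base", "data/zac2021-ltr-data/",
     "--test_file", "public_test_question.json",
     "--topk", "150", "--type_data", "task3", "--tok",
     "--path_output_dir", "data/zalo-tfidfbm25150-full"])

# 2) fine-tune PhoBERT on the generated pairs for 5 epochs
run(["bash", "run_finetune_bert.sh", "root", "vinai/phobert-base",
     "../", "data/zalo-tfidfbm25150-full", "Tfbm150E5-full", "5"],
    cwd="/app/scripts")
```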
22 | -------------------------------------------------------------------------------- /scripts/run_finetune_bert.sh: -------------------------------------------------------------------------------- 1 | USER=${1:-"phuongnm"} 2 | MODEL_NAME=${2:-"../settings/phobert-t3042/models/"} # vinai/phobert-base 3 | ROOT_DIR=${3:-"../"} 4 | DATA_DIR=${4:-"data/zalo-tfidf30/"} 5 | SETTING_NAME=${5:-"phobert-t30"} 6 | NUM_EPOCH=${6:-5} 7 | LR=${7:-1e-5} 8 | NUM_LABEL=${8:-2} 9 | MAX_LEN=${9:-512} 10 | 11 | CODE_DIR="${ROOT_DIR}/src/" 12 | DATA_DIR="${ROOT_DIR}/$DATA_DIR" 13 | 14 | SCRIPT_DIR=$(pwd) 15 | 16 | for iSEED in {42..42} 17 | do 18 | SETTING_NAME_SEED=${SETTING_NAME}${iSEED} 19 | SETTING_DIR="${ROOT_DIR}/settings/${SETTING_NAME_SEED}/" 20 | MODEL_OUT="${SETTING_DIR}/models" 21 | mkdir $SETTING_DIR 22 | cp ${SCRIPT_DIR}/run_train.sh $SETTING_DIR 23 | 24 | 25 | cd $CODE_DIR && python3 ./run_glue.py \ 26 | --model_name_or_path $MODEL_NAME \ 27 | --do_train \ 28 | --eval_steps 200 \ 29 | --do_predict \ 30 | --num_label ${NUM_LABEL} \ 31 | --seed ${iSEED} \ 32 | --train_file $DATA_DIR/train.csv \ 33 | --validation_file $DATA_DIR/dev.csv \ 34 | --test_file $DATA_DIR/test.csv \ 35 | --max_seq_length $MAX_LEN \ 36 | --per_device_train_batch_size 16 \ 37 | --learning_rate $LR \ 38 | --warmup_steps 0 \ 39 | --num_train_epochs $NUM_EPOCH \ 40 | --save_total_limit 1 \ 41 | --logging_dir $MODEL_OUT/tensorboard --logging_steps 200 \ 42 | --output_dir $MODEL_OUT --overwrite_output_dir \ 43 | |tee $SETTING_DIR/train.log 44 | done 45 | -------------------------------------------------------------------------------- /src/stopwords_tfidf_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from tfidf_classifier import do_classify 5 | from utils import load_data_kse 6 | 7 | 8 | def do_generate_stopwords(path_folder_base="../coliee3_2020/", threshold=0.02, tokenizer=None, data=None): 9 | c_docs, c_keys, dev_q, test_q, train_q = load_data_kse( 10 | path_folder_base=path_folder_base) if data is None else data 11 | 12 | _, (test_similarities, c_vect, test_q_vect, vectorizer) = do_classify(c_docs, c_keys, test_q, 13 | stopwords_=None, 14 | topk=150, 15 | tokenizer=tokenizer) 16 | # generate stop words 17 | stop_words_idx = [] 18 | for doc_vect in c_vect: 19 | for word_idx in doc_vect.indices: 20 | if doc_vect[0, word_idx] < threshold and word_idx not in stop_words_idx: 21 | stop_words_idx.append(word_idx) 22 | 23 | stop_words = [vectorizer.get_feature_names()[w_idx] 24 | for w_idx in stop_words_idx] 25 | path_folder_data_out = "{}/stopwords/".format(path_folder_base) 26 | if not os.path.exists(path_folder_data_out): 27 | os.mkdir(path_folder_data_out) 28 | json.dump(stop_words, open( 29 | "{}/stopwords.json".format(path_folder_data_out), "wt"), ensure_ascii=False) 30 | 31 | 32 | if __name__ == "__main__": 33 | do_generate_stopwords() 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .history/ 2 | data/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 
30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /src/tfidf_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from tqdm import tqdm 4 | from custom_rank_bm25 import BM25Plus 5 | import numpy as np 6 | 7 | from utils import Article, combine_idxs, evaluate, evaluate_idx, load_data_kse, standardize_data 8 | import pickle, json 9 | 10 | 11 | def do_classify(c_docs_, c_keys_, test_q_, stopwords_=None, topk=150, vectorizer=None, tokenizer=None, combine_score=False, c_vect=None): 12 | # check old system 13 | if vectorizer is not None and isinstance(vectorizer, TfidfVectorizer): 14 | return do_classify_old(c_docs_, c_keys_, test_q_, stopwords_=stopwords_, topk=topk, vectorizer=vectorizer, tokenizer=tokenizer) 15 | 16 | # new system 17 | c_docs_ = [standardize_data(d) for d in c_docs_] 18 | if vectorizer is None: 19 | print("[W] Learning Tfidf Vectorizer ...") 20 | tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords_, tokenizer=tokenizer, ngram_range=(1,2)) 21 | tfidf_vectorizer.fit(c_docs_) 22 | 23 | print("[W] Learning BM25 Vectorizer ...") 24 | bm25_scorer = BM25Plus([d.split(" ") for d in c_docs_]) 25 | 26 | vectorizer = (tfidf_vectorizer, bm25_scorer) 27 | else: 28 | tfidf_vectorizer, bm25_scorer = vectorizer[0], vectorizer[1] 29 | 30 | # get cosin score from tfidf vector 31 | if c_vect is None: 
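# note: when c_vect is None the corpus TF-IDF matrix is rebuilt from scratch; callers that
# score repeatedly against the same corpus (e.g. src/infer.py) pass a precomputed c_vect
# to skip this transform.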
32 | c_vect = tfidf_vectorizer.transform(c_docs_) 33 | test_q_docs = [standardize_data(q.content) for q in test_q_] 34 | test_q_vect = tfidf_vectorizer.transform(test_q_docs) 35 | tfidf_cosine_score = cosine_similarity(test_q_vect, c_vect) 36 | 37 | # get bm25 score 38 | bm25_similarities = [] 39 | for query in tqdm(test_q_docs): 40 | bm25_similarities.append(bm25_scorer.get_scores(query.split(" "))) 41 | bm25_similarities = np.array(bm25_similarities) 42 | 43 | # combine score 44 | if tokenizer is None: 45 | final_score = 0.33*tfidf_cosine_score+ 0.67*bm25_similarities/np.max(bm25_similarities) 46 | preds = evaluate(final_score, test_q_, topk=topk, c_keys=c_keys_) 47 | else: 48 | idx_tfidf = tfidf_cosine_score.argsort()[:, ::-1][:, :topk] 49 | idx_bm25 = bm25_similarities.argsort()[:, ::-1][:, :topk] 50 | preds = combine_idxs(idx_tfidf, idx_bm25, topk) 51 | preds = evaluate_idx(preds, test_q_, c_keys=c_keys_) 52 | 53 | return preds, ((tfidf_cosine_score, bm25_similarities), c_vect, test_q_vect, vectorizer) 54 | 55 | 56 | 57 | def do_classify_old(c_docs_, c_keys_, test_q_, stopwords_=None, topk=150, vectorizer=None, tokenizer=None): 58 | if vectorizer is None: 59 | print("[W] Learning Tfidf Vectorizer ...") 60 | vectorizer = TfidfVectorizer(stop_words=stopwords_, tokenizer=tokenizer) 61 | vectorizer.fit(c_docs_) 62 | c_vect = vectorizer.transform(c_docs_) 63 | 64 | test_q_docs = [q.content for q in test_q_] 65 | test_q_vect = vectorizer.transform(test_q_docs) 66 | test_similarities = cosine_similarity(test_q_vect, c_vect) 67 | test_pred = evaluate(test_similarities, test_q_, topk=topk, c_keys=c_keys_) 68 | return test_pred, (test_similarities, c_vect, test_q_vect, vectorizer) 69 | 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # legal-text-retrieval 2 | ## Overview 3 | This system consists of 2 steps: 4 | - generate training data containing negative samples mined with a mixture score of TF-IDF cosine similarity + BM25 (using the top 150 most similar law articles) 5 | - fine-tune a PhoBERT model (and, optionally, an NlpHUST model) on the generated data 6 | 7 | ![thissys](images/coliee3.drawio.png) 8 | ## Environments 9 | ```bash 10 | git clone https://github.com/vncorenlp/VnCoreNLP.git vncorenlp_data # for the VnCoreNLP tokenizer 11 | 12 | conda create -n legal_retrieval_env python=3.8 13 | conda activate legal_retrieval_env 14 | pip install -r requirements.txt 15 | ``` 16 | ## Run 17 | 18 | 1. Generate data from the folder `data/zac2021-ltr-data/`, which contains `public_test_question.json` and `train_question_answer.json` 19 | 20 | 21 | ```bash 22 | python3 src/data_generator.py --path_folder_base data/zac2021-ltr-data/ --test_file public_test_question.json --topk 150 --tok --path_output_dir data/zalo-tfidfbm25150-full 23 | ``` 24 | >Note: 25 | > - `--test_file public_test_question.json` is optional; if it is omitted, a random 33% of `train_question_answer.json` is used as the test set 26 | > - `--path_output_dir` is the folder where the 3 output files (`train.csv`, `dev.csv`, `test.csv`) and the TF-IDF classifier (`tfidf_classifier.pkl`) for the top-k most relevant documents are saved. 27 | 28 | 2. Train the model 29 | ```bash 30 | bash scripts/run_finetune_bert.sh "magic" vinai/phobert-base ../ data/zalo-tfidfbm25150-full Tfbm150E5-full 5 31 | ``` 32 | 33 | 34 | 3.
Predict 35 | ```bash 36 | python3 src/infer.py 37 | ``` 38 | >Note: 39 | > This script loads the model(s) and runs prediction; please check and adjust the variable `model_configs` in `src/infer.py` as needed. 40 | 41 | 42 | Try our example on Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/phuongnm-bkhn/legal_text_retrieval/blob/master/scripts/legal_text_retrieval.ipynb) 43 | 44 | 45 | ## License 46 | MIT-licensed. 47 | 48 | ## Citation 49 | 50 | Please cite as: 51 | 52 | ```bibtex 53 | @article{DBLP:journals/corr/abs-2106-13405, 54 | author = {Ha{-}Thanh Nguyen and 55 | Phuong Minh Nguyen and 56 | Thi{-}Hai{-}Yen Vuong and 57 | Quan Minh Bui and 58 | Chau Minh Nguyen and 59 | Tran Binh Dang and 60 | Vu Tran and 61 | Minh Le Nguyen and 62 | Ken Satoh}, 63 | title = {{JNLP} Team: Deep Learning Approaches for Legal Processing Tasks in 64 | {COLIEE} 2021}, 65 | journal = {CoRR}, 66 | volume = {abs/2106.13405}, 67 | year = {2021}, 68 | url = {https://arxiv.org/abs/2106.13405}, 69 | eprinttype = {arXiv}, 70 | eprint = {2106.13405}, 71 | biburl = {https://dblp.org/rec/journals/corr/abs-2106-13405.bib}, 72 | bibsource = {dblp computer science bibliography, https://dblp.org} 73 | } 74 | ``` 75 | ```bibtex 76 | @article{DBLP:journals/corr/abs-2011-08071, 77 | author = {Ha{-}Thanh Nguyen and 78 | Hai{-}Yen Thi Vuong and 79 | Phuong Minh Nguyen and 80 | Tran Binh Dang and 81 | Quan Minh Bui and 82 | Vu Trong Sinh and 83 | Chau Minh Nguyen and 84 | Vu D. Tran and 85 | Ken Satoh and 86 | Minh Le Nguyen}, 87 | title = {{JNLP} Team: Deep Learning for Legal Processing in {COLIEE} 2020}, 88 | journal = {CoRR}, 89 | volume = {abs/2011.08071}, 90 | year = {2020}, 91 | url = {https://arxiv.org/abs/2011.08071}, 92 | eprinttype = {arXiv}, 93 | eprint = {2011.08071}, 94 | biburl = {https://dblp.org/rec/journals/corr/abs-2011-08071.bib}, 95 | bibsource = {dblp computer science bibliography, https://dblp.org} 96 | } 97 | ``` 98 | -------------------------------------------------------------------------------- /src/custom_rank_bm25.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import math 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | 7 | """ 8 | All of these algorithms have been taken from the paper: 9 | Trotman et al., Improvements to BM25 and Language Models Examined 10 | Here we implement all the BM25 variations mentioned.
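Note: this adapted copy additionally caches the per-term document-frequency vectors (q_freq_cache) in BM25Plus.get_scores, so scoring many queries against the same corpus does not recompute them.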
11 | """ 12 | 13 | 14 | class BM25: 15 | def __init__(self, corpus, tokenizer=None): 16 | self.corpus_size = len(corpus) 17 | self.avgdl = 0 18 | self.doc_freqs = [] 19 | self.idf = {} 20 | self.doc_len = [] 21 | self.tokenizer = tokenizer 22 | self.q_freq_cache = {} 23 | 24 | if tokenizer: 25 | corpus = self._tokenize_corpus(corpus) 26 | 27 | nd = self._initialize(corpus) 28 | self._calc_idf(nd) 29 | 30 | def _initialize(self, corpus): 31 | print("Loading BM25 model ...") 32 | nd = {} # word -> number of documents with word 33 | num_doc = 0 34 | for document in corpus: 35 | self.doc_len.append(len(document)) 36 | num_doc += len(document) 37 | 38 | frequencies = {} 39 | for word in document: 40 | if word not in frequencies: 41 | frequencies[word] = 0 42 | frequencies[word] += 1 43 | self.doc_freqs.append(frequencies) 44 | 45 | for word, freq in frequencies.items(): 46 | try: 47 | nd[word]+=1 48 | except KeyError: 49 | nd[word] = 1 50 | 51 | self.avgdl = num_doc / self.corpus_size 52 | return nd 53 | 54 | def _tokenize_corpus(self, corpus): 55 | pool = Pool(cpu_count()) 56 | tokenized_corpus = pool.map(self.tokenizer, corpus) 57 | return tokenized_corpus 58 | 59 | def _calc_idf(self, nd): 60 | raise NotImplementedError() 61 | 62 | def get_scores(self, query): 63 | raise NotImplementedError() 64 | 65 | def get_batch_scores(self, query, doc_ids): 66 | raise NotImplementedError() 67 | 68 | def get_top_n(self, query, documents, n=5): 69 | 70 | assert self.corpus_size == len(documents), "The documents given don't match the index corpus!" 71 | 72 | scores = self.get_scores(query) 73 | top_n = np.argsort(scores)[::-1][:n] 74 | return [documents[i] for i in top_n] 75 | 76 | 77 | class BM25Okapi(BM25): 78 | def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25): 79 | self.k1 = k1 80 | self.b = b 81 | self.epsilon = epsilon 82 | super().__init__(corpus, tokenizer) 83 | 84 | def _calc_idf(self, nd): 85 | """ 86 | Calculates frequencies of terms in documents and in corpus. 87 | This algorithm sets a floor on the idf values to eps * average_idf 88 | """ 89 | # collect idf sum to calculate an average idf for epsilon value 90 | idf_sum = 0 91 | # collect words with negative idf to set them a special epsilon value. 92 | # idf can be negative if word is contained in more than half of documents 93 | negative_idfs = [] 94 | for word, freq in nd.items(): 95 | idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) 96 | self.idf[word] = idf 97 | idf_sum += idf 98 | if idf < 0: 99 | negative_idfs.append(word) 100 | self.average_idf = idf_sum / len(self.idf) 101 | 102 | eps = self.epsilon * self.average_idf 103 | for word in negative_idfs: 104 | self.idf[word] = eps 105 | 106 | def get_scores(self, query): 107 | """ 108 | The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores, 109 | this algorithm also adds a floor to the idf value of epsilon. 110 | See [Trotman, A., X. Jia, M. 
Crane, Towards an Efficient and Effective Search Engine] for more info 111 | :param query: 112 | :return: 113 | """ 114 | score = np.zeros(self.corpus_size) 115 | doc_len = np.array(self.doc_len) 116 | for q in query: 117 | q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) 118 | score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) / 119 | (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))) 120 | return score 121 | 122 | def get_batch_scores(self, query, doc_ids): 123 | """ 124 | Calculate bm25 scores between query and subset of all docs 125 | """ 126 | assert all(di < len(self.doc_freqs) for di in doc_ids) 127 | score = np.zeros(len(doc_ids)) 128 | doc_len = np.array(self.doc_len)[doc_ids] 129 | for q in query: 130 | q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) 131 | score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) / 132 | (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))) 133 | return score.tolist() 134 | 135 | 136 | class BM25Plus(BM25): 137 | def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1): 138 | # Algorithm specific parameters 139 | self.k1 = k1 140 | self.b = b 141 | self.delta = delta 142 | super().__init__(corpus, tokenizer) 143 | 144 | def _calc_idf(self, nd): 145 | for word, freq in nd.items(): 146 | idf = math.log((self.corpus_size + 1) / freq) 147 | self.idf[word] = idf 148 | 149 | def get_scores(self, query): 150 | score = np.zeros(self.corpus_size) 151 | doc_len = np.array(self.doc_len) 152 | for q in query: 153 | if q in self.q_freq_cache: 154 | q_freq = self.q_freq_cache[q] 155 | else: 156 | q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) 157 | self.q_freq_cache[q] = q_freq 158 | score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) / 159 | (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)) 160 | return score 161 | 162 | def get_batch_scores(self, query, doc_ids): 163 | """ 164 | Calculate bm25 scores between query and subset of all docs 165 | """ 166 | assert all(di < len(self.doc_freqs) for di in doc_ids) 167 | score = np.zeros(len(doc_ids)) 168 | doc_len = np.array(self.doc_len)[doc_ids] 169 | for q in query: 170 | q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) 171 | score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) / 172 | (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)) 173 | return score.tolist() 174 | 175 | -------------------------------------------------------------------------------- /src/data_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | import re 5 | import traceback 6 | from typing import List 7 | import random 8 | 9 | import pandas as pd 10 | import argparse 11 | 12 | from stopwords_tfidf_generator import do_generate_stopwords 13 | from tfidf_classifier import do_classify 14 | from utils import Question, load_data_kse, postag_filter 15 | from vncorenlp import VnCoreNLP 16 | 17 | tokenizer_obj = VnCoreNLP("vncorenlp_data/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 18 | 19 | def vi_tokenize(str_in): 20 | # return str_in 21 | str_in = " ".join(str_in.split(" ")[:512]) 22 | sentences = [" ".join(s) for s in tokenizer_obj.tokenize(str_in)] 23 | 24 | return " ".join(sentences) 25 | 26 | def generate_pair_inputs(data_pred, data_gold, _c_keys, append_gold=False, sub_doc_info=None): 27 | _data_pairs_id = [] 28 | _, _, 
sub_key_mapping = sub_doc_info or (None, None, {}) 29 | 30 | for i in range(data_pred.shape[0]): 31 | cur_pred = [_c_keys[idx] for idx in data_pred[i]] 32 | cur_label = [a.get_id() for a in data_gold[i].relevant_a] 33 | q_id = data_gold[i].id 34 | 35 | if append_gold: 36 | for id_civil_lb in cur_label: 37 | if id_civil_lb not in cur_pred: 38 | cur_pred = cur_pred + [id_civil_lb] 39 | 40 | for j, id_civil_pred in enumerate(cur_pred): 41 | check_lb = id_civil_pred in cur_label 42 | _data_pairs_id.append(((id_civil_pred, q_id), check_lb)) 43 | 44 | # append sub articles (by chunking) 45 | for id_c in sub_key_mapping.get(id_civil_pred, []): 46 | _data_pairs_id.append(((id_c, q_id), check_lb)) 47 | 48 | print('Number data pairs: ', len(_data_pairs_id)) 49 | return _data_pairs_id 50 | 51 | 52 | def aggregate_sentence_pairs(_c_docs, _c_keys, _data_pairs_id, _q: List[Question], plus_filter_postags=False, filter_lb=False, 53 | empty_article_id="None", sub_doc_info=None): 54 | _new_dataset = [] 55 | _q_map = dict((q.id, q.content) for q in _q) 56 | empty_article_content = "" 57 | _c_docs = _c_docs + [empty_article_content] 58 | _c_keys = _c_keys + [empty_article_id] 59 | 60 | _c_sub_docs, _c_sub_keys, _ = sub_doc_info or (None, None, {}) 61 | _c_docs = _c_docs + (_c_sub_docs if _c_sub_docs is not None else []) 62 | _c_keys = _c_keys + (_c_sub_keys if _c_sub_keys is not None else []) 63 | 64 | for (id_civil_pred, q_id), lb in _data_pairs_id: 65 | try: 66 | _new_dataset.append({ 67 | "id": [id_civil_pred, q_id], 68 | "c_code": _c_docs[_c_keys.index(id_civil_pred)], 69 | "query": _q_map[q_id], 70 | 'label': lb 71 | }) 72 | 73 | if plus_filter_postags: 74 | if filter_lb and lb: 75 | _new_dataset.append({ 76 | "id": [id_civil_pred + "_pos_filtered", q_id], 77 | "c_code": postag_filter(_c_docs[_c_keys.index(id_civil_pred)]), 78 | "query": _q_map[q_id], 79 | 'label': lb 80 | }) 81 | if not filter_lb: 82 | _new_dataset.append({ 83 | "id": [id_civil_pred + "_pos_filtered", q_id], 84 | "c_code": postag_filter(_c_docs[_c_keys.index(id_civil_pred)]), 85 | "query": _q_map[q_id], 86 | 'label': lb 87 | }) 88 | except Exception as e: 89 | traceback.print_stack() 90 | print(e) 91 | print("[Err] {}".format(((id_civil_pred, q_id), lb))) 92 | return _new_dataset 93 | 94 | 95 | def aggregate_sentence_pairs_task5(_data_pairs_id, _q: List[Question]): 96 | _new_dataset = [] 97 | _q_map = dict((q.id, q.content) for q in _q) 98 | 99 | for q_id, lb in _data_pairs_id: 100 | _new_dataset.append({ 101 | "id": q_id, 102 | "query": _q_map[q_id], 103 | 'label': lb 104 | }) 105 | return _new_dataset 106 | 107 | 108 | def gen_mrpc_data(coliee_data_, file_path): 109 | data = { 110 | "label": [], 111 | "#1 ID": [], 112 | "#2 ID": [], 113 | "sentence1": [], 114 | "sentence2": [], 115 | } 116 | for e in coliee_data_: 117 | data['label'].append(1 if e['label'] else 0) 118 | data['#1 ID'].append(e['id'][1]) 119 | data['#2 ID'].append(e['id'][0]) 120 | data['sentence1'].append(e['query'].replace('\n', " ")) 121 | data['sentence2'].append(e['c_code'].replace('\n', " ")) 122 | df = pd.DataFrame(data=data) 123 | df.to_csv(file_path, index=False, sep=',') 124 | 125 | 126 | def gen_cola_data(coliee_data_, file_path): 127 | data = { 128 | "sentence": [], 129 | "label": [], 130 | "id": [], 131 | } 132 | for e in coliee_data_: 133 | data['label'].append(1 if e['label'] else 0) 134 | data['sentence'].append(e['query'].replace('\n', " ")) 135 | data['id'].append(e['id'].replace('\n', " ")) 136 | df = pd.DataFrame(data=data) 137 | df.to_csv(file_path, 
index=False, sep=',') 138 | 139 | if __name__ == "__main__": 140 | parser = argparse.ArgumentParser() 141 | parser.add_argument('--path_folder_base', 142 | action="store", dest="path_folder_base", 143 | help="path folder saving data", default='path/to/path_folder_base') 144 | parser.add_argument('--path_output_dir', 145 | action="store", dest="path_output_dir", 146 | help="path folder saving output data", default='path/to/path_output_dir') 147 | parser.add_argument('--type_data', 148 | action="store", dest="type_data", 149 | help="type data for generating process: task3 | task4", default='task3') 150 | parser.add_argument('--test_file', 151 | action="store", dest="test_file", type=str, 152 | help="path to the test file", default=None) 153 | parser.add_argument('--topk', 154 | action="store", dest="topk", type=int, 155 | help="topk select by tfidf when generating data", default=150) 156 | parser.add_argument('--only_test', 157 | action="store_true", dest="only_test", 158 | help="just generate testing data", default=False) 159 | parser.add_argument('--chunk_content_size', 160 | action="store", dest="chunk_content_size", type=int, 161 | help="chunk content of article with size", default=0) 162 | parser.add_argument('--chunk_content_stride', 163 | action="store", dest="chunk_content_stride", type=int, 164 | help="chunk content of article with stride", default=0) 165 | parser.add_argument('--tok', 166 | action="store_true", dest="tok", 167 | help="run tokenize", default=False) 168 | 169 | options = parser.parse_args() 170 | tokenizer = vi_tokenize if options.tok else None 171 | 172 | path_folder_base = options.path_folder_base 173 | topk_select = options.topk 174 | 175 | chunk_content_info = [options.chunk_content_size, 176 | options.chunk_content_stride] \ 177 | if options.chunk_content_size > 0 and options.chunk_content_stride > 0 else None 178 | 179 | test_ids = None 180 | if options.test_file is not None: 181 | test_data = json.load(open("{}/{}".format(path_folder_base, options.test_file))) 182 | if 'items' in test_data: 183 | test_data = test_data['items'] 184 | test_ids = [s["question_id"] for s in test_data] 185 | 186 | path_data_cached = '{}/tokenized_data_cached.pkl'.format(options.path_output_dir) 187 | if os.path.isfile(path_data_cached): 188 | print ("Load cached file data: {}".format(path_data_cached)) 189 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = pickle.load(open(path_data_cached, 'rb')) 190 | else: 191 | print ("Load data and tokenize data") 192 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = load_data_kse( 193 | path_folder_base=path_folder_base, ids_test=test_ids, tokenizer=tokenizer, testing_data=options.test_file, 194 | chunk_content_info=chunk_content_info 195 | ) 196 | try: 197 | pickle.dump((c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info), open(path_data_cached, 'wb')) 198 | except Exception as e: 199 | print(e) 200 | 201 | c_docs_raw = sub_doc_info[3] 202 | sub_doc_info = sub_doc_info[:3] 203 | 204 | # test_q = train_q 205 | if len(dev_q) == 0: 206 | dev_q = train_q 207 | if len(test_q) == 0: 208 | test_q = train_q 209 | 210 | stopwords = None 211 | 212 | # build tfidf vectorizer and generate pair sentence for training process 213 | # if text is tokenized, not combine tfidf with bm25, otherwise combine 214 | if not options.only_test: 215 | train_pred, (_, _, _, vectorizer) = do_classify(c_docs, c_keys, train_q, 216 | stopwords_=stopwords, topk=topk_select, tokenizer=tokenizer, combine_score=(not options.tok)) 217 | train_data_pairs_id = 
generate_pair_inputs(_c_keys=c_keys, data_pred=train_pred, data_gold=train_q, 218 | append_gold=True, sub_doc_info=sub_doc_info) 219 | else: 220 | train_data_pairs_id = [] 221 | 222 | test_pred, _ = do_classify( 223 | c_docs, c_keys, test_q, vectorizer=vectorizer, topk=topk_select, tokenizer=tokenizer, combine_score=(not options.tok)) 224 | test_data_pairs_id = generate_pair_inputs( 225 | _c_keys=c_keys, data_pred=test_pred, data_gold=test_q, sub_doc_info=sub_doc_info) 226 | 227 | dev_pred, _ = do_classify( 228 | c_docs, c_keys, dev_q, vectorizer=vectorizer, topk=topk_select, tokenizer=tokenizer, combine_score=(not options.tok)) 229 | dev_data_pairs_id = generate_pair_inputs( 230 | _c_keys=c_keys, data_pred=dev_pred, data_gold=dev_q, sub_doc_info=sub_doc_info) 231 | 232 | print("len(train_data_pairs_id), len(test_data_pairs_id), len(dev_data_pairs_id) = ", 233 | len(train_data_pairs_id), len(test_data_pairs_id), len(dev_data_pairs_id)) 234 | 235 | # save file csv following template of mrpc task 236 | path_folder_data_out = options.path_output_dir 237 | if not os.path.exists(path_folder_data_out): 238 | os.mkdir(path_folder_data_out) 239 | 240 | # fill data from train/test data_pairs_id 241 | new_dataset_train = aggregate_sentence_pairs(c_docs, c_keys, train_data_pairs_id, train_q, 242 | plus_filter_postags=False, 243 | filter_lb=False, sub_doc_info=sub_doc_info) 244 | new_dataset_test = aggregate_sentence_pairs(c_docs, c_keys, test_data_pairs_id, test_q, 245 | plus_filter_postags=False, 246 | filter_lb=False, sub_doc_info=sub_doc_info) 247 | new_dataset_dev = aggregate_sentence_pairs(c_docs, c_keys, dev_data_pairs_id, dev_q, 248 | plus_filter_postags=False, 249 | filter_lb=False, sub_doc_info=sub_doc_info) 250 | 251 | gen_mrpc_data(new_dataset_train, 252 | "{}/train.csv".format(path_folder_data_out)) 253 | gen_mrpc_data(new_dataset_test, "{}/test.csv".format(path_folder_data_out)) 254 | gen_mrpc_data(new_dataset_dev, "{}/dev.csv".format(path_folder_data_out)) 255 | 256 | # save tfidf vectorizer that filter fop 150 civil document 257 | pickle.dump(vectorizer, open( 258 | "{}/tfidf_classifier.pkl".format(path_folder_data_out), "wb")) 259 | 260 | 261 | -------------------------------------------------------------------------------- /src/infer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | import json 4 | import pickle 5 | 6 | from transformers.tokenization_utils import PreTrainedTokenizer 7 | from data_generator import vi_tokenize 8 | 9 | from tfidf_classifier import do_classify 10 | from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer 11 | from transformers import GlueDataTrainingArguments as DataTrainingArguments 12 | from transformers import ( 13 | HfArgumentParser, 14 | Trainer, 15 | TrainingArguments, 16 | ) 17 | 18 | from transformers.data.datasets.glue import * 19 | from transformers.data.processors.utils import InputExample 20 | 21 | from utils import Question, load_data_kse, standardize_data, Article 22 | import numpy as np 23 | 24 | 25 | @dataclass 26 | class ModelArguments: 27 | """ 28 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
29 | """ 30 | model_name_or_path: str = field( 31 | metadata={ 32 | "help": "Path to pretrained model or model identifier from huggingface.co/models"} 33 | ) 34 | config_name: Optional[str] = field( 35 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 36 | ) 37 | tokenizer_name: Optional[str] = field( 38 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 39 | ) 40 | cache_dir: Optional[str] = field( 41 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} 42 | ) 43 | 44 | 45 | class LawDataset(Dataset): 46 | """ 47 | This will be superseded by a framework-agnostic approach 48 | soon. 49 | """ 50 | args: GlueDataTrainingArguments 51 | output_mode: str 52 | features: List[InputFeatures] 53 | 54 | def __init__( 55 | self, 56 | args: GlueDataTrainingArguments, 57 | tokenizer: PreTrainedTokenizer, 58 | limit_examples: Optional[int] = None, 59 | mode: Union[str, Split] = Split.train, 60 | c_code=None, 61 | sentence=None, 62 | ): 63 | self.args = args 64 | task_name = 'mrpc' 65 | self.processor = glue_processors[task_name]() 66 | self.output_mode = 'classification' 67 | self.c_code = c_code if c_code is not None else [] 68 | self.sentence = sentence if sentence is not None else "" 69 | if isinstance(mode, str): 70 | try: 71 | mode = Split[mode] 72 | except KeyError: 73 | raise KeyError("mode is not a valid split name") 74 | 75 | label_list = self.processor.get_labels() 76 | self.label_list = label_list 77 | 78 | def _create_examples(lines, set_type='test'): 79 | examples = [] 80 | for (i, line) in enumerate(lines): 81 | guid = "%s-%s" % (set_type, i) 82 | text_a = line[3] 83 | text_b = line[4] 84 | label = None if set_type == "test" else line[0] 85 | examples.append(InputExample( 86 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 87 | return examples 88 | 89 | lines = [] 90 | for i, e in enumerate(self.c_code): 91 | lines.append([0, "sent_{}".format( 92 | i), e[1], self.sentence[i], e[0]]) 93 | 94 | # recreate the data 95 | examples = _create_examples(lines) 96 | if limit_examples is not None: 97 | examples = examples[:limit_examples] 98 | self.features = glue_convert_examples_to_features( 99 | examples, 100 | tokenizer, 101 | max_length=args.max_seq_length, 102 | label_list=label_list, 103 | output_mode=self.output_mode, 104 | ) 105 | 106 | def __len__(self): 107 | return len(self.features) 108 | 109 | def __getitem__(self, i) -> InputFeatures: 110 | return self.features[i] 111 | 112 | def get_labels(self): 113 | return self.label_list 114 | 115 | def get_c_code_ids(self): 116 | return [e[1] for e in self.c_code] 117 | 118 | 119 | def infer_coliee_task3(sentence, all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer, tokenizer=None, topk=150): 120 | if isinstance(sentence, str): 121 | sentence = [sentence] 122 | test_q = [Question(id='q{}'.format(i), content=tokenizer( 123 | s)if tokenizer is not None else s, content_raw=s, relevant_a=[]) for i, s in enumerate(sentence)] 124 | c_docs = all_civil_code[0] 125 | c_keys = all_civil_code[1] 126 | c_vect = all_civil_code[2] 127 | c_docs_keys = list(zip(all_civil_code[0], all_civil_code[1])) 128 | 129 | test_pred, _ = do_classify( 130 | c_docs, c_keys, test_q, vectorizer=tfidf_vectorizer, topk=topk, c_vect=c_vect, combine_score=(tokenizer is None)) 131 | 132 | c_code_pred_by_tfidf = [] 133 | coressponding_questions = [] 134 | for i, s_pred in enumerate(test_pred): 135 | for idx in 
s_pred: 136 | coressponding_questions.append(test_q[i].content) 137 | c_code_pred_by_tfidf.append(c_docs_keys[idx]) 138 | 139 | test_dataset = LawDataset(data_args, 140 | bert_tokenizer, 141 | mode='test', sentence=coressponding_questions, c_code=c_code_pred_by_tfidf) 142 | predictions = trainer.predict(test_dataset=test_dataset).predictions 143 | probs = torch.softmax(torch.from_numpy(predictions), dim=1) 144 | predicted_labels = torch.argmax(probs, 1) 145 | return predicted_labels, probs, c_code_pred_by_tfidf 146 | 147 | def list_split(listA, n): 148 | for x in range(0, len(listA), n): 149 | every_chunk = listA[x: n+x] 150 | 151 | if len(every_chunk) < n: 152 | every_chunk = every_chunk + \ 153 | [None for y in range(n-len(every_chunk))] 154 | yield every_chunk 155 | 156 | def init_state(path_c_code, path_data_org, path_preprocessed_data, model_path, tokenizer=None, topk=150, testing_data=None, max_seq_length=512, 157 | do_lower_case=True): 158 | model_version = model_path # 'bert-base-uncased' 159 | 160 | config = AutoConfig.from_pretrained( 161 | model_version, 162 | num_labels=2, 163 | finetuning_task='MRPC' 164 | ) 165 | model = AutoModelForSequenceClassification.from_pretrained( 166 | model_version, config=config) 167 | bert_tokenizer = AutoTokenizer.from_pretrained( 168 | model_version, do_lower_case=do_lower_case) 169 | model.eval() 170 | 171 | parser = HfArgumentParser( 172 | (ModelArguments, DataTrainingArguments, TrainingArguments)) 173 | model_args, data_args, training_args = parser.parse_args_into_dataclasses( 174 | args=["--model_name_or_path", model_version, 175 | "--task_name", "MRPC", 176 | "--data_dir", "./coliee3_2020/data", 177 | "--do_predict", 178 | "--per_device_train_batch_size", "16", 179 | "--max_seq_length", "{}".format(max_seq_length), 180 | "--learning_rate", "2e-5", 181 | "--output_dir", model_version, 182 | "--overwrite_output_dir"]) 183 | # Initialize our Trainer 184 | trainer = Trainer( 185 | model=model, 186 | args=training_args 187 | ) 188 | tfidf_vectorizer = pickle.load( 189 | open("{}/tfidf_classifier.pkl".format(path_preprocessed_data), "rb")) 190 | if isinstance(tfidf_vectorizer, tuple): 191 | tfidf_vectorizer, bm25_scorer = tfidf_vectorizer[0], tfidf_vectorizer[1] 192 | 193 | path_data_cached = '{}/tokenized_data_cached.pkl'.format( 194 | path_preprocessed_data) 195 | if os.path.isfile(path_data_cached): 196 | print("Load cached file data: {}".format(path_data_cached)) 197 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = pickle.load( 198 | open(path_data_cached, 'rb')) 199 | else: 200 | print("Load data and tokenize data") 201 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = load_data_kse( 202 | path_folder_base=path_data_org, ids_test=[ 203 | ], tokenizer=tokenizer, testing_data=testing_data, 204 | # chunk_content_info=chunk_content_info 205 | ) 206 | 207 | c_vect = tfidf_vectorizer.transform([standardize_data(d) for d in c_docs]) 208 | return (c_docs, c_keys, c_vect), data_args, (tfidf_vectorizer, bm25_scorer), trainer, bert_tokenizer 209 | 210 | 211 | if __name__ == "__main__": 212 | global all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer 213 | 214 | model_configs = { 215 | # 'NlpHUST': { 216 | # "path_data_org": 'data/zac2021-ltr-data/', 217 | # "path_c_code": 'data/zac2021-ltr-data/legal_corpus.json', 218 | # "tokenizer": None, 219 | # "topk": 150, 220 | # "do_lower_case": False, 221 | # "max_seq_length": 512, 222 | # "path_preprocessed_data": 'data/zalo-tfidfngrbm25150-notok-full/', 223 | # "model_path": 
'settings/NlpHTfbmngr150E5-notok-full42/models', 224 | # }, 225 | 'PhoBERT': { 226 | "path_data_org": 'data/zac2021-ltr-data/', 227 | "path_c_code": 'data/zac2021-ltr-data/legal_corpus.json', 228 | "tokenizer": 'vi_tokenize', 229 | "topk": 300, 230 | "do_lower_case": True, 231 | "max_seq_length": 256, 232 | "path_preprocessed_data": 'data/zalo-tfidfbm25150-full/', 233 | "model_path": 'settings/Tfbm150E5-full42/models', 234 | } 235 | } 236 | print(json.dumps(model_configs, indent=2)) 237 | 238 | test_all_data = json.load(open('data/zac2021-ltr-data/public_test_question.json'))['items'] 239 | test_ids = [e['question_id'] for e in test_all_data] 240 | test_sents = [e['question'] for e in test_all_data] 241 | 242 | # test_sents = [ 243 | # "Đới khoáng hóa là gì?", 244 | # "Kinh phí bảo đảm thi hành án đối với pháp nhân thương mại được quy định như thế nào?", 245 | # "Thời gian viên chức nghỉ thai sản có đánh giá chất lượng không?", 246 | # "Việc trình, giải quyết hồ sơ đề nghị sửa đổi, bổ sung Quyết định giao khu vực biển được quy định như thế nào?", 247 | # # "Hình thức kỷ luật hạ bậc lương trong việc xử lý VPHC sẽ áp dụng cho đối tượng nào?", 248 | # # "Nguyên tắc xác định tổ chức, cá nhân làm môi trường bị ô nhiễm, suy thoái theo quy định của pháp luật", 249 | # # "Người được đề xuất hình thức kỷ luật trong quốc phòng?", 250 | # ] 251 | 252 | # def init models 253 | model_init_states = {} 254 | print("Loading model ....") 255 | for m_name, model_info in model_configs.items(): 256 | if 'tokenizer' in model_info and model_info['tokenizer'] == 'vi_tokenize': 257 | model_info['tokenizer'] = vi_tokenize 258 | 259 | model_init_states[m_name] = init_state(**model_info) 260 | all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer = model_init_states[ 261 | m_name] 262 | 263 | tokenizer = model_info.get('tokenizer') 264 | topk = 150 265 | infer_coliee_task3(sentence=test_sents[:5], 266 | all_civil_code=all_civil_code, data_args=data_args, tfidf_vectorizer=tfidf_vectorizer, 267 | trainer=trainer, bert_tokenizer=bert_tokenizer, 268 | tokenizer=tokenizer, topk=topk) 269 | print("Finish loaded model") 270 | 271 | missing_ids_info = {} 272 | real_prediction = {} 273 | 274 | # start infer 275 | time_start = time.time() 276 | for m_name, model_info in model_configs.items(): 277 | if 'tokenizer' in model_info and model_info['tokenizer'] == 'vi_tokenize': 278 | model_info['tokenizer'] = vi_tokenize 279 | 280 | all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer = model_init_states[ 281 | m_name] 282 | tokenizer = model_info.get('tokenizer') 283 | topk = model_info.get('topk', 150) 284 | predicted_labels, probs, c_code_pred_by_tfidf = infer_coliee_task3(sentence=test_sents, all_civil_code=all_civil_code, 285 | data_args=data_args, 286 | tfidf_vectorizer=tfidf_vectorizer, 287 | trainer=trainer, bert_tokenizer=bert_tokenizer, 288 | tokenizer=tokenizer, topk=topk) 289 | 290 | predicted_labels = [x for x in list_split(predicted_labels, topk)] # np.array_split(predicted_labels, len(test_sents)) 291 | probs = [x for x in list_split(probs, topk)] #np.array_split(probs, len(test_sents)) 292 | c_code_pred_by_tfidf = [x for x in list_split(c_code_pred_by_tfidf, topk)] # np.array_split( c_code_pred_by_tfidf, len(test_sents)) 293 | 294 | result = [[{"label": True if lb == 1 else False, 295 | "scores": [float(probs[jj][i][j]) for j in range(probs[jj][i].shape[0])], 296 | "id": test_ids[jj], 297 | "sentence": s, 298 | # "civil_code": c_code_pred_by_tfidf[jj][i][0], 299 | "civil_code_id": 
c_code_pred_by_tfidf[jj][i][1], 300 | } 301 | for i, lb in enumerate(predicted_labels[jj]) if lb == 1] for jj, s in enumerate(test_sents)] 302 | 303 | current_missing_ids = [[{"label": False, 304 | "score": float(probs[jj][i][1]), 305 | "id": test_ids[jj], 306 | "civil_code_id": c_code_pred_by_tfidf[jj][i][1], 307 | } 308 | for i, lb in enumerate(predicted_labels[jj]) if lb == 0] for jj, s in enumerate(test_sents)] 309 | for negative_prediction in current_missing_ids: 310 | negative_prediction.sort(key=lambda info: info['score'], reverse=True) 311 | 312 | missing_ids_info[m_name] = current_missing_ids 313 | 314 | for jj, k in enumerate(test_ids): 315 | if k not in real_prediction: 316 | real_prediction[k] = set() 317 | real_prediction[k] = real_prediction[k].union( 318 | set([pred_infor['civil_code_id'] for pred_infor in result[jj]])) 319 | 320 | print(json.dumps(result, indent=2, ensure_ascii=False)) 321 | print("Finish inference on fine-tuned model {}, total time consuming: ".format( 322 | m_name), time.time() - time_start) 323 | print(len(result)) 324 | 325 | count_negative_add = 0 326 | for jj, k in enumerate(test_ids): 327 | if len(real_prediction[k]) == 0: 328 | count_negative_add += 1 329 | # pick 1 best score from negative prediction each model 330 | for m_name, _ in model_configs.items(): 331 | real_prediction[k].add(missing_ids_info[m_name][jj][0]['civil_code_id']) 332 | 333 | print("Total time consuming for {} samples: {} seconds => avg 1 sample in {} second".format( 334 | len(test_sents), time.time() - time_start, (time.time() - time_start) / len(test_sents))) 335 | 336 | submit_result = [] 337 | for k, v in real_prediction.items(): 338 | relevant_a_s = [] 339 | for relevant_a in v: 340 | tmp_a = Article.from_string(relevant_a) 341 | relevant_a_s.append({'law_id': tmp_a.l_id, 'article_id': tmp_a.a_id}) 342 | submit_result.append({ 343 | 'question_id': k, 344 | 'relevant_articles': relevant_a_s 345 | }) 346 | print("Count negative addition = {}".format(count_negative_add)) 347 | 348 | json.dump(submit_result, open("data/result_prediction.json", "wt", encoding='utf8'), ensure_ascii=False, indent=2) 349 | 350 | 351 | -------------------------------------------------------------------------------- /src/run_glue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
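In this repository the script is not pointed at a GLUE task: `scripts/run_finetune_bert.sh` passes the MRPC-style CSVs written by `gen_mrpc_data` in `src/data_generator.py` via `--train_file`/`--validation_file`/`--test_file`, and the script then picks up the `sentence1`/`sentence2` columns and the 0/1 `label` column. A minimal sketch of that expected layout (all values are placeholders):

```python
# Illustrative sketch only: the CSV layout produced by gen_mrpc_data and consumed here.
import pandas as pd

example = pd.DataFrame({
    "label": [1, 0],                       # 1 = the article is relevant to the query
    "#1 ID": ["q-001", "q-001"],           # question id (placeholder values)
    "#2 ID": ["<article-id-A>", "<article-id-B>"],
    "sentence1": ["<query text>", "<query text>"],
    "sentence2": ["<relevant article text>", "<non-relevant article text>"],
})
example.to_csv("train.csv", index=False, sep=",")
```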
18 | 19 | import logging 20 | import os 21 | import random 22 | import sys 23 | from dataclasses import dataclass, field 24 | from threading import local 25 | from typing import Optional 26 | 27 | import numpy as np 28 | from datasets import load_dataset, load_metric 29 | 30 | import transformers 31 | from transformers import ( 32 | AutoConfig, 33 | AutoModelForSequenceClassification, 34 | AutoTokenizer, 35 | DataCollatorWithPadding, 36 | EvalPrediction, 37 | HfArgumentParser, 38 | PretrainedConfig, 39 | Trainer, 40 | TrainingArguments, 41 | default_data_collator, 42 | set_seed, 43 | ) 44 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 45 | 46 | 47 | task_to_keys = { 48 | "cola": ("sentence", None), 49 | "mnli": ("premise", "hypothesis"), 50 | "mrpc": ("sentence1", "sentence2"), 51 | "qnli": ("question", "sentence"), 52 | "qqp": ("question1", "question2"), 53 | "rte": ("sentence1", "sentence2"), 54 | "sst2": ("sentence", None), 55 | "stsb": ("sentence1", "sentence2"), 56 | "wnli": ("sentence1", "sentence2"), 57 | } 58 | 59 | logger = logging.getLogger(__name__) 60 | 61 | 62 | @dataclass 63 | class DataTrainingArguments: 64 | """ 65 | Arguments pertaining to what data we are going to input our model for training and eval. 66 | 67 | Using `HfArgumentParser` we can turn this class 68 | into argparse arguments to be able to specify them on 69 | the command line. 70 | """ 71 | 72 | task_name: Optional[str] = field( 73 | default=None, 74 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 75 | ) 76 | max_seq_length: int = field( 77 | default=128, 78 | metadata={ 79 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 80 | "than this will be truncated, sequences shorter will be padded." 81 | }, 82 | ) 83 | overwrite_cache: bool = field( 84 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 85 | ) 86 | pad_to_max_length: bool = field( 87 | default=True, 88 | metadata={ 89 | "help": "Whether to pad all samples to `max_seq_length`. " 90 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 91 | }, 92 | ) 93 | train_file: Optional[str] = field( 94 | default=None, metadata={"help": "A csv or a json file containing the training data."} 95 | ) 96 | validation_file: Optional[str] = field( 97 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 98 | ) 99 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 100 | 101 | def __post_init__(self): 102 | if self.task_name is not None: 103 | self.task_name = self.task_name.lower() 104 | if self.task_name not in task_to_keys.keys(): 105 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 106 | elif self.train_file is None or self.validation_file is None: 107 | raise ValueError("Need either a GLUE task or a training/validation file.") 108 | else: 109 | train_extension = self.train_file.split(".")[-1] 110 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 111 | validation_extension = self.validation_file.split(".")[-1] 112 | assert ( 113 | validation_extension == train_extension 114 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 
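These dataclasses, together with `TrainingArguments`, are filled in from the command line through `HfArgumentParser`. A minimal sketch of how the flags used by `scripts/run_finetune_bert.sh` map onto them, assuming the dataclasses defined in this file (paths are placeholders and only a subset of the flags is shown):

```python
# Illustrative sketch only: parsing run_glue.py-style flags into the three dataclasses.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "vinai/phobert-base",
    "--num_label", "2",
    "--train_file", "<data_dir>/train.csv",
    "--validation_file", "<data_dir>/dev.csv",
    "--test_file", "<data_dir>/test.csv",
    "--max_seq_length", "512",
    "--per_device_train_batch_size", "16",
    "--learning_rate", "1e-5",
    "--num_train_epochs", "5",
    "--do_train", "--do_predict",
    "--output_dir", "<setting_dir>/models",
    "--overwrite_output_dir",
])
```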
115 | 116 | 117 | @dataclass 118 | class ModelArguments: 119 | """ 120 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 121 | """ 122 | 123 | model_name_or_path: str = field( 124 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 125 | ) 126 | config_name: Optional[str] = field( 127 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 128 | ) 129 | tokenizer_name: Optional[str] = field( 130 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 131 | ) 132 | cache_dir: Optional[str] = field( 133 | default=None, 134 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 135 | ) 136 | num_label: Optional[int] = field( 137 | default=None, 138 | metadata={"help": "The number of label"}, 139 | ) 140 | use_fast_tokenizer: bool = field( 141 | default=True, 142 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 143 | ) 144 | model_revision: str = field( 145 | default="main", 146 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 147 | ) 148 | use_auth_token: bool = field( 149 | default=False, 150 | metadata={ 151 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 152 | "with private models)." 153 | }, 154 | ) 155 | 156 | 157 | def main(): 158 | # See all possible arguments in src/transformers/training_args.py 159 | # or by passing the --help flag to this script. 160 | # We now keep distinct sets of args, for a cleaner separation of concerns. 161 | 162 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 163 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 164 | # If we pass only one argument to the script and it's the path to a json file, 165 | # let's parse it to get our arguments. 166 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 167 | else: 168 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 169 | 170 | # Detecting last checkpoint. 171 | last_checkpoint = None 172 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 173 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 174 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 175 | raise ValueError( 176 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 177 | "Use --overwrite_output_dir to overcome." 178 | ) 179 | elif last_checkpoint is not None: 180 | logger.info( 181 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 182 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
183 | ) 184 | 185 | # Setup logging 186 | logging.basicConfig( 187 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 188 | datefmt="%m/%d/%Y %H:%M:%S", 189 | handlers=[logging.StreamHandler(sys.stdout)], 190 | ) 191 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 192 | 193 | # Log on each process the small summary: 194 | logger.warning( 195 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 196 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 197 | ) 198 | # Set the verbosity to info of the Transformers logger (on main process only): 199 | if is_main_process(training_args.local_rank): 200 | transformers.utils.logging.set_verbosity_info() 201 | transformers.utils.logging.enable_default_handler() 202 | transformers.utils.logging.enable_explicit_format() 203 | logger.info(f"Training/evaluation parameters {training_args}") 204 | 205 | # Set seed before initializing model. 206 | set_seed(training_args.seed) 207 | 208 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 209 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 210 | # 211 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 212 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 213 | # label if at least two columns are provided. 214 | # 215 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 216 | # single column. You can easily tweak this behavior (see below) 217 | # 218 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 219 | # download the dataset. 220 | if data_args.task_name is not None: 221 | # Downloading and loading a dataset from the hub. 222 | datasets = load_dataset("glue", data_args.task_name) 223 | else: 224 | # Loading a dataset from your local files. 225 | # CSV/JSON training and evaluation files are needed. 226 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 227 | 228 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 229 | # when you use `do_predict` without specifying a GLUE benchmark task. 230 | if training_args.do_predict: 231 | if data_args.test_file is not None: 232 | train_extension = data_args.train_file.split(".")[-1] 233 | test_extension = data_args.test_file.split(".")[-1] 234 | assert ( 235 | test_extension == train_extension 236 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 237 | data_files["test"] = data_args.test_file 238 | else: 239 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 240 | 241 | for key in data_files.keys(): 242 | logger.info(f"load a local file for {key}: {data_files[key]}") 243 | 244 | if data_args.train_file.endswith(".csv"): 245 | # Loading a dataset from local csv files 246 | datasets = load_dataset("csv", data_files=data_files) 247 | else: 248 | # Loading a dataset from local json files 249 | datasets = load_dataset("json", data_files=data_files) 250 | # See more about loading any type of standard or custom dataset at 251 | # https://huggingface.co/docs/datasets/loading_datasets.html. 
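With the folders produced by `src/data_generator.py`, the `data_files` mapping above simply points at the three generated CSV splits. A minimal, equivalent stand-alone call (paths follow the README example and stand in for whatever `--path_output_dir` was used):

```python
# Illustrative sketch only: loading the generated CSV splits the same way this script does.
from datasets import load_dataset

data_files = {
    "train": "data/zalo-tfidfbm25150-full/train.csv",
    "validation": "data/zalo-tfidfbm25150-full/dev.csv",
    "test": "data/zalo-tfidfbm25150-full/test.csv",
}
raw_datasets = load_dataset("csv", data_files=data_files)
print(raw_datasets["train"].column_names)  # label, #1 ID, #2 ID, sentence1, sentence2
```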
252 | 253 | # Labels 254 | if data_args.task_name is not None: 255 | is_regression = data_args.task_name == "stsb" 256 | if not is_regression: 257 | label_list = datasets["train"].features["label"].names 258 | num_labels = len(label_list) 259 | else: 260 | num_labels = 1 261 | else: 262 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 263 | is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] 264 | if is_regression: 265 | num_labels = 1 266 | else: 267 | # A useful fast method: 268 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 269 | label_list = datasets["train"].unique("label") 270 | label_list.sort() # Let's sort it for determinism 271 | num_labels = len(label_list) 272 | 273 | if model_args.num_label is not None: 274 | num_labels = model_args.num_label 275 | if "label_list" not in locals(): 276 | label_list = datasets["train"].unique("label") 277 | label_list.sort() # Let's sort it for determinism 278 | num_labels_from_data = len(label_list) 279 | 280 | # Load pretrained model and tokenizer 281 | # 282 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 283 | # download model & vocab. 284 | config = AutoConfig.from_pretrained( 285 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 286 | num_labels=num_labels, 287 | finetuning_task=data_args.task_name, 288 | cache_dir=model_args.cache_dir, 289 | revision=model_args.model_revision, 290 | use_auth_token=True if model_args.use_auth_token else None, 291 | ) 292 | tokenizer = AutoTokenizer.from_pretrained( 293 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 294 | cache_dir=model_args.cache_dir, 295 | use_fast=model_args.use_fast_tokenizer, 296 | revision=model_args.model_revision, 297 | use_auth_token=True if model_args.use_auth_token else None, 298 | ) 299 | model = AutoModelForSequenceClassification.from_pretrained( 300 | model_args.model_name_or_path, 301 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 302 | config=config, 303 | cache_dir=model_args.cache_dir, 304 | revision=model_args.model_revision, 305 | use_auth_token=True if model_args.use_auth_token else None, 306 | ) 307 | 308 | # Preprocessing the datasets 309 | if data_args.task_name is not None: 310 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 311 | else: 312 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 313 | non_label_column_names = [name for name in datasets["train"].column_names if name != "label" and name != "id"] 314 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 315 | sentence1_key, sentence2_key = "sentence1", "sentence2" 316 | else: 317 | if len(non_label_column_names) >= 2: 318 | sentence1_key, sentence2_key = non_label_column_names[:2] 319 | else: 320 | sentence1_key, sentence2_key = non_label_column_names[0], None 321 | 322 | # Padding strategy 323 | if data_args.pad_to_max_length: 324 | padding = "max_length" 325 | else: 326 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 327 | padding = False 328 | 329 | # Some models have set the order of the labels to use, so let's make sure we do use it. 
330 | label_to_id = None 331 | if ( 332 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 333 | and data_args.task_name is not None 334 | and not is_regression 335 | ): 336 | # Some have all caps in their config, some don't. 337 | label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} 338 | if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): 339 | label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)} 340 | else: 341 | logger.warn( 342 | "Your model seems to have been trained with labels, but they don't match the dataset: ", 343 | f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 344 | "\nIgnoring the model labels as a result.", 345 | ) 346 | elif data_args.task_name is None and not is_regression: 347 | label_to_id = {v: i for i, v in enumerate(label_list)} 348 | 349 | if data_args.max_seq_length > tokenizer.model_max_length: 350 | logger.warn( 351 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 352 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 353 | ) 354 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 355 | 356 | def preprocess_function(examples): 357 | # Tokenize the texts 358 | args = ( 359 | (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) 360 | ) 361 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) 362 | 363 | # Map labels to IDs (not necessary for GLUE tasks) 364 | if label_to_id is not None and "label" in examples: 365 | result["label"] = [label_to_id[l] for l in examples["label"]] 366 | return result 367 | 368 | datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) 369 | 370 | train_dataset = datasets["train"] 371 | eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] 372 | if data_args.task_name is not None or data_args.test_file is not None: 373 | test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] 374 | 375 | # Log a few random samples from the training set: 376 | for index in random.sample(range(len(train_dataset)), 3): 377 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 378 | 379 | # Get the metric function 380 | if data_args.task_name is not None: 381 | metric = load_metric("glue", data_args.task_name) 382 | # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from 383 | # compute_metrics 384 | 385 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 386 | # predictions and label_ids field) and has to return a dictionary string to float. 
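    # Illustrative example: for the binary relevance labels used in this repo, the
    # function defined below returns a plain dict of metric name to float, e.g.
    # {"accuracy": 0.93} (value hypothetical), computed from the argmax of the
    # model logits compared against p.label_ids.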
387 | def compute_metrics(p: EvalPrediction): 388 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 389 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 390 | if data_args.task_name is not None: 391 | result = metric.compute(predictions=preds, references=p.label_ids) 392 | if len(result) > 1: 393 | result["combined_score"] = np.mean(list(result.values())).item() 394 | return result 395 | elif is_regression: 396 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 397 | else: 398 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 399 | 400 | # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 401 | if data_args.pad_to_max_length: 402 | data_collator = default_data_collator 403 | elif training_args.fp16: 404 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 405 | else: 406 | data_collator = None 407 | 408 | # Initialize our Trainer 409 | trainer = Trainer( 410 | model=model, 411 | args=training_args, 412 | train_dataset=train_dataset, 413 | eval_dataset=eval_dataset if training_args.do_eval else None, 414 | compute_metrics=compute_metrics, 415 | tokenizer=tokenizer, 416 | data_collator=data_collator, 417 | ) 418 | # trainer.num_train_epochs = trainer.num_train_epochs + training_args.num_train_epochs 419 | 420 | # Training 421 | if training_args.do_train: 422 | if last_checkpoint is not None: 423 | checkpoint = last_checkpoint 424 | elif os.path.isdir(model_args.model_name_or_path): 425 | checkpoint = model_args.model_name_or_path 426 | else: 427 | checkpoint = None 428 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 429 | metrics = train_result.metrics 430 | 431 | trainer.save_model() # Saves the tokenizer too for easy upload 432 | 433 | output_train_file = os.path.join(training_args.output_dir, "train_results.txt") 434 | if trainer.is_world_process_zero(): 435 | with open(output_train_file, "w") as writer: 436 | logger.info("***** Train results *****") 437 | for key, value in sorted(metrics.items()): 438 | logger.info(f" {key} = {value}") 439 | writer.write(f"{key} = {value}\n") 440 | 441 | # Need to save the state, since Trainer.save_model saves only the tokenizer with the model 442 | trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) 443 | 444 | # Evaluation 445 | eval_results = {} 446 | if training_args.do_eval: 447 | logger.info("*** Evaluate ***") 448 | 449 | # Loop to handle MNLI double evaluation (matched, mis-matched) 450 | tasks = [data_args.task_name] 451 | eval_datasets = [eval_dataset] 452 | if data_args.task_name == "mnli": 453 | tasks.append("mnli-mm") 454 | eval_datasets.append(datasets["validation_mismatched"]) 455 | 456 | for eval_dataset, task in zip(eval_datasets, tasks): 457 | eval_result = trainer.evaluate(eval_dataset=eval_dataset) 458 | 459 | output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt") 460 | if trainer.is_world_process_zero(): 461 | with open(output_eval_file, "w") as writer: 462 | logger.info(f"***** Eval results {task} *****") 463 | for key, value in sorted(eval_result.items()): 464 | logger.info(f" {key} = {value}") 465 | writer.write(f"{key} = {value}\n") 466 | 467 | eval_results.update(eval_result) 468 | 469 | if training_args.do_predict: 470 | logger.info("*** Test ***") 471 | 472 | # Loop to handle MNLI double evaluation (matched, mis-matched) 473 | tasks = [data_args.task_name] 474 | 
test_datasets = [test_dataset] 475 | if data_args.task_name == "mnli": 476 | tasks.append("mnli-mm") 477 | test_datasets.append(datasets["test_mismatched"]) 478 | 479 | for test_dataset, task in zip(test_datasets, tasks): 480 | # Removing the `label` columns because it contains -1 and Trainer won't like that. 481 | test_dataset.remove_columns_("label") 482 | predictions_mt = trainer.predict(test_dataset=test_dataset).predictions 483 | if num_labels_from_data < model_args.num_label: 484 | predictions_mt = predictions_mt[:, :num_labels_from_data] 485 | 486 | import pickle 487 | pickle.dump(predictions_mt, open(os.path.join(training_args.output_dir, 488 | "predictions.pkl" if 'train' not in data_args.test_file else "predictions_train.pkl"), "wb")) 489 | predictions = np.squeeze(predictions_mt) if is_regression else np.argmax(predictions_mt, axis=1) 490 | 491 | output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt" 492 | if 'train' not in data_args.test_file else f"train_results_{task}.txt" ) 493 | if trainer.is_world_process_zero(): 494 | with open(output_test_file, "w") as writer: 495 | logger.info(f"***** Test results {task} *****") 496 | writer.write("index\tprediction\n") 497 | for index, item in enumerate(predictions): 498 | if is_regression: 499 | writer.write(f"{index}\t{item:3.3f}\n") 500 | else: 501 | item = label_list[item] 502 | writer.write(f"{index}\t{item}\n") 503 | return eval_results 504 | 505 | 506 | def _mp_fn(index): 507 | # For xla_spawn (TPUs) 508 | main() 509 | 510 | 511 | if __name__ == "__main__": 512 | main() -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from typing import Any, Dict, List 3 | 4 | import nltk 5 | import glob 6 | import pickle 7 | import torch 8 | import pandas as pd 9 | import re 10 | import json 11 | import numpy as np 12 | 13 | rdrsegmenter = None 14 | 15 | 16 | def standardize_data(row): 17 | # Remove trailing periods, commas and question marks at the end of the sentence 18 | row = re.sub(r"[\.,\?]+$", "", row) 19 | # Remove all periods, commas, semicolons, exclamation marks, quotes, ... in the sentence
20 | row = row.replace(",", " ").replace(".", " ") \ 21 | .replace(";", " ").replace("“", " ") \ 22 | .replace(":", " ").replace("”", " ") \ 23 | .replace('"', " ").replace("'", " ") \ 24 | .replace("!", " ").replace("?", " ") \ 25 | .replace("-", " ").replace("?", " ") \ 26 | .replace("(", " ").replace(")", " ") 27 | row = re.sub(r" {2,}", " ", row).strip().lower() 28 | return row 29 | 30 | 31 | class Article: 32 | pattern = "{}-->{}-->{}" 33 | 34 | def __init__(self, a_id, l_id, content, content_raw=None) -> None: 35 | self.a_id = a_id 36 | self.l_id = l_id 37 | self.content = content 38 | self.content_raw = content_raw if content_raw is not None else content 39 | 40 | def __str__(self) -> str: 41 | return self.pattern.format(self.l_id, self.a_id, self.content) 42 | 43 | @classmethod 44 | def from_string(cls, str_in): 45 | info = str_in.split(cls.pattern.format("", "a", "").split("a")[0]) 46 | return cls(info[1], info[0], info[2]) 47 | 48 | def get_id(self): 49 | return self.pattern.format(self.l_id, self.a_id, "") 50 | 51 | def get_subid(self, sub_id): 52 | return self.pattern.format(self.l_id, self.a_id+"-sub{}".format(sub_id), "") 53 | 54 | 55 | class Question: 56 | pattern = "{}-->{}" 57 | 58 | def __init__(self, id, content, relevant_a: List[Article] = None, label: bool = None, content_raw=None) -> None: 59 | self.id = id 60 | self.content = content 61 | self.relevant_a = relevant_a or [] 62 | self.label = label 63 | self.content_raw = content_raw if content_raw is not None else content 64 | 65 | def __str__(self) -> str: 66 | return self.pattern.format(self.id, self.content) 67 | 68 | @classmethod 69 | def from_string(cls, str_in): 70 | info = str_in.split(cls.pattern.format("", "a", "").split("a")[0]) 71 | return cls(info[0], info[1]) 72 | 73 | def get_id(self): 74 | return self.pattern.format(self.id, "") 75 | 76 | 77 | def f_score(p, r, beta=1): 78 | y = (beta * beta * p + r) 79 | return (1 + beta * beta) * p * r / y if y != 0 else 0.0 80 | 81 | 82 | def micro_result(count_real_lb, count_predicted, count_true, count_gold_lb=138): 83 | p = count_true/count_predicted if count_predicted != 0 else 0.0 84 | r = count_true/count_real_lb if count_real_lb != 0 else 0.0 85 | result = {"count_real_lb": count_real_lb, 86 | "count_predicted": count_predicted, 87 | "count_gold_lb": count_gold_lb, 88 | "count_true": count_true, 89 | "P": p, 90 | "R": r, 91 | "f1": f_score(p, r, 1), 92 | "f2": f_score(p, r, 2), 93 | "f2_": f_score(p, count_true/count_gold_lb, 2)} 94 | print(result) 95 | return result 96 | 97 | 98 | def evaluate_by_similarity(similarities_, gold_data, c_keys, topk=150): 99 | count_true = 0 100 | count_all_prediction = 0 101 | count_all_gold_lb = 0 102 | 103 | idx_result = similarities_.argsort()[:, -topk:] 104 | for i in range(idx_result.shape[0]): 105 | gold_lb = gold_data[i]['result'] 106 | count_all_gold_lb += len(gold_lb) 107 | 108 | pred = [c_keys[idx] for idx in idx_result[i]] 109 | count_all_prediction += len(pred) 110 | 111 | for i, pred_lb in enumerate(pred): 112 | if pred_lb in gold_lb: 113 | count_true += 1 114 | 115 | print(count_true, count_all_prediction, count_all_gold_lb, 116 | 'P: ', count_true/count_all_prediction, 117 | 'R: ', count_true/count_all_gold_lb, 118 | 'F1: ', f_score(count_true*1.0/count_all_prediction, 119 | count_true*1.0/count_all_gold_lb), 120 | 'F2: ', f_score(count_true*1.0/count_all_prediction, 121 | count_true*1.0/count_all_gold_lb, beta=2), 122 | ) 123 | return idx_result 124 | 125 | 126 | def evaluate_by_label(prediction_file,
test_dat_file, ensemble_files=None): 127 | test_dat = pd.read_csv(test_dat_file, sep=',') 128 | predictions = [] 129 | 130 | count_real_lb = 0 131 | count_gold_lb = 138 132 | count_true = 0 133 | count_predicted = 0 134 | ensemble_files = ensemble_files or [] 135 | 136 | if prediction_file not in ensemble_files: 137 | ensemble_files.append(prediction_file) 138 | 139 | for pred_file in ensemble_files: 140 | prediction_ = pd.read_csv(pred_file, sep='\t') 141 | predictions.append(prediction_) 142 | 143 | for i in range(len(test_dat)): 144 | if test_dat['label'][i] == 1: 145 | count_real_lb += 1 146 | for prediction in predictions: 147 | if prediction['prediction'][i] == 1: 148 | count_true += 1 149 | break 150 | 151 | for prediction in predictions: 152 | if prediction['prediction'][i] == 1: 153 | count_predicted += 1 154 | break 155 | 156 | return micro_result(count_real_lb, count_predicted, count_true, count_gold_lb) 157 | 158 | def evaluate_idx(preds, gold_data, c_keys=None): 159 | try: 160 | count_true = 0 161 | count_all_prediction = 0 162 | count_all_gold_lb = 0 163 | 164 | for i_gold in range(len(preds)): 165 | gold_lb = [a.get_id() for a in gold_data[i_gold].relevant_a] 166 | count_all_gold_lb += len(gold_lb) 167 | 168 | pred = [c_keys[idx] for idx in preds[i_gold]] 169 | count_all_prediction += len(pred) 170 | 171 | for _i, pred_lb in enumerate(pred): 172 | if pred_lb in gold_lb: 173 | count_true += 1 174 | 175 | print(count_true, count_all_prediction, count_all_gold_lb, 176 | 'P: ', count_true / count_all_prediction, 177 | 'R: ', count_true / count_all_gold_lb, 178 | 'F1: ', f_score(count_true * 1.0 / count_all_prediction, 179 | count_true * 1.0 / count_all_gold_lb), 180 | 'F2: ', f_score(count_true * 1.0 / count_all_prediction, 181 | count_true * 1.0 / count_all_gold_lb, beta=2), 182 | ) 183 | return preds 184 | except Exception as e: 185 | print(e) 186 | return preds 187 | 188 | def combine_idxs(idx_ifidf, idx_bm25, top_k=100): 189 | preds = [] 190 | for i in range(len(idx_bm25)): 191 | prediction_item = [] 192 | for j in range(len(idx_bm25[i])): 193 | if idx_bm25[i][j] not in prediction_item: 194 | prediction_item.append(idx_bm25[i][j]) 195 | 196 | if len(prediction_item) == top_k: 197 | break 198 | 199 | if idx_ifidf[i][j] not in prediction_item: 200 | prediction_item.append(idx_ifidf[i][j]) 201 | 202 | if len(prediction_item) == top_k: 203 | break 204 | preds.append(prediction_item) 205 | return np.array(preds) 206 | 207 | def evaluate(similarities_, gold_data, topk=150, c_keys=None): 208 | try: 209 | idx_result = similarities_.argsort()[:, -topk:] 210 | return evaluate_idx(idx_result, gold_data, c_keys) 211 | except Exception as e: 212 | print(e) 213 | return idx_result 214 | 215 | def _article_content(full_content, chunk_content_info=None): 216 | if chunk_content_info is None: 217 | return ["{}".format(full_content)] 218 | chunk_content_size, chunk_content_stride = chunk_content_info[0], chunk_content_info[1] 219 | sub_contents = [] 220 | separate_w = ' ' 221 | words = full_content.split(separate_w) 222 | 223 | if len(words) > chunk_content_size: 224 | for i_start in range(0, len(words), chunk_content_size-chunk_content_stride): 225 | sub_cont = separate_w.join( 226 | words[i_start:i_start + chunk_content_size]) 227 | sub_contents.append(sub_cont) 228 | if len(words[i_start:i_start + chunk_content_size]) < chunk_content_size: 229 | break 230 | 231 | articles = ["{}".format(full_content)] + ["{}".format(sub_content) for sub_content in sub_contents] 232 | return articles 233 | 
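# Illustrative example: with chunk_content_info = (4, 2) the function above
# advances by chunk_content_size - chunk_content_stride = 2 words per window, so
#   _article_content("w1 w2 w3 w4 w5 w6", (4, 2))
#   -> ["w1 w2 w3 w4 w5 w6", "w1 w2 w3 w4", "w3 w4 w5 w6", "w5 w6"]
# i.e. the full article text is kept as the first element and the final,
# shorter window is still appended before the loop breaks.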
234 | 235 | def _do_nothing(str_in): 236 | return str_in 237 | 238 | def load_data_kse(path_folder_base="data/", postags_select=None, ids_test=None, ids_dev=None, tokenizer=None, 239 | law_corpus='legal_corpus.json', training_data='train_question_answer.json', testing_data=None, 240 | chunk_content_info=None): 241 | 242 | if tokenizer is None: 243 | tokenizer = _do_nothing 244 | 245 | articles = {} 246 | sub_articles = {} 247 | sub_key_mapping ={} 248 | articles_raw = json.load(open("{}/{}".format(path_folder_base, law_corpus))) 249 | for l_info in articles_raw: 250 | l_id = l_info['id'] if 'id' in l_info else l_info['law_id'] 251 | for a_info in l_info["articles"]: 252 | a_id = a_info['id'] if 'id' in a_info else a_info['article_id'] 253 | a_title = tokenizer(a_info['title']) + " . " if 'title' in a_info else "" 254 | a_title_raw = a_info['title'] + " . " if 'title' in a_info else "" 255 | 256 | a_content_s = _article_content(tokenizer(a_info['text']), chunk_content_info) 257 | a_content_s_raw = _article_content(a_info['text'], chunk_content_info) 258 | 259 | new_a = Article(l_id=l_id, a_id=a_id, content=a_title + a_content_s[0], content_raw=a_title_raw+a_content_s_raw[0]) 260 | k = new_a.get_id() 261 | articles[k] = new_a 262 | 263 | for i, a_content in enumerate(a_content_s[1:10]): 264 | new_sub_a = Article(l_id=l_id, a_id=a_id, content=a_title + a_content) 265 | sub_articles[new_a.get_subid(i)] = new_sub_a 266 | 267 | if k not in sub_key_mapping: 268 | sub_key_mapping[k] = [] 269 | sub_key_mapping[k].append(new_sub_a.get_subid(i)) 270 | 271 | print(len(articles)) 272 | print(articles[list(articles.keys())[0]]) 273 | 274 | # load annotated data 275 | data = [] 276 | q_raw = json.load(open("{}/{}".format(path_folder_base, training_data))) 277 | if testing_data is not None: 278 | q_raw_test = json.load(open("{}/{}".format(path_folder_base, testing_data))) 279 | if 'items' in q_raw and 'items' in q_raw_test: 280 | for e in q_raw_test['items']: 281 | e['relevant_articles'] = e.get('relevant_articles', []) 282 | q_raw['items'] = q_raw['items'] + q_raw_test['items'] 283 | 284 | if 'items' in q_raw: 285 | for q_info in q_raw['items']: 286 | data.append(Question(id=q_info["question_id"], content=tokenizer(q_info["question"])if tokenizer is not None else q_info["question"], 287 | content_raw=q_info["question"], 288 | relevant_a=[articles[Article( 289 | a_info["article_id"], a_info["law_id"], None).get_id()] for a_info in q_info["relevant_articles"]], 290 | label=True)) 291 | else: 292 | for q_info in q_raw: 293 | data.append(Question(id=q_info["question_id"], content=q_info["text"], relevant_a=[articles[Article( 294 | a_info["article_id"], a_info["law_id"], None).get_id()] for a_info in q_info["relevant_articles"]], 295 | label=q_info['label'])) 296 | 297 | # random test id 298 | if ids_test is None: 299 | ids_test = [q.id for idx, q in enumerate(data) if idx % 10 < 2] 300 | if ids_dev is None or len(ids_dev) == 0: 301 | ids_dev = ids_test 302 | print('Test ids = {}, Dev ids = {}', ids_test, ids_dev) 303 | 304 | print("Test ids ({} samples) = {}".format(len(ids_test), ids_test)) 305 | test_q = [q for q in data if q.id in ids_test] 306 | print('Len test_q', len(test_q)) 307 | dev_q = [q for q in data if q.id in ids_dev] 308 | print('Len dev_q', len(dev_q)) 309 | train_q = [q for q in data if q.id not in set(ids_test + ids_dev)] 310 | print('Len train_q', len(train_q)) 311 | 312 | c_docs = [] 313 | c_docs_raw = [] 314 | c_keys = [] 315 | for k, c in articles.items(): 316 | c_docs.append(c.content) 
317 | c_docs_raw.append(c.content_raw) 318 | c_keys.append(k) 319 | 320 | c_sub_docs, c_sub_keys = [], [] 321 | for k, c in sub_articles.items(): 322 | c_sub_docs.append(c.content) 323 | c_sub_keys.append(k) 324 | 325 | return c_docs, c_keys, dev_q, test_q, train_q, (c_sub_docs, c_sub_keys, sub_key_mapping, c_docs_raw) 326 | 327 | 328 | def postag_filter(input_str, tags_filter=["V", "N", "P", "."]): 329 | words = nltk.word_tokenize(input_str) 330 | pos = nltk.pos_tag(words) 331 | new_words = [] 332 | 333 | for p in pos: 334 | if p[1][0] in tags_filter: 335 | new_words.append(p[0]) 336 | return " ".join(new_words) 337 | 338 | 339 | def aggregate_results(base_folder, aggregate_predictions=None, keys=None): 340 | prediction_mt = pickle.load( 341 | open("{}/predictions.pkl".format(base_folder), "rb")) 342 | test_dat = pd.read_csv("{}/test.tsv".format(base_folder), sep="\t") 343 | prediction = pd.read_csv( 344 | "{}/test_results_mrpc.txt".format(base_folder), sep="\t") 345 | probs = torch.softmax(torch.from_numpy(prediction_mt), dim=1) 346 | 347 | # aggregate gold values 348 | if aggregate_predictions is None and keys is None: 349 | aggregate_predictions = {} 350 | keys = [] 351 | 352 | predicted_pairs = set() 353 | for k, v_s in aggregate_predictions.items(): 354 | for v in v_s: 355 | predicted_pairs.add((v[0], v[1])) 356 | 357 | for i in range(len(test_dat)): 358 | if prediction['prediction'][i] == 1: 359 | # H30-1-A Q0 886 1 0.193 JNLP 360 | query_id = test_dat["#1 ID"][i] 361 | c_id = test_dat["#2 ID"][i] 362 | score = probs[i][1] 363 | if query_id not in aggregate_predictions: 364 | keys.append(query_id) 365 | aggregate_predictions[query_id] = [] 366 | 367 | if (query_id, c_id) not in predicted_pairs: 368 | aggregate_predictions[query_id].append((query_id, c_id, score)) 369 | predicted_pairs.add((query_id, c_id)) 370 | 371 | return aggregate_predictions, keys 372 | 373 | 374 | def aggregate_all_results_task5(prediction_files, gold_test_file): 375 | prediction_mt = [pickle.load(open(f_, 'rb')) for f_ in prediction_files][0] 376 | 377 | # load test file - gold data for question id and article idß 378 | test_dat = pd.read_csv(gold_test_file, sep=",") 379 | 380 | predicted_pairs = {} 381 | 382 | probs = torch.softmax(torch.from_numpy(prediction_mt), dim=1) 383 | count_true = 0 384 | for i in range(len(test_dat)): 385 | # H30-1-A Q0 886 1 0.193 JNLP 386 | query_id = test_dat["id"][i] 387 | lb = test_dat["label"][i] == 1 388 | score = probs[i][1] 389 | predicted_pairs[query_id] = score 390 | 391 | pred_lb = True if probs[i][1] > probs[i][0] else False 392 | 393 | if pred_lb == lb: 394 | count_true += 1 395 | 396 | print("acc={}, true={}, total={}".format(count_true / len(test_dat), count_true, len(test_dat))) 397 | 398 | return count_true / len(test_dat), (count_true, len(test_dat), predicted_pairs) 399 | 400 | def aggregate_all_results(prediction_files, gold_test_file, topk=1, append_unpredicted_q=True, miss_ids_prediction_file=None): 401 | prediction_mt_s = [pickle.load(open(f_, 'rb')) for f_ in prediction_files] 402 | 403 | # load test file - gold data for question id and article idß 404 | test_dat = pd.read_csv(gold_test_file, sep=",") 405 | 406 | predicted_pairs = {} 407 | unpredicted_pairs = {} 408 | individual_model_stats = [[] for i in range(len(prediction_files))] 409 | 410 | for i_mod, prediction_mt in enumerate(prediction_mt_s): 411 | probs = torch.softmax(torch.from_numpy(prediction_mt), dim=1) 412 | 413 | for i in range(len(test_dat)): 414 | # H30-1-A Q0 886 1 0.193 JNLP 415 | 
query_id = test_dat["#1 ID"][i] 416 | c_id = re.sub(r'-sub\d+', '', test_dat["#2 ID"][i]) 417 | score = probs[i][1] 418 | 419 | if probs[i][1] > probs[i][0]: 420 | if (query_id, c_id) not in predicted_pairs: 421 | predicted_pairs[(query_id, c_id)] = [] 422 | 423 | predicted_pairs[(query_id, c_id)].append(score) 424 | else: 425 | if (query_id, c_id) not in unpredicted_pairs: 426 | unpredicted_pairs[(query_id, c_id)] = [] 427 | 428 | unpredicted_pairs[(query_id, c_id)].append(score) 429 | 430 | # stats each model 431 | individual_model_stats[i_mod].append((query_id, c_id, score)) 432 | 433 | # sort stats each model 434 | new_stats = [{} for i in range(len(prediction_files))] 435 | for i_mod, result in enumerate(individual_model_stats): 436 | for stat_e in result: 437 | if stat_e[0] not in new_stats[i_mod]: 438 | new_stats[i_mod][stat_e[0]] = [] 439 | new_stats[i_mod][stat_e[0]].append((stat_e[1], stat_e[2].item())) 440 | for q_id, v in new_stats[i_mod].items(): 441 | new_stats[i_mod][q_id].sort(key=lambda x: x[1], reverse=True) 442 | new_stats[i_mod][q_id] = new_stats[i_mod][q_id][:topk] 443 | individual_model_stats = new_stats 444 | 445 | # 446 | # aggregrate result from many models 447 | def aggregrate_result_(pairs_): 448 | aggregate_results = {} 449 | for k, v in pairs_.items(): 450 | if k[0] not in aggregate_results: 451 | aggregate_results[k[0]] = [] 452 | # aggregate_results[k[0]].append((k[0], k[1], max(v))) 453 | aggregate_results[k[0]].append((k[0], k[1], sum(v) / len(v))) 454 | return aggregate_results 455 | 456 | predicted_results = aggregrate_result_(predicted_pairs) 457 | unpredicted_results = aggregrate_result_(unpredicted_pairs) 458 | 459 | # append unpredicted question by top 1 460 | miss_prediction_keys = set() 461 | if append_unpredicted_q: 462 | miss_prediction_keys = set(unpredicted_results.keys()).difference( 463 | set(predicted_results.keys())) 464 | print('Miss {} question ids: {}'.format(len(miss_prediction_keys), miss_prediction_keys)) 465 | if miss_ids_prediction_file is not None: 466 | json.dump(list(miss_prediction_keys), open(miss_ids_prediction_file, "wt", encoding='utf8')) 467 | for q_id in miss_prediction_keys: 468 | unpredicted_results[q_id].sort(key=lambda x: x[2], reverse=True) 469 | predicted_results[q_id] = unpredicted_results[q_id][:topk if topk is not None else 1] 470 | 471 | # 472 | # aggregrate gold label 473 | gold_results = {} 474 | gold_all_q_ids = set() 475 | for i in range(len(test_dat)): 476 | query_id = test_dat["#1 ID"][i] 477 | # test_dat["#2 ID"][i] 478 | c_id = re.sub(r'-sub\d+', '', test_dat["#2 ID"][i]) 479 | gold_all_q_ids.add(query_id) 480 | 481 | if test_dat['label'][i] == 1: 482 | if query_id not in gold_results: 483 | gold_results[query_id] = [] 484 | gold_results[query_id].append((query_id, c_id, 1)) 485 | # 486 | # compute performance by accuracy task 4 487 | stats_task4 = {'pred': [], 'gold': []} 488 | for q_id in gold_all_q_ids: 489 | if q_id in gold_results: 490 | stats_task4['gold'].append((q_id, True)) 491 | else: 492 | stats_task4['gold'].append((q_id, False)) 493 | 494 | if q_id in predicted_results: 495 | stats_task4['pred'].append((q_id, True)) 496 | else: 497 | stats_task4['pred'].append((q_id, False)) 498 | right_count = len(set(stats_task4['pred']).intersection( 499 | set(stats_task4['gold']))) 500 | stats_task4['acc'] = right_count / len(gold_all_q_ids) 501 | stats_task4['correct_count'] = right_count 502 | stats_task4['total'] = len(gold_all_q_ids) 503 | 504 | # 505 | # compute performance by some metrics 506 | 
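    # Note on the per-question metrics computed below:
    #   P  = (# predicted articles that are gold) / (# predicted articles)
    #   R  = (# predicted articles that are gold) / (# gold articles)
    #   F2 = f_score(P, R, beta=2) = (1 + 2*2) * P * R / (2*2 * P + R)
    # `macro_f2` averages the per-question F2 scores, while `f2` is recomputed
    # from the averaged P and R.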
stats_result = {} 507 | for q_id in gold_all_q_ids: 508 | stats_result[q_id] = {} 509 | if q_id not in gold_results or q_id not in predicted_results: 510 | stats_result[q_id]['pred'] = [x[1] 511 | for x in predicted_results.get(q_id, [])] 512 | stats_result[q_id]['enssemble_score'] = [x[2].item() 513 | for x in predicted_results.get(q_id, [])] 514 | stats_result[q_id]['gold'] = [] 515 | stats_result[q_id]["P"] = 0 516 | stats_result[q_id]["R"] = 0 517 | stats_result[q_id]["F2"] = 0 518 | else: 519 | articles_prediction = [x[1]for x in predicted_results[q_id]] 520 | articles_gold = [x[1]for x in gold_results[q_id]] 521 | stats_result[q_id]['pred'] = articles_prediction 522 | stats_result[q_id]['enssemble_score'] = [x[2].item() 523 | for x in predicted_results[q_id]] 524 | stats_result[q_id]['gold'] = articles_gold 525 | count_true = len( 526 | set(articles_prediction).intersection(set(articles_gold))) 527 | stats_result[q_id]["P"] = count_true / \ 528 | len(set(articles_prediction)) 529 | stats_result[q_id]["R"] = count_true / len(set(articles_gold)) 530 | stats_result[q_id]["F2"] = f_score( 531 | stats_result[q_id]["P"], stats_result[q_id]["R"], beta=2) 532 | 533 | stats_result[q_id]['found_by_model'] = q_id not in miss_prediction_keys 534 | stats_result[q_id]['detail_scores'] = [individual_model_stats[i][q_id] 535 | for i in range(len(prediction_files))] 536 | 537 | all_p = [stats_result[q_id]['P'] for q_id in stats_result] 538 | p = sum(all_p) / len(all_p) 539 | 540 | all_r = [stats_result[q_id]['R'] for q_id in stats_result] 541 | r = sum(all_r) / len(all_r) 542 | 543 | all_f2 = [stats_result[q_id]['F2'] for q_id in stats_result] 544 | macro_f2 = sum(all_f2) / len(all_f2) 545 | 546 | f2 = f_score(p, r, beta=2) 547 | 548 | overall_result = {'p': p, 'r': r, 'f2': f2, 549 | 'macro_f2': macro_f2, 'acc_task4': stats_task4} 550 | stats_result.update(overall_result) 551 | # pprint(stats_result) 552 | print('task 4:', "{:2.2f}".format( 553 | stats_task4['acc']*100), stats_task4['correct_count'], stats_task4['total']) 554 | 555 | return stats_result 556 | 557 | 558 | def generate_file_submission(stats_result: Dict[str, Any], file_name: str, topk: int = None): 559 | predictions = {} 560 | for q_id, a_info in stats_result.items(): 561 | if '-' not in q_id: 562 | continue 563 | if q_id not in predictions: 564 | predictions[q_id] = [] 565 | if topk is None: 566 | for i_pred, pred in enumerate(zip(a_info['pred'], a_info['enssemble_score'])): 567 | predictions[q_id].append((q_id, pred[0], pred[1])) 568 | else: 569 | enssemble_scores = {} 570 | # aggregate all score 571 | for scores_model_i in a_info['detail_scores']: 572 | for score in scores_model_i: 573 | a_id = score[0] 574 | score_raw = score[1] 575 | if a_id not in enssemble_scores: 576 | enssemble_scores[a_id] = [] 577 | enssemble_scores[a_id].append(score_raw) 578 | # get mean all score 579 | for a_id in enssemble_scores: 580 | # max(enssemble_scores[a_id]) # 581 | enssemble_scores[a_id] = sum( 582 | enssemble_scores[a_id]) / len(enssemble_scores[a_id]) 583 | 584 | for a_id, score_enss in enssemble_scores.items(): 585 | predictions[q_id].append((q_id, a_id, score_enss)) 586 | 587 | keys_ = predictions.keys() 588 | for query_id in keys_: 589 | predictions[query_id].sort(key=lambda x: x[2], reverse=True) 590 | if topk is not None: 591 | # if len(predictions[query_id]) < topk: 592 | # print("exception in {}, countpred = {}, topk={}".format(query_id, len(predictions[query_id]), topk)) 593 | predictions[query_id] = predictions[query_id][:topk] 594 | 
prediction_str = [] 595 | for query_id in keys_: 596 | for i, prediction_info in enumerate(predictions[query_id]): 597 | template = "{} {} {} {} {:.9f} {}" 598 | 599 | # H30-1-A Q0 886 1 0.193 JNLP 600 | prediction_str.append( 601 | template.format(query_id, "Q0", prediction_info[1], i + 1, prediction_info[2], "JNLP")) 602 | 603 | with open(file_name, "wt", encoding="utf8") as f: 604 | f.write("\n".join(prediction_str)) 605 | -------------------------------------------------------------------------------- /scripts/legal_text_retrieval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "legal-text-retrieval.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "colab": { 25 | "base_uri": "https://localhost:8080/" 26 | }, 27 | "id": "ep8Xqg5y3x1W", 28 | "outputId": "7b78e40e-4a7d-4f9c-9d73-cad8ea0fa44e" 29 | }, 30 | "outputs": [ 31 | { 32 | "output_type": "stream", 33 | "name": "stdout", 34 | "text": [ 35 | "Cloning into 'legal_text_retrieval'...\n", 36 | "remote: Enumerating objects: 32, done.\u001b[K\n", 37 | "remote: Counting objects: 100% (32/32), done.\u001b[K\n", 38 | "remote: Compressing objects: 100% (26/26), done.\u001b[K\n", 39 | "remote: Total 32 (delta 5), reused 32 (delta 5), pack-reused 0\u001b[K\n", 40 | "Unpacking objects: 100% (32/32), done.\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "!git clone https://github.com/phuongnm-bkhn/legal_text_retrieval" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "source": [ 51 | "!git clone https://github.com/vncorenlp/VnCoreNLP.git vncorenlp_data\n", 52 | "!pip install -r legal_text_retrieval/requirements.txt" 53 | ], 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/" 57 | }, 58 | "id": "6xoYIRTU3_bF", 59 | "outputId": "358b0494-8aa8-4a3a-9ff3-d1e2c5bbdacd" 60 | }, 61 | "execution_count": null, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "name": "stdout", 66 | "text": [ 67 | "Cloning into 'vncorenlp_data'...\n", 68 | "remote: Enumerating objects: 215, done.\u001b[K\n", 69 | "remote: Counting objects: 33% (1/3)\u001b[K\rremote: Counting objects: 66% (2/3)\u001b[K\rremote: Counting objects: 100% (3/3)\u001b[K\rremote: Counting objects: 100% (3/3), done.\u001b[K\n", 70 | "remote: Compressing objects: 100% (3/3), done.\u001b[K\n", 71 | "remote: Total 215 (delta 0), reused 0 (delta 0), pack-reused 212\u001b[K\n", 72 | "Receiving objects: 100% (215/215), 214.22 MiB | 30.89 MiB/s, done.\n", 73 | "Resolving deltas: 100% (76/76), done.\n", 74 | "Requirement already satisfied: sklearn in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 1)) (0.0)\n", 75 | "Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 2)) (3.2.5)\n", 76 | "Requirement already satisfied: gensim in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 3)) (3.6.0)\n", 77 | "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 4)) (1.1.5)\n", 78 | "Collecting fugashi\n", 79 | " 
Downloading fugashi-1.1.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (490 kB)\n", 80 | "\u001b[K |████████████████████████████████| 490 kB 5.4 MB/s \n", 81 | "\u001b[?25hCollecting mecab-python3\n", 82 | " Downloading mecab_python3-1.0.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (488 kB)\n", 83 | "\u001b[K |████████████████████████████████| 488 kB 41.4 MB/s \n", 84 | "\u001b[?25hCollecting unidic-lite\n", 85 | " Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)\n", 86 | "\u001b[K |████████████████████████████████| 47.4 MB 1.6 MB/s \n", 87 | "\u001b[?25hCollecting transformers==4.3.2\n", 88 | " Downloading transformers-4.3.2-py3-none-any.whl (1.8 MB)\n", 89 | "\u001b[K |████████████████████████████████| 1.8 MB 33.6 MB/s \n", 90 | "\u001b[?25hRequirement already satisfied: openpyxl in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 9)) (2.5.9)\n", 91 | "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 10)) (1.10.0+cu111)\n", 92 | "Requirement already satisfied: xlwt in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 11)) (1.3.0)\n", 93 | "Collecting datasets\n", 94 | " Downloading datasets-1.17.0-py3-none-any.whl (306 kB)\n", 95 | "\u001b[K |████████████████████████████████| 306 kB 39.1 MB/s \n", 96 | "\u001b[?25hCollecting sentencepiece\n", 97 | " Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", 98 | "\u001b[K |████████████████████████████████| 1.2 MB 32.0 MB/s \n", 99 | "\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 14)) (3.17.3)\n", 100 | "Collecting rank_bm25\n", 101 | " Downloading rank_bm25-0.2.1-py3-none-any.whl (8.5 kB)\n", 102 | "Requirement already satisfied: seaborn in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 16)) (0.11.2)\n", 103 | "Collecting vncorenlp\n", 104 | " Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)\n", 105 | "\u001b[K |████████████████████████████████| 2.6 MB 31.3 MB/s \n", 106 | "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (1.19.5)\n", 107 | "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (4.8.2)\n", 108 | "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.4.0)\n", 109 | "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (21.3)\n", 110 | "Collecting sacremoses\n", 111 | " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", 112 | "\u001b[K |████████████████████████████████| 895 kB 44.2 MB/s \n", 113 | "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", 114 | " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", 115 | "\u001b[K |████████████████████████████████| 3.3 MB 27.6 MB/s \n", 116 | "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r 
legal_text_retrieval/requirements.txt (line 8)) (2019.12.20)\n", 117 | "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (2.23.0)\n", 118 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (4.62.3)\n", 119 | "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn->-r legal_text_retrieval/requirements.txt (line 1)) (1.0.1)\n", 120 | "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk->-r legal_text_retrieval/requirements.txt (line 2)) (1.15.0)\n", 121 | "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim->-r legal_text_retrieval/requirements.txt (line 3)) (1.4.1)\n", 122 | "Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.7/dist-packages (from gensim->-r legal_text_retrieval/requirements.txt (line 3)) (5.2.1)\n", 123 | "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->-r legal_text_retrieval/requirements.txt (line 4)) (2018.9)\n", 124 | "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->-r legal_text_retrieval/requirements.txt (line 4)) (2.8.2)\n", 125 | "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.7/dist-packages (from openpyxl->-r legal_text_retrieval/requirements.txt (line 9)) (1.1.0)\n", 126 | "Requirement already satisfied: jdcal in /usr/local/lib/python3.7/dist-packages (from openpyxl->-r legal_text_retrieval/requirements.txt (line 9)) (1.4.1)\n", 127 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->-r legal_text_retrieval/requirements.txt (line 10)) (3.10.0.2)\n", 128 | "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets->-r legal_text_retrieval/requirements.txt (line 12)) (0.3.4)\n", 129 | "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets->-r legal_text_retrieval/requirements.txt (line 12)) (0.70.12.2)\n", 130 | "Collecting fsspec[http]>=2021.05.0\n", 131 | " Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)\n", 132 | "\u001b[K |████████████████████████████████| 132 kB 46.8 MB/s \n", 133 | "\u001b[?25hRequirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets->-r legal_text_retrieval/requirements.txt (line 12)) (3.0.0)\n", 134 | "Collecting xxhash\n", 135 | " Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)\n", 136 | "\u001b[K |████████████████████████████████| 243 kB 33.5 MB/s \n", 137 | "\u001b[?25hCollecting huggingface-hub<1.0.0,>=0.1.0\n", 138 | " Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n", 139 | "\u001b[K |████████████████████████████████| 61 kB 519 kB/s \n", 140 | "\u001b[?25hCollecting aiohttp\n", 141 | " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", 142 | "\u001b[K |████████████████████████████████| 1.1 MB 41.7 MB/s \n", 143 | "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets->-r legal_text_retrieval/requirements.txt (line 12)) (3.13)\n", 144 | 
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.0.6)\n", 145 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (1.24.3)\n", 146 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.0.4)\n", 147 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (2021.10.8)\n", 148 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (2.10)\n", 149 | "Requirement already satisfied: matplotlib>=2.2 in /usr/local/lib/python3.7/dist-packages (from seaborn->-r legal_text_retrieval/requirements.txt (line 16)) (3.2.2)\n", 150 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=2.2->seaborn->-r legal_text_retrieval/requirements.txt (line 16)) (0.11.0)\n", 151 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=2.2->seaborn->-r legal_text_retrieval/requirements.txt (line 16)) (1.3.2)\n", 152 | "Collecting aiosignal>=1.1.2\n", 153 | " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", 154 | "Collecting yarl<2.0,>=1.0\n", 155 | " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", 156 | "\u001b[K |████████████████████████████████| 271 kB 35.7 MB/s \n", 157 | "\u001b[?25hCollecting multidict<7.0,>=4.5\n", 158 | " Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)\n", 159 | "\u001b[K |████████████████████████████████| 160 kB 42.5 MB/s \n", 160 | "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets->-r legal_text_retrieval/requirements.txt (line 12)) (21.2.0)\n", 161 | "Collecting frozenlist>=1.1.1\n", 162 | " Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)\n", 163 | "\u001b[K |████████████████████████████████| 192 kB 43.6 MB/s \n", 164 | "\u001b[?25hCollecting asynctest==0.13.0\n", 165 | " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", 166 | "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets->-r legal_text_retrieval/requirements.txt (line 12)) (2.0.8)\n", 167 | "Collecting async-timeout<5.0,>=4.0.0a3\n", 168 | " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", 169 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.6.0)\n", 170 | "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (7.1.2)\n", 171 | "Requirement already satisfied: joblib in 
/usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (1.1.0)\n", 172 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->-r legal_text_retrieval/requirements.txt (line 1)) (3.0.0)\n", 173 | "Building wheels for collected packages: unidic-lite, vncorenlp\n", 174 | " Building wheel for unidic-lite (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 175 | " Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658836 sha256=9a2f1ce4d990b4cb1836d0c513a302d4fcc03b7a996150dd2ea8bc8c8a229445\n", 176 | " Stored in directory: /root/.cache/pip/wheels/de/69/b1/112140b599f2b13f609d485a99e357ba68df194d2079c5b1a2\n", 177 | " Building wheel for vncorenlp (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 178 | " Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645951 sha256=b9e7a739e682cc3014f4451ed33864d7f84ae738e0e5e881b49ca3720cc0465d\n", 179 | " Stored in directory: /root/.cache/pip/wheels/0c/d8/f2/d28d97379b4f6479bf51247c8dfd57fa00932fa7a74b6aab29\n", 180 | "Successfully built unidic-lite vncorenlp\n", 181 | "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, tokenizers, sacremoses, huggingface-hub, vncorenlp, unidic-lite, transformers, sentencepiece, rank-bm25, mecab-python3, fugashi, datasets\n", 182 | "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.17.0 frozenlist-1.2.0 fsspec-2021.11.1 fugashi-1.1.1 huggingface-hub-0.2.1 mecab-python3-1.0.4 multidict-5.2.0 rank-bm25-0.2.1 sacremoses-0.0.46 sentencepiece-0.1.96 tokenizers-0.10.3 transformers-4.3.2 unidic-lite-1.0.8 vncorenlp-1.0.3 xxhash-2.0.2 yarl-1.7.2\n" 183 | ] 184 | } 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "source": [ 190 | "!cd legal_text_retrieval/data/ && unzip zac2021-ltr-data.zip" 191 | ], 192 | "metadata": { 193 | "colab": { 194 | "base_uri": "https://localhost:8080/" 195 | }, 196 | "id": "A4NnMJxiCQOK", 197 | "outputId": "4cf03fc3-2445-4c54-930b-ff6e1f0f0236" 198 | }, 199 | "execution_count": null, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "name": "stdout", 204 | "text": [ 205 | "Archive: zac2021-ltr-data.zip\n", 206 | " creating: zac2021-ltr-data/\n", 207 | " inflating: zac2021-ltr-data/train_question_answer.json \n", 208 | " inflating: zac2021-ltr-data/bug.log \n", 209 | " inflating: zac2021-ltr-data/private_test_question.json \n", 210 | " inflating: zac2021-ltr-data/legal_corpus.json \n", 211 | " inflating: zac2021-ltr-data/public_test_question.json \n", 212 | " inflating: zac2021-ltr-data/public_test_sample_submission.json \n" 213 | ] 214 | } 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "source": [ 220 | "\n", 221 | "!mkdir legal_text_retrieval/data/zalo-tfidfbm25150-full\n", 222 | "\n", 223 | "# increase topk to increate the recall score, maximum perfromance that we found is topk=150 => however, it is time-consuming\n", 224 | "!python3 legal_text_retrieval/src/data_generator.py --path_folder_base legal_text_retrieval/data/zac2021-ltr-data/ --test_file public_test_question.json --topk 20 --tok --path_output_dir legal_text_retrieval/data/zalo-tfidfbm25150-full" 225 | ], 226 | "metadata": { 227 | "colab": { 228 | "base_uri": "https://localhost:8080/" 229 | }, 230 | "id": "EifhPKVA4G45", 231 | "outputId": "4fc7abe4-b13e-466b-8b60-357a2374186d" 232 | }, 233 | 
"execution_count": null, 234 | "outputs": [ 235 | { 236 | "metadata": { 237 | "tags": null 238 | }, 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Load data and tokenize data\n", 243 | "61425\n", 244 | "01/2009/tt-bnn-->1-->Điều 1 . Phạm_vi áp_dụng . Thông_tư này hướng_dẫn tuần_tra , canh_gác bảo_vệ đê Điều trong mùa lũ đối_với các tuyến đê sông được phân_loại , phân_cấp theo quy_định tại Điều 4 của Luật Đê_Điều .\n", 245 | "Test ids = {}, Dev ids = {} ['71d7a66777628e3f9b8b85270366166c', '9032f491fd4ef429d9c18b3b015bd11b', '776e27836f8fbe84f60a850952498988', '8d4086520df91f54eec6a4accf796569', 'c0a8f1d99cd3b7ee60c21d1e9b98125c', 'd27c13871d62e88714fdaf539e50382e', '516686bcd8331a26428556e346c377c5', 'a8f2bbaf928d9834c8257188ca2a0780', '98603c2f4ff85046a071701c2659295d', '867833a9097a0838ad3a0a9df0f7a4a5', 'ecf80f40365366e561ce1b900a43c8f8', '761000dc2471950ac51f58d60694e78a', '38959f4d0d0d96f0f567b4e8c3e24809', '0f4bd20ef527fa6d503f0747f91247e8', 'c6c616ea37c4f4e9f753a1a26c1f513f', '1dd1899fc4c0293f6bbb95af80cc39c4', '1d07fcbe2a6900c4e4ea4409ae2c5270', '568d6332f094e839329a4d66d30dc6d0', '6d9ed75aecea4cb4e6298bd638ff4afc', '0b0fd757eef935f9609cbfab1443840d', '8031c4b9f388c424ba2ed4da91f00173', 'a1c8fc922eac56a7bd275565defa9763', '07f44d71c453c52f9f055ab877b909cc', 'a71a739c154aa77fb2800a426818df5e', '998fe36c4f8131243b766398f5801c22', '5db6785789345d1638b1046fee3b4c76', 'fb85950dc8effebfa15c9555f238d7ec', '927893d2acecb4415e1f68f923dce782', '1cded34a5dfb786368208f632e5a5d6f', 'e180827dbd190aa6e9651be5d08f7230', 'e243fda5f1cb39ea6c810564c025638b', '37710e7c0449baedbea4236ef2306da1', 'eaba6c7538a5d6f7f993e1bed678db73', 'c062f1a1e198a4da90193c5e06c484c8', '57cd65ad7d3909ce9b8fe9d4e2fdf709', 'f571b343e99c24e012804a5b04a591f4', '97ca93ecb6d659c2af7e71a269ad021a', 'bec4c14ba02a32fad812ee94a98c1602', '12f506bf952aec9b3d8289a62ded85d6', 'c396496b481e2685a7733cfdbeaf925b', '932b64b6d7fa0605031eaa6f3c63eea5', 'b00b0b63b241943957b48e9a4f9b7c27', '085fd00ce06d27d19f40b330c7d8ba0d', 'a6a55291b82d26ed8852361a80575e89', '0f1a741eea4c4cbdba9d46dd3d762e2a', 'a6f2a358b6a2a33fd8c44830293b8323', '6c5bebf8e67ea0562455656f4132dca8', '27c8850bb21db8dbccc40ebadaf4daea', '393a3cf1a1ce2e6de5955e85605b2e61', '346ba06d2fc6ad303d2c25aa03c8f309', '5160a38cdbea74be5e569b767f025e01', '0a63c38ccede65ad5788d469185fc315', '940904b10cb465abe65aa7f7fafbe2a3', '22333feb1c9b1f02fa531f878c1970ae', '9348b0de17a27622d52d6c71fa12d419', '568982eb42078f033b3ea568329b6c0e', '56a842911136ec4628584bbb915f8f4b', 'f33db0c82c1d8924f035a0ea3b2e7ca9', 'b4fbc993f786bed4cec41b356919f2e3', '6cc5cb34c7f90bd870c027ece3a15fad', 'bcc2b44d3cd1f9d2b8458c595fb9b02d', '269c71c3a7cb517e2d21f20bad0714c6', '513b44a60a7bfd9a8c1a5a2c49ca185e', 'e05c7b866e6e8e7394ca5b1d02c20f58', '9652a10d1629156e0789bb630e081f9c', '8a07ce1347bfecea1903820b2b76e0be', '9e560864413aee1abddcde454e778e94', '62beeffb51a33d06b38e2da8d5f2d051', '1d6f6841cc1de15d208daa6aee88e2ad', '47c9781f39a732c3643f6c4e600e19cf', '8816ba131c25d99edd0cdd75c07859b1', '07bf09806e2103ac74dac0afeb33bf6d', 'f7a936a8f212e039b7fea6867ad7f010', '199525a09762d6e51c1c7f4ef55c8926', '6aa56327ff9ef7983d0c691aca0ef1ca', 'f29cdf5317bffce5648a21766b6b5f34', 'fa220df5ca6551e99016b9d5897b7915', 'e9213ea8f437aa9d320e0e1271c4a1e6', '51482d141bf1dc8a92bc294563992ef0', 'b829d2d9b0b73662c3bcfaf032bcb1e2', '55540a87c8cdfba5615d4079484901f3', '5b53081a25e4d66f353d2229a0564c65', '1f993001eef14eee827293e16d218ca6', 'b0288182991527078a0693bca89985b8', '6b599fe2bfc9db75aabab13175652d56', 
'e8c7b7a4bf4390bf03ddcc5d4215dba2', '0c0856b3f01f7a0a6d9fbcbf4253e190', 'd45962a733099ddc701566f0d33556ef', '025be63b5d1ae501b268711832cc9665', 'ba71aa394247a7a27230a2be489ab420', '916e520d5f358a54ebcdbba0179d8843', '6095a61aaf59fb5bcdc88fe1ad8a78c2', '84b51054f1131067758bb1137d812b78', '1823baf623fff2394d985f4f6c0e6d74', '2b5c43f2fa8f0b133535cd3ee6c6c695', '7dcb8c84aee27fee41f9400c33ff1551', 'eae565378fff9055ed48235add42bc15', 'ea1ccd1847912ce2cc57d7901a0d81d7', '2922376d0667c1809e159264aea169f5', '0c0bcba0c40adafd29e2cf47ec4fb544', '85eca4e62686b28864e1f6e20ce8d683', '1a78051dee60dbe7d688a3127f12544e', '46b1520f1cbaa923ac8cf85feadc9cf4', '49655c66704cf856ca2dbbc77af22586', '7b107a90c7d45209b9b367a08feb7708', '4361de87fcb0cfd740d35212aa7c7211', '1e8f4d2acdb3d4a94ce1205e0326f9f6', '9f0e04bd2d29040421904442696311b5', '2bb484a925b6a25f1277b7008651cf92', 'db5bdd3d97d53f257ac6afb8fd78f462', '63aa5757186c1def977a118d31445ec0', '9864da59e74afac4d219fc65939826e5', '4ec499632cf69a6f75f630f09fefa1dd', '6fe5bea03a70ebb09564b74be2555d7b', '6011873dfea515990a882d38539e126d', '93667e0ed1442c25148ea330f86e97ba', '5c4a6d35244fd3ce149b1008535a7809', 'ccd1c08880fb15e70d4b1aac0063f1bf', 'ad408423128850292ffeb29b163cd78a', '55051865aa377bcf48acad6782479e97', '11664771aa54bb4138078077243e6c3c', '3cbedf02828b82d680371b91c7ec53fe', 'dfbd4c5642fcda3f5008590ba629c9e6', '6ae4cab1d0684afaea997a01a0939f71', 'f6af9cbc4f43f3a708e4cbf20e0315db', '8b03abc22d2051cb7740ac9f5795346f', '66d8156c3e21d105c697555f7952d5db', 'c95987f619409f89814af39a83d36e4f', 'b2a375e7fb7dd6883b7563f7dd915624', '4ff8af2020275408284cf75a8bbb94ec', 'b82841783bc6ea570cac350a031f462a', '35a9f67c975bd2a4d3b79c97df499293', 'a3a91683dc5cc0f98037c23b8d962acf', '3d800b30aef8a7da60395c92a0369ac7', 'b6ba5044eefaf39648286d57aee10803', '1f72bf63e2247500e0c6d674f5f1aab8', '4eed0c5d082e7207ab2083b26db70593', 'a69b1206b162095ea6bbb35fb64ef57e', '941af762cfe86db412d9b995b33277ee', '7e063e5a3613f993d9b7bcfb423bbda6', 'b6db70131e6355a13a741fd803d8ff95', '4db28b9c8602a7a17e9b9b80e5d090ce', '60cce41086fd0866c296ad09e4a9fb5e', 'a8bc849f5a64ea13fcfc139b152fe03d', '2df3055029fb0a7eaf3315d35853d756', 'df0cba25540dc2d66bafa4af3144190c', 'c586059099b151b8fa61c7483717cbc9', '225018e18cbf57b20a80c746e7861594', '9c1a4c9d0b5a9d47e4e053eb2d420407', 'f712c1df0637ed68819cdecf4617716d', 'b0779a8ab51a8f8f8bff1d2f35d5a6e2', '1cd6de23155f859461caf9c0458f8f87', 'e881b7ab3d0a47237fb5b1c1fc23376d', 'd54a17301e302785439f9dde6f3fc208', '9d4519587afe7aafb8f21dfc1b0d0996', 'b86f86f863585b3356d2304b7e3c62bc', '1a78fb79fa2a8982c87f21b8bc533615', 'df5b0524ad802d413159260557db958c', 'ee59574e746db4c025602b8605420659', 'd9bba8a3889fa3dac1b8e41e0146563b', '2fff98ca52fd19ef92a48c52029606e1', 'e5f2ca84b9f42da0c5a9fd90164ccad9', '6152b60d67077a228acebe412e1ee849', 'ee2a77ce843d24362cabb427c958871e', 'cce910d6457d84f18b53b37624e9be86', '62879b753d9ceb7931a02255fd59738d', '752560c49d508b3109ccede8c466d331', '2e705213cc37d836360a3eaca1137239', '8b524012052a7f83dcbd6747d17057c4', '5ab6090e8225c3787e56132e2a607d68', 'da4dd2ba6768317321515fcc645867cf', 'b53228aa58bd0a7218eb65b470870fc5', 'e6c4a523e241b25f3ddbc21333d93183', 'ac5e9a8968f532cccd40dfa75105d883', '3917656e05e2824cf5c7d49807daa559', 'afd9c8e6cf5e853598a4b093779381e3', 'ddf2dd4a5343a76ae274b951a573bc20', '1638139e889ba19e05813f7c63a62313', '5efc169c7d3e7f7319189a86f88d18e8', 'd8825e8ec4411927082d088ef0008d54', 'bc48d54e378c819bf0ed83f614d2eead', 'dbe28073723a939c538e46d36ac858ca', '3610e843650a8876095bc5510280dcf1', 
'ae84f2816f15df41129e2c3baaf78063', '63e890c53312ed84881061fa597e1173', '398aa804f95674fe47027a3ace7b8a1b', '5b212fad64feba45b1aac1e00e5ad5d4', 'd039fdd87b3227982f2b8ce698849d47', 'de630b5890582133d146be37b8f7d034', '3f9be74ba902086262dce25092726c8d', 'a57a45a928db8a107d431931d1b63081', 'ce608bb27a6e97f47e81e6b285621d82', 'c2d1d17f441fe69a98dee9e647dc1d11', 'a02b340d7c63212f2a77f7f98fa53637', '2486853af95400d8af5f9eaad8264d30', 'a22cc150e903605e28fc5770adfbda9f', 'f8b7020ca42524815eb92b9e4b4cdf4c', '728b7cd0d354f32d64d47b364b35d813', '26a1457116407d0ff002d2aea64e0703', 'cb8868533691a6f0793cddc3576390a6', 'f816baec07810f53f1aa0bddad3993ea', '807ff24320ec7286f132a001f49aa297', 'cb42d83f38d352f4fe9caabf460722d0', '9c42c41c65c68ff4d10a5bf85cc586a3', '5b3607941af2d1290240044796d36a5e', '4f4418d5d81f8fdee4d8093a7bf92980', '5025836c4777bf0b7f56fd9f727ab568', 'bbcc8eb3b942fc2547e43e15d3b2d66d', '203726a51ed642b922e82a97f7c68651', '4e96ddfa5985f50356a0264498a42444', '59b9c3f86716b3f59175f1b839e324be', 'e27841a82df6187263b88bac10c5793c', '5a36cf95bf04a713652359ff744518c4', 'a4452247cf772a8f493ed45be2146279', '4ae391cbf0fbb6f54d4cdb79e40b3014', 'f474f0d3f6886b50246f2a74969d3cb9', '88f6107830f5e3d8413a025248e1e1a4', '6b24fa9df1b373a9d71192a8758e85d2', 'bbab47202e290663b1159185a7fbf016', '4f84706a5b704fb07a371f877021fe70', 'edd0fc3a25e58af6c5a94cffc5b53512', 'a7d6f138c779a0cf4917bd802663fe1c', '39104ba8977eb63a7cda6d6b13de2f96', '64bd5389adeb465253b181d838e5b68d', '7b748d57587bebad3762a865e45cf8e1', '69e7e1191bd2466b30f43c7d4c38d665', '1b82531c05d3135a5d1c7ff780454ffb', 'f9cba099a2686b06d37d17ef17239f67', '5586215272cda7ca20de7960ba716ac3', 'd7e7f51b5c05eebc51be0667d6786bc9', '0adf2d326855f05d198dc27e32f83e6b', '9cd29923f70f5ee2a346c2f85b6c0c70', 'c8ef92bff62daa7573e81decb7eeed7d', '1fc7365a9797e6012b4cf394a5f14627', '5aa0c0bf9466f8295f6e9eae1b2bbc3c', '5ff4f20594b9a376ba23cb3931cb3751', 'cfa5d4ad00c3f173563848c3abbf9f0b', '8b1f55cf26ef28727d3e98446c014dc1', '019d46639c173a9a95d4516cf4f0e46d', 'b98dae8f5c65cff8179d5dd2406648e6', '374cebaf127949e6f7ed7839c3656fa9', '0959dffe01941688e8408f5d4a7b0384', '07da3d9c550c65237902e2f2ae292914', '609a8cd3a608dd5c64b6823eefa5663f', '82d39c8eacd784def6cf14c8f849a679', '70582abf441e432c99cb13b40e1c4665', '37de9bb8ff75c3afdf94b71b9a48d4a1', '7b157fe80fb7aa84783db5d5ca0c1589', '00906c9fd0920b8ae1cecc38bf92d29b', '27c7c6af5658a77c7066c4832225577c', 'f755871faee0d36e43adcbc8324be2ef', '812eedc1f31d984ba60b2e443045cd77', '9bb137ff6aed857c6756063fd415518e', 'ab0ee37a6cd76cbaef1e38cbec044c88', '30820e2aa5a6d375c933e7185d9b6899', '3bb9b4d9bb761a655ad434180d384017', '8737364460c311e9489f2efecb858b0d', 'bd50fa4aad7b18da1e0376b1148dd66b', 'cbccbb39823fad6e408c746fec35968e', 'dec302dcb6372b436b70a37f2b92a2c8', '2598956a9f17ff8f12aecd8091b20d65', '95a06a96871131c62c1ae37356fef1ec', '0731f13bab833224c2fec1f4281ae2f2', '259542ca8b02d16c6df1489a8b2e8d4f', '9c875e2d5bbc5976db7deb1c5ee79fa0', '58e490450309513c0b78302827014dfd', '3c2c076905a292801eedb7550959b5db', 'a2506e16e44f432807654a94f913cef9', 'fca93e9a8540078571a1f16c0986d14a', 'c1bb5a2deb487a453c394c526ad1acaf', '9fbc30cde5a73c9a4c9ae2ec1875b1f6', 'a51cfc65bfe9f541bcc901decca81b9f', '6b16de115c6d63bf08c82931d50ad05d', 'a486e66cc48a344bc3ad9bcfdf4cfc37', '94b95dd371e0e8e70fe3745067567e66', '6c9c93cface745b241432ce73c76bfb8', '750ebc1df0359555bd5a10313880fb3b', '41b7b5c2056aa9688b8d2c220ebb2b2f', '35cdeb7d3d5b011b816eff9f99dc223d', 'b57075c4d487a10f6212b1cd470f1cdc', '89ee072332c37d47ad58d1ac7865c92e', 
'248a5c9213b4e959291792f0bd7241a8', '951b8b2ed48aec6ad14a335cb2e63d91', 'd4fcfa0666a39af97969a29b0ca002d1', 'b6594604feb3757adea8657e9b0fcad0', '061f7bd9fe6ae22b8b07f16fa3a3cde4', '18d5df1fe3fbfdd3bf1ffaacef35f3a1', '479ab4e179ebab72a9d2963afe8018ac', '5281c51e3be66ddc921277bb1b9657a4', 'df73e8ab1e7805c4497357162c38d6fa', 'b5fe04d4ea147e3e9de780f0a88fd71d', '32553a298851afcdb65adba991384ea1', '1c4b63d48bfb2eb6772e551aff2b287b', '3cfce1dc697fc9775f7b80beb0fc4d23', 'e90dff2cb72d52bba7128d2e90e79eb1', '932fd4c716a5226b78b80f208147021c', 'ba41403971e54618c894007859384a42', '108e66d5039e1bacf6e69141f23f7a67', '8d472c1214f5c9224c74f2917f7c80c9', 'abd6f49246d4181cab90bf2da64da308', '5ceceb62de9d399e702c3006c5e3f1be', 'dd838c540ca88782f3fba00e934f1af0', '245456396bf0b2fbc271039160f22de2', 'bc2f1dd0b53c59269ce1bfe9a0b84a69', 'e8569569549580e62aaf75b8f4a759d5', 'af772af5e47050b906b8dc5f71efa76e', '3d20882750f46c89d64300fcb45424ae', '39be757346ecd48bce9e3b9ce4b2bb0c', '81ade189c6fda6a6d405a240f37989a7', 'c41c73d21bfea0ade03346b169864391', 'c0a55a31e49e98a9d6664e05b8702a5b', '47520892700b208136b0365136789078', '5d3a6e952fb8cdde9b918b34bf626e05', '3e4db1b0ea7a8d6499c71e2851537f77', '1a2e68f56d7dae8e4369b55d3c82ecef', 'bc4908c241186bf2397a60837fd601b8', '64babcb1fbda40238397712e62bd418f', '56b56e26b932fab51fa35f2946788353', 'a053819838a1dc21b8f1f2761a476029', 'b4f5a15df471762b3d3f15e10ada7171', 'd629ca20350bf128696d0086e8ba1762', 'fc2d0d092e23c52a0a219e26554eb40e', '8518868ded7e9a76d1e21914656225c3', 'f7d55a23b8f1ad646d77ac69526abb22', '767f3c58c6fddb6fee0c2b03d61499e0', '64910b54509bb37879cb61a133434b15', '0d7c0dedd986f0062147701f65253c89', '0d0a7d1e8836bc96e0ddd6c072535c75', 'f573ab803966a22b6e2060e86baf9803', '7258c9222dec4148e4e9bde4b4f6afee', '30d269e7cbc4ba4586e7f8c9b84ad5c5', 'b237dc219aedbe90928a29c700d08d60', 'a168bc2802e792320cfab8ed3ed5f5aa', 'd34f25ea05cf09e53a6eeb621249eaf9', '82c63353ee1e8aa779afa3e6c7c2aa2e', '09ae7cead00b87c87234745a7155faed', 'f37ff2114d5a0078c0d782a4e9cf2646', '719101222a537d7214776067541b7dc8', '8d583b2ee0dc364257f7cb5e3e9571ba', '05cfaef20dd799bf7b135f8486c56922', 'c431244d360e3d09a87f24c87ee9a9d5', 'b119a39d90795c2d8c58cc525740b2de', 'c49d9157b44086dad352d2866880eff7', '975b8049ed717e42161e2ea4b3ebfc1c', '88cbb6e64e123c3da8f5eb64908e041e', 'c89ee7997fcecf5103063d18412a2159', '2402d0b852f2719838715a5483b67f90', '6972ad489459106c6a38b4ceacb1fdda', 'a583b5daaf6a730d836371b2dae4df22', '6ead8a81311508ffab0ac0128482c3b9', '904e9ee3a6001249cfeb13247082c067', '0b7f097bfc132334cdbb3bdcf078c82c', 'd8b4faffdb0a557a6bea5ae0eea4053f', 'bad4561e5cfdc3eb113144c08b3253a1', '7fd2e5f6832bcf6975b247937d175718', '171703ad36d950b4a9fa1de65627d3cb', '4d32efcbb63b326f72c6a055a2505625', '653fd442746b1d58061c58f1131ca39b', 'f48a364c29a0a0353284e4a6b6471471', '2cd2e01cfa43e6f8e6ff9b72fa5bd931', '06c9aa1712329ae572986b32b0fe0343', '1b7d61ec7377d0608370f25469477f62', '41f4e509d5096168cc118de4388fcbf5', 'cd915974bc6420f03e825049c84ff7f7', 'a360bcce8248c707faa9a3a8abdda997', '2157d4fbab685a25dcce9534d03c6d41', '9f8a677c3c6fa1509caa703ef7916c35', '28aa921921dae3d9dcc83516a340420c', '0b8c1356537d121dce47ff385ce91450', '2ba48182d48f83af1b96a6ac34d80dbb', '9f220c00fd0003c0b51b08dbaf5d2f2d', '17465d16e0c6a6addf57547fee84bb1a', '3ba711f77a46f743aa6deafce77f8576', '142a8c532b54289fea4a7f1687998772', '575ec950ce1ca0684b1add581cb0c995', '94aba9c9160c90cdf98f0af4fb203ce4', '9ed041c2bdcf38442c449b9e1b18535c', 'e7a108a395663e07775218a03583ed81', '94add8111457f05850e8bd5d4cc3a7a3', 
'5054d386528dc4ae36e845a1cc3cdae0', '666193de419791b49959af7932f5c8c2', 'eb6d9abf15d5220f992a9ac27a5001fe', 'f46cc14603a04178db39532dc15c176b', 'c3813cd5a191ef574003db15d615cbb8', 'e3fea6941cf58fa9b4e898d8955649dd', 'ce08c1226d445a18b7ce2ebc80a7096e', '8a8dfcf2c4cd350651bb8734955765aa', '9634bea378870bf33a7e347fdabf2456', '39b5f0e9e1a1f50e97c7a9789d5e2af0', 'cdac30e8a60ac5f771c987518dca38aa', '6cc2d361537457056399f96e437316ba', 'adfe05f999905e8b3341d3a8ba3d63ad', '37b0829936bcd9934a15eed60fa8532d', '496e10c729a3437c0654d4b5d4c09d82', 'd9c96f6ee613abc7d7949c3377681413', 'f0cb10da92bc2083159e443319216a37', '15045d3e23344df09a1d8c005bca41d5', '15d02f99130c0283a580bed0bed78275', '2d15324eba0e70ef4a1451d5321e980e', 'c7d3b831381fb5e75f144433358493b6', '9d4d42de8c64f7c79fd82fb3bb8960c4', '4bfe10874285d8d68486b8c991462ac8', '200c064f3fbf08ddcbbfb2f36118b066', '8eecc61b4ccca9910892a9c267d35d07', '41c698f0fefac0e702a03bbe38c755ee', '9221ec3ed5a0cac9cc211b1f76fc2642', '835f40e6144719e73f26d260b8ad6c80', '463d9a27874682c6001c748196070b22', 'a236d75f2c03e16fbbac2825d42eb914', '83ad78f86297babe27c2ea9cb47b4675', '421f72e8a492f524b696dce4658879e2', 'ebff992242dcd49d4004414a94581574', 'd6b7c0a83da2af88d584dad256dd08e0', '6a559666d4f4561014c97c2177068f78', '67d0e885fbbd04b3675512bb31d1a0e0', 'dd8ecfa0c5e14fc9dfa9903f3501a07b', '590ca279d64e0fc64a5a1f87d91c6bfb', 'fa457b5f7729ecaad819d201ffc49784', 'cc36fbe3a57fa2b04738ab2a7b0e5e12', '864e494bca15d43d2a036610712c0180', 'f8509fc848701a8f2ee6c0c0a59936b8', '903a785a05d396bb9f3b3b1db1e59d7e', '87d48c0f33a9213c9c252535a399ecf4', '3deb452f47e2dba10f8baea299721ca9', '12d512a24a5210ceb6074e7bdaebc227', 'ae97d323473045ec06cfc1fd165808ce', '88199cab3584d87a46150bda69379a9c', '09be0cb04e6fd9080bc7ccf8366373ee', '2d81d266eaf9148fdf177891eb51d708', 'b3d89252b2dcdf7ab1b6ea69e05b4e7b', '4e642b8d7055956511fd63978aec8d0d', '3b3b42273065160a4cbf88d5507a0dc0', 'b5073b22539ea0a59136f8b540c95b02', 'bd723b441e4de6d16bf3ef78a0ed10ad', 'a65ad70ea91d2a1af991e304fadd62a4', '6f451320b2bafe1e730a8b6e2bd833a5', '91dfef441587afeac249429a4dd44cf8', '38a8dcbd0ebfa6961a368f4fc17145ab', '8fc484b27aa36a70fd67edefd9e2477f', '6b4b817b770b8ccb189a2597c694506b', 'f45a2ec2f61ca00a1e099f97f316d79b', 'f53603d80e75c2e3be16c3feb36a539d', '531d0a1f09348b04f5dc54bf84583cc1', '879bd82b192d004e0fb657469c76fc51', '2b50e856f51760e84318e7092fdeb0d8', '2e8f9fc2dcdd0dd7752ad2d268566b53', '80cd82b51fcebfcd90ee3af11adc09e1', 'd18e62ff9a47263dd6ab7c3087879692', 'e6c1afcc7bc65980b80bc5e921bfca06', '806fbd7dd888ce61828fe4395b6f4e75', '1bc94726d834ef7b9535d51e78bc8c55', '09598adef2692a5ecc76bf5e258c2ec1', 'b629be66e8e45a28ed259cfe274af605', '28bb026e961ba92d5f792df7039555ae', '7e28b3a432db2f50621c6fe42f2f36ba', '0d828e3d92683f430d43c51c9fb5b54d', '84536d2896d31e74666b6aba27aae014', '3da255cae775b060a22dc63fda61fefa', 'eec32b35fca1c7403c1fea55d39afda5', 'f44a80af3c1bd429a946f4a274113e6b', 'e64d2d6c8000833d9b5883073485ff03', '699193b16271dbb8b49f218310243030', '1792cda8cff53c86561827b682b91bed', '5f862b1fa2ca69490d9bbac6ceae0337', '57bd90ae2a361f1382fc725fef0a0921', 'd382ebbd528354123e59a86233c32b06', 'd6d2c4d555de208ebbbe6c82b0aafd17', '8fd7fabebed71969fbe86d7364c8861d', '1be229c4bbd1962054419c0afef9eee4', '8834cc11931a2e49c5e0d54e9da9aad9', '743729efb199a6558d0899195d8ca740', 'f7bec41b29d58c70966f97ea9b52bb3e', '0190b8458cfbd28683eb07811a7a7ec5', 'fd816aea82671f71e3eb67e977af00a0', '6f4f1114bd7840ed5e5d756ef8c6432b', 'e9b5c43b5fc7642377401d25a9bedd2d', 'a35a63e506a0b73d125ff0080bdd588c', 
'1b84156caef23d63398ece07abe5929e', 'de14cd74a8e95268187653ec969312c0', '4b3e0bfd44175b0cae7c36b9e8e519e3', '104a4d3bd3aeeac50d2cc2fb4493bce4', 'cc1d54da1222b9a544b08ff4f645f3fd', 'c80255158cfaef8697d6c6bfc2362341', 'c87009f5755a9b6aa356a1935a57e5a9', '36eeb567a2ea72c1b54cdd9d46a6647b', 'c5ab8758f3c8796ab006d65be6fc25f8', '319f3b8aadfb71e710e070af1bb1047c', '02c93ce8c413e2e8e77975123aecec49', '91ee38d3951b17dfc4c7bca94b6d6dd8', '2e9de09614d9065d6dfc2ae16d4268c1', '57bba769688fb4372296000214954f56', 'fbc81ab85f91aa162fedb88547abc2ce', '05bdc47cee2f106023cbd8617b0997c4', 'a88fd0271a539c09d0c906153736e111', '466e95e1a82f48dd1fdc2a119f8df9fc', 'e580bba0a66774889eb04ca5ca6f8e37', 'd0668af748d1fd0ae7347eb90a5d4fcf', 'be84c54af594465edf6dede1977dee38', 'b582dff2a2b91f406b14ff64e032c72e', '29705fbaf540d6f6b2f8c67313b3070c', 'e3160345920372b267101bdae099bc20', '0a0620ab140b578867661ce0f0abf856', '0bfbf291188a05a0967b5ec02f574357', '98c5511cd321f13f7c1953986475c8f0', '51f0ada675f84142440bc7e56caa57ba', 'cea5cc20e8b7e2c2de0deaa3fb32a371', '7d7212ceeaa7cb2795038bc79da98159', '7d3db7859099303167c8396310a79fe8', 'e37bf85f107b2ab0fe65cc84a7ae103e', '70eb3bc85ff6e3914131f5a6cd81834c', 'b7ee2b49525d85788b253a1508efd4f7'] ['71d7a66777628e3f9b8b85270366166c', '9032f491fd4ef429d9c18b3b015bd11b', '776e27836f8fbe84f60a850952498988', '8d4086520df91f54eec6a4accf796569', 'c0a8f1d99cd3b7ee60c21d1e9b98125c', 'd27c13871d62e88714fdaf539e50382e', '516686bcd8331a26428556e346c377c5', 'a8f2bbaf928d9834c8257188ca2a0780', '98603c2f4ff85046a071701c2659295d', '867833a9097a0838ad3a0a9df0f7a4a5', 'ecf80f40365366e561ce1b900a43c8f8', '761000dc2471950ac51f58d60694e78a', '38959f4d0d0d96f0f567b4e8c3e24809', '0f4bd20ef527fa6d503f0747f91247e8', 'c6c616ea37c4f4e9f753a1a26c1f513f', '1dd1899fc4c0293f6bbb95af80cc39c4', '1d07fcbe2a6900c4e4ea4409ae2c5270', '568d6332f094e839329a4d66d30dc6d0', '6d9ed75aecea4cb4e6298bd638ff4afc', '0b0fd757eef935f9609cbfab1443840d', '8031c4b9f388c424ba2ed4da91f00173', 'a1c8fc922eac56a7bd275565defa9763', '07f44d71c453c52f9f055ab877b909cc', 'a71a739c154aa77fb2800a426818df5e', '998fe36c4f8131243b766398f5801c22', '5db6785789345d1638b1046fee3b4c76', 'fb85950dc8effebfa15c9555f238d7ec', '927893d2acecb4415e1f68f923dce782', '1cded34a5dfb786368208f632e5a5d6f', 'e180827dbd190aa6e9651be5d08f7230', 'e243fda5f1cb39ea6c810564c025638b', '37710e7c0449baedbea4236ef2306da1', 'eaba6c7538a5d6f7f993e1bed678db73', 'c062f1a1e198a4da90193c5e06c484c8', '57cd65ad7d3909ce9b8fe9d4e2fdf709', 'f571b343e99c24e012804a5b04a591f4', '97ca93ecb6d659c2af7e71a269ad021a', 'bec4c14ba02a32fad812ee94a98c1602', '12f506bf952aec9b3d8289a62ded85d6', 'c396496b481e2685a7733cfdbeaf925b', '932b64b6d7fa0605031eaa6f3c63eea5', 'b00b0b63b241943957b48e9a4f9b7c27', '085fd00ce06d27d19f40b330c7d8ba0d', 'a6a55291b82d26ed8852361a80575e89', '0f1a741eea4c4cbdba9d46dd3d762e2a', 'a6f2a358b6a2a33fd8c44830293b8323', '6c5bebf8e67ea0562455656f4132dca8', '27c8850bb21db8dbccc40ebadaf4daea', '393a3cf1a1ce2e6de5955e85605b2e61', '346ba06d2fc6ad303d2c25aa03c8f309', '5160a38cdbea74be5e569b767f025e01', '0a63c38ccede65ad5788d469185fc315', '940904b10cb465abe65aa7f7fafbe2a3', '22333feb1c9b1f02fa531f878c1970ae', '9348b0de17a27622d52d6c71fa12d419', '568982eb42078f033b3ea568329b6c0e', '56a842911136ec4628584bbb915f8f4b', 'f33db0c82c1d8924f035a0ea3b2e7ca9', 'b4fbc993f786bed4cec41b356919f2e3', '6cc5cb34c7f90bd870c027ece3a15fad', 'bcc2b44d3cd1f9d2b8458c595fb9b02d', '269c71c3a7cb517e2d21f20bad0714c6', '513b44a60a7bfd9a8c1a5a2c49ca185e', 'e05c7b866e6e8e7394ca5b1d02c20f58', 
'9652a10d1629156e0789bb630e081f9c', '8a07ce1347bfecea1903820b2b76e0be', '9e560864413aee1abddcde454e778e94', '62beeffb51a33d06b38e2da8d5f2d051', '1d6f6841cc1de15d208daa6aee88e2ad', '47c9781f39a732c3643f6c4e600e19cf', '8816ba131c25d99edd0cdd75c07859b1', '07bf09806e2103ac74dac0afeb33bf6d', 'f7a936a8f212e039b7fea6867ad7f010', '199525a09762d6e51c1c7f4ef55c8926', '6aa56327ff9ef7983d0c691aca0ef1ca', 'f29cdf5317bffce5648a21766b6b5f34', 'fa220df5ca6551e99016b9d5897b7915', 'e9213ea8f437aa9d320e0e1271c4a1e6', '51482d141bf1dc8a92bc294563992ef0', 'b829d2d9b0b73662c3bcfaf032bcb1e2', '55540a87c8cdfba5615d4079484901f3', '5b53081a25e4d66f353d2229a0564c65', '1f993001eef14eee827293e16d218ca6', 'b0288182991527078a0693bca89985b8', '6b599fe2bfc9db75aabab13175652d56', 'e8c7b7a4bf4390bf03ddcc5d4215dba2', '0c0856b3f01f7a0a6d9fbcbf4253e190', 'd45962a733099ddc701566f0d33556ef', '025be63b5d1ae501b268711832cc9665', 'ba71aa394247a7a27230a2be489ab420', '916e520d5f358a54ebcdbba0179d8843', '6095a61aaf59fb5bcdc88fe1ad8a78c2', '84b51054f1131067758bb1137d812b78', '1823baf623fff2394d985f4f6c0e6d74', '2b5c43f2fa8f0b133535cd3ee6c6c695', '7dcb8c84aee27fee41f9400c33ff1551', 'eae565378fff9055ed48235add42bc15', 'ea1ccd1847912ce2cc57d7901a0d81d7', '2922376d0667c1809e159264aea169f5', '0c0bcba0c40adafd29e2cf47ec4fb544', '85eca4e62686b28864e1f6e20ce8d683', '1a78051dee60dbe7d688a3127f12544e', '46b1520f1cbaa923ac8cf85feadc9cf4', '49655c66704cf856ca2dbbc77af22586', '7b107a90c7d45209b9b367a08feb7708', '4361de87fcb0cfd740d35212aa7c7211', '1e8f4d2acdb3d4a94ce1205e0326f9f6', '9f0e04bd2d29040421904442696311b5', '2bb484a925b6a25f1277b7008651cf92', 'db5bdd3d97d53f257ac6afb8fd78f462', '63aa5757186c1def977a118d31445ec0', '9864da59e74afac4d219fc65939826e5', '4ec499632cf69a6f75f630f09fefa1dd', '6fe5bea03a70ebb09564b74be2555d7b', '6011873dfea515990a882d38539e126d', '93667e0ed1442c25148ea330f86e97ba', '5c4a6d35244fd3ce149b1008535a7809', 'ccd1c08880fb15e70d4b1aac0063f1bf', 'ad408423128850292ffeb29b163cd78a', '55051865aa377bcf48acad6782479e97', '11664771aa54bb4138078077243e6c3c', '3cbedf02828b82d680371b91c7ec53fe', 'dfbd4c5642fcda3f5008590ba629c9e6', '6ae4cab1d0684afaea997a01a0939f71', 'f6af9cbc4f43f3a708e4cbf20e0315db', '8b03abc22d2051cb7740ac9f5795346f', '66d8156c3e21d105c697555f7952d5db', 'c95987f619409f89814af39a83d36e4f', 'b2a375e7fb7dd6883b7563f7dd915624', '4ff8af2020275408284cf75a8bbb94ec', 'b82841783bc6ea570cac350a031f462a', '35a9f67c975bd2a4d3b79c97df499293', 'a3a91683dc5cc0f98037c23b8d962acf', '3d800b30aef8a7da60395c92a0369ac7', 'b6ba5044eefaf39648286d57aee10803', '1f72bf63e2247500e0c6d674f5f1aab8', '4eed0c5d082e7207ab2083b26db70593', 'a69b1206b162095ea6bbb35fb64ef57e', '941af762cfe86db412d9b995b33277ee', '7e063e5a3613f993d9b7bcfb423bbda6', 'b6db70131e6355a13a741fd803d8ff95', '4db28b9c8602a7a17e9b9b80e5d090ce', '60cce41086fd0866c296ad09e4a9fb5e', 'a8bc849f5a64ea13fcfc139b152fe03d', '2df3055029fb0a7eaf3315d35853d756', 'df0cba25540dc2d66bafa4af3144190c', 'c586059099b151b8fa61c7483717cbc9', '225018e18cbf57b20a80c746e7861594', '9c1a4c9d0b5a9d47e4e053eb2d420407', 'f712c1df0637ed68819cdecf4617716d', 'b0779a8ab51a8f8f8bff1d2f35d5a6e2', '1cd6de23155f859461caf9c0458f8f87', 'e881b7ab3d0a47237fb5b1c1fc23376d', 'd54a17301e302785439f9dde6f3fc208', '9d4519587afe7aafb8f21dfc1b0d0996', 'b86f86f863585b3356d2304b7e3c62bc', '1a78fb79fa2a8982c87f21b8bc533615', 'df5b0524ad802d413159260557db958c', 'ee59574e746db4c025602b8605420659', 'd9bba8a3889fa3dac1b8e41e0146563b', '2fff98ca52fd19ef92a48c52029606e1', 'e5f2ca84b9f42da0c5a9fd90164ccad9', 
'6152b60d67077a228acebe412e1ee849', 'ee2a77ce843d24362cabb427c958871e', 'cce910d6457d84f18b53b37624e9be86', '62879b753d9ceb7931a02255fd59738d', '752560c49d508b3109ccede8c466d331', '2e705213cc37d836360a3eaca1137239', '8b524012052a7f83dcbd6747d17057c4', '5ab6090e8225c3787e56132e2a607d68', 'da4dd2ba6768317321515fcc645867cf', 'b53228aa58bd0a7218eb65b470870fc5', 'e6c4a523e241b25f3ddbc21333d93183', 'ac5e9a8968f532cccd40dfa75105d883', '3917656e05e2824cf5c7d49807daa559', 'afd9c8e6cf5e853598a4b093779381e3', 'ddf2dd4a5343a76ae274b951a573bc20', '1638139e889ba19e05813f7c63a62313', '5efc169c7d3e7f7319189a86f88d18e8', 'd8825e8ec4411927082d088ef0008d54', 'bc48d54e378c819bf0ed83f614d2eead', 'dbe28073723a939c538e46d36ac858ca', '3610e843650a8876095bc5510280dcf1', 'ae84f2816f15df41129e2c3baaf78063', '63e890c53312ed84881061fa597e1173', '398aa804f95674fe47027a3ace7b8a1b', '5b212fad64feba45b1aac1e00e5ad5d4', 'd039fdd87b3227982f2b8ce698849d47', 'de630b5890582133d146be37b8f7d034', '3f9be74ba902086262dce25092726c8d', 'a57a45a928db8a107d431931d1b63081', 'ce608bb27a6e97f47e81e6b285621d82', 'c2d1d17f441fe69a98dee9e647dc1d11', 'a02b340d7c63212f2a77f7f98fa53637', '2486853af95400d8af5f9eaad8264d30', 'a22cc150e903605e28fc5770adfbda9f', 'f8b7020ca42524815eb92b9e4b4cdf4c', '728b7cd0d354f32d64d47b364b35d813', '26a1457116407d0ff002d2aea64e0703', 'cb8868533691a6f0793cddc3576390a6', 'f816baec07810f53f1aa0bddad3993ea', '807ff24320ec7286f132a001f49aa297', 'cb42d83f38d352f4fe9caabf460722d0', '9c42c41c65c68ff4d10a5bf85cc586a3', '5b3607941af2d1290240044796d36a5e', '4f4418d5d81f8fdee4d8093a7bf92980', '5025836c4777bf0b7f56fd9f727ab568', 'bbcc8eb3b942fc2547e43e15d3b2d66d', '203726a51ed642b922e82a97f7c68651', '4e96ddfa5985f50356a0264498a42444', '59b9c3f86716b3f59175f1b839e324be', 'e27841a82df6187263b88bac10c5793c', '5a36cf95bf04a713652359ff744518c4', 'a4452247cf772a8f493ed45be2146279', '4ae391cbf0fbb6f54d4cdb79e40b3014', 'f474f0d3f6886b50246f2a74969d3cb9', '88f6107830f5e3d8413a025248e1e1a4', '6b24fa9df1b373a9d71192a8758e85d2', 'bbab47202e290663b1159185a7fbf016', '4f84706a5b704fb07a371f877021fe70', 'edd0fc3a25e58af6c5a94cffc5b53512', 'a7d6f138c779a0cf4917bd802663fe1c', '39104ba8977eb63a7cda6d6b13de2f96', '64bd5389adeb465253b181d838e5b68d', '7b748d57587bebad3762a865e45cf8e1', '69e7e1191bd2466b30f43c7d4c38d665', '1b82531c05d3135a5d1c7ff780454ffb', 'f9cba099a2686b06d37d17ef17239f67', '5586215272cda7ca20de7960ba716ac3', 'd7e7f51b5c05eebc51be0667d6786bc9', '0adf2d326855f05d198dc27e32f83e6b', '9cd29923f70f5ee2a346c2f85b6c0c70', 'c8ef92bff62daa7573e81decb7eeed7d', '1fc7365a9797e6012b4cf394a5f14627', '5aa0c0bf9466f8295f6e9eae1b2bbc3c', '5ff4f20594b9a376ba23cb3931cb3751', 'cfa5d4ad00c3f173563848c3abbf9f0b', '8b1f55cf26ef28727d3e98446c014dc1', '019d46639c173a9a95d4516cf4f0e46d', 'b98dae8f5c65cff8179d5dd2406648e6', '374cebaf127949e6f7ed7839c3656fa9', '0959dffe01941688e8408f5d4a7b0384', '07da3d9c550c65237902e2f2ae292914', '609a8cd3a608dd5c64b6823eefa5663f', '82d39c8eacd784def6cf14c8f849a679', '70582abf441e432c99cb13b40e1c4665', '37de9bb8ff75c3afdf94b71b9a48d4a1', '7b157fe80fb7aa84783db5d5ca0c1589', '00906c9fd0920b8ae1cecc38bf92d29b', '27c7c6af5658a77c7066c4832225577c', 'f755871faee0d36e43adcbc8324be2ef', '812eedc1f31d984ba60b2e443045cd77', '9bb137ff6aed857c6756063fd415518e', 'ab0ee37a6cd76cbaef1e38cbec044c88', '30820e2aa5a6d375c933e7185d9b6899', '3bb9b4d9bb761a655ad434180d384017', '8737364460c311e9489f2efecb858b0d', 'bd50fa4aad7b18da1e0376b1148dd66b', 'cbccbb39823fad6e408c746fec35968e', 'dec302dcb6372b436b70a37f2b92a2c8', 
'2598956a9f17ff8f12aecd8091b20d65', '95a06a96871131c62c1ae37356fef1ec', '0731f13bab833224c2fec1f4281ae2f2', '259542ca8b02d16c6df1489a8b2e8d4f', '9c875e2d5bbc5976db7deb1c5ee79fa0', '58e490450309513c0b78302827014dfd', '3c2c076905a292801eedb7550959b5db', 'a2506e16e44f432807654a94f913cef9', 'fca93e9a8540078571a1f16c0986d14a', 'c1bb5a2deb487a453c394c526ad1acaf', '9fbc30cde5a73c9a4c9ae2ec1875b1f6', 'a51cfc65bfe9f541bcc901decca81b9f', '6b16de115c6d63bf08c82931d50ad05d', 'a486e66cc48a344bc3ad9bcfdf4cfc37', '94b95dd371e0e8e70fe3745067567e66', '6c9c93cface745b241432ce73c76bfb8', '750ebc1df0359555bd5a10313880fb3b', '41b7b5c2056aa9688b8d2c220ebb2b2f', '35cdeb7d3d5b011b816eff9f99dc223d', 'b57075c4d487a10f6212b1cd470f1cdc', '89ee072332c37d47ad58d1ac7865c92e', '248a5c9213b4e959291792f0bd7241a8', '951b8b2ed48aec6ad14a335cb2e63d91', 'd4fcfa0666a39af97969a29b0ca002d1', 'b6594604feb3757adea8657e9b0fcad0', '061f7bd9fe6ae22b8b07f16fa3a3cde4', '18d5df1fe3fbfdd3bf1ffaacef35f3a1', '479ab4e179ebab72a9d2963afe8018ac', '5281c51e3be66ddc921277bb1b9657a4', 'df73e8ab1e7805c4497357162c38d6fa', 'b5fe04d4ea147e3e9de780f0a88fd71d', '32553a298851afcdb65adba991384ea1', '1c4b63d48bfb2eb6772e551aff2b287b', '3cfce1dc697fc9775f7b80beb0fc4d23', 'e90dff2cb72d52bba7128d2e90e79eb1', '932fd4c716a5226b78b80f208147021c', 'ba41403971e54618c894007859384a42', '108e66d5039e1bacf6e69141f23f7a67', '8d472c1214f5c9224c74f2917f7c80c9', 'abd6f49246d4181cab90bf2da64da308', '5ceceb62de9d399e702c3006c5e3f1be', 'dd838c540ca88782f3fba00e934f1af0', '245456396bf0b2fbc271039160f22de2', 'bc2f1dd0b53c59269ce1bfe9a0b84a69', 'e8569569549580e62aaf75b8f4a759d5', 'af772af5e47050b906b8dc5f71efa76e', '3d20882750f46c89d64300fcb45424ae', '39be757346ecd48bce9e3b9ce4b2bb0c', '81ade189c6fda6a6d405a240f37989a7', 'c41c73d21bfea0ade03346b169864391', 'c0a55a31e49e98a9d6664e05b8702a5b', '47520892700b208136b0365136789078', '5d3a6e952fb8cdde9b918b34bf626e05', '3e4db1b0ea7a8d6499c71e2851537f77', '1a2e68f56d7dae8e4369b55d3c82ecef', 'bc4908c241186bf2397a60837fd601b8', '64babcb1fbda40238397712e62bd418f', '56b56e26b932fab51fa35f2946788353', 'a053819838a1dc21b8f1f2761a476029', 'b4f5a15df471762b3d3f15e10ada7171', 'd629ca20350bf128696d0086e8ba1762', 'fc2d0d092e23c52a0a219e26554eb40e', '8518868ded7e9a76d1e21914656225c3', 'f7d55a23b8f1ad646d77ac69526abb22', '767f3c58c6fddb6fee0c2b03d61499e0', '64910b54509bb37879cb61a133434b15', '0d7c0dedd986f0062147701f65253c89', '0d0a7d1e8836bc96e0ddd6c072535c75', 'f573ab803966a22b6e2060e86baf9803', '7258c9222dec4148e4e9bde4b4f6afee', '30d269e7cbc4ba4586e7f8c9b84ad5c5', 'b237dc219aedbe90928a29c700d08d60', 'a168bc2802e792320cfab8ed3ed5f5aa', 'd34f25ea05cf09e53a6eeb621249eaf9', '82c63353ee1e8aa779afa3e6c7c2aa2e', '09ae7cead00b87c87234745a7155faed', 'f37ff2114d5a0078c0d782a4e9cf2646', '719101222a537d7214776067541b7dc8', '8d583b2ee0dc364257f7cb5e3e9571ba', '05cfaef20dd799bf7b135f8486c56922', 'c431244d360e3d09a87f24c87ee9a9d5', 'b119a39d90795c2d8c58cc525740b2de', 'c49d9157b44086dad352d2866880eff7', '975b8049ed717e42161e2ea4b3ebfc1c', '88cbb6e64e123c3da8f5eb64908e041e', 'c89ee7997fcecf5103063d18412a2159', '2402d0b852f2719838715a5483b67f90', '6972ad489459106c6a38b4ceacb1fdda', 'a583b5daaf6a730d836371b2dae4df22', '6ead8a81311508ffab0ac0128482c3b9', '904e9ee3a6001249cfeb13247082c067', '0b7f097bfc132334cdbb3bdcf078c82c', 'd8b4faffdb0a557a6bea5ae0eea4053f', 'bad4561e5cfdc3eb113144c08b3253a1', '7fd2e5f6832bcf6975b247937d175718', '171703ad36d950b4a9fa1de65627d3cb', '4d32efcbb63b326f72c6a055a2505625', '653fd442746b1d58061c58f1131ca39b', 
'f48a364c29a0a0353284e4a6b6471471', '2cd2e01cfa43e6f8e6ff9b72fa5bd931', '06c9aa1712329ae572986b32b0fe0343', '1b7d61ec7377d0608370f25469477f62', '41f4e509d5096168cc118de4388fcbf5', 'cd915974bc6420f03e825049c84ff7f7', 'a360bcce8248c707faa9a3a8abdda997', '2157d4fbab685a25dcce9534d03c6d41', '9f8a677c3c6fa1509caa703ef7916c35', '28aa921921dae3d9dcc83516a340420c', '0b8c1356537d121dce47ff385ce91450', '2ba48182d48f83af1b96a6ac34d80dbb', '9f220c00fd0003c0b51b08dbaf5d2f2d', '17465d16e0c6a6addf57547fee84bb1a', '3ba711f77a46f743aa6deafce77f8576', '142a8c532b54289fea4a7f1687998772', '575ec950ce1ca0684b1add581cb0c995', '94aba9c9160c90cdf98f0af4fb203ce4', '9ed041c2bdcf38442c449b9e1b18535c', 'e7a108a395663e07775218a03583ed81', '94add8111457f05850e8bd5d4cc3a7a3', '5054d386528dc4ae36e845a1cc3cdae0', '666193de419791b49959af7932f5c8c2', 'eb6d9abf15d5220f992a9ac27a5001fe', 'f46cc14603a04178db39532dc15c176b', 'c3813cd5a191ef574003db15d615cbb8', 'e3fea6941cf58fa9b4e898d8955649dd', 'ce08c1226d445a18b7ce2ebc80a7096e', '8a8dfcf2c4cd350651bb8734955765aa', '9634bea378870bf33a7e347fdabf2456', '39b5f0e9e1a1f50e97c7a9789d5e2af0', 'cdac30e8a60ac5f771c987518dca38aa', '6cc2d361537457056399f96e437316ba', 'adfe05f999905e8b3341d3a8ba3d63ad', '37b0829936bcd9934a15eed60fa8532d', '496e10c729a3437c0654d4b5d4c09d82', 'd9c96f6ee613abc7d7949c3377681413', 'f0cb10da92bc2083159e443319216a37', '15045d3e23344df09a1d8c005bca41d5', '15d02f99130c0283a580bed0bed78275', '2d15324eba0e70ef4a1451d5321e980e', 'c7d3b831381fb5e75f144433358493b6', '9d4d42de8c64f7c79fd82fb3bb8960c4', '4bfe10874285d8d68486b8c991462ac8', '200c064f3fbf08ddcbbfb2f36118b066', '8eecc61b4ccca9910892a9c267d35d07', '41c698f0fefac0e702a03bbe38c755ee', '9221ec3ed5a0cac9cc211b1f76fc2642', '835f40e6144719e73f26d260b8ad6c80', '463d9a27874682c6001c748196070b22', 'a236d75f2c03e16fbbac2825d42eb914', '83ad78f86297babe27c2ea9cb47b4675', '421f72e8a492f524b696dce4658879e2', 'ebff992242dcd49d4004414a94581574', 'd6b7c0a83da2af88d584dad256dd08e0', '6a559666d4f4561014c97c2177068f78', '67d0e885fbbd04b3675512bb31d1a0e0', 'dd8ecfa0c5e14fc9dfa9903f3501a07b', '590ca279d64e0fc64a5a1f87d91c6bfb', 'fa457b5f7729ecaad819d201ffc49784', 'cc36fbe3a57fa2b04738ab2a7b0e5e12', '864e494bca15d43d2a036610712c0180', 'f8509fc848701a8f2ee6c0c0a59936b8', '903a785a05d396bb9f3b3b1db1e59d7e', '87d48c0f33a9213c9c252535a399ecf4', '3deb452f47e2dba10f8baea299721ca9', '12d512a24a5210ceb6074e7bdaebc227', 'ae97d323473045ec06cfc1fd165808ce', '88199cab3584d87a46150bda69379a9c', '09be0cb04e6fd9080bc7ccf8366373ee', '2d81d266eaf9148fdf177891eb51d708', 'b3d89252b2dcdf7ab1b6ea69e05b4e7b', '4e642b8d7055956511fd63978aec8d0d', '3b3b42273065160a4cbf88d5507a0dc0', 'b5073b22539ea0a59136f8b540c95b02', 'bd723b441e4de6d16bf3ef78a0ed10ad', 'a65ad70ea91d2a1af991e304fadd62a4', '6f451320b2bafe1e730a8b6e2bd833a5', '91dfef441587afeac249429a4dd44cf8', '38a8dcbd0ebfa6961a368f4fc17145ab', '8fc484b27aa36a70fd67edefd9e2477f', '6b4b817b770b8ccb189a2597c694506b', 'f45a2ec2f61ca00a1e099f97f316d79b', 'f53603d80e75c2e3be16c3feb36a539d', '531d0a1f09348b04f5dc54bf84583cc1', '879bd82b192d004e0fb657469c76fc51', '2b50e856f51760e84318e7092fdeb0d8', '2e8f9fc2dcdd0dd7752ad2d268566b53', '80cd82b51fcebfcd90ee3af11adc09e1', 'd18e62ff9a47263dd6ab7c3087879692', 'e6c1afcc7bc65980b80bc5e921bfca06', '806fbd7dd888ce61828fe4395b6f4e75', '1bc94726d834ef7b9535d51e78bc8c55', '09598adef2692a5ecc76bf5e258c2ec1', 'b629be66e8e45a28ed259cfe274af605', '28bb026e961ba92d5f792df7039555ae', '7e28b3a432db2f50621c6fe42f2f36ba', '0d828e3d92683f430d43c51c9fb5b54d', 
'84536d2896d31e74666b6aba27aae014', '3da255cae775b060a22dc63fda61fefa', 'eec32b35fca1c7403c1fea55d39afda5', 'f44a80af3c1bd429a946f4a274113e6b', 'e64d2d6c8000833d9b5883073485ff03', '699193b16271dbb8b49f218310243030', '1792cda8cff53c86561827b682b91bed', '5f862b1fa2ca69490d9bbac6ceae0337', '57bd90ae2a361f1382fc725fef0a0921', 'd382ebbd528354123e59a86233c32b06', 'd6d2c4d555de208ebbbe6c82b0aafd17', '8fd7fabebed71969fbe86d7364c8861d', '1be229c4bbd1962054419c0afef9eee4', '8834cc11931a2e49c5e0d54e9da9aad9', '743729efb199a6558d0899195d8ca740', 'f7bec41b29d58c70966f97ea9b52bb3e', '0190b8458cfbd28683eb07811a7a7ec5', 'fd816aea82671f71e3eb67e977af00a0', '6f4f1114bd7840ed5e5d756ef8c6432b', 'e9b5c43b5fc7642377401d25a9bedd2d', 'a35a63e506a0b73d125ff0080bdd588c', '1b84156caef23d63398ece07abe5929e', 'de14cd74a8e95268187653ec969312c0', '4b3e0bfd44175b0cae7c36b9e8e519e3', '104a4d3bd3aeeac50d2cc2fb4493bce4', 'cc1d54da1222b9a544b08ff4f645f3fd', 'c80255158cfaef8697d6c6bfc2362341', 'c87009f5755a9b6aa356a1935a57e5a9', '36eeb567a2ea72c1b54cdd9d46a6647b', 'c5ab8758f3c8796ab006d65be6fc25f8', '319f3b8aadfb71e710e070af1bb1047c', '02c93ce8c413e2e8e77975123aecec49', '91ee38d3951b17dfc4c7bca94b6d6dd8', '2e9de09614d9065d6dfc2ae16d4268c1', '57bba769688fb4372296000214954f56', 'fbc81ab85f91aa162fedb88547abc2ce', '05bdc47cee2f106023cbd8617b0997c4', 'a88fd0271a539c09d0c906153736e111', '466e95e1a82f48dd1fdc2a119f8df9fc', 'e580bba0a66774889eb04ca5ca6f8e37', 'd0668af748d1fd0ae7347eb90a5d4fcf', 'be84c54af594465edf6dede1977dee38', 'b582dff2a2b91f406b14ff64e032c72e', '29705fbaf540d6f6b2f8c67313b3070c', 'e3160345920372b267101bdae099bc20', '0a0620ab140b578867661ce0f0abf856', '0bfbf291188a05a0967b5ec02f574357', '98c5511cd321f13f7c1953986475c8f0', '51f0ada675f84142440bc7e56caa57ba', 'cea5cc20e8b7e2c2de0deaa3fb32a371', '7d7212ceeaa7cb2795038bc79da98159', '7d3db7859099303167c8396310a79fe8', 'e37bf85f107b2ab0fe65cc84a7ae103e', '70eb3bc85ff6e3914131f5a6cd81834c', 'b7ee2b49525d85788b253a1508efd4f7']\n", 246 | "Test ids (511 samples) = ['71d7a66777628e3f9b8b85270366166c', '9032f491fd4ef429d9c18b3b015bd11b', '776e27836f8fbe84f60a850952498988', '8d4086520df91f54eec6a4accf796569', 'c0a8f1d99cd3b7ee60c21d1e9b98125c', 'd27c13871d62e88714fdaf539e50382e', '516686bcd8331a26428556e346c377c5', 'a8f2bbaf928d9834c8257188ca2a0780', '98603c2f4ff85046a071701c2659295d', '867833a9097a0838ad3a0a9df0f7a4a5', 'ecf80f40365366e561ce1b900a43c8f8', '761000dc2471950ac51f58d60694e78a', '38959f4d0d0d96f0f567b4e8c3e24809', '0f4bd20ef527fa6d503f0747f91247e8', 'c6c616ea37c4f4e9f753a1a26c1f513f', '1dd1899fc4c0293f6bbb95af80cc39c4', '1d07fcbe2a6900c4e4ea4409ae2c5270', '568d6332f094e839329a4d66d30dc6d0', '6d9ed75aecea4cb4e6298bd638ff4afc', '0b0fd757eef935f9609cbfab1443840d', '8031c4b9f388c424ba2ed4da91f00173', 'a1c8fc922eac56a7bd275565defa9763', '07f44d71c453c52f9f055ab877b909cc', 'a71a739c154aa77fb2800a426818df5e', '998fe36c4f8131243b766398f5801c22', '5db6785789345d1638b1046fee3b4c76', 'fb85950dc8effebfa15c9555f238d7ec', '927893d2acecb4415e1f68f923dce782', '1cded34a5dfb786368208f632e5a5d6f', 'e180827dbd190aa6e9651be5d08f7230', 'e243fda5f1cb39ea6c810564c025638b', '37710e7c0449baedbea4236ef2306da1', 'eaba6c7538a5d6f7f993e1bed678db73', 'c062f1a1e198a4da90193c5e06c484c8', '57cd65ad7d3909ce9b8fe9d4e2fdf709', 'f571b343e99c24e012804a5b04a591f4', '97ca93ecb6d659c2af7e71a269ad021a', 'bec4c14ba02a32fad812ee94a98c1602', '12f506bf952aec9b3d8289a62ded85d6', 'c396496b481e2685a7733cfdbeaf925b', '932b64b6d7fa0605031eaa6f3c63eea5', 'b00b0b63b241943957b48e9a4f9b7c27', 
'085fd00ce06d27d19f40b330c7d8ba0d', 'a6a55291b82d26ed8852361a80575e89', '0f1a741eea4c4cbdba9d46dd3d762e2a', 'a6f2a358b6a2a33fd8c44830293b8323', '6c5bebf8e67ea0562455656f4132dca8', '27c8850bb21db8dbccc40ebadaf4daea', '393a3cf1a1ce2e6de5955e85605b2e61', '346ba06d2fc6ad303d2c25aa03c8f309', '5160a38cdbea74be5e569b767f025e01', '0a63c38ccede65ad5788d469185fc315', '940904b10cb465abe65aa7f7fafbe2a3', '22333feb1c9b1f02fa531f878c1970ae', '9348b0de17a27622d52d6c71fa12d419', '568982eb42078f033b3ea568329b6c0e', '56a842911136ec4628584bbb915f8f4b', 'f33db0c82c1d8924f035a0ea3b2e7ca9', 'b4fbc993f786bed4cec41b356919f2e3', '6cc5cb34c7f90bd870c027ece3a15fad', 'bcc2b44d3cd1f9d2b8458c595fb9b02d', '269c71c3a7cb517e2d21f20bad0714c6', '513b44a60a7bfd9a8c1a5a2c49ca185e', 'e05c7b866e6e8e7394ca5b1d02c20f58', '9652a10d1629156e0789bb630e081f9c', '8a07ce1347bfecea1903820b2b76e0be', '9e560864413aee1abddcde454e778e94', '62beeffb51a33d06b38e2da8d5f2d051', '1d6f6841cc1de15d208daa6aee88e2ad', '47c9781f39a732c3643f6c4e600e19cf', '8816ba131c25d99edd0cdd75c07859b1', '07bf09806e2103ac74dac0afeb33bf6d', 'f7a936a8f212e039b7fea6867ad7f010', '199525a09762d6e51c1c7f4ef55c8926', '6aa56327ff9ef7983d0c691aca0ef1ca', 'f29cdf5317bffce5648a21766b6b5f34', 'fa220df5ca6551e99016b9d5897b7915', 'e9213ea8f437aa9d320e0e1271c4a1e6', '51482d141bf1dc8a92bc294563992ef0', 'b829d2d9b0b73662c3bcfaf032bcb1e2', '55540a87c8cdfba5615d4079484901f3', '5b53081a25e4d66f353d2229a0564c65', '1f993001eef14eee827293e16d218ca6', 'b0288182991527078a0693bca89985b8', '6b599fe2bfc9db75aabab13175652d56', 'e8c7b7a4bf4390bf03ddcc5d4215dba2', '0c0856b3f01f7a0a6d9fbcbf4253e190', 'd45962a733099ddc701566f0d33556ef', '025be63b5d1ae501b268711832cc9665', 'ba71aa394247a7a27230a2be489ab420', '916e520d5f358a54ebcdbba0179d8843', '6095a61aaf59fb5bcdc88fe1ad8a78c2', '84b51054f1131067758bb1137d812b78', '1823baf623fff2394d985f4f6c0e6d74', '2b5c43f2fa8f0b133535cd3ee6c6c695', '7dcb8c84aee27fee41f9400c33ff1551', 'eae565378fff9055ed48235add42bc15', 'ea1ccd1847912ce2cc57d7901a0d81d7', '2922376d0667c1809e159264aea169f5', '0c0bcba0c40adafd29e2cf47ec4fb544', '85eca4e62686b28864e1f6e20ce8d683', '1a78051dee60dbe7d688a3127f12544e', '46b1520f1cbaa923ac8cf85feadc9cf4', '49655c66704cf856ca2dbbc77af22586', '7b107a90c7d45209b9b367a08feb7708', '4361de87fcb0cfd740d35212aa7c7211', '1e8f4d2acdb3d4a94ce1205e0326f9f6', '9f0e04bd2d29040421904442696311b5', '2bb484a925b6a25f1277b7008651cf92', 'db5bdd3d97d53f257ac6afb8fd78f462', '63aa5757186c1def977a118d31445ec0', '9864da59e74afac4d219fc65939826e5', '4ec499632cf69a6f75f630f09fefa1dd', '6fe5bea03a70ebb09564b74be2555d7b', '6011873dfea515990a882d38539e126d', '93667e0ed1442c25148ea330f86e97ba', '5c4a6d35244fd3ce149b1008535a7809', 'ccd1c08880fb15e70d4b1aac0063f1bf', 'ad408423128850292ffeb29b163cd78a', '55051865aa377bcf48acad6782479e97', '11664771aa54bb4138078077243e6c3c', '3cbedf02828b82d680371b91c7ec53fe', 'dfbd4c5642fcda3f5008590ba629c9e6', '6ae4cab1d0684afaea997a01a0939f71', 'f6af9cbc4f43f3a708e4cbf20e0315db', '8b03abc22d2051cb7740ac9f5795346f', '66d8156c3e21d105c697555f7952d5db', 'c95987f619409f89814af39a83d36e4f', 'b2a375e7fb7dd6883b7563f7dd915624', '4ff8af2020275408284cf75a8bbb94ec', 'b82841783bc6ea570cac350a031f462a', '35a9f67c975bd2a4d3b79c97df499293', 'a3a91683dc5cc0f98037c23b8d962acf', '3d800b30aef8a7da60395c92a0369ac7', 'b6ba5044eefaf39648286d57aee10803', '1f72bf63e2247500e0c6d674f5f1aab8', '4eed0c5d082e7207ab2083b26db70593', 'a69b1206b162095ea6bbb35fb64ef57e', '941af762cfe86db412d9b995b33277ee', '7e063e5a3613f993d9b7bcfb423bbda6', 
'b6db70131e6355a13a741fd803d8ff95', '4db28b9c8602a7a17e9b9b80e5d090ce', '60cce41086fd0866c296ad09e4a9fb5e', 'a8bc849f5a64ea13fcfc139b152fe03d', '2df3055029fb0a7eaf3315d35853d756', 'df0cba25540dc2d66bafa4af3144190c', 'c586059099b151b8fa61c7483717cbc9', '225018e18cbf57b20a80c746e7861594', '9c1a4c9d0b5a9d47e4e053eb2d420407', 'f712c1df0637ed68819cdecf4617716d', 'b0779a8ab51a8f8f8bff1d2f35d5a6e2', '1cd6de23155f859461caf9c0458f8f87', 'e881b7ab3d0a47237fb5b1c1fc23376d', 'd54a17301e302785439f9dde6f3fc208', '9d4519587afe7aafb8f21dfc1b0d0996', 'b86f86f863585b3356d2304b7e3c62bc', '1a78fb79fa2a8982c87f21b8bc533615', 'df5b0524ad802d413159260557db958c', 'ee59574e746db4c025602b8605420659', 'd9bba8a3889fa3dac1b8e41e0146563b', '2fff98ca52fd19ef92a48c52029606e1', 'e5f2ca84b9f42da0c5a9fd90164ccad9', '6152b60d67077a228acebe412e1ee849', 'ee2a77ce843d24362cabb427c958871e', 'cce910d6457d84f18b53b37624e9be86', '62879b753d9ceb7931a02255fd59738d', '752560c49d508b3109ccede8c466d331', '2e705213cc37d836360a3eaca1137239', '8b524012052a7f83dcbd6747d17057c4', '5ab6090e8225c3787e56132e2a607d68', 'da4dd2ba6768317321515fcc645867cf', 'b53228aa58bd0a7218eb65b470870fc5', 'e6c4a523e241b25f3ddbc21333d93183', 'ac5e9a8968f532cccd40dfa75105d883', '3917656e05e2824cf5c7d49807daa559', 'afd9c8e6cf5e853598a4b093779381e3', 'ddf2dd4a5343a76ae274b951a573bc20', '1638139e889ba19e05813f7c63a62313', '5efc169c7d3e7f7319189a86f88d18e8', 'd8825e8ec4411927082d088ef0008d54', 'bc48d54e378c819bf0ed83f614d2eead', 'dbe28073723a939c538e46d36ac858ca', '3610e843650a8876095bc5510280dcf1', 'ae84f2816f15df41129e2c3baaf78063', '63e890c53312ed84881061fa597e1173', '398aa804f95674fe47027a3ace7b8a1b', '5b212fad64feba45b1aac1e00e5ad5d4', 'd039fdd87b3227982f2b8ce698849d47', 'de630b5890582133d146be37b8f7d034', '3f9be74ba902086262dce25092726c8d', 'a57a45a928db8a107d431931d1b63081', 'ce608bb27a6e97f47e81e6b285621d82', 'c2d1d17f441fe69a98dee9e647dc1d11', 'a02b340d7c63212f2a77f7f98fa53637', '2486853af95400d8af5f9eaad8264d30', 'a22cc150e903605e28fc5770adfbda9f', 'f8b7020ca42524815eb92b9e4b4cdf4c', '728b7cd0d354f32d64d47b364b35d813', '26a1457116407d0ff002d2aea64e0703', 'cb8868533691a6f0793cddc3576390a6', 'f816baec07810f53f1aa0bddad3993ea', '807ff24320ec7286f132a001f49aa297', 'cb42d83f38d352f4fe9caabf460722d0', '9c42c41c65c68ff4d10a5bf85cc586a3', '5b3607941af2d1290240044796d36a5e', '4f4418d5d81f8fdee4d8093a7bf92980', '5025836c4777bf0b7f56fd9f727ab568', 'bbcc8eb3b942fc2547e43e15d3b2d66d', '203726a51ed642b922e82a97f7c68651', '4e96ddfa5985f50356a0264498a42444', '59b9c3f86716b3f59175f1b839e324be', 'e27841a82df6187263b88bac10c5793c', '5a36cf95bf04a713652359ff744518c4', 'a4452247cf772a8f493ed45be2146279', '4ae391cbf0fbb6f54d4cdb79e40b3014', 'f474f0d3f6886b50246f2a74969d3cb9', '88f6107830f5e3d8413a025248e1e1a4', '6b24fa9df1b373a9d71192a8758e85d2', 'bbab47202e290663b1159185a7fbf016', '4f84706a5b704fb07a371f877021fe70', 'edd0fc3a25e58af6c5a94cffc5b53512', 'a7d6f138c779a0cf4917bd802663fe1c', '39104ba8977eb63a7cda6d6b13de2f96', '64bd5389adeb465253b181d838e5b68d', '7b748d57587bebad3762a865e45cf8e1', '69e7e1191bd2466b30f43c7d4c38d665', '1b82531c05d3135a5d1c7ff780454ffb', 'f9cba099a2686b06d37d17ef17239f67', '5586215272cda7ca20de7960ba716ac3', 'd7e7f51b5c05eebc51be0667d6786bc9', '0adf2d326855f05d198dc27e32f83e6b', '9cd29923f70f5ee2a346c2f85b6c0c70', 'c8ef92bff62daa7573e81decb7eeed7d', '1fc7365a9797e6012b4cf394a5f14627', '5aa0c0bf9466f8295f6e9eae1b2bbc3c', '5ff4f20594b9a376ba23cb3931cb3751', 'cfa5d4ad00c3f173563848c3abbf9f0b', '8b1f55cf26ef28727d3e98446c014dc1', 
'019d46639c173a9a95d4516cf4f0e46d', 'b98dae8f5c65cff8179d5dd2406648e6', '374cebaf127949e6f7ed7839c3656fa9', '0959dffe01941688e8408f5d4a7b0384', '07da3d9c550c65237902e2f2ae292914', '609a8cd3a608dd5c64b6823eefa5663f', '82d39c8eacd784def6cf14c8f849a679', '70582abf441e432c99cb13b40e1c4665', '37de9bb8ff75c3afdf94b71b9a48d4a1', '7b157fe80fb7aa84783db5d5ca0c1589', '00906c9fd0920b8ae1cecc38bf92d29b', '27c7c6af5658a77c7066c4832225577c', 'f755871faee0d36e43adcbc8324be2ef', '812eedc1f31d984ba60b2e443045cd77', '9bb137ff6aed857c6756063fd415518e', 'ab0ee37a6cd76cbaef1e38cbec044c88', '30820e2aa5a6d375c933e7185d9b6899', '3bb9b4d9bb761a655ad434180d384017', '8737364460c311e9489f2efecb858b0d', 'bd50fa4aad7b18da1e0376b1148dd66b', 'cbccbb39823fad6e408c746fec35968e', 'dec302dcb6372b436b70a37f2b92a2c8', '2598956a9f17ff8f12aecd8091b20d65', '95a06a96871131c62c1ae37356fef1ec', '0731f13bab833224c2fec1f4281ae2f2', '259542ca8b02d16c6df1489a8b2e8d4f', '9c875e2d5bbc5976db7deb1c5ee79fa0', '58e490450309513c0b78302827014dfd', '3c2c076905a292801eedb7550959b5db', 'a2506e16e44f432807654a94f913cef9', 'fca93e9a8540078571a1f16c0986d14a', 'c1bb5a2deb487a453c394c526ad1acaf', '9fbc30cde5a73c9a4c9ae2ec1875b1f6', 'a51cfc65bfe9f541bcc901decca81b9f', '6b16de115c6d63bf08c82931d50ad05d', 'a486e66cc48a344bc3ad9bcfdf4cfc37', '94b95dd371e0e8e70fe3745067567e66', '6c9c93cface745b241432ce73c76bfb8', '750ebc1df0359555bd5a10313880fb3b', '41b7b5c2056aa9688b8d2c220ebb2b2f', '35cdeb7d3d5b011b816eff9f99dc223d', 'b57075c4d487a10f6212b1cd470f1cdc', '89ee072332c37d47ad58d1ac7865c92e', '248a5c9213b4e959291792f0bd7241a8', '951b8b2ed48aec6ad14a335cb2e63d91', 'd4fcfa0666a39af97969a29b0ca002d1', 'b6594604feb3757adea8657e9b0fcad0', '061f7bd9fe6ae22b8b07f16fa3a3cde4', '18d5df1fe3fbfdd3bf1ffaacef35f3a1', '479ab4e179ebab72a9d2963afe8018ac', '5281c51e3be66ddc921277bb1b9657a4', 'df73e8ab1e7805c4497357162c38d6fa', 'b5fe04d4ea147e3e9de780f0a88fd71d', '32553a298851afcdb65adba991384ea1', '1c4b63d48bfb2eb6772e551aff2b287b', '3cfce1dc697fc9775f7b80beb0fc4d23', 'e90dff2cb72d52bba7128d2e90e79eb1', '932fd4c716a5226b78b80f208147021c', 'ba41403971e54618c894007859384a42', '108e66d5039e1bacf6e69141f23f7a67', '8d472c1214f5c9224c74f2917f7c80c9', 'abd6f49246d4181cab90bf2da64da308', '5ceceb62de9d399e702c3006c5e3f1be', 'dd838c540ca88782f3fba00e934f1af0', '245456396bf0b2fbc271039160f22de2', 'bc2f1dd0b53c59269ce1bfe9a0b84a69', 'e8569569549580e62aaf75b8f4a759d5', 'af772af5e47050b906b8dc5f71efa76e', '3d20882750f46c89d64300fcb45424ae', '39be757346ecd48bce9e3b9ce4b2bb0c', '81ade189c6fda6a6d405a240f37989a7', 'c41c73d21bfea0ade03346b169864391', 'c0a55a31e49e98a9d6664e05b8702a5b', '47520892700b208136b0365136789078', '5d3a6e952fb8cdde9b918b34bf626e05', '3e4db1b0ea7a8d6499c71e2851537f77', '1a2e68f56d7dae8e4369b55d3c82ecef', 'bc4908c241186bf2397a60837fd601b8', '64babcb1fbda40238397712e62bd418f', '56b56e26b932fab51fa35f2946788353', 'a053819838a1dc21b8f1f2761a476029', 'b4f5a15df471762b3d3f15e10ada7171', 'd629ca20350bf128696d0086e8ba1762', 'fc2d0d092e23c52a0a219e26554eb40e', '8518868ded7e9a76d1e21914656225c3', 'f7d55a23b8f1ad646d77ac69526abb22', '767f3c58c6fddb6fee0c2b03d61499e0', '64910b54509bb37879cb61a133434b15', '0d7c0dedd986f0062147701f65253c89', '0d0a7d1e8836bc96e0ddd6c072535c75', 'f573ab803966a22b6e2060e86baf9803', '7258c9222dec4148e4e9bde4b4f6afee', '30d269e7cbc4ba4586e7f8c9b84ad5c5', 'b237dc219aedbe90928a29c700d08d60', 'a168bc2802e792320cfab8ed3ed5f5aa', 'd34f25ea05cf09e53a6eeb621249eaf9', '82c63353ee1e8aa779afa3e6c7c2aa2e', '09ae7cead00b87c87234745a7155faed', 
'f37ff2114d5a0078c0d782a4e9cf2646', '719101222a537d7214776067541b7dc8', '8d583b2ee0dc364257f7cb5e3e9571ba', '05cfaef20dd799bf7b135f8486c56922', 'c431244d360e3d09a87f24c87ee9a9d5', 'b119a39d90795c2d8c58cc525740b2de', 'c49d9157b44086dad352d2866880eff7', '975b8049ed717e42161e2ea4b3ebfc1c', '88cbb6e64e123c3da8f5eb64908e041e', 'c89ee7997fcecf5103063d18412a2159', '2402d0b852f2719838715a5483b67f90', '6972ad489459106c6a38b4ceacb1fdda', 'a583b5daaf6a730d836371b2dae4df22', '6ead8a81311508ffab0ac0128482c3b9', '904e9ee3a6001249cfeb13247082c067', '0b7f097bfc132334cdbb3bdcf078c82c', 'd8b4faffdb0a557a6bea5ae0eea4053f', 'bad4561e5cfdc3eb113144c08b3253a1', '7fd2e5f6832bcf6975b247937d175718', '171703ad36d950b4a9fa1de65627d3cb', '4d32efcbb63b326f72c6a055a2505625', '653fd442746b1d58061c58f1131ca39b', 'f48a364c29a0a0353284e4a6b6471471', '2cd2e01cfa43e6f8e6ff9b72fa5bd931', '06c9aa1712329ae572986b32b0fe0343', '1b7d61ec7377d0608370f25469477f62', '41f4e509d5096168cc118de4388fcbf5', 'cd915974bc6420f03e825049c84ff7f7', 'a360bcce8248c707faa9a3a8abdda997', '2157d4fbab685a25dcce9534d03c6d41', '9f8a677c3c6fa1509caa703ef7916c35', '28aa921921dae3d9dcc83516a340420c', '0b8c1356537d121dce47ff385ce91450', '2ba48182d48f83af1b96a6ac34d80dbb', '9f220c00fd0003c0b51b08dbaf5d2f2d', '17465d16e0c6a6addf57547fee84bb1a', '3ba711f77a46f743aa6deafce77f8576', '142a8c532b54289fea4a7f1687998772', '575ec950ce1ca0684b1add581cb0c995', '94aba9c9160c90cdf98f0af4fb203ce4', '9ed041c2bdcf38442c449b9e1b18535c', 'e7a108a395663e07775218a03583ed81', '94add8111457f05850e8bd5d4cc3a7a3', '5054d386528dc4ae36e845a1cc3cdae0', '666193de419791b49959af7932f5c8c2', 'eb6d9abf15d5220f992a9ac27a5001fe', 'f46cc14603a04178db39532dc15c176b', 'c3813cd5a191ef574003db15d615cbb8', 'e3fea6941cf58fa9b4e898d8955649dd', 'ce08c1226d445a18b7ce2ebc80a7096e', '8a8dfcf2c4cd350651bb8734955765aa', '9634bea378870bf33a7e347fdabf2456', '39b5f0e9e1a1f50e97c7a9789d5e2af0', 'cdac30e8a60ac5f771c987518dca38aa', '6cc2d361537457056399f96e437316ba', 'adfe05f999905e8b3341d3a8ba3d63ad', '37b0829936bcd9934a15eed60fa8532d', '496e10c729a3437c0654d4b5d4c09d82', 'd9c96f6ee613abc7d7949c3377681413', 'f0cb10da92bc2083159e443319216a37', '15045d3e23344df09a1d8c005bca41d5', '15d02f99130c0283a580bed0bed78275', '2d15324eba0e70ef4a1451d5321e980e', 'c7d3b831381fb5e75f144433358493b6', '9d4d42de8c64f7c79fd82fb3bb8960c4', '4bfe10874285d8d68486b8c991462ac8', '200c064f3fbf08ddcbbfb2f36118b066', '8eecc61b4ccca9910892a9c267d35d07', '41c698f0fefac0e702a03bbe38c755ee', '9221ec3ed5a0cac9cc211b1f76fc2642', '835f40e6144719e73f26d260b8ad6c80', '463d9a27874682c6001c748196070b22', 'a236d75f2c03e16fbbac2825d42eb914', '83ad78f86297babe27c2ea9cb47b4675', '421f72e8a492f524b696dce4658879e2', 'ebff992242dcd49d4004414a94581574', 'd6b7c0a83da2af88d584dad256dd08e0', '6a559666d4f4561014c97c2177068f78', '67d0e885fbbd04b3675512bb31d1a0e0', 'dd8ecfa0c5e14fc9dfa9903f3501a07b', '590ca279d64e0fc64a5a1f87d91c6bfb', 'fa457b5f7729ecaad819d201ffc49784', 'cc36fbe3a57fa2b04738ab2a7b0e5e12', '864e494bca15d43d2a036610712c0180', 'f8509fc848701a8f2ee6c0c0a59936b8', '903a785a05d396bb9f3b3b1db1e59d7e', '87d48c0f33a9213c9c252535a399ecf4', '3deb452f47e2dba10f8baea299721ca9', '12d512a24a5210ceb6074e7bdaebc227', 'ae97d323473045ec06cfc1fd165808ce', '88199cab3584d87a46150bda69379a9c', '09be0cb04e6fd9080bc7ccf8366373ee', '2d81d266eaf9148fdf177891eb51d708', 'b3d89252b2dcdf7ab1b6ea69e05b4e7b', '4e642b8d7055956511fd63978aec8d0d', '3b3b42273065160a4cbf88d5507a0dc0', 'b5073b22539ea0a59136f8b540c95b02', 'bd723b441e4de6d16bf3ef78a0ed10ad', 
'a65ad70ea91d2a1af991e304fadd62a4', '6f451320b2bafe1e730a8b6e2bd833a5', '91dfef441587afeac249429a4dd44cf8', '38a8dcbd0ebfa6961a368f4fc17145ab', '8fc484b27aa36a70fd67edefd9e2477f', '6b4b817b770b8ccb189a2597c694506b', 'f45a2ec2f61ca00a1e099f97f316d79b', 'f53603d80e75c2e3be16c3feb36a539d', '531d0a1f09348b04f5dc54bf84583cc1', '879bd82b192d004e0fb657469c76fc51', '2b50e856f51760e84318e7092fdeb0d8', '2e8f9fc2dcdd0dd7752ad2d268566b53', '80cd82b51fcebfcd90ee3af11adc09e1', 'd18e62ff9a47263dd6ab7c3087879692', 'e6c1afcc7bc65980b80bc5e921bfca06', '806fbd7dd888ce61828fe4395b6f4e75', '1bc94726d834ef7b9535d51e78bc8c55', '09598adef2692a5ecc76bf5e258c2ec1', 'b629be66e8e45a28ed259cfe274af605', '28bb026e961ba92d5f792df7039555ae', '7e28b3a432db2f50621c6fe42f2f36ba', '0d828e3d92683f430d43c51c9fb5b54d', '84536d2896d31e74666b6aba27aae014', '3da255cae775b060a22dc63fda61fefa', 'eec32b35fca1c7403c1fea55d39afda5', 'f44a80af3c1bd429a946f4a274113e6b', 'e64d2d6c8000833d9b5883073485ff03', '699193b16271dbb8b49f218310243030', '1792cda8cff53c86561827b682b91bed', '5f862b1fa2ca69490d9bbac6ceae0337', '57bd90ae2a361f1382fc725fef0a0921', 'd382ebbd528354123e59a86233c32b06', 'd6d2c4d555de208ebbbe6c82b0aafd17', '8fd7fabebed71969fbe86d7364c8861d', '1be229c4bbd1962054419c0afef9eee4', '8834cc11931a2e49c5e0d54e9da9aad9', '743729efb199a6558d0899195d8ca740', 'f7bec41b29d58c70966f97ea9b52bb3e', '0190b8458cfbd28683eb07811a7a7ec5', 'fd816aea82671f71e3eb67e977af00a0', '6f4f1114bd7840ed5e5d756ef8c6432b', 'e9b5c43b5fc7642377401d25a9bedd2d', 'a35a63e506a0b73d125ff0080bdd588c', '1b84156caef23d63398ece07abe5929e', 'de14cd74a8e95268187653ec969312c0', '4b3e0bfd44175b0cae7c36b9e8e519e3', '104a4d3bd3aeeac50d2cc2fb4493bce4', 'cc1d54da1222b9a544b08ff4f645f3fd', 'c80255158cfaef8697d6c6bfc2362341', 'c87009f5755a9b6aa356a1935a57e5a9', '36eeb567a2ea72c1b54cdd9d46a6647b', 'c5ab8758f3c8796ab006d65be6fc25f8', '319f3b8aadfb71e710e070af1bb1047c', '02c93ce8c413e2e8e77975123aecec49', '91ee38d3951b17dfc4c7bca94b6d6dd8', '2e9de09614d9065d6dfc2ae16d4268c1', '57bba769688fb4372296000214954f56', 'fbc81ab85f91aa162fedb88547abc2ce', '05bdc47cee2f106023cbd8617b0997c4', 'a88fd0271a539c09d0c906153736e111', '466e95e1a82f48dd1fdc2a119f8df9fc', 'e580bba0a66774889eb04ca5ca6f8e37', 'd0668af748d1fd0ae7347eb90a5d4fcf', 'be84c54af594465edf6dede1977dee38', 'b582dff2a2b91f406b14ff64e032c72e', '29705fbaf540d6f6b2f8c67313b3070c', 'e3160345920372b267101bdae099bc20', '0a0620ab140b578867661ce0f0abf856', '0bfbf291188a05a0967b5ec02f574357', '98c5511cd321f13f7c1953986475c8f0', '51f0ada675f84142440bc7e56caa57ba', 'cea5cc20e8b7e2c2de0deaa3fb32a371', '7d7212ceeaa7cb2795038bc79da98159', '7d3db7859099303167c8396310a79fe8', 'e37bf85f107b2ab0fe65cc84a7ae103e', '70eb3bc85ff6e3914131f5a6cd81834c', 'b7ee2b49525d85788b253a1508efd4f7']\n", 247 | "Len test_q 511\n", 248 | "Len dev_q 511\n", 249 | "Len train_q 3196\n", 250 | "[W] Learning Tfidf Vectorizer ...\n", 251 | "/usr/local/lib/python3.7/dist-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", 252 | " \"The parameter 'token_pattern' will not be used\"\n", 253 | "[W] Learning BM25 Vectorizer ...\n", 254 | "Loading BM25 model ...\n", 255 | "tcmalloc: large alloc 1570521088 bytes == 0x5652b8a66000 @ 0x7fa378b0b1e7 0x7fa375d2646e 0x7fa375d76c7b 0x7fa375d7735f 0x7fa375e19103 0x56521c580544 0x56521c580240 0x56521c5f4627 0x56521c581afa 0x56521c5f3d00 0x56521c581afa 0x56521c5f3d00 0x56521c583b6b 0x56521c5c57bf 0x56521c4d8654 0x56521c563933 0x56521c564d64 
0x56521c5f7134 0x56521c5eeced 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c581bda 0x56521c5ef915 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d\n", 256 | "tcmalloc: large alloc 1570521088 bytes == 0x56531642a000 @ 0x7fa378b0d001 0x7fa375d2654f 0x7fa375d76b58 0x7fa375d7ab17 0x7fa375e19203 0x56521c580544 0x56521c580240 0x56521c5f4627 0x56521c581afa 0x56521c5f3d00 0x56521c5ee9ee 0x56521c581bda 0x56521c5f3d00 0x56521c5eeced 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c581bda 0x56521c5ef915 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d 0x56521c6b86e6 0x56521c690163 0x56521c68fe0c 0x7fa3778f5bf7 0x56521c68fcea\n", 257 | "100% 3196/3196 [03:37<00:00, 14.69it/s]\n", 258 | "tcmalloc: large alloc 1570521088 bytes == 0x56537a852000 @ 0x7fa378b0b1e7 0x7fa375d2646e 0x7fa375d76c7b 0x7fa375d79e83 0x7fa375d7a07b 0x7fa375e1b761 0x56521c5804b0 0x56521c580240 0x56521c5f40f3 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d 0x56521c6b86e6 0x56521c690163 0x56521c68fe0c 0x7fa3778f5bf7 0x56521c68fcea\n", 259 | "tcmalloc: large alloc 1570521088 bytes == 0x5653d8216000 @ 0x7fa378b0b1e7 0x7fa375d2646e 0x7fa375d76c7b 0x7fa375d76d18 0x7fa375dc16e7 0x7fa375dc4fcc 0x7fa375e0fac1 0x56521c5804b0 0x56521c671e1d 0x56521c5f3e99 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d 0x56521c6b86e6 0x56521c690163 0x56521c68fe0c 0x7fa3778f5bf7 0x56521c68fcea\n", 260 | "2747 63920 3298 P: 0.042975594493116394 R: 0.832929047907823 F1: 0.08173405932934631 F2: 0.17811754331362173\n", 261 | "Number data pairs: 64471\n", 262 | "100% 511/511 [00:11<00:00, 43.06it/s]\n", 263 | "division by zero\n", 264 | "Number data pairs: 10220\n", 265 | "100% 511/511 [00:07<00:00, 66.01it/s]\n", 266 | "division by zero\n", 267 | "Number data pairs: 10220\n", 268 | "len(train_data_pairs_id), len(test_data_pairs_id), len(dev_data_pairs_id) = 64471 10220 10220\n" 269 | ] 270 | } 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "source": [ 276 | "# folder save fine-tuned model \n", 277 | "!mkdir legal_text_retrieval/settings\n", 278 | "\n", 279 | "!cd legal_text_retrieval/scripts/ && bash run_finetune_bert.sh \"magic\" vinai/phobert-base \"../\" data/zalo-tfidfbm25150-full Tfbm150E5-full 5\n" 280 | ], 281 | "metadata": { 282 | "id": "OlcF4OdnCaJF", 283 | "colab": { 284 | "base_uri": "https://localhost:8080/" 285 | }, 286 | "outputId": "fec9883b-75d1-4e4f-bf79-37938044fc53" 287 | }, 288 | "execution_count": null, 289 | "outputs": [ 290 | { 291 | "output_type": "stream", 292 | "name": "stdout", 293 | "text": [ 294 | "mkdir: cannot create directory ‘legal_text_retrieval/settings’: File exists\n", 295 | "mkdir: cannot create directory ‘..//settings/Tfbm150E5-full42/’: File exists\n", 296 | "12/24/2021 08:40:27 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", 297 | "12/24/2021 08:40:27 - INFO - __main__ - Training/evaluation parameters TrainingArguments(output_dir=..//settings/Tfbm150E5-full42//models, overwrite_output_dir=True, do_train=True, do_eval=None, do_predict=True, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=16, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=1e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, 
adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=..//settings/Tfbm150E5-full42//models/tensorboard, logging_first_step=False, logging_steps=200, save_steps=500, save_total_limit=1, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=200, dataloader_num_workers=0, past_index=-1, run_name=..//settings/Tfbm150E5-full42//models, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=False, deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, _n_gpu=1)\n", 298 | "12/24/2021 08:40:27 - INFO - __main__ - load a local file for train: ..//data/zalo-tfidfbm25150-full/train.csv\n", 299 | "12/24/2021 08:40:27 - INFO - __main__ - load a local file for validation: ..//data/zalo-tfidfbm25150-full/dev.csv\n", 300 | "12/24/2021 08:40:27 - INFO - __main__ - load a local file for test: ..//data/zalo-tfidfbm25150-full/test.csv\n", 301 | "12/24/2021 08:40:27 - WARNING - datasets.builder - Using custom data configuration default-33c56ada0b4f05de\n", 302 | "Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-33c56ada0b4f05de/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...\n", 303 | "100% 3/3 [00:00<00:00, 9013.55it/s]\n", 304 | "100% 3/3 [00:00<00:00, 867.55it/s]\n", 305 | "Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-33c56ada0b4f05de/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. 
Subsequent calls will reuse this data.\n", 306 | "100% 3/3 [00:00<00:00, 631.39it/s]\n", 307 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:31,171 >> https://huggingface.co/vinai/phobert-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpb_xadf8_\n", 308 | "Downloading: 100% 557/557 [00:00<00:00, 416kB/s]\n", 309 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:31,314 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 310 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:31,314 >> creating metadata file for /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 311 | "[INFO|configuration_utils.py:449] 2021-12-24 08:40:31,315 >> loading configuration file https://huggingface.co/vinai/phobert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 312 | "[INFO|configuration_utils.py:485] 2021-12-24 08:40:31,316 >> Model config RobertaConfig {\n", 313 | " \"architectures\": [\n", 314 | " \"RobertaForMaskedLM\"\n", 315 | " ],\n", 316 | " \"attention_probs_dropout_prob\": 0.1,\n", 317 | " \"bos_token_id\": 0,\n", 318 | " \"eos_token_id\": 2,\n", 319 | " \"gradient_checkpointing\": false,\n", 320 | " \"hidden_act\": \"gelu\",\n", 321 | " \"hidden_dropout_prob\": 0.1,\n", 322 | " \"hidden_size\": 768,\n", 323 | " \"initializer_range\": 0.02,\n", 324 | " \"intermediate_size\": 3072,\n", 325 | " \"layer_norm_eps\": 1e-05,\n", 326 | " \"max_position_embeddings\": 258,\n", 327 | " \"model_type\": \"roberta\",\n", 328 | " \"num_attention_heads\": 12,\n", 329 | " \"num_hidden_layers\": 12,\n", 330 | " \"pad_token_id\": 1,\n", 331 | " \"position_embedding_type\": \"absolute\",\n", 332 | " \"tokenizer_class\": \"PhobertTokenizer\",\n", 333 | " \"transformers_version\": \"4.3.2\",\n", 334 | " \"type_vocab_size\": 1,\n", 335 | " \"use_cache\": true,\n", 336 | " \"vocab_size\": 64001\n", 337 | "}\n", 338 | "\n", 339 | "[INFO|configuration_utils.py:449] 2021-12-24 08:40:31,455 >> loading configuration file https://huggingface.co/vinai/phobert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 340 | "[INFO|configuration_utils.py:485] 2021-12-24 08:40:31,455 >> Model config RobertaConfig {\n", 341 | " \"architectures\": [\n", 342 | " \"RobertaForMaskedLM\"\n", 343 | " ],\n", 344 | " \"attention_probs_dropout_prob\": 0.1,\n", 345 | " \"bos_token_id\": 0,\n", 346 | " \"eos_token_id\": 2,\n", 347 | " \"gradient_checkpointing\": false,\n", 348 | " \"hidden_act\": \"gelu\",\n", 349 | " \"hidden_dropout_prob\": 0.1,\n", 350 | " \"hidden_size\": 768,\n", 351 | " \"initializer_range\": 0.02,\n", 352 | " \"intermediate_size\": 3072,\n", 353 | " \"layer_norm_eps\": 1e-05,\n", 354 | " \"max_position_embeddings\": 258,\n", 355 | " \"model_type\": \"roberta\",\n", 356 | " \"num_attention_heads\": 12,\n", 357 | " \"num_hidden_layers\": 12,\n", 358 | " \"pad_token_id\": 1,\n", 359 | " 
\"position_embedding_type\": \"absolute\",\n", 360 | " \"tokenizer_class\": \"PhobertTokenizer\",\n", 361 | " \"transformers_version\": \"4.3.2\",\n", 362 | " \"type_vocab_size\": 1,\n", 363 | " \"use_cache\": true,\n", 364 | " \"vocab_size\": 64001\n", 365 | "}\n", 366 | "\n", 367 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:31,597 >> https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp9hs_zwr5\n", 368 | "Downloading: 100% 895k/895k [00:00<00:00, 5.02MB/s]\n", 369 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:31,923 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/970c6224b2713c8b52a7bcfc4d5a951c9bb88302e4523388b50f28284e87ac44.26ba0c8945e559c68d0bc35d24fea16f5463a49fe8f134e0c32261d590b577fa\n", 370 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:31,923 >> creating metadata file for /root/.cache/huggingface/transformers/970c6224b2713c8b52a7bcfc4d5a951c9bb88302e4523388b50f28284e87ac44.26ba0c8945e559c68d0bc35d24fea16f5463a49fe8f134e0c32261d590b577fa\n", 371 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:32,068 >> https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpmnffyx5y\n", 372 | "Downloading: 100% 1.14M/1.14M [00:00<00:00, 6.34MB/s]\n", 373 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:32,394 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes in cache at /root/.cache/huggingface/transformers/f3a66ae0a78d1a53b3eb99e31837d0d8e2f684a2dcc1f52f75fd36873e3d79de.301ac8958de708ddcea8500d9acbe6261dba391d249c98dcda1e49dbbff870dd\n", 374 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:32,395 >> creating metadata file for /root/.cache/huggingface/transformers/f3a66ae0a78d1a53b3eb99e31837d0d8e2f684a2dcc1f52f75fd36873e3d79de.301ac8958de708ddcea8500d9acbe6261dba391d249c98dcda1e49dbbff870dd\n", 375 | "[INFO|tokenization_utils_base.py:1786] 2021-12-24 08:40:32,395 >> loading file https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/970c6224b2713c8b52a7bcfc4d5a951c9bb88302e4523388b50f28284e87ac44.26ba0c8945e559c68d0bc35d24fea16f5463a49fe8f134e0c32261d590b577fa\n", 376 | "[INFO|tokenization_utils_base.py:1786] 2021-12-24 08:40:32,395 >> loading file https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes from cache at /root/.cache/huggingface/transformers/f3a66ae0a78d1a53b3eb99e31837d0d8e2f684a2dcc1f52f75fd36873e3d79de.301ac8958de708ddcea8500d9acbe6261dba391d249c98dcda1e49dbbff870dd\n", 377 | "[INFO|tokenization_utils.py:193] 2021-12-24 08:40:32,550 >> Adding to the vocabulary\n", 378 | "[WARNING|tokenization_utils_base.py:1904] 2021-12-24 08:40:32,550 >> Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n", 379 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:32,688 >> https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp6_s100_b\n", 380 | "Downloading: 100% 543M/543M [00:14<00:00, 36.5MB/s]\n", 381 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:47,625 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin in cache at 
/root/.cache/huggingface/transformers/8363542cfd9e2bad1a9a618e87ea1153d84819a3ae581cff0816a2c1f610f433.42a5e558f15db4cc3af338445707272b8f7545df78efdc125d3fd51025b22d85\n", 382 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:47,625 >> creating metadata file for /root/.cache/huggingface/transformers/8363542cfd9e2bad1a9a618e87ea1153d84819a3ae581cff0816a2c1f610f433.42a5e558f15db4cc3af338445707272b8f7545df78efdc125d3fd51025b22d85\n", 383 | "[INFO|modeling_utils.py:1027] 2021-12-24 08:40:47,626 >> loading weights file https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/8363542cfd9e2bad1a9a618e87ea1153d84819a3ae581cff0816a2c1f610f433.42a5e558f15db4cc3af338445707272b8f7545df78efdc125d3fd51025b22d85\n", 384 | "[WARNING|modeling_utils.py:1135] 2021-12-24 08:40:51,834 >> Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n", 385 | "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 386 | "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 387 | "[WARNING|modeling_utils.py:1146] 2021-12-24 08:40:51,834 >> Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n", 388 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", 389 | "./run_glue.py:351: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead\n", 390 | " f\"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the\"\n", 391 | "12/24/2021 08:40:51 - WARNING - __main__ - The max_seq_length passed (512) is larger than the maximum length for themodel (256). 
Using max_seq_length=256.\n", 392 | "100% 65/65 [01:00<00:00, 1.07ba/s]\n", 393 | "100% 11/11 [00:09<00:00, 1.18ba/s]\n", 394 | "100% 11/11 [00:09<00:00, 1.19ba/s]\n", 395 | "12/24/2021 08:42:15 - INFO - __main__ - Sample 41905 of the training set: {'#1 ID': 'e6d18a0365b648e5115ac22a6c5e5de4', '#2 ID': '25/2018/tt-bgtvt-->8-->', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 14997, 105, 747, 17, 1401, 7973, 109, 216, 145, 41, 63, 405, 57, 25, 189, 109, 10, 8078, 4838, 38, 45, 1868, 1823, 114, 2, 2, 432, 358, 5, 8284, 190, 687, 2493, 35, 109, 1611, 26, 141, 60, 109, 1611, 5, 251, 141, 60, 109, 1611, 4, 687, 2493, 35, 109, 1611, 41, 875, 1213, 706, 3176, 63, 181, 2493, 4, 508, 41, 1014, 9, 209, 791, 53, 97, 27, 99, 5, 4003, 1283, 27, 8170, 39, 7973, 12149, 227, 81, 682, 40, 41, 1796, 34, 16, 687, 363, 2764, 121, 15115, 748, 364, 12606, 105, 35, 22635, 76, 7, 2526, 23, 4, 420, 330, 28, 5134, 32, 17, 11, 239, 48, 613, 2231, 20, 599, 19, 5, 1161, 109, 1611, 10, 1681, 10115, 147, 4, 2444, 39, 7973, 12149, 227, 81, 30, 127, 705, 10115, 2764, 8, 245, 2231, 20, 599, 19, 5, 76, 5, 42778, 2010, 27, 1517, 19, 92, 7027, 2082, 6, 39, 7973, 12149, 227, 81, 682, 40, 4, 2493, 3419, 6516, 34, 405, 363, 2764, 925, 2231, 20, 599, 19, 4, 420, 330, 17, 239, 48, 210, 2231, 20, 599, 19, 65, 2774, 19, 4657, 2493, 1512, 10, 378, 3419, 17, 204, 4073, 34, 405, 363, 2764, 431, 2231, 20, 599, 19, 65, 420, 275, 778, 6, 5134, 330, 4, 378, 3419, 9, 687, 23, 17, 11, 204, 4388, 65, 1894, 19, 4657, 2493, 57, 89, 82, 1974, 682, 72, 4, 378, 3419, 2010, 7, 2493, 11, 600, 63, 7999, 2246, 12149, 7, 82, 1974, 1228, 5426, 5, 2], 'label': 0, 'sentence1': 'Đỗ xe ô_tô không sát mép đường phía bên phải theo chiều đi ở nơi đường có lề_đường hẹp sẽ bị xử_phạt bao_nhiêu ?', 'sentence2': 'Điều 8 . Yêu_cầu đối_với đoạn đường_bộ tại đường ngang khi xây_dựng mới đường ngang . Khi xây_dựng mới đường ngang , đoạn đường_bộ tại đường ngang phải đáp_ứng tiêu_chuẩn kỹ_thuật tương_ứng theo cấp đường_bộ , đồng_thời phải bảo_đảm các quy_định cụ_thể sau đây : 1 . Bình_diện : Đường_bộ từ mép ray ngoài cùng trở ra phải thẳng trên một đoạn dài tối_thiểu bằng Khoảng_cách tầm nhìn hãm xe tại Phụ_lục 2 của Thông_tư này , trường_hợp khó_khăn về địa_hình cũng không được nhỏ hơn 15 mét ( m ) . Đối_với đường ngang có bố_trí dải_phân_cách giữa , khoảng_cách từ mép ray ngoài cùng đến đầu đảo dải_phân_cách tối_thiểu là 6 mét ( m ) . 2 . 
Trắc dọc : a ) Trong lòng_đường sắt và từ mép ray ngoài cùng trở ra , đường_bộ dốc 0% trên chiều dài tối_thiểu 16 mét ( m ) , trường_hợp khó_khăn không nhỏ hơn 10 mét ( m ) ; b ) Đoạn đường_bộ tiếp_theo có độ dốc không quá 3% trên chiều dài tối_thiểu 20 mét ( m ) ; trường_hợp vùng núi và địa_hình khó_khăn , độ dốc các đoạn này không được quá 6% ; c ) Đoạn đường_bộ đi qua hai đường_sắt trở lên , độ dốc dọc của đường_bộ được xác_định theo cao_độ đỉnh ray của hai đường_sắt liền kề . 3 . Chiều rộng phần xe chạy của đoạn đường_bộ trong phạm_vi đường ngang không nhỏ hơn bề rộng phần xe chạy trên đường_bộ phía ngoài và không nhỏ hơn 6 mét ( m ) . Trường_hợp phải mở_rộng để mặt đường không nhỏ hơn 6 mét ( m ) thì đoạn tiếp_theo vuốt dần về bề rộng phần xe chạy trên đường_bộ ngoài phạm_vi đường ngang theo tỉ_lệ 10:1 . Bề rộng phần lề_đường tối_thiểu phải bảo_đảm đủ để lắp_đặt biển_báo hiệu đường_bộ . 4 . Trong phạm_vi đường ngang phải có đầy_đủ hệ_thống thoát nước để bảo_đảm thoát nước của khu_vực . 5 . Trên mặt đường bộ trong khu_vực đường ngang không có người gác được bố_trí gờ giảm_tốc , gồ giảm_tốc để tăng_cường an_toàn giao_thông . Việc xây_dựng gờ giảm_tốc , gồ giảm_tốc theo quy_định của Bộ Giao_thông vận_tải . Trong trường_hợp này , đoạn đường_bộ trong lòng_đường sắt và từ mép ray ngoài cùng trở ra , đường_bộ dốc 0% trên chiều dài tối_thiểu 25 mét ( m ) . 6 . Đường ngang cấp I , cấp II và đường ngang trong khu dân_cư phải có phần đường dành riêng cho người đi bộ trong phạm_vi đường ngang .', 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n", 396 | "12/24/2021 08:42:15 - INFO - __main__ - Sample 7296 of the training set: {'#1 ID': '472d7ac7531474310d85d5c4801c9518', '#2 ID': '01/2019/tt-bkhđt-->3-->', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 20466, 409, 535, 1799, 4570, 5027, 11, 209, 1398, 114, 2, 2, 432, 107, 5, 3947, 9636, 535, 1799, 4570, 5027, 5, 99, 5, 9636, 535, 1799, 4570, 5027, 646, 27, 3496, 35068, 535, 4, 35068, 6, 1610, 35068, 535, 7, 9, 518, 209, 35, 9, 22635, 4, 39, 22635, 100, 1555, 30, 22635, 100, 
9441, 1121, 1171, 63, 2526, 23, 5, 53681, 585, 535, 28, 150, 518, 42, 53, 27, 1517, 19, 53681, 585, 535, 28, 518, 16460, 257, 112, 63, 35068, 35, 22635, 100, 1555, 65, 2774, 19, 53681, 585, 535, 28, 518, 2884, 4, 14637, 6, 9511, 112, 63, 35068, 35, 22635, 100, 1966, 65, 1894, 19, 53681, 585, 535, 28, 518, 3934, 112, 63, 35068, 35, 22635, 100, 3491, 65, 2158, 19, 53681, 585, 535, 28, 518, 244, 158, 6, 1271, 112, 63, 35068, 35, 22635, 100, 5308, 65, 3494, 19, 53681, 585, 535, 28, 518, 2600, 6, 4298, 112, 63, 35068, 35, 22635, 100, 2754, 65, 1565, 19, 53681, 585, 535, 28, 518, 3357, 6, 2174, 112, 63, 35068, 35, 22635, 100, 9441, 5, 76, 5, 5617, 535, 8, 647, 5027, 98, 4, 214, 2850, 736, 11, 701, 791, 35, 1638, 34, 145, 41, 7, 150, 35068, 535, 1799, 5, 107, 5, 5617, 154, 535, 8, 304, 187, 1530, 5027, 11, 701, 791, 1638, 34, 145, 41, 7, 150, 35068, 535, 1799, 4, 280, 532, 304, 535, 5, 163, 5, 15031, 154, 535, 8, 43, 154, 535, 11, 701, 791, 35, 1638, 34, 145, 732, 7, 150, 35068, 535, 2], 'label': 1, 'sentence1': 'Phương_thức gửi báo_cáo thống_kê Ngành Thống_kê được quy_định như_thế_nào ?', 'sentence2': 'Điều 3 . Nội_dung Chế_độ báo_cáo thống_kê Ngành Thống_kê . 1 . Chế_độ báo_cáo thống_kê Ngành Thống_kê bao_gồm : danh_mục biểu_mẫu báo_cáo , biểu_mẫu và giải_thích biểu_mẫu báo_cáo của các lĩnh_vực quy_định tại các Phụ_lục , từ Phụ_lục số I đến Phụ_lục số VI ban_hành kèm theo Thông_tư này . Biểu_mẫu báo_cáo về từng lĩnh_vực như sau : a ) Biểu_mẫu báo_cáo về lĩnh_vực Tài_khoản quốc_gia thực_hiện theo biểu_mẫu tại Phụ_lục số I ; b ) Biểu_mẫu báo_cáo về lĩnh_vực Nông_nghiệp , Lâm_nghiệp và Thuỷ_sản thực_hiện theo biểu_mẫu tại Phụ_lục số II ; c ) Biểu_mẫu báo_cáo về lĩnh_vực Công_nghiệp thực_hiện theo biểu_mẫu tại Phụ_lục số III ; d ) Biểu_mẫu báo_cáo về lĩnh_vực vốn đầu_tư và Xây_dựng thực_hiện theo biểu_mẫu tại Phụ_lục số IV ; đ ) Biểu_mẫu báo_cáo về lĩnh_vực Thương_mại và Dịch_vụ thực_hiện theo biểu_mẫu tại Phụ_lục số V ; e ) Biểu_mẫu báo_cáo về lĩnh_vực Xã_hội và Môi_trường thực_hiện theo biểu_mẫu tại Phụ_lục số VI . 2 . Đơn_vị báo_cáo là Cục Thống_kê tỉnh , thành_phố trực_thuộc Trung_ương được ghi cụ_thể tại góc trên bên phải của từng biểu_mẫu báo_cáo thống_kê . 3 . Đơn_vị nhận báo_cáo là đơn_vị thuộc Tổng_cục Thống_kê được ghi cụ_thể góc trên bên phải của từng biểu_mẫu báo_cáo thống_kê , dưới dòng đơn_vị báo_cáo . 4 . Thời_hạn nhận báo_cáo là ngày nhận báo_cáo được ghi cụ_thể tại góc trên bên trái của từng biểu_mẫu báo_cáo . 5 . Phương_thức gửi báo_cáo : Cục Thống_kê tỉnh , thành_phố trực_thuộc Trung_ương gửi báo_cáo bằng văn_bản hoặc báo_cáo điện_tử trên hệ_thống đến Tổng_cục Thống_kê theo thời_gian quy_định trên từng biểu_mẫu . Biểu_mẫu báo_cáo bằng văn_bản giấy phải có chữ_ký , đóng_dấu của Thủ_trưởng đơn_vị gửi báo_cáo để thuận_tiện cho việc kiểm_tra , đối_chiếu , xử_lý số_liệu . Biểu_mẫu báo_cáo qua hệ_thống báo_cáo điện_tử được thể_hiện dưới hai hình_thức là định_dạng file pdf của văn_bản giấy hoặc dưới dạng tệp_tin điện_tử được xác_thực bằng chữ_ký số của Thủ_trưởng đơn_vị báo_cáo . 6 . Ký_hiệu biểu Ký_hiệu biểu gồm hai phần : phần số và phần chữ ; phần số được đánh liên_tục từ 001 , 002 , 003 , ... 
; phần chữ được ghi chữ in viết tắt sao cho phù_hợp với từng lĩnh_vực và kỳ báo_cáo ( năm - N ; Quý - Q ; tháng - T ; ) ; lấy chữ BCC ( Báo_cáo Cục ) thể_hiện cho hệ biểu_mẫu báo_cáo thống_kê .', 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n", 397 | "12/24/2021 08:42:15 - INFO - __main__ - Sample 1639 of the training set: {'#1 ID': 'e70faf39a18707bbd53629415ed9f8e2', '#2 ID': '65/2014/tt-bqp-->1-->', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [0, 9636, 470, 1420, 1038, 71, 2056, 190, 6218, 1564, 14, 10, 261, 546, 2469, 11, 209, 1398, 114, 2, 2, 432, 99, 5, 18998, 947, 6, 429, 731, 5, 99, 5, 18998, 947, 4, 429, 731, 27, 2526, 23, 1010, 112, 1150, 470, 1420, 1038, 636, 6, 470, 1420, 1038, 71, 2056, 190, 6218, 1564, 6, 18, 47, 247, 38072, 17, 41, 8, 6218, 4, 863, 603, 20, 53, 97, 328, 8, 6218, 1564, 19, 52, 1666, 1315, 1038, 63, 614, 1038, 53367, 151, 614, 1038, 1533, 706, 38072, 65, 1386, 3391, 1958, 6, 18, 47, 247, 85, 12, 116, 38072, 20, 53, 97, 328, 8, 1386, 3391, 1958, 19, 52, 1666, 1315, 1038, 63, 9, 614, 1038, 1533, 4, 2729, 4, 29801, 2537, 4, 533, 1121, 1171, 63, 1545, 3914, 33956, 15745, 43, 948, 78, 445, 29, 2758, 7, 315, 28, 1150, 4016, 190, 416, 4, 2205, 4, 3391, 6, 4730, 20, 53, 328, 2135, 8, 1545, 3914, 33956, 4432, 31477, 15831, 76, 5, 4507, 17, 731, 27, 9636, 470, 1420, 1038, 636, 6, 470, 1420, 1038, 71, 2056, 209, 35, 2526, 23, 17, 731, 190, 6218, 1564, 14, 10, 261, 886, 533, 40726, 151, 14, 1666, 1315, 1420, 1038, 710, 12, 276, 187, 614, 1038, 6218, 1564, 65, 1386, 3391, 1958, 14, 10, 261, 11719, 151, 14, 1666, 1315, 1420, 1038, 710, 12, 13895, 187, 9, 614, 1038, 1533, 4, 2729, 4, 29801, 2537, 4, 533, 1121, 1171, 63, 1545, 3914, 33956, 4432, 19768, 2, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0, 'sentence1': 'Chế_độ nâng bậc lương trước thời_hạn đối_với quân_nhân chuyên_nghiệp đã có quyết_định chuẩn_bị hưu được quy_định như_thế_nào ?', 'sentence2': 'Điều 1 . Phạm_vi điều_chỉnh và đối_tượng áp_dụng . 1 . 
Phạm_vi điều_chỉnh , đối_tượng áp_dụng : Thông_tư này hướng_dẫn thực_hiện chế_độ nâng bậc lương thường_xuyên và nâng bậc lương trước thời_hạn đối_với quân_nhân chuyên_nghiệp và người làm công_tác cơ_yếu không phải là quân_nhân , công_an nhân_dân ( sau đây gọi là quân_nhân chuyên_nghiệp ) đang xếp hưởng lương theo bảng lương QNCN hoặc bảng lương chuyên_môn kỹ_thuật cơ_yếu ; công_nhân viên_chức quốc_phòng và người làm công_tác khác trong tổ_chức cơ_yếu ( sau đây gọi là công_nhân viên_chức quốc_phòng ) đang xếp hưởng lương theo các bảng lương chuyên_môn , nghiệp_vụ , thừa_hành , phục_vụ ban_hành kèm theo Nghị_định 204/2004/NĐ-CP ngày 14 tháng 12 năm 2004 của Chính_phủ về chế_độ tiền_lương đối_với cán_bộ , công_chức , viên_chức và lực_lượng_vũ_trang ( sau gọi tắt là Nghị_định 204/2004/NĐ-CP). 2 . Đối_tượng không áp_dụng : Chế_độ nâng bậc lương thường_xuyên và nâng bậc lương trước thời_hạn quy_định tại Thông_tư này không áp_dụng đối_với quân_nhân chuyên_nghiệp đã có quyết_định thôi phục_vụ tại_ngũ hoặc đã xếp hưởng bậc lương cuối_cùng trong nhóm thuộc bảng lương quân_nhân chuyên_nghiệp ; công_nhân viên_chức quốc_phòng đã có quyết_định thôi_việc hoặc đã xếp hưởng bậc lương cuối_cùng trong ngạch thuộc các bảng lương chuyên_môn , nghiệp_vụ , thừa_hành , phục_vụ ban_hành kèm theo Nghị_định 204/2004/NĐ-CP.', 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n", 398 | "[INFO|trainer.py:432] 2021-12-24 08:42:26,777 >> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence2, #2 ID, #1 ID, sentence1.\n", 399 | "[INFO|trainer.py:837] 2021-12-24 08:42:26,780 >> ***** Running training *****\n", 400 | "[INFO|trainer.py:838] 2021-12-24 08:42:26,780 >> Num examples = 64471\n", 401 | "[INFO|trainer.py:839] 2021-12-24 08:42:26,780 >> Num Epochs = 5\n", 402 | "[INFO|trainer.py:840] 2021-12-24 08:42:26,780 >> Instantaneous batch size per device = 16\n", 403 | "[INFO|trainer.py:841] 2021-12-24 08:42:26,781 >> Total train batch size (w. 
parallel, distributed & accumulation) = 16\n", 404 | "[INFO|trainer.py:842] 2021-12-24 08:42:26,781 >> Gradient Accumulation steps = 1\n", 405 | "[INFO|trainer.py:843] 2021-12-24 08:42:26,781 >> Total optimization steps = 20150\n", 406 | " 2% 500/20150 [11:45<7:41:22, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 08:54:12,294 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-500\n", 407 | "[INFO|configuration_utils.py:304] 2021-12-24 08:54:12,298 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-500/config.json\n", 408 | "[INFO|modeling_utils.py:817] 2021-12-24 08:54:13,782 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-500/pytorch_model.bin\n", 409 | " 5% 1000/20150 [23:37<7:29:56, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:06:04,682 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-1000\n", 410 | "[INFO|configuration_utils.py:304] 2021-12-24 09:06:04,684 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1000/config.json\n", 411 | "[INFO|modeling_utils.py:817] 2021-12-24 09:06:06,238 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1000/pytorch_model.bin\n", 412 | "[INFO|trainer.py:1467] 2021-12-24 09:06:11,830 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-500] due to args.save_total_limit\n", 413 | " 7% 1500/20150 [35:30<7:18:26, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:17:57,165 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-1500\n", 414 | "[INFO|configuration_utils.py:304] 2021-12-24 09:17:57,167 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1500/config.json\n", 415 | "[INFO|modeling_utils.py:817] 2021-12-24 09:17:58,685 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1500/pytorch_model.bin\n", 416 | "[INFO|trainer.py:1467] 2021-12-24 09:18:04,969 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-1000] due to args.save_total_limit\n", 417 | " 10% 2000/20150 [47:23<7:05:25, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:29:50,298 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-2000\n", 418 | "[INFO|configuration_utils.py:304] 2021-12-24 09:29:50,300 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2000/config.json\n", 419 | "[INFO|modeling_utils.py:817] 2021-12-24 09:29:51,865 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2000/pytorch_model.bin\n", 420 | "[INFO|trainer.py:1467] 2021-12-24 09:29:57,839 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-1500] due to args.save_total_limit\n", 421 | " 12% 2500/20150 [59:17<6:53:57, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:41:44,472 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-2500\n", 422 | "[INFO|configuration_utils.py:304] 2021-12-24 09:41:44,474 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2500/config.json\n", 423 | "[INFO|modeling_utils.py:817] 2021-12-24 09:41:46,178 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2500/pytorch_model.bin\n", 424 | "[INFO|trainer.py:1467] 2021-12-24 09:41:52,026 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-2000] due to args.save_total_limit\n", 425 | " 15% 3000/20150 [1:11:10<6:42:10, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:53:37,468 >> Saving model 
checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-3000\n", 426 | "[INFO|configuration_utils.py:304] 2021-12-24 09:53:37,470 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3000/config.json\n", 427 | "[INFO|modeling_utils.py:817] 2021-12-24 09:53:39,073 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3000/pytorch_model.bin\n", 428 | "[INFO|trainer.py:1467] 2021-12-24 09:53:44,894 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-2500] due to args.save_total_limit\n", 429 | " 17% 3500/20150 [1:23:00<6:28:14, 1.40s/it][INFO|trainer.py:1408] 2021-12-24 10:05:27,782 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-3500\n", 430 | "[INFO|configuration_utils.py:304] 2021-12-24 10:05:27,786 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3500/config.json\n", 431 | "[INFO|modeling_utils.py:817] 2021-12-24 10:05:29,457 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3500/pytorch_model.bin\n", 432 | "[INFO|trainer.py:1467] 2021-12-24 10:05:35,344 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-3000] due to args.save_total_limit\n", 433 | " 20% 4000/20150 [1:34:53<6:20:10, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 10:17:19,997 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-4000\n", 434 | "[INFO|configuration_utils.py:304] 2021-12-24 10:17:19,999 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4000/config.json\n", 435 | "[INFO|modeling_utils.py:817] 2021-12-24 10:17:21,585 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4000/pytorch_model.bin\n", 436 | "[INFO|trainer.py:1467] 2021-12-24 10:17:27,902 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-3500] due to args.save_total_limit\n", 437 | " 22% 4500/20150 [1:46:44<6:06:33, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 10:29:11,166 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-4500\n", 438 | "[INFO|configuration_utils.py:304] 2021-12-24 10:29:11,168 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4500/config.json\n", 439 | "[INFO|modeling_utils.py:817] 2021-12-24 10:29:12,764 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4500/pytorch_model.bin\n", 440 | "[INFO|trainer.py:1467] 2021-12-24 10:29:20,512 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-4000] due to args.save_total_limit\n", 441 | " 23% 4588/20150 [1:48:57<6:04:03, 1.40s/it]" 442 | ] 443 | } 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "source": [ 449 | "!cd legal_text_retrieval && python src/infer.py" 450 | ], 451 | "metadata": { 452 | "id": "YCs6_efIY7hM" 453 | }, 454 | "execution_count": null, 455 | "outputs": [] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "source": [ 460 | "" 461 | ], 462 | "metadata": { 463 | "id": "nwuEWf4f0aSz" 464 | }, 465 | "execution_count": null, 466 | "outputs": [] 467 | } 468 | ] 469 | } --------------------------------------------------------------------------------