├── images ├── coliee3.drawio.png ├── length_fullA_hist.png └── length_train_A_hist.png ├── requirements.txt ├── scripts ├── run_predict.sh ├── run_train.sh ├── run_finetune_bert.sh └── legal_text_retrieval.ipynb ├── Dockerfile ├── LICENSE ├── src ├── stopwords_tfidf_generator.py ├── tfidf_classifier.py ├── custom_rank_bm25.py ├── data_generator.py ├── infer.py ├── run_glue.py └── utils.py ├── .gitignore └── README.md /images/coliee3.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phuongnm94/legal_text_retrieval/HEAD/images/coliee3.drawio.png -------------------------------------------------------------------------------- /images/length_fullA_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phuongnm94/legal_text_retrieval/HEAD/images/length_fullA_hist.png -------------------------------------------------------------------------------- /images/length_train_A_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phuongnm94/legal_text_retrieval/HEAD/images/length_train_A_hist.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | nltk 3 | gensim 4 | pandas 5 | fugashi 6 | mecab-python3 7 | unidic-lite 8 | transformers==4.3.2 9 | openpyxl 10 | torch 11 | xlwt 12 | datasets 13 | sentencepiece 14 | protobuf 15 | rank_bm25 16 | seaborn 17 | vncorenlp -------------------------------------------------------------------------------- /scripts/run_predict.sh: -------------------------------------------------------------------------------- 1 | 2 | # this command will not run if do not mount private test - and infer on public test 3 | cp /data/private_test_question.json /app/data/zac2021-ltr-data/public_test_question.json 4 | 5 | # this command will run infer the test file and get the output 6 | cd /app && \ 7 | python3 src/infer.py && \ 8 | cp data/result_prediction.json /result/submission.csv 9 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # FROM nvidia/cuda:10.2-base 2 | FROM nvidia/cuda:11.0-base 3 | 4 | 5 | CMD nvidia-smi 6 | 7 | WORKDIR /app 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | 10 | #set up environment 11 | RUN apt update && apt install -y --no-install-recommends \ 12 | tzdata \ 13 | default-jdk \ 14 | git \ 15 | build-essential \ 16 | python3-dev \ 17 | python3-pip \ 18 | python3-setuptools \ 19 | unzip 20 | 21 | RUN pip3 -q install pip --upgrade 22 | 23 | # install python environments 24 | COPY requirements.txt /app/requirements.txt 25 | RUN pip3 install -r /app/requirements.txt 26 | 27 | RUN git clone https://github.com/vncorenlp/VnCoreNLP vncorenlp_data 28 | 29 | #copies the applicaiton from local path to container path 30 | COPY . 
/app 31 | 32 | RUN cd /app/data && unzip zac2021-ltr-data.zip 33 | 34 | CMD ["bash", "/app/scripts/run_train.sh", "&&", "bash", "/app/scripts/run_predict.sh"] -------------------------------------------------------------------------------- /scripts/run_train.sh: -------------------------------------------------------------------------------- 1 | cd /app && mkdir settings 2 | cd /app && mkdir data 3 | USER=root 4 | 5 | # generate data 6 | cd /app && mkdir data/zalo-tfidfbm25150-full 7 | cd /app && python3 src/data_generator.py --path_folder_base data/zac2021-ltr-data/ --test_file public_test_question.json --topk 150 --type_data task3 --tok --path_output_dir data/zalo-tfidfbm25150-full 8 | 9 | 10 | # train phoBERT 11 | cd /app/scripts && bash run_finetune_bert.sh $USER vinai/phobert-base ../ data/zalo-tfidfbm25150-full Tfbm150E5-full 5 12 | 13 | 14 | # generate data 15 | cd /app && mkdir data/zalo-tfidfngrbm25150-notok-full 16 | cd /app && python3 src/data_generator.py --path_folder_base data/zac2021-ltr-data/ --test_file public_test_question.json --topk 150 --type_data task3 --path_output_dir data/zalo-tfidfngrbm25150-notok-full 17 | 18 | # train NLPHust 19 | cd /app/scripts && bash run_finetune_bert.sh $USER NlpHUST/electra-base-vn ../ data/zalo-tfidfngrbm25150-notok-full NlpHTfbmngr150E5-notok-full 5 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nguyễn Minh Phương 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
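For reference, the two-stage pipeline wired together by the `Dockerfile` and `scripts/run_train.sh` above (pair generation with the TF-IDF + BM25 retriever, then fine-tuning) can also be driven from Python. A minimal sketch, assuming the container layout used by those scripts; the arguments simply mirror the PhoBERT branch of `run_train.sh`:

```python
# Illustrative sketch only: replays the PhoBERT branch of scripts/run_train.sh.
import subprocess

def run(cmd, cwd="/app"):
    # Run one pipeline step and stop on the first failure.
    subprocess.run(cmd, cwd=cwd, check=True)

# 1) generate train/dev/test pairs with the TF-IDF + BM25 retriever (top 150 articles)
run(["python3", "src/data_generator.py",
     "--path_folder_base", "data/zac2021-ltr-data/",
     "--test_file", "public_test_question.json",
     "--topk", "150", "--type_data", "task3", "--tok",
     "--path_output_dir", "data/zalo-tfidfbm25150-full"])

# 2) fine-tune PhoBERT on the generated pairs for 5 epochs
run(["bash", "run_finetune_bert.sh", "root", "vinai/phobert-base",
     "../", "data/zalo-tfidfbm25150-full", "Tfbm150E5-full", "5"],
    cwd="/app/scripts")
```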
22 | -------------------------------------------------------------------------------- /scripts/run_finetune_bert.sh: -------------------------------------------------------------------------------- 1 | USER=${1:-"phuongnm"} 2 | MODEL_NAME=${2:-"../settings/phobert-t3042/models/"} # vinai/phobert-base 3 | ROOT_DIR=${3:-"../"} 4 | DATA_DIR=${4:-"data/zalo-tfidf30/"} 5 | SETTING_NAME=${5:-"phobert-t30"} 6 | NUM_EPOCH=${6:-5} 7 | LR=${7:-1e-5} 8 | NUM_LABEL=${8:-2} 9 | MAX_LEN=${9:-512} 10 | 11 | CODE_DIR="${ROOT_DIR}/src/" 12 | DATA_DIR="${ROOT_DIR}/$DATA_DIR" 13 | 14 | SCRIPT_DIR=$(pwd) 15 | 16 | for iSEED in {42..42} 17 | do 18 | SETTING_NAME_SEED=${SETTING_NAME}${iSEED} 19 | SETTING_DIR="${ROOT_DIR}/settings/${SETTING_NAME_SEED}/" 20 | MODEL_OUT="${SETTING_DIR}/models" 21 | mkdir $SETTING_DIR 22 | cp ${SCRIPT_DIR}/run_train.sh $SETTING_DIR 23 | 24 | 25 | cd $CODE_DIR && python3 ./run_glue.py \ 26 | --model_name_or_path $MODEL_NAME \ 27 | --do_train \ 28 | --eval_steps 200 \ 29 | --do_predict \ 30 | --num_label ${NUM_LABEL} \ 31 | --seed ${iSEED} \ 32 | --train_file $DATA_DIR/train.csv \ 33 | --validation_file $DATA_DIR/dev.csv \ 34 | --test_file $DATA_DIR/test.csv \ 35 | --max_seq_length $MAX_LEN \ 36 | --per_device_train_batch_size 16 \ 37 | --learning_rate $LR \ 38 | --warmup_steps 0 \ 39 | --num_train_epochs $NUM_EPOCH \ 40 | --save_total_limit 1 \ 41 | --logging_dir $MODEL_OUT/tensorboard --logging_steps 200 \ 42 | --output_dir $MODEL_OUT --overwrite_output_dir \ 43 | |tee $SETTING_DIR/train.log 44 | done 45 | -------------------------------------------------------------------------------- /src/stopwords_tfidf_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from tfidf_classifier import do_classify 5 | from utils import load_data_kse 6 | 7 | 8 | def do_generate_stopwords(path_folder_base="../coliee3_2020/", threshold=0.02, tokenizer=None, data=None): 9 | c_docs, c_keys, dev_q, test_q, train_q = load_data_kse( 10 | path_folder_base=path_folder_base) if data is None else data 11 | 12 | _, (test_similarities, c_vect, test_q_vect, vectorizer) = do_classify(c_docs, c_keys, test_q, 13 | stopwords_=None, 14 | topk=150, 15 | tokenizer=tokenizer) 16 | # generate stop words 17 | stop_words_idx = [] 18 | for doc_vect in c_vect: 19 | for word_idx in doc_vect.indices: 20 | if doc_vect[0, word_idx] < threshold and word_idx not in stop_words_idx: 21 | stop_words_idx.append(word_idx) 22 | 23 | stop_words = [vectorizer.get_feature_names()[w_idx] 24 | for w_idx in stop_words_idx] 25 | path_folder_data_out = "{}/stopwords/".format(path_folder_base) 26 | if not os.path.exists(path_folder_data_out): 27 | os.mkdir(path_folder_data_out) 28 | json.dump(stop_words, open( 29 | "{}/stopwords.json".format(path_folder_data_out), "wt"), ensure_ascii=False) 30 | 31 | 32 | if __name__ == "__main__": 33 | do_generate_stopwords() 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .history/ 2 | data/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 
30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /src/tfidf_classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from tqdm import tqdm 4 | from custom_rank_bm25 import BM25Plus 5 | import numpy as np 6 | 7 | from utils import Article, combine_idxs, evaluate, evaluate_idx, load_data_kse, standardize_data 8 | import pickle, json 9 | 10 | 11 | def do_classify(c_docs_, c_keys_, test_q_, stopwords_=None, topk=150, vectorizer=None, tokenizer=None, combine_score=False, c_vect=None): 12 | # check old system 13 | if vectorizer is not None and isinstance(vectorizer, TfidfVectorizer): 14 | return do_classify_old(c_docs_, c_keys_, test_q_, stopwords_=stopwords_, topk=topk, vectorizer=vectorizer, tokenizer=tokenizer) 15 | 16 | # new system 17 | c_docs_ = [standardize_data(d) for d in c_docs_] 18 | if vectorizer is None: 19 | print("[W] Learning Tfidf Vectorizer ...") 20 | tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords_, tokenizer=tokenizer, ngram_range=(1,2)) 21 | tfidf_vectorizer.fit(c_docs_) 22 | 23 | print("[W] Learning BM25 Vectorizer ...") 24 | bm25_scorer = BM25Plus([d.split(" ") for d in c_docs_]) 25 | 26 | vectorizer = (tfidf_vectorizer, bm25_scorer) 27 | else: 28 | tfidf_vectorizer, bm25_scorer = vectorizer[0], vectorizer[1] 29 | 30 | # get cosin score from tfidf vector 31 | if c_vect is None: 
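# note: when c_vect is None the corpus TF-IDF matrix is rebuilt from scratch; callers that
# score repeatedly against the same corpus (e.g. src/infer.py) pass a precomputed c_vect
# to skip this transform.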
32 | c_vect = tfidf_vectorizer.transform(c_docs_) 33 | test_q_docs = [standardize_data(q.content) for q in test_q_] 34 | test_q_vect = tfidf_vectorizer.transform(test_q_docs) 35 | tfidf_cosine_score = cosine_similarity(test_q_vect, c_vect) 36 | 37 | # get bm25 score 38 | bm25_similarities = [] 39 | for query in tqdm(test_q_docs): 40 | bm25_similarities.append(bm25_scorer.get_scores(query.split(" "))) 41 | bm25_similarities = np.array(bm25_similarities) 42 | 43 | # combine score 44 | if tokenizer is None: 45 | final_score = 0.33*tfidf_cosine_score+ 0.67*bm25_similarities/np.max(bm25_similarities) 46 | preds = evaluate(final_score, test_q_, topk=topk, c_keys=c_keys_) 47 | else: 48 | idx_tfidf = tfidf_cosine_score.argsort()[:, ::-1][:, :topk] 49 | idx_bm25 = bm25_similarities.argsort()[:, ::-1][:, :topk] 50 | preds = combine_idxs(idx_tfidf, idx_bm25, topk) 51 | preds = evaluate_idx(preds, test_q_, c_keys=c_keys_) 52 | 53 | return preds, ((tfidf_cosine_score, bm25_similarities), c_vect, test_q_vect, vectorizer) 54 | 55 | 56 | 57 | def do_classify_old(c_docs_, c_keys_, test_q_, stopwords_=None, topk=150, vectorizer=None, tokenizer=None): 58 | if vectorizer is None: 59 | print("[W] Learning Tfidf Vectorizer ...") 60 | vectorizer = TfidfVectorizer(stop_words=stopwords_, tokenizer=tokenizer) 61 | vectorizer.fit(c_docs_) 62 | c_vect = vectorizer.transform(c_docs_) 63 | 64 | test_q_docs = [q.content for q in test_q_] 65 | test_q_vect = vectorizer.transform(test_q_docs) 66 | test_similarities = cosine_similarity(test_q_vect, c_vect) 67 | test_pred = evaluate(test_similarities, test_q_, topk=topk, c_keys=c_keys_) 68 | return test_pred, (test_similarities, c_vect, test_q_vect, vectorizer) 69 | 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # legal-text-retrieval 2 | ## Overview 3 | This system consists of 2 steps: 4 | - generate training data containing negative samples mined with a mixture score of TF-IDF cosine similarity + BM25 (using the top 150 most similar law articles) 5 | - fine-tune a PhoBERT model (and, optionally, an NlpHUST model) on the generated data 6 | 7 | ![thissys](images/coliee3.drawio.png) 8 | ## Environments 9 | ```bash 10 | git clone https://github.com/vncorenlp/VnCoreNLP.git vncorenlp_data # for the VnCoreNLP tokenizer 11 | 12 | conda create -n legal_retrieval_env python=3.8 13 | conda activate legal_retrieval_env 14 | pip install -r requirements.txt 15 | ``` 16 | ## Run 17 | 18 | 1. Generate data from the folder `data/zac2021-ltr-data/`, which contains `public_test_question.json` and `train_question_answer.json` 19 | 20 | 21 | ```bash 22 | python3 src/data_generator.py --path_folder_base data/zac2021-ltr-data/ --test_file public_test_question.json --topk 150 --tok --path_output_dir data/zalo-tfidfbm25150-full 23 | ``` 24 | >Note: 25 | > - `--test_file public_test_question.json` is optional; if it is omitted, a random 33% of `train_question_answer.json` is used as the test set 26 | > - `--path_output_dir` is the folder where the 3 output files (`train.csv`, `dev.csv`, `test.csv`) and the TF-IDF classifier (`tfidf_classifier.pkl`) for the top-k most relevant documents are saved. 27 | 28 | 2. Train the model 29 | ```bash 30 | bash scripts/run_finetune_bert.sh "magic" vinai/phobert-base ../ data/zalo-tfidfbm25150-full Tfbm150E5-full 5 31 | ``` 32 | 33 | 34 | 3.
Predict 35 | ```bash 36 | python3 src/infer.py 37 | ``` 38 | >Note: 39 | > This script loads the model(s) and runs prediction; please check and adjust the variable `model_configs` in `src/infer.py` as needed. 40 | 41 | 42 | Try our example on Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/phuongnm-bkhn/legal_text_retrieval/blob/master/scripts/legal_text_retrieval.ipynb) 43 | 44 | 45 | ## License 46 | MIT-licensed. 47 | 48 | ## Citation 49 | 50 | Please cite as: 51 | 52 | ```bibtex 53 | @article{DBLP:journals/corr/abs-2106-13405, 54 | author = {Ha{-}Thanh Nguyen and 55 | Phuong Minh Nguyen and 56 | Thi{-}Hai{-}Yen Vuong and 57 | Quan Minh Bui and 58 | Chau Minh Nguyen and 59 | Tran Binh Dang and 60 | Vu Tran and 61 | Minh Le Nguyen and 62 | Ken Satoh}, 63 | title = {{JNLP} Team: Deep Learning Approaches for Legal Processing Tasks in 64 | {COLIEE} 2021}, 65 | journal = {CoRR}, 66 | volume = {abs/2106.13405}, 67 | year = {2021}, 68 | url = {https://arxiv.org/abs/2106.13405}, 69 | eprinttype = {arXiv}, 70 | eprint = {2106.13405}, 71 | biburl = {https://dblp.org/rec/journals/corr/abs-2106-13405.bib}, 72 | bibsource = {dblp computer science bibliography, https://dblp.org} 73 | } 74 | ``` 75 | ```bibtex 76 | @article{DBLP:journals/corr/abs-2011-08071, 77 | author = {Ha{-}Thanh Nguyen and 78 | Hai{-}Yen Thi Vuong and 79 | Phuong Minh Nguyen and 80 | Tran Binh Dang and 81 | Quan Minh Bui and 82 | Vu Trong Sinh and 83 | Chau Minh Nguyen and 84 | Vu D. Tran and 85 | Ken Satoh and 86 | Minh Le Nguyen}, 87 | title = {{JNLP} Team: Deep Learning for Legal Processing in {COLIEE} 2020}, 88 | journal = {CoRR}, 89 | volume = {abs/2011.08071}, 90 | year = {2020}, 91 | url = {https://arxiv.org/abs/2011.08071}, 92 | eprinttype = {arXiv}, 93 | eprint = {2011.08071}, 94 | biburl = {https://dblp.org/rec/journals/corr/abs-2011-08071.bib}, 95 | bibsource = {dblp computer science bibliography, https://dblp.org} 96 | } 97 | ``` 98 | -------------------------------------------------------------------------------- /src/custom_rank_bm25.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import math 4 | import numpy as np 5 | from multiprocessing import Pool, cpu_count 6 | 7 | """ 8 | All of these algorithms have been taken from the paper: 9 | Trotman et al., Improvements to BM25 and Language Models Examined 10 | Here we implement all the BM25 variations mentioned.
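Note: this adapted copy additionally caches the per-term document-frequency vectors (q_freq_cache) in BM25Plus.get_scores, so scoring many queries against the same corpus does not recompute them.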
11 | """ 12 | 13 | 14 | class BM25: 15 | def __init__(self, corpus, tokenizer=None): 16 | self.corpus_size = len(corpus) 17 | self.avgdl = 0 18 | self.doc_freqs = [] 19 | self.idf = {} 20 | self.doc_len = [] 21 | self.tokenizer = tokenizer 22 | self.q_freq_cache = {} 23 | 24 | if tokenizer: 25 | corpus = self._tokenize_corpus(corpus) 26 | 27 | nd = self._initialize(corpus) 28 | self._calc_idf(nd) 29 | 30 | def _initialize(self, corpus): 31 | print("Loading BM25 model ...") 32 | nd = {} # word -> number of documents with word 33 | num_doc = 0 34 | for document in corpus: 35 | self.doc_len.append(len(document)) 36 | num_doc += len(document) 37 | 38 | frequencies = {} 39 | for word in document: 40 | if word not in frequencies: 41 | frequencies[word] = 0 42 | frequencies[word] += 1 43 | self.doc_freqs.append(frequencies) 44 | 45 | for word, freq in frequencies.items(): 46 | try: 47 | nd[word]+=1 48 | except KeyError: 49 | nd[word] = 1 50 | 51 | self.avgdl = num_doc / self.corpus_size 52 | return nd 53 | 54 | def _tokenize_corpus(self, corpus): 55 | pool = Pool(cpu_count()) 56 | tokenized_corpus = pool.map(self.tokenizer, corpus) 57 | return tokenized_corpus 58 | 59 | def _calc_idf(self, nd): 60 | raise NotImplementedError() 61 | 62 | def get_scores(self, query): 63 | raise NotImplementedError() 64 | 65 | def get_batch_scores(self, query, doc_ids): 66 | raise NotImplementedError() 67 | 68 | def get_top_n(self, query, documents, n=5): 69 | 70 | assert self.corpus_size == len(documents), "The documents given don't match the index corpus!" 71 | 72 | scores = self.get_scores(query) 73 | top_n = np.argsort(scores)[::-1][:n] 74 | return [documents[i] for i in top_n] 75 | 76 | 77 | class BM25Okapi(BM25): 78 | def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25): 79 | self.k1 = k1 80 | self.b = b 81 | self.epsilon = epsilon 82 | super().__init__(corpus, tokenizer) 83 | 84 | def _calc_idf(self, nd): 85 | """ 86 | Calculates frequencies of terms in documents and in corpus. 87 | This algorithm sets a floor on the idf values to eps * average_idf 88 | """ 89 | # collect idf sum to calculate an average idf for epsilon value 90 | idf_sum = 0 91 | # collect words with negative idf to set them a special epsilon value. 92 | # idf can be negative if word is contained in more than half of documents 93 | negative_idfs = [] 94 | for word, freq in nd.items(): 95 | idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5) 96 | self.idf[word] = idf 97 | idf_sum += idf 98 | if idf < 0: 99 | negative_idfs.append(word) 100 | self.average_idf = idf_sum / len(self.idf) 101 | 102 | eps = self.epsilon * self.average_idf 103 | for word in negative_idfs: 104 | self.idf[word] = eps 105 | 106 | def get_scores(self, query): 107 | """ 108 | The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores, 109 | this algorithm also adds a floor to the idf value of epsilon. 110 | See [Trotman, A., X. Jia, M. 
Crane, Towards an Efficient and Effective Search Engine] for more info 111 | :param query: 112 | :return: 113 | """ 114 | score = np.zeros(self.corpus_size) 115 | doc_len = np.array(self.doc_len) 116 | for q in query: 117 | q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) 118 | score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) / 119 | (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))) 120 | return score 121 | 122 | def get_batch_scores(self, query, doc_ids): 123 | """ 124 | Calculate bm25 scores between query and subset of all docs 125 | """ 126 | assert all(di < len(self.doc_freqs) for di in doc_ids) 127 | score = np.zeros(len(doc_ids)) 128 | doc_len = np.array(self.doc_len)[doc_ids] 129 | for q in query: 130 | q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) 131 | score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) / 132 | (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))) 133 | return score.tolist() 134 | 135 | 136 | class BM25Plus(BM25): 137 | def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1): 138 | # Algorithm specific parameters 139 | self.k1 = k1 140 | self.b = b 141 | self.delta = delta 142 | super().__init__(corpus, tokenizer) 143 | 144 | def _calc_idf(self, nd): 145 | for word, freq in nd.items(): 146 | idf = math.log((self.corpus_size + 1) / freq) 147 | self.idf[word] = idf 148 | 149 | def get_scores(self, query): 150 | score = np.zeros(self.corpus_size) 151 | doc_len = np.array(self.doc_len) 152 | for q in query: 153 | if q in self.q_freq_cache: 154 | q_freq = self.q_freq_cache[q] 155 | else: 156 | q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) 157 | self.q_freq_cache[q] = q_freq 158 | score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) / 159 | (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)) 160 | return score 161 | 162 | def get_batch_scores(self, query, doc_ids): 163 | """ 164 | Calculate bm25 scores between query and subset of all docs 165 | """ 166 | assert all(di < len(self.doc_freqs) for di in doc_ids) 167 | score = np.zeros(len(doc_ids)) 168 | doc_len = np.array(self.doc_len)[doc_ids] 169 | for q in query: 170 | q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) 171 | score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) / 172 | (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)) 173 | return score.tolist() 174 | 175 | -------------------------------------------------------------------------------- /src/data_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pickle 4 | import re 5 | import traceback 6 | from typing import List 7 | import random 8 | 9 | import pandas as pd 10 | import argparse 11 | 12 | from stopwords_tfidf_generator import do_generate_stopwords 13 | from tfidf_classifier import do_classify 14 | from utils import Question, load_data_kse, postag_filter 15 | from vncorenlp import VnCoreNLP 16 | 17 | tokenizer_obj = VnCoreNLP("vncorenlp_data/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m') 18 | 19 | def vi_tokenize(str_in): 20 | # return str_in 21 | str_in = " ".join(str_in.split(" ")[:512]) 22 | sentences = [" ".join(s) for s in tokenizer_obj.tokenize(str_in)] 23 | 24 | return " ".join(sentences) 25 | 26 | def generate_pair_inputs(data_pred, data_gold, _c_keys, append_gold=False, sub_doc_info=None): 27 | _data_pairs_id = [] 28 | _, _, 
sub_key_mapping = sub_doc_info or (None, None, {}) 29 | 30 | for i in range(data_pred.shape[0]): 31 | cur_pred = [_c_keys[idx] for idx in data_pred[i]] 32 | cur_label = [a.get_id() for a in data_gold[i].relevant_a] 33 | q_id = data_gold[i].id 34 | 35 | if append_gold: 36 | for id_civil_lb in cur_label: 37 | if id_civil_lb not in cur_pred: 38 | cur_pred = cur_pred + [id_civil_lb] 39 | 40 | for j, id_civil_pred in enumerate(cur_pred): 41 | check_lb = id_civil_pred in cur_label 42 | _data_pairs_id.append(((id_civil_pred, q_id), check_lb)) 43 | 44 | # append sub articles (by chunking) 45 | for id_c in sub_key_mapping.get(id_civil_pred, []): 46 | _data_pairs_id.append(((id_c, q_id), check_lb)) 47 | 48 | print('Number data pairs: ', len(_data_pairs_id)) 49 | return _data_pairs_id 50 | 51 | 52 | def aggregate_sentence_pairs(_c_docs, _c_keys, _data_pairs_id, _q: List[Question], plus_filter_postags=False, filter_lb=False, 53 | empty_article_id="None", sub_doc_info=None): 54 | _new_dataset = [] 55 | _q_map = dict((q.id, q.content) for q in _q) 56 | empty_article_content = "" 57 | _c_docs = _c_docs + [empty_article_content] 58 | _c_keys = _c_keys + [empty_article_id] 59 | 60 | _c_sub_docs, _c_sub_keys, _ = sub_doc_info or (None, None, {}) 61 | _c_docs = _c_docs + (_c_sub_docs if _c_sub_docs is not None else []) 62 | _c_keys = _c_keys + (_c_sub_keys if _c_sub_keys is not None else []) 63 | 64 | for (id_civil_pred, q_id), lb in _data_pairs_id: 65 | try: 66 | _new_dataset.append({ 67 | "id": [id_civil_pred, q_id], 68 | "c_code": _c_docs[_c_keys.index(id_civil_pred)], 69 | "query": _q_map[q_id], 70 | 'label': lb 71 | }) 72 | 73 | if plus_filter_postags: 74 | if filter_lb and lb: 75 | _new_dataset.append({ 76 | "id": [id_civil_pred + "_pos_filtered", q_id], 77 | "c_code": postag_filter(_c_docs[_c_keys.index(id_civil_pred)]), 78 | "query": _q_map[q_id], 79 | 'label': lb 80 | }) 81 | if not filter_lb: 82 | _new_dataset.append({ 83 | "id": [id_civil_pred + "_pos_filtered", q_id], 84 | "c_code": postag_filter(_c_docs[_c_keys.index(id_civil_pred)]), 85 | "query": _q_map[q_id], 86 | 'label': lb 87 | }) 88 | except Exception as e: 89 | traceback.print_stack() 90 | print(e) 91 | print("[Err] {}".format(((id_civil_pred, q_id), lb))) 92 | return _new_dataset 93 | 94 | 95 | def aggregate_sentence_pairs_task5(_data_pairs_id, _q: List[Question]): 96 | _new_dataset = [] 97 | _q_map = dict((q.id, q.content) for q in _q) 98 | 99 | for q_id, lb in _data_pairs_id: 100 | _new_dataset.append({ 101 | "id": q_id, 102 | "query": _q_map[q_id], 103 | 'label': lb 104 | }) 105 | return _new_dataset 106 | 107 | 108 | def gen_mrpc_data(coliee_data_, file_path): 109 | data = { 110 | "label": [], 111 | "#1 ID": [], 112 | "#2 ID": [], 113 | "sentence1": [], 114 | "sentence2": [], 115 | } 116 | for e in coliee_data_: 117 | data['label'].append(1 if e['label'] else 0) 118 | data['#1 ID'].append(e['id'][1]) 119 | data['#2 ID'].append(e['id'][0]) 120 | data['sentence1'].append(e['query'].replace('\n', " ")) 121 | data['sentence2'].append(e['c_code'].replace('\n', " ")) 122 | df = pd.DataFrame(data=data) 123 | df.to_csv(file_path, index=False, sep=',') 124 | 125 | 126 | def gen_cola_data(coliee_data_, file_path): 127 | data = { 128 | "sentence": [], 129 | "label": [], 130 | "id": [], 131 | } 132 | for e in coliee_data_: 133 | data['label'].append(1 if e['label'] else 0) 134 | data['sentence'].append(e['query'].replace('\n', " ")) 135 | data['id'].append(e['id'].replace('\n', " ")) 136 | df = pd.DataFrame(data=data) 137 | df.to_csv(file_path, 
index=False, sep=',') 138 | 139 | if __name__ == "__main__": 140 | parser = argparse.ArgumentParser() 141 | parser.add_argument('--path_folder_base', 142 | action="store", dest="path_folder_base", 143 | help="path folder saving data", default='path/to/path_folder_base') 144 | parser.add_argument('--path_output_dir', 145 | action="store", dest="path_output_dir", 146 | help="path folder saving output data", default='path/to/path_output_dir') 147 | parser.add_argument('--type_data', 148 | action="store", dest="type_data", 149 | help="type data for generating process: task3 | task4", default='task3') 150 | parser.add_argument('--test_file', 151 | action="store", dest="test_file", type=str, 152 | help="path to the test file", default=None) 153 | parser.add_argument('--topk', 154 | action="store", dest="topk", type=int, 155 | help="topk select by tfidf when generating data", default=150) 156 | parser.add_argument('--only_test', 157 | action="store_true", dest="only_test", 158 | help="just generate testing data", default=False) 159 | parser.add_argument('--chunk_content_size', 160 | action="store", dest="chunk_content_size", type=int, 161 | help="chunk content of article with size", default=0) 162 | parser.add_argument('--chunk_content_stride', 163 | action="store", dest="chunk_content_stride", type=int, 164 | help="chunk content of article with stride", default=0) 165 | parser.add_argument('--tok', 166 | action="store_true", dest="tok", 167 | help="run tokenize", default=False) 168 | 169 | options = parser.parse_args() 170 | tokenizer = vi_tokenize if options.tok else None 171 | 172 | path_folder_base = options.path_folder_base 173 | topk_select = options.topk 174 | 175 | chunk_content_info = [options.chunk_content_size, 176 | options.chunk_content_stride] \ 177 | if options.chunk_content_size > 0 and options.chunk_content_stride > 0 else None 178 | 179 | test_ids = None 180 | if options.test_file is not None: 181 | test_data = json.load(open("{}/{}".format(path_folder_base, options.test_file))) 182 | if 'items' in test_data: 183 | test_data = test_data['items'] 184 | test_ids = [s["question_id"] for s in test_data] 185 | 186 | path_data_cached = '{}/tokenized_data_cached.pkl'.format(options.path_output_dir) 187 | if os.path.isfile(path_data_cached): 188 | print ("Load cached file data: {}".format(path_data_cached)) 189 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = pickle.load(open(path_data_cached, 'rb')) 190 | else: 191 | print ("Load data and tokenize data") 192 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = load_data_kse( 193 | path_folder_base=path_folder_base, ids_test=test_ids, tokenizer=tokenizer, testing_data=options.test_file, 194 | chunk_content_info=chunk_content_info 195 | ) 196 | try: 197 | pickle.dump((c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info), open(path_data_cached, 'wb')) 198 | except Exception as e: 199 | print(e) 200 | 201 | c_docs_raw = sub_doc_info[3] 202 | sub_doc_info = sub_doc_info[:3] 203 | 204 | # test_q = train_q 205 | if len(dev_q) == 0: 206 | dev_q = train_q 207 | if len(test_q) == 0: 208 | test_q = train_q 209 | 210 | stopwords = None 211 | 212 | # build tfidf vectorizer and generate pair sentence for training process 213 | # if text is tokenized, not combine tfidf with bm25, otherwise combine 214 | if not options.only_test: 215 | train_pred, (_, _, _, vectorizer) = do_classify(c_docs, c_keys, train_q, 216 | stopwords_=stopwords, topk=topk_select, tokenizer=tokenizer, combine_score=(not options.tok)) 217 | train_data_pairs_id = 
generate_pair_inputs(_c_keys=c_keys, data_pred=train_pred, data_gold=train_q, 218 | append_gold=True, sub_doc_info=sub_doc_info) 219 | else: 220 | train_data_pairs_id = [] 221 | 222 | test_pred, _ = do_classify( 223 | c_docs, c_keys, test_q, vectorizer=vectorizer, topk=topk_select, tokenizer=tokenizer, combine_score=(not options.tok)) 224 | test_data_pairs_id = generate_pair_inputs( 225 | _c_keys=c_keys, data_pred=test_pred, data_gold=test_q, sub_doc_info=sub_doc_info) 226 | 227 | dev_pred, _ = do_classify( 228 | c_docs, c_keys, dev_q, vectorizer=vectorizer, topk=topk_select, tokenizer=tokenizer, combine_score=(not options.tok)) 229 | dev_data_pairs_id = generate_pair_inputs( 230 | _c_keys=c_keys, data_pred=dev_pred, data_gold=dev_q, sub_doc_info=sub_doc_info) 231 | 232 | print("len(train_data_pairs_id), len(test_data_pairs_id), len(dev_data_pairs_id) = ", 233 | len(train_data_pairs_id), len(test_data_pairs_id), len(dev_data_pairs_id)) 234 | 235 | # save file csv following template of mrpc task 236 | path_folder_data_out = options.path_output_dir 237 | if not os.path.exists(path_folder_data_out): 238 | os.mkdir(path_folder_data_out) 239 | 240 | # fill data from train/test data_pairs_id 241 | new_dataset_train = aggregate_sentence_pairs(c_docs, c_keys, train_data_pairs_id, train_q, 242 | plus_filter_postags=False, 243 | filter_lb=False, sub_doc_info=sub_doc_info) 244 | new_dataset_test = aggregate_sentence_pairs(c_docs, c_keys, test_data_pairs_id, test_q, 245 | plus_filter_postags=False, 246 | filter_lb=False, sub_doc_info=sub_doc_info) 247 | new_dataset_dev = aggregate_sentence_pairs(c_docs, c_keys, dev_data_pairs_id, dev_q, 248 | plus_filter_postags=False, 249 | filter_lb=False, sub_doc_info=sub_doc_info) 250 | 251 | gen_mrpc_data(new_dataset_train, 252 | "{}/train.csv".format(path_folder_data_out)) 253 | gen_mrpc_data(new_dataset_test, "{}/test.csv".format(path_folder_data_out)) 254 | gen_mrpc_data(new_dataset_dev, "{}/dev.csv".format(path_folder_data_out)) 255 | 256 | # save tfidf vectorizer that filter fop 150 civil document 257 | pickle.dump(vectorizer, open( 258 | "{}/tfidf_classifier.pkl".format(path_folder_data_out), "wb")) 259 | 260 | 261 | -------------------------------------------------------------------------------- /src/infer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | import json 4 | import pickle 5 | 6 | from transformers.tokenization_utils import PreTrainedTokenizer 7 | from data_generator import vi_tokenize 8 | 9 | from tfidf_classifier import do_classify 10 | from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer 11 | from transformers import GlueDataTrainingArguments as DataTrainingArguments 12 | from transformers import ( 13 | HfArgumentParser, 14 | Trainer, 15 | TrainingArguments, 16 | ) 17 | 18 | from transformers.data.datasets.glue import * 19 | from transformers.data.processors.utils import InputExample 20 | 21 | from utils import Question, load_data_kse, standardize_data, Article 22 | import numpy as np 23 | 24 | 25 | @dataclass 26 | class ModelArguments: 27 | """ 28 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
29 | """ 30 | model_name_or_path: str = field( 31 | metadata={ 32 | "help": "Path to pretrained model or model identifier from huggingface.co/models"} 33 | ) 34 | config_name: Optional[str] = field( 35 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 36 | ) 37 | tokenizer_name: Optional[str] = field( 38 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 39 | ) 40 | cache_dir: Optional[str] = field( 41 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} 42 | ) 43 | 44 | 45 | class LawDataset(Dataset): 46 | """ 47 | This will be superseded by a framework-agnostic approach 48 | soon. 49 | """ 50 | args: GlueDataTrainingArguments 51 | output_mode: str 52 | features: List[InputFeatures] 53 | 54 | def __init__( 55 | self, 56 | args: GlueDataTrainingArguments, 57 | tokenizer: PreTrainedTokenizer, 58 | limit_examples: Optional[int] = None, 59 | mode: Union[str, Split] = Split.train, 60 | c_code=None, 61 | sentence=None, 62 | ): 63 | self.args = args 64 | task_name = 'mrpc' 65 | self.processor = glue_processors[task_name]() 66 | self.output_mode = 'classification' 67 | self.c_code = c_code if c_code is not None else [] 68 | self.sentence = sentence if sentence is not None else "" 69 | if isinstance(mode, str): 70 | try: 71 | mode = Split[mode] 72 | except KeyError: 73 | raise KeyError("mode is not a valid split name") 74 | 75 | label_list = self.processor.get_labels() 76 | self.label_list = label_list 77 | 78 | def _create_examples(lines, set_type='test'): 79 | examples = [] 80 | for (i, line) in enumerate(lines): 81 | guid = "%s-%s" % (set_type, i) 82 | text_a = line[3] 83 | text_b = line[4] 84 | label = None if set_type == "test" else line[0] 85 | examples.append(InputExample( 86 | guid=guid, text_a=text_a, text_b=text_b, label=label)) 87 | return examples 88 | 89 | lines = [] 90 | for i, e in enumerate(self.c_code): 91 | lines.append([0, "sent_{}".format( 92 | i), e[1], self.sentence[i], e[0]]) 93 | 94 | # recreate the data 95 | examples = _create_examples(lines) 96 | if limit_examples is not None: 97 | examples = examples[:limit_examples] 98 | self.features = glue_convert_examples_to_features( 99 | examples, 100 | tokenizer, 101 | max_length=args.max_seq_length, 102 | label_list=label_list, 103 | output_mode=self.output_mode, 104 | ) 105 | 106 | def __len__(self): 107 | return len(self.features) 108 | 109 | def __getitem__(self, i) -> InputFeatures: 110 | return self.features[i] 111 | 112 | def get_labels(self): 113 | return self.label_list 114 | 115 | def get_c_code_ids(self): 116 | return [e[1] for e in self.c_code] 117 | 118 | 119 | def infer_coliee_task3(sentence, all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer, tokenizer=None, topk=150): 120 | if isinstance(sentence, str): 121 | sentence = [sentence] 122 | test_q = [Question(id='q{}'.format(i), content=tokenizer( 123 | s)if tokenizer is not None else s, content_raw=s, relevant_a=[]) for i, s in enumerate(sentence)] 124 | c_docs = all_civil_code[0] 125 | c_keys = all_civil_code[1] 126 | c_vect = all_civil_code[2] 127 | c_docs_keys = list(zip(all_civil_code[0], all_civil_code[1])) 128 | 129 | test_pred, _ = do_classify( 130 | c_docs, c_keys, test_q, vectorizer=tfidf_vectorizer, topk=topk, c_vect=c_vect, combine_score=(tokenizer is None)) 131 | 132 | c_code_pred_by_tfidf = [] 133 | coressponding_questions = [] 134 | for i, s_pred in enumerate(test_pred): 135 | for idx in 
s_pred: 136 | coressponding_questions.append(test_q[i].content) 137 | c_code_pred_by_tfidf.append(c_docs_keys[idx]) 138 | 139 | test_dataset = LawDataset(data_args, 140 | bert_tokenizer, 141 | mode='test', sentence=coressponding_questions, c_code=c_code_pred_by_tfidf) 142 | predictions = trainer.predict(test_dataset=test_dataset).predictions 143 | probs = torch.softmax(torch.from_numpy(predictions), dim=1) 144 | predicted_labels = torch.argmax(probs, 1) 145 | return predicted_labels, probs, c_code_pred_by_tfidf 146 | 147 | def list_split(listA, n): 148 | for x in range(0, len(listA), n): 149 | every_chunk = listA[x: n+x] 150 | 151 | if len(every_chunk) < n: 152 | every_chunk = every_chunk + \ 153 | [None for y in range(n-len(every_chunk))] 154 | yield every_chunk 155 | 156 | def init_state(path_c_code, path_data_org, path_preprocessed_data, model_path, tokenizer=None, topk=150, testing_data=None, max_seq_length=512, 157 | do_lower_case=True): 158 | model_version = model_path # 'bert-base-uncased' 159 | 160 | config = AutoConfig.from_pretrained( 161 | model_version, 162 | num_labels=2, 163 | finetuning_task='MRPC' 164 | ) 165 | model = AutoModelForSequenceClassification.from_pretrained( 166 | model_version, config=config) 167 | bert_tokenizer = AutoTokenizer.from_pretrained( 168 | model_version, do_lower_case=do_lower_case) 169 | model.eval() 170 | 171 | parser = HfArgumentParser( 172 | (ModelArguments, DataTrainingArguments, TrainingArguments)) 173 | model_args, data_args, training_args = parser.parse_args_into_dataclasses( 174 | args=["--model_name_or_path", model_version, 175 | "--task_name", "MRPC", 176 | "--data_dir", "./coliee3_2020/data", 177 | "--do_predict", 178 | "--per_device_train_batch_size", "16", 179 | "--max_seq_length", "{}".format(max_seq_length), 180 | "--learning_rate", "2e-5", 181 | "--output_dir", model_version, 182 | "--overwrite_output_dir"]) 183 | # Initialize our Trainer 184 | trainer = Trainer( 185 | model=model, 186 | args=training_args 187 | ) 188 | tfidf_vectorizer = pickle.load( 189 | open("{}/tfidf_classifier.pkl".format(path_preprocessed_data), "rb")) 190 | if isinstance(tfidf_vectorizer, tuple): 191 | tfidf_vectorizer, bm25_scorer = tfidf_vectorizer[0], tfidf_vectorizer[1] 192 | 193 | path_data_cached = '{}/tokenized_data_cached.pkl'.format( 194 | path_preprocessed_data) 195 | if os.path.isfile(path_data_cached): 196 | print("Load cached file data: {}".format(path_data_cached)) 197 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = pickle.load( 198 | open(path_data_cached, 'rb')) 199 | else: 200 | print("Load data and tokenize data") 201 | c_docs, c_keys, dev_q, test_q, train_q, sub_doc_info = load_data_kse( 202 | path_folder_base=path_data_org, ids_test=[ 203 | ], tokenizer=tokenizer, testing_data=testing_data, 204 | # chunk_content_info=chunk_content_info 205 | ) 206 | 207 | c_vect = tfidf_vectorizer.transform([standardize_data(d) for d in c_docs]) 208 | return (c_docs, c_keys, c_vect), data_args, (tfidf_vectorizer, bm25_scorer), trainer, bert_tokenizer 209 | 210 | 211 | if __name__ == "__main__": 212 | global all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer 213 | 214 | model_configs = { 215 | # 'NlpHUST': { 216 | # "path_data_org": 'data/zac2021-ltr-data/', 217 | # "path_c_code": 'data/zac2021-ltr-data/legal_corpus.json', 218 | # "tokenizer": None, 219 | # "topk": 150, 220 | # "do_lower_case": False, 221 | # "max_seq_length": 512, 222 | # "path_preprocessed_data": 'data/zalo-tfidfngrbm25150-notok-full/', 223 | # "model_path": 
'settings/NlpHTfbmngr150E5-notok-full42/models', 224 | # }, 225 | 'PhoBERT': { 226 | "path_data_org": 'data/zac2021-ltr-data/', 227 | "path_c_code": 'data/zac2021-ltr-data/legal_corpus.json', 228 | "tokenizer": 'vi_tokenize', 229 | "topk": 300, 230 | "do_lower_case": True, 231 | "max_seq_length": 256, 232 | "path_preprocessed_data": 'data/zalo-tfidfbm25150-full/', 233 | "model_path": 'settings/Tfbm150E5-full42/models', 234 | } 235 | } 236 | print(json.dumps(model_configs, indent=2)) 237 | 238 | test_all_data = json.load(open('data/zac2021-ltr-data/public_test_question.json'))['items'] 239 | test_ids = [e['question_id'] for e in test_all_data] 240 | test_sents = [e['question'] for e in test_all_data] 241 | 242 | # test_sents = [ 243 | # "Đới khoáng hóa là gì?", 244 | # "Kinh phí bảo đảm thi hành án đối với pháp nhân thương mại được quy định như thế nào?", 245 | # "Thời gian viên chức nghỉ thai sản có đánh giá chất lượng không?", 246 | # "Việc trình, giải quyết hồ sơ đề nghị sửa đổi, bổ sung Quyết định giao khu vực biển được quy định như thế nào?", 247 | # # "Hình thức kỷ luật hạ bậc lương trong việc xử lý VPHC sẽ áp dụng cho đối tượng nào?", 248 | # # "Nguyên tắc xác định tổ chức, cá nhân làm môi trường bị ô nhiễm, suy thoái theo quy định của pháp luật", 249 | # # "Người được đề xuất hình thức kỷ luật trong quốc phòng?", 250 | # ] 251 | 252 | # def init models 253 | model_init_states = {} 254 | print("Loading model ....") 255 | for m_name, model_info in model_configs.items(): 256 | if 'tokenizer' in model_info and model_info['tokenizer'] == 'vi_tokenize': 257 | model_info['tokenizer'] = vi_tokenize 258 | 259 | model_init_states[m_name] = init_state(**model_info) 260 | all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer = model_init_states[ 261 | m_name] 262 | 263 | tokenizer = model_info.get('tokenizer') 264 | topk = 150 265 | infer_coliee_task3(sentence=test_sents[:5], 266 | all_civil_code=all_civil_code, data_args=data_args, tfidf_vectorizer=tfidf_vectorizer, 267 | trainer=trainer, bert_tokenizer=bert_tokenizer, 268 | tokenizer=tokenizer, topk=topk) 269 | print("Finish loaded model") 270 | 271 | missing_ids_info = {} 272 | real_prediction = {} 273 | 274 | # start infer 275 | time_start = time.time() 276 | for m_name, model_info in model_configs.items(): 277 | if 'tokenizer' in model_info and model_info['tokenizer'] == 'vi_tokenize': 278 | model_info['tokenizer'] = vi_tokenize 279 | 280 | all_civil_code, data_args, tfidf_vectorizer, trainer, bert_tokenizer = model_init_states[ 281 | m_name] 282 | tokenizer = model_info.get('tokenizer') 283 | topk = model_info.get('topk', 150) 284 | predicted_labels, probs, c_code_pred_by_tfidf = infer_coliee_task3(sentence=test_sents, all_civil_code=all_civil_code, 285 | data_args=data_args, 286 | tfidf_vectorizer=tfidf_vectorizer, 287 | trainer=trainer, bert_tokenizer=bert_tokenizer, 288 | tokenizer=tokenizer, topk=topk) 289 | 290 | predicted_labels = [x for x in list_split(predicted_labels, topk)] # np.array_split(predicted_labels, len(test_sents)) 291 | probs = [x for x in list_split(probs, topk)] #np.array_split(probs, len(test_sents)) 292 | c_code_pred_by_tfidf = [x for x in list_split(c_code_pred_by_tfidf, topk)] # np.array_split( c_code_pred_by_tfidf, len(test_sents)) 293 | 294 | result = [[{"label": True if lb == 1 else False, 295 | "scores": [float(probs[jj][i][j]) for j in range(probs[jj][i].shape[0])], 296 | "id": test_ids[jj], 297 | "sentence": s, 298 | # "civil_code": c_code_pred_by_tfidf[jj][i][0], 299 | "civil_code_id": 
c_code_pred_by_tfidf[jj][i][1], 300 | } 301 | for i, lb in enumerate(predicted_labels[jj]) if lb == 1] for jj, s in enumerate(test_sents)] 302 | 303 | current_missing_ids = [[{"label": False, 304 | "score": float(probs[jj][i][1]), 305 | "id": test_ids[jj], 306 | "civil_code_id": c_code_pred_by_tfidf[jj][i][1], 307 | } 308 | for i, lb in enumerate(predicted_labels[jj]) if lb == 0] for jj, s in enumerate(test_sents)] 309 | for negative_prediction in current_missing_ids: 310 | negative_prediction.sort(key=lambda info: info['score'], reverse=True) 311 | 312 | missing_ids_info[m_name] = current_missing_ids 313 | 314 | for jj, k in enumerate(test_ids): 315 | if k not in real_prediction: 316 | real_prediction[k] = set() 317 | real_prediction[k] = real_prediction[k].union( 318 | set([pred_infor['civil_code_id'] for pred_infor in result[jj]])) 319 | 320 | print(json.dumps(result, indent=2, ensure_ascii=False)) 321 | print("Finish inference on fine-tuned model {}, total time consuming: ".format( 322 | m_name), time.time() - time_start) 323 | print(len(result)) 324 | 325 | count_negative_add = 0 326 | for jj, k in enumerate(test_ids): 327 | if len(real_prediction[k]) == 0: 328 | count_negative_add += 1 329 | # pick 1 best score from negative prediction each model 330 | for m_name, _ in model_configs.items(): 331 | real_prediction[k].add(missing_ids_info[m_name][jj][0]['civil_code_id']) 332 | 333 | print("Total time consuming for {} samples: {} seconds => avg 1 sample in {} second".format( 334 | len(test_sents), time.time() - time_start, (time.time() - time_start) / len(test_sents))) 335 | 336 | submit_result = [] 337 | for k, v in real_prediction.items(): 338 | relevant_a_s = [] 339 | for relevant_a in v: 340 | tmp_a = Article.from_string(relevant_a) 341 | relevant_a_s.append({'law_id': tmp_a.l_id, 'article_id': tmp_a.a_id}) 342 | submit_result.append({ 343 | 'question_id': k, 344 | 'relevant_articles': relevant_a_s 345 | }) 346 | print("Count negative addition = {}".format(count_negative_add)) 347 | 348 | json.dump(submit_result, open("data/result_prediction.json", "wt", encoding='utf8'), ensure_ascii=False, indent=2) 349 | 350 | 351 | -------------------------------------------------------------------------------- /src/run_glue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Finetuning the library models for sequence classification on GLUE.""" 17 | # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
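In this repository the script is not pointed at a GLUE task: `scripts/run_finetune_bert.sh` passes the MRPC-style CSVs written by `gen_mrpc_data` in `src/data_generator.py` via `--train_file`/`--validation_file`/`--test_file`, and the script then picks up the `sentence1`/`sentence2` columns and the 0/1 `label` column. A minimal sketch of that expected layout (all values are placeholders):

```python
# Illustrative sketch only: the CSV layout produced by gen_mrpc_data and consumed here.
import pandas as pd

example = pd.DataFrame({
    "label": [1, 0],                       # 1 = the article is relevant to the query
    "#1 ID": ["q-001", "q-001"],           # question id (placeholder values)
    "#2 ID": ["<article-id-A>", "<article-id-B>"],
    "sentence1": ["<query text>", "<query text>"],
    "sentence2": ["<relevant article text>", "<non-relevant article text>"],
})
example.to_csv("train.csv", index=False, sep=",")
```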
18 | 19 | import logging 20 | import os 21 | import random 22 | import sys 23 | from dataclasses import dataclass, field 24 | from threading import local 25 | from typing import Optional 26 | 27 | import numpy as np 28 | from datasets import load_dataset, load_metric 29 | 30 | import transformers 31 | from transformers import ( 32 | AutoConfig, 33 | AutoModelForSequenceClassification, 34 | AutoTokenizer, 35 | DataCollatorWithPadding, 36 | EvalPrediction, 37 | HfArgumentParser, 38 | PretrainedConfig, 39 | Trainer, 40 | TrainingArguments, 41 | default_data_collator, 42 | set_seed, 43 | ) 44 | from transformers.trainer_utils import get_last_checkpoint, is_main_process 45 | 46 | 47 | task_to_keys = { 48 | "cola": ("sentence", None), 49 | "mnli": ("premise", "hypothesis"), 50 | "mrpc": ("sentence1", "sentence2"), 51 | "qnli": ("question", "sentence"), 52 | "qqp": ("question1", "question2"), 53 | "rte": ("sentence1", "sentence2"), 54 | "sst2": ("sentence", None), 55 | "stsb": ("sentence1", "sentence2"), 56 | "wnli": ("sentence1", "sentence2"), 57 | } 58 | 59 | logger = logging.getLogger(__name__) 60 | 61 | 62 | @dataclass 63 | class DataTrainingArguments: 64 | """ 65 | Arguments pertaining to what data we are going to input our model for training and eval. 66 | 67 | Using `HfArgumentParser` we can turn this class 68 | into argparse arguments to be able to specify them on 69 | the command line. 70 | """ 71 | 72 | task_name: Optional[str] = field( 73 | default=None, 74 | metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, 75 | ) 76 | max_seq_length: int = field( 77 | default=128, 78 | metadata={ 79 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 80 | "than this will be truncated, sequences shorter will be padded." 81 | }, 82 | ) 83 | overwrite_cache: bool = field( 84 | default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} 85 | ) 86 | pad_to_max_length: bool = field( 87 | default=True, 88 | metadata={ 89 | "help": "Whether to pad all samples to `max_seq_length`. " 90 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 91 | }, 92 | ) 93 | train_file: Optional[str] = field( 94 | default=None, metadata={"help": "A csv or a json file containing the training data."} 95 | ) 96 | validation_file: Optional[str] = field( 97 | default=None, metadata={"help": "A csv or a json file containing the validation data."} 98 | ) 99 | test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) 100 | 101 | def __post_init__(self): 102 | if self.task_name is not None: 103 | self.task_name = self.task_name.lower() 104 | if self.task_name not in task_to_keys.keys(): 105 | raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) 106 | elif self.train_file is None or self.validation_file is None: 107 | raise ValueError("Need either a GLUE task or a training/validation file.") 108 | else: 109 | train_extension = self.train_file.split(".")[-1] 110 | assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." 111 | validation_extension = self.validation_file.split(".")[-1] 112 | assert ( 113 | validation_extension == train_extension 114 | ), "`validation_file` should have the same extension (csv or json) as `train_file`." 
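These dataclasses, together with `TrainingArguments`, are filled in from the command line through `HfArgumentParser`. A minimal sketch of how the flags used by `scripts/run_finetune_bert.sh` map onto them, assuming the dataclasses defined in this file (paths are placeholders and only a subset of the flags is shown):

```python
# Illustrative sketch only: parsing run_glue.py-style flags into the three dataclasses.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "vinai/phobert-base",
    "--num_label", "2",
    "--train_file", "<data_dir>/train.csv",
    "--validation_file", "<data_dir>/dev.csv",
    "--test_file", "<data_dir>/test.csv",
    "--max_seq_length", "512",
    "--per_device_train_batch_size", "16",
    "--learning_rate", "1e-5",
    "--num_train_epochs", "5",
    "--do_train", "--do_predict",
    "--output_dir", "<setting_dir>/models",
    "--overwrite_output_dir",
])
```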
115 | 116 | 117 | @dataclass 118 | class ModelArguments: 119 | """ 120 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 121 | """ 122 | 123 | model_name_or_path: str = field( 124 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 125 | ) 126 | config_name: Optional[str] = field( 127 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 128 | ) 129 | tokenizer_name: Optional[str] = field( 130 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 131 | ) 132 | cache_dir: Optional[str] = field( 133 | default=None, 134 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 135 | ) 136 | num_label: Optional[int] = field( 137 | default=None, 138 | metadata={"help": "The number of label"}, 139 | ) 140 | use_fast_tokenizer: bool = field( 141 | default=True, 142 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 143 | ) 144 | model_revision: str = field( 145 | default="main", 146 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 147 | ) 148 | use_auth_token: bool = field( 149 | default=False, 150 | metadata={ 151 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 152 | "with private models)." 153 | }, 154 | ) 155 | 156 | 157 | def main(): 158 | # See all possible arguments in src/transformers/training_args.py 159 | # or by passing the --help flag to this script. 160 | # We now keep distinct sets of args, for a cleaner separation of concerns. 161 | 162 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 163 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 164 | # If we pass only one argument to the script and it's the path to a json file, 165 | # let's parse it to get our arguments. 166 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 167 | else: 168 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 169 | 170 | # Detecting last checkpoint. 171 | last_checkpoint = None 172 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 173 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 174 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 175 | raise ValueError( 176 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 177 | "Use --overwrite_output_dir to overcome." 178 | ) 179 | elif last_checkpoint is not None: 180 | logger.info( 181 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 182 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
183 | ) 184 | 185 | # Setup logging 186 | logging.basicConfig( 187 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 188 | datefmt="%m/%d/%Y %H:%M:%S", 189 | handlers=[logging.StreamHandler(sys.stdout)], 190 | ) 191 | logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) 192 | 193 | # Log on each process the small summary: 194 | logger.warning( 195 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 196 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 197 | ) 198 | # Set the verbosity to info of the Transformers logger (on main process only): 199 | if is_main_process(training_args.local_rank): 200 | transformers.utils.logging.set_verbosity_info() 201 | transformers.utils.logging.enable_default_handler() 202 | transformers.utils.logging.enable_explicit_format() 203 | logger.info(f"Training/evaluation parameters {training_args}") 204 | 205 | # Set seed before initializing model. 206 | set_seed(training_args.seed) 207 | 208 | # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) 209 | # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 210 | # 211 | # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the 212 | # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named 213 | # label if at least two columns are provided. 214 | # 215 | # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this 216 | # single column. You can easily tweak this behavior (see below) 217 | # 218 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 219 | # download the dataset. 220 | if data_args.task_name is not None: 221 | # Downloading and loading a dataset from the hub. 222 | datasets = load_dataset("glue", data_args.task_name) 223 | else: 224 | # Loading a dataset from your local files. 225 | # CSV/JSON training and evaluation files are needed. 226 | data_files = {"train": data_args.train_file, "validation": data_args.validation_file} 227 | 228 | # Get the test dataset: you can provide your own CSV/JSON test file (see below) 229 | # when you use `do_predict` without specifying a GLUE benchmark task. 230 | if training_args.do_predict: 231 | if data_args.test_file is not None: 232 | train_extension = data_args.train_file.split(".")[-1] 233 | test_extension = data_args.test_file.split(".")[-1] 234 | assert ( 235 | test_extension == train_extension 236 | ), "`test_file` should have the same extension (csv or json) as `train_file`." 237 | data_files["test"] = data_args.test_file 238 | else: 239 | raise ValueError("Need either a GLUE task or a test file for `do_predict`.") 240 | 241 | for key in data_files.keys(): 242 | logger.info(f"load a local file for {key}: {data_files[key]}") 243 | 244 | if data_args.train_file.endswith(".csv"): 245 | # Loading a dataset from local csv files 246 | datasets = load_dataset("csv", data_files=data_files) 247 | else: 248 | # Loading a dataset from local json files 249 | datasets = load_dataset("json", data_files=data_files) 250 | # See more about loading any type of standard or custom dataset at 251 | # https://huggingface.co/docs/datasets/loading_datasets.html. 
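With the folders produced by `src/data_generator.py`, the `data_files` mapping above simply points at the three generated CSV splits. A minimal, equivalent stand-alone call (paths follow the README example and stand in for whatever `--path_output_dir` was used):

```python
# Illustrative sketch only: loading the generated CSV splits the same way this script does.
from datasets import load_dataset

data_files = {
    "train": "data/zalo-tfidfbm25150-full/train.csv",
    "validation": "data/zalo-tfidfbm25150-full/dev.csv",
    "test": "data/zalo-tfidfbm25150-full/test.csv",
}
raw_datasets = load_dataset("csv", data_files=data_files)
print(raw_datasets["train"].column_names)  # label, #1 ID, #2 ID, sentence1, sentence2
```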
252 | 253 | # Labels 254 | if data_args.task_name is not None: 255 | is_regression = data_args.task_name == "stsb" 256 | if not is_regression: 257 | label_list = datasets["train"].features["label"].names 258 | num_labels = len(label_list) 259 | else: 260 | num_labels = 1 261 | else: 262 | # Trying to have good defaults here, don't hesitate to tweak to your needs. 263 | is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] 264 | if is_regression: 265 | num_labels = 1 266 | else: 267 | # A useful fast method: 268 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique 269 | label_list = datasets["train"].unique("label") 270 | label_list.sort() # Let's sort it for determinism 271 | num_labels = len(label_list) 272 | 273 | if model_args.num_label is not None: 274 | num_labels = model_args.num_label 275 | if "label_list" not in locals(): 276 | label_list = datasets["train"].unique("label") 277 | label_list.sort() # Let's sort it for determinism 278 | num_labels_from_data = len(label_list) 279 | 280 | # Load pretrained model and tokenizer 281 | # 282 | # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently 283 | # download model & vocab. 284 | config = AutoConfig.from_pretrained( 285 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 286 | num_labels=num_labels, 287 | finetuning_task=data_args.task_name, 288 | cache_dir=model_args.cache_dir, 289 | revision=model_args.model_revision, 290 | use_auth_token=True if model_args.use_auth_token else None, 291 | ) 292 | tokenizer = AutoTokenizer.from_pretrained( 293 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 294 | cache_dir=model_args.cache_dir, 295 | use_fast=model_args.use_fast_tokenizer, 296 | revision=model_args.model_revision, 297 | use_auth_token=True if model_args.use_auth_token else None, 298 | ) 299 | model = AutoModelForSequenceClassification.from_pretrained( 300 | model_args.model_name_or_path, 301 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 302 | config=config, 303 | cache_dir=model_args.cache_dir, 304 | revision=model_args.model_revision, 305 | use_auth_token=True if model_args.use_auth_token else None, 306 | ) 307 | 308 | # Preprocessing the datasets 309 | if data_args.task_name is not None: 310 | sentence1_key, sentence2_key = task_to_keys[data_args.task_name] 311 | else: 312 | # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. 313 | non_label_column_names = [name for name in datasets["train"].column_names if name != "label" and name != "id"] 314 | if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: 315 | sentence1_key, sentence2_key = "sentence1", "sentence2" 316 | else: 317 | if len(non_label_column_names) >= 2: 318 | sentence1_key, sentence2_key = non_label_column_names[:2] 319 | else: 320 | sentence1_key, sentence2_key = non_label_column_names[0], None 321 | 322 | # Padding strategy 323 | if data_args.pad_to_max_length: 324 | padding = "max_length" 325 | else: 326 | # We will pad later, dynamically at batch creation, to the max sequence length in each batch 327 | padding = False 328 | 329 | # Some models have set the order of the labels to use, so let's make sure we do use it. 
330 | label_to_id = None 331 | if ( 332 | model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id 333 | and data_args.task_name is not None 334 | and not is_regression 335 | ): 336 | # Some have all caps in their config, some don't. 337 | label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} 338 | if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): 339 | label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)} 340 | else: 341 | logger.warn( 342 | "Your model seems to have been trained with labels, but they don't match the dataset: ", 343 | f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 344 | "\nIgnoring the model labels as a result.", 345 | ) 346 | elif data_args.task_name is None and not is_regression: 347 | label_to_id = {v: i for i, v in enumerate(label_list)} 348 | 349 | if data_args.max_seq_length > tokenizer.model_max_length: 350 | logger.warn( 351 | f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" 352 | f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 353 | ) 354 | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) 355 | 356 | def preprocess_function(examples): 357 | # Tokenize the texts 358 | args = ( 359 | (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) 360 | ) 361 | result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) 362 | 363 | # Map labels to IDs (not necessary for GLUE tasks) 364 | if label_to_id is not None and "label" in examples: 365 | result["label"] = [label_to_id[l] for l in examples["label"]] 366 | return result 367 | 368 | datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) 369 | 370 | train_dataset = datasets["train"] 371 | eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] 372 | if data_args.task_name is not None or data_args.test_file is not None: 373 | test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] 374 | 375 | # Log a few random samples from the training set: 376 | for index in random.sample(range(len(train_dataset)), 3): 377 | logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 378 | 379 | # Get the metric function 380 | if data_args.task_name is not None: 381 | metric = load_metric("glue", data_args.task_name) 382 | # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from 383 | # compute_metrics 384 | 385 | # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a 386 | # predictions and label_ids field) and has to return a dictionary string to float. 
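    # Illustrative example: for the binary relevance labels used in this repo, the
    # function defined below returns a plain dict of metric name to float, e.g.
    # {"accuracy": 0.93} (value hypothetical), computed from the argmax of the
    # model logits compared against p.label_ids.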
387 | def compute_metrics(p: EvalPrediction): 388 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 389 | preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) 390 | if data_args.task_name is not None: 391 | result = metric.compute(predictions=preds, references=p.label_ids) 392 | if len(result) > 1: 393 | result["combined_score"] = np.mean(list(result.values())).item() 394 | return result 395 | elif is_regression: 396 | return {"mse": ((preds - p.label_ids) ** 2).mean().item()} 397 | else: 398 | return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} 399 | 400 | # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 401 | if data_args.pad_to_max_length: 402 | data_collator = default_data_collator 403 | elif training_args.fp16: 404 | data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) 405 | else: 406 | data_collator = None 407 | 408 | # Initialize our Trainer 409 | trainer = Trainer( 410 | model=model, 411 | args=training_args, 412 | train_dataset=train_dataset, 413 | eval_dataset=eval_dataset if training_args.do_eval else None, 414 | compute_metrics=compute_metrics, 415 | tokenizer=tokenizer, 416 | data_collator=data_collator, 417 | ) 418 | # trainer.num_train_epochs = trainer.num_train_epochs + training_args.num_train_epochs 419 | 420 | # Training 421 | if training_args.do_train: 422 | if last_checkpoint is not None: 423 | checkpoint = last_checkpoint 424 | elif os.path.isdir(model_args.model_name_or_path): 425 | checkpoint = model_args.model_name_or_path 426 | else: 427 | checkpoint = None 428 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 429 | metrics = train_result.metrics 430 | 431 | trainer.save_model() # Saves the tokenizer too for easy upload 432 | 433 | output_train_file = os.path.join(training_args.output_dir, "train_results.txt") 434 | if trainer.is_world_process_zero(): 435 | with open(output_train_file, "w") as writer: 436 | logger.info("***** Train results *****") 437 | for key, value in sorted(metrics.items()): 438 | logger.info(f" {key} = {value}") 439 | writer.write(f"{key} = {value}\n") 440 | 441 | # Need to save the state, since Trainer.save_model saves only the tokenizer with the model 442 | trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) 443 | 444 | # Evaluation 445 | eval_results = {} 446 | if training_args.do_eval: 447 | logger.info("*** Evaluate ***") 448 | 449 | # Loop to handle MNLI double evaluation (matched, mis-matched) 450 | tasks = [data_args.task_name] 451 | eval_datasets = [eval_dataset] 452 | if data_args.task_name == "mnli": 453 | tasks.append("mnli-mm") 454 | eval_datasets.append(datasets["validation_mismatched"]) 455 | 456 | for eval_dataset, task in zip(eval_datasets, tasks): 457 | eval_result = trainer.evaluate(eval_dataset=eval_dataset) 458 | 459 | output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt") 460 | if trainer.is_world_process_zero(): 461 | with open(output_eval_file, "w") as writer: 462 | logger.info(f"***** Eval results {task} *****") 463 | for key, value in sorted(eval_result.items()): 464 | logger.info(f" {key} = {value}") 465 | writer.write(f"{key} = {value}\n") 466 | 467 | eval_results.update(eval_result) 468 | 469 | if training_args.do_predict: 470 | logger.info("*** Test ***") 471 | 472 | # Loop to handle MNLI double evaluation (matched, mis-matched) 473 | tasks = [data_args.task_name] 474 | 
test_datasets = [test_dataset] 475 | if data_args.task_name == "mnli": 476 | tasks.append("mnli-mm") 477 | test_datasets.append(datasets["test_mismatched"]) 478 | 479 | for test_dataset, task in zip(test_datasets, tasks): 480 | # Removing the `label` columns because it contains -1 and Trainer won't like that. 481 | test_dataset.remove_columns_("label") 482 | predictions_mt = trainer.predict(test_dataset=test_dataset).predictions 483 | if num_labels_from_data < model_args.num_label: 484 | predictions_mt = predictions_mt[:, :num_labels_from_data] 485 | 486 | import pickle 487 | pickle.dump(predictions_mt, open(os.path.join(training_args.output_dir, 488 | "predictions.pkl" if 'train' not in data_args.test_file else "predictions_train.pkl"), "wb")) 489 | predictions = np.squeeze(predictions_mt) if is_regression else np.argmax(predictions_mt, axis=1) 490 | 491 | output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt" 492 | if 'train' not in data_args.test_file else f"train_results_{task}.txt" ) 493 | if trainer.is_world_process_zero(): 494 | with open(output_test_file, "w") as writer: 495 | logger.info(f"***** Test results {task} *****") 496 | writer.write("index\tprediction\n") 497 | for index, item in enumerate(predictions): 498 | if is_regression: 499 | writer.write(f"{index}\t{item:3.3f}\n") 500 | else: 501 | item = label_list[item] 502 | writer.write(f"{index}\t{item}\n") 503 | return eval_results 504 | 505 | 506 | def _mp_fn(index): 507 | # For xla_spawn (TPUs) 508 | main() 509 | 510 | 511 | if __name__ == "__main__": 512 | main() -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from typing import Any, Dict, List 3 | 4 | import nltk 5 | import glob 6 | import pickle 7 | import torch 8 | import pandas as pd 9 | import re 10 | import json 11 | import numpy as np 12 | 13 | rdrsegmenter = None 14 | 15 | 16 | def standardize_data(row): 17 | # Remove trailing periods, commas and question marks at the end of the sentence 18 | row = re.sub(r"[\.,\?]+$", "", row) 19 | # Remove all periods, commas, semicolons, exclamation marks, quotes, ... in the sentence
20 | row = row.replace(",", " ").replace(".", " ") \ 21 | .replace(";", " ").replace("“", " ") \ 22 | .replace(":", " ").replace("”", " ") \ 23 | .replace('"', " ").replace("'", " ") \ 24 | .replace("!", " ").replace("?", " ") \ 25 | .replace("-", " ").replace("?", " ") \ 26 | .replace("(", " ").replace(")", " ") 27 | row = re.sub(r" {2,}", " ", row).strip().lower() 28 | return row 29 | 30 | 31 | class Article: 32 | pattern = "{}-->{}-->{}" 33 | 34 | def __init__(self, a_id, l_id, content, content_raw=None) -> None: 35 | self.a_id = a_id 36 | self.l_id = l_id 37 | self.content = content 38 | self.content_raw = content_raw if content_raw is not None else content 39 | 40 | def __str__(self) -> str: 41 | return self.pattern.format(self.l_id, self.a_id, self.content) 42 | 43 | @classmethod 44 | def from_string(cls, str_in): 45 | info = str_in.split(cls.pattern.format("", "a", "").split("a")[0]) 46 | return cls(info[1], info[0], info[2]) 47 | 48 | def get_id(self): 49 | return self.pattern.format(self.l_id, self.a_id, "") 50 | 51 | def get_subid(self, sub_id): 52 | return self.pattern.format(self.l_id, self.a_id+"-sub{}".format(sub_id), "") 53 | 54 | 55 | class Question: 56 | pattern = "{}-->{}" 57 | 58 | def __init__(self, id, content, relevant_a: List[Article] = None, label: bool = None, content_raw=None) -> None: 59 | self.id = id 60 | self.content = content 61 | self.relevant_a = relevant_a or [] 62 | self.label = label 63 | self.content_raw = content_raw if content_raw is not None else content 64 | 65 | def __str__(self) -> str: 66 | return self.pattern.format(self.id, self.content) 67 | 68 | @classmethod 69 | def from_string(cls, str_in): 70 | info = str_in.split(cls.pattern.format("", "a", "").split("a")[0]) 71 | return cls(info[0], info[1]) 72 | 73 | def get_id(self): 74 | return self.pattern.format(self.id, "") 75 | 76 | 77 | def f_score(p, r, beta=1): 78 | y = (beta * beta * p + r) 79 | return (1 + beta * beta) * p * r / y if y != 0 else 0.0 80 | 81 | 82 | def micro_result(count_real_lb, count_predicted, count_true, count_gold_lb=138): 83 | p = count_true/count_predicted if count_predicted != 0 else 0.0 84 | r = count_true/count_real_lb if count_real_lb != 0 else 0.0 85 | result = {"count_real_lb": count_real_lb, 86 | "count_predicted": count_predicted, 87 | "count_gold_lb": count_gold_lb, 88 | "count_true": count_true, 89 | "P": p, 90 | "R": r, 91 | "f1": f_score(p, r, 1), 92 | "f2": f_score(p, r, 2), 93 | "f2_": f_score(p, count_true/count_gold_lb, 2)} 94 | print(result) 95 | return result 96 | 97 | 98 | def evaluate_by_similarity(similarities_, gold_data, c_keys, topk=150): 99 | count_true = 0 100 | count_all_prediction = 0 101 | count_all_gold_lb = 0 102 | 103 | idx_result = similarities_.argsort()[:, -topk:] 104 | for i in range(idx_result.shape[0]): 105 | gold_lb = gold_data[i]['result'] 106 | count_all_gold_lb += len(gold_lb) 107 | 108 | pred = [c_keys[idx] for idx in idx_result[i]] 109 | count_all_prediction += len(pred) 110 | 111 | for i, pred_lb in enumerate(pred): 112 | if pred_lb in gold_lb: 113 | count_true += 1 114 | 115 | print(count_true, count_all_prediction, count_all_gold_lb, 116 | 'P: ', count_true/count_all_prediction, 117 | 'R: ', count_true/count_all_gold_lb, 118 | 'F1: ', f_score(count_true*1.0/count_all_prediction, 119 | count_true*1.0/count_all_gold_lb), 120 | 'F2: ', f_score(count_true*1.0/count_all_prediction, 121 | count_true*1.0/count_all_gold_lb, beta=2), 122 | ) 123 | return idx_result 124 | 125 | 126 | def evaluate_by_label(prediction_file,
test_dat_file, ensemble_files=None): 127 | test_dat = pd.read_csv(test_dat_file, sep=',') 128 | predictions = [] 129 | 130 | count_real_lb = 0 131 | count_gold_lb = 138 132 | count_true = 0 133 | count_predicted = 0 134 | ensemble_files = ensemble_files or [] 135 | 136 | if prediction_file not in ensemble_files: 137 | ensemble_files.append(prediction_file) 138 | 139 | for pred_file in ensemble_files: 140 | prediction_ = pd.read_csv(pred_file, sep='\t') 141 | predictions.append(prediction_) 142 | 143 | for i in range(len(test_dat)): 144 | if test_dat['label'][i] == 1: 145 | count_real_lb += 1 146 | for prediction in predictions: 147 | if prediction['prediction'][i] == 1: 148 | count_true += 1 149 | break 150 | 151 | for prediction in predictions: 152 | if prediction['prediction'][i] == 1: 153 | count_predicted += 1 154 | break 155 | 156 | return micro_result(count_real_lb, count_predicted, count_true, count_gold_lb) 157 | 158 | def evaluate_idx(preds, gold_data, c_keys=None): 159 | try: 160 | count_true = 0 161 | count_all_prediction = 0 162 | count_all_gold_lb = 0 163 | 164 | for i_gold in range(len(preds)): 165 | gold_lb = [a.get_id() for a in gold_data[i_gold].relevant_a] 166 | count_all_gold_lb += len(gold_lb) 167 | 168 | pred = [c_keys[idx] for idx in preds[i_gold]] 169 | count_all_prediction += len(pred) 170 | 171 | for _i, pred_lb in enumerate(pred): 172 | if pred_lb in gold_lb: 173 | count_true += 1 174 | 175 | print(count_true, count_all_prediction, count_all_gold_lb, 176 | 'P: ', count_true / count_all_prediction, 177 | 'R: ', count_true / count_all_gold_lb, 178 | 'F1: ', f_score(count_true * 1.0 / count_all_prediction, 179 | count_true * 1.0 / count_all_gold_lb), 180 | 'F2: ', f_score(count_true * 1.0 / count_all_prediction, 181 | count_true * 1.0 / count_all_gold_lb, beta=2), 182 | ) 183 | return preds 184 | except Exception as e: 185 | print(e) 186 | return preds 187 | 188 | def combine_idxs(idx_ifidf, idx_bm25, top_k=100): 189 | preds = [] 190 | for i in range(len(idx_bm25)): 191 | prediction_item = [] 192 | for j in range(len(idx_bm25[i])): 193 | if idx_bm25[i][j] not in prediction_item: 194 | prediction_item.append(idx_bm25[i][j]) 195 | 196 | if len(prediction_item) == top_k: 197 | break 198 | 199 | if idx_ifidf[i][j] not in prediction_item: 200 | prediction_item.append(idx_ifidf[i][j]) 201 | 202 | if len(prediction_item) == top_k: 203 | break 204 | preds.append(prediction_item) 205 | return np.array(preds) 206 | 207 | def evaluate(similarities_, gold_data, topk=150, c_keys=None): 208 | try: 209 | idx_result = similarities_.argsort()[:, -topk:] 210 | return evaluate_idx(idx_result, gold_data, c_keys) 211 | except Exception as e: 212 | print(e) 213 | return idx_result 214 | 215 | def _article_content(full_content, chunk_content_info=None): 216 | if chunk_content_info is None: 217 | return ["{}".format(full_content)] 218 | chunk_content_size, chunk_content_stride = chunk_content_info[0], chunk_content_info[1] 219 | sub_contents = [] 220 | separate_w = ' ' 221 | words = full_content.split(separate_w) 222 | 223 | if len(words) > chunk_content_size: 224 | for i_start in range(0, len(words), chunk_content_size-chunk_content_stride): 225 | sub_cont = separate_w.join( 226 | words[i_start:i_start + chunk_content_size]) 227 | sub_contents.append(sub_cont) 228 | if len(words[i_start:i_start + chunk_content_size]) < chunk_content_size: 229 | break 230 | 231 | articles = ["{}".format(full_content)] + ["{}".format(sub_content) for sub_content in sub_contents] 232 | return articles 233 | 
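# Illustrative example: with chunk_content_info = (4, 2) the function above
# advances by chunk_content_size - chunk_content_stride = 2 words per window, so
#   _article_content("w1 w2 w3 w4 w5 w6", (4, 2))
#   -> ["w1 w2 w3 w4 w5 w6", "w1 w2 w3 w4", "w3 w4 w5 w6", "w5 w6"]
# i.e. the full article text is kept as the first element and the final,
# shorter window is still appended before the loop breaks.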
234 | 235 | def _do_nothing(str_in): 236 | return str_in 237 | 238 | def load_data_kse(path_folder_base="data/", postags_select=None, ids_test=None, ids_dev=None, tokenizer=None, 239 | law_corpus='legal_corpus.json', training_data='train_question_answer.json', testing_data=None, 240 | chunk_content_info=None): 241 | 242 | if tokenizer is None: 243 | tokenizer = _do_nothing 244 | 245 | articles = {} 246 | sub_articles = {} 247 | sub_key_mapping ={} 248 | articles_raw = json.load(open("{}/{}".format(path_folder_base, law_corpus))) 249 | for l_info in articles_raw: 250 | l_id = l_info['id'] if 'id' in l_info else l_info['law_id'] 251 | for a_info in l_info["articles"]: 252 | a_id = a_info['id'] if 'id' in a_info else a_info['article_id'] 253 | a_title = tokenizer(a_info['title']) + " . " if 'title' in a_info else "" 254 | a_title_raw = a_info['title'] + " . " if 'title' in a_info else "" 255 | 256 | a_content_s = _article_content(tokenizer(a_info['text']), chunk_content_info) 257 | a_content_s_raw = _article_content(a_info['text'], chunk_content_info) 258 | 259 | new_a = Article(l_id=l_id, a_id=a_id, content=a_title + a_content_s[0], content_raw=a_title_raw+a_content_s_raw[0]) 260 | k = new_a.get_id() 261 | articles[k] = new_a 262 | 263 | for i, a_content in enumerate(a_content_s[1:10]): 264 | new_sub_a = Article(l_id=l_id, a_id=a_id, content=a_title + a_content) 265 | sub_articles[new_a.get_subid(i)] = new_sub_a 266 | 267 | if k not in sub_key_mapping: 268 | sub_key_mapping[k] = [] 269 | sub_key_mapping[k].append(new_sub_a.get_subid(i)) 270 | 271 | print(len(articles)) 272 | print(articles[list(articles.keys())[0]]) 273 | 274 | # load annotated data 275 | data = [] 276 | q_raw = json.load(open("{}/{}".format(path_folder_base, training_data))) 277 | if testing_data is not None: 278 | q_raw_test = json.load(open("{}/{}".format(path_folder_base, testing_data))) 279 | if 'items' in q_raw and 'items' in q_raw_test: 280 | for e in q_raw_test['items']: 281 | e['relevant_articles'] = e.get('relevant_articles', []) 282 | q_raw['items'] = q_raw['items'] + q_raw_test['items'] 283 | 284 | if 'items' in q_raw: 285 | for q_info in q_raw['items']: 286 | data.append(Question(id=q_info["question_id"], content=tokenizer(q_info["question"])if tokenizer is not None else q_info["question"], 287 | content_raw=q_info["question"], 288 | relevant_a=[articles[Article( 289 | a_info["article_id"], a_info["law_id"], None).get_id()] for a_info in q_info["relevant_articles"]], 290 | label=True)) 291 | else: 292 | for q_info in q_raw: 293 | data.append(Question(id=q_info["question_id"], content=q_info["text"], relevant_a=[articles[Article( 294 | a_info["article_id"], a_info["law_id"], None).get_id()] for a_info in q_info["relevant_articles"]], 295 | label=q_info['label'])) 296 | 297 | # random test id 298 | if ids_test is None: 299 | ids_test = [q.id for idx, q in enumerate(data) if idx % 10 < 2] 300 | if ids_dev is None or len(ids_dev) == 0: 301 | ids_dev = ids_test 302 | print('Test ids = {}, Dev ids = {}', ids_test, ids_dev) 303 | 304 | print("Test ids ({} samples) = {}".format(len(ids_test), ids_test)) 305 | test_q = [q for q in data if q.id in ids_test] 306 | print('Len test_q', len(test_q)) 307 | dev_q = [q for q in data if q.id in ids_dev] 308 | print('Len dev_q', len(dev_q)) 309 | train_q = [q for q in data if q.id not in set(ids_test + ids_dev)] 310 | print('Len train_q', len(train_q)) 311 | 312 | c_docs = [] 313 | c_docs_raw = [] 314 | c_keys = [] 315 | for k, c in articles.items(): 316 | c_docs.append(c.content) 
317 | c_docs_raw.append(c.content_raw) 318 | c_keys.append(k) 319 | 320 | c_sub_docs, c_sub_keys = [], [] 321 | for k, c in sub_articles.items(): 322 | c_sub_docs.append(c.content) 323 | c_sub_keys.append(k) 324 | 325 | return c_docs, c_keys, dev_q, test_q, train_q, (c_sub_docs, c_sub_keys, sub_key_mapping, c_docs_raw) 326 | 327 | 328 | def postag_filter(input_str, tags_filter=["V", "N", "P", "."]): 329 | words = nltk.word_tokenize(input_str) 330 | pos = nltk.pos_tag(words) 331 | new_words = [] 332 | 333 | for p in pos: 334 | if p[1][0] in tags_filter: 335 | new_words.append(p[0]) 336 | return " ".join(new_words) 337 | 338 | 339 | def aggregate_results(base_folder, aggregate_predictions=None, keys=None): 340 | prediction_mt = pickle.load( 341 | open("{}/predictions.pkl".format(base_folder), "rb")) 342 | test_dat = pd.read_csv("{}/test.tsv".format(base_folder), sep="\t") 343 | prediction = pd.read_csv( 344 | "{}/test_results_mrpc.txt".format(base_folder), sep="\t") 345 | probs = torch.softmax(torch.from_numpy(prediction_mt), dim=1) 346 | 347 | # aggregate gold values 348 | if aggregate_predictions is None and keys is None: 349 | aggregate_predictions = {} 350 | keys = [] 351 | 352 | predicted_pairs = set() 353 | for k, v_s in aggregate_predictions.items(): 354 | for v in v_s: 355 | predicted_pairs.add((v[0], v[1])) 356 | 357 | for i in range(len(test_dat)): 358 | if prediction['prediction'][i] == 1: 359 | # H30-1-A Q0 886 1 0.193 JNLP 360 | query_id = test_dat["#1 ID"][i] 361 | c_id = test_dat["#2 ID"][i] 362 | score = probs[i][1] 363 | if query_id not in aggregate_predictions: 364 | keys.append(query_id) 365 | aggregate_predictions[query_id] = [] 366 | 367 | if (query_id, c_id) not in predicted_pairs: 368 | aggregate_predictions[query_id].append((query_id, c_id, score)) 369 | predicted_pairs.add((query_id, c_id)) 370 | 371 | return aggregate_predictions, keys 372 | 373 | 374 | def aggregate_all_results_task5(prediction_files, gold_test_file): 375 | prediction_mt = [pickle.load(open(f_, 'rb')) for f_ in prediction_files][0] 376 | 377 | # load test file - gold data for question id and article idß 378 | test_dat = pd.read_csv(gold_test_file, sep=",") 379 | 380 | predicted_pairs = {} 381 | 382 | probs = torch.softmax(torch.from_numpy(prediction_mt), dim=1) 383 | count_true = 0 384 | for i in range(len(test_dat)): 385 | # H30-1-A Q0 886 1 0.193 JNLP 386 | query_id = test_dat["id"][i] 387 | lb = test_dat["label"][i] == 1 388 | score = probs[i][1] 389 | predicted_pairs[query_id] = score 390 | 391 | pred_lb = True if probs[i][1] > probs[i][0] else False 392 | 393 | if pred_lb == lb: 394 | count_true += 1 395 | 396 | print("acc={}, true={}, total={}".format(count_true / len(test_dat), count_true, len(test_dat))) 397 | 398 | return count_true / len(test_dat), (count_true, len(test_dat), predicted_pairs) 399 | 400 | def aggregate_all_results(prediction_files, gold_test_file, topk=1, append_unpredicted_q=True, miss_ids_prediction_file=None): 401 | prediction_mt_s = [pickle.load(open(f_, 'rb')) for f_ in prediction_files] 402 | 403 | # load test file - gold data for question id and article idß 404 | test_dat = pd.read_csv(gold_test_file, sep=",") 405 | 406 | predicted_pairs = {} 407 | unpredicted_pairs = {} 408 | individual_model_stats = [[] for i in range(len(prediction_files))] 409 | 410 | for i_mod, prediction_mt in enumerate(prediction_mt_s): 411 | probs = torch.softmax(torch.from_numpy(prediction_mt), dim=1) 412 | 413 | for i in range(len(test_dat)): 414 | # H30-1-A Q0 886 1 0.193 JNLP 415 | 
query_id = test_dat["#1 ID"][i] 416 | c_id = re.sub(r'-sub\d+', '', test_dat["#2 ID"][i]) 417 | score = probs[i][1] 418 | 419 | if probs[i][1] > probs[i][0]: 420 | if (query_id, c_id) not in predicted_pairs: 421 | predicted_pairs[(query_id, c_id)] = [] 422 | 423 | predicted_pairs[(query_id, c_id)].append(score) 424 | else: 425 | if (query_id, c_id) not in unpredicted_pairs: 426 | unpredicted_pairs[(query_id, c_id)] = [] 427 | 428 | unpredicted_pairs[(query_id, c_id)].append(score) 429 | 430 | # stats each model 431 | individual_model_stats[i_mod].append((query_id, c_id, score)) 432 | 433 | # sort stats each model 434 | new_stats = [{} for i in range(len(prediction_files))] 435 | for i_mod, result in enumerate(individual_model_stats): 436 | for stat_e in result: 437 | if stat_e[0] not in new_stats[i_mod]: 438 | new_stats[i_mod][stat_e[0]] = [] 439 | new_stats[i_mod][stat_e[0]].append((stat_e[1], stat_e[2].item())) 440 | for q_id, v in new_stats[i_mod].items(): 441 | new_stats[i_mod][q_id].sort(key=lambda x: x[1], reverse=True) 442 | new_stats[i_mod][q_id] = new_stats[i_mod][q_id][:topk] 443 | individual_model_stats = new_stats 444 | 445 | # 446 | # aggregrate result from many models 447 | def aggregrate_result_(pairs_): 448 | aggregate_results = {} 449 | for k, v in pairs_.items(): 450 | if k[0] not in aggregate_results: 451 | aggregate_results[k[0]] = [] 452 | # aggregate_results[k[0]].append((k[0], k[1], max(v))) 453 | aggregate_results[k[0]].append((k[0], k[1], sum(v) / len(v))) 454 | return aggregate_results 455 | 456 | predicted_results = aggregrate_result_(predicted_pairs) 457 | unpredicted_results = aggregrate_result_(unpredicted_pairs) 458 | 459 | # append unpredicted question by top 1 460 | miss_prediction_keys = set() 461 | if append_unpredicted_q: 462 | miss_prediction_keys = set(unpredicted_results.keys()).difference( 463 | set(predicted_results.keys())) 464 | print('Miss {} question ids: {}'.format(len(miss_prediction_keys), miss_prediction_keys)) 465 | if miss_ids_prediction_file is not None: 466 | json.dump(list(miss_prediction_keys), open(miss_ids_prediction_file, "wt", encoding='utf8')) 467 | for q_id in miss_prediction_keys: 468 | unpredicted_results[q_id].sort(key=lambda x: x[2], reverse=True) 469 | predicted_results[q_id] = unpredicted_results[q_id][:topk if topk is not None else 1] 470 | 471 | # 472 | # aggregrate gold label 473 | gold_results = {} 474 | gold_all_q_ids = set() 475 | for i in range(len(test_dat)): 476 | query_id = test_dat["#1 ID"][i] 477 | # test_dat["#2 ID"][i] 478 | c_id = re.sub(r'-sub\d+', '', test_dat["#2 ID"][i]) 479 | gold_all_q_ids.add(query_id) 480 | 481 | if test_dat['label'][i] == 1: 482 | if query_id not in gold_results: 483 | gold_results[query_id] = [] 484 | gold_results[query_id].append((query_id, c_id, 1)) 485 | # 486 | # compute performance by accuracy task 4 487 | stats_task4 = {'pred': [], 'gold': []} 488 | for q_id in gold_all_q_ids: 489 | if q_id in gold_results: 490 | stats_task4['gold'].append((q_id, True)) 491 | else: 492 | stats_task4['gold'].append((q_id, False)) 493 | 494 | if q_id in predicted_results: 495 | stats_task4['pred'].append((q_id, True)) 496 | else: 497 | stats_task4['pred'].append((q_id, False)) 498 | right_count = len(set(stats_task4['pred']).intersection( 499 | set(stats_task4['gold']))) 500 | stats_task4['acc'] = right_count / len(gold_all_q_ids) 501 | stats_task4['correct_count'] = right_count 502 | stats_task4['total'] = len(gold_all_q_ids) 503 | 504 | # 505 | # compute performance by some metrics 506 | 
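    # Note on the per-question metrics computed below:
    #   P  = (# predicted articles that are gold) / (# predicted articles)
    #   R  = (# predicted articles that are gold) / (# gold articles)
    #   F2 = f_score(P, R, beta=2) = (1 + 2*2) * P * R / (2*2 * P + R)
    # `macro_f2` averages the per-question F2 scores, while `f2` is recomputed
    # from the averaged P and R.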
stats_result = {} 507 | for q_id in gold_all_q_ids: 508 | stats_result[q_id] = {} 509 | if q_id not in gold_results or q_id not in predicted_results: 510 | stats_result[q_id]['pred'] = [x[1] 511 | for x in predicted_results.get(q_id, [])] 512 | stats_result[q_id]['enssemble_score'] = [x[2].item() 513 | for x in predicted_results.get(q_id, [])] 514 | stats_result[q_id]['gold'] = [] 515 | stats_result[q_id]["P"] = 0 516 | stats_result[q_id]["R"] = 0 517 | stats_result[q_id]["F2"] = 0 518 | else: 519 | articles_prediction = [x[1]for x in predicted_results[q_id]] 520 | articles_gold = [x[1]for x in gold_results[q_id]] 521 | stats_result[q_id]['pred'] = articles_prediction 522 | stats_result[q_id]['enssemble_score'] = [x[2].item() 523 | for x in predicted_results[q_id]] 524 | stats_result[q_id]['gold'] = articles_gold 525 | count_true = len( 526 | set(articles_prediction).intersection(set(articles_gold))) 527 | stats_result[q_id]["P"] = count_true / \ 528 | len(set(articles_prediction)) 529 | stats_result[q_id]["R"] = count_true / len(set(articles_gold)) 530 | stats_result[q_id]["F2"] = f_score( 531 | stats_result[q_id]["P"], stats_result[q_id]["R"], beta=2) 532 | 533 | stats_result[q_id]['found_by_model'] = q_id not in miss_prediction_keys 534 | stats_result[q_id]['detail_scores'] = [individual_model_stats[i][q_id] 535 | for i in range(len(prediction_files))] 536 | 537 | all_p = [stats_result[q_id]['P'] for q_id in stats_result] 538 | p = sum(all_p) / len(all_p) 539 | 540 | all_r = [stats_result[q_id]['R'] for q_id in stats_result] 541 | r = sum(all_r) / len(all_r) 542 | 543 | all_f2 = [stats_result[q_id]['F2'] for q_id in stats_result] 544 | macro_f2 = sum(all_f2) / len(all_f2) 545 | 546 | f2 = f_score(p, r, beta=2) 547 | 548 | overall_result = {'p': p, 'r': r, 'f2': f2, 549 | 'macro_f2': macro_f2, 'acc_task4': stats_task4} 550 | stats_result.update(overall_result) 551 | # pprint(stats_result) 552 | print('task 4:', "{:2.2f}".format( 553 | stats_task4['acc']*100), stats_task4['correct_count'], stats_task4['total']) 554 | 555 | return stats_result 556 | 557 | 558 | def generate_file_submission(stats_result: Dict[str, Any], file_name: str, topk: int = None): 559 | predictions = {} 560 | for q_id, a_info in stats_result.items(): 561 | if '-' not in q_id: 562 | continue 563 | if q_id not in predictions: 564 | predictions[q_id] = [] 565 | if topk is None: 566 | for i_pred, pred in enumerate(zip(a_info['pred'], a_info['enssemble_score'])): 567 | predictions[q_id].append((q_id, pred[0], pred[1])) 568 | else: 569 | enssemble_scores = {} 570 | # aggregate all score 571 | for scores_model_i in a_info['detail_scores']: 572 | for score in scores_model_i: 573 | a_id = score[0] 574 | score_raw = score[1] 575 | if a_id not in enssemble_scores: 576 | enssemble_scores[a_id] = [] 577 | enssemble_scores[a_id].append(score_raw) 578 | # get mean all score 579 | for a_id in enssemble_scores: 580 | # max(enssemble_scores[a_id]) # 581 | enssemble_scores[a_id] = sum( 582 | enssemble_scores[a_id]) / len(enssemble_scores[a_id]) 583 | 584 | for a_id, score_enss in enssemble_scores.items(): 585 | predictions[q_id].append((q_id, a_id, score_enss)) 586 | 587 | keys_ = predictions.keys() 588 | for query_id in keys_: 589 | predictions[query_id].sort(key=lambda x: x[2], reverse=True) 590 | if topk is not None: 591 | # if len(predictions[query_id]) < topk: 592 | # print("exception in {}, countpred = {}, topk={}".format(query_id, len(predictions[query_id]), topk)) 593 | predictions[query_id] = predictions[query_id][:topk] 594 | 
prediction_str = [] 595 | for query_id in keys_: 596 | for i, prediction_info in enumerate(predictions[query_id]): 597 | template = "{} {} {} {} {:.9f} {}" 598 | 599 | # H30-1-A Q0 886 1 0.193 JNLP 600 | prediction_str.append( 601 | template.format(query_id, "Q0", prediction_info[1], i + 1, prediction_info[2], "JNLP")) 602 | 603 | with open(file_name, "wt", encoding="utf8") as f: 604 | f.write("\n".join(prediction_str)) 605 | -------------------------------------------------------------------------------- /scripts/legal_text_retrieval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "legal-text-retrieval.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "colab": { 25 | "base_uri": "https://localhost:8080/" 26 | }, 27 | "id": "ep8Xqg5y3x1W", 28 | "outputId": "7b78e40e-4a7d-4f9c-9d73-cad8ea0fa44e" 29 | }, 30 | "outputs": [ 31 | { 32 | "output_type": "stream", 33 | "name": "stdout", 34 | "text": [ 35 | "Cloning into 'legal_text_retrieval'...\n", 36 | "remote: Enumerating objects: 32, done.\u001b[K\n", 37 | "remote: Counting objects: 100% (32/32), done.\u001b[K\n", 38 | "remote: Compressing objects: 100% (26/26), done.\u001b[K\n", 39 | "remote: Total 32 (delta 5), reused 32 (delta 5), pack-reused 0\u001b[K\n", 40 | "Unpacking objects: 100% (32/32), done.\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "!git clone https://github.com/phuongnm-bkhn/legal_text_retrieval" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "source": [ 51 | "!git clone https://github.com/vncorenlp/VnCoreNLP.git vncorenlp_data\n", 52 | "!pip install -r legal_text_retrieval/requirements.txt" 53 | ], 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/" 57 | }, 58 | "id": "6xoYIRTU3_bF", 59 | "outputId": "358b0494-8aa8-4a3a-9ff3-d1e2c5bbdacd" 60 | }, 61 | "execution_count": null, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "name": "stdout", 66 | "text": [ 67 | "Cloning into 'vncorenlp_data'...\n", 68 | "remote: Enumerating objects: 215, done.\u001b[K\n", 69 | "remote: Counting objects: 33% (1/3)\u001b[K\rremote: Counting objects: 66% (2/3)\u001b[K\rremote: Counting objects: 100% (3/3)\u001b[K\rremote: Counting objects: 100% (3/3), done.\u001b[K\n", 70 | "remote: Compressing objects: 100% (3/3), done.\u001b[K\n", 71 | "remote: Total 215 (delta 0), reused 0 (delta 0), pack-reused 212\u001b[K\n", 72 | "Receiving objects: 100% (215/215), 214.22 MiB | 30.89 MiB/s, done.\n", 73 | "Resolving deltas: 100% (76/76), done.\n", 74 | "Requirement already satisfied: sklearn in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 1)) (0.0)\n", 75 | "Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 2)) (3.2.5)\n", 76 | "Requirement already satisfied: gensim in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 3)) (3.6.0)\n", 77 | "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 4)) (1.1.5)\n", 78 | "Collecting fugashi\n", 79 | " 
Downloading fugashi-1.1.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (490 kB)\n", 80 | "\u001b[K |████████████████████████████████| 490 kB 5.4 MB/s \n", 81 | "\u001b[?25hCollecting mecab-python3\n", 82 | " Downloading mecab_python3-1.0.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (488 kB)\n", 83 | "\u001b[K |████████████████████████████████| 488 kB 41.4 MB/s \n", 84 | "\u001b[?25hCollecting unidic-lite\n", 85 | " Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)\n", 86 | "\u001b[K |████████████████████████████████| 47.4 MB 1.6 MB/s \n", 87 | "\u001b[?25hCollecting transformers==4.3.2\n", 88 | " Downloading transformers-4.3.2-py3-none-any.whl (1.8 MB)\n", 89 | "\u001b[K |████████████████████████████████| 1.8 MB 33.6 MB/s \n", 90 | "\u001b[?25hRequirement already satisfied: openpyxl in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 9)) (2.5.9)\n", 91 | "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 10)) (1.10.0+cu111)\n", 92 | "Requirement already satisfied: xlwt in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 11)) (1.3.0)\n", 93 | "Collecting datasets\n", 94 | " Downloading datasets-1.17.0-py3-none-any.whl (306 kB)\n", 95 | "\u001b[K |████████████████████████████████| 306 kB 39.1 MB/s \n", 96 | "\u001b[?25hCollecting sentencepiece\n", 97 | " Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", 98 | "\u001b[K |████████████████████████████████| 1.2 MB 32.0 MB/s \n", 99 | "\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 14)) (3.17.3)\n", 100 | "Collecting rank_bm25\n", 101 | " Downloading rank_bm25-0.2.1-py3-none-any.whl (8.5 kB)\n", 102 | "Requirement already satisfied: seaborn in /usr/local/lib/python3.7/dist-packages (from -r legal_text_retrieval/requirements.txt (line 16)) (0.11.2)\n", 103 | "Collecting vncorenlp\n", 104 | " Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)\n", 105 | "\u001b[K |████████████████████████████████| 2.6 MB 31.3 MB/s \n", 106 | "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (1.19.5)\n", 107 | "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (4.8.2)\n", 108 | "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.4.0)\n", 109 | "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (21.3)\n", 110 | "Collecting sacremoses\n", 111 | " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", 112 | "\u001b[K |████████████████████████████████| 895 kB 44.2 MB/s \n", 113 | "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", 114 | " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", 115 | "\u001b[K |████████████████████████████████| 3.3 MB 27.6 MB/s \n", 116 | "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r 
legal_text_retrieval/requirements.txt (line 8)) (2019.12.20)\n", 117 | "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (2.23.0)\n", 118 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (4.62.3)\n", 119 | "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from sklearn->-r legal_text_retrieval/requirements.txt (line 1)) (1.0.1)\n", 120 | "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk->-r legal_text_retrieval/requirements.txt (line 2)) (1.15.0)\n", 121 | "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim->-r legal_text_retrieval/requirements.txt (line 3)) (1.4.1)\n", 122 | "Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.7/dist-packages (from gensim->-r legal_text_retrieval/requirements.txt (line 3)) (5.2.1)\n", 123 | "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->-r legal_text_retrieval/requirements.txt (line 4)) (2018.9)\n", 124 | "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->-r legal_text_retrieval/requirements.txt (line 4)) (2.8.2)\n", 125 | "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.7/dist-packages (from openpyxl->-r legal_text_retrieval/requirements.txt (line 9)) (1.1.0)\n", 126 | "Requirement already satisfied: jdcal in /usr/local/lib/python3.7/dist-packages (from openpyxl->-r legal_text_retrieval/requirements.txt (line 9)) (1.4.1)\n", 127 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->-r legal_text_retrieval/requirements.txt (line 10)) (3.10.0.2)\n", 128 | "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets->-r legal_text_retrieval/requirements.txt (line 12)) (0.3.4)\n", 129 | "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets->-r legal_text_retrieval/requirements.txt (line 12)) (0.70.12.2)\n", 130 | "Collecting fsspec[http]>=2021.05.0\n", 131 | " Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)\n", 132 | "\u001b[K |████████████████████████████████| 132 kB 46.8 MB/s \n", 133 | "\u001b[?25hRequirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets->-r legal_text_retrieval/requirements.txt (line 12)) (3.0.0)\n", 134 | "Collecting xxhash\n", 135 | " Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)\n", 136 | "\u001b[K |████████████████████████████████| 243 kB 33.5 MB/s \n", 137 | "\u001b[?25hCollecting huggingface-hub<1.0.0,>=0.1.0\n", 138 | " Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n", 139 | "\u001b[K |████████████████████████████████| 61 kB 519 kB/s \n", 140 | "\u001b[?25hCollecting aiohttp\n", 141 | " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", 142 | "\u001b[K |████████████████████████████████| 1.1 MB 41.7 MB/s \n", 143 | "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets->-r legal_text_retrieval/requirements.txt (line 12)) (3.13)\n", 144 | 
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.0.6)\n", 145 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (1.24.3)\n", 146 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.0.4)\n", 147 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (2021.10.8)\n", 148 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (2.10)\n", 149 | "Requirement already satisfied: matplotlib>=2.2 in /usr/local/lib/python3.7/dist-packages (from seaborn->-r legal_text_retrieval/requirements.txt (line 16)) (3.2.2)\n", 150 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=2.2->seaborn->-r legal_text_retrieval/requirements.txt (line 16)) (0.11.0)\n", 151 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=2.2->seaborn->-r legal_text_retrieval/requirements.txt (line 16)) (1.3.2)\n", 152 | "Collecting aiosignal>=1.1.2\n", 153 | " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", 154 | "Collecting yarl<2.0,>=1.0\n", 155 | " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", 156 | "\u001b[K |████████████████████████████████| 271 kB 35.7 MB/s \n", 157 | "\u001b[?25hCollecting multidict<7.0,>=4.5\n", 158 | " Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)\n", 159 | "\u001b[K |████████████████████████████████| 160 kB 42.5 MB/s \n", 160 | "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets->-r legal_text_retrieval/requirements.txt (line 12)) (21.2.0)\n", 161 | "Collecting frozenlist>=1.1.1\n", 162 | " Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)\n", 163 | "\u001b[K |████████████████████████████████| 192 kB 43.6 MB/s \n", 164 | "\u001b[?25hCollecting asynctest==0.13.0\n", 165 | " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", 166 | "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets->-r legal_text_retrieval/requirements.txt (line 12)) (2.0.8)\n", 167 | "Collecting async-timeout<5.0,>=4.0.0a3\n", 168 | " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", 169 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (3.6.0)\n", 170 | "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (7.1.2)\n", 171 | "Requirement already satisfied: joblib in 
/usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.3.2->-r legal_text_retrieval/requirements.txt (line 8)) (1.1.0)\n", 172 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->sklearn->-r legal_text_retrieval/requirements.txt (line 1)) (3.0.0)\n", 173 | "Building wheels for collected packages: unidic-lite, vncorenlp\n", 174 | " Building wheel for unidic-lite (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 175 | " Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658836 sha256=9a2f1ce4d990b4cb1836d0c513a302d4fcc03b7a996150dd2ea8bc8c8a229445\n", 176 | " Stored in directory: /root/.cache/pip/wheels/de/69/b1/112140b599f2b13f609d485a99e357ba68df194d2079c5b1a2\n", 177 | " Building wheel for vncorenlp (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 178 | " Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645951 sha256=b9e7a739e682cc3014f4451ed33864d7f84ae738e0e5e881b49ca3720cc0465d\n", 179 | " Stored in directory: /root/.cache/pip/wheels/0c/d8/f2/d28d97379b4f6479bf51247c8dfd57fa00932fa7a74b6aab29\n", 180 | "Successfully built unidic-lite vncorenlp\n", 181 | "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, tokenizers, sacremoses, huggingface-hub, vncorenlp, unidic-lite, transformers, sentencepiece, rank-bm25, mecab-python3, fugashi, datasets\n", 182 | "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.17.0 frozenlist-1.2.0 fsspec-2021.11.1 fugashi-1.1.1 huggingface-hub-0.2.1 mecab-python3-1.0.4 multidict-5.2.0 rank-bm25-0.2.1 sacremoses-0.0.46 sentencepiece-0.1.96 tokenizers-0.10.3 transformers-4.3.2 unidic-lite-1.0.8 vncorenlp-1.0.3 xxhash-2.0.2 yarl-1.7.2\n" 183 | ] 184 | } 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "source": [ 190 | "!cd legal_text_retrieval/data/ && unzip zac2021-ltr-data.zip" 191 | ], 192 | "metadata": { 193 | "colab": { 194 | "base_uri": "https://localhost:8080/" 195 | }, 196 | "id": "A4NnMJxiCQOK", 197 | "outputId": "4cf03fc3-2445-4c54-930b-ff6e1f0f0236" 198 | }, 199 | "execution_count": null, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "name": "stdout", 204 | "text": [ 205 | "Archive: zac2021-ltr-data.zip\n", 206 | " creating: zac2021-ltr-data/\n", 207 | " inflating: zac2021-ltr-data/train_question_answer.json \n", 208 | " inflating: zac2021-ltr-data/bug.log \n", 209 | " inflating: zac2021-ltr-data/private_test_question.json \n", 210 | " inflating: zac2021-ltr-data/legal_corpus.json \n", 211 | " inflating: zac2021-ltr-data/public_test_question.json \n", 212 | " inflating: zac2021-ltr-data/public_test_sample_submission.json \n" 213 | ] 214 | } 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "source": [ 220 | "\n", 221 | "!mkdir legal_text_retrieval/data/zalo-tfidfbm25150-full\n", 222 | "\n", 223 | "# increase topk to increate the recall score, maximum perfromance that we found is topk=150 => however, it is time-consuming\n", 224 | "!python3 legal_text_retrieval/src/data_generator.py --path_folder_base legal_text_retrieval/data/zac2021-ltr-data/ --test_file public_test_question.json --topk 20 --tok --path_output_dir legal_text_retrieval/data/zalo-tfidfbm25150-full" 225 | ], 226 | "metadata": { 227 | "colab": { 228 | "base_uri": "https://localhost:8080/" 229 | }, 230 | "id": "EifhPKVA4G45", 231 | "outputId": "4fc7abe4-b13e-466b-8b60-357a2374186d" 232 | }, 233 | 
"execution_count": null, 234 | "outputs": [ 235 | { 236 | "metadata": { 237 | "tags": null 238 | }, 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Load data and tokenize data\n", 243 | "61425\n", 244 | "01/2009/tt-bnn-->1-->Điều 1 . Phạm_vi áp_dụng . Thông_tư này hướng_dẫn tuần_tra , canh_gác bảo_vệ đê Điều trong mùa lũ đối_với các tuyến đê sông được phân_loại , phân_cấp theo quy_định tại Điều 4 của Luật Đê_Điều .\n", 245 | "Test ids = {}, Dev ids = {} ['71d7a66777628e3f9b8b85270366166c', '9032f491fd4ef429d9c18b3b015bd11b', '776e27836f8fbe84f60a850952498988', '8d4086520df91f54eec6a4accf796569', 'c0a8f1d99cd3b7ee60c21d1e9b98125c', 'd27c13871d62e88714fdaf539e50382e', '516686bcd8331a26428556e346c377c5', 'a8f2bbaf928d9834c8257188ca2a0780', '98603c2f4ff85046a071701c2659295d', '867833a9097a0838ad3a0a9df0f7a4a5', 'ecf80f40365366e561ce1b900a43c8f8', '761000dc2471950ac51f58d60694e78a', '38959f4d0d0d96f0f567b4e8c3e24809', '0f4bd20ef527fa6d503f0747f91247e8', 'c6c616ea37c4f4e9f753a1a26c1f513f', '1dd1899fc4c0293f6bbb95af80cc39c4', '1d07fcbe2a6900c4e4ea4409ae2c5270', '568d6332f094e839329a4d66d30dc6d0', '6d9ed75aecea4cb4e6298bd638ff4afc', '0b0fd757eef935f9609cbfab1443840d', '8031c4b9f388c424ba2ed4da91f00173', 'a1c8fc922eac56a7bd275565defa9763', '07f44d71c453c52f9f055ab877b909cc', 'a71a739c154aa77fb2800a426818df5e', '998fe36c4f8131243b766398f5801c22', '5db6785789345d1638b1046fee3b4c76', 'fb85950dc8effebfa15c9555f238d7ec', '927893d2acecb4415e1f68f923dce782', '1cded34a5dfb786368208f632e5a5d6f', 'e180827dbd190aa6e9651be5d08f7230', 'e243fda5f1cb39ea6c810564c025638b', '37710e7c0449baedbea4236ef2306da1', 'eaba6c7538a5d6f7f993e1bed678db73', 'c062f1a1e198a4da90193c5e06c484c8', '57cd65ad7d3909ce9b8fe9d4e2fdf709', 'f571b343e99c24e012804a5b04a591f4', '97ca93ecb6d659c2af7e71a269ad021a', 'bec4c14ba02a32fad812ee94a98c1602', '12f506bf952aec9b3d8289a62ded85d6', 'c396496b481e2685a7733cfdbeaf925b', '932b64b6d7fa0605031eaa6f3c63eea5', 'b00b0b63b241943957b48e9a4f9b7c27', '085fd00ce06d27d19f40b330c7d8ba0d', 'a6a55291b82d26ed8852361a80575e89', '0f1a741eea4c4cbdba9d46dd3d762e2a', 'a6f2a358b6a2a33fd8c44830293b8323', '6c5bebf8e67ea0562455656f4132dca8', '27c8850bb21db8dbccc40ebadaf4daea', '393a3cf1a1ce2e6de5955e85605b2e61', '346ba06d2fc6ad303d2c25aa03c8f309', '5160a38cdbea74be5e569b767f025e01', '0a63c38ccede65ad5788d469185fc315', '940904b10cb465abe65aa7f7fafbe2a3', '22333feb1c9b1f02fa531f878c1970ae', '9348b0de17a27622d52d6c71fa12d419', '568982eb42078f033b3ea568329b6c0e', '56a842911136ec4628584bbb915f8f4b', 'f33db0c82c1d8924f035a0ea3b2e7ca9', 'b4fbc993f786bed4cec41b356919f2e3', '6cc5cb34c7f90bd870c027ece3a15fad', 'bcc2b44d3cd1f9d2b8458c595fb9b02d', '269c71c3a7cb517e2d21f20bad0714c6', '513b44a60a7bfd9a8c1a5a2c49ca185e', 'e05c7b866e6e8e7394ca5b1d02c20f58', '9652a10d1629156e0789bb630e081f9c', '8a07ce1347bfecea1903820b2b76e0be', '9e560864413aee1abddcde454e778e94', '62beeffb51a33d06b38e2da8d5f2d051', '1d6f6841cc1de15d208daa6aee88e2ad', '47c9781f39a732c3643f6c4e600e19cf', '8816ba131c25d99edd0cdd75c07859b1', '07bf09806e2103ac74dac0afeb33bf6d', 'f7a936a8f212e039b7fea6867ad7f010', '199525a09762d6e51c1c7f4ef55c8926', '6aa56327ff9ef7983d0c691aca0ef1ca', 'f29cdf5317bffce5648a21766b6b5f34', 'fa220df5ca6551e99016b9d5897b7915', 'e9213ea8f437aa9d320e0e1271c4a1e6', '51482d141bf1dc8a92bc294563992ef0', 'b829d2d9b0b73662c3bcfaf032bcb1e2', '55540a87c8cdfba5615d4079484901f3', '5b53081a25e4d66f353d2229a0564c65', '1f993001eef14eee827293e16d218ca6', 'b0288182991527078a0693bca89985b8', '6b599fe2bfc9db75aabab13175652d56', 
'e8c7b7a4bf4390bf03ddcc5d4215dba2', '0c0856b3f01f7a0a6d9fbcbf4253e190', 'd45962a733099ddc701566f0d33556ef', '025be63b5d1ae501b268711832cc9665', 'ba71aa394247a7a27230a2be489ab420', '916e520d5f358a54ebcdbba0179d8843', '6095a61aaf59fb5bcdc88fe1ad8a78c2', '84b51054f1131067758bb1137d812b78', '1823baf623fff2394d985f4f6c0e6d74', '2b5c43f2fa8f0b133535cd3ee6c6c695', '7dcb8c84aee27fee41f9400c33ff1551', 'eae565378fff9055ed48235add42bc15', 'ea1ccd1847912ce2cc57d7901a0d81d7', '2922376d0667c1809e159264aea169f5', '0c0bcba0c40adafd29e2cf47ec4fb544', '85eca4e62686b28864e1f6e20ce8d683', '1a78051dee60dbe7d688a3127f12544e', '46b1520f1cbaa923ac8cf85feadc9cf4', '49655c66704cf856ca2dbbc77af22586', '7b107a90c7d45209b9b367a08feb7708', '4361de87fcb0cfd740d35212aa7c7211', '1e8f4d2acdb3d4a94ce1205e0326f9f6', '9f0e04bd2d29040421904442696311b5', '2bb484a925b6a25f1277b7008651cf92', 'db5bdd3d97d53f257ac6afb8fd78f462', '63aa5757186c1def977a118d31445ec0', '9864da59e74afac4d219fc65939826e5', '4ec499632cf69a6f75f630f09fefa1dd', '6fe5bea03a70ebb09564b74be2555d7b', '6011873dfea515990a882d38539e126d', '93667e0ed1442c25148ea330f86e97ba', '5c4a6d35244fd3ce149b1008535a7809', 'ccd1c08880fb15e70d4b1aac0063f1bf', 'ad408423128850292ffeb29b163cd78a', '55051865aa377bcf48acad6782479e97', '11664771aa54bb4138078077243e6c3c', '3cbedf02828b82d680371b91c7ec53fe', 'dfbd4c5642fcda3f5008590ba629c9e6', '6ae4cab1d0684afaea997a01a0939f71', 'f6af9cbc4f43f3a708e4cbf20e0315db', '8b03abc22d2051cb7740ac9f5795346f', '66d8156c3e21d105c697555f7952d5db', 'c95987f619409f89814af39a83d36e4f', 'b2a375e7fb7dd6883b7563f7dd915624', '4ff8af2020275408284cf75a8bbb94ec', 'b82841783bc6ea570cac350a031f462a', '35a9f67c975bd2a4d3b79c97df499293', 'a3a91683dc5cc0f98037c23b8d962acf', '3d800b30aef8a7da60395c92a0369ac7', 'b6ba5044eefaf39648286d57aee10803', '1f72bf63e2247500e0c6d674f5f1aab8', '4eed0c5d082e7207ab2083b26db70593', 'a69b1206b162095ea6bbb35fb64ef57e', '941af762cfe86db412d9b995b33277ee', '7e063e5a3613f993d9b7bcfb423bbda6', 'b6db70131e6355a13a741fd803d8ff95', '4db28b9c8602a7a17e9b9b80e5d090ce', '60cce41086fd0866c296ad09e4a9fb5e', 'a8bc849f5a64ea13fcfc139b152fe03d', '2df3055029fb0a7eaf3315d35853d756', 'df0cba25540dc2d66bafa4af3144190c', 'c586059099b151b8fa61c7483717cbc9', '225018e18cbf57b20a80c746e7861594', '9c1a4c9d0b5a9d47e4e053eb2d420407', 'f712c1df0637ed68819cdecf4617716d', 'b0779a8ab51a8f8f8bff1d2f35d5a6e2', '1cd6de23155f859461caf9c0458f8f87', 'e881b7ab3d0a47237fb5b1c1fc23376d', 'd54a17301e302785439f9dde6f3fc208', '9d4519587afe7aafb8f21dfc1b0d0996', 'b86f86f863585b3356d2304b7e3c62bc', '1a78fb79fa2a8982c87f21b8bc533615', 'df5b0524ad802d413159260557db958c', 'ee59574e746db4c025602b8605420659', 'd9bba8a3889fa3dac1b8e41e0146563b', '2fff98ca52fd19ef92a48c52029606e1', 'e5f2ca84b9f42da0c5a9fd90164ccad9', '6152b60d67077a228acebe412e1ee849', 'ee2a77ce843d24362cabb427c958871e', 'cce910d6457d84f18b53b37624e9be86', '62879b753d9ceb7931a02255fd59738d', '752560c49d508b3109ccede8c466d331', '2e705213cc37d836360a3eaca1137239', '8b524012052a7f83dcbd6747d17057c4', '5ab6090e8225c3787e56132e2a607d68', 'da4dd2ba6768317321515fcc645867cf', 'b53228aa58bd0a7218eb65b470870fc5', 'e6c4a523e241b25f3ddbc21333d93183', 'ac5e9a8968f532cccd40dfa75105d883', '3917656e05e2824cf5c7d49807daa559', 'afd9c8e6cf5e853598a4b093779381e3', 'ddf2dd4a5343a76ae274b951a573bc20', '1638139e889ba19e05813f7c63a62313', '5efc169c7d3e7f7319189a86f88d18e8', 'd8825e8ec4411927082d088ef0008d54', 'bc48d54e378c819bf0ed83f614d2eead', 'dbe28073723a939c538e46d36ac858ca', '3610e843650a8876095bc5510280dcf1', 
'ae84f2816f15df41129e2c3baaf78063', '63e890c53312ed84881061fa597e1173', '398aa804f95674fe47027a3ace7b8a1b', '5b212fad64feba45b1aac1e00e5ad5d4', 'd039fdd87b3227982f2b8ce698849d47', 'de630b5890582133d146be37b8f7d034', '3f9be74ba902086262dce25092726c8d', 'a57a45a928db8a107d431931d1b63081', 'ce608bb27a6e97f47e81e6b285621d82', 'c2d1d17f441fe69a98dee9e647dc1d11', 'a02b340d7c63212f2a77f7f98fa53637', '2486853af95400d8af5f9eaad8264d30', 'a22cc150e903605e28fc5770adfbda9f', 'f8b7020ca42524815eb92b9e4b4cdf4c', '728b7cd0d354f32d64d47b364b35d813', '26a1457116407d0ff002d2aea64e0703', 'cb8868533691a6f0793cddc3576390a6', 'f816baec07810f53f1aa0bddad3993ea', '807ff24320ec7286f132a001f49aa297', 'cb42d83f38d352f4fe9caabf460722d0', '9c42c41c65c68ff4d10a5bf85cc586a3', '5b3607941af2d1290240044796d36a5e', '4f4418d5d81f8fdee4d8093a7bf92980', '5025836c4777bf0b7f56fd9f727ab568', 'bbcc8eb3b942fc2547e43e15d3b2d66d', '203726a51ed642b922e82a97f7c68651', '4e96ddfa5985f50356a0264498a42444', '59b9c3f86716b3f59175f1b839e324be', 'e27841a82df6187263b88bac10c5793c', '5a36cf95bf04a713652359ff744518c4', 'a4452247cf772a8f493ed45be2146279', '4ae391cbf0fbb6f54d4cdb79e40b3014', 'f474f0d3f6886b50246f2a74969d3cb9', '88f6107830f5e3d8413a025248e1e1a4', '6b24fa9df1b373a9d71192a8758e85d2', 'bbab47202e290663b1159185a7fbf016', '4f84706a5b704fb07a371f877021fe70', 'edd0fc3a25e58af6c5a94cffc5b53512', 'a7d6f138c779a0cf4917bd802663fe1c', '39104ba8977eb63a7cda6d6b13de2f96', '64bd5389adeb465253b181d838e5b68d', '7b748d57587bebad3762a865e45cf8e1', '69e7e1191bd2466b30f43c7d4c38d665', '1b82531c05d3135a5d1c7ff780454ffb', 'f9cba099a2686b06d37d17ef17239f67', '5586215272cda7ca20de7960ba716ac3', 'd7e7f51b5c05eebc51be0667d6786bc9', '0adf2d326855f05d198dc27e32f83e6b', '9cd29923f70f5ee2a346c2f85b6c0c70', 'c8ef92bff62daa7573e81decb7eeed7d', '1fc7365a9797e6012b4cf394a5f14627', '5aa0c0bf9466f8295f6e9eae1b2bbc3c', '5ff4f20594b9a376ba23cb3931cb3751', 'cfa5d4ad00c3f173563848c3abbf9f0b', '8b1f55cf26ef28727d3e98446c014dc1', '019d46639c173a9a95d4516cf4f0e46d', 'b98dae8f5c65cff8179d5dd2406648e6', '374cebaf127949e6f7ed7839c3656fa9', '0959dffe01941688e8408f5d4a7b0384', '07da3d9c550c65237902e2f2ae292914', '609a8cd3a608dd5c64b6823eefa5663f', '82d39c8eacd784def6cf14c8f849a679', '70582abf441e432c99cb13b40e1c4665', '37de9bb8ff75c3afdf94b71b9a48d4a1', '7b157fe80fb7aa84783db5d5ca0c1589', '00906c9fd0920b8ae1cecc38bf92d29b', '27c7c6af5658a77c7066c4832225577c', 'f755871faee0d36e43adcbc8324be2ef', '812eedc1f31d984ba60b2e443045cd77', '9bb137ff6aed857c6756063fd415518e', 'ab0ee37a6cd76cbaef1e38cbec044c88', '30820e2aa5a6d375c933e7185d9b6899', '3bb9b4d9bb761a655ad434180d384017', '8737364460c311e9489f2efecb858b0d', 'bd50fa4aad7b18da1e0376b1148dd66b', 'cbccbb39823fad6e408c746fec35968e', 'dec302dcb6372b436b70a37f2b92a2c8', '2598956a9f17ff8f12aecd8091b20d65', '95a06a96871131c62c1ae37356fef1ec', '0731f13bab833224c2fec1f4281ae2f2', '259542ca8b02d16c6df1489a8b2e8d4f', '9c875e2d5bbc5976db7deb1c5ee79fa0', '58e490450309513c0b78302827014dfd', '3c2c076905a292801eedb7550959b5db', 'a2506e16e44f432807654a94f913cef9', 'fca93e9a8540078571a1f16c0986d14a', 'c1bb5a2deb487a453c394c526ad1acaf', '9fbc30cde5a73c9a4c9ae2ec1875b1f6', 'a51cfc65bfe9f541bcc901decca81b9f', '6b16de115c6d63bf08c82931d50ad05d', 'a486e66cc48a344bc3ad9bcfdf4cfc37', '94b95dd371e0e8e70fe3745067567e66', '6c9c93cface745b241432ce73c76bfb8', '750ebc1df0359555bd5a10313880fb3b', '41b7b5c2056aa9688b8d2c220ebb2b2f', '35cdeb7d3d5b011b816eff9f99dc223d', 'b57075c4d487a10f6212b1cd470f1cdc', '89ee072332c37d47ad58d1ac7865c92e', 
'248a5c9213b4e959291792f0bd7241a8', '951b8b2ed48aec6ad14a335cb2e63d91', 'd4fcfa0666a39af97969a29b0ca002d1', 'b6594604feb3757adea8657e9b0fcad0', '061f7bd9fe6ae22b8b07f16fa3a3cde4', '18d5df1fe3fbfdd3bf1ffaacef35f3a1', '479ab4e179ebab72a9d2963afe8018ac', '5281c51e3be66ddc921277bb1b9657a4', 'df73e8ab1e7805c4497357162c38d6fa', 'b5fe04d4ea147e3e9de780f0a88fd71d', '32553a298851afcdb65adba991384ea1', '1c4b63d48bfb2eb6772e551aff2b287b', '3cfce1dc697fc9775f7b80beb0fc4d23', 'e90dff2cb72d52bba7128d2e90e79eb1', '932fd4c716a5226b78b80f208147021c', 'ba41403971e54618c894007859384a42', '108e66d5039e1bacf6e69141f23f7a67', '8d472c1214f5c9224c74f2917f7c80c9', 'abd6f49246d4181cab90bf2da64da308', '5ceceb62de9d399e702c3006c5e3f1be', 'dd838c540ca88782f3fba00e934f1af0', '245456396bf0b2fbc271039160f22de2', 'bc2f1dd0b53c59269ce1bfe9a0b84a69', 'e8569569549580e62aaf75b8f4a759d5', 'af772af5e47050b906b8dc5f71efa76e', '3d20882750f46c89d64300fcb45424ae', '39be757346ecd48bce9e3b9ce4b2bb0c', '81ade189c6fda6a6d405a240f37989a7', 'c41c73d21bfea0ade03346b169864391', 'c0a55a31e49e98a9d6664e05b8702a5b', '47520892700b208136b0365136789078', '5d3a6e952fb8cdde9b918b34bf626e05', '3e4db1b0ea7a8d6499c71e2851537f77', '1a2e68f56d7dae8e4369b55d3c82ecef', 'bc4908c241186bf2397a60837fd601b8', '64babcb1fbda40238397712e62bd418f', '56b56e26b932fab51fa35f2946788353', 'a053819838a1dc21b8f1f2761a476029', 'b4f5a15df471762b3d3f15e10ada7171', 'd629ca20350bf128696d0086e8ba1762', 'fc2d0d092e23c52a0a219e26554eb40e', '8518868ded7e9a76d1e21914656225c3', 'f7d55a23b8f1ad646d77ac69526abb22', '767f3c58c6fddb6fee0c2b03d61499e0', '64910b54509bb37879cb61a133434b15', '0d7c0dedd986f0062147701f65253c89', '0d0a7d1e8836bc96e0ddd6c072535c75', 'f573ab803966a22b6e2060e86baf9803', '7258c9222dec4148e4e9bde4b4f6afee', '30d269e7cbc4ba4586e7f8c9b84ad5c5', 'b237dc219aedbe90928a29c700d08d60', 'a168bc2802e792320cfab8ed3ed5f5aa', 'd34f25ea05cf09e53a6eeb621249eaf9', '82c63353ee1e8aa779afa3e6c7c2aa2e', '09ae7cead00b87c87234745a7155faed', 'f37ff2114d5a0078c0d782a4e9cf2646', '719101222a537d7214776067541b7dc8', '8d583b2ee0dc364257f7cb5e3e9571ba', '05cfaef20dd799bf7b135f8486c56922', 'c431244d360e3d09a87f24c87ee9a9d5', 'b119a39d90795c2d8c58cc525740b2de', 'c49d9157b44086dad352d2866880eff7', '975b8049ed717e42161e2ea4b3ebfc1c', '88cbb6e64e123c3da8f5eb64908e041e', 'c89ee7997fcecf5103063d18412a2159', '2402d0b852f2719838715a5483b67f90', '6972ad489459106c6a38b4ceacb1fdda', 'a583b5daaf6a730d836371b2dae4df22', '6ead8a81311508ffab0ac0128482c3b9', '904e9ee3a6001249cfeb13247082c067', '0b7f097bfc132334cdbb3bdcf078c82c', 'd8b4faffdb0a557a6bea5ae0eea4053f', 'bad4561e5cfdc3eb113144c08b3253a1', '7fd2e5f6832bcf6975b247937d175718', '171703ad36d950b4a9fa1de65627d3cb', '4d32efcbb63b326f72c6a055a2505625', '653fd442746b1d58061c58f1131ca39b', 'f48a364c29a0a0353284e4a6b6471471', '2cd2e01cfa43e6f8e6ff9b72fa5bd931', '06c9aa1712329ae572986b32b0fe0343', '1b7d61ec7377d0608370f25469477f62', '41f4e509d5096168cc118de4388fcbf5', 'cd915974bc6420f03e825049c84ff7f7', 'a360bcce8248c707faa9a3a8abdda997', '2157d4fbab685a25dcce9534d03c6d41', '9f8a677c3c6fa1509caa703ef7916c35', '28aa921921dae3d9dcc83516a340420c', '0b8c1356537d121dce47ff385ce91450', '2ba48182d48f83af1b96a6ac34d80dbb', '9f220c00fd0003c0b51b08dbaf5d2f2d', '17465d16e0c6a6addf57547fee84bb1a', '3ba711f77a46f743aa6deafce77f8576', '142a8c532b54289fea4a7f1687998772', '575ec950ce1ca0684b1add581cb0c995', '94aba9c9160c90cdf98f0af4fb203ce4', '9ed041c2bdcf38442c449b9e1b18535c', 'e7a108a395663e07775218a03583ed81', '94add8111457f05850e8bd5d4cc3a7a3', 
'5054d386528dc4ae36e845a1cc3cdae0', '666193de419791b49959af7932f5c8c2', 'eb6d9abf15d5220f992a9ac27a5001fe', 'f46cc14603a04178db39532dc15c176b', 'c3813cd5a191ef574003db15d615cbb8', 'e3fea6941cf58fa9b4e898d8955649dd', 'ce08c1226d445a18b7ce2ebc80a7096e', '8a8dfcf2c4cd350651bb8734955765aa', '9634bea378870bf33a7e347fdabf2456', '39b5f0e9e1a1f50e97c7a9789d5e2af0', 'cdac30e8a60ac5f771c987518dca38aa', '6cc2d361537457056399f96e437316ba', 'adfe05f999905e8b3341d3a8ba3d63ad', '37b0829936bcd9934a15eed60fa8532d', '496e10c729a3437c0654d4b5d4c09d82', 'd9c96f6ee613abc7d7949c3377681413', 'f0cb10da92bc2083159e443319216a37', '15045d3e23344df09a1d8c005bca41d5', '15d02f99130c0283a580bed0bed78275', '2d15324eba0e70ef4a1451d5321e980e', 'c7d3b831381fb5e75f144433358493b6', '9d4d42de8c64f7c79fd82fb3bb8960c4', '4bfe10874285d8d68486b8c991462ac8', '200c064f3fbf08ddcbbfb2f36118b066', '8eecc61b4ccca9910892a9c267d35d07', '41c698f0fefac0e702a03bbe38c755ee', '9221ec3ed5a0cac9cc211b1f76fc2642', '835f40e6144719e73f26d260b8ad6c80', '463d9a27874682c6001c748196070b22', 'a236d75f2c03e16fbbac2825d42eb914', '83ad78f86297babe27c2ea9cb47b4675', '421f72e8a492f524b696dce4658879e2', 'ebff992242dcd49d4004414a94581574', 'd6b7c0a83da2af88d584dad256dd08e0', '6a559666d4f4561014c97c2177068f78', '67d0e885fbbd04b3675512bb31d1a0e0', 'dd8ecfa0c5e14fc9dfa9903f3501a07b', '590ca279d64e0fc64a5a1f87d91c6bfb', 'fa457b5f7729ecaad819d201ffc49784', 'cc36fbe3a57fa2b04738ab2a7b0e5e12', '864e494bca15d43d2a036610712c0180', 'f8509fc848701a8f2ee6c0c0a59936b8', '903a785a05d396bb9f3b3b1db1e59d7e', '87d48c0f33a9213c9c252535a399ecf4', '3deb452f47e2dba10f8baea299721ca9', '12d512a24a5210ceb6074e7bdaebc227', 'ae97d323473045ec06cfc1fd165808ce', '88199cab3584d87a46150bda69379a9c', '09be0cb04e6fd9080bc7ccf8366373ee', '2d81d266eaf9148fdf177891eb51d708', 'b3d89252b2dcdf7ab1b6ea69e05b4e7b', '4e642b8d7055956511fd63978aec8d0d', '3b3b42273065160a4cbf88d5507a0dc0', 'b5073b22539ea0a59136f8b540c95b02', 'bd723b441e4de6d16bf3ef78a0ed10ad', 'a65ad70ea91d2a1af991e304fadd62a4', '6f451320b2bafe1e730a8b6e2bd833a5', '91dfef441587afeac249429a4dd44cf8', '38a8dcbd0ebfa6961a368f4fc17145ab', '8fc484b27aa36a70fd67edefd9e2477f', '6b4b817b770b8ccb189a2597c694506b', 'f45a2ec2f61ca00a1e099f97f316d79b', 'f53603d80e75c2e3be16c3feb36a539d', '531d0a1f09348b04f5dc54bf84583cc1', '879bd82b192d004e0fb657469c76fc51', '2b50e856f51760e84318e7092fdeb0d8', '2e8f9fc2dcdd0dd7752ad2d268566b53', '80cd82b51fcebfcd90ee3af11adc09e1', 'd18e62ff9a47263dd6ab7c3087879692', 'e6c1afcc7bc65980b80bc5e921bfca06', '806fbd7dd888ce61828fe4395b6f4e75', '1bc94726d834ef7b9535d51e78bc8c55', '09598adef2692a5ecc76bf5e258c2ec1', 'b629be66e8e45a28ed259cfe274af605', '28bb026e961ba92d5f792df7039555ae', '7e28b3a432db2f50621c6fe42f2f36ba', '0d828e3d92683f430d43c51c9fb5b54d', '84536d2896d31e74666b6aba27aae014', '3da255cae775b060a22dc63fda61fefa', 'eec32b35fca1c7403c1fea55d39afda5', 'f44a80af3c1bd429a946f4a274113e6b', 'e64d2d6c8000833d9b5883073485ff03', '699193b16271dbb8b49f218310243030', '1792cda8cff53c86561827b682b91bed', '5f862b1fa2ca69490d9bbac6ceae0337', '57bd90ae2a361f1382fc725fef0a0921', 'd382ebbd528354123e59a86233c32b06', 'd6d2c4d555de208ebbbe6c82b0aafd17', '8fd7fabebed71969fbe86d7364c8861d', '1be229c4bbd1962054419c0afef9eee4', '8834cc11931a2e49c5e0d54e9da9aad9', '743729efb199a6558d0899195d8ca740', 'f7bec41b29d58c70966f97ea9b52bb3e', '0190b8458cfbd28683eb07811a7a7ec5', 'fd816aea82671f71e3eb67e977af00a0', '6f4f1114bd7840ed5e5d756ef8c6432b', 'e9b5c43b5fc7642377401d25a9bedd2d', 'a35a63e506a0b73d125ff0080bdd588c', 
'1b84156caef23d63398ece07abe5929e', 'de14cd74a8e95268187653ec969312c0', '4b3e0bfd44175b0cae7c36b9e8e519e3', '104a4d3bd3aeeac50d2cc2fb4493bce4', 'cc1d54da1222b9a544b08ff4f645f3fd', 'c80255158cfaef8697d6c6bfc2362341', 'c87009f5755a9b6aa356a1935a57e5a9', '36eeb567a2ea72c1b54cdd9d46a6647b', 'c5ab8758f3c8796ab006d65be6fc25f8', '319f3b8aadfb71e710e070af1bb1047c', '02c93ce8c413e2e8e77975123aecec49', '91ee38d3951b17dfc4c7bca94b6d6dd8', '2e9de09614d9065d6dfc2ae16d4268c1', '57bba769688fb4372296000214954f56', 'fbc81ab85f91aa162fedb88547abc2ce', '05bdc47cee2f106023cbd8617b0997c4', 'a88fd0271a539c09d0c906153736e111', '466e95e1a82f48dd1fdc2a119f8df9fc', 'e580bba0a66774889eb04ca5ca6f8e37', 'd0668af748d1fd0ae7347eb90a5d4fcf', 'be84c54af594465edf6dede1977dee38', 'b582dff2a2b91f406b14ff64e032c72e', '29705fbaf540d6f6b2f8c67313b3070c', 'e3160345920372b267101bdae099bc20', '0a0620ab140b578867661ce0f0abf856', '0bfbf291188a05a0967b5ec02f574357', '98c5511cd321f13f7c1953986475c8f0', '51f0ada675f84142440bc7e56caa57ba', 'cea5cc20e8b7e2c2de0deaa3fb32a371', '7d7212ceeaa7cb2795038bc79da98159', '7d3db7859099303167c8396310a79fe8', 'e37bf85f107b2ab0fe65cc84a7ae103e', '70eb3bc85ff6e3914131f5a6cd81834c', 'b7ee2b49525d85788b253a1508efd4f7'] ['71d7a66777628e3f9b8b85270366166c', '9032f491fd4ef429d9c18b3b015bd11b', '776e27836f8fbe84f60a850952498988', '8d4086520df91f54eec6a4accf796569', 'c0a8f1d99cd3b7ee60c21d1e9b98125c', 'd27c13871d62e88714fdaf539e50382e', '516686bcd8331a26428556e346c377c5', 'a8f2bbaf928d9834c8257188ca2a0780', '98603c2f4ff85046a071701c2659295d', '867833a9097a0838ad3a0a9df0f7a4a5', 'ecf80f40365366e561ce1b900a43c8f8', '761000dc2471950ac51f58d60694e78a', '38959f4d0d0d96f0f567b4e8c3e24809', '0f4bd20ef527fa6d503f0747f91247e8', 'c6c616ea37c4f4e9f753a1a26c1f513f', '1dd1899fc4c0293f6bbb95af80cc39c4', '1d07fcbe2a6900c4e4ea4409ae2c5270', '568d6332f094e839329a4d66d30dc6d0', '6d9ed75aecea4cb4e6298bd638ff4afc', '0b0fd757eef935f9609cbfab1443840d', '8031c4b9f388c424ba2ed4da91f00173', 'a1c8fc922eac56a7bd275565defa9763', '07f44d71c453c52f9f055ab877b909cc', 'a71a739c154aa77fb2800a426818df5e', '998fe36c4f8131243b766398f5801c22', '5db6785789345d1638b1046fee3b4c76', 'fb85950dc8effebfa15c9555f238d7ec', '927893d2acecb4415e1f68f923dce782', '1cded34a5dfb786368208f632e5a5d6f', 'e180827dbd190aa6e9651be5d08f7230', 'e243fda5f1cb39ea6c810564c025638b', '37710e7c0449baedbea4236ef2306da1', 'eaba6c7538a5d6f7f993e1bed678db73', 'c062f1a1e198a4da90193c5e06c484c8', '57cd65ad7d3909ce9b8fe9d4e2fdf709', 'f571b343e99c24e012804a5b04a591f4', '97ca93ecb6d659c2af7e71a269ad021a', 'bec4c14ba02a32fad812ee94a98c1602', '12f506bf952aec9b3d8289a62ded85d6', 'c396496b481e2685a7733cfdbeaf925b', '932b64b6d7fa0605031eaa6f3c63eea5', 'b00b0b63b241943957b48e9a4f9b7c27', '085fd00ce06d27d19f40b330c7d8ba0d', 'a6a55291b82d26ed8852361a80575e89', '0f1a741eea4c4cbdba9d46dd3d762e2a', 'a6f2a358b6a2a33fd8c44830293b8323', '6c5bebf8e67ea0562455656f4132dca8', '27c8850bb21db8dbccc40ebadaf4daea', '393a3cf1a1ce2e6de5955e85605b2e61', '346ba06d2fc6ad303d2c25aa03c8f309', '5160a38cdbea74be5e569b767f025e01', '0a63c38ccede65ad5788d469185fc315', '940904b10cb465abe65aa7f7fafbe2a3', '22333feb1c9b1f02fa531f878c1970ae', '9348b0de17a27622d52d6c71fa12d419', '568982eb42078f033b3ea568329b6c0e', '56a842911136ec4628584bbb915f8f4b', 'f33db0c82c1d8924f035a0ea3b2e7ca9', 'b4fbc993f786bed4cec41b356919f2e3', '6cc5cb34c7f90bd870c027ece3a15fad', 'bcc2b44d3cd1f9d2b8458c595fb9b02d', '269c71c3a7cb517e2d21f20bad0714c6', '513b44a60a7bfd9a8c1a5a2c49ca185e', 'e05c7b866e6e8e7394ca5b1d02c20f58', 
'9652a10d1629156e0789bb630e081f9c', '8a07ce1347bfecea1903820b2b76e0be', '9e560864413aee1abddcde454e778e94', '62beeffb51a33d06b38e2da8d5f2d051', '1d6f6841cc1de15d208daa6aee88e2ad', '47c9781f39a732c3643f6c4e600e19cf', '8816ba131c25d99edd0cdd75c07859b1', '07bf09806e2103ac74dac0afeb33bf6d', 'f7a936a8f212e039b7fea6867ad7f010', '199525a09762d6e51c1c7f4ef55c8926', '6aa56327ff9ef7983d0c691aca0ef1ca', 'f29cdf5317bffce5648a21766b6b5f34', 'fa220df5ca6551e99016b9d5897b7915', 'e9213ea8f437aa9d320e0e1271c4a1e6', '51482d141bf1dc8a92bc294563992ef0', 'b829d2d9b0b73662c3bcfaf032bcb1e2', '55540a87c8cdfba5615d4079484901f3', '5b53081a25e4d66f353d2229a0564c65', '1f993001eef14eee827293e16d218ca6', 'b0288182991527078a0693bca89985b8', '6b599fe2bfc9db75aabab13175652d56', 'e8c7b7a4bf4390bf03ddcc5d4215dba2', '0c0856b3f01f7a0a6d9fbcbf4253e190', 'd45962a733099ddc701566f0d33556ef', '025be63b5d1ae501b268711832cc9665', 'ba71aa394247a7a27230a2be489ab420', '916e520d5f358a54ebcdbba0179d8843', '6095a61aaf59fb5bcdc88fe1ad8a78c2', '84b51054f1131067758bb1137d812b78', '1823baf623fff2394d985f4f6c0e6d74', '2b5c43f2fa8f0b133535cd3ee6c6c695', '7dcb8c84aee27fee41f9400c33ff1551', 'eae565378fff9055ed48235add42bc15', 'ea1ccd1847912ce2cc57d7901a0d81d7', '2922376d0667c1809e159264aea169f5', '0c0bcba0c40adafd29e2cf47ec4fb544', '85eca4e62686b28864e1f6e20ce8d683', '1a78051dee60dbe7d688a3127f12544e', '46b1520f1cbaa923ac8cf85feadc9cf4', '49655c66704cf856ca2dbbc77af22586', '7b107a90c7d45209b9b367a08feb7708', '4361de87fcb0cfd740d35212aa7c7211', '1e8f4d2acdb3d4a94ce1205e0326f9f6', '9f0e04bd2d29040421904442696311b5', '2bb484a925b6a25f1277b7008651cf92', 'db5bdd3d97d53f257ac6afb8fd78f462', '63aa5757186c1def977a118d31445ec0', '9864da59e74afac4d219fc65939826e5', '4ec499632cf69a6f75f630f09fefa1dd', '6fe5bea03a70ebb09564b74be2555d7b', '6011873dfea515990a882d38539e126d', '93667e0ed1442c25148ea330f86e97ba', '5c4a6d35244fd3ce149b1008535a7809', 'ccd1c08880fb15e70d4b1aac0063f1bf', 'ad408423128850292ffeb29b163cd78a', '55051865aa377bcf48acad6782479e97', '11664771aa54bb4138078077243e6c3c', '3cbedf02828b82d680371b91c7ec53fe', 'dfbd4c5642fcda3f5008590ba629c9e6', '6ae4cab1d0684afaea997a01a0939f71', 'f6af9cbc4f43f3a708e4cbf20e0315db', '8b03abc22d2051cb7740ac9f5795346f', '66d8156c3e21d105c697555f7952d5db', 'c95987f619409f89814af39a83d36e4f', 'b2a375e7fb7dd6883b7563f7dd915624', '4ff8af2020275408284cf75a8bbb94ec', 'b82841783bc6ea570cac350a031f462a', '35a9f67c975bd2a4d3b79c97df499293', 'a3a91683dc5cc0f98037c23b8d962acf', '3d800b30aef8a7da60395c92a0369ac7', 'b6ba5044eefaf39648286d57aee10803', '1f72bf63e2247500e0c6d674f5f1aab8', '4eed0c5d082e7207ab2083b26db70593', 'a69b1206b162095ea6bbb35fb64ef57e', '941af762cfe86db412d9b995b33277ee', '7e063e5a3613f993d9b7bcfb423bbda6', 'b6db70131e6355a13a741fd803d8ff95', '4db28b9c8602a7a17e9b9b80e5d090ce', '60cce41086fd0866c296ad09e4a9fb5e', 'a8bc849f5a64ea13fcfc139b152fe03d', '2df3055029fb0a7eaf3315d35853d756', 'df0cba25540dc2d66bafa4af3144190c', 'c586059099b151b8fa61c7483717cbc9', '225018e18cbf57b20a80c746e7861594', '9c1a4c9d0b5a9d47e4e053eb2d420407', 'f712c1df0637ed68819cdecf4617716d', 'b0779a8ab51a8f8f8bff1d2f35d5a6e2', '1cd6de23155f859461caf9c0458f8f87', 'e881b7ab3d0a47237fb5b1c1fc23376d', 'd54a17301e302785439f9dde6f3fc208', '9d4519587afe7aafb8f21dfc1b0d0996', 'b86f86f863585b3356d2304b7e3c62bc', '1a78fb79fa2a8982c87f21b8bc533615', 'df5b0524ad802d413159260557db958c', 'ee59574e746db4c025602b8605420659', 'd9bba8a3889fa3dac1b8e41e0146563b', '2fff98ca52fd19ef92a48c52029606e1', 'e5f2ca84b9f42da0c5a9fd90164ccad9', 
'6152b60d67077a228acebe412e1ee849', 'ee2a77ce843d24362cabb427c958871e', 'cce910d6457d84f18b53b37624e9be86', '62879b753d9ceb7931a02255fd59738d', '752560c49d508b3109ccede8c466d331', '2e705213cc37d836360a3eaca1137239', '8b524012052a7f83dcbd6747d17057c4', '5ab6090e8225c3787e56132e2a607d68', 'da4dd2ba6768317321515fcc645867cf', 'b53228aa58bd0a7218eb65b470870fc5', 'e6c4a523e241b25f3ddbc21333d93183', 'ac5e9a8968f532cccd40dfa75105d883', '3917656e05e2824cf5c7d49807daa559', 'afd9c8e6cf5e853598a4b093779381e3', 'ddf2dd4a5343a76ae274b951a573bc20', '1638139e889ba19e05813f7c63a62313', '5efc169c7d3e7f7319189a86f88d18e8', 'd8825e8ec4411927082d088ef0008d54', 'bc48d54e378c819bf0ed83f614d2eead', 'dbe28073723a939c538e46d36ac858ca', '3610e843650a8876095bc5510280dcf1', 'ae84f2816f15df41129e2c3baaf78063', '63e890c53312ed84881061fa597e1173', '398aa804f95674fe47027a3ace7b8a1b', '5b212fad64feba45b1aac1e00e5ad5d4', 'd039fdd87b3227982f2b8ce698849d47', 'de630b5890582133d146be37b8f7d034', '3f9be74ba902086262dce25092726c8d', 'a57a45a928db8a107d431931d1b63081', 'ce608bb27a6e97f47e81e6b285621d82', 'c2d1d17f441fe69a98dee9e647dc1d11', 'a02b340d7c63212f2a77f7f98fa53637', '2486853af95400d8af5f9eaad8264d30', 'a22cc150e903605e28fc5770adfbda9f', 'f8b7020ca42524815eb92b9e4b4cdf4c', '728b7cd0d354f32d64d47b364b35d813', '26a1457116407d0ff002d2aea64e0703', 'cb8868533691a6f0793cddc3576390a6', 'f816baec07810f53f1aa0bddad3993ea', '807ff24320ec7286f132a001f49aa297', 'cb42d83f38d352f4fe9caabf460722d0', '9c42c41c65c68ff4d10a5bf85cc586a3', '5b3607941af2d1290240044796d36a5e', '4f4418d5d81f8fdee4d8093a7bf92980', '5025836c4777bf0b7f56fd9f727ab568', 'bbcc8eb3b942fc2547e43e15d3b2d66d', '203726a51ed642b922e82a97f7c68651', '4e96ddfa5985f50356a0264498a42444', '59b9c3f86716b3f59175f1b839e324be', 'e27841a82df6187263b88bac10c5793c', '5a36cf95bf04a713652359ff744518c4', 'a4452247cf772a8f493ed45be2146279', '4ae391cbf0fbb6f54d4cdb79e40b3014', 'f474f0d3f6886b50246f2a74969d3cb9', '88f6107830f5e3d8413a025248e1e1a4', '6b24fa9df1b373a9d71192a8758e85d2', 'bbab47202e290663b1159185a7fbf016', '4f84706a5b704fb07a371f877021fe70', 'edd0fc3a25e58af6c5a94cffc5b53512', 'a7d6f138c779a0cf4917bd802663fe1c', '39104ba8977eb63a7cda6d6b13de2f96', '64bd5389adeb465253b181d838e5b68d', '7b748d57587bebad3762a865e45cf8e1', '69e7e1191bd2466b30f43c7d4c38d665', '1b82531c05d3135a5d1c7ff780454ffb', 'f9cba099a2686b06d37d17ef17239f67', '5586215272cda7ca20de7960ba716ac3', 'd7e7f51b5c05eebc51be0667d6786bc9', '0adf2d326855f05d198dc27e32f83e6b', '9cd29923f70f5ee2a346c2f85b6c0c70', 'c8ef92bff62daa7573e81decb7eeed7d', '1fc7365a9797e6012b4cf394a5f14627', '5aa0c0bf9466f8295f6e9eae1b2bbc3c', '5ff4f20594b9a376ba23cb3931cb3751', 'cfa5d4ad00c3f173563848c3abbf9f0b', '8b1f55cf26ef28727d3e98446c014dc1', '019d46639c173a9a95d4516cf4f0e46d', 'b98dae8f5c65cff8179d5dd2406648e6', '374cebaf127949e6f7ed7839c3656fa9', '0959dffe01941688e8408f5d4a7b0384', '07da3d9c550c65237902e2f2ae292914', '609a8cd3a608dd5c64b6823eefa5663f', '82d39c8eacd784def6cf14c8f849a679', '70582abf441e432c99cb13b40e1c4665', '37de9bb8ff75c3afdf94b71b9a48d4a1', '7b157fe80fb7aa84783db5d5ca0c1589', '00906c9fd0920b8ae1cecc38bf92d29b', '27c7c6af5658a77c7066c4832225577c', 'f755871faee0d36e43adcbc8324be2ef', '812eedc1f31d984ba60b2e443045cd77', '9bb137ff6aed857c6756063fd415518e', 'ab0ee37a6cd76cbaef1e38cbec044c88', '30820e2aa5a6d375c933e7185d9b6899', '3bb9b4d9bb761a655ad434180d384017', '8737364460c311e9489f2efecb858b0d', 'bd50fa4aad7b18da1e0376b1148dd66b', 'cbccbb39823fad6e408c746fec35968e', 'dec302dcb6372b436b70a37f2b92a2c8', 
'2598956a9f17ff8f12aecd8091b20d65', '95a06a96871131c62c1ae37356fef1ec', '0731f13bab833224c2fec1f4281ae2f2', '259542ca8b02d16c6df1489a8b2e8d4f', '9c875e2d5bbc5976db7deb1c5ee79fa0', '58e490450309513c0b78302827014dfd', '3c2c076905a292801eedb7550959b5db', 'a2506e16e44f432807654a94f913cef9', 'fca93e9a8540078571a1f16c0986d14a', 'c1bb5a2deb487a453c394c526ad1acaf', '9fbc30cde5a73c9a4c9ae2ec1875b1f6', 'a51cfc65bfe9f541bcc901decca81b9f', '6b16de115c6d63bf08c82931d50ad05d', 'a486e66cc48a344bc3ad9bcfdf4cfc37', '94b95dd371e0e8e70fe3745067567e66', '6c9c93cface745b241432ce73c76bfb8', '750ebc1df0359555bd5a10313880fb3b', '41b7b5c2056aa9688b8d2c220ebb2b2f', '35cdeb7d3d5b011b816eff9f99dc223d', 'b57075c4d487a10f6212b1cd470f1cdc', '89ee072332c37d47ad58d1ac7865c92e', '248a5c9213b4e959291792f0bd7241a8', '951b8b2ed48aec6ad14a335cb2e63d91', 'd4fcfa0666a39af97969a29b0ca002d1', 'b6594604feb3757adea8657e9b0fcad0', '061f7bd9fe6ae22b8b07f16fa3a3cde4', '18d5df1fe3fbfdd3bf1ffaacef35f3a1', '479ab4e179ebab72a9d2963afe8018ac', '5281c51e3be66ddc921277bb1b9657a4', 'df73e8ab1e7805c4497357162c38d6fa', 'b5fe04d4ea147e3e9de780f0a88fd71d', '32553a298851afcdb65adba991384ea1', '1c4b63d48bfb2eb6772e551aff2b287b', '3cfce1dc697fc9775f7b80beb0fc4d23', 'e90dff2cb72d52bba7128d2e90e79eb1', '932fd4c716a5226b78b80f208147021c', 'ba41403971e54618c894007859384a42', '108e66d5039e1bacf6e69141f23f7a67', '8d472c1214f5c9224c74f2917f7c80c9', 'abd6f49246d4181cab90bf2da64da308', '5ceceb62de9d399e702c3006c5e3f1be', 'dd838c540ca88782f3fba00e934f1af0', '245456396bf0b2fbc271039160f22de2', 'bc2f1dd0b53c59269ce1bfe9a0b84a69', 'e8569569549580e62aaf75b8f4a759d5', 'af772af5e47050b906b8dc5f71efa76e', '3d20882750f46c89d64300fcb45424ae', '39be757346ecd48bce9e3b9ce4b2bb0c', '81ade189c6fda6a6d405a240f37989a7', 'c41c73d21bfea0ade03346b169864391', 'c0a55a31e49e98a9d6664e05b8702a5b', '47520892700b208136b0365136789078', '5d3a6e952fb8cdde9b918b34bf626e05', '3e4db1b0ea7a8d6499c71e2851537f77', '1a2e68f56d7dae8e4369b55d3c82ecef', 'bc4908c241186bf2397a60837fd601b8', '64babcb1fbda40238397712e62bd418f', '56b56e26b932fab51fa35f2946788353', 'a053819838a1dc21b8f1f2761a476029', 'b4f5a15df471762b3d3f15e10ada7171', 'd629ca20350bf128696d0086e8ba1762', 'fc2d0d092e23c52a0a219e26554eb40e', '8518868ded7e9a76d1e21914656225c3', 'f7d55a23b8f1ad646d77ac69526abb22', '767f3c58c6fddb6fee0c2b03d61499e0', '64910b54509bb37879cb61a133434b15', '0d7c0dedd986f0062147701f65253c89', '0d0a7d1e8836bc96e0ddd6c072535c75', 'f573ab803966a22b6e2060e86baf9803', '7258c9222dec4148e4e9bde4b4f6afee', '30d269e7cbc4ba4586e7f8c9b84ad5c5', 'b237dc219aedbe90928a29c700d08d60', 'a168bc2802e792320cfab8ed3ed5f5aa', 'd34f25ea05cf09e53a6eeb621249eaf9', '82c63353ee1e8aa779afa3e6c7c2aa2e', '09ae7cead00b87c87234745a7155faed', 'f37ff2114d5a0078c0d782a4e9cf2646', '719101222a537d7214776067541b7dc8', '8d583b2ee0dc364257f7cb5e3e9571ba', '05cfaef20dd799bf7b135f8486c56922', 'c431244d360e3d09a87f24c87ee9a9d5', 'b119a39d90795c2d8c58cc525740b2de', 'c49d9157b44086dad352d2866880eff7', '975b8049ed717e42161e2ea4b3ebfc1c', '88cbb6e64e123c3da8f5eb64908e041e', 'c89ee7997fcecf5103063d18412a2159', '2402d0b852f2719838715a5483b67f90', '6972ad489459106c6a38b4ceacb1fdda', 'a583b5daaf6a730d836371b2dae4df22', '6ead8a81311508ffab0ac0128482c3b9', '904e9ee3a6001249cfeb13247082c067', '0b7f097bfc132334cdbb3bdcf078c82c', 'd8b4faffdb0a557a6bea5ae0eea4053f', 'bad4561e5cfdc3eb113144c08b3253a1', '7fd2e5f6832bcf6975b247937d175718', '171703ad36d950b4a9fa1de65627d3cb', '4d32efcbb63b326f72c6a055a2505625', '653fd442746b1d58061c58f1131ca39b', 
'f48a364c29a0a0353284e4a6b6471471', '2cd2e01cfa43e6f8e6ff9b72fa5bd931', '06c9aa1712329ae572986b32b0fe0343', '1b7d61ec7377d0608370f25469477f62', '41f4e509d5096168cc118de4388fcbf5', 'cd915974bc6420f03e825049c84ff7f7', 'a360bcce8248c707faa9a3a8abdda997', '2157d4fbab685a25dcce9534d03c6d41', '9f8a677c3c6fa1509caa703ef7916c35', '28aa921921dae3d9dcc83516a340420c', '0b8c1356537d121dce47ff385ce91450', '2ba48182d48f83af1b96a6ac34d80dbb', '9f220c00fd0003c0b51b08dbaf5d2f2d', '17465d16e0c6a6addf57547fee84bb1a', '3ba711f77a46f743aa6deafce77f8576', '142a8c532b54289fea4a7f1687998772', '575ec950ce1ca0684b1add581cb0c995', '94aba9c9160c90cdf98f0af4fb203ce4', '9ed041c2bdcf38442c449b9e1b18535c', 'e7a108a395663e07775218a03583ed81', '94add8111457f05850e8bd5d4cc3a7a3', '5054d386528dc4ae36e845a1cc3cdae0', '666193de419791b49959af7932f5c8c2', 'eb6d9abf15d5220f992a9ac27a5001fe', 'f46cc14603a04178db39532dc15c176b', 'c3813cd5a191ef574003db15d615cbb8', 'e3fea6941cf58fa9b4e898d8955649dd', 'ce08c1226d445a18b7ce2ebc80a7096e', '8a8dfcf2c4cd350651bb8734955765aa', '9634bea378870bf33a7e347fdabf2456', '39b5f0e9e1a1f50e97c7a9789d5e2af0', 'cdac30e8a60ac5f771c987518dca38aa', '6cc2d361537457056399f96e437316ba', 'adfe05f999905e8b3341d3a8ba3d63ad', '37b0829936bcd9934a15eed60fa8532d', '496e10c729a3437c0654d4b5d4c09d82', 'd9c96f6ee613abc7d7949c3377681413', 'f0cb10da92bc2083159e443319216a37', '15045d3e23344df09a1d8c005bca41d5', '15d02f99130c0283a580bed0bed78275', '2d15324eba0e70ef4a1451d5321e980e', 'c7d3b831381fb5e75f144433358493b6', '9d4d42de8c64f7c79fd82fb3bb8960c4', '4bfe10874285d8d68486b8c991462ac8', '200c064f3fbf08ddcbbfb2f36118b066', '8eecc61b4ccca9910892a9c267d35d07', '41c698f0fefac0e702a03bbe38c755ee', '9221ec3ed5a0cac9cc211b1f76fc2642', '835f40e6144719e73f26d260b8ad6c80', '463d9a27874682c6001c748196070b22', 'a236d75f2c03e16fbbac2825d42eb914', '83ad78f86297babe27c2ea9cb47b4675', '421f72e8a492f524b696dce4658879e2', 'ebff992242dcd49d4004414a94581574', 'd6b7c0a83da2af88d584dad256dd08e0', '6a559666d4f4561014c97c2177068f78', '67d0e885fbbd04b3675512bb31d1a0e0', 'dd8ecfa0c5e14fc9dfa9903f3501a07b', '590ca279d64e0fc64a5a1f87d91c6bfb', 'fa457b5f7729ecaad819d201ffc49784', 'cc36fbe3a57fa2b04738ab2a7b0e5e12', '864e494bca15d43d2a036610712c0180', 'f8509fc848701a8f2ee6c0c0a59936b8', '903a785a05d396bb9f3b3b1db1e59d7e', '87d48c0f33a9213c9c252535a399ecf4', '3deb452f47e2dba10f8baea299721ca9', '12d512a24a5210ceb6074e7bdaebc227', 'ae97d323473045ec06cfc1fd165808ce', '88199cab3584d87a46150bda69379a9c', '09be0cb04e6fd9080bc7ccf8366373ee', '2d81d266eaf9148fdf177891eb51d708', 'b3d89252b2dcdf7ab1b6ea69e05b4e7b', '4e642b8d7055956511fd63978aec8d0d', '3b3b42273065160a4cbf88d5507a0dc0', 'b5073b22539ea0a59136f8b540c95b02', 'bd723b441e4de6d16bf3ef78a0ed10ad', 'a65ad70ea91d2a1af991e304fadd62a4', '6f451320b2bafe1e730a8b6e2bd833a5', '91dfef441587afeac249429a4dd44cf8', '38a8dcbd0ebfa6961a368f4fc17145ab', '8fc484b27aa36a70fd67edefd9e2477f', '6b4b817b770b8ccb189a2597c694506b', 'f45a2ec2f61ca00a1e099f97f316d79b', 'f53603d80e75c2e3be16c3feb36a539d', '531d0a1f09348b04f5dc54bf84583cc1', '879bd82b192d004e0fb657469c76fc51', '2b50e856f51760e84318e7092fdeb0d8', '2e8f9fc2dcdd0dd7752ad2d268566b53', '80cd82b51fcebfcd90ee3af11adc09e1', 'd18e62ff9a47263dd6ab7c3087879692', 'e6c1afcc7bc65980b80bc5e921bfca06', '806fbd7dd888ce61828fe4395b6f4e75', '1bc94726d834ef7b9535d51e78bc8c55', '09598adef2692a5ecc76bf5e258c2ec1', 'b629be66e8e45a28ed259cfe274af605', '28bb026e961ba92d5f792df7039555ae', '7e28b3a432db2f50621c6fe42f2f36ba', '0d828e3d92683f430d43c51c9fb5b54d', 
'84536d2896d31e74666b6aba27aae014', '3da255cae775b060a22dc63fda61fefa', 'eec32b35fca1c7403c1fea55d39afda5', 'f44a80af3c1bd429a946f4a274113e6b', 'e64d2d6c8000833d9b5883073485ff03', '699193b16271dbb8b49f218310243030', '1792cda8cff53c86561827b682b91bed', '5f862b1fa2ca69490d9bbac6ceae0337', '57bd90ae2a361f1382fc725fef0a0921', 'd382ebbd528354123e59a86233c32b06', 'd6d2c4d555de208ebbbe6c82b0aafd17', '8fd7fabebed71969fbe86d7364c8861d', '1be229c4bbd1962054419c0afef9eee4', '8834cc11931a2e49c5e0d54e9da9aad9', '743729efb199a6558d0899195d8ca740', 'f7bec41b29d58c70966f97ea9b52bb3e', '0190b8458cfbd28683eb07811a7a7ec5', 'fd816aea82671f71e3eb67e977af00a0', '6f4f1114bd7840ed5e5d756ef8c6432b', 'e9b5c43b5fc7642377401d25a9bedd2d', 'a35a63e506a0b73d125ff0080bdd588c', '1b84156caef23d63398ece07abe5929e', 'de14cd74a8e95268187653ec969312c0', '4b3e0bfd44175b0cae7c36b9e8e519e3', '104a4d3bd3aeeac50d2cc2fb4493bce4', 'cc1d54da1222b9a544b08ff4f645f3fd', 'c80255158cfaef8697d6c6bfc2362341', 'c87009f5755a9b6aa356a1935a57e5a9', '36eeb567a2ea72c1b54cdd9d46a6647b', 'c5ab8758f3c8796ab006d65be6fc25f8', '319f3b8aadfb71e710e070af1bb1047c', '02c93ce8c413e2e8e77975123aecec49', '91ee38d3951b17dfc4c7bca94b6d6dd8', '2e9de09614d9065d6dfc2ae16d4268c1', '57bba769688fb4372296000214954f56', 'fbc81ab85f91aa162fedb88547abc2ce', '05bdc47cee2f106023cbd8617b0997c4', 'a88fd0271a539c09d0c906153736e111', '466e95e1a82f48dd1fdc2a119f8df9fc', 'e580bba0a66774889eb04ca5ca6f8e37', 'd0668af748d1fd0ae7347eb90a5d4fcf', 'be84c54af594465edf6dede1977dee38', 'b582dff2a2b91f406b14ff64e032c72e', '29705fbaf540d6f6b2f8c67313b3070c', 'e3160345920372b267101bdae099bc20', '0a0620ab140b578867661ce0f0abf856', '0bfbf291188a05a0967b5ec02f574357', '98c5511cd321f13f7c1953986475c8f0', '51f0ada675f84142440bc7e56caa57ba', 'cea5cc20e8b7e2c2de0deaa3fb32a371', '7d7212ceeaa7cb2795038bc79da98159', '7d3db7859099303167c8396310a79fe8', 'e37bf85f107b2ab0fe65cc84a7ae103e', '70eb3bc85ff6e3914131f5a6cd81834c', 'b7ee2b49525d85788b253a1508efd4f7']\n", 246 | "Test ids (511 samples) = ['71d7a66777628e3f9b8b85270366166c', '9032f491fd4ef429d9c18b3b015bd11b', '776e27836f8fbe84f60a850952498988', '8d4086520df91f54eec6a4accf796569', 'c0a8f1d99cd3b7ee60c21d1e9b98125c', 'd27c13871d62e88714fdaf539e50382e', '516686bcd8331a26428556e346c377c5', 'a8f2bbaf928d9834c8257188ca2a0780', '98603c2f4ff85046a071701c2659295d', '867833a9097a0838ad3a0a9df0f7a4a5', 'ecf80f40365366e561ce1b900a43c8f8', '761000dc2471950ac51f58d60694e78a', '38959f4d0d0d96f0f567b4e8c3e24809', '0f4bd20ef527fa6d503f0747f91247e8', 'c6c616ea37c4f4e9f753a1a26c1f513f', '1dd1899fc4c0293f6bbb95af80cc39c4', '1d07fcbe2a6900c4e4ea4409ae2c5270', '568d6332f094e839329a4d66d30dc6d0', '6d9ed75aecea4cb4e6298bd638ff4afc', '0b0fd757eef935f9609cbfab1443840d', '8031c4b9f388c424ba2ed4da91f00173', 'a1c8fc922eac56a7bd275565defa9763', '07f44d71c453c52f9f055ab877b909cc', 'a71a739c154aa77fb2800a426818df5e', '998fe36c4f8131243b766398f5801c22', '5db6785789345d1638b1046fee3b4c76', 'fb85950dc8effebfa15c9555f238d7ec', '927893d2acecb4415e1f68f923dce782', '1cded34a5dfb786368208f632e5a5d6f', 'e180827dbd190aa6e9651be5d08f7230', 'e243fda5f1cb39ea6c810564c025638b', '37710e7c0449baedbea4236ef2306da1', 'eaba6c7538a5d6f7f993e1bed678db73', 'c062f1a1e198a4da90193c5e06c484c8', '57cd65ad7d3909ce9b8fe9d4e2fdf709', 'f571b343e99c24e012804a5b04a591f4', '97ca93ecb6d659c2af7e71a269ad021a', 'bec4c14ba02a32fad812ee94a98c1602', '12f506bf952aec9b3d8289a62ded85d6', 'c396496b481e2685a7733cfdbeaf925b', '932b64b6d7fa0605031eaa6f3c63eea5', 'b00b0b63b241943957b48e9a4f9b7c27', 
'085fd00ce06d27d19f40b330c7d8ba0d', 'a6a55291b82d26ed8852361a80575e89', '0f1a741eea4c4cbdba9d46dd3d762e2a', 'a6f2a358b6a2a33fd8c44830293b8323', '6c5bebf8e67ea0562455656f4132dca8', '27c8850bb21db8dbccc40ebadaf4daea', '393a3cf1a1ce2e6de5955e85605b2e61', '346ba06d2fc6ad303d2c25aa03c8f309', '5160a38cdbea74be5e569b767f025e01', '0a63c38ccede65ad5788d469185fc315', '940904b10cb465abe65aa7f7fafbe2a3', '22333feb1c9b1f02fa531f878c1970ae', '9348b0de17a27622d52d6c71fa12d419', '568982eb42078f033b3ea568329b6c0e', '56a842911136ec4628584bbb915f8f4b', 'f33db0c82c1d8924f035a0ea3b2e7ca9', 'b4fbc993f786bed4cec41b356919f2e3', '6cc5cb34c7f90bd870c027ece3a15fad', 'bcc2b44d3cd1f9d2b8458c595fb9b02d', '269c71c3a7cb517e2d21f20bad0714c6', '513b44a60a7bfd9a8c1a5a2c49ca185e', 'e05c7b866e6e8e7394ca5b1d02c20f58', '9652a10d1629156e0789bb630e081f9c', '8a07ce1347bfecea1903820b2b76e0be', '9e560864413aee1abddcde454e778e94', '62beeffb51a33d06b38e2da8d5f2d051', '1d6f6841cc1de15d208daa6aee88e2ad', '47c9781f39a732c3643f6c4e600e19cf', '8816ba131c25d99edd0cdd75c07859b1', '07bf09806e2103ac74dac0afeb33bf6d', 'f7a936a8f212e039b7fea6867ad7f010', '199525a09762d6e51c1c7f4ef55c8926', '6aa56327ff9ef7983d0c691aca0ef1ca', 'f29cdf5317bffce5648a21766b6b5f34', 'fa220df5ca6551e99016b9d5897b7915', 'e9213ea8f437aa9d320e0e1271c4a1e6', '51482d141bf1dc8a92bc294563992ef0', 'b829d2d9b0b73662c3bcfaf032bcb1e2', '55540a87c8cdfba5615d4079484901f3', '5b53081a25e4d66f353d2229a0564c65', '1f993001eef14eee827293e16d218ca6', 'b0288182991527078a0693bca89985b8', '6b599fe2bfc9db75aabab13175652d56', 'e8c7b7a4bf4390bf03ddcc5d4215dba2', '0c0856b3f01f7a0a6d9fbcbf4253e190', 'd45962a733099ddc701566f0d33556ef', '025be63b5d1ae501b268711832cc9665', 'ba71aa394247a7a27230a2be489ab420', '916e520d5f358a54ebcdbba0179d8843', '6095a61aaf59fb5bcdc88fe1ad8a78c2', '84b51054f1131067758bb1137d812b78', '1823baf623fff2394d985f4f6c0e6d74', '2b5c43f2fa8f0b133535cd3ee6c6c695', '7dcb8c84aee27fee41f9400c33ff1551', 'eae565378fff9055ed48235add42bc15', 'ea1ccd1847912ce2cc57d7901a0d81d7', '2922376d0667c1809e159264aea169f5', '0c0bcba0c40adafd29e2cf47ec4fb544', '85eca4e62686b28864e1f6e20ce8d683', '1a78051dee60dbe7d688a3127f12544e', '46b1520f1cbaa923ac8cf85feadc9cf4', '49655c66704cf856ca2dbbc77af22586', '7b107a90c7d45209b9b367a08feb7708', '4361de87fcb0cfd740d35212aa7c7211', '1e8f4d2acdb3d4a94ce1205e0326f9f6', '9f0e04bd2d29040421904442696311b5', '2bb484a925b6a25f1277b7008651cf92', 'db5bdd3d97d53f257ac6afb8fd78f462', '63aa5757186c1def977a118d31445ec0', '9864da59e74afac4d219fc65939826e5', '4ec499632cf69a6f75f630f09fefa1dd', '6fe5bea03a70ebb09564b74be2555d7b', '6011873dfea515990a882d38539e126d', '93667e0ed1442c25148ea330f86e97ba', '5c4a6d35244fd3ce149b1008535a7809', 'ccd1c08880fb15e70d4b1aac0063f1bf', 'ad408423128850292ffeb29b163cd78a', '55051865aa377bcf48acad6782479e97', '11664771aa54bb4138078077243e6c3c', '3cbedf02828b82d680371b91c7ec53fe', 'dfbd4c5642fcda3f5008590ba629c9e6', '6ae4cab1d0684afaea997a01a0939f71', 'f6af9cbc4f43f3a708e4cbf20e0315db', '8b03abc22d2051cb7740ac9f5795346f', '66d8156c3e21d105c697555f7952d5db', 'c95987f619409f89814af39a83d36e4f', 'b2a375e7fb7dd6883b7563f7dd915624', '4ff8af2020275408284cf75a8bbb94ec', 'b82841783bc6ea570cac350a031f462a', '35a9f67c975bd2a4d3b79c97df499293', 'a3a91683dc5cc0f98037c23b8d962acf', '3d800b30aef8a7da60395c92a0369ac7', 'b6ba5044eefaf39648286d57aee10803', '1f72bf63e2247500e0c6d674f5f1aab8', '4eed0c5d082e7207ab2083b26db70593', 'a69b1206b162095ea6bbb35fb64ef57e', '941af762cfe86db412d9b995b33277ee', '7e063e5a3613f993d9b7bcfb423bbda6', 
'b6db70131e6355a13a741fd803d8ff95', '4db28b9c8602a7a17e9b9b80e5d090ce', '60cce41086fd0866c296ad09e4a9fb5e', 'a8bc849f5a64ea13fcfc139b152fe03d', '2df3055029fb0a7eaf3315d35853d756', 'df0cba25540dc2d66bafa4af3144190c', 'c586059099b151b8fa61c7483717cbc9', '225018e18cbf57b20a80c746e7861594', '9c1a4c9d0b5a9d47e4e053eb2d420407', 'f712c1df0637ed68819cdecf4617716d', 'b0779a8ab51a8f8f8bff1d2f35d5a6e2', '1cd6de23155f859461caf9c0458f8f87', 'e881b7ab3d0a47237fb5b1c1fc23376d', 'd54a17301e302785439f9dde6f3fc208', '9d4519587afe7aafb8f21dfc1b0d0996', 'b86f86f863585b3356d2304b7e3c62bc', '1a78fb79fa2a8982c87f21b8bc533615', 'df5b0524ad802d413159260557db958c', 'ee59574e746db4c025602b8605420659', 'd9bba8a3889fa3dac1b8e41e0146563b', '2fff98ca52fd19ef92a48c52029606e1', 'e5f2ca84b9f42da0c5a9fd90164ccad9', '6152b60d67077a228acebe412e1ee849', 'ee2a77ce843d24362cabb427c958871e', 'cce910d6457d84f18b53b37624e9be86', '62879b753d9ceb7931a02255fd59738d', '752560c49d508b3109ccede8c466d331', '2e705213cc37d836360a3eaca1137239', '8b524012052a7f83dcbd6747d17057c4', '5ab6090e8225c3787e56132e2a607d68', 'da4dd2ba6768317321515fcc645867cf', 'b53228aa58bd0a7218eb65b470870fc5', 'e6c4a523e241b25f3ddbc21333d93183', 'ac5e9a8968f532cccd40dfa75105d883', '3917656e05e2824cf5c7d49807daa559', 'afd9c8e6cf5e853598a4b093779381e3', 'ddf2dd4a5343a76ae274b951a573bc20', '1638139e889ba19e05813f7c63a62313', '5efc169c7d3e7f7319189a86f88d18e8', 'd8825e8ec4411927082d088ef0008d54', 'bc48d54e378c819bf0ed83f614d2eead', 'dbe28073723a939c538e46d36ac858ca', '3610e843650a8876095bc5510280dcf1', 'ae84f2816f15df41129e2c3baaf78063', '63e890c53312ed84881061fa597e1173', '398aa804f95674fe47027a3ace7b8a1b', '5b212fad64feba45b1aac1e00e5ad5d4', 'd039fdd87b3227982f2b8ce698849d47', 'de630b5890582133d146be37b8f7d034', '3f9be74ba902086262dce25092726c8d', 'a57a45a928db8a107d431931d1b63081', 'ce608bb27a6e97f47e81e6b285621d82', 'c2d1d17f441fe69a98dee9e647dc1d11', 'a02b340d7c63212f2a77f7f98fa53637', '2486853af95400d8af5f9eaad8264d30', 'a22cc150e903605e28fc5770adfbda9f', 'f8b7020ca42524815eb92b9e4b4cdf4c', '728b7cd0d354f32d64d47b364b35d813', '26a1457116407d0ff002d2aea64e0703', 'cb8868533691a6f0793cddc3576390a6', 'f816baec07810f53f1aa0bddad3993ea', '807ff24320ec7286f132a001f49aa297', 'cb42d83f38d352f4fe9caabf460722d0', '9c42c41c65c68ff4d10a5bf85cc586a3', '5b3607941af2d1290240044796d36a5e', '4f4418d5d81f8fdee4d8093a7bf92980', '5025836c4777bf0b7f56fd9f727ab568', 'bbcc8eb3b942fc2547e43e15d3b2d66d', '203726a51ed642b922e82a97f7c68651', '4e96ddfa5985f50356a0264498a42444', '59b9c3f86716b3f59175f1b839e324be', 'e27841a82df6187263b88bac10c5793c', '5a36cf95bf04a713652359ff744518c4', 'a4452247cf772a8f493ed45be2146279', '4ae391cbf0fbb6f54d4cdb79e40b3014', 'f474f0d3f6886b50246f2a74969d3cb9', '88f6107830f5e3d8413a025248e1e1a4', '6b24fa9df1b373a9d71192a8758e85d2', 'bbab47202e290663b1159185a7fbf016', '4f84706a5b704fb07a371f877021fe70', 'edd0fc3a25e58af6c5a94cffc5b53512', 'a7d6f138c779a0cf4917bd802663fe1c', '39104ba8977eb63a7cda6d6b13de2f96', '64bd5389adeb465253b181d838e5b68d', '7b748d57587bebad3762a865e45cf8e1', '69e7e1191bd2466b30f43c7d4c38d665', '1b82531c05d3135a5d1c7ff780454ffb', 'f9cba099a2686b06d37d17ef17239f67', '5586215272cda7ca20de7960ba716ac3', 'd7e7f51b5c05eebc51be0667d6786bc9', '0adf2d326855f05d198dc27e32f83e6b', '9cd29923f70f5ee2a346c2f85b6c0c70', 'c8ef92bff62daa7573e81decb7eeed7d', '1fc7365a9797e6012b4cf394a5f14627', '5aa0c0bf9466f8295f6e9eae1b2bbc3c', '5ff4f20594b9a376ba23cb3931cb3751', 'cfa5d4ad00c3f173563848c3abbf9f0b', '8b1f55cf26ef28727d3e98446c014dc1', 
'019d46639c173a9a95d4516cf4f0e46d', 'b98dae8f5c65cff8179d5dd2406648e6', '374cebaf127949e6f7ed7839c3656fa9', '0959dffe01941688e8408f5d4a7b0384', '07da3d9c550c65237902e2f2ae292914', '609a8cd3a608dd5c64b6823eefa5663f', '82d39c8eacd784def6cf14c8f849a679', '70582abf441e432c99cb13b40e1c4665', '37de9bb8ff75c3afdf94b71b9a48d4a1', '7b157fe80fb7aa84783db5d5ca0c1589', '00906c9fd0920b8ae1cecc38bf92d29b', '27c7c6af5658a77c7066c4832225577c', 'f755871faee0d36e43adcbc8324be2ef', '812eedc1f31d984ba60b2e443045cd77', '9bb137ff6aed857c6756063fd415518e', 'ab0ee37a6cd76cbaef1e38cbec044c88', '30820e2aa5a6d375c933e7185d9b6899', '3bb9b4d9bb761a655ad434180d384017', '8737364460c311e9489f2efecb858b0d', 'bd50fa4aad7b18da1e0376b1148dd66b', 'cbccbb39823fad6e408c746fec35968e', 'dec302dcb6372b436b70a37f2b92a2c8', '2598956a9f17ff8f12aecd8091b20d65', '95a06a96871131c62c1ae37356fef1ec', '0731f13bab833224c2fec1f4281ae2f2', '259542ca8b02d16c6df1489a8b2e8d4f', '9c875e2d5bbc5976db7deb1c5ee79fa0', '58e490450309513c0b78302827014dfd', '3c2c076905a292801eedb7550959b5db', 'a2506e16e44f432807654a94f913cef9', 'fca93e9a8540078571a1f16c0986d14a', 'c1bb5a2deb487a453c394c526ad1acaf', '9fbc30cde5a73c9a4c9ae2ec1875b1f6', 'a51cfc65bfe9f541bcc901decca81b9f', '6b16de115c6d63bf08c82931d50ad05d', 'a486e66cc48a344bc3ad9bcfdf4cfc37', '94b95dd371e0e8e70fe3745067567e66', '6c9c93cface745b241432ce73c76bfb8', '750ebc1df0359555bd5a10313880fb3b', '41b7b5c2056aa9688b8d2c220ebb2b2f', '35cdeb7d3d5b011b816eff9f99dc223d', 'b57075c4d487a10f6212b1cd470f1cdc', '89ee072332c37d47ad58d1ac7865c92e', '248a5c9213b4e959291792f0bd7241a8', '951b8b2ed48aec6ad14a335cb2e63d91', 'd4fcfa0666a39af97969a29b0ca002d1', 'b6594604feb3757adea8657e9b0fcad0', '061f7bd9fe6ae22b8b07f16fa3a3cde4', '18d5df1fe3fbfdd3bf1ffaacef35f3a1', '479ab4e179ebab72a9d2963afe8018ac', '5281c51e3be66ddc921277bb1b9657a4', 'df73e8ab1e7805c4497357162c38d6fa', 'b5fe04d4ea147e3e9de780f0a88fd71d', '32553a298851afcdb65adba991384ea1', '1c4b63d48bfb2eb6772e551aff2b287b', '3cfce1dc697fc9775f7b80beb0fc4d23', 'e90dff2cb72d52bba7128d2e90e79eb1', '932fd4c716a5226b78b80f208147021c', 'ba41403971e54618c894007859384a42', '108e66d5039e1bacf6e69141f23f7a67', '8d472c1214f5c9224c74f2917f7c80c9', 'abd6f49246d4181cab90bf2da64da308', '5ceceb62de9d399e702c3006c5e3f1be', 'dd838c540ca88782f3fba00e934f1af0', '245456396bf0b2fbc271039160f22de2', 'bc2f1dd0b53c59269ce1bfe9a0b84a69', 'e8569569549580e62aaf75b8f4a759d5', 'af772af5e47050b906b8dc5f71efa76e', '3d20882750f46c89d64300fcb45424ae', '39be757346ecd48bce9e3b9ce4b2bb0c', '81ade189c6fda6a6d405a240f37989a7', 'c41c73d21bfea0ade03346b169864391', 'c0a55a31e49e98a9d6664e05b8702a5b', '47520892700b208136b0365136789078', '5d3a6e952fb8cdde9b918b34bf626e05', '3e4db1b0ea7a8d6499c71e2851537f77', '1a2e68f56d7dae8e4369b55d3c82ecef', 'bc4908c241186bf2397a60837fd601b8', '64babcb1fbda40238397712e62bd418f', '56b56e26b932fab51fa35f2946788353', 'a053819838a1dc21b8f1f2761a476029', 'b4f5a15df471762b3d3f15e10ada7171', 'd629ca20350bf128696d0086e8ba1762', 'fc2d0d092e23c52a0a219e26554eb40e', '8518868ded7e9a76d1e21914656225c3', 'f7d55a23b8f1ad646d77ac69526abb22', '767f3c58c6fddb6fee0c2b03d61499e0', '64910b54509bb37879cb61a133434b15', '0d7c0dedd986f0062147701f65253c89', '0d0a7d1e8836bc96e0ddd6c072535c75', 'f573ab803966a22b6e2060e86baf9803', '7258c9222dec4148e4e9bde4b4f6afee', '30d269e7cbc4ba4586e7f8c9b84ad5c5', 'b237dc219aedbe90928a29c700d08d60', 'a168bc2802e792320cfab8ed3ed5f5aa', 'd34f25ea05cf09e53a6eeb621249eaf9', '82c63353ee1e8aa779afa3e6c7c2aa2e', '09ae7cead00b87c87234745a7155faed', 
'f37ff2114d5a0078c0d782a4e9cf2646', '719101222a537d7214776067541b7dc8', '8d583b2ee0dc364257f7cb5e3e9571ba', '05cfaef20dd799bf7b135f8486c56922', 'c431244d360e3d09a87f24c87ee9a9d5', 'b119a39d90795c2d8c58cc525740b2de', 'c49d9157b44086dad352d2866880eff7', '975b8049ed717e42161e2ea4b3ebfc1c', '88cbb6e64e123c3da8f5eb64908e041e', 'c89ee7997fcecf5103063d18412a2159', '2402d0b852f2719838715a5483b67f90', '6972ad489459106c6a38b4ceacb1fdda', 'a583b5daaf6a730d836371b2dae4df22', '6ead8a81311508ffab0ac0128482c3b9', '904e9ee3a6001249cfeb13247082c067', '0b7f097bfc132334cdbb3bdcf078c82c', 'd8b4faffdb0a557a6bea5ae0eea4053f', 'bad4561e5cfdc3eb113144c08b3253a1', '7fd2e5f6832bcf6975b247937d175718', '171703ad36d950b4a9fa1de65627d3cb', '4d32efcbb63b326f72c6a055a2505625', '653fd442746b1d58061c58f1131ca39b', 'f48a364c29a0a0353284e4a6b6471471', '2cd2e01cfa43e6f8e6ff9b72fa5bd931', '06c9aa1712329ae572986b32b0fe0343', '1b7d61ec7377d0608370f25469477f62', '41f4e509d5096168cc118de4388fcbf5', 'cd915974bc6420f03e825049c84ff7f7', 'a360bcce8248c707faa9a3a8abdda997', '2157d4fbab685a25dcce9534d03c6d41', '9f8a677c3c6fa1509caa703ef7916c35', '28aa921921dae3d9dcc83516a340420c', '0b8c1356537d121dce47ff385ce91450', '2ba48182d48f83af1b96a6ac34d80dbb', '9f220c00fd0003c0b51b08dbaf5d2f2d', '17465d16e0c6a6addf57547fee84bb1a', '3ba711f77a46f743aa6deafce77f8576', '142a8c532b54289fea4a7f1687998772', '575ec950ce1ca0684b1add581cb0c995', '94aba9c9160c90cdf98f0af4fb203ce4', '9ed041c2bdcf38442c449b9e1b18535c', 'e7a108a395663e07775218a03583ed81', '94add8111457f05850e8bd5d4cc3a7a3', '5054d386528dc4ae36e845a1cc3cdae0', '666193de419791b49959af7932f5c8c2', 'eb6d9abf15d5220f992a9ac27a5001fe', 'f46cc14603a04178db39532dc15c176b', 'c3813cd5a191ef574003db15d615cbb8', 'e3fea6941cf58fa9b4e898d8955649dd', 'ce08c1226d445a18b7ce2ebc80a7096e', '8a8dfcf2c4cd350651bb8734955765aa', '9634bea378870bf33a7e347fdabf2456', '39b5f0e9e1a1f50e97c7a9789d5e2af0', 'cdac30e8a60ac5f771c987518dca38aa', '6cc2d361537457056399f96e437316ba', 'adfe05f999905e8b3341d3a8ba3d63ad', '37b0829936bcd9934a15eed60fa8532d', '496e10c729a3437c0654d4b5d4c09d82', 'd9c96f6ee613abc7d7949c3377681413', 'f0cb10da92bc2083159e443319216a37', '15045d3e23344df09a1d8c005bca41d5', '15d02f99130c0283a580bed0bed78275', '2d15324eba0e70ef4a1451d5321e980e', 'c7d3b831381fb5e75f144433358493b6', '9d4d42de8c64f7c79fd82fb3bb8960c4', '4bfe10874285d8d68486b8c991462ac8', '200c064f3fbf08ddcbbfb2f36118b066', '8eecc61b4ccca9910892a9c267d35d07', '41c698f0fefac0e702a03bbe38c755ee', '9221ec3ed5a0cac9cc211b1f76fc2642', '835f40e6144719e73f26d260b8ad6c80', '463d9a27874682c6001c748196070b22', 'a236d75f2c03e16fbbac2825d42eb914', '83ad78f86297babe27c2ea9cb47b4675', '421f72e8a492f524b696dce4658879e2', 'ebff992242dcd49d4004414a94581574', 'd6b7c0a83da2af88d584dad256dd08e0', '6a559666d4f4561014c97c2177068f78', '67d0e885fbbd04b3675512bb31d1a0e0', 'dd8ecfa0c5e14fc9dfa9903f3501a07b', '590ca279d64e0fc64a5a1f87d91c6bfb', 'fa457b5f7729ecaad819d201ffc49784', 'cc36fbe3a57fa2b04738ab2a7b0e5e12', '864e494bca15d43d2a036610712c0180', 'f8509fc848701a8f2ee6c0c0a59936b8', '903a785a05d396bb9f3b3b1db1e59d7e', '87d48c0f33a9213c9c252535a399ecf4', '3deb452f47e2dba10f8baea299721ca9', '12d512a24a5210ceb6074e7bdaebc227', 'ae97d323473045ec06cfc1fd165808ce', '88199cab3584d87a46150bda69379a9c', '09be0cb04e6fd9080bc7ccf8366373ee', '2d81d266eaf9148fdf177891eb51d708', 'b3d89252b2dcdf7ab1b6ea69e05b4e7b', '4e642b8d7055956511fd63978aec8d0d', '3b3b42273065160a4cbf88d5507a0dc0', 'b5073b22539ea0a59136f8b540c95b02', 'bd723b441e4de6d16bf3ef78a0ed10ad', 
'a65ad70ea91d2a1af991e304fadd62a4', '6f451320b2bafe1e730a8b6e2bd833a5', '91dfef441587afeac249429a4dd44cf8', '38a8dcbd0ebfa6961a368f4fc17145ab', '8fc484b27aa36a70fd67edefd9e2477f', '6b4b817b770b8ccb189a2597c694506b', 'f45a2ec2f61ca00a1e099f97f316d79b', 'f53603d80e75c2e3be16c3feb36a539d', '531d0a1f09348b04f5dc54bf84583cc1', '879bd82b192d004e0fb657469c76fc51', '2b50e856f51760e84318e7092fdeb0d8', '2e8f9fc2dcdd0dd7752ad2d268566b53', '80cd82b51fcebfcd90ee3af11adc09e1', 'd18e62ff9a47263dd6ab7c3087879692', 'e6c1afcc7bc65980b80bc5e921bfca06', '806fbd7dd888ce61828fe4395b6f4e75', '1bc94726d834ef7b9535d51e78bc8c55', '09598adef2692a5ecc76bf5e258c2ec1', 'b629be66e8e45a28ed259cfe274af605', '28bb026e961ba92d5f792df7039555ae', '7e28b3a432db2f50621c6fe42f2f36ba', '0d828e3d92683f430d43c51c9fb5b54d', '84536d2896d31e74666b6aba27aae014', '3da255cae775b060a22dc63fda61fefa', 'eec32b35fca1c7403c1fea55d39afda5', 'f44a80af3c1bd429a946f4a274113e6b', 'e64d2d6c8000833d9b5883073485ff03', '699193b16271dbb8b49f218310243030', '1792cda8cff53c86561827b682b91bed', '5f862b1fa2ca69490d9bbac6ceae0337', '57bd90ae2a361f1382fc725fef0a0921', 'd382ebbd528354123e59a86233c32b06', 'd6d2c4d555de208ebbbe6c82b0aafd17', '8fd7fabebed71969fbe86d7364c8861d', '1be229c4bbd1962054419c0afef9eee4', '8834cc11931a2e49c5e0d54e9da9aad9', '743729efb199a6558d0899195d8ca740', 'f7bec41b29d58c70966f97ea9b52bb3e', '0190b8458cfbd28683eb07811a7a7ec5', 'fd816aea82671f71e3eb67e977af00a0', '6f4f1114bd7840ed5e5d756ef8c6432b', 'e9b5c43b5fc7642377401d25a9bedd2d', 'a35a63e506a0b73d125ff0080bdd588c', '1b84156caef23d63398ece07abe5929e', 'de14cd74a8e95268187653ec969312c0', '4b3e0bfd44175b0cae7c36b9e8e519e3', '104a4d3bd3aeeac50d2cc2fb4493bce4', 'cc1d54da1222b9a544b08ff4f645f3fd', 'c80255158cfaef8697d6c6bfc2362341', 'c87009f5755a9b6aa356a1935a57e5a9', '36eeb567a2ea72c1b54cdd9d46a6647b', 'c5ab8758f3c8796ab006d65be6fc25f8', '319f3b8aadfb71e710e070af1bb1047c', '02c93ce8c413e2e8e77975123aecec49', '91ee38d3951b17dfc4c7bca94b6d6dd8', '2e9de09614d9065d6dfc2ae16d4268c1', '57bba769688fb4372296000214954f56', 'fbc81ab85f91aa162fedb88547abc2ce', '05bdc47cee2f106023cbd8617b0997c4', 'a88fd0271a539c09d0c906153736e111', '466e95e1a82f48dd1fdc2a119f8df9fc', 'e580bba0a66774889eb04ca5ca6f8e37', 'd0668af748d1fd0ae7347eb90a5d4fcf', 'be84c54af594465edf6dede1977dee38', 'b582dff2a2b91f406b14ff64e032c72e', '29705fbaf540d6f6b2f8c67313b3070c', 'e3160345920372b267101bdae099bc20', '0a0620ab140b578867661ce0f0abf856', '0bfbf291188a05a0967b5ec02f574357', '98c5511cd321f13f7c1953986475c8f0', '51f0ada675f84142440bc7e56caa57ba', 'cea5cc20e8b7e2c2de0deaa3fb32a371', '7d7212ceeaa7cb2795038bc79da98159', '7d3db7859099303167c8396310a79fe8', 'e37bf85f107b2ab0fe65cc84a7ae103e', '70eb3bc85ff6e3914131f5a6cd81834c', 'b7ee2b49525d85788b253a1508efd4f7']\n", 247 | "Len test_q 511\n", 248 | "Len dev_q 511\n", 249 | "Len train_q 3196\n", 250 | "[W] Learning Tfidf Vectorizer ...\n", 251 | "/usr/local/lib/python3.7/dist-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", 252 | " \"The parameter 'token_pattern' will not be used\"\n", 253 | "[W] Learning BM25 Vectorizer ...\n", 254 | "Loading BM25 model ...\n", 255 | "tcmalloc: large alloc 1570521088 bytes == 0x5652b8a66000 @ 0x7fa378b0b1e7 0x7fa375d2646e 0x7fa375d76c7b 0x7fa375d7735f 0x7fa375e19103 0x56521c580544 0x56521c580240 0x56521c5f4627 0x56521c581afa 0x56521c5f3d00 0x56521c581afa 0x56521c5f3d00 0x56521c583b6b 0x56521c5c57bf 0x56521c4d8654 0x56521c563933 0x56521c564d64 
0x56521c5f7134 0x56521c5eeced 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c581bda 0x56521c5ef915 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d\n", 256 | "tcmalloc: large alloc 1570521088 bytes == 0x56531642a000 @ 0x7fa378b0d001 0x7fa375d2654f 0x7fa375d76b58 0x7fa375d7ab17 0x7fa375e19203 0x56521c580544 0x56521c580240 0x56521c5f4627 0x56521c581afa 0x56521c5f3d00 0x56521c5ee9ee 0x56521c581bda 0x56521c5f3d00 0x56521c5eeced 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c581bda 0x56521c5ef915 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d 0x56521c6b86e6 0x56521c690163 0x56521c68fe0c 0x7fa3778f5bf7 0x56521c68fcea\n", 257 | "100% 3196/3196 [03:37<00:00, 14.69it/s]\n", 258 | "tcmalloc: large alloc 1570521088 bytes == 0x56537a852000 @ 0x7fa378b0b1e7 0x7fa375d2646e 0x7fa375d76c7b 0x7fa375d79e83 0x7fa375d7a07b 0x7fa375e1b761 0x56521c5804b0 0x56521c580240 0x56521c5f40f3 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d 0x56521c6b86e6 0x56521c690163 0x56521c68fe0c 0x7fa3778f5bf7 0x56521c68fcea\n", 259 | "tcmalloc: large alloc 1570521088 bytes == 0x5653d8216000 @ 0x7fa378b0b1e7 0x7fa375d2646e 0x7fa375d76c7b 0x7fa375d76d18 0x7fa375dc16e7 0x7fa375dc4fcc 0x7fa375e0fac1 0x56521c5804b0 0x56521c671e1d 0x56521c5f3e99 0x56521c5ee9ee 0x56521c581bda 0x56521c5f0737 0x56521c5ee9ee 0x56521c5ee6f3 0x56521c6b84c2 0x56521c6b883d 0x56521c6b86e6 0x56521c690163 0x56521c68fe0c 0x7fa3778f5bf7 0x56521c68fcea\n", 260 | "2747 63920 3298 P: 0.042975594493116394 R: 0.832929047907823 F1: 0.08173405932934631 F2: 0.17811754331362173\n", 261 | "Number data pairs: 64471\n", 262 | "100% 511/511 [00:11<00:00, 43.06it/s]\n", 263 | "division by zero\n", 264 | "Number data pairs: 10220\n", 265 | "100% 511/511 [00:07<00:00, 66.01it/s]\n", 266 | "division by zero\n", 267 | "Number data pairs: 10220\n", 268 | "len(train_data_pairs_id), len(test_data_pairs_id), len(dev_data_pairs_id) = 64471 10220 10220\n" 269 | ] 270 | } 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "source": [ 276 | "# folder save fine-tuned model \n", 277 | "!mkdir legal_text_retrieval/settings\n", 278 | "\n", 279 | "!cd legal_text_retrieval/scripts/ && bash run_finetune_bert.sh \"magic\" vinai/phobert-base \"../\" data/zalo-tfidfbm25150-full Tfbm150E5-full 5\n" 280 | ], 281 | "metadata": { 282 | "id": "OlcF4OdnCaJF", 283 | "colab": { 284 | "base_uri": "https://localhost:8080/" 285 | }, 286 | "outputId": "fec9883b-75d1-4e4f-bf79-37938044fc53" 287 | }, 288 | "execution_count": null, 289 | "outputs": [ 290 | { 291 | "output_type": "stream", 292 | "name": "stdout", 293 | "text": [ 294 | "mkdir: cannot create directory ‘legal_text_retrieval/settings’: File exists\n", 295 | "mkdir: cannot create directory ‘..//settings/Tfbm150E5-full42/’: File exists\n", 296 | "12/24/2021 08:40:27 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", 297 | "12/24/2021 08:40:27 - INFO - __main__ - Training/evaluation parameters TrainingArguments(output_dir=..//settings/Tfbm150E5-full42//models, overwrite_output_dir=True, do_train=True, do_eval=None, do_predict=True, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=16, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=1e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, 
adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=..//settings/Tfbm150E5-full42//models/tensorboard, logging_first_step=False, logging_steps=200, save_steps=500, save_total_limit=1, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=200, dataloader_num_workers=0, past_index=-1, run_name=..//settings/Tfbm150E5-full42//models, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=False, deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, _n_gpu=1)\n", 298 | "12/24/2021 08:40:27 - INFO - __main__ - load a local file for train: ..//data/zalo-tfidfbm25150-full/train.csv\n", 299 | "12/24/2021 08:40:27 - INFO - __main__ - load a local file for validation: ..//data/zalo-tfidfbm25150-full/dev.csv\n", 300 | "12/24/2021 08:40:27 - INFO - __main__ - load a local file for test: ..//data/zalo-tfidfbm25150-full/test.csv\n", 301 | "12/24/2021 08:40:27 - WARNING - datasets.builder - Using custom data configuration default-33c56ada0b4f05de\n", 302 | "Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-33c56ada0b4f05de/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...\n", 303 | "100% 3/3 [00:00<00:00, 9013.55it/s]\n", 304 | "100% 3/3 [00:00<00:00, 867.55it/s]\n", 305 | "Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-33c56ada0b4f05de/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. 
Subsequent calls will reuse this data.\n", 306 | "100% 3/3 [00:00<00:00, 631.39it/s]\n", 307 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:31,171 >> https://huggingface.co/vinai/phobert-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpb_xadf8_\n", 308 | "Downloading: 100% 557/557 [00:00<00:00, 416kB/s]\n", 309 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:31,314 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 310 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:31,314 >> creating metadata file for /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 311 | "[INFO|configuration_utils.py:449] 2021-12-24 08:40:31,315 >> loading configuration file https://huggingface.co/vinai/phobert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 312 | "[INFO|configuration_utils.py:485] 2021-12-24 08:40:31,316 >> Model config RobertaConfig {\n", 313 | " \"architectures\": [\n", 314 | " \"RobertaForMaskedLM\"\n", 315 | " ],\n", 316 | " \"attention_probs_dropout_prob\": 0.1,\n", 317 | " \"bos_token_id\": 0,\n", 318 | " \"eos_token_id\": 2,\n", 319 | " \"gradient_checkpointing\": false,\n", 320 | " \"hidden_act\": \"gelu\",\n", 321 | " \"hidden_dropout_prob\": 0.1,\n", 322 | " \"hidden_size\": 768,\n", 323 | " \"initializer_range\": 0.02,\n", 324 | " \"intermediate_size\": 3072,\n", 325 | " \"layer_norm_eps\": 1e-05,\n", 326 | " \"max_position_embeddings\": 258,\n", 327 | " \"model_type\": \"roberta\",\n", 328 | " \"num_attention_heads\": 12,\n", 329 | " \"num_hidden_layers\": 12,\n", 330 | " \"pad_token_id\": 1,\n", 331 | " \"position_embedding_type\": \"absolute\",\n", 332 | " \"tokenizer_class\": \"PhobertTokenizer\",\n", 333 | " \"transformers_version\": \"4.3.2\",\n", 334 | " \"type_vocab_size\": 1,\n", 335 | " \"use_cache\": true,\n", 336 | " \"vocab_size\": 64001\n", 337 | "}\n", 338 | "\n", 339 | "[INFO|configuration_utils.py:449] 2021-12-24 08:40:31,455 >> loading configuration file https://huggingface.co/vinai/phobert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a596f267f08b7158c7ab6300b1bf98eb6e1b05e6bcb0d7c18a8070364ee3011b.bbe27b2cac909b2279c83792c2d2b6f159f0a95f5d1c1eb66451da1c89a53609\n", 340 | "[INFO|configuration_utils.py:485] 2021-12-24 08:40:31,455 >> Model config RobertaConfig {\n", 341 | " \"architectures\": [\n", 342 | " \"RobertaForMaskedLM\"\n", 343 | " ],\n", 344 | " \"attention_probs_dropout_prob\": 0.1,\n", 345 | " \"bos_token_id\": 0,\n", 346 | " \"eos_token_id\": 2,\n", 347 | " \"gradient_checkpointing\": false,\n", 348 | " \"hidden_act\": \"gelu\",\n", 349 | " \"hidden_dropout_prob\": 0.1,\n", 350 | " \"hidden_size\": 768,\n", 351 | " \"initializer_range\": 0.02,\n", 352 | " \"intermediate_size\": 3072,\n", 353 | " \"layer_norm_eps\": 1e-05,\n", 354 | " \"max_position_embeddings\": 258,\n", 355 | " \"model_type\": \"roberta\",\n", 356 | " \"num_attention_heads\": 12,\n", 357 | " \"num_hidden_layers\": 12,\n", 358 | " \"pad_token_id\": 1,\n", 359 | " 
\"position_embedding_type\": \"absolute\",\n", 360 | " \"tokenizer_class\": \"PhobertTokenizer\",\n", 361 | " \"transformers_version\": \"4.3.2\",\n", 362 | " \"type_vocab_size\": 1,\n", 363 | " \"use_cache\": true,\n", 364 | " \"vocab_size\": 64001\n", 365 | "}\n", 366 | "\n", 367 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:31,597 >> https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp9hs_zwr5\n", 368 | "Downloading: 100% 895k/895k [00:00<00:00, 5.02MB/s]\n", 369 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:31,923 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/970c6224b2713c8b52a7bcfc4d5a951c9bb88302e4523388b50f28284e87ac44.26ba0c8945e559c68d0bc35d24fea16f5463a49fe8f134e0c32261d590b577fa\n", 370 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:31,923 >> creating metadata file for /root/.cache/huggingface/transformers/970c6224b2713c8b52a7bcfc4d5a951c9bb88302e4523388b50f28284e87ac44.26ba0c8945e559c68d0bc35d24fea16f5463a49fe8f134e0c32261d590b577fa\n", 371 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:32,068 >> https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpmnffyx5y\n", 372 | "Downloading: 100% 1.14M/1.14M [00:00<00:00, 6.34MB/s]\n", 373 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:32,394 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes in cache at /root/.cache/huggingface/transformers/f3a66ae0a78d1a53b3eb99e31837d0d8e2f684a2dcc1f52f75fd36873e3d79de.301ac8958de708ddcea8500d9acbe6261dba391d249c98dcda1e49dbbff870dd\n", 374 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:32,395 >> creating metadata file for /root/.cache/huggingface/transformers/f3a66ae0a78d1a53b3eb99e31837d0d8e2f684a2dcc1f52f75fd36873e3d79de.301ac8958de708ddcea8500d9acbe6261dba391d249c98dcda1e49dbbff870dd\n", 375 | "[INFO|tokenization_utils_base.py:1786] 2021-12-24 08:40:32,395 >> loading file https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/970c6224b2713c8b52a7bcfc4d5a951c9bb88302e4523388b50f28284e87ac44.26ba0c8945e559c68d0bc35d24fea16f5463a49fe8f134e0c32261d590b577fa\n", 376 | "[INFO|tokenization_utils_base.py:1786] 2021-12-24 08:40:32,395 >> loading file https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codes from cache at /root/.cache/huggingface/transformers/f3a66ae0a78d1a53b3eb99e31837d0d8e2f684a2dcc1f52f75fd36873e3d79de.301ac8958de708ddcea8500d9acbe6261dba391d249c98dcda1e49dbbff870dd\n", 377 | "[INFO|tokenization_utils.py:193] 2021-12-24 08:40:32,550 >> Adding to the vocabulary\n", 378 | "[WARNING|tokenization_utils_base.py:1904] 2021-12-24 08:40:32,550 >> Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n", 379 | "[INFO|file_utils.py:1302] 2021-12-24 08:40:32,688 >> https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp6_s100_b\n", 380 | "Downloading: 100% 543M/543M [00:14<00:00, 36.5MB/s]\n", 381 | "[INFO|file_utils.py:1306] 2021-12-24 08:40:47,625 >> storing https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin in cache at 
/root/.cache/huggingface/transformers/8363542cfd9e2bad1a9a618e87ea1153d84819a3ae581cff0816a2c1f610f433.42a5e558f15db4cc3af338445707272b8f7545df78efdc125d3fd51025b22d85\n", 382 | "[INFO|file_utils.py:1309] 2021-12-24 08:40:47,625 >> creating metadata file for /root/.cache/huggingface/transformers/8363542cfd9e2bad1a9a618e87ea1153d84819a3ae581cff0816a2c1f610f433.42a5e558f15db4cc3af338445707272b8f7545df78efdc125d3fd51025b22d85\n", 383 | "[INFO|modeling_utils.py:1027] 2021-12-24 08:40:47,626 >> loading weights file https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/8363542cfd9e2bad1a9a618e87ea1153d84819a3ae581cff0816a2c1f610f433.42a5e558f15db4cc3af338445707272b8f7545df78efdc125d3fd51025b22d85\n", 384 | "[WARNING|modeling_utils.py:1135] 2021-12-24 08:40:51,834 >> Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n", 385 | "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 386 | "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 387 | "[WARNING|modeling_utils.py:1146] 2021-12-24 08:40:51,834 >> Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n", 388 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", 389 | "./run_glue.py:351: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead\n", 390 | " f\"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the\"\n", 391 | "12/24/2021 08:40:51 - WARNING - __main__ - The max_seq_length passed (512) is larger than the maximum length for themodel (256). 
Using max_seq_length=256.\n", 392 | "100% 65/65 [01:00<00:00, 1.07ba/s]\n", 393 | "100% 11/11 [00:09<00:00, 1.18ba/s]\n", 394 | "100% 11/11 [00:09<00:00, 1.19ba/s]\n", 395 | "12/24/2021 08:42:15 - INFO - __main__ - Sample 41905 of the training set: {'#1 ID': 'e6d18a0365b648e5115ac22a6c5e5de4', '#2 ID': '25/2018/tt-bgtvt-->8-->', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 14997, 105, 747, 17, 1401, 7973, 109, 216, 145, 41, 63, 405, 57, 25, 189, 109, 10, 8078, 4838, 38, 45, 1868, 1823, 114, 2, 2, 432, 358, 5, 8284, 190, 687, 2493, 35, 109, 1611, 26, 141, 60, 109, 1611, 5, 251, 141, 60, 109, 1611, 4, 687, 2493, 35, 109, 1611, 41, 875, 1213, 706, 3176, 63, 181, 2493, 4, 508, 41, 1014, 9, 209, 791, 53, 97, 27, 99, 5, 4003, 1283, 27, 8170, 39, 7973, 12149, 227, 81, 682, 40, 41, 1796, 34, 16, 687, 363, 2764, 121, 15115, 748, 364, 12606, 105, 35, 22635, 76, 7, 2526, 23, 4, 420, 330, 28, 5134, 32, 17, 11, 239, 48, 613, 2231, 20, 599, 19, 5, 1161, 109, 1611, 10, 1681, 10115, 147, 4, 2444, 39, 7973, 12149, 227, 81, 30, 127, 705, 10115, 2764, 8, 245, 2231, 20, 599, 19, 5, 76, 5, 42778, 2010, 27, 1517, 19, 92, 7027, 2082, 6, 39, 7973, 12149, 227, 81, 682, 40, 4, 2493, 3419, 6516, 34, 405, 363, 2764, 925, 2231, 20, 599, 19, 4, 420, 330, 17, 239, 48, 210, 2231, 20, 599, 19, 65, 2774, 19, 4657, 2493, 1512, 10, 378, 3419, 17, 204, 4073, 34, 405, 363, 2764, 431, 2231, 20, 599, 19, 65, 420, 275, 778, 6, 5134, 330, 4, 378, 3419, 9, 687, 23, 17, 11, 204, 4388, 65, 1894, 19, 4657, 2493, 57, 89, 82, 1974, 682, 72, 4, 378, 3419, 2010, 7, 2493, 11, 600, 63, 7999, 2246, 12149, 7, 82, 1974, 1228, 5426, 5, 2], 'label': 0, 'sentence1': 'Đỗ xe ô_tô không sát mép đường phía bên phải theo chiều đi ở nơi đường có lề_đường hẹp sẽ bị xử_phạt bao_nhiêu ?', 'sentence2': 'Điều 8 . Yêu_cầu đối_với đoạn đường_bộ tại đường ngang khi xây_dựng mới đường ngang . Khi xây_dựng mới đường ngang , đoạn đường_bộ tại đường ngang phải đáp_ứng tiêu_chuẩn kỹ_thuật tương_ứng theo cấp đường_bộ , đồng_thời phải bảo_đảm các quy_định cụ_thể sau đây : 1 . Bình_diện : Đường_bộ từ mép ray ngoài cùng trở ra phải thẳng trên một đoạn dài tối_thiểu bằng Khoảng_cách tầm nhìn hãm xe tại Phụ_lục 2 của Thông_tư này , trường_hợp khó_khăn về địa_hình cũng không được nhỏ hơn 15 mét ( m ) . Đối_với đường ngang có bố_trí dải_phân_cách giữa , khoảng_cách từ mép ray ngoài cùng đến đầu đảo dải_phân_cách tối_thiểu là 6 mét ( m ) . 2 . 
Trắc dọc : a ) Trong lòng_đường sắt và từ mép ray ngoài cùng trở ra , đường_bộ dốc 0% trên chiều dài tối_thiểu 16 mét ( m ) , trường_hợp khó_khăn không nhỏ hơn 10 mét ( m ) ; b ) Đoạn đường_bộ tiếp_theo có độ dốc không quá 3% trên chiều dài tối_thiểu 20 mét ( m ) ; trường_hợp vùng núi và địa_hình khó_khăn , độ dốc các đoạn này không được quá 6% ; c ) Đoạn đường_bộ đi qua hai đường_sắt trở lên , độ dốc dọc của đường_bộ được xác_định theo cao_độ đỉnh ray của hai đường_sắt liền kề . 3 . Chiều rộng phần xe chạy của đoạn đường_bộ trong phạm_vi đường ngang không nhỏ hơn bề rộng phần xe chạy trên đường_bộ phía ngoài và không nhỏ hơn 6 mét ( m ) . Trường_hợp phải mở_rộng để mặt đường không nhỏ hơn 6 mét ( m ) thì đoạn tiếp_theo vuốt dần về bề rộng phần xe chạy trên đường_bộ ngoài phạm_vi đường ngang theo tỉ_lệ 10:1 . Bề rộng phần lề_đường tối_thiểu phải bảo_đảm đủ để lắp_đặt biển_báo hiệu đường_bộ . 4 . Trong phạm_vi đường ngang phải có đầy_đủ hệ_thống thoát nước để bảo_đảm thoát nước của khu_vực . 5 . Trên mặt đường bộ trong khu_vực đường ngang không có người gác được bố_trí gờ giảm_tốc , gồ giảm_tốc để tăng_cường an_toàn giao_thông . Việc xây_dựng gờ giảm_tốc , gồ giảm_tốc theo quy_định của Bộ Giao_thông vận_tải . Trong trường_hợp này , đoạn đường_bộ trong lòng_đường sắt và từ mép ray ngoài cùng trở ra , đường_bộ dốc 0% trên chiều dài tối_thiểu 25 mét ( m ) . 6 . Đường ngang cấp I , cấp II và đường ngang trong khu dân_cư phải có phần đường dành riêng cho người đi bộ trong phạm_vi đường ngang .', 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n", 396 | "12/24/2021 08:42:15 - INFO - __main__ - Sample 7296 of the training set: {'#1 ID': '472d7ac7531474310d85d5c4801c9518', '#2 ID': '01/2019/tt-bkhđt-->3-->', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 20466, 409, 535, 1799, 4570, 5027, 11, 209, 1398, 114, 2, 2, 432, 107, 5, 3947, 9636, 535, 1799, 4570, 5027, 5, 99, 5, 9636, 535, 1799, 4570, 5027, 646, 27, 3496, 35068, 535, 4, 35068, 6, 1610, 35068, 535, 7, 9, 518, 209, 35, 9, 22635, 4, 39, 22635, 100, 1555, 30, 22635, 100, 
9441, 1121, 1171, 63, 2526, 23, 5, 53681, 585, 535, 28, 150, 518, 42, 53, 27, 1517, 19, 53681, 585, 535, 28, 518, 16460, 257, 112, 63, 35068, 35, 22635, 100, 1555, 65, 2774, 19, 53681, 585, 535, 28, 518, 2884, 4, 14637, 6, 9511, 112, 63, 35068, 35, 22635, 100, 1966, 65, 1894, 19, 53681, 585, 535, 28, 518, 3934, 112, 63, 35068, 35, 22635, 100, 3491, 65, 2158, 19, 53681, 585, 535, 28, 518, 244, 158, 6, 1271, 112, 63, 35068, 35, 22635, 100, 5308, 65, 3494, 19, 53681, 585, 535, 28, 518, 2600, 6, 4298, 112, 63, 35068, 35, 22635, 100, 2754, 65, 1565, 19, 53681, 585, 535, 28, 518, 3357, 6, 2174, 112, 63, 35068, 35, 22635, 100, 9441, 5, 76, 5, 5617, 535, 8, 647, 5027, 98, 4, 214, 2850, 736, 11, 701, 791, 35, 1638, 34, 145, 41, 7, 150, 35068, 535, 1799, 5, 107, 5, 5617, 154, 535, 8, 304, 187, 1530, 5027, 11, 701, 791, 1638, 34, 145, 41, 7, 150, 35068, 535, 1799, 4, 280, 532, 304, 535, 5, 163, 5, 15031, 154, 535, 8, 43, 154, 535, 11, 701, 791, 35, 1638, 34, 145, 732, 7, 150, 35068, 535, 2], 'label': 1, 'sentence1': 'Phương_thức gửi báo_cáo thống_kê Ngành Thống_kê được quy_định như_thế_nào ?', 'sentence2': 'Điều 3 . Nội_dung Chế_độ báo_cáo thống_kê Ngành Thống_kê . 1 . Chế_độ báo_cáo thống_kê Ngành Thống_kê bao_gồm : danh_mục biểu_mẫu báo_cáo , biểu_mẫu và giải_thích biểu_mẫu báo_cáo của các lĩnh_vực quy_định tại các Phụ_lục , từ Phụ_lục số I đến Phụ_lục số VI ban_hành kèm theo Thông_tư này . Biểu_mẫu báo_cáo về từng lĩnh_vực như sau : a ) Biểu_mẫu báo_cáo về lĩnh_vực Tài_khoản quốc_gia thực_hiện theo biểu_mẫu tại Phụ_lục số I ; b ) Biểu_mẫu báo_cáo về lĩnh_vực Nông_nghiệp , Lâm_nghiệp và Thuỷ_sản thực_hiện theo biểu_mẫu tại Phụ_lục số II ; c ) Biểu_mẫu báo_cáo về lĩnh_vực Công_nghiệp thực_hiện theo biểu_mẫu tại Phụ_lục số III ; d ) Biểu_mẫu báo_cáo về lĩnh_vực vốn đầu_tư và Xây_dựng thực_hiện theo biểu_mẫu tại Phụ_lục số IV ; đ ) Biểu_mẫu báo_cáo về lĩnh_vực Thương_mại và Dịch_vụ thực_hiện theo biểu_mẫu tại Phụ_lục số V ; e ) Biểu_mẫu báo_cáo về lĩnh_vực Xã_hội và Môi_trường thực_hiện theo biểu_mẫu tại Phụ_lục số VI . 2 . Đơn_vị báo_cáo là Cục Thống_kê tỉnh , thành_phố trực_thuộc Trung_ương được ghi cụ_thể tại góc trên bên phải của từng biểu_mẫu báo_cáo thống_kê . 3 . Đơn_vị nhận báo_cáo là đơn_vị thuộc Tổng_cục Thống_kê được ghi cụ_thể góc trên bên phải của từng biểu_mẫu báo_cáo thống_kê , dưới dòng đơn_vị báo_cáo . 4 . Thời_hạn nhận báo_cáo là ngày nhận báo_cáo được ghi cụ_thể tại góc trên bên trái của từng biểu_mẫu báo_cáo . 5 . Phương_thức gửi báo_cáo : Cục Thống_kê tỉnh , thành_phố trực_thuộc Trung_ương gửi báo_cáo bằng văn_bản hoặc báo_cáo điện_tử trên hệ_thống đến Tổng_cục Thống_kê theo thời_gian quy_định trên từng biểu_mẫu . Biểu_mẫu báo_cáo bằng văn_bản giấy phải có chữ_ký , đóng_dấu của Thủ_trưởng đơn_vị gửi báo_cáo để thuận_tiện cho việc kiểm_tra , đối_chiếu , xử_lý số_liệu . Biểu_mẫu báo_cáo qua hệ_thống báo_cáo điện_tử được thể_hiện dưới hai hình_thức là định_dạng file pdf của văn_bản giấy hoặc dưới dạng tệp_tin điện_tử được xác_thực bằng chữ_ký số của Thủ_trưởng đơn_vị báo_cáo . 6 . Ký_hiệu biểu Ký_hiệu biểu gồm hai phần : phần số và phần chữ ; phần số được đánh liên_tục từ 001 , 002 , 003 , ... 
; phần chữ được ghi chữ in viết tắt sao cho phù_hợp với từng lĩnh_vực và kỳ báo_cáo ( năm - N ; Quý - Q ; tháng - T ; ) ; lấy chữ BCC ( Báo_cáo Cục ) thể_hiện cho hệ biểu_mẫu báo_cáo thống_kê .', 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n", 397 | "12/24/2021 08:42:15 - INFO - __main__ - Sample 1639 of the training set: {'#1 ID': 'e70faf39a18707bbd53629415ed9f8e2', '#2 ID': '65/2014/tt-bqp-->1-->', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [0, 9636, 470, 1420, 1038, 71, 2056, 190, 6218, 1564, 14, 10, 261, 546, 2469, 11, 209, 1398, 114, 2, 2, 432, 99, 5, 18998, 947, 6, 429, 731, 5, 99, 5, 18998, 947, 4, 429, 731, 27, 2526, 23, 1010, 112, 1150, 470, 1420, 1038, 636, 6, 470, 1420, 1038, 71, 2056, 190, 6218, 1564, 6, 18, 47, 247, 38072, 17, 41, 8, 6218, 4, 863, 603, 20, 53, 97, 328, 8, 6218, 1564, 19, 52, 1666, 1315, 1038, 63, 614, 1038, 53367, 151, 614, 1038, 1533, 706, 38072, 65, 1386, 3391, 1958, 6, 18, 47, 247, 85, 12, 116, 38072, 20, 53, 97, 328, 8, 1386, 3391, 1958, 19, 52, 1666, 1315, 1038, 63, 9, 614, 1038, 1533, 4, 2729, 4, 29801, 2537, 4, 533, 1121, 1171, 63, 1545, 3914, 33956, 15745, 43, 948, 78, 445, 29, 2758, 7, 315, 28, 1150, 4016, 190, 416, 4, 2205, 4, 3391, 6, 4730, 20, 53, 328, 2135, 8, 1545, 3914, 33956, 4432, 31477, 15831, 76, 5, 4507, 17, 731, 27, 9636, 470, 1420, 1038, 636, 6, 470, 1420, 1038, 71, 2056, 209, 35, 2526, 23, 17, 731, 190, 6218, 1564, 14, 10, 261, 886, 533, 40726, 151, 14, 1666, 1315, 1420, 1038, 710, 12, 276, 187, 614, 1038, 6218, 1564, 65, 1386, 3391, 1958, 14, 10, 261, 11719, 151, 14, 1666, 1315, 1420, 1038, 710, 12, 13895, 187, 9, 614, 1038, 1533, 4, 2729, 4, 29801, 2537, 4, 533, 1121, 1171, 63, 1545, 3914, 33956, 4432, 19768, 2, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0, 'sentence1': 'Chế_độ nâng bậc lương trước thời_hạn đối_với quân_nhân chuyên_nghiệp đã có quyết_định chuẩn_bị hưu được quy_định như_thế_nào ?', 'sentence2': 'Điều 1 . Phạm_vi điều_chỉnh và đối_tượng áp_dụng . 1 . 
Phạm_vi điều_chỉnh , đối_tượng áp_dụng : Thông_tư này hướng_dẫn thực_hiện chế_độ nâng bậc lương thường_xuyên và nâng bậc lương trước thời_hạn đối_với quân_nhân chuyên_nghiệp và người làm công_tác cơ_yếu không phải là quân_nhân , công_an nhân_dân ( sau đây gọi là quân_nhân chuyên_nghiệp ) đang xếp hưởng lương theo bảng lương QNCN hoặc bảng lương chuyên_môn kỹ_thuật cơ_yếu ; công_nhân viên_chức quốc_phòng và người làm công_tác khác trong tổ_chức cơ_yếu ( sau đây gọi là công_nhân viên_chức quốc_phòng ) đang xếp hưởng lương theo các bảng lương chuyên_môn , nghiệp_vụ , thừa_hành , phục_vụ ban_hành kèm theo Nghị_định 204/2004/NĐ-CP ngày 14 tháng 12 năm 2004 của Chính_phủ về chế_độ tiền_lương đối_với cán_bộ , công_chức , viên_chức và lực_lượng_vũ_trang ( sau gọi tắt là Nghị_định 204/2004/NĐ-CP). 2 . Đối_tượng không áp_dụng : Chế_độ nâng bậc lương thường_xuyên và nâng bậc lương trước thời_hạn quy_định tại Thông_tư này không áp_dụng đối_với quân_nhân chuyên_nghiệp đã có quyết_định thôi phục_vụ tại_ngũ hoặc đã xếp hưởng bậc lương cuối_cùng trong nhóm thuộc bảng lương quân_nhân chuyên_nghiệp ; công_nhân viên_chức quốc_phòng đã có quyết_định thôi_việc hoặc đã xếp hưởng bậc lương cuối_cùng trong ngạch thuộc các bảng lương chuyên_môn , nghiệp_vụ , thừa_hành , phục_vụ ban_hành kèm theo Nghị_định 204/2004/NĐ-CP.', 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n", 398 | "[INFO|trainer.py:432] 2021-12-24 08:42:26,777 >> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence2, #2 ID, #1 ID, sentence1.\n", 399 | "[INFO|trainer.py:837] 2021-12-24 08:42:26,780 >> ***** Running training *****\n", 400 | "[INFO|trainer.py:838] 2021-12-24 08:42:26,780 >> Num examples = 64471\n", 401 | "[INFO|trainer.py:839] 2021-12-24 08:42:26,780 >> Num Epochs = 5\n", 402 | "[INFO|trainer.py:840] 2021-12-24 08:42:26,780 >> Instantaneous batch size per device = 16\n", 403 | "[INFO|trainer.py:841] 2021-12-24 08:42:26,781 >> Total train batch size (w. 
parallel, distributed & accumulation) = 16\n", 404 | "[INFO|trainer.py:842] 2021-12-24 08:42:26,781 >> Gradient Accumulation steps = 1\n", 405 | "[INFO|trainer.py:843] 2021-12-24 08:42:26,781 >> Total optimization steps = 20150\n", 406 | " 2% 500/20150 [11:45<7:41:22, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 08:54:12,294 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-500\n", 407 | "[INFO|configuration_utils.py:304] 2021-12-24 08:54:12,298 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-500/config.json\n", 408 | "[INFO|modeling_utils.py:817] 2021-12-24 08:54:13,782 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-500/pytorch_model.bin\n", 409 | " 5% 1000/20150 [23:37<7:29:56, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:06:04,682 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-1000\n", 410 | "[INFO|configuration_utils.py:304] 2021-12-24 09:06:04,684 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1000/config.json\n", 411 | "[INFO|modeling_utils.py:817] 2021-12-24 09:06:06,238 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1000/pytorch_model.bin\n", 412 | "[INFO|trainer.py:1467] 2021-12-24 09:06:11,830 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-500] due to args.save_total_limit\n", 413 | " 7% 1500/20150 [35:30<7:18:26, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:17:57,165 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-1500\n", 414 | "[INFO|configuration_utils.py:304] 2021-12-24 09:17:57,167 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1500/config.json\n", 415 | "[INFO|modeling_utils.py:817] 2021-12-24 09:17:58,685 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-1500/pytorch_model.bin\n", 416 | "[INFO|trainer.py:1467] 2021-12-24 09:18:04,969 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-1000] due to args.save_total_limit\n", 417 | " 10% 2000/20150 [47:23<7:05:25, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:29:50,298 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-2000\n", 418 | "[INFO|configuration_utils.py:304] 2021-12-24 09:29:50,300 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2000/config.json\n", 419 | "[INFO|modeling_utils.py:817] 2021-12-24 09:29:51,865 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2000/pytorch_model.bin\n", 420 | "[INFO|trainer.py:1467] 2021-12-24 09:29:57,839 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-1500] due to args.save_total_limit\n", 421 | " 12% 2500/20150 [59:17<6:53:57, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:41:44,472 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-2500\n", 422 | "[INFO|configuration_utils.py:304] 2021-12-24 09:41:44,474 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2500/config.json\n", 423 | "[INFO|modeling_utils.py:817] 2021-12-24 09:41:46,178 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-2500/pytorch_model.bin\n", 424 | "[INFO|trainer.py:1467] 2021-12-24 09:41:52,026 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-2000] due to args.save_total_limit\n", 425 | " 15% 3000/20150 [1:11:10<6:42:10, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 09:53:37,468 >> Saving model 
checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-3000\n", 426 | "[INFO|configuration_utils.py:304] 2021-12-24 09:53:37,470 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3000/config.json\n", 427 | "[INFO|modeling_utils.py:817] 2021-12-24 09:53:39,073 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3000/pytorch_model.bin\n", 428 | "[INFO|trainer.py:1467] 2021-12-24 09:53:44,894 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-2500] due to args.save_total_limit\n", 429 | " 17% 3500/20150 [1:23:00<6:28:14, 1.40s/it][INFO|trainer.py:1408] 2021-12-24 10:05:27,782 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-3500\n", 430 | "[INFO|configuration_utils.py:304] 2021-12-24 10:05:27,786 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3500/config.json\n", 431 | "[INFO|modeling_utils.py:817] 2021-12-24 10:05:29,457 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-3500/pytorch_model.bin\n", 432 | "[INFO|trainer.py:1467] 2021-12-24 10:05:35,344 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-3000] due to args.save_total_limit\n", 433 | " 20% 4000/20150 [1:34:53<6:20:10, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 10:17:19,997 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-4000\n", 434 | "[INFO|configuration_utils.py:304] 2021-12-24 10:17:19,999 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4000/config.json\n", 435 | "[INFO|modeling_utils.py:817] 2021-12-24 10:17:21,585 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4000/pytorch_model.bin\n", 436 | "[INFO|trainer.py:1467] 2021-12-24 10:17:27,902 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-3500] due to args.save_total_limit\n", 437 | " 22% 4500/20150 [1:46:44<6:06:33, 1.41s/it][INFO|trainer.py:1408] 2021-12-24 10:29:11,166 >> Saving model checkpoint to ..//settings/Tfbm150E5-full42//models/checkpoint-4500\n", 438 | "[INFO|configuration_utils.py:304] 2021-12-24 10:29:11,168 >> Configuration saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4500/config.json\n", 439 | "[INFO|modeling_utils.py:817] 2021-12-24 10:29:12,764 >> Model weights saved in ..//settings/Tfbm150E5-full42//models/checkpoint-4500/pytorch_model.bin\n", 440 | "[INFO|trainer.py:1467] 2021-12-24 10:29:20,512 >> Deleting older checkpoint [../settings/Tfbm150E5-full42/models/checkpoint-4000] due to args.save_total_limit\n", 441 | " 23% 4588/20150 [1:48:57<6:04:03, 1.40s/it]" 442 | ] 443 | } 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "source": [ 449 | "!cd legal_text_retrieval && python src/infer.py" 450 | ], 451 | "metadata": { 452 | "id": "YCs6_efIY7hM" 453 | }, 454 | "execution_count": null, 455 | "outputs": [] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "source": [ 460 | "" 461 | ], 462 | "metadata": { 463 | "id": "nwuEWf4f0aSz" 464 | }, 465 | "execution_count": null, 466 | "outputs": [] 467 | } 468 | ] 469 | } --------------------------------------------------------------------------------