├── .gitignore ├── README.md ├── data └── README.md ├── docs ├── book │ ├── .gitignore │ ├── .travis.yml │ ├── Makefile │ ├── README.md │ ├── make.bat │ └── src │ │ ├── chapters │ │ ├── abstract-en.tex │ │ ├── abstract-id.tex │ │ ├── appendix-1.tex │ │ ├── appendix-2.tex │ │ ├── appendix-3.tex │ │ ├── approval.tex │ │ ├── chapter-1.tex │ │ ├── chapter-2.tex │ │ ├── chapter-3.tex │ │ ├── chapter-4.tex │ │ ├── chapter-5.tex │ │ ├── cover.tex │ │ ├── daftar_istilah.tex │ │ ├── forewords.tex │ │ ├── instruction.tex │ │ └── statement.tex │ │ ├── config │ │ ├── hypenation-id.tex │ │ ├── if-itb-thesis.sty │ │ └── informations.tex │ │ ├── references.bib │ │ ├── resources │ │ ├── Arsitektur-TA-1.png │ │ ├── Arsitektur-TA-2.png │ │ ├── Arsitektur-TA-3.png │ │ ├── Data-tipe-A.png │ │ ├── Data-tipe-B.png │ │ ├── Data-tipe-C.png │ │ ├── Full-fine-tune.png │ │ ├── Head-fine-tune.png │ │ ├── cbow-skip-gram-illustration.png │ │ ├── cover-ganesha.jpg │ │ ├── data_xlm_r.png │ │ ├── ilustrasi-mlm.png │ │ ├── ilustration-eng-spn-word.png │ │ ├── linimasa-1.jpg │ │ ├── linimasa-2.jpg │ │ ├── luong_et_al_2015.jpg │ │ ├── overview-attention.png │ │ ├── overview-transformer.png │ │ ├── plot-full-prosa-xlmr.png │ │ ├── plot-full-toxic-xlmr.png │ │ ├── plot-full-trip-advisor-xlmr-duplicate.png │ │ ├── plot-full-trip-advisor-xlmr.png │ │ ├── plot-gain-mbert.png │ │ ├── plot-gain-xlmr.png │ │ ├── plot-head-prosa-mbert.png │ │ ├── plot-head-prosa-xlmr.png │ │ ├── plot-head-toxic-mbert.png │ │ ├── plot-head-toxic-xlmr.png │ │ ├── plot-head-trip-mbert.png │ │ ├── plot-head-trip-xlmr.png │ │ ├── prosa-mbert-eng-1.png │ │ ├── prosa-mbert-eng-2.png │ │ ├── prosa-mbert-malay-1.png │ │ ├── prosa-mbert-malay-2.png │ │ ├── prosa-xlmr-eng-1.png │ │ ├── prosa-xlmr-eng-2.png │ │ ├── prosa-xlmr-malay-1.png │ │ ├── prosa-xlmr-malay-2.png │ │ ├── tandatangan.png │ │ └── tandatangan_bu_ayu.png │ │ ├── thesis-blx.bib │ │ ├── thesis.fdb_latexmk │ │ ├── thesis.fls │ │ ├── thesis.run.xml │ │ └── thesis.tex └── 
paper │ ├── Figure_explained.png │ ├── Figure_symbol.png │ ├── Improving Indonesian Text Classification Using Multilingual Language Model.docx │ ├── Improving Indonesian Text Classification Using Multilingual Language Model.pdf │ ├── Improving Indonesian Text Classification Using Multilingual Language Model.zip │ └── Improving-Indonesian-Text-Classification-Using-Multilingual-Language-Model-Putra.pdf ├── notebooks ├── README.md ├── fine_tune_full │ ├── prosa │ │ └── xlm_r │ │ │ └── indoxtc-fine-tune-full-prosa-xlm-r.ipynb │ ├── toxic │ │ ├── xlm_r │ │ │ └── indoxtc-fine-tune-full-toxic-xlm-r-simpler.ipynb │ │ └── xlm_r_comparable │ │ │ └── indoxtc-fine-tune-full-toxic-xlm-r-comparable.ipynb │ └── trip_advisor │ │ ├── xlm_r │ │ └── indoxtc-fine-tune-full-tripadvisor-xlm-r.ipynb │ │ └── xlm_r_duplicate_removed │ │ └── indoxtc-fine-tune-full-tripadvisor-xlm-r-dupli.ipynb ├── fine_tune_head │ ├── extracting_features │ │ ├── jigsaw_toxic │ │ │ ├── mbert │ │ │ │ ├── indoxtc-combining-toxic-en-features-mbert.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-1.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-2.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-3.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-4.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-mbert-5.ipynb │ │ │ │ └── indoxtc-extracting-toxic-en-features-mbert-6.ipynb │ │ │ └── xlm_r │ │ │ │ ├── indoxtc-combining-toxic-en-features-xlm-r.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-xlm-r-1.ipynb │ │ │ │ ├── indoxtc-extracting-toxic-en-features-xlm-r-2.ipynb │ │ │ │ └── indoxtc-extracting-toxic-en-features-xlm-r-3.ipynb │ │ ├── prosa │ │ │ ├── mbert │ │ │ │ └── indoxtc-extracting-prosa-features-mbert.ipynb │ │ │ └── xlm_r │ │ │ │ └── indoxtc-extracting-prosa-features-xlm-r.ipynb │ │ ├── toxic │ │ │ ├── mbert │ │ │ │ └── indoxtc-extracting-toxic-features-mbert.ipynb │ │ │ └── xlm_r │ │ │ │ └── indoxtc-extracting-toxic-features-xlm-r.ipynb │ │ ├── 
trip_advisor │ │ │ ├── mbert │ │ │ │ └── indoxtc-extracting-tripadvisor-features-mbert.ipynb │ │ │ └── xlm_r │ │ │ │ └── indoxtc-extracting-tripadvisor-features-xlm-r.ipynb │ │ └── yelp_review │ │ │ ├── mbert │ │ │ ├── indoxtc-combining-yelp-features-mbert.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-1.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-2.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-3.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-4.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-5.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-6.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-7.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-mbert-8.ipynb │ │ │ └── indoxtc-extracting-yelp-features-mbert-9.ipynb │ │ │ └── xlm_r │ │ │ ├── indoxtc-combining-yelp-features-xlm-r.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-1.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-2.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-3.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-4.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-5.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-6.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-7.ipynb │ │ │ ├── indoxtc-extracting-yelp-features-xlm-r-8.ipynb │ │ │ └── indoxtc-extracting-yelp-features-xlm-r-9.ipynb │ ├── prosa │ │ ├── mbert │ │ │ └── indoxtc-fine-tune-head-prosa-mbert-all.ipynb │ │ └── xlm_r │ │ │ └── indoxtc-fine-tune-head-prosa-xlm-r-all.ipynb │ ├── toxic │ │ ├── mbert │ │ │ └── indoxtc-fine-tune-head-toxic-mbert-all.ipynb │ │ └── xlm_r │ │ │ └── indoxtc-fine-tune-head-toxic-xlm-r-all.ipynb │ └── trip_advisor │ │ ├── mbert │ │ └── indoxtc-fine-tune-head-tripadvisor-mbert-all.ipynb │ │ └── xlm_r │ │ └── indoxtc-fine-tune-head-tripadvisor-xlm-r-all.ipynb └── result_analysis │ ├── fine_tune_full │ ├── prosa │ │ └── xlm_r │ │ │ ├── Analyze Improvement.ipynb │ │ │ ├── Analyze Zero-shot.ipynb │ │ │ ├── Result Prosa.ipynb │ │ │ ├── 
final_prosa_yelp_xlm_r_result_combined_10981.csv │ │ │ ├── plot-full-prosa-xlmr.png │ │ │ ├── plot.png │ │ │ ├── result_prosa_yelp_XLM_R_A_10981_0.5_full.csv │ │ │ ├── result_prosa_yelp_XLM_R_B_10981_0.5_full.csv │ │ │ ├── result_prosa_yelp_XLM_R_C_10981_0.5_full.csv │ │ │ ├── result_prosa_yelp_XLM_R_C_10981_1.5_full.csv │ │ │ ├── result_prosa_yelp_XLM_R_C_10981_1_full.csv │ │ │ ├── result_prosa_yelp_XLM_R_C_10981_2_full.csv │ │ │ └── result_prosa_yelp_XLM_R_C_10981_3_full.csv │ ├── toxic │ │ ├── xlm_r │ │ │ ├── Analyze Improvement.ipynb │ │ │ ├── Analyze Zero-shot.ipynb │ │ │ ├── Result Toxic.ipynb │ │ │ ├── final_toxic_toxic_xlm_r_result_combined_11852.csv │ │ │ ├── plot-full-toxic-xlmr.png │ │ │ ├── result_toxic_toxic_XLM_R_A_11852_0.5_full.csv │ │ │ ├── result_toxic_toxic_XLM_R_B_11852_0.5_full.csv │ │ │ ├── result_toxic_toxic_XLM_R_C_11852_0.5_full.csv │ │ │ ├── result_toxic_toxic_XLM_R_C_11852_1.5_full.csv │ │ │ ├── result_toxic_toxic_XLM_R_C_11852_1_full.csv │ │ │ ├── result_toxic_toxic_XLM_R_C_11852_2_full.csv │ │ │ └── result_toxic_toxic_XLM_R_C_11852_3_full.csv │ │ └── xlm_r_comparable │ │ │ ├── Result Toxic Comparable.ipynb │ │ │ ├── result_Abusive_toxic_toxic_XLM_R_A_11852_0.5_full.csv │ │ │ └── result_HS_toxic_toxic_XLM_R_A_11852_0.5_full.csv │ └── trip_advisor │ │ ├── xlm_r │ │ ├── Analyze Improvement.ipynb │ │ ├── Analyze Zero-shot.ipynb │ │ ├── Result Trip.ipynb │ │ ├── final_trip_advisor_yelp_xlm_r_result_combined_12389.csv │ │ ├── plot-full-trip-advisor-xlmr.png │ │ ├── result_trip_advisor_yelp_XLM_R_A_12389_0.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_B_12389_0.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_0.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_1.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_1_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_12389_2_full.csv │ │ └── result_trip_advisor_yelp_XLM_R_C_12389_3_full.csv │ │ └── xlm_r_duplicate │ │ ├── Result Trip Dupli.ipynb │ │ ├── 
final_trip_advisor_yelp_xlm_r_result_combined_9816.csv │ │ ├── plot-full-trip-advisor-xlmr-duplicate.png │ │ ├── result_trip_advisor_yelp_XLM_R_A_9816_0.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_B_9816_0.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_0.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_1.5_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_1_full.csv │ │ ├── result_trip_advisor_yelp_XLM_R_C_9816_2_full.csv │ │ └── result_trip_advisor_yelp_XLM_R_C_9816_3_full.csv │ └── fine_tune_head │ ├── Gain analysis - mBERT.ipynb │ ├── Gain analysis.ipynb │ ├── compilation │ ├── average-f1-score-gains.png │ ├── plot-prosa-mbert-english.png │ ├── plot-prosa-xlmr-english.png │ ├── plot-toxic-mbert-english.png │ ├── plot-toxic-xlmr-english.png │ ├── plot-trip-mbert-english.png │ └── plot-trip-xlmr-english.png │ ├── plot.png │ ├── plot_mbert.png │ ├── prosa │ ├── mbert │ │ ├── Plot Result Prosa mBERT.ipynb │ │ ├── final_prosa_yelp_mBERT_result_combined_1.csv │ │ ├── final_prosa_yelp_mBERT_result_combined_2.csv │ │ ├── gains.csv │ │ ├── plot-prosa-mbert-english.png │ │ └── plot.png │ └── xlm_r │ │ ├── Plot Result Prosa XLMR.ipynb │ │ ├── final_prosa_yelp_XLM_R_result_combined_1.csv │ │ ├── final_prosa_yelp_XLM_R_result_combined_2.csv │ │ ├── gains.csv │ │ ├── plot-prosa-xlmr-english.png │ │ └── plot.png │ ├── toxic │ ├── mbert │ │ ├── Plot Result Toxic mBERT.ipynb │ │ ├── final_toxic_toxic_mBERT_result_combined_1.csv │ │ ├── final_toxic_toxic_mBERT_result_combined_2.csv │ │ ├── gains.csv │ │ ├── plot-toxic-mbert-english.png │ │ └── plot.png │ └── xlm_r │ │ ├── Plot Result Toxic XLMR.ipynb │ │ ├── final_toxic_toxic_XLM_R_result_combined_1.csv │ │ ├── final_toxic_toxic_XLM_R_result_combined_2.csv │ │ ├── gains.csv │ │ ├── plot-toxic-xlmr-english.png │ │ └── plot.png │ └── trip_advisor │ ├── mbert │ ├── Plot Result Trip mBERT.ipynb │ ├── final_trip_advisor_yelp_mBERT_result_combined_1.csv │ ├── 
final_trip_advisor_yelp_mBERT_result_combined_2.csv │ ├── gains.csv │ ├── plot-trip-mbert-english.png │ └── plot.png │ └── xlm_r │ ├── Plot Result Trip XLMR.ipynb │ ├── final_trip_advisor_yelp_XLM_R_result_combined_1.csv │ ├── final_trip_advisor_yelp_XLM_R_result_combined_2.csv │ ├── gains.csv │ ├── plot-trip-xlmr-english.png │ └── plot.png └── src ├── README.md ├── extract-feature.py ├── load-data.py ├── model-full.py └── model-head.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | One sentence summary: 2 | > You can use English text data to improve Indonesian text classification performance using a multilingual language model. 3 | 4 | # Indonesian Text Classification Multilingual 5 | 6 | This repository is my final year undergraduate project on Institut Teknologi Bandung, supervised by [Dr. Eng. Ayu Purwarianti, ST.,MT](https://scholar.google.co.id/citations?user=8jUro_cAAAAJ&hl=en). It contains the unpolished source codes (`.py` on [/src](./src/) and `.ipynb` on [/notebooks](./)), [book](./docs/book) (Indonesian), and paper (English). 
7 | 8 | * Book title: 9 | **Klasifikasi Teks Berbahasa Indonesia Menggunakan Multilingual Language Model (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)** 10 | 11 | * Paper title ([Arxiv](https://arxiv.org/abs/2009.05713)): 12 | **Improving Indonesian Text Classification Using Multilingual Language Model** 13 | 14 | 15 | ## Project Organization 16 | ------------ 17 | ├── README.md <- The top-level README 18 | ├── data <- Contain information regarding the data 19 | ├── docs 20 | | ├── book <- Latex source for the book 21 | | └── paper <- Microsoft word source for the paper 22 | | 23 | ├── notebooks <- The *.ipynb jupyter notebooks 24 | | ├── fine_tune_full <- Notebooks for full finetune experiments 25 | | ├── fine_tune_head <- Notebooks for feature-based experiments 26 | | └── result_analysis <- Notebooks analyzing and producing figures 27 | | 28 | └── src <- The *.py source code 29 | 30 | ------------ 31 | 32 | 33 | ## Abstract 34 | 35 | Compared to English, the amount of labeled data for Indonesian text classification tasks is very small. Recently developed multilingual language models have shown its ability to create multilingual representations effectively. This paper investigates the effect of combining English and Indonesian data on building Indonesian text classification (e.g., sentiment analysis and hate speech) using multilingual language models. Using the feature-based approach, we observe its performance on various data sizes and total added English data. The experiment showed that the addition of English data, especially if the amount of Indonesian data is small, improves performance. Using the fine-tuning approach, we further showed its effectiveness in utilizing the English language to build Indonesian text classification models. 36 | 37 | ## Experiments 38 | 39 | The experiments consist of two multilingual language model (mBERT [1] & XLM-R [2]), three training data scenarios, two training approaches, and five datasets. 
Every experiment was run on [Kaggle](https://www.kaggle.com/) kernel. You can find the link to every Kaggle's kernel & datasets on each directory. 40 | 41 | #### A. Training Data Scenarios 42 | We investigate the model performance in three different scenarios. Each differs by the combination of the language used in its training data: monolingual, zero-shot, and multilingual. In the monolingual scenario, we use the Indonesian language text to train and validate the model. In the zero-shot scenario, we use the English language text to train the model while being validated on Indonesian text. Lastly, we use a combination of Indonesian and English text to train the model while being validated on Indonesian text in the multilingual scenario. Using these scenarios, we observe the improvement of the added English text. 43 | 44 | #### B. Training Approaches 45 | There are two approaches on applying large pre-trained language representation to downstream tasks: feature-based and fine-tuning [1]. On the feature-based approach, we extract fixed features from the pre-trained model. In this experiment, we use the last hidden state, which is 768 for mBERT and 1024 for XLM-R Large, as the feature. This extracted feature is then fed into a single dense layer, the only layer we trained on the feature-based approach, connected with dropout before finally ending on a sigmoid function. In contrast, the finetuning approach trains all the language model parameters, 110M for mBERT and 550M for XLM-R Large, including the last dense layer, on the training data binary cross-entropy loss. 46 | 47 | Using the feature-based scenario, we run many experiments as the expensive and multilingual representation have been precomputed on all the data. In all training data scenarios, we vary the total data used. More specifically, we train the model using [500, 1000, 2500, 5000, 7500, Max] text data. 
Specific to multilingual training data scenario, we vary the amount of added English data by [0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 6, 7, 8, 9, 10] times the amount of Indonesian text data. We refer to a multilingual experiment with added English data N times the amount of Indonesian text data as multilingual(N). 48 | 49 | In contrast to the feature-based scenarios, fine-tuning the full language model is expensive and resource-intensive. However, as shown in [1], fully fine-tuning the full language model will result in a better text classifier. We fine-tuned the best performing model on the feature-based scenarios. The experiment was reduced to only using the maximum total data and an added English data multiplier up to 3. 50 | 51 | #### C. Datasets 52 | More details on the book and paper. Quick summary: 53 | * Indonesian: 54 | * Sentiment Analysis 1: [(Farhan & Khodra, 2017) [3]](https://www.researchgate.net/publication/320832619_Sentiment-specific_word_embedding_for_Indonesian_sentiment_analysis) 55 | * Sentiment Analysis 2: [(Crisdayanti & Purwarianti, 2019) [4]](https://ieeexplore.ieee.org/abstract/document/8904199/) 56 | * Hate-speech and Abusive: [(Ibrohim & Budi, 2019) [5]](https://www.aclweb.org/anthology/W19-3506.pdf) 57 | 58 | * English: 59 | * Sentiment Analysis: [Yelp Review Sentiment Dataset](https://www.kaggle.com/ilhamfp31/yelp-review-dataset) 60 | * Toxic Comment: [Jigsaw Toxic Comment](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data) 61 | 62 | 63 | #### D. Training Reproducibility and Hyperparameters 64 | We split the training data into training and validation set with a 90:10 ratio. The split was done in a stratified fashion, conserving the distribution of labels between the training & validation set. The result is a dataset separated into training, validation, and test sets. 65 | 66 | Each experiment will train the model using the training set and validate it to the validation set on each epoch. 
After each epoch, we will evaluate whether we will continue, reduce the learning rate, or stop the training process based on validation set performance and the hyperparameter set on each condition. In the end, we use the model from the best performing epoch based on its validation performance to predict the test set. 67 | 68 | On the feature-based experiment, we set the final layer dropout probability to 0.2, the learning rate reducer patience to 5, and the early stopping patience to 12. On full fine-tune experiment, we set the final layer dropout probability to 0.2, the learning rate reducer patience to 0, and the early stopping patience to 4. Every validation and prediction use 0.5 as its label threshold. 69 | 70 | To ensure reproducibility, we set every random seed possible on each experiment. On the feature-based experiment, we average the result of 6 different runs by varying the seed from 1-6. Running the same experiment on the feature-based approach will result in the same final score. On the full fine-tune experiment, we only run one experiment. While the result should not differ substantially, [the exact reproducibility cannot be guaranteed as the training was done on a TPU](https://suneeta-mall.github.io/2019/12/22/Reproducible-ml-tensorflow.html). 71 | 72 | ## Result 73 | #### A. Feature-based experiment 74 |

xlm-r-1-result xlm-r-1-result xlm-r-1-result

75 |

Fig. 1. Feature-based experiment result with XLM-R on [3] (left), [4] (middle), and [5] (right)

76 | 77 | 78 | The result of feature-based experiments with XLM-R model on all datasets can be seen in Fig 1. Through this result, we can see that adding English data can help the performance of the model. On [3] & [4] dataset, adding English data consistently improves the performance. But on [5] dataset, there's a point where the added English data results in worse performance. We hypothesize this is due to the large difference in what constitutes hate-speech (or toxic by Jigsaw dataset) between the datasets used. 79 | 80 | 81 |

xlm-r-1-result xlm-r-1-result xlm-r-1-result

82 |

Fig. 2. Feature-based experiment result with mBERT on [3] (left), [4] (middle), and [5] (right)

83 | 84 | The result of feature-based experiments with mBERT model on all datasets can be seen in Fig 2. The same phenomenon is observed on mBERT based experiment, although the performance is substantially lower. This is expected as XLM-R is designed to improve mBERT on various design choices. 85 | 86 | Defining the gain as the difference between monolingual and its highest multilingual performance, Table I shows the gains averaged on all datasets across total data and model. The highest gain can be seen on the lowest amount of total data used, 500, with F1-score gain of 0.176 using XLM-R model and 0.129 using mBERT model. The results suggest that the lower the amount of data used; the more gains yield by adding English data to the training set. 87 | 88 |

Table I. Average F1-Score Gains

89 |

xlm-r-1-result

90 | 91 | 92 | #### B. Full fine-tune experiment 93 | The result of fully fine-tuning all parameters, in addition to utilizing English data, proved to be effective in building a better Indonesian text classification model. On [3] dataset, the highest performance achieved on the zero-shot scenario where it yielded 0.893 F1-score, improving the previous works of 0.834. On [4] dataset, the highest performance achieved on multilingual(1.5) scenario where it yielded perfect F1-score, improving the previous works of 0,9369. On [5] dataset, the highest performance achieved on multilingual(3) scenario where it yielded 0.898 F1-score and 89.9% accuracy. To provide a fair comparison with the previous work by Ibrohim & Budi [5], we also ran the experiment using the original label and monolingual scenario. The experiment yielded 89.52% average accuracy, improving the previous works of 77.36%. 94 | 95 | ## References 96 | Research mentioned in this README: 97 | [1] J. Devlin et al. “BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding”. In: arXiv:1810.04805 [cs] (2019). arXiv: 1810.04805. URL: http://arxiv.org/abs/1810.04805. 98 | [2] A. Conneau et al. “Unsupervised Cross-lingual Representation Learning at Scale”. In: arXiv:1911.02116 [cs] (2020). arXiv: 1911.02116. URL: http://arxiv.org/abs/1911.02116. 99 | [3] A. N. Farhan & M. L. Khodra. “Sentiment-specific word embedding for Indonesian sentiment analysis”. In: 2017 International Conference on Advanced Informatics, Concepts, Theory, and Applications (ICAICTA). 2017, 1–5. DOI: 10.1109/ICAICTA.2017.8090964. 100 | [4] I. A.P. A. Crisdayanti & A. Purwarianti. “Improving Bi-LSTM Performance for Indonesian Sentiment Analysis Using Paragraph Vector”. In: (2019). 101 | [5] M. O. Ibrohim & I. Budi. “Multi-label Hate Speech and Abusive Language Detection in Indonesian Twitter”. In: Proceedings of the Third Workshop on Abusive Language Online. Association for Computational Linguistics, 2019, 46–57. DOI: 10 . 
18653 / v1 / W19 - 3506. URL: https://www.aclweb.org/anthology/W19-3506. 102 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Dataset Source 2 | Here's the link to the dataset used in this project. 3 | 4 | ## Indonesian 5 | * Sentiment Analysis 1: [(Farhan & Khodra, 2017)](https://www.kaggle.com/ilhamfp31/dataset-tripadvisor) 6 | * Sentiment Analysis 2: [(Crisdayanti & Purwarianti, 2019)](https://www.kaggle.com/ilhamfp31/dataset-prosa/) 7 | * Hate-speech and Abusive (Ibrohim & Budi, 2019): 8 | * Original dataset link: https://github.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection 9 | * Kaggle port of the original dataset: https://www.kaggle.com/ilhamfp31/indonesian-abusive-and-hate-speech-twitter-text 10 | * [Here's the kernel to my preprocessing process](https://www.kaggle.com/ilhamfp31/simpler-preprocess-indonesian-hate-abusive-text) 11 | 12 | 13 | 14 | ## English 15 | * Sentiment Analysis: [Yelp Review Sentiment Dataset](https://www.kaggle.com/ilhamfp31/yelp-review-dataset) 16 | * Toxic Comment: [Jigsaw Toxic Comment](https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data) -------------------------------------------------------------------------------- /docs/book/.gitignore: -------------------------------------------------------------------------------- 1 | *.aux 2 | *.toc 3 | *.tex~ 4 | *.log 5 | *.lof 6 | *.idx 7 | *.bbl 8 | *.blg 9 | *.lot 10 | *.ilg 11 | *.lol 12 | *.out 13 | *.ind 14 | *.backup 15 | *.synctex.gz 16 | texmf 17 | thesis.pdf 18 | .directory 19 | output/* 20 | build/* 21 | *.pth 22 | codes_xnli_100.txt 23 | notebook/tools/mosesdecoder/* 24 | __pycache__/* -------------------------------------------------------------------------------- /docs/book/.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 
| dist: trusty 3 | 4 | before_install: 5 | - sudo apt-get update 6 | 7 | install: 8 | - sudo apt-get install -y texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended 9 | - sudo apt-get install -y texlive-bibtex-extra biber xzdec texlive-lang-other 10 | - sudo apt-get install -y latexmk 11 | - tex --version 12 | - pdflatex --version 13 | 14 | script: 15 | - make install 16 | -------------------------------------------------------------------------------- /docs/book/Makefile: -------------------------------------------------------------------------------- 1 | all: clean install 2 | 3 | install: 4 | mkdir -p output 5 | mkdir -p build 6 | cd src && latexmk -pdf -bibtex -outdir=../build thesis.tex 7 | mv build/thesis.pdf output/ta.pdf 8 | 9 | clean: 10 | rm -f output/* build/* 11 | find . -iname "*~" -exec rm '{}' ';' 12 | -------------------------------------------------------------------------------- /docs/book/README.md: -------------------------------------------------------------------------------- 1 | Templat LaTeX Tesis Informatika ITB 2 | =================================== 3 | oleh: Petra Novandi 4 | 5 | Dokumen ini merupakan templat LaTeX yang ditujukan untuk laporan 6 | tesis di program studi Teknik Informatika ITB. Templat ini penulis 7 | gunakan dalam penulisan laporan tesis penulis dan dengan semangat 8 | berbagi penulis memutuskan untuk mempublikasikan templat ini agar 9 | dapat digunakan oleh banyak orang. 10 | 11 | Silakan mengunduh, menggunakan, memodifikasi, dan menyebarkan 12 | templat ini. :) 13 | 14 | 15 | Kebutuhan 16 | --------- 17 | 18 | Program telah diuji dalam sistem operasi Linux Ubuntu 18.04. Untuk melakukan instalasi 19 | perangkat lunak yang dibutuhkan, eksekusi perintah berikut. 
20 | 21 | ``` 22 | sudo apt-get -qq update && sudo apt-get install -y --no-install-recommends \ 23 | texlive-fonts-recommended texlive-latex-extra texlive-fonts-extra \ 24 | dvipng texlive-latex-recommended \ 25 | texlive-bibtex-extra biber xzdec 26 | ``` 27 | 28 | For latexmk package 29 | ``` 30 | sudo apt install latexmk 31 | ``` 32 | 33 | For bahasa package 34 | ``` 35 | sudo apt-get install texlive-lang-other 36 | ``` 37 | 38 | For russian package 39 | ``` 40 | sudo apt-get install texlive-lang-cyrillic 41 | ``` 42 | 43 | For japanese package 44 | ``` 45 | sudo apt-get install latex-cjk-all 46 | ``` 47 | 48 | Penggunaan 49 | ---------- 50 | 51 | Templat ini telah dilengkapi oleh skrip untuk melakukan kompilasi 52 | Makefile. Untuk melakukan kompilasi cukup eksekusi perintah berikut 53 | 54 | ``` 55 | make 56 | ``` 57 | 58 | Hasil kompilasi akan berada pada berkas `output/tesis.pdf`. 59 | 60 | Kontribusi 61 | ---------- 62 | 63 | Templat ini dapat digunakan secara gratis, akan tetapi penulis sangat 64 | berharap adanya kritik serta saran dari pengguna untuk meningkatkan 65 | kualitas hasil dan penggunaan templat ini. 66 | 67 | Kritik dan saran tersebut dapat dikirim melalui URL 68 | . 69 | 70 | Terima Kasih 71 | ----------- 72 | 73 | * Steven Lolong atas pemberian templat LaTeX yang asli. 74 | * Peb Ruswono Aryan atas bantuan pelengkapan struktur dokumen. 
75 | -------------------------------------------------------------------------------- /docs/book/make.bat: -------------------------------------------------------------------------------- 1 | REM clean 2 | rmdir /S /Q build 3 | rmdir /S /Q output 4 | 5 | REM install 6 | mkdir -p output 7 | mkdir -p build 8 | latexmk -pdf -bibtex -outdir=../build -cd src/thesis.tex 9 | move build\thesis.pdf output 10 | -------------------------------------------------------------------------------- /docs/book/src/chapters/abstract-en.tex: -------------------------------------------------------------------------------- 1 | \clearpage 2 | \chapter*{Abstract} 3 | \addcontentsline{toc}{chapter}{Abstract} 4 | 5 | %put your abstract here 6 | \blindtext 7 | 8 | \clearpage -------------------------------------------------------------------------------- /docs/book/src/chapters/abstract-id.tex: -------------------------------------------------------------------------------- 1 | \clearpage 2 | \chapter*{ABSTRAK} 3 | \addcontentsline{toc}{chapter}{Abstrak} 4 | \begin{center} 5 | \large \bfseries \MakeUppercase{Klasifikasi Teks Berbahasa Indonesia Menggunakan \textit{Multilingual Language Model (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)}} 6 | 7 | \normalsize \normalfont{Oleh\\ 8 | ILHAM FIRDAUSI PUTRA\\ 9 | NIM : 13516140 10 | } 11 | \end{center} 12 | 13 | %taruh abstrak bahasa indonesia di sini 14 | 15 | Klasifikasi teks adalah proses memprediksi kategori tertentu dari sebuah teks. Contoh kategori adalah nilai sentimen atau status ujaran kebencian. Teknik klasifikasi teks \textit{state-of-the-art} saat ini menggunakan deep learning yang memerlukan data latih dalam ukuran besar. Bersamaan dengan itu, perkembangan dalam bidang representasi teks telah memungkinkan teks dari berbagai bahasa direpresentasikan dalam satu bidang yang sama menggunakan \textit{multilingual language model}. 
Dua diantaranya adalah MultilingualBERT \parencite{Devlin_Chang_Lee_Toutanova_2019} dan XLM-R\parencite{Conneau_XLMR}. 16 | 17 | Dengan memanfaatkan MultilingualBERT dan XLM-R, representasi teks antar bahasa dapat digunakan untuk membangun model klasifikasi bahasa Indonesia dengan kombinasi data bahasa Indonesia dan bahasa Inggris. Tugas akhir ini memanfaatkan hal tersebut untuk membangun model klasifikasi teks bahasa Indonesia yang meningkatkan performa hasil penelitian \parencite{FarhanKhodra2017} \& \parencite{CrisdayantiPurwarianti2019} mengenai analisis sentimen dan versi biner penelitian \parencite{Ibrohim_Budi_2019} mengenai ujaran kebencian \& kasar. Eksperimen dilakukan dengan memvariasikan jumlah data bahasa Indonesia, jumlah data bahasa Inggris, dan teknik \textit{fine-tuning}. 18 | 19 | Hasil eksperimen menunjukkan XLM-R berhasil meningkatkan hasil analisis sentimen pada dataset penelitian \parencite{FarhanKhodra2017} dari F1-score 0,8341 ke 0,893; penelitian \parencite{CrisdayantiPurwarianti2019} dari F1-score 0,9369 ke 1; dan penelitian \parencite{Ibrohim_Budi_2019} dari rata-rata akurasi 77.36\% ke 89.52\%. Meski ada kasus dimana penambahan data bahasa Inggris berlebih menurunkan performa klasifikasi yang harus dianalisa lebih lanjut, hasil eksperimen menunjukkan bahwa penambahan dataset bahasa Inggris, terutama jika data bahasa Indonesia sedikit, dapat membantu meningkatkan performa klasifikasi teks bahasa Indonesia menggunakan model XLM-R. 
20 | 21 | \textbf{Kata kunci:} \textit{multilingual language model}, analisis sentimen, klasifikasi ujaran kebencian 22 | \clearpage -------------------------------------------------------------------------------- /docs/book/src/chapters/appendix-1.tex: -------------------------------------------------------------------------------- 1 | \chapter{Algoritma \textit{Byte Pair Encoding} Sederhana} 2 | \label{appendix:simple_bpe_algorithm} 3 | 4 | Algoritma (contoh nama file: \(bpe.py\)): 5 | \begin{lstlisting}[language=Python] 6 | import re, collections 7 | def get_stats(vocab): 8 | pairs = collections.defaultdict(int) 9 | for word, freq in vocab.items(): 10 | symbols = word.split() 11 | for i in range(len(symbols)-1): 12 | pairs[symbols[i],symbols[i+1]] += freq 13 | return pairs 14 | 15 | def merge_vocab(pair, v_in): 16 | v_out = {} 17 | bigram = re.escape(' '.join(pair)) 18 | p = re.compile(r'(?' : 5, 'l o w e r ' : 2, 25 | 'n e w e s t ':6, 'w i d e s t ':3} 26 | vocab_test = {'l o w e s t ': 1} 27 | 28 | num_merges = 10 29 | for i in range(num_merges): 30 | pairs = get_stats(vocab) 31 | best = max(pairs, key=pairs.get) 32 | print('~~~') 33 | vocab = merge_vocab(best, vocab) 34 | vocab_test = merge_vocab(best, vocab_test) 35 | print("best: ", best) 36 | print("vocab: ", vocab) 37 | print("vocab_test: ", vocab_test) 38 | \end{lstlisting} 39 | 40 | Setelah dijalankan di mesin bersistem operasi Ubuntu 18.04 dengan perintah 41 | \begin{lstlisting}[language=bash] 42 | $ python3 bpe.py 43 | \end{lstlisting} 44 | 45 | akan didapatkan keluaran sebagai berikut: 46 | \begin{lstlisting}[language=bash] 47 | ~~~ 48 | best: ('e', 's') 49 | vocab: {'l o w ': 5, 'l o w e r ': 2, 'n e w es t ': 6, 'w i d es t ': 3} 50 | vocab_test: {'l o w es t ': 1} 51 | ~~~ 52 | best: ('es', 't') 53 | vocab: {'l o w ': 5, 'l o w e r ': 2, 'n e w est ': 6, 'w i d est ': 3} 54 | vocab_test: {'l o w est ': 1} 55 | ~~~ 56 | best: ('est', '') 57 | vocab: {'l o w ': 5, 'l o w e r ': 2, 'n e w est': 6, 
'w i d est</w>': 3} 58 | vocab_test: {'l o w est</w>': 1} 59 | ~~~ 60 | best: ('l', 'o') 61 | vocab: {'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3} 62 | vocab_test: {'lo w est</w>': 1} 63 | ~~~ 64 | best: ('lo', 'w') 65 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3} 66 | vocab_test: {'low est</w>': 1} 67 | ~~~ 68 | best: ('n', 'e') 69 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3} 70 | vocab_test: {'low est</w>': 1} 71 | ~~~ 72 | best: ('ne', 'w') 73 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3} 74 | vocab_test: {'low est</w>': 1} 75 | ~~~ 76 | best: ('new', 'est</w>') 77 | vocab: {'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3} 78 | vocab_test: {'low est</w>': 1} 79 | ~~~ 80 | best: ('low', '</w>') 81 | vocab: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3} 82 | vocab_test: {'low est</w>': 1} 83 | ~~~ 84 | best: ('w', 'i') 85 | vocab: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3} 86 | vocab_test: {'low est</w>': 1} 87 | \end{lstlisting} -------------------------------------------------------------------------------- /docs/book/src/chapters/appendix-2.tex: -------------------------------------------------------------------------------- 1 | \chapter{Rincian Kasus Uji} -------------------------------------------------------------------------------- /docs/book/src/chapters/appendix-3.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/chapters/appendix-3.tex -------------------------------------------------------------------------------- /docs/book/src/chapters/approval.tex: -------------------------------------------------------------------------------- 1 | \clearpage 2 | \pagestyle{empty} 3 | \newgeometry{top=3.5cm,bottom=2.5cm,left=3cm,right=2cm} 4 | \begin{center} 5 | \smallskip 6 | 7 | \Large \bfseries \MakeUppercase{\thetitle} 8 | \vfill
9 | 10 | \Large Laporan Tugas Akhir 11 | \vfill 12 | 13 | \large Oleh 14 | 15 | \Large \theauthor 16 | 17 | \large Program Studi Teknik Informatika \\ 18 | \normalsize \normalfont Sekolah Teknik Elektro dan Informatika \\ 19 | Institut Teknologi Bandung \\ 20 | 21 | \vfill 22 | \normalsize \normalfont{ 23 | 24 | } 25 | 26 | Telah disetujui dan disahkan sebagai Laporan Tugas Akhir \\ 27 | di Bandung, pada tanggal 22 Juni 2020. 28 | 29 | \vfill 30 | \normalsize \normalfont 31 | Pembimbing\\ 32 | % \begin{figure}[!h] 33 | % \centering 34 | % \includegraphics[width=0.2\textwidth]{resources/tandatangan_bu_ayu.png} 35 | % \end{figure} 36 | \vfill 37 | \underline{Dr. Eng. Ayu Purwarianti, ST.,MT.} \\ 38 | NIP 19770127 200801 2 011 39 | 40 | % \begin{tabular}{c@{\hskip 0.5in}c} 41 | % Pembimbing I, & Pembimbing II \\ 42 | % & \\ 43 | % & \\ 44 | % & \\ 45 | % & \\ 46 | % Dr. Eng. Ayu Purwarianti, ST.,MT. & Nama dan Gelar Pembimbing II \\ 47 | % NIP 19770127 200801 2 011 & NIP 123456789 \\ 48 | % \end{tabular} 49 | 50 | \end{center} 51 | \restoregeometry 52 | \clearpage 53 | -------------------------------------------------------------------------------- /docs/book/src/chapters/chapter-5.tex: -------------------------------------------------------------------------------- 1 | \chapter{Kesimpulan dan Saran} 2 | 3 | \section{Kesimpulan} 4 | Berikut beberapa kesimpulan yang dapat ditarik dari tugas akhir ini: 5 | \begin{enumerate} 6 | \item Penambahan dataset bahasa Inggris, terutama jika data bahasa Indonesia sedikit, dapat membantu meningkatkan performa klasifikasi teks bahasa Indonesia menggunakan \textit{multilingual language model} XLM-R. Dapat dilihat pada Tabel \ref{tab:gain_conclusion}, jumlah peningkatan performa naik dengan semakin sedikit total data. Hanya saja ada kasus dimana penambahan data bahasa Inggris berlebih menurunkan performa klasifikasi. Analisis lebih lanjut mengenai kenapa hal ini terjadi dan apa solusinya diperlukan. 
7 | % Please add the following required packages to your document preamble: 8 | % \usepackage{multirow} 9 | \begin{table}[] 10 | \centering 11 | \caption{Rangkuman peningkatan performa} 12 | \begin{tabular}{|l|r|r|} 13 | \hline 14 | \multicolumn{1}{|c|}{\multirow{2}{*}{\textbf{\begin{tabular}[c]{@{}c@{}}Total\\ Data\end{tabular}}}} & \multicolumn{2}{c|}{\textbf{Rata-rata peningkatan}} \\ \cline{2-3} 15 | \multicolumn{1}{|c|}{} & \multicolumn{1}{c|}{\textbf{XLM-R}} & \multicolumn{1}{c|}{\textbf{mBERT}} \\ \hline 16 | 500 & 0.176221 & 0.129394 \\ \hline 17 | 1000 & 0.165718 & 0.109215 \\ \hline 18 | 2500 & 0.118456 & 0.051226 \\ \hline 19 | 5000 & 0.095780 & 0.029564 \\ \hline 20 | 7500 & 0.086930 & 0.028043 \\ \hline 21 | 10000 & 0.077875 & 0.020184 \\ \hline 22 | \end{tabular} 23 | \label{tab:gain_conclusion} 24 | \end{table} 25 | \item Penambahan dataset bahasa Inggris dapat membantu model mendapatkan informasi baru dalam bahasa Indonesia. Hal ini dapat dilihat dari analisis hasil peningkatan performa pada analisis sentimen dataset B. Model yang sebelumnya gagal memprediksi teks bahasa Indonesia, berhasil setelah ditambahkan data bahasa Inggris. 26 | \item Penggunaan \textit{multilingual language model} yang di \textit{fine-tune} sepenuhnya sangat efektif dalam klasifikasi teks bahasa Indonesia. 27 | \begin{enumerate} 28 | \item Pada eksperimen sentimen analisis dataset A, model mendapatkan F1-score 0,893. Sebuah peningkatan dari penelitian sebelumnya yang mendapatkan F1-score 0,8521. 29 | \item Pada eksperimen sentimen analisis dataset B, model mendapatkan F1-score sempurna. Sebuah peningkatan absolut dari penelitian sebelumnya yang mendapatkan F1-score 0,9369. 30 | \item Pada eksperimen klasifikasi ujaran kebencian, model mendapatkan F1-score 0.898 dan akurasi 89.9\%. Penelitian sebelumnya yang menggunakan 3 label, bukan yang disimplifikasi menjadi 2 seperti di penelitian ini, mendapatkan rata-rata akurasi tertinggi 77.36\%. 
Agar dapat dibandingkan, eksperimen dijalankan dengan konfigurasi label yang sama dan didapatkan rata-rata akurasi 89.52\% yang merupakan peningkatan dari penelitian sebelumnya. 31 | \end{enumerate} 32 | 33 | \end{enumerate} 34 | 35 | \section{Saran} 36 | Berikut beberapa saran yang dapat digunakan untuk memperbaiki, memperbaharui, atau mengembangkan hasil tugas akhir ini: 37 | \begin{enumerate} 38 | \item Jauhnya perbedaan, baik dari domain, bahasa, atau faktor lainnya yang mempengaruhi pengumpulan data, antara dataset dapat menyebabkan memburuknya performa \textit{multilingual learning}. Untuk penelitian selanjutnya, dapat dicoba beberapa cara untuk mengatasi hal ini. Beberapa di antaranya adalah seperti penelitian \parencite{Lai_Oguz_Yang_Stoyanov_2019} yang menggunakan \textit{universal data augmentation} untuk mengurangi perbedaan tadi pada saat pembelajaran dilakukan. 39 | \item Turunnya performa pada \textit{multilingual learning} permasalahan klasifikasi ujaran kebencian masih harus diteliti lebih lanjut. Perlu dianalisis lebih dalam lagi apakah hal ini dikarenakan teknik fine-tuningnya atau perbedaan domain yang melekat dalam dataset. Penelitian \parencite{Peters_Ruder_Smith_2019} meneliti perbedaan antara dua teknik fine-tuning yang dicoba pada tugas akhir ini dan mencoba menganalisis kemiripan datasetnya. Hal tersebut dapat dijadikan pedoman dalam menganalisis lebih lanjut fenomena yang diobservasi pada tugas akhir ini. 40 | \item Tugas akhir ini sudah membuktikan efektifnya penggunaan \textit{language model} dalam berbagai permasalahan klasifikasi teks bahasa Indonesia. Penelitian \parencite{Conneau_XLMR} telah mengobservasi turunnya performa model secara general dengan ditambahnya bahasa dalam pelatihan \textit{multilingual language model}. Sampai saat tugas akhir ini ditulis, belum terdapat \textit{language model} spesifik yang dilatih secara masif dalam bahasa Indonesia.
Hal ini dapat dicoba dan dibandingkan performanya dengan \textit{multilingual language model} yang dilatih dalam berbagai bahasa. 41 | \end{enumerate} -------------------------------------------------------------------------------- /docs/book/src/chapters/cover.tex: -------------------------------------------------------------------------------- 1 | \clearpage 2 | \pagestyle{empty} 3 | \newgeometry{top=3.5cm,bottom=2.5cm,left=3cm,right=2cm} 4 | \begin{center} 5 | \smallskip 6 | 7 | \Large \bfseries \MakeUppercase{\thetitle} 8 | \vfill 9 | 10 | \Large Laporan Tugas Akhir 11 | \vfill 12 | 13 | \large Disusun sebagai syarat kelulusan tingkat sarjana 14 | \vfill 15 | 16 | \large Oleh 17 | 18 | \Large \theauthor 19 | 20 | \vfill 21 | \begin{figure}[h] 22 | \centering 23 | \includegraphics[width=0.2\textwidth]{resources/cover-ganesha.jpg} 24 | \end{figure} 25 | \vfill 26 | 27 | \large 28 | \uppercase{ 29 | Program Studi Teknik Informatika \\ 30 | Sekolah Teknik Elektro dan Informatika \\ 31 | Institut Teknologi Bandung 32 | } 33 | 34 | Juni 2020 35 | 36 | \end{center} 37 | \restoregeometry 38 | \clearpage 39 | -------------------------------------------------------------------------------- /docs/book/src/chapters/daftar_istilah.tex: -------------------------------------------------------------------------------- 1 | % \chapter*{Daftar Istilah} 2 | \clearpage 3 | \begin{center} 4 | \smallskip 5 | \large \bfseries{Daftar Istilah} 6 | 7 | \begin{table}[h] 8 | \begin{tabularx}{\textwidth}{|l|X|} 9 | \textbf{Dataset} & Kumpulan data yang digunakan untuk melakukan pelatihan, validasi, maupun evaluasi \\ 10 | \textbf{Model} & Representasi matematika yang didapat dari hasil pembelajaran menggunakan data latih \\ 11 | \textbf{Baseline} & Performa model atau model yang dijadikan acuan dasar \\ 12 | \textbf{Arsitektur} & Struktur model yang terdiri dari berbagai macam rule, fungsionalitas, dan implementasi \\ 13 | \textbf{Transformer} & Arsitektur yang pertama kali dideskripsikan 
oleh (Vaswani et al., 2017) untuk memodelkan sekuens. \\ 14 | \textbf{LearningRate} & Besar perubahan yang dilakukan ke model pada setiap iterasi pembelajaran \\ 15 | \textbf{Callback} & Fungsi yang melekat pada fase pembelajaran model \\ 16 | \textbf{EarlyStopping} & Callback yang akan memberhentikan pembelajaran ketika kondisi yang ditentukan telah dipenuhi \\ 17 | \textbf{ReduceLROnPlateau} & Callback yang akan menurunkan besar LearningRate ketika model sudah tidak belajar lagi berdasarkan kondisi yang ditentukan. \\ 18 | \textbf{Fine-tune} & Proses melatih kembali model ke permasalahan spesifik dari model yang sebelumnya sudah dilatih pada data umum \\ 19 | \end{tabularx} 20 | \end{table} 21 | \end{center} 22 | \clearpage 23 | -------------------------------------------------------------------------------- /docs/book/src/chapters/forewords.tex: -------------------------------------------------------------------------------- 1 | \chapter*{Kata Pengantar} 2 | \addcontentsline{toc}{chapter}{Kata Pengantar} 3 | 4 | Puji syukur penulis panjatkan ke hadirat Tuhan Yang Maha Kuasa karena atas berkat dan karunia-Nya, penulis dapat menyelesaikan tugas akhir yang berjudul “Klasifikasi Teks Berbahasa Indonesia Menggunakan \textit{Multilingual Language Model (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)}” untuk memenuhi syarat kelulusan tingkat sarjana. Penulis juga ingin mengucapkan terima kasih kepada pihak-pihak yang telah membantu dan mendukung penulis selama pengerjaan tugas akhir ini: 5 | 6 | \begin{enumerate} 7 | \item Ibu Dr. Eng. Ayu Purwarianti, ST.,MT., selaku dosen pembimbing yang telah memberikan arahan, nasehat, dan dukungan selama pengerjaan tugas akhir. 8 | \item Ibu Fariska Zakhralativa Ruskanda S.T., M.T., dan Ibu Dr. Masayu Leylia Khodra, ST., MT. selaku dosen penguji yang telah memberikan evaluasi dan saran kepada penulis. 9 | \item Ibu Dessi Puji Lestari S.T.,M.Eng.,Ph.D, Ibu Dr.
Fazat Nur Azizah S.T., M.Sc., dan Bapak Nugraha Priya Utama, Ph.D. selaku dosen mata kuliah IF4091 Tugas Akhir I K01 dan IF4092 Tugas Akhir II yang telah memberi arahan selama pelaksanaan tugas akhir ini. 10 | \item Ibu Dra. Harlili M.Sc. selaku dosen wali yang telah memberikan arahan, nasehat, dan dukungan selama empat tahun berkuliah di program studi Teknik Informatika ITB. 11 | \item Keluarga penulis yang selalu mendukung dan memotivasi penulis untuk tetap semangat dalam kuliah hingga menyelesaikan tugas akhir. 12 | \item Seluruh staf pengajar yang belum disebutkan dari program studi Teknik Informatika yang telah membekali penulis dengan ilmu dan wawasan untuk mendukung pengerjaan tugas akhir. 13 | \item Staf Tata Usaha program studi Teknik Informatika yang telah membantu selama perkuliahan khususnya dalam proses administrasi tugas akhir. 14 | \item Teman-teman penulis yang telah mendukung serta menemani perjalanan kuliah dan pengerjaan tugas akhir ini. 15 | 16 | 17 | \end{enumerate} 18 | Akhir kata, terima kasih banyak kepada semua pihak yang telah secara langsung maupun tidak langsung membantu penyelesaian tugas akhir ini. Penulis berharap tugas akhir ini dapat bermanfaat bagi para pembaca. Penulis juga menyadari bahwa tugas akhir ini tidaklah sempurna. Oleh karena itu, penulis sangat terbuka terhadap kritik dan saran yang membangun terkait tugas akhir ini. 
19 | 20 | \begin{flushright} 21 | Bandung, 22 Juni 2020 \\ 22 | % \begin{figure}[!h] 23 | % \raggedleft 24 | % \includegraphics[width=0.2\textwidth]{resources/tandatangan.png} 25 | % \end{figure} 26 | Penulis 27 | \end{flushright} 28 | \clearpage 29 | -------------------------------------------------------------------------------- /docs/book/src/chapters/instruction.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/chapters/instruction.tex -------------------------------------------------------------------------------- /docs/book/src/chapters/statement.tex: -------------------------------------------------------------------------------- 1 | \clearpage 2 | 3 | \chapter*{Lembar Pernyataan} 4 | 5 | Dengan ini saya menyatakan bahwa: 6 | 7 | \begin{enumerate} 8 | 9 | \item Pengerjaan dan penulisan Laporan Tugas Akhir ini dilakukan tanpa menggunakan bantuan yang tidak dibenarkan. 10 | \item Segala bentuk kutipan dan acuan terhadap tulisan orang lain yang digunakan di dalam penyusunan laporan tugas akhir ini telah dituliskan dengan baik dan benar. 11 | \item Laporan Tugas Akhir ini belum pernah diajukan pada program pendidikan di perguruan tinggi mana pun. 12 | 13 | \end{enumerate} 14 | 15 | Jika terbukti melanggar hal-hal di atas, saya bersedia dikenakan sanksi sesuai dengan Peraturan Akademik dan Kemahasiswaan Institut Teknologi Bandung bagian Penegakan Norma Akademik dan Kemahasiswaan khususnya Pasal 2.1 dan Pasal 2.2. 
16 | \vspace{15mm} 17 | 18 | 19 | 20 | \begin{flushright} 21 | Bandung, 22 Juni 2020 \\ 22 | \vskip 0.5in 23 | % \begin{figure}[!h] 24 | % \raggedleft 25 | % \includegraphics[width=0.2\textwidth]{resources/tandatangan.png} 26 | % \end{figure} 27 | Ilham Firdausi Putra \\ 28 | NIM 13516140 29 | \end{flushright} 30 | 31 | \clearpage -------------------------------------------------------------------------------- /docs/book/src/config/hypenation-id.tex: -------------------------------------------------------------------------------- 1 | %--------------------------------------------------------------------% 2 | % 3 | % Hypenation untuk Bahasa Indonesia 4 | % 5 | % @author Petra Barus 6 | % 7 | %--------------------------------------------------------------------% 8 | % 9 | % Secara otomatis LaTeX dapat langsung memenggal kata dalam dokumen, 10 | % tapi sering kali terdapat kesalahan dalam pemenggalan kata. Untuk 11 | % memperbaiki kesalahan pemenggalan kata tertentu, cara pemenggalan 12 | % kata tersebut dapat ditambahkan pada dokumen ini. Pemenggalan 13 | % dilakukan dengan menambahkan karakter '-' pada suku kata yang 14 | % perlu dipisahkan. 
15 | % 16 | % Contoh pemenggalan kata 'analisa' dilakukan dengan 'a-na-li-sa' 17 | % 18 | %--------------------------------------------------------------------% 19 | 20 | \hypenation { 21 | % A 22 | % 23 | a-na-li-sa 24 | a-pli-ka-si 25 | 26 | % B 27 | % 28 | be-be-ra-pa 29 | ber-ge-rak 30 | 31 | % C 32 | % 33 | ca-ri 34 | 35 | % D 36 | % 37 | da-e-rah 38 | di-nya-ta-kan 39 | de-fi-ni-si 40 | 41 | % E 42 | % 43 | e-ner-gi 44 | eks-klu-sif 45 | 46 | % F 47 | % 48 | fa-si-li-tas 49 | 50 | % G 51 | % 52 | ga-bung-an 53 | 54 | % H 55 | % 56 | ha-lang-an 57 | 58 | % I 59 | % 60 | i-nduk 61 | 62 | % J 63 | % 64 | ka-me-ra 65 | kua-li-tas 66 | 67 | % K 68 | % 69 | 70 | % L 71 | % 72 | 73 | % M 74 | % 75 | 76 | % N 77 | % 78 | 79 | % O 80 | % 81 | 82 | % P 83 | % 84 | 85 | % Q 86 | % 87 | 88 | % R 89 | % 90 | 91 | % S 92 | % 93 | 94 | % T 95 | % 96 | 97 | % U 98 | % 99 | 100 | % V 101 | % 102 | 103 | % W 104 | % 105 | 106 | % X 107 | % 108 | 109 | % Y 110 | % 111 | 112 | % Z 113 | % 114 | } 115 | -------------------------------------------------------------------------------- /docs/book/src/config/if-itb-thesis.sty: -------------------------------------------------------------------------------- 1 | %-------------------------------------------------------------------% 2 | % 3 | % Konfigurasi dokumen LaTeX untuk laporan tesis IF ITB 4 | % 5 | % @author Ilham Firdausi Putra 6 | % 7 | %-------------------------------------------------------------------% 8 | % 9 | % Berkas ini merupakan pembaharuan dari berkas awal milik Petra Novandi dan Steven Lolong 10 | % 11 | %-------------------------------------------------------------------% 12 | 13 | % Ukuran kertas 14 | \special{papersize=210mm,297mm} 15 | 16 | % Setting margin 17 | \usepackage[top=3cm,bottom=2.5cm,left=4cm,right=2.5cm]{geometry} 18 | 19 | \usepackage{mathptmx} 20 | 21 | % Format citation 22 | \usepackage[backend=bibtex,citestyle=authoryear,sorting=nyt,firstinits=true]{biblatex} 23 | 
\renewcommand*{\nameyeardelim}{\addcomma\space} 24 | \renewcommand*\finalnamedelim{\addspace\&\space} 25 | 26 | % Anti hyphenation 27 | \tolerance=1 28 | \emergencystretch=\maxdimen 29 | \hyphenpenalty=10000 30 | \hbadness=10000 31 | 32 | % Judul bahasa Indonesia 33 | \usepackage[russian, bahasa]{babel} 34 | \usepackage[utf8]{inputenc} 35 | \usepackage{csquotes} 36 | \setquotestyle{english} 37 | \usepackage{graphicx} 38 | \usepackage{titling} 39 | \usepackage{blindtext} 40 | \usepackage{sectsty} 41 | \usepackage{chngcntr} 42 | \usepackage{etoolbox} 43 | \usepackage{hyperref} % Package untuk link di daftar isi. 44 | \usepackage{titlesec} % Package Format judul 45 | \usepackage{parskip} 46 | 47 | % Daftar Istilah 48 | \usepackage{tabularx} 49 | 50 | % confusion matrix 51 | \usepackage{multirow} 52 | 53 | % package di equation 54 | \usepackage{amsmath} 55 | \usepackage{amsfonts} 56 | 57 | % Line satu setengah spasi 58 | \renewcommand{\baselinestretch}{1.5} 59 | 60 | % Setting judul chapter 61 | \chapterfont{\centering \large} 62 | \titleformat{\chapter}[display] 63 | {\large\centering\bfseries} 64 | {\chaptertitlename\ \thechapter}{0pt} 65 | {\large\bfseries\uppercase} 66 | 67 | \titlespacing*{\chapter}{0pt}{-30pt}{40pt} 68 | \titlespacing*{\section}{0pt}{10pt}{0pt} 69 | \titlespacing*{\subsection}{0pt}{10pt}{0pt} 70 | 71 | % Setting besar font section 72 | % \newcommand{\secfnt}{\fontsize{8}{12}} 73 | % \newcommand{\ssecfnt}{\fontsize{8}{12}} 74 | 75 | % \titleformat{\section} 76 | % {\normalfont\secfnt\bfseries}{\thesection}{1em}{} 77 | 78 | % \titleformat{\subsection} 79 | % {\normalfont\ssecfnt\bfseries}{\thesubsection}{1em}{} 80 | % \titleformat*{\section}{\normalsize\bfseries} 81 | % \titleformat*{\subsection}{\normalsize\bfseries} 82 | % \sectionfont{\fontsize{8}{12}\selectfont} 83 | 84 | % Untuk nampilin kode 85 | \usepackage[utf8]{inputenc} 86 | 87 | \usepackage{listings} 88 | \usepackage{xcolor} 89 | 90 | \definecolor{codegreen}{rgb}{0,0.6,0} 91 | 
\definecolor{codegray}{rgb}{0.5,0.5,0.5} 92 | \definecolor{codepurple}{rgb}{0.58,0,0.82} 93 | \definecolor{backcolour}{rgb}{0.95,0.95,0.92} 94 | 95 | \lstdefinestyle{mystyle}{ 96 | backgroundcolor=\color{backcolour}, 97 | commentstyle=\color{codegreen}, 98 | keywordstyle=\color{magenta}, 99 | numberstyle=\tiny\color{codegray}, 100 | stringstyle=\color{codepurple}, 101 | basicstyle=\ttfamily\footnotesize, 102 | breakatwhitespace=false, 103 | breaklines=true, 104 | captionpos=b, 105 | keepspaces=true, 106 | numbers=left, 107 | numbersep=5pt, 108 | showspaces=false, 109 | showstringspaces=false, 110 | showtabs=false, 111 | tabsize=2 112 | } 113 | 114 | \lstset{style=mystyle} 115 | 116 | % Setting nomor pada subbsubsubbab 117 | \setcounter{secnumdepth}{3} 118 | 119 | \makeatletter 120 | 121 | \makeatother 122 | 123 | % Counter untuk figure dan table. 124 | \counterwithin{figure}{section} 125 | \counterwithin{table}{section} 126 | 127 | % bahasa asing 128 | \usepackage{CJKutf8} 129 | 130 | % Spacing pada daftar figure dan table 131 | \makeatletter 132 | \renewcommand*\l@figure{\@dottedtocline{1}{1em}{3.2em}} 133 | \makeatother 134 | 135 | \makeatletter 136 | \renewcommand*\l@table{\@dottedtocline{1}{1em}{3.2em}} 137 | \makeatother 138 | 139 | 140 | % Ganti judul bibliography 141 | % \renewcommand\bibname{Daftar Pustaka} 142 | -------------------------------------------------------------------------------- /docs/book/src/config/informations.tex: -------------------------------------------------------------------------------- 1 | %-------------------------------------------------------------------% 2 | % 3 | % Berkas informasi umum tesis 4 | % 5 | % @author Ilham Firdausi Putra 6 | % 7 | %-------------------------------------------------------------------% 8 | % 9 | %-------------------------------------------------------------------% 10 | -------------------------------------------------------------------------------- /docs/book/src/resources/Arsitektur-TA-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Arsitektur-TA-1.png -------------------------------------------------------------------------------- /docs/book/src/resources/Arsitektur-TA-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Arsitektur-TA-2.png -------------------------------------------------------------------------------- /docs/book/src/resources/Arsitektur-TA-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Arsitektur-TA-3.png -------------------------------------------------------------------------------- /docs/book/src/resources/Data-tipe-A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Data-tipe-A.png -------------------------------------------------------------------------------- /docs/book/src/resources/Data-tipe-B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Data-tipe-B.png -------------------------------------------------------------------------------- /docs/book/src/resources/Data-tipe-C.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Data-tipe-C.png -------------------------------------------------------------------------------- /docs/book/src/resources/Full-fine-tune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Full-fine-tune.png -------------------------------------------------------------------------------- /docs/book/src/resources/Head-fine-tune.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/Head-fine-tune.png -------------------------------------------------------------------------------- /docs/book/src/resources/cbow-skip-gram-illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/cbow-skip-gram-illustration.png -------------------------------------------------------------------------------- /docs/book/src/resources/cover-ganesha.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/cover-ganesha.jpg -------------------------------------------------------------------------------- /docs/book/src/resources/data_xlm_r.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/data_xlm_r.png -------------------------------------------------------------------------------- /docs/book/src/resources/ilustrasi-mlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/ilustrasi-mlm.png -------------------------------------------------------------------------------- /docs/book/src/resources/ilustration-eng-spn-word.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/ilustration-eng-spn-word.png -------------------------------------------------------------------------------- /docs/book/src/resources/linimasa-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/linimasa-1.jpg -------------------------------------------------------------------------------- /docs/book/src/resources/linimasa-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/linimasa-2.jpg -------------------------------------------------------------------------------- /docs/book/src/resources/luong_et_al_2015.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/luong_et_al_2015.jpg -------------------------------------------------------------------------------- /docs/book/src/resources/overview-attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/overview-attention.png -------------------------------------------------------------------------------- /docs/book/src/resources/overview-transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/overview-transformer.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-full-prosa-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-prosa-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-full-toxic-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-toxic-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-full-trip-advisor-xlmr-duplicate.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-trip-advisor-xlmr-duplicate.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-full-trip-advisor-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-full-trip-advisor-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-gain-mbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-gain-mbert.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-gain-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-gain-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-head-prosa-mbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-prosa-mbert.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-head-prosa-xlmr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-prosa-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-head-toxic-mbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-toxic-mbert.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-head-toxic-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-toxic-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-head-trip-mbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-trip-mbert.png -------------------------------------------------------------------------------- /docs/book/src/resources/plot-head-trip-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/plot-head-trip-xlmr.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-mbert-eng-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-eng-1.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-mbert-eng-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-eng-2.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-mbert-malay-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-malay-1.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-mbert-malay-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-mbert-malay-2.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-xlmr-eng-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-eng-1.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-xlmr-eng-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-eng-2.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-xlmr-malay-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-malay-1.png -------------------------------------------------------------------------------- /docs/book/src/resources/prosa-xlmr-malay-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/prosa-xlmr-malay-2.png -------------------------------------------------------------------------------- /docs/book/src/resources/tandatangan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/tandatangan.png -------------------------------------------------------------------------------- /docs/book/src/resources/tandatangan_bu_ayu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/book/src/resources/tandatangan_bu_ayu.png -------------------------------------------------------------------------------- /docs/book/src/thesis-blx.bib: -------------------------------------------------------------------------------- 1 | @Comment{$ biblatex control file $} 2 | @Comment{$ biblatex version 2.9 $} 3 | Do not modify this file! 
4 | 5 | This is an auxiliary file used by the 'biblatex' package. 6 | This file may safely be deleted. It will be recreated as 7 | required. 8 | 9 | @Control{biblatex-control, 10 | options = {2.9:0:0:1:0:1:1:0:0:1:0:2:3:1:79:+:nyt}, 11 | } 12 | -------------------------------------------------------------------------------- /docs/book/src/thesis.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 23 | 28 | 33 | 36 | 39 | 42 | ]> 43 | 44 | 45 | latex 46 | 47 | thesis.aux 48 | thesis-blx.bib 49 | 50 | 51 | thesis.bbl 52 | 53 | 54 | blx-dm.def 55 | blx-compat.def 56 | blx-bibtex.def 57 | biblatex.def 58 | numeric.bbx 59 | standard.bbx 60 | authoryear.cbx 61 | biblatex.cfg 62 | english.lbx 63 | russian.lbx 64 | 65 | 66 | 67 | bibtex 68 | 69 | bibtex 70 | 71 | thesis 72 | 73 | 74 | thesis.aux 75 | 76 | 77 | thesis.bbl 78 | 79 | 80 | thesis.bbl 81 | 82 | 83 | thesis.aux 84 | thesis-blx.bib 85 | 86 | 87 | references.bib 88 | 89 | 90 | biblatex.bst 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /docs/book/src/thesis.tex: -------------------------------------------------------------------------------- 1 | %--------------------------------------------------------------------% 2 | % 3 | % Berkas utama templat LaTeX. 4 | % 5 | % author Ilham Firadusi Putra 6 | % template dari Petra Barus dan Peb Ruswono Aryan 7 | % 8 | %--------------------------------------------------------------------% 9 | % 10 | % Berkas ini berisi struktur utama dokumen LaTeX yang akan dibuat. 
11 | % 12 | %--------------------------------------------------------------------% 13 | 14 | \documentclass[12pt, a4paper, onecolumn, oneside, final]{report} 15 | 16 | \input{config/if-itb-thesis.sty} 17 | 18 | \makeatletter 19 | 20 | \makeatother 21 | 22 | \bibliography{references} 23 | 24 | \begin{document} 25 | 26 | %Basic configuration 27 | \title{ 28 | Klasifikasi Teks Berbahasa Indonesia Menggunakan \textit{Multilingual Language Model} \\ 29 | (Studi Kasus: Klasifikasi Ujaran Kebencian dan Analisis Sentimen)} 30 | \date{} 31 | \author{ 32 | Ilham Firdausi Putra\\ 33 | NIM : 13516140 34 | } 35 | 36 | \pagenumbering{roman} 37 | \setcounter{page}{0} 38 | 39 | \input{chapters/cover} 40 | \input{chapters/approval} 41 | \input{chapters/statement} 42 | 43 | \pagestyle{plain} 44 | 45 | \input{chapters/abstract-id} 46 | % \input{chapters/abstract-en} 47 | \input{chapters/forewords} 48 | 49 | % \titleformat*{\section}{\centering\bfseries\Large\MakeUpperCase} 50 | \titleformat*{\section}{\centering\bfseries\fontsize{8}{12}\MakeUpperCase} 51 | 52 | \tableofcontents 53 | \listoffigures 54 | \listoftables 55 | \input{chapters/daftar_istilah} 56 | 57 | 58 | % \titleformat*{\section}{\bfseries\Large} 59 | \titleformat*{\section}{\bfseries\fontsize{8}{12}} 60 | \titleformat*{\subsection}{\bfseries\fontsize{8}{12}} 61 | \pagenumbering{arabic} 62 | 63 | %----------------------------------------------------------------% 64 | % Konfigurasi Bab 65 | %----------------------------------------------------------------% 66 | \setcounter{page}{1} 67 | \renewcommand{\chaptername}{BAB} 68 | \renewcommand{\thechapter}{\Roman{chapter}} 69 | %----------------------------------------------------------------% 70 | 71 | %----------------------------------------------------------------% 72 | % Dafter Bab 73 | % Untuk menambahkan daftar bab, buat berkas bab misalnya `chapter-6` di direktori `chapters`, dan masukkan ke sini. 
74 | %----------------------------------------------------------------% 75 | \input{chapters/chapter-1} 76 | \input{chapters/chapter-2} 77 | \input{chapters/chapter-3} 78 | \input{chapters/chapter-4} 79 | \input{chapters/chapter-5} 80 | %----------------------------------------------------------------% 81 | 82 | % Daftar pustaka 83 | \printbibliography[title={Daftar Pustaka}] 84 | 85 | % Index 86 | \appendix 87 | 88 | \addcontentsline{toc}{part}{Lampiran} 89 | \part*{Lampiran} 90 | 91 | \input{chapters/appendix-1} 92 | % \input{chapters/appendix-2} 93 | 94 | \end{document} 95 | -------------------------------------------------------------------------------- /docs/paper/Figure_explained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Figure_explained.png -------------------------------------------------------------------------------- /docs/paper/Figure_symbol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Figure_symbol.png -------------------------------------------------------------------------------- /docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.docx -------------------------------------------------------------------------------- /docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.pdf -------------------------------------------------------------------------------- /docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving Indonesian Text Classification Using Multilingual Language Model.zip -------------------------------------------------------------------------------- /docs/paper/Improving-Indonesian-Text-Classification-Using-Multilingual-Language-Model-Putra.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/docs/paper/Improving-Indonesian-Text-Classification-Using-Multilingual-Language-Model-Putra.pdf -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebook Source Code 2 | Except the `/result_analysis` directory, any notebooks in this directory was originally a Kaggle kernel. I have provided the `.ipynb` file in this directory along with links to the original Kaggle kernel. 
The naming is slightly different from the original: 3 | * `trip_advisor` means the data is from [(Farhan & Khodra, 2017)](https://www.researchgate.net/publication/320832619_Sentiment-specific_word_embedding_for_Indonesian_sentiment_analysis) 4 | * `prosa` means the data is from [(Crisdayanti & Purwarianti, 2019)](https://ieeexplore.ieee.org/abstract/document/8904199/) 5 | * `toxic` means the data is from [(Ibrohim & Budi, 2019)](https://www.aclweb.org/anthology/W19-3506.pdf) 6 | 7 | 8 | Links to the original Kaggle kernel: 9 | * fine_tune_full 10 | * prosa 11 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-prosa-xlm-r) 12 | * toxic 13 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-toxic-xlm-r-simpler) 14 | * [xlm_r_comparable](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-toxic-xlm-r-comparable/) 15 | * trip_advisor 16 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-tripadvisor-xlm-r) 17 | * [xlm_r_duplicate_removed](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-full-tripadvisor-xlm-r-dupli) 18 | 19 | * fine_tune_head 20 | * prosa 21 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-prosa-mbert-all) 22 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-prosa-xlm-r-all) 23 | * toxic 24 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-toxic-mbert-all) 25 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-toxic-xlm-r-all) 26 | * trip_advisor 27 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-tripadvisor-mbert-all) 28 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-fine-tune-head-tripadvisor-xlm-r-all) 29 | * extracting_features 30 | * prosa 31 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-prosa-features-mbert) 32 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-prosa-features-xlm-r) 33 | * toxic 34 | * 
[mbert](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-features-mbert) 35 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-features-xlm-r) 36 | * trip_advisor 37 | * [mbert](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-tripadvisor-features-mbert) 38 | * [xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-tripadvisor-features-xlm-r) 39 | * yelp_review 40 | * mbert 41 | * [mbert_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-1) 42 | * [mbert_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-2) 43 | * [mbert_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-3) 44 | * [mbert_4](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-4) 45 | * [mbert_5](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-5) 46 | * [mbert_6](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-6) 47 | * [mbert_7](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-7) 48 | * [mbert_8](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-8) 49 | * [mbert_9](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-mbert-9) 50 | * [combining_mbert](https://www.kaggle.com/ilhamfp31/indoxtc-combining-yelp-features-mbert) 51 | * xlm_r 52 | * [xlm_r_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-1) 53 | * [xlm_r_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-2) 54 | * [xlm_r_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-3) 55 | * [xlm_r_4](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-4) 56 | * [xlm_r_5](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-5) 57 | * [xlm_r_6](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-6) 58 | * [xlm_r_7](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-7) 59 | 
* [xlm_r_8](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-8) 60 | * [xlm_r_9](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-yelp-features-xlm-r-9) 61 | * [combining_xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-combining-yelp-features-xlm-r) 62 | * jigsaw_toxic 63 | * mbert 64 | * [mbert_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-1) 65 | * [mbert_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-2) 66 | * [mbert_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-3) 67 | * [mbert_4](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-4) 68 | * [mbert_5](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-5) 69 | * [mbert_6](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-mbert-6) 70 | * [combining_mbert](https://www.kaggle.com/ilhamfp31/indoxtc-combining-toxic-en-features-mbert) 71 | * xlm_r 72 | * [xlm_r_1](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-xlm-r-1) 73 | * [xlm_r_2](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-xlm-r-2) 74 | * [xlm_r_3](https://www.kaggle.com/ilhamfp31/indoxtc-extracting-toxic-en-features-xlm-r-3) 75 | * [combining_xlm_r](https://www.kaggle.com/ilhamfp31/indoxtc-combining-toxic-en-features-xlm-r) 76 | 77 | -------------------------------------------------------------------------------- /notebooks/fine_tune_head/extracting_features/jigsaw_toxic/mbert/indoxtc-combining-toxic-en-features-mbert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# IndoXTC - Combining Toxic-EN Features [mBERT]\n", 8 | "Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n", 9 | "\n", 10 | "This kernel is a part of my undergraduate 
final year project.\n", 11 | "Checkout the full github repository:\n", 12 | "https://github.com/ilhamfp/indonesian-text-classification-multilingual" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 20 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/__notebook__.ipynb\n", 28 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/custom.css\n", 29 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/train_label.csv\n", 30 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/__results__.html\n", 31 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/__output__.json\n", 32 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-1/train_text.npy\n", 33 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/__notebook__.ipynb\n", 34 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/custom.css\n", 35 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/train_label.csv\n", 36 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/__results__.html\n", 37 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/__output__.json\n", 38 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-5/train_text.npy\n", 39 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/__notebook__.ipynb\n", 40 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/custom.css\n", 41 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/train_label.csv\n", 42 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/__results__.html\n", 43 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/__output__.json\n", 44 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-4/train_text.npy\n", 45 | 
"/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/__notebook__.ipynb\n", 46 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/custom.css\n", 47 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/train_label.csv\n", 48 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/__results__.html\n", 49 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/__output__.json\n", 50 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-2/train_text.npy\n", 51 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/__notebook__.ipynb\n", 52 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/custom.css\n", 53 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/train_label.csv\n", 54 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/__results__.html\n", 55 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/__output__.json\n", 56 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-3/train_text.npy\n", 57 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/__notebook__.ipynb\n", 58 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/custom.css\n", 59 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/train_label.csv\n", 60 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/__results__.html\n", 61 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/__output__.json\n", 62 | "/kaggle/input/indoxtc-extracting-toxic-en-features-mbert-6/train_text.npy\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "import numpy as np\n", 68 | "import pandas as pd \n", 69 | "import os\n", 70 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 71 | " for filename in filenames:\n", 72 | " print(os.path.join(dirname, filename))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": { 79 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 80 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 81 
| }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "(120000, 1, 768)\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "train_x = np.concatenate([\n", 93 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-1/train_text.npy', allow_pickle=True)]),\n", 94 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-2/train_text.npy', allow_pickle=True)]),\n", 95 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-3/train_text.npy', allow_pickle=True)]),\n", 96 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-4/train_text.npy', allow_pickle=True)]),\n", 97 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-5/train_text.npy', allow_pickle=True)]),\n", 98 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-mbert-6/train_text.npy', allow_pickle=True)]),\n", 99 | " ])\n", 100 | "\n", 101 | "print(train_x.shape)\n", 102 | "np.save(\"train_text.npy\", train_x)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "(120000, 1)\n", 115 | "1 60000\n", 116 | "0 60000\n", 117 | "Name: label, dtype: int64\n" 118 | ] 119 | }, 120 | { 121 | "data": { 122 | "text/html": [ 123 | "
\n", 124 | "\n", 137 | "\n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | "
label
01
11
21
31
41
\n", 167 | "
" 168 | ], 169 | "text/plain": [ 170 | " label\n", 171 | "0 1\n", 172 | "1 1\n", 173 | "2 1\n", 174 | "3 1\n", 175 | "4 1" 176 | ] 177 | }, 178 | "execution_count": 3, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "train_y = pd.concat([\n", 185 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-mbert-1/train_label.csv'),\n", 186 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-mbert-2/train_label.csv'),\n", 187 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-mbert-3/train_label.csv'),\n", 188 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-mbert-4/train_label.csv'),\n", 189 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-mbert-5/train_label.csv'),\n", 190 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-mbert-6/train_label.csv'),\n", 191 | "])\n", 192 | "\n", 193 | "train_y['label'].to_csv('train_label.csv', index=False, header=['label'])\n", 194 | "\n", 195 | "print(train_y.shape)\n", 196 | "print(train_y.label.value_counts())\n", 197 | "train_y.head()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 4, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "__notebook__.ipynb train_label.csv train_text.npy\r\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "!ls '.'" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 
241 | "version": "3.6.6" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 4 246 | } 247 | -------------------------------------------------------------------------------- /notebooks/fine_tune_head/extracting_features/jigsaw_toxic/xlm_r/indoxtc-combining-toxic-en-features-xlm-r.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# IndoXTC - Combining Toxic-EN Features [XLM-R]\n", 8 | "Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n", 9 | "\n", 10 | "This kernel is a part of my undergraduate final year project.\n", 11 | "Checkout the full github repository:\n", 12 | "https://github.com/ilhamfp/indonesian-text-classification-multilingual" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 20 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-2/__results__.html\n", 28 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-2/train_text.npy\n", 29 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-2/__notebook__.ipynb\n", 30 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-2/train_label.csv\n", 31 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-2/custom.css\n", 32 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-2/__output__.json\n", 33 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-3/__results__.html\n", 34 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-3/train_text.npy\n", 35 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-3/__notebook__.ipynb\n", 36 | 
"/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-3/train_label.csv\n", 37 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-3/custom.css\n", 38 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-3/__output__.json\n", 39 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-1/__results__.html\n", 40 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-1/train_text.npy\n", 41 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-1/__notebook__.ipynb\n", 42 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-1/train_label.csv\n", 43 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-1/custom.css\n", 44 | "/kaggle/input/indoxtc-extracting-toxic-en-features-xlm-r-1/__output__.json\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "import numpy as np\n", 50 | "import pandas as pd \n", 51 | "import os\n", 52 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 53 | " for filename in filenames:\n", 54 | " print(os.path.join(dirname, filename))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 62 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "(120000, 1, 1024)\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "train_x = np.concatenate([\n", 75 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-xlm-r-1/train_text.npy', allow_pickle=True)]),\n", 76 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-xlm-r-2/train_text.npy', allow_pickle=True)]),\n", 77 | " np.array([x for x in np.load('../input/indoxtc-extracting-toxic-en-features-xlm-r-3/train_text.npy', allow_pickle=True)]),\n", 78 | " ])\n", 79 | "\n", 80 | "print(train_x.shape)\n", 81 | "np.save(\"train_text.npy\", train_x)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | 
"execution_count": 3, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "(120000, 1)\n", 94 | "1 60000\n", 95 | "0 60000\n", 96 | "Name: label, dtype: int64\n" 97 | ] 98 | }, 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 116 | "\n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
label
01
11
21
31
41
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " label\n", 150 | "0 1\n", 151 | "1 1\n", 152 | "2 1\n", 153 | "3 1\n", 154 | "4 1" 155 | ] 156 | }, 157 | "execution_count": 3, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "train_y = pd.concat([\n", 164 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-xlm-r-1/train_label.csv'),\n", 165 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-xlm-r-2/train_label.csv'),\n", 166 | " pd.read_csv('../input/indoxtc-extracting-toxic-en-features-xlm-r-3/train_label.csv'),\n", 167 | "])\n", 168 | "\n", 169 | "train_y['label'].to_csv('train_label.csv', index=False, header=['label'])\n", 170 | "\n", 171 | "print(train_y.shape)\n", 172 | "print(train_y.label.value_counts())\n", 173 | "train_y.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "__notebook__.ipynb train_label.csv train_text.npy\r\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "!ls '.'" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 3", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.6.6" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 4 222 | } 223 | -------------------------------------------------------------------------------- /notebooks/fine_tune_head/extracting_features/toxic/xlm_r/indoxtc-extracting-toxic-features-xlm-r.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# IndoXTC - Extracting Toxic Features [XLM-R]\n", 8 | "Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model. \n", 9 | " \n", 10 | "This kernel is a part of my undergraduate final year project. \n", 11 | "Checkout the full github repository: \n", 12 | "https://github.com/ilhamfp/indonesian-text-classification-multilingual" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 20 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "from load_data import load_dataset_indonesian\n", 27 | "from extract_feature import FeatureExtractor" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 34 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", 35 | "collapsed": true 36 | }, 37 | "source": [ 38 | "## Load Data" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "~~~Train Data~~~\n", 51 | "Shape: (11852, 2)\n", 52 | " text label\n", 53 | "0 cebong pindah on masuk kemako brigade mobil 1\n", 54 | "1 sungguh rezim jokowi sadis kuasa soeha o puluh 1\n", 55 | "\n", 56 | "Label:\n", 57 | "1 6578\n", 58 | "0 5274\n", 59 | "Name: label, dtype: int64\n", 60 | "\n", 61 | "~~~Test Data~~~\n", 62 | "Shape: (1317, 2)\n", 63 | " text label\n", 64 | "0 gubernur daerah khusus ibukota salat jumat erd... 0\n", 65 | "1 hindu kenal sidang hindu percaya hukum karma t... 
0\n", 66 | "2 jancuk jancuk 1\n", 67 | "3 bukti on bukti congor doang 1\n", 68 | "\n", 69 | "Label:\n", 70 | "1 731\n", 71 | "0 586\n", 72 | "Name: label, dtype: int64\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "train, test = load_dataset_indonesian(data_name='toxic')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Extract Feature" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stderr", 94 | "output_type": "stream", 95 | "text": [ 96 | "Downloading: \"https://github.com/pytorch/fairseq/archive/master.zip\" to /root/.cache/torch/hub/master.zip\n" 97 | ] 98 | }, 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "running build_ext\n", 104 | "cythoning fairseq/data/data_utils_fast.pyx to fairseq/data/data_utils_fast.cpp\n", 105 | "cythoning fairseq/data/token_block_utils_fast.pyx to fairseq/data/token_block_utils_fast.cpp\n", 106 | "building 'fairseq.libbleu' extension\n", 107 | "creating build\n", 108 | "creating build/temp.linux-x86_64-3.6\n", 109 | "creating build/temp.linux-x86_64-3.6/fairseq\n", 110 | "creating build/temp.linux-x86_64-3.6/fairseq/clib\n", 111 | "creating build/temp.linux-x86_64-3.6/fairseq/clib/libbleu\n", 112 | "gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/opt/conda/include/python3.6m -c fairseq/clib/libbleu/libbleu.cpp -o build/temp.linux-x86_64-3.6/fairseq/clib/libbleu/libbleu.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=libbleu -D_GLIBCXX_USE_CXX11_ABI=0\n", 113 | "gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/opt/conda/include/python3.6m -c fairseq/clib/libbleu/module.cpp -o build/temp.linux-x86_64-3.6/fairseq/clib/libbleu/module.o -std=c++11 -O3 
-DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=libbleu -D_GLIBCXX_USE_CXX11_ABI=0\n", 114 | "creating build/lib.linux-x86_64-3.6\n", 115 | "creating build/lib.linux-x86_64-3.6/fairseq\n", 116 | "g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.6/fairseq/clib/libbleu/libbleu.o build/temp.linux-x86_64-3.6/fairseq/clib/libbleu/module.o -o build/lib.linux-x86_64-3.6/fairseq/libbleu.cpython-36m-x86_64-linux-gnu.so\n", 117 | "building 'fairseq.data.data_utils_fast' extension\n", 118 | "creating build/temp.linux-x86_64-3.6/fairseq/data\n", 119 | "gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/opt/conda/lib/python3.6/site-packages/numpy/core/include -I/opt/conda/lib/python3.6/site-packages/numpy/core/include -I/opt/conda/include/python3.6m -c fairseq/data/data_utils_fast.cpp -o build/temp.linux-x86_64-3.6/fairseq/data/data_utils_fast.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=data_utils_fast -D_GLIBCXX_USE_CXX11_ABI=0\n", 120 | "creating build/lib.linux-x86_64-3.6/fairseq/data\n", 121 | "g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.6/fairseq/data/data_utils_fast.o -o build/lib.linux-x86_64-3.6/fairseq/data/data_utils_fast.cpython-36m-x86_64-linux-gnu.so\n", 122 | "building 'fairseq.data.token_block_utils_fast' extension\n", 123 | "gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/opt/conda/lib/python3.6/site-packages/numpy/core/include -I/opt/conda/lib/python3.6/site-packages/numpy/core/include -I/opt/conda/include/python3.6m -c fairseq/data/token_block_utils_fast.cpp -o build/temp.linux-x86_64-3.6/fairseq/data/token_block_utils_fast.o -std=c++11 -O3 
-DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=token_block_utils_fast -D_GLIBCXX_USE_CXX11_ABI=0\n", 124 | "g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.6/fairseq/data/token_block_utils_fast.o -o build/lib.linux-x86_64-3.6/fairseq/data/token_block_utils_fast.cpython-36m-x86_64-linux-gnu.so\n", 125 | "building 'fairseq.libnat' extension\n", 126 | "creating build/temp.linux-x86_64-3.6/fairseq/clib/libnat\n", 127 | "gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/opt/conda/lib/python3.6/site-packages/torch/include -I/opt/conda/lib/python3.6/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.6/site-packages/torch/include/TH -I/opt/conda/lib/python3.6/site-packages/torch/include/THC -I/opt/conda/include/python3.6m -c fairseq/clib/libnat/edit_dist.cpp -o build/temp.linux-x86_64-3.6/fairseq/clib/libnat/edit_dist.o -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=libnat -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11\n", 128 | "g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.6/fairseq/clib/libnat/edit_dist.o -o build/lib.linux-x86_64-3.6/fairseq/libnat.cpython-36m-x86_64-linux-gnu.so\n", 129 | "copying build/lib.linux-x86_64-3.6/fairseq/libbleu.cpython-36m-x86_64-linux-gnu.so -> fairseq\n", 130 | "copying build/lib.linux-x86_64-3.6/fairseq/data/data_utils_fast.cpython-36m-x86_64-linux-gnu.so -> fairseq/data\n", 131 | "copying build/lib.linux-x86_64-3.6/fairseq/data/token_block_utils_fast.cpython-36m-x86_64-linux-gnu.so -> fairseq/data\n", 132 | "copying build/lib.linux-x86_64-3.6/fairseq/libnat.cpython-36m-x86_64-linux-gnu.so -> fairseq\n" 133 | ] 134 | }, 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | 
"100%|██████████| 1028340964/1028340964 [00:23<00:00, 44381051.03B/s]\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "FE = FeatureExtractor(model_name='xlm-r')" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "
\n", 156 | "\n", 169 | "\n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
textlabel
0[[-0.029517096, 0.03590906, 0.14920011, -0.039...1
1[[0.06659257, 0.008365027, 0.0791176, -0.01333...1
2[[-0.009453215, 0.04063628, 0.16281019, -0.031...1
3[[-0.0192231, -0.013337574, 0.12597367, -0.024...0
4[[-0.13913733, 0.07883438, 0.18570043, 0.00406...0
\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " text label\n", 209 | "0 [[-0.029517096, 0.03590906, 0.14920011, -0.039... 1\n", 210 | "1 [[0.06659257, 0.008365027, 0.0791176, -0.01333... 1\n", 211 | "2 [[-0.009453215, 0.04063628, 0.16281019, -0.031... 1\n", 212 | "3 [[-0.0192231, -0.013337574, 0.12597367, -0.024... 0\n", 213 | "4 [[-0.13913733, 0.07883438, 0.18570043, 0.00406... 0" 214 | ] 215 | }, 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "train['text'] = train['text'].apply(lambda x: FE.extract_features(x))\n", 223 | "test['text'] = test['text'].apply(lambda x: FE.extract_features(x))\n", 224 | "train.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Saving Results" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 5, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "np.save(\"train_text.npy\", train['text'].values)\n", 241 | "np.save(\"test_text.npy\", test['text'].values)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 6, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "train['label'].to_csv('train_label.csv', index=False, header=['label'])\n", 251 | "test['label'].to_csv('test_label.csv', index=False, header=['label'])" 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.6.6" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 4 276 | } 277 | -------------------------------------------------------------------------------- 
/notebooks/fine_tune_head/extracting_features/yelp_review/mbert/indoxtc-combining-yelp-features-mbert.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# IndoXTC - Combining Yelp Features [mBERT]\n", 8 | "Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n", 9 | "\n", 10 | "This kernel is a part of my undergraduate final year project.\n", 11 | "Checkout the full github repository:\n", 12 | "https://github.com/ilhamfp/indonesian-text-classification-multilingual" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 20 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-6/train_text.npy\n", 28 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-6/custom.css\n", 29 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-6/__notebook__.ipynb\n", 30 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-6/__results__.html\n", 31 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-6/__output__.json\n", 32 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-6/train_label.csv\n", 33 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-2/train_text.npy\n", 34 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-2/custom.css\n", 35 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-2/__notebook__.ipynb\n", 36 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-2/__results__.html\n", 37 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-2/__output__.json\n", 38 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-2/train_label.csv\n", 39 | 
"/kaggle/input/indoxtc-extracting-yelp-features-mbert-9/train_text.npy\n", 40 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-9/custom.css\n", 41 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-9/__notebook__.ipynb\n", 42 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-9/__results__.html\n", 43 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-9/__output__.json\n", 44 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-9/train_label.csv\n", 45 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-4/train_text.npy\n", 46 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-4/custom.css\n", 47 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-4/__notebook__.ipynb\n", 48 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-4/__results__.html\n", 49 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-4/__output__.json\n", 50 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-4/train_label.csv\n", 51 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-8/train_text.npy\n", 52 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-8/custom.css\n", 53 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-8/__notebook__.ipynb\n", 54 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-8/__results__.html\n", 55 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-8/__output__.json\n", 56 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-8/train_label.csv\n", 57 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-1/train_text.npy\n", 58 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-1/custom.css\n", 59 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-1/__notebook__.ipynb\n", 60 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-1/__results__.html\n", 61 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-1/__output__.json\n", 62 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-1/train_label.csv\n", 63 | 
"/kaggle/input/indoxtc-extracting-yelp-features-mbert-5/train_text.npy\n", 64 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-5/custom.css\n", 65 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-5/__notebook__.ipynb\n", 66 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-5/__results__.html\n", 67 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-5/__output__.json\n", 68 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-5/train_label.csv\n", 69 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-7/train_text.npy\n", 70 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-7/custom.css\n", 71 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-7/__notebook__.ipynb\n", 72 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-7/__results__.html\n", 73 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-7/__output__.json\n", 74 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-7/train_label.csv\n", 75 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-3/train_text.npy\n", 76 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-3/custom.css\n", 77 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-3/__notebook__.ipynb\n", 78 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-3/__results__.html\n", 79 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-3/__output__.json\n", 80 | "/kaggle/input/indoxtc-extracting-yelp-features-mbert-3/train_label.csv\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "import numpy as np\n", 86 | "import pandas as pd \n", 87 | "import os\n", 88 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 89 | " for filename in filenames:\n", 90 | " print(os.path.join(dirname, filename))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 2, 96 | "metadata": { 97 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 98 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | 
"output_type": "stream", 104 | "text": [ 105 | "(135000, 1, 768)\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "train_x = np.concatenate([\n", 111 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-1/train_text.npy', allow_pickle=True)]),\n", 112 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-2/train_text.npy', allow_pickle=True)]),\n", 113 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-3/train_text.npy', allow_pickle=True)]),\n", 114 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-4/train_text.npy', allow_pickle=True)]),\n", 115 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-5/train_text.npy', allow_pickle=True)]),\n", 116 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-6/train_text.npy', allow_pickle=True)]),\n", 117 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-7/train_text.npy', allow_pickle=True)]),\n", 118 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-8/train_text.npy', allow_pickle=True)]),\n", 119 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-mbert-9/train_text.npy', allow_pickle=True)]),\n", 120 | " ])\n", 121 | "\n", 122 | "print(train_x.shape)\n", 123 | "np.save(\"train_text.npy\", train_x)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "(135000, 1)\n", 136 | "1 67500\n", 137 | "0 67500\n", 138 | "Name: label, dtype: int64\n" 139 | ] 140 | }, 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | "
label
01
11
21
31
41
\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " label\n", 192 | "0 1\n", 193 | "1 1\n", 194 | "2 1\n", 195 | "3 1\n", 196 | "4 1" 197 | ] 198 | }, 199 | "execution_count": 3, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "train_y = pd.concat([\n", 206 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-1/train_label.csv'),\n", 207 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-2/train_label.csv'),\n", 208 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-3/train_label.csv'),\n", 209 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-4/train_label.csv'),\n", 210 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-5/train_label.csv'),\n", 211 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-6/train_label.csv'),\n", 212 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-7/train_label.csv'),\n", 213 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-8/train_label.csv'),\n", 214 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-mbert-9/train_label.csv'),\n", 215 | "])\n", 216 | "\n", 217 | "train_y['label'].to_csv('train_label.csv', index=False, header=['label'])\n", 218 | "\n", 219 | "print(train_y.shape)\n", 220 | "print(train_y.label.value_counts())\n", 221 | "train_y.head()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "__notebook__.ipynb train_label.csv train_text.npy\r\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "!ls '.'" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | 
"language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.6.6" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 4 270 | } 271 | -------------------------------------------------------------------------------- /notebooks/fine_tune_head/extracting_features/yelp_review/xlm_r/indoxtc-combining-yelp-features-xlm-r.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# IndoXTC - Combining Yelp Features [XLM-R]\n", 8 | "Exploring Indonesian hate speech/abusive & sentiment text classification using multilingual language model.\n", 9 | "\n", 10 | "This kernel is a part of my undergraduate final year project.\n", 11 | "Checkout the full github repository:\n", 12 | "https://github.com/ilhamfp/indonesian-text-classification-multilingual" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 20 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-7/__results__.html\n", 28 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-7/train_text.npy\n", 29 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-7/train_label.csv\n", 30 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-7/__notebook__.ipynb\n", 31 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-7/custom.css\n", 32 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-7/__output__.json\n", 33 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-2/__results__.html\n", 34 | 
"/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-2/train_text.npy\n", 35 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-2/train_label.csv\n", 36 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-2/__notebook__.ipynb\n", 37 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-2/custom.css\n", 38 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-2/__output__.json\n", 39 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-5/__results__.html\n", 40 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-5/train_text.npy\n", 41 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-5/train_label.csv\n", 42 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-5/__notebook__.ipynb\n", 43 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-5/custom.css\n", 44 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-5/__output__.json\n", 45 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-9/__results__.html\n", 46 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-9/train_text.npy\n", 47 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-9/train_label.csv\n", 48 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-9/__notebook__.ipynb\n", 49 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-9/custom.css\n", 50 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-9/__output__.json\n", 51 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-4/__results__.html\n", 52 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-4/train_text.npy\n", 53 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-4/train_label.csv\n", 54 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-4/__notebook__.ipynb\n", 55 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-4/custom.css\n", 56 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-4/__output__.json\n", 57 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-8/__results__.html\n", 58 | 
"/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-8/train_text.npy\n", 59 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-8/train_label.csv\n", 60 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-8/__notebook__.ipynb\n", 61 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-8/custom.css\n", 62 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-8/__output__.json\n", 63 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-3/__results__.html\n", 64 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-3/train_text.npy\n", 65 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-3/train_label.csv\n", 66 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-3/__notebook__.ipynb\n", 67 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-3/custom.css\n", 68 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-3/__output__.json\n", 69 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-6/__results__.html\n", 70 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-6/train_text.npy\n", 71 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-6/train_label.csv\n", 72 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-6/__notebook__.ipynb\n", 73 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-6/custom.css\n", 74 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-6/__output__.json\n", 75 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-1/__results__.html\n", 76 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-1/train_text.npy\n", 77 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-1/train_label.csv\n", 78 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-1/__notebook__.ipynb\n", 79 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-1/custom.css\n", 80 | "/kaggle/input/indoxtc-extracting-yelp-features-xlm-r-1/__output__.json\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "import numpy as np\n", 86 | "import pandas as pd \n", 87 | "import os\n", 88 | "for dirname, _, filenames 
in os.walk('/kaggle/input'):\n", 89 | " for filename in filenames:\n", 90 | " print(os.path.join(dirname, filename))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 2, 96 | "metadata": { 97 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 98 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "(135000, 1, 1024)\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "train_x = np.concatenate([\n", 111 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-1/train_text.npy', allow_pickle=True)]),\n", 112 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-2/train_text.npy', allow_pickle=True)]),\n", 113 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-3/train_text.npy', allow_pickle=True)]),\n", 114 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-4/train_text.npy', allow_pickle=True)]),\n", 115 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-5/train_text.npy', allow_pickle=True)]),\n", 116 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-6/train_text.npy', allow_pickle=True)]),\n", 117 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-7/train_text.npy', allow_pickle=True)]),\n", 118 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-8/train_text.npy', allow_pickle=True)]),\n", 119 | " np.array([x for x in np.load('../input/indoxtc-extracting-yelp-features-xlm-r-9/train_text.npy', allow_pickle=True)]),\n", 120 | " ])\n", 121 | "\n", 122 | "print(train_x.shape)\n", 123 | "np.save(\"train_text.npy\", train_x)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": 
"stream", 134 | "text": [ 135 | "(135000, 1)\n", 136 | "1 67500\n", 137 | "0 67500\n", 138 | "Name: label, dtype: int64\n" 139 | ] 140 | }, 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | "
label
01
11
21
31
41
\n", 188 | "
" 189 | ], 190 | "text/plain": [ 191 | " label\n", 192 | "0 1\n", 193 | "1 1\n", 194 | "2 1\n", 195 | "3 1\n", 196 | "4 1" 197 | ] 198 | }, 199 | "execution_count": 3, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "train_y = pd.concat([\n", 206 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-1/train_label.csv'),\n", 207 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-2/train_label.csv'),\n", 208 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-3/train_label.csv'),\n", 209 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-4/train_label.csv'),\n", 210 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-5/train_label.csv'),\n", 211 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-6/train_label.csv'),\n", 212 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-7/train_label.csv'),\n", 213 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-8/train_label.csv'),\n", 214 | " pd.read_csv('../input/indoxtc-extracting-yelp-features-xlm-r-9/train_label.csv'),\n", 215 | "])\n", 216 | "\n", 217 | "train_y['label'].to_csv('train_label.csv', index=False, header=['label'])\n", 218 | "\n", 219 | "print(train_y.shape)\n", 220 | "print(train_y.label.value_counts())\n", 221 | "train_y.head()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "__notebook__.ipynb train_label.csv train_text.npy\r\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "!ls '.'" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | 
"language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.6.6" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 4 270 | } 271 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/final_prosa_yelp_xlm_r_result_combined_10981.csv: -------------------------------------------------------------------------------- 1 | tipe,total_data,foreign_mult,total_foreign_data,max_f1,max_recall,max_precision,max_accuracy 2 | A,10981,0.0,0,0.9805825242718447,0.9711538461538461,0.9901960784313726,0.9805825242718447 3 | B,10981,-1.0,0,0.33548387096774196,1.0,0.5048543689320388,0.5048543689320388 4 | C,10981,0.5,5490,0.985436550017674,0.9711538461538461,1.0,0.9854368932038835 5 | C,10981,1.0,10981,0.9805820666902321,0.9759615384615384,0.9854368932038835,0.9805825242718447 6 | C,10981,1.5,16471,1.0,1.0,1.0,1.0 7 | C,10981,2.0,21962,0.9514380009429515,0.9230769230769231,0.9795918367346939,0.9514563106796117 8 | C,10981,3.0,32943,0.9538832234613503,0.9471153846153846,0.9609756097560975,0.9538834951456311 9 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/plot-full-prosa-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_full/prosa/xlm_r/plot-full-prosa-xlmr.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_full/prosa/xlm_r/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_A_10981_0.5_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.0028371513,0 3 | 0.0011813343,0 4 | 0.00096553564,0 5 | 0.0011271536,0 6 | 0.0016358495,0 7 | 0.0022429228,0 8 | 0.00049582124,0 9 | 0.0015485287,0 10 | 0.0012152791,0 11 | 0.0011089742,0 12 | 0.001562953,0 13 | 0.00050887465,0 14 | 0.0011677146,0 15 | 0.0012116134,0 16 | 0.0006636381,0 17 | 0.3530836,0 18 | 0.0028057992,0 19 | 0.0012459457,0 20 | 0.0008583069,0 21 | 0.0003851652,0 22 | 0.0008853972,0 23 | 0.00068593025,0 24 | 0.0016987622,0 25 | 0.0006300509,0 26 | 0.0009134412,0 27 | 0.0009763837,0 28 | 0.0014330149,0 29 | 0.0007466674,0 30 | 0.002332896,0 31 | 0.0007596612,0 32 | 0.0016959906,0 33 | 0.0004477799,0 34 | 0.0007265806,0 35 | 0.0006707907,0 36 | 0.0012639761,0 37 | 0.0013332665,0 38 | 0.0009151101,0 39 | 0.00078091025,0 40 | 0.0010513365,0 41 | 0.0009662509,0 42 | 0.0010936558,0 43 | 0.0004788041,0 44 | 0.0004272461,0 45 | 0.001006037,0 46 | 0.001015842,0 47 | 0.0005579293,0 48 | 0.00059345365,0 49 | 0.00042060018,0 50 | 0.00092467666,0 51 | 0.00087562203,0 52 | 0.0011960566,0 53 | 0.0012201071,0 54 | 0.0015929639,0 55 | 0.0010064542,0 56 | 0.0010813177,0 57 | 0.0015856624,0 58 | 0.001450628,0 59 | 0.0022078454,0 60 | 0.0009673238,0 61 | 0.0022165477,0 62 | 0.00087344646,0 63 | 0.00079751015,0 64 | 0.0018280149,0 65 | 0.005019158,0 66 | 0.0005661249,0 67 | 0.003528148,0 68 | 0.002067417,0 69 | 0.0005800724,0 70 | 0.011292934,0 71 | 0.0019759238,0 72 | 0.000636816,0 73 | 0.0027013123,0 74 | 0.0008663237,0 75 | 0.003936529,0 76 | 0.0004119277,0 77 | 0.0058499277,0 78 | 0.0016512573,0 
79 | 0.0009354949,0 80 | 0.001445353,0 81 | 0.00072699785,0 82 | 0.00061795115,0 83 | 0.018065244,0 84 | 0.0017905533,0 85 | 0.0010290742,0 86 | 0.0004888475,0 87 | 0.0016208589,0 88 | 0.0009956956,0 89 | 0.0007609129,0 90 | 0.0013440847,0 91 | 0.005272031,0 92 | 0.0021708906,0 93 | 0.0010156035,0 94 | 0.0010391772,0 95 | 0.000410676,0 96 | 0.0011898875,0 97 | 0.0003299713,0 98 | 0.0010725856,0 99 | 0.0009779334,0 100 | 0.00041937828,0 101 | 0.0004861951,0 102 | 0.0011425912,0 103 | 0.0007224083,0 104 | 0.0008479357,0 105 | 0.0007298291,0 106 | 0.0007214546,0 107 | 0.00077548623,0 108 | 0.0011605918,0 109 | 0.0015477538,0 110 | 0.0066192746,0 111 | 0.0018577576,0 112 | 0.0005392134,0 113 | 0.00058698654,0 114 | 0.002278179,0 115 | 0.0009872019,0 116 | 0.00096940994,0 117 | 0.0010025799,0 118 | 0.009294093,0 119 | 0.0005580187,0 120 | 0.0043124557,0 121 | 0.0010252595,0 122 | 0.0018390417,0 123 | 0.0010148585,0 124 | 0.001292795,0 125 | 0.0011893213,0 126 | 0.0012114942,0 127 | 0.99891114,1 128 | 0.9978825,1 129 | 0.9997203,1 130 | 0.99807715,1 131 | 0.99948454,1 132 | 0.9996502,1 133 | 0.999697,1 134 | 0.9924351,1 135 | 0.99979085,1 136 | 0.9996284,1 137 | 0.9996877,1 138 | 0.99987626,1 139 | 0.9998672,1 140 | 0.9998529,1 141 | 0.99727017,1 142 | 0.99987674,1 143 | 0.9996182,1 144 | 0.99976254,1 145 | 0.9999053,1 146 | 0.9974234,1 147 | 0.99923325,1 148 | 0.9994787,1 149 | 0.99933064,1 150 | 0.9997184,1 151 | 0.99754727,1 152 | 0.0035475492,0 153 | 0.99737704,1 154 | 0.9949058,1 155 | 0.9330683,1 156 | 0.9990713,1 157 | 0.99933976,1 158 | 0.047497958,1 159 | 0.99935323,1 160 | 0.9986533,1 161 | 0.9947136,1 162 | 0.998862,1 163 | 0.9988531,1 164 | 0.9928149,1 165 | 0.99916494,1 166 | 0.99966466,1 167 | 0.9954777,1 168 | 0.9996837,1 169 | 0.9995726,1 170 | 0.999756,1 171 | 0.99933535,1 172 | 0.99546194,1 173 | 0.9995426,1 174 | 0.99778247,1 175 | 0.9939749,1 176 | 0.9900185,1 177 | 0.9344236,1 178 | 0.9840019,1 179 | 0.99963635,1 180 | 0.992197,1 181 | 0.9512014,1 
182 | 0.9997456,1 183 | 0.9943141,1 184 | 0.99719113,1 185 | 0.99941033,1 186 | 0.99988854,1 187 | 0.9997666,1 188 | 0.99748814,1 189 | 0.9967574,1 190 | 0.99869204,1 191 | 0.9977852,1 192 | 0.9987507,1 193 | 0.99269044,1 194 | 0.9998494,1 195 | 0.9985642,1 196 | 0.999359,1 197 | 0.9974271,1 198 | 0.9996938,1 199 | 0.99972856,1 200 | 0.9992944,1 201 | 0.999066,1 202 | 0.9994255,1 203 | 0.9988807,1 204 | 0.99955916,1 205 | 0.9394129,1 206 | 0.9768443,1 207 | 0.9974547,1 208 | 0.99949753,1 209 | 0.9650377,1 210 | 0.99827874,1 211 | 0.99980116,1 212 | 0.99964786,1 213 | 0.99937004,1 214 | 0.99871504,1 215 | 0.9989753,1 216 | 0.99949086,1 217 | 0.99944645,1 218 | 0.99760604,1 219 | 0.9955275,1 220 | 0.6538324,1 221 | 0.9994761,1 222 | 0.99650264,1 223 | 0.99968076,1 224 | 0.9997833,1 225 | 0.99959207,1 226 | 0.99818754,1 227 | 0.99967194,1 228 | 0.99922276,1 229 | 0.9945812,1 230 | 0.99887437,1 231 | 0.9903903,1 232 | 0.9998474,1 233 | 0.9993378,1 234 | 0.9980387,1 235 | 0.9958585,1 236 | 0.9996322,1 237 | 0.998229,1 238 | 0.999933,1 239 | 0.996638,1 240 | 0.9997206,1 241 | 0.999061,1 242 | 0.8570441,1 243 | 0.97862566,1 244 | 0.9998253,1 245 | 0.9957373,1 246 | 0.97735524,1 247 | 0.99915826,1 248 | 0.968012,1 249 | 0.99895704,1 250 | 0.99712414,1 251 | 0.99965906,1 252 | 0.47811973,0 253 | 0.0015258789,0 254 | 0.980193,1 255 | 0.9955393,1 256 | 0.98789805,1 257 | 0.0005798042,0 258 | 0.004489541,0 259 | 0.0018240213,0 260 | 0.00046369433,0 261 | 0.00070384145,0 262 | 0.00044935942,0 263 | 0.0027134717,0 264 | 0.0005957782,0 265 | 0.00066021085,0 266 | 0.004429519,0 267 | 0.0009124875,0 268 | 0.0005246401,0 269 | 0.00035908818,0 270 | 0.0011014044,0 271 | 0.0005416274,0 272 | 0.0003668964,0 273 | 0.00077074766,0 274 | 0.00026360154,0 275 | 0.0003310442,0 276 | 0.00043222308,0 277 | 0.00079116225,0 278 | 0.0006402433,0 279 | 0.0009139478,0 280 | 0.0009869635,0 281 | 0.00068083405,0 282 | 0.00048422813,0 283 | 0.00084728,0 284 | 0.00029295683,0 285 | 0.0003439486,0 286 | 
0.85668373,0 287 | 0.00023972988,0 288 | 0.0008932054,0 289 | 0.00053450465,0 290 | 0.0006082654,0 291 | 0.00044980645,0 292 | 0.000590086,0 293 | 0.00030636787,0 294 | 0.00083842874,0 295 | 0.0010247827,0 296 | 0.0006134808,0 297 | 0.0006607771,0 298 | 0.008158535,0 299 | 0.7040569,0 300 | 0.0011187196,0 301 | 0.0007980168,0 302 | 0.00035104156,0 303 | 0.00066676736,0 304 | 0.0013515651,0 305 | 0.0008659065,0 306 | 0.00067448616,0 307 | 0.0009434223,0 308 | 0.0026720166,0 309 | 0.0003862977,0 310 | 0.0016413927,0 311 | 0.00046503544,0 312 | 0.0017946362,0 313 | 0.0010461509,0 314 | 0.0004746318,0 315 | 0.0049010217,0 316 | 0.0010503232,0 317 | 0.0015173256,0 318 | 0.0003530383,0 319 | 0.0030049384,0 320 | 0.0007943213,0 321 | 0.0029292405,0 322 | 0.0009926558,0 323 | 0.0016698837,0 324 | 0.00044223666,0 325 | 0.0004888177,0 326 | 0.0006715059,0 327 | 0.0009602308,0 328 | 0.0011735559,0 329 | 0.0007569492,0 330 | 0.0041497946,0 331 | 0.0006547272,0 332 | 0.9989189,1 333 | 0.99947137,1 334 | 0.9980493,1 335 | 0.9873562,1 336 | 0.9909099,1 337 | 0.9970604,1 338 | 0.9998491,1 339 | 0.999609,1 340 | 0.99909234,1 341 | 0.9979861,1 342 | 0.9997306,1 343 | 0.9964456,1 344 | 0.5229982,1 345 | 0.99606276,1 346 | 0.9942,1 347 | 0.9514158,1 348 | 0.99952054,1 349 | 0.9979154,1 350 | 0.98505384,1 351 | 0.9981349,1 352 | 0.9991057,1 353 | 0.98905957,1 354 | 0.99879074,1 355 | 0.9975592,1 356 | 0.9994728,1 357 | 0.7904707,1 358 | 0.94943637,1 359 | 0.9992691,1 360 | 0.99880165,1 361 | 0.99921274,1 362 | 0.97589123,1 363 | 0.9960775,1 364 | 0.9993319,1 365 | 0.99905515,1 366 | 0.99759126,1 367 | 0.99853927,1 368 | 0.99891925,1 369 | 0.21851113,1 370 | 0.999289,1 371 | 0.99739206,1 372 | 0.9996809,1 373 | 0.876298,1 374 | 0.9904746,1 375 | 0.99848187,1 376 | 0.99957055,1 377 | 0.99969965,1 378 | 0.99961823,1 379 | 0.96931994,1 380 | 0.96729565,1 381 | 0.997439,1 382 | 0.9975997,1 383 | 0.9987605,1 384 | 0.9949405,1 385 | 0.0009434521,1 386 | 0.9996838,1 387 | 0.014259845,1 388 | 
0.99608123,1 389 | 0.9996954,1 390 | 0.99615103,1 391 | 0.99961936,1 392 | 0.9939201,1 393 | 0.99210215,1 394 | 0.9995949,1 395 | 0.9997343,1 396 | 0.97532856,1 397 | 0.9976076,1 398 | 0.99945545,1 399 | 0.99949634,1 400 | 0.074680805,1 401 | 0.9947585,1 402 | 0.0011223555,1 403 | 0.9996493,1 404 | 0.9996817,1 405 | 0.99879,1 406 | 0.9858354,1 407 | 0.009619564,0 408 | 0.9853815,1 409 | 0.9977231,1 410 | 0.9499521,1 411 | 0.9992758,1 412 | 0.9917017,1 413 | 0.9982939,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_B_10981_0.5_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.68213224,0 3 | 0.6786875,0 4 | 0.6866307,0 5 | 0.6803219,0 6 | 0.6870112,0 7 | 0.67769015,0 8 | 0.69628036,0 9 | 0.6728379,0 10 | 0.68016917,0 11 | 0.6908817,0 12 | 0.6884112,0 13 | 0.6814516,0 14 | 0.6790727,0 15 | 0.6835853,0 16 | 0.69141746,0 17 | 0.69918436,0 18 | 0.6936699,0 19 | 0.69211864,0 20 | 0.68054736,0 21 | 0.6924427,0 22 | 0.69506186,0 23 | 0.6944263,0 24 | 0.6833306,0 25 | 0.68910646,0 26 | 0.6718591,0 27 | 0.68718827,0 28 | 0.68367714,0 29 | 0.6804948,0 30 | 0.68576616,0 31 | 0.68223166,0 32 | 0.6786016,0 33 | 0.70862424,0 34 | 0.6841669,0 35 | 0.6852546,0 36 | 0.69905496,0 37 | 0.6962747,0 38 | 0.6858975,0 39 | 0.6820753,0 40 | 0.700496,0 41 | 0.69376457,0 42 | 0.68789524,0 43 | 0.6917836,0 44 | 0.69934857,0 45 | 0.6933022,0 46 | 0.6943179,0 47 | 0.7009489,0 48 | 0.70152956,0 49 | 0.7006978,0 50 | 0.6740087,0 51 | 0.6915602,0 52 | 0.6941266,0 53 | 0.6864672,0 54 | 0.6788878,0 55 | 0.6828745,0 56 | 0.68422526,0 57 | 0.6829897,0 58 | 0.6908111,0 59 | 0.6887517,0 60 | 0.69615984,0 61 | 0.6786651,0 62 | 0.6808683,0 63 | 0.67686653,0 64 | 0.6755887,0 65 | 0.6812146,0 66 | 0.6996704,0 67 | 0.6902443,0 68 | 0.6890558,0 69 | 0.6879731,0 70 | 0.68872494,0 71 | 0.6865545,0 72 | 0.70309937,0 73 | 
0.70149344,0 74 | 0.6923886,0 75 | 0.683513,0 76 | 0.6927441,0 77 | 0.6846307,0 78 | 0.6942282,0 79 | 0.69349396,0 80 | 0.6964947,0 81 | 0.68372965,0 82 | 0.7007309,0 83 | 0.69109,0 84 | 0.68285614,0 85 | 0.6824147,0 86 | 0.69451797,0 87 | 0.69905204,0 88 | 0.68832684,0 89 | 0.6880552,0 90 | 0.67305887,0 91 | 0.6774869,0 92 | 0.67882144,0 93 | 0.6838778,0 94 | 0.6831507,0 95 | 0.69722784,0 96 | 0.704051,0 97 | 0.6989585,0 98 | 0.70174015,0 99 | 0.68847966,0 100 | 0.6944586,0 101 | 0.6954297,0 102 | 0.7040631,0 103 | 0.69404507,0 104 | 0.70366913,0 105 | 0.6947849,0 106 | 0.6870486,0 107 | 0.6877037,0 108 | 0.681568,0 109 | 0.6948494,0 110 | 0.69490874,0 111 | 0.6908118,0 112 | 0.69975483,0 113 | 0.69654477,0 114 | 0.6860161,0 115 | 0.68835187,0 116 | 0.69918215,0 117 | 0.7025756,0 118 | 0.68896425,0 119 | 0.6953721,0 120 | 0.69376963,0 121 | 0.6824294,0 122 | 0.6800182,0 123 | 0.67900515,0 124 | 0.69456935,0 125 | 0.67832226,0 126 | 0.68337166,0 127 | 0.6929089,1 128 | 0.6935117,1 129 | 0.6995606,1 130 | 0.6785876,1 131 | 0.71513397,1 132 | 0.70849097,1 133 | 0.6785905,1 134 | 0.6990297,1 135 | 0.69580394,1 136 | 0.7048814,1 137 | 0.70431197,1 138 | 0.69964224,1 139 | 0.71613973,1 140 | 0.68723106,1 141 | 0.7068194,1 142 | 0.70324796,1 143 | 0.6999868,1 144 | 0.6835666,1 145 | 0.6882658,1 146 | 0.6897285,1 147 | 0.6867706,1 148 | 0.6950878,1 149 | 0.6890944,1 150 | 0.6832132,1 151 | 0.6893712,1 152 | 0.68695635,0 153 | 0.6769266,1 154 | 0.67724234,1 155 | 0.69218665,1 156 | 0.68782,1 157 | 0.68003654,1 158 | 0.6662736,1 159 | 0.7065742,1 160 | 0.6924641,1 161 | 0.70404565,1 162 | 0.69405633,1 163 | 0.69647706,1 164 | 0.6968379,1 165 | 0.70409954,1 166 | 0.70927995,1 167 | 0.7035279,1 168 | 0.6896358,1 169 | 0.70981747,1 170 | 0.708097,1 171 | 0.70420337,1 172 | 0.6855361,1 173 | 0.7060693,1 174 | 0.6970281,1 175 | 0.68898004,1 176 | 0.6915958,1 177 | 0.68051684,1 178 | 0.6802717,1 179 | 0.6977375,1 180 | 0.69596916,1 181 | 0.69463986,1 182 | 0.6859355,1 183 | 
0.69736266,1 184 | 0.67794394,1 185 | 0.6834339,1 186 | 0.68231726,1 187 | 0.679116,1 188 | 0.67664576,1 189 | 0.69196093,1 190 | 0.70145106,1 191 | 0.6910311,1 192 | 0.7016324,1 193 | 0.6877759,1 194 | 0.6784426,1 195 | 0.6880101,1 196 | 0.686705,1 197 | 0.6906308,1 198 | 0.6947281,1 199 | 0.68353415,1 200 | 0.6861998,1 201 | 0.6739447,1 202 | 0.7031076,1 203 | 0.6985299,1 204 | 0.6863333,1 205 | 0.698705,1 206 | 0.6999395,1 207 | 0.6900413,1 208 | 0.6983703,1 209 | 0.68554044,1 210 | 0.7058162,1 211 | 0.69825876,1 212 | 0.7185106,1 213 | 0.69654924,1 214 | 0.7066318,1 215 | 0.67436135,1 216 | 0.6993192,1 217 | 0.701882,1 218 | 0.68422586,1 219 | 0.7076246,1 220 | 0.70349824,1 221 | 0.70224196,1 222 | 0.6927283,1 223 | 0.7149203,1 224 | 0.69117653,1 225 | 0.6845177,1 226 | 0.68366796,1 227 | 0.68444395,1 228 | 0.6839788,1 229 | 0.69070137,1 230 | 0.6730996,1 231 | 0.6835808,1 232 | 0.6920813,1 233 | 0.69068295,1 234 | 0.71776974,1 235 | 0.696033,1 236 | 0.6969535,1 237 | 0.69346917,1 238 | 0.70493823,1 239 | 0.694313,1 240 | 0.69410527,1 241 | 0.690819,1 242 | 0.6850434,1 243 | 0.6865351,1 244 | 0.6847992,1 245 | 0.69896364,1 246 | 0.69706,1 247 | 0.6810052,1 248 | 0.6961558,1 249 | 0.6935128,1 250 | 0.68988377,1 251 | 0.6986633,1 252 | 0.6917824,0 253 | 0.6948701,0 254 | 0.69249964,1 255 | 0.7140803,1 256 | 0.691488,1 257 | 0.7033272,0 258 | 0.6961612,0 259 | 0.6959705,0 260 | 0.7187608,0 261 | 0.7134937,0 262 | 0.7041091,0 263 | 0.6991506,0 264 | 0.69483614,0 265 | 0.7023072,0 266 | 0.6866256,0 267 | 0.6912448,0 268 | 0.7064622,0 269 | 0.7089082,0 270 | 0.71020377,0 271 | 0.69656825,0 272 | 0.695894,0 273 | 0.70163023,0 274 | 0.69867533,0 275 | 0.7006619,0 276 | 0.6933965,0 277 | 0.69665927,0 278 | 0.71998525,0 279 | 0.7030432,0 280 | 0.7026851,0 281 | 0.6977576,0 282 | 0.69577,0 283 | 0.70329225,0 284 | 0.7132834,0 285 | 0.7042451,0 286 | 0.71223193,0 287 | 0.7051904,0 288 | 0.7004936,0 289 | 0.71142215,0 290 | 0.69423854,0 291 | 0.70141363,0 292 | 0.6972768,0 
293 | 0.71343553,0 294 | 0.69884205,0 295 | 0.6937603,0 296 | 0.7143543,0 297 | 0.71756446,0 298 | 0.7160784,0 299 | 0.7082322,0 300 | 0.71489877,0 301 | 0.70851016,0 302 | 0.7051393,0 303 | 0.6986847,0 304 | 0.70915806,0 305 | 0.71730465,0 306 | 0.7089006,0 307 | 0.7146661,0 308 | 0.69674325,0 309 | 0.7121306,0 310 | 0.72314304,0 311 | 0.6972754,0 312 | 0.71940726,0 313 | 0.7235526,0 314 | 0.7076438,0 315 | 0.7084509,0 316 | 0.6960751,0 317 | 0.7041254,0 318 | 0.7170229,0 319 | 0.7126704,0 320 | 0.7132227,0 321 | 0.7055848,0 322 | 0.70595324,0 323 | 0.7167878,0 324 | 0.7167774,0 325 | 0.6992185,0 326 | 0.71809065,0 327 | 0.693193,0 328 | 0.70327604,0 329 | 0.6941078,0 330 | 0.7071695,0 331 | 0.69471455,0 332 | 0.69951606,1 333 | 0.7165646,1 334 | 0.7104117,1 335 | 0.7030631,1 336 | 0.70847946,1 337 | 0.69933844,1 338 | 0.704613,1 339 | 0.69901896,1 340 | 0.7274786,1 341 | 0.7047987,1 342 | 0.723433,1 343 | 0.7020554,1 344 | 0.70066094,1 345 | 0.69902813,1 346 | 0.71057576,1 347 | 0.7014834,1 348 | 0.72011584,1 349 | 0.69700766,1 350 | 0.7037027,1 351 | 0.70031154,1 352 | 0.7079717,1 353 | 0.6831587,1 354 | 0.6953689,1 355 | 0.70450455,1 356 | 0.7070943,1 357 | 0.692307,1 358 | 0.69443125,1 359 | 0.68628883,1 360 | 0.6808965,1 361 | 0.7118518,1 362 | 0.7165481,1 363 | 0.6904777,1 364 | 0.7118194,1 365 | 0.69671655,1 366 | 0.70481086,1 367 | 0.69655746,1 368 | 0.7108649,1 369 | 0.6979078,1 370 | 0.7112396,1 371 | 0.7062162,1 372 | 0.7023715,1 373 | 0.70097697,1 374 | 0.72463965,1 375 | 0.7193123,1 376 | 0.71191585,1 377 | 0.6903316,1 378 | 0.68477476,1 379 | 0.6995062,1 380 | 0.70195717,1 381 | 0.6979962,1 382 | 0.6956881,1 383 | 0.690232,1 384 | 0.71032417,1 385 | 0.7161655,1 386 | 0.7010443,1 387 | 0.7141672,1 388 | 0.713406,1 389 | 0.6967311,1 390 | 0.70521533,1 391 | 0.7034729,1 392 | 0.70787835,1 393 | 0.70595044,1 394 | 0.6938428,1 395 | 0.7116764,1 396 | 0.70337856,1 397 | 0.7150278,1 398 | 0.69851536,1 399 | 0.699001,1 400 | 0.7212655,1 401 | 0.70155317,1 
402 | 0.68237644,1 403 | 0.6969049,1 404 | 0.7079494,1 405 | 0.6995392,1 406 | 0.7092527,1 407 | 0.7023766,0 408 | 0.7234284,1 409 | 0.698527,1 410 | 0.7148522,1 411 | 0.70538306,1 412 | 0.69931906,1 413 | 0.71545756,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_C_10981_0.5_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.013374001,0 3 | 0.0010137856,0 4 | 0.0010749996,0 5 | 0.00864616,0 6 | 0.0046549737,0 7 | 0.0032093227,0 8 | 0.00065493584,0 9 | 0.0015570521,0 10 | 0.0013408363,0 11 | 0.002058804,0 12 | 0.0035028756,0 13 | 0.0029934049,0 14 | 0.0014412701,0 15 | 0.0012920499,0 16 | 0.003632456,0 17 | 0.018010825,0 18 | 0.0025951862,0 19 | 0.0060855746,0 20 | 0.0012007356,0 21 | 0.00094744563,0 22 | 0.0052491426,0 23 | 0.0018177629,0 24 | 0.0035856962,0 25 | 0.0012126863,0 26 | 0.0007674694,0 27 | 0.001963675,0 28 | 0.002251625,0 29 | 0.0031457245,0 30 | 0.00223279,0 31 | 0.0012711287,0 32 | 0.0008087456,0 33 | 0.0031868517,0 34 | 0.0013155043,0 35 | 0.0011141598,0 36 | 0.027820766,0 37 | 0.003894508,0 38 | 0.0010700226,0 39 | 0.0012925565,0 40 | 0.0041888356,0 41 | 0.010000467,0 42 | 0.001232624,0 43 | 0.0009920001,0 44 | 0.0014898777,0 45 | 0.0019069016,0 46 | 0.0010989308,0 47 | 0.0017666519,0 48 | 0.0015288889,0 49 | 0.0021098256,0 50 | 0.0016364157,0 51 | 0.0017783642,0 52 | 0.02078852,0 53 | 0.003227055,0 54 | 0.0020849109,0 55 | 0.0009442866,0 56 | 0.0033294857,0 57 | 0.0025802553,0 58 | 0.0026740432,0 59 | 0.021084577,0 60 | 0.0013195276,0 61 | 0.0028672218,0 62 | 0.0023718476,0 63 | 0.0007405281,0 64 | 0.0013415217,0 65 | 0.004620254,0 66 | 0.009732544,0 67 | 0.021231145,0 68 | 0.0025379062,0 69 | 0.0045723617,0 70 | 0.007604778,0 71 | 0.0036344528,0 72 | 0.0043341815,0 73 | 0.05151254,0 74 | 0.0014137328,0 75 | 0.011022866,0 76 | 0.00191167,0 77 | 0.012188435,0 78 | 
0.016408741,0 79 | 0.0028091967,0 80 | 0.0032361746,0 81 | 0.001260519,0 82 | 0.0062453747,0 83 | 0.01527676,0 84 | 0.0014061928,0 85 | 0.00184834,0 86 | 0.0013042688,0 87 | 0.0019603372,0 88 | 0.0023638606,0 89 | 0.0010814667,0 90 | 0.02109459,0 91 | 0.017033398,0 92 | 0.0043922663,0 93 | 0.0019480884,0 94 | 0.001463443,0 95 | 0.0008482635,0 96 | 0.007019937,0 97 | 0.0018702447,0 98 | 0.04145664,0 99 | 0.0020953715,0 100 | 0.0013412535,0 101 | 0.0033457875,0 102 | 0.01447314,0 103 | 0.0009613633,0 104 | 0.0069155097,0 105 | 0.0009076297,0 106 | 0.005501598,0 107 | 0.0011555552,0 108 | 0.0019935071,0 109 | 0.0047958195,0 110 | 0.016773283,0 111 | 0.0034577847,0 112 | 0.0011916757,0 113 | 0.000965327,0 114 | 0.0354726,0 115 | 0.0018634796,0 116 | 0.0019402206,0 117 | 0.006754726,0 118 | 0.22692302,0 119 | 0.0034950972,0 120 | 0.0025161505,0 121 | 0.001621902,0 122 | 0.005983621,0 123 | 0.0011722445,0 124 | 0.002812624,0 125 | 0.0019788742,0 126 | 0.0016035438,0 127 | 0.99930286,1 128 | 0.9993174,1 129 | 0.9994426,1 130 | 0.99652565,1 131 | 0.9998228,1 132 | 0.9997086,1 133 | 0.9968391,1 134 | 0.9996251,1 135 | 0.9997549,1 136 | 0.9986494,1 137 | 0.99938035,1 138 | 0.99979746,1 139 | 0.99880993,1 140 | 0.9995202,1 141 | 0.99955714,1 142 | 0.9980508,1 143 | 0.9998238,1 144 | 0.9991691,1 145 | 0.99941933,1 146 | 0.99962115,1 147 | 0.99932015,1 148 | 0.99919665,1 149 | 0.99902964,1 150 | 0.9979528,1 151 | 0.99594426,1 152 | 0.0030746758,0 153 | 0.9989547,1 154 | 0.99929285,1 155 | 0.96834505,1 156 | 0.9996855,1 157 | 0.9991462,1 158 | 0.5761992,1 159 | 0.98978865,1 160 | 0.9973395,1 161 | 0.99700224,1 162 | 0.9972371,1 163 | 0.9972186,1 164 | 0.9499519,1 165 | 0.9987862,1 166 | 0.9984224,1 167 | 0.9948021,1 168 | 0.99918646,1 169 | 0.9996495,1 170 | 0.99812424,1 171 | 0.9995382,1 172 | 0.9848944,1 173 | 0.9981368,1 174 | 0.99838716,1 175 | 0.9990085,1 176 | 0.9969578,1 177 | 0.1180464,1 178 | 0.99544907,1 179 | 0.9988357,1 180 | 0.99366784,1 181 | 0.9973509,1 182 | 
0.9993168,1 183 | 0.99938565,1 184 | 0.99948907,1 185 | 0.99386245,1 186 | 0.99963325,1 187 | 0.9997264,1 188 | 0.99985176,1 189 | 0.9997692,1 190 | 0.9994069,1 191 | 0.9991691,1 192 | 0.9997622,1 193 | 0.99913466,1 194 | 0.9992589,1 195 | 0.9990792,1 196 | 0.99945116,1 197 | 0.99838996,1 198 | 0.9995759,1 199 | 0.9993506,1 200 | 0.9988034,1 201 | 0.9993173,1 202 | 0.9998132,1 203 | 0.9982586,1 204 | 0.99968773,1 205 | 0.9989176,1 206 | 0.9986894,1 207 | 0.99758065,1 208 | 0.9997513,1 209 | 0.111174285,1 210 | 0.9994716,1 211 | 0.9996898,1 212 | 0.9997553,1 213 | 0.9961797,1 214 | 0.99943864,1 215 | 0.9994024,1 216 | 0.99971986,1 217 | 0.9994689,1 218 | 0.9978728,1 219 | 0.93965614,1 220 | 0.68972355,1 221 | 0.99931055,1 222 | 0.99947786,1 223 | 0.9993998,1 224 | 0.9997549,1 225 | 0.9996019,1 226 | 0.9990193,1 227 | 0.9993793,1 228 | 0.99883544,1 229 | 0.9997801,1 230 | 0.99904907,1 231 | 0.9997012,1 232 | 0.9992241,1 233 | 0.9981923,1 234 | 0.99888027,1 235 | 0.99919647,1 236 | 0.9995352,1 237 | 0.99464214,1 238 | 0.9981538,1 239 | 0.9992368,1 240 | 0.9994912,1 241 | 0.9989663,1 242 | 0.9159327,1 243 | 0.99140406,1 244 | 0.9997953,1 245 | 0.99941754,1 246 | 0.9828094,1 247 | 0.99864453,1 248 | 0.98884225,1 249 | 0.99973273,1 250 | 0.9977404,1 251 | 0.99971044,1 252 | 0.19870126,0 253 | 0.03679657,0 254 | 0.9981549,1 255 | 0.9992104,1 256 | 0.99830306,1 257 | 0.001760304,0 258 | 0.014457643,0 259 | 0.003979683,0 260 | 0.00379017,0 261 | 0.0017703474,0 262 | 0.0026535988,0 263 | 0.014683783,0 264 | 0.011624128,0 265 | 0.0032987297,0 266 | 0.0031321645,0 267 | 0.0095541775,0 268 | 0.0023303032,0 269 | 0.0072380006,0 270 | 0.01756221,0 271 | 0.0007340312,0 272 | 0.002111137,0 273 | 0.004008293,0 274 | 0.0013486743,0 275 | 0.0009326935,0 276 | 0.0030748844,0 277 | 0.0054294467,0 278 | 0.011447251,0 279 | 0.0053251386,0 280 | 0.0021106005,0 281 | 0.0037835538,0 282 | 0.0018314123,0 283 | 0.004982531,0 284 | 0.0029719174,0 285 | 0.011327684,0 286 | 0.10573703,0 287 | 
0.004457891,0 288 | 0.0054608583,0 289 | 0.0020674765,0 290 | 0.01045984,0 291 | 0.0103600025,0 292 | 0.0075323284,0 293 | 0.0012206733,0 294 | 0.00750798,0 295 | 0.001951158,0 296 | 0.008589834,0 297 | 0.0043160915,0 298 | 0.05282119,0 299 | 0.4291433,0 300 | 0.007525176,0 301 | 0.004467398,0 302 | 0.0014523864,0 303 | 0.008149862,0 304 | 0.013185322,0 305 | 0.0012267232,0 306 | 0.018376768,0 307 | 0.021921605,0 308 | 0.009922415,0 309 | 0.001132071,0 310 | 0.0048155487,0 311 | 0.0009250939,0 312 | 0.0036700964,0 313 | 0.007196456,0 314 | 0.0054932535,0 315 | 0.36024356,0 316 | 0.0060750246,0 317 | 0.020482004,0 318 | 0.08484012,0 319 | 0.0590654,0 320 | 0.0016846359,0 321 | 0.017030358,0 322 | 0.008978009,0 323 | 0.0071922243,0 324 | 0.0017992556,0 325 | 0.0016529262,0 326 | 0.009411335,0 327 | 0.102279186,0 328 | 0.028624862,0 329 | 0.0053718686,0 330 | 0.03389445,0 331 | 0.005106002,0 332 | 0.9995996,1 333 | 0.9980893,1 334 | 0.9986732,1 335 | 0.95276314,1 336 | 0.99522626,1 337 | 0.996301,1 338 | 0.9993449,1 339 | 0.9996685,1 340 | 0.99719787,1 341 | 0.9985757,1 342 | 0.99969274,1 343 | 0.99817,1 344 | 0.05557555,1 345 | 0.9784421,1 346 | 0.99780107,1 347 | 0.94932604,1 348 | 0.9986552,1 349 | 0.9981569,1 350 | 0.9971129,1 351 | 0.99978435,1 352 | 0.9937814,1 353 | 0.9959169,1 354 | 0.96179754,1 355 | 0.9969908,1 356 | 0.98869604,1 357 | 0.98884714,1 358 | 0.0039734244,1 359 | 0.998657,1 360 | 0.9950249,1 361 | 0.9995556,1 362 | 0.99852395,1 363 | 0.9891635,1 364 | 0.99961364,1 365 | 0.99729395,1 366 | 0.9966003,1 367 | 0.998958,1 368 | 0.9981513,1 369 | 0.9804191,1 370 | 0.99835086,1 371 | 0.9954897,1 372 | 0.9989381,1 373 | 0.9628703,1 374 | 0.9975141,1 375 | 0.9970557,1 376 | 0.9994136,1 377 | 0.9997052,1 378 | 0.9978205,1 379 | 0.99942803,1 380 | 0.9278972,1 381 | 0.9894031,1 382 | 0.9994395,1 383 | 0.9965949,1 384 | 0.99685144,1 385 | 0.19563031,1 386 | 0.99928105,1 387 | 0.9972477,1 388 | 0.99955773,1 389 | 0.99972653,1 390 | 0.99949324,1 391 | 
0.9937149,1 392 | 0.9939196,1 393 | 0.99955034,1 394 | 0.993284,1 395 | 0.999261,1 396 | 0.99845374,1 397 | 0.9977756,1 398 | 0.9995555,1 399 | 0.9994869,1 400 | 0.9852623,1 401 | 0.9994252,1 402 | 0.03175941,1 403 | 0.99916774,1 404 | 0.9995626,1 405 | 0.9993237,1 406 | 0.9983026,1 407 | 0.021528184,0 408 | 0.98458683,1 409 | 0.99866617,1 410 | 0.99089915,1 411 | 0.99942714,1 412 | 0.99779224,1 413 | 0.9955529,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_C_10981_1.5_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.0009711981,0 3 | 0.00058308244,0 4 | 0.001124084,0 5 | 0.005188614,0 6 | 0.028678149,0 7 | 0.00247854,0 8 | 0.002380252,0 9 | 0.021045148,0 10 | 0.00060096383,0 11 | 0.00086671114,0 12 | 0.002033323,0 13 | 0.00068998337,0 14 | 0.0003630519,0 15 | 0.00083336234,0 16 | 0.043345004,0 17 | 0.43168405,0 18 | 0.0016153753,0 19 | 0.0016118586,0 20 | 0.00043717027,0 21 | 0.0011144578,0 22 | 0.018398046,0 23 | 0.0011036098,0 24 | 0.00093752146,0 25 | 0.00043469667,0 26 | 0.00031811,0 27 | 0.0007862747,0 28 | 0.0010377169,0 29 | 0.0009063184,0 30 | 0.0010533631,0 31 | 0.00080788136,0 32 | 0.0006378889,0 33 | 0.00050130486,0 34 | 0.00395599,0 35 | 0.0023857951,0 36 | 0.0050435066,0 37 | 0.000792712,0 38 | 0.0023774803,0 39 | 0.00063693523,0 40 | 0.00070914626,0 41 | 0.002126217,0 42 | 0.0009202957,0 43 | 0.0017364621,0 44 | 0.002512008,0 45 | 0.0013025403,0 46 | 0.0015153289,0 47 | 0.0018045604,0 48 | 0.00079369545,0 49 | 0.001190126,0 50 | 0.0033880174,0 51 | 0.00039169192,0 52 | 0.0036310852,0 53 | 0.0035902858,0 54 | 0.00075244904,0 55 | 0.0006482899,0 56 | 0.0016233623,0 57 | 0.0012896359,0 58 | 0.005924523,0 59 | 0.083363295,0 60 | 0.0011655986,0 61 | 0.018218964,0 62 | 0.012401462,0 63 | 0.009561628,0 64 | 0.006485224,0 65 | 0.34583843,0 66 | 0.0019258559,0 67 | 0.06033945,0 
68 | 0.02272892,0 69 | 0.0034036338,0 70 | 0.37228918,0 71 | 0.037317783,0 72 | 0.01843822,0 73 | 0.030632943,0 74 | 0.009070039,0 75 | 0.001752317,0 76 | 0.005250782,0 77 | 0.09701255,0 78 | 0.0055588186,0 79 | 0.0029674768,0 80 | 0.0013380349,0 81 | 0.0008096397,0 82 | 0.0003913939,0 83 | 0.09544745,0 84 | 0.0019350052,0 85 | 0.0016887188,0 86 | 0.002180785,0 87 | 0.025429904,0 88 | 0.012505293,0 89 | 0.0029800832,0 90 | 0.028616965,0 91 | 0.20621759,0 92 | 0.100298226,0 93 | 0.009139895,0 94 | 0.0014582276,0 95 | 0.00060752034,0 96 | 0.0018763244,0 97 | 0.0004620552,0 98 | 0.17021266,0 99 | 0.003942907,0 100 | 0.0017300546,0 101 | 0.0007349849,0 102 | 0.0015960932,0 103 | 0.0019681752,0 104 | 0.13900465,0 105 | 0.0016764104,0 106 | 0.0029539168,0 107 | 0.0004518628,0 108 | 0.007417798,0 109 | 0.0010567307,0 110 | 0.030727655,0 111 | 0.0044939816,0 112 | 0.00051888824,0 113 | 0.00032922626,0 114 | 0.04323256,0 115 | 0.0024155378,0 116 | 0.000651747,0 117 | 0.0045479834,0 118 | 0.3710159,0 119 | 0.0018381476,0 120 | 0.0033464432,0 121 | 0.0005722046,0 122 | 0.10930884,0 123 | 0.00036993623,0 124 | 0.0021786988,0 125 | 0.0023053885,0 126 | 0.0036679804,0 127 | 0.998983,1 128 | 0.9994869,1 129 | 0.9995954,1 130 | 0.9995183,1 131 | 0.99971867,1 132 | 0.99955964,1 133 | 0.99946785,1 134 | 0.9997159,1 135 | 0.9997752,1 136 | 0.9993333,1 137 | 0.99938136,1 138 | 0.9998398,1 139 | 0.9994059,1 140 | 0.99965835,1 141 | 0.99836326,1 142 | 0.9998578,1 143 | 0.99960876,1 144 | 0.99989295,1 145 | 0.99987876,1 146 | 0.99975663,1 147 | 0.99954474,1 148 | 0.9996491,1 149 | 0.9995898,1 150 | 0.9977859,1 151 | 0.99527884,1 152 | 0.017733902,0 153 | 0.99896485,1 154 | 0.9975101,1 155 | 0.9981811,1 156 | 0.9987252,1 157 | 0.99950016,1 158 | 0.8276683,1 159 | 0.99949944,1 160 | 0.9997481,1 161 | 0.9971201,1 162 | 0.9992579,1 163 | 0.9972835,1 164 | 0.82960105,1 165 | 0.99648833,1 166 | 0.9983886,1 167 | 0.99506843,1 168 | 0.9976988,1 169 | 0.9993032,1 170 | 0.9981682,1 171 | 
0.9964988,1 172 | 0.9830858,1 173 | 0.9987562,1 174 | 0.9997226,1 175 | 0.9835362,1 176 | 0.9559078,1 177 | 0.7371024,1 178 | 0.9922241,1 179 | 0.9989693,1 180 | 0.961841,1 181 | 0.99347186,1 182 | 0.9995644,1 183 | 0.99978155,1 184 | 0.99883807,1 185 | 0.9993583,1 186 | 0.9997475,1 187 | 0.999742,1 188 | 0.9994122,1 189 | 0.99959135,1 190 | 0.9995316,1 191 | 0.99857354,1 192 | 0.9988533,1 193 | 0.999463,1 194 | 0.99966437,1 195 | 0.9982636,1 196 | 0.9995049,1 197 | 0.999397,1 198 | 0.9997566,1 199 | 0.9990437,1 200 | 0.9997946,1 201 | 0.99963945,1 202 | 0.9953328,1 203 | 0.9949085,1 204 | 0.99985147,1 205 | 0.99919355,1 206 | 0.99737597,1 207 | 0.99473506,1 208 | 0.99954593,1 209 | 0.9164518,1 210 | 0.99660456,1 211 | 0.9997114,1 212 | 0.99962425,1 213 | 0.997235,1 214 | 0.99610066,1 215 | 0.9982159,1 216 | 0.99952406,1 217 | 0.9994669,1 218 | 0.9988846,1 219 | 0.97190046,1 220 | 0.90608466,1 221 | 0.9996344,1 222 | 0.99968237,1 223 | 0.99964297,1 224 | 0.99975073,1 225 | 0.9997947,1 226 | 0.9996619,1 227 | 0.99927276,1 228 | 0.999313,1 229 | 0.99883616,1 230 | 0.99952775,1 231 | 0.9930715,1 232 | 0.9998286,1 233 | 0.9996666,1 234 | 0.999221,1 235 | 0.9997059,1 236 | 0.99983305,1 237 | 0.9987016,1 238 | 0.9998578,1 239 | 0.99959254,1 240 | 0.99972844,1 241 | 0.9992188,1 242 | 0.95663774,1 243 | 0.9985225,1 244 | 0.9998753,1 245 | 0.9994545,1 246 | 0.9904078,1 247 | 0.9994972,1 248 | 0.99332637,1 249 | 0.999431,1 250 | 0.99890697,1 251 | 0.99980485,1 252 | 0.24267074,0 253 | 0.22277689,0 254 | 0.9784062,1 255 | 0.99870306,1 256 | 0.9969815,1 257 | 0.004519373,0 258 | 0.13163775,0 259 | 0.026851475,0 260 | 0.004463345,0 261 | 0.0016537309,0 262 | 0.002191037,0 263 | 0.028500259,0 264 | 0.0005222857,0 265 | 0.0006003082,0 266 | 0.004601717,0 267 | 0.0011081696,0 268 | 0.0030302703,0 269 | 0.0042490065,0 270 | 0.041440874,0 271 | 0.004099667,0 272 | 0.010529548,0 273 | 0.010041505,0 274 | 0.0023169518,0 275 | 0.012170643,0 276 | 0.0083013475,0 277 | 0.01158002,0 278 | 
0.002157867,0 279 | 0.0030227602,0 280 | 0.013426244,0 281 | 0.012784094,0 282 | 0.01504752,0 283 | 0.015563428,0 284 | 0.0020652711,0 285 | 0.0046711564,0 286 | 0.05138001,0 287 | 0.0020730793,0 288 | 0.007373929,0 289 | 0.001327455,0 290 | 0.061686426,0 291 | 0.010882318,0 292 | 0.005015433,0 293 | 0.0020422041,0 294 | 0.0076985657,0 295 | 0.0082035065,0 296 | 0.017854989,0 297 | 0.39747682,0 298 | 0.02294606,0 299 | 0.16294375,0 300 | 0.0009850264,0 301 | 0.039109856,0 302 | 0.0015825629,0 303 | 0.0055568516,0 304 | 0.013944089,0 305 | 0.009864509,0 306 | 0.0070754886,0 307 | 0.045043766,0 308 | 0.44734982,0 309 | 0.004386306,0 310 | 0.108201504,0 311 | 0.009330153,0 312 | 0.061425596,0 313 | 0.010903448,0 314 | 0.01709953,0 315 | 0.013614833,0 316 | 0.030522197,0 317 | 0.09586197,0 318 | 0.018266559,0 319 | 0.01745236,0 320 | 0.0016918778,0 321 | 0.0027058125,0 322 | 0.011981577,0 323 | 0.034855396,0 324 | 0.06080854,0 325 | 0.0009908676,0 326 | 0.031225294,0 327 | 0.29054588,0 328 | 0.0080755055,0 329 | 0.003462255,0 330 | 0.18079454,0 331 | 0.0136111975,0 332 | 0.99800324,1 333 | 0.99975616,1 334 | 0.9982927,1 335 | 0.99666727,1 336 | 0.9947259,1 337 | 0.9985585,1 338 | 0.9993504,1 339 | 0.99969876,1 340 | 0.9991883,1 341 | 0.9990752,1 342 | 0.99978065,1 343 | 0.9991014,1 344 | 0.97761446,1 345 | 0.99356264,1 346 | 0.9983366,1 347 | 0.9737908,1 348 | 0.9997125,1 349 | 0.99332845,1 350 | 0.99887866,1 351 | 0.9987287,1 352 | 0.99119663,1 353 | 0.9979181,1 354 | 0.99076325,1 355 | 0.9993992,1 356 | 0.99871576,1 357 | 0.7590035,1 358 | 0.7138873,1 359 | 0.9993665,1 360 | 0.99936795,1 361 | 0.9991797,1 362 | 0.9850665,1 363 | 0.7589284,1 364 | 0.99966943,1 365 | 0.99966025,1 366 | 0.9994334,1 367 | 0.9978104,1 368 | 0.99874187,1 369 | 0.992272,1 370 | 0.98916376,1 371 | 0.99890685,1 372 | 0.9996155,1 373 | 0.9720841,1 374 | 0.9959844,1 375 | 0.9975829,1 376 | 0.99964154,1 377 | 0.999786,1 378 | 0.99975264,1 379 | 0.9769808,1 380 | 0.9833561,1 381 | 0.9994643,1 382 
| 0.99964046,1 383 | 0.9977553,1 384 | 0.9972346,1 385 | 0.830151,1 386 | 0.99922436,1 387 | 0.99666846,1 388 | 0.9983842,1 389 | 0.99966574,1 390 | 0.9901128,1 391 | 0.9787309,1 392 | 0.9894835,1 393 | 0.9993695,1 394 | 0.99963874,1 395 | 0.9998624,1 396 | 0.999707,1 397 | 0.99875486,1 398 | 0.99966294,1 399 | 0.9994147,1 400 | 0.7940718,1 401 | 0.9816929,1 402 | 0.73489213,1 403 | 0.9995686,1 404 | 0.9995367,1 405 | 0.99960625,1 406 | 0.9972786,1 407 | 0.008070469,0 408 | 0.9980122,1 409 | 0.9990788,1 410 | 0.99450064,1 411 | 0.99928033,1 412 | 0.99706906,1 413 | 0.99770665,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_C_10981_1_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.004943788,0 3 | 0.00085407495,0 4 | 0.0011693537,0 5 | 0.01366353,0 6 | 0.0021578372,0 7 | 0.0006032586,0 8 | 0.0039269924,0 9 | 0.0024264455,0 10 | 0.0011650026,0 11 | 0.0011419058,0 12 | 0.0007427335,0 13 | 0.00055888295,0 14 | 0.0006506145,0 15 | 0.0008160472,0 16 | 0.0018242896,0 17 | 0.01501593,0 18 | 0.001047343,0 19 | 0.0016092956,0 20 | 0.00045070052,0 21 | 0.0005815029,0 22 | 0.0027697384,0 23 | 0.0009787381,0 24 | 0.0010237396,0 25 | 0.0012221932,0 26 | 0.00037014484,0 27 | 0.00043302774,0 28 | 0.0013627112,0 29 | 0.00084635615,0 30 | 0.00047165155,0 31 | 0.0007748306,0 32 | 0.0017157197,0 33 | 0.007131487,0 34 | 0.00050112605,0 35 | 0.0005492568,0 36 | 0.0070627034,0 37 | 0.00080630183,0 38 | 0.0006393194,0 39 | 0.00092723966,0 40 | 0.00056380033,0 41 | 0.0021761358,0 42 | 0.00083866715,0 43 | 0.00041398406,0 44 | 0.00056806207,0 45 | 0.0013116002,0 46 | 0.001075238,0 47 | 0.0015493333,0 48 | 0.000408113,0 49 | 0.0006880164,0 50 | 0.0017605126,0 51 | 0.00046947598,0 52 | 0.18849781,0 53 | 0.0038850307,0 54 | 0.0005259514,0 55 | 0.0009588897,0 56 | 0.0007312894,0 57 | 0.00097590685,0 58 | 
0.002408266,0 59 | 0.012484372,0 60 | 0.002881527,0 61 | 0.0055641234,0 62 | 0.0028600097,0 63 | 0.0019322634,0 64 | 0.0053616464,0 65 | 0.015483081,0 66 | 0.0021419525,0 67 | 0.011961281,0 68 | 0.0070286095,0 69 | 0.004161775,0 70 | 0.11164096,0 71 | 0.0022634268,0 72 | 0.0023635924,0 73 | 0.035384953,0 74 | 0.001424849,0 75 | 0.0019298196,0 76 | 0.00084626675,0 77 | 0.079266965,0 78 | 0.00631243,0 79 | 0.0016098917,0 80 | 0.0013759136,0 81 | 0.0040921867,0 82 | 0.0033737123,0 83 | 0.0077733397,0 84 | 0.0029361844,0 85 | 0.0019219816,0 86 | 0.0006800592,0 87 | 0.0153671205,0 88 | 0.00066152215,0 89 | 0.00334993,0 90 | 0.0158782,0 91 | 0.0065170527,0 92 | 0.015909016,0 93 | 0.0061148107,0 94 | 0.0010205507,0 95 | 0.0016627908,0 96 | 0.0045859218,0 97 | 0.0009239614,0 98 | 0.022403717,0 99 | 0.0028616786,0 100 | 0.00071310997,0 101 | 0.0018651485,0 102 | 0.0013507307,0 103 | 0.00076541305,0 104 | 0.003978342,0 105 | 0.002087444,0 106 | 0.00657019,0 107 | 0.0011055171,0 108 | 0.0053173006,0 109 | 0.0014086962,0 110 | 0.006438583,0 111 | 0.0021323264,0 112 | 0.00050091743,0 113 | 0.00037795305,0 114 | 0.08797908,0 115 | 0.002850026,0 116 | 0.00071278214,0 117 | 0.0013558567,0 118 | 0.7397268,0 119 | 0.0011234581,0 120 | 0.0015200973,0 121 | 0.0003646016,0 122 | 0.009744525,0 123 | 0.00029027462,0 124 | 0.00076800585,0 125 | 0.0011367202,0 126 | 0.00084510446,0 127 | 0.9993525,1 128 | 0.99618757,1 129 | 0.9990231,1 130 | 0.998346,1 131 | 0.99755,1 132 | 0.9964402,1 133 | 0.9994298,1 134 | 0.9978541,1 135 | 0.99984264,1 136 | 0.99609005,1 137 | 0.99763894,1 138 | 0.99867743,1 139 | 0.9922245,1 140 | 0.9997434,1 141 | 0.99698174,1 142 | 0.9998919,1 143 | 0.9994672,1 144 | 0.99990296,1 145 | 0.99985915,1 146 | 0.99981105,1 147 | 0.9992572,1 148 | 0.99944425,1 149 | 0.9940305,1 150 | 0.9991379,1 151 | 0.9801915,1 152 | 0.002384007,0 153 | 0.999786,1 154 | 0.9989308,1 155 | 0.8495718,1 156 | 0.999323,1 157 | 0.9992128,1 158 | 0.050292253,1 159 | 0.99620324,1 160 | 
0.98677945,1 161 | 0.97044873,1 162 | 0.99006474,1 163 | 0.99026585,1 164 | 0.9281974,1 165 | 0.9906502,1 166 | 0.9970399,1 167 | 0.9708275,1 168 | 0.9980726,1 169 | 0.9976048,1 170 | 0.99780804,1 171 | 0.9969516,1 172 | 0.9715183,1 173 | 0.9957755,1 174 | 0.99863154,1 175 | 0.99865115,1 176 | 0.9640074,1 177 | 0.4555641,1 178 | 0.9559907,1 179 | 0.9975862,1 180 | 0.99693656,1 181 | 0.98892856,1 182 | 0.99928534,1 183 | 0.99701416,1 184 | 0.998932,1 185 | 0.9993087,1 186 | 0.9998467,1 187 | 0.9999405,1 188 | 0.9987042,1 189 | 0.99818766,1 190 | 0.9987332,1 191 | 0.9944072,1 192 | 0.99152434,1 193 | 0.9992347,1 194 | 0.99987113,1 195 | 0.99642414,1 196 | 0.999341,1 197 | 0.999053,1 198 | 0.99877334,1 199 | 0.999636,1 200 | 0.9997561,1 201 | 0.99986625,1 202 | 0.99767864,1 203 | 0.9931193,1 204 | 0.9990434,1 205 | 0.98695314,1 206 | 0.9975201,1 207 | 0.9982635,1 208 | 0.9996923,1 209 | 0.13107386,1 210 | 0.9985314,1 211 | 0.99880946,1 212 | 0.9993538,1 213 | 0.9982444,1 214 | 0.9916438,1 215 | 0.9991337,1 216 | 0.99935055,1 217 | 0.9984828,1 218 | 0.9985131,1 219 | 0.98769,1 220 | 0.8453206,1 221 | 0.99824154,1 222 | 0.9995911,1 223 | 0.99865687,1 224 | 0.9994216,1 225 | 0.9996749,1 226 | 0.99985015,1 227 | 0.9998007,1 228 | 0.99902225,1 229 | 0.996917,1 230 | 0.9992697,1 231 | 0.9958502,1 232 | 0.9993943,1 233 | 0.99930394,1 234 | 0.9920551,1 235 | 0.9985242,1 236 | 0.9981883,1 237 | 0.99817944,1 238 | 0.99976635,1 239 | 0.9980032,1 240 | 0.99478537,1 241 | 0.9928818,1 242 | 0.98483837,1 243 | 0.9844973,1 244 | 0.9990047,1 245 | 0.9978211,1 246 | 0.98671407,1 247 | 0.9993399,1 248 | 0.9864326,1 249 | 0.9978619,1 250 | 0.99649256,1 251 | 0.99872285,1 252 | 0.09153187,0 253 | 0.007632941,0 254 | 0.961166,1 255 | 0.9795415,1 256 | 0.9796772,1 257 | 0.0009837747,0 258 | 0.0019720197,0 259 | 0.0037468374,0 260 | 0.0021073818,0 261 | 0.0006375313,0 262 | 0.007059157,0 263 | 0.0044200122,0 264 | 0.0030762851,0 265 | 0.0010746717,0 266 | 0.007844478,0 267 | 0.0018922389,0 
268 | 0.00058302283,0 269 | 0.00048971176,0 270 | 0.013150245,0 271 | 0.0014609396,0 272 | 0.0009133518,0 273 | 0.006932944,0 274 | 0.0015151203,0 275 | 0.00045919418,0 276 | 0.0012466013,0 277 | 0.010174125,0 278 | 0.017240644,0 279 | 0.0032975376,0 280 | 0.0014509857,0 281 | 0.0068438053,0 282 | 0.012311816,0 283 | 0.0014663935,0 284 | 0.0011547804,0 285 | 0.0023887157,0 286 | 0.66902924,0 287 | 0.0008607507,0 288 | 0.011066616,0 289 | 0.0025726855,0 290 | 0.0132475495,0 291 | 0.006929666,0 292 | 0.0017021,0 293 | 0.0060649514,0 294 | 0.0017452538,0 295 | 0.010641277,0 296 | 0.0029860735,0 297 | 0.0825395,0 298 | 0.47845858,0 299 | 0.24182034,0 300 | 0.0009327829,0 301 | 0.03369513,0 302 | 0.0010532439,0 303 | 0.0028850138,0 304 | 0.01769355,0 305 | 0.00265041,0 306 | 0.011190534,0 307 | 0.0022771358,0 308 | 0.0021789372,0 309 | 0.0019407868,0 310 | 0.0016225576,0 311 | 0.00064510107,0 312 | 0.0020058155,0 313 | 0.0054410994,0 314 | 0.0035632253,0 315 | 0.56984603,0 316 | 0.006597489,0 317 | 0.028614253,0 318 | 0.026460826,0 319 | 0.010340542,0 320 | 0.005100727,0 321 | 0.0087172985,0 322 | 0.015279859,0 323 | 0.011980742,0 324 | 0.0011380911,0 325 | 0.0009172261,0 326 | 0.0026233494,0 327 | 0.010242641,0 328 | 0.020711482,0 329 | 0.0018496513,0 330 | 0.005771339,0 331 | 0.0029448569,0 332 | 0.97369194,1 333 | 0.99600554,1 334 | 0.9894505,1 335 | 0.98512864,1 336 | 0.9752221,1 337 | 0.97596943,1 338 | 0.9971029,1 339 | 0.9997708,1 340 | 0.9865365,1 341 | 0.9857131,1 342 | 0.99864036,1 343 | 0.9809332,1 344 | 0.90630853,1 345 | 0.9942366,1 346 | 0.9880254,1 347 | 0.9707611,1 348 | 0.99234915,1 349 | 0.9201252,1 350 | 0.9746042,1 351 | 0.98549336,1 352 | 0.9897784,1 353 | 0.9793196,1 354 | 0.9966066,1 355 | 0.9851043,1 356 | 0.961019,1 357 | 0.7300224,1 358 | 0.010942727,1 359 | 0.9977571,1 360 | 0.99772537,1 361 | 0.9939202,1 362 | 0.97666395,1 363 | 0.95374906,1 364 | 0.9948354,1 365 | 0.98877525,1 366 | 0.98798037,1 367 | 0.99683493,1 368 | 0.98018664,1 369 | 
0.9574679,1 370 | 0.9948081,1 371 | 0.9883169,1 372 | 0.9979688,1 373 | 0.81548953,1 374 | 0.98548543,1 375 | 0.9938208,1 376 | 0.9975672,1 377 | 0.9991668,1 378 | 0.9969114,1 379 | 0.97534335,1 380 | 0.9833622,1 381 | 0.99575925,1 382 | 0.9910926,1 383 | 0.980745,1 384 | 0.9573963,1 385 | 0.51766837,1 386 | 0.98956037,1 387 | 0.9824076,1 388 | 0.99576867,1 389 | 0.9958708,1 390 | 0.9740015,1 391 | 0.9748087,1 392 | 0.975862,1 393 | 0.9951755,1 394 | 0.9914843,1 395 | 0.9975083,1 396 | 0.99359083,1 397 | 0.99467087,1 398 | 0.9911108,1 399 | 0.9896956,1 400 | 0.95444095,1 401 | 0.98990095,1 402 | 0.22326344,1 403 | 0.9985768,1 404 | 0.99336934,1 405 | 0.9953866,1 406 | 0.98973227,1 407 | 0.021468312,0 408 | 0.9104846,1 409 | 0.9909626,1 410 | 0.93395424,1 411 | 0.99020237,1 412 | 0.9791813,1 413 | 0.93755484,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_C_10981_2_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.31868044,0 3 | 0.0007534623,0 4 | 0.00087052584,0 5 | 0.01810512,0 6 | 0.0032435656,0 7 | 0.0008109212,0 8 | 0.00094527006,0 9 | 0.0066501796,0 10 | 0.0008072257,0 11 | 0.011835277,0 12 | 0.0008676946,0 13 | 0.0008776188,0 14 | 0.000849396,0 15 | 0.00091427565,0 16 | 0.0009366572,0 17 | 0.9434712,0 18 | 0.00086569786,0 19 | 0.017036945,0 20 | 0.0008431077,0 21 | 0.0009844303,0 22 | 0.0010008514,0 23 | 0.0012655854,0 24 | 0.0008469522,0 25 | 0.0008699298,0 26 | 0.00092446804,0 27 | 0.00088587403,0 28 | 0.0009240806,0 29 | 0.00088462234,0 30 | 0.001987338,0 31 | 0.0008574724,0 32 | 0.0017362237,0 33 | 0.0008915067,0 34 | 0.0009505749,0 35 | 0.0014037788,0 36 | 0.0051903427,0 37 | 0.01874882,0 38 | 0.17051092,0 39 | 0.000833869,0 40 | 0.036693245,0 41 | 0.0071790814,0 42 | 0.0015119612,0 43 | 0.00089111924,0 44 | 0.0009815395,0 45 | 0.0010558367,0 46 | 0.0008801818,0 47 | 
0.0010712743,0 48 | 0.00086161494,0 49 | 0.0010996461,0 50 | 0.0010177493,0 51 | 0.0009469092,0 52 | 0.0011335313,0 53 | 0.05529222,0 54 | 0.0008226335,0 55 | 0.00076666474,0 56 | 0.13202637,0 57 | 0.00091198087,0 58 | 0.023843497,0 59 | 0.033901572,0 60 | 0.33106643,0 61 | 0.31874925,0 62 | 0.08509159,0 63 | 0.085421145,0 64 | 0.00083464384,0 65 | 0.12795866,0 66 | 0.00097087026,0 67 | 0.07656491,0 68 | 0.00088369846,0 69 | 0.0008930564,0 70 | 0.3322796,0 71 | 0.0009084046,0 72 | 0.001335293,0 73 | 0.0013265014,0 74 | 0.00086823106,0 75 | 0.14450192,0 76 | 0.0008691251,0 77 | 0.52233845,0 78 | 0.009144425,0 79 | 0.0008970201,0 80 | 0.015350819,0 81 | 0.0014542043,0 82 | 0.0008761287,0 83 | 0.0009121001,0 84 | 0.00087213516,0 85 | 0.00081318617,0 86 | 0.00091326237,0 87 | 0.00091281533,0 88 | 0.0008201003,0 89 | 0.75409913,0 90 | 0.015977293,0 91 | 0.015490383,0 92 | 0.08215636,0 93 | 0.076836586,0 94 | 0.008574992,0 95 | 0.0008749366,0 96 | 0.002270341,0 97 | 0.0008942783,0 98 | 0.0028199255,0 99 | 0.0009056926,0 100 | 0.00081667304,0 101 | 0.0010192692,0 102 | 0.003620237,0 103 | 0.00090655684,0 104 | 0.0008701682,0 105 | 0.00095799565,0 106 | 0.00094124675,0 107 | 0.00079989433,0 108 | 0.0010875463,0 109 | 0.0009634495,0 110 | 0.054808617,0 111 | 0.002818644,0 112 | 0.0009815395,0 113 | 0.00093227625,0 114 | 0.0038847923,0 115 | 0.014591068,0 116 | 0.0010812283,0 117 | 0.00092563033,0 118 | 0.07237789,0 119 | 0.001001656,0 120 | 0.0010997653,0 121 | 0.0061351955,0 122 | 0.24945569,0 123 | 0.0009595156,0 124 | 0.00088110566,0 125 | 0.00087994337,0 126 | 0.00085040927,0 127 | 0.9980359,1 128 | 0.97632635,1 129 | 0.99990785,1 130 | 0.9905031,1 131 | 0.9998951,1 132 | 0.9997175,1 133 | 0.9997999,1 134 | 0.899884,1 135 | 0.99989474,1 136 | 0.9998204,1 137 | 0.9177494,1 138 | 0.99990237,1 139 | 0.99990696,1 140 | 0.9998276,1 141 | 0.99480784,1 142 | 0.9999053,1 143 | 0.9999061,1 144 | 0.9999037,1 145 | 0.9998985,1 146 | 0.99816376,1 147 | 0.99972117,1 148 | 
0.99389565,1 149 | 0.99989367,1 150 | 0.993915,1 151 | 0.96423876,1 152 | 0.0025874972,0 153 | 0.99966854,1 154 | 0.32238272,1 155 | 0.9950416,1 156 | 0.9900552,1 157 | 0.9973688,1 158 | 0.11657137,1 159 | 0.9996002,1 160 | 0.99985695,1 161 | 0.9969368,1 162 | 0.99976206,1 163 | 0.9998863,1 164 | 0.9216664,1 165 | 0.9999081,1 166 | 0.99988556,1 167 | 0.99988985,1 168 | 0.9998894,1 169 | 0.999897,1 170 | 0.9999031,1 171 | 0.99990857,1 172 | 0.9999,1 173 | 0.99989533,1 174 | 0.99083745,1 175 | 0.99982446,1 176 | 0.9892304,1 177 | 0.29423228,1 178 | 0.9906039,1 179 | 0.99989957,1 180 | 0.767975,1 181 | 0.98897827,1 182 | 0.99989784,1 183 | 0.9998418,1 184 | 0.995383,1 185 | 0.9914353,1 186 | 0.9998677,1 187 | 0.9998236,1 188 | 0.9945072,1 189 | 0.99320924,1 190 | 0.99968314,1 191 | 0.9583733,1 192 | 0.3738829,1 193 | 0.45308536,1 194 | 0.99837554,1 195 | 0.65706575,1 196 | 0.99167717,1 197 | 0.98438585,1 198 | 0.99214685,1 199 | 0.8863166,1 200 | 0.99974525,1 201 | 0.9998411,1 202 | 0.9959507,1 203 | 0.9999008,1 204 | 0.9998976,1 205 | 0.3251947,1 206 | 0.9998426,1 207 | 0.9997674,1 208 | 0.9999038,1 209 | 0.16625705,1 210 | 0.99032485,1 211 | 0.9999076,1 212 | 0.9999002,1 213 | 0.9998909,1 214 | 0.99989676,1 215 | 0.99990714,1 216 | 0.9721373,1 217 | 0.99989545,1 218 | 0.84560144,1 219 | 0.7391087,1 220 | 0.42974445,1 221 | 0.9826047,1 222 | 0.999896,1 223 | 0.99989355,1 224 | 0.99990666,1 225 | 0.99990195,1 226 | 0.9999028,1 227 | 0.96294713,1 228 | 0.99979645,1 229 | 0.9863858,1 230 | 0.9997195,1 231 | 0.9176018,1 232 | 0.99989516,1 233 | 0.99989164,1 234 | 0.9998952,1 235 | 0.999835,1 236 | 0.99965686,1 237 | 0.9849665,1 238 | 0.9999025,1 239 | 0.999817,1 240 | 0.99990076,1 241 | 0.9941778,1 242 | 0.99088156,1 243 | 0.9980968,1 244 | 0.99990296,1 245 | 0.99988186,1 246 | 0.9895926,1 247 | 0.9998903,1 248 | 0.9920114,1 249 | 0.9998951,1 250 | 0.9999006,1 251 | 0.9999008,1 252 | 0.0649437,0 253 | 0.0033558905,0 254 | 0.9999106,1 255 | 0.997502,1 256 | 0.99937844,1 
257 | 0.014715254,0 258 | 0.037831694,0 259 | 0.005732417,0 260 | 0.007854968,0 261 | 0.0009787381,0 262 | 0.0021919906,0 263 | 0.0008224845,0 264 | 0.0008505285,0 265 | 0.018479884,0 266 | 0.00094491243,0 267 | 0.0016820729,0 268 | 0.028788805,0 269 | 0.0025835931,0 270 | 0.37053168,0 271 | 0.0023235977,0 272 | 0.038066566,0 273 | 0.16462162,0 274 | 0.09096351,0 275 | 0.2949729,0 276 | 0.00094765425,0 277 | 0.0009690821,0 278 | 0.012228072,0 279 | 0.022611976,0 280 | 0.0035614073,0 281 | 0.0009782314,0 282 | 0.14337811,0 283 | 0.0009445846,0 284 | 0.0010090172,0 285 | 0.11908546,0 286 | 0.09328601,0 287 | 0.094273925,0 288 | 0.18330082,0 289 | 0.014781386,0 290 | 0.18065289,0 291 | 0.12272775,0 292 | 0.01271078,0 293 | 0.16773164,0 294 | 0.0047434866,0 295 | 0.04228559,0 296 | 0.005897194,0 297 | 0.16308269,0 298 | 0.44607115,0 299 | 0.48219773,0 300 | 0.0009866059,0 301 | 0.2015974,0 302 | 0.006850928,0 303 | 0.0012180209,0 304 | 0.16026205,0 305 | 0.0015356839,0 306 | 0.04299766,0 307 | 0.018803477,0 308 | 0.0048318207,0 309 | 0.020252556,0 310 | 0.09837684,0 311 | 0.012144536,0 312 | 0.15264228,0 313 | 0.019550651,0 314 | 0.057415485,0 315 | 0.9997593,0 316 | 0.013870031,0 317 | 0.016429335,0 318 | 0.12097347,0 319 | 0.0028581321,0 320 | 0.36592913,0 321 | 0.008752316,0 322 | 0.21555558,0 323 | 0.05675578,0 324 | 0.0012824237,0 325 | 0.0009279549,0 326 | 0.04714939,0 327 | 0.07919234,0 328 | 0.108621955,0 329 | 0.01729402,0 330 | 0.19863904,0 331 | 0.006544322,0 332 | 0.39494523,1 333 | 0.9998785,1 334 | 0.9897784,1 335 | 0.99981284,1 336 | 0.99981993,1 337 | 0.9998941,1 338 | 0.9999088,1 339 | 0.9998994,1 340 | 0.999833,1 341 | 0.99989486,1 342 | 0.9998325,1 343 | 0.9946156,1 344 | 0.41573393,1 345 | 0.99944526,1 346 | 0.97824574,1 347 | 0.99289256,1 348 | 0.9998716,1 349 | 0.94439775,1 350 | 0.9580476,1 351 | 0.9880438,1 352 | 0.99973536,1 353 | 0.99113613,1 354 | 0.38130295,1 355 | 0.999887,1 356 | 0.99127007,1 357 | 0.9915633,1 358 | 0.07023659,1 359 | 
0.98934865,1 360 | 0.991363,1 361 | 0.9998466,1 362 | 0.1512619,1 363 | 0.42943192,1 364 | 0.9998728,1 365 | 0.99976426,1 366 | 0.99984694,1 367 | 0.99973273,1 368 | 0.9998903,1 369 | 0.4415183,1 370 | 0.9998468,1 371 | 0.99978054,1 372 | 0.9998939,1 373 | 0.56393915,1 374 | 0.9844638,1 375 | 0.98973894,1 376 | 0.99988496,1 377 | 0.99947405,1 378 | 0.99958706,1 379 | 0.8789463,1 380 | 0.91079664,1 381 | 0.6002038,1 382 | 0.9520062,1 383 | 0.99989045,1 384 | 0.9932778,1 385 | 0.84876555,1 386 | 0.9998418,1 387 | 0.9944835,1 388 | 0.99988914,1 389 | 0.9998443,1 390 | 0.83089566,1 391 | 0.9998729,1 392 | 0.9996768,1 393 | 0.99968576,1 394 | 0.9998775,1 395 | 0.99988836,1 396 | 0.9997585,1 397 | 0.99988997,1 398 | 0.99989337,1 399 | 0.99819565,1 400 | 0.21436483,1 401 | 0.9843521,1 402 | 0.55772674,1 403 | 0.99986786,1 404 | 0.99987674,1 405 | 0.99987894,1 406 | 0.9998193,1 407 | 0.0054287612,0 408 | 0.84989333,1 409 | 0.9942343,1 410 | 0.9907303,1 411 | 0.9998727,1 412 | 0.8572862,1 413 | 0.9987359,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/prosa/xlm_r/result_prosa_yelp_XLM_R_C_10981_3_full.csv: -------------------------------------------------------------------------------- 1 | y_pred,y_true 2 | 0.56989473,0 3 | 0.0046035647,0 4 | 0.006973207,0 5 | 0.00956589,0 6 | 0.010309517,0 7 | 0.0124361515,0 8 | 0.009011209,0 9 | 0.00829795,0 10 | 0.004699737,0 11 | 0.0077689886,0 12 | 0.0058767796,0 13 | 0.009486914,0 14 | 0.009904355,0 15 | 0.005618483,0 16 | 0.0049708486,0 17 | 0.92907774,0 18 | 0.0070020854,0 19 | 0.049595863,0 20 | 0.006057173,0 21 | 0.0031442642,0 22 | 0.024021,0 23 | 0.006847471,0 24 | 0.35860914,0 25 | 0.0063755214,0 26 | 0.0037611723,0 27 | 0.03182292,0 28 | 0.0056716204,0 29 | 0.004248798,0 30 | 0.007141471,0 31 | 0.003151089,0 32 | 0.013613582,0 33 | 0.0058882833,0 34 | 0.003911793,0 35 | 0.010510296,0 36 | 0.026238024,0 37 | 0.023744792,0 38 | 0.061113656,0 39 | 
0.023657143,0 40 | 0.007111639,0 41 | 0.007980108,0 42 | 0.009766191,0 43 | 0.0044822395,0 44 | 0.0029438734,0 45 | 0.00646922,0 46 | 0.008185357,0 47 | 0.0058706105,0 48 | 0.004883975,0 49 | 0.002363056,0 50 | 0.004229605,0 51 | 0.008059591,0 52 | 0.012334287,0 53 | 0.012118906,0 54 | 0.005265683,0 55 | 0.004590243,0 56 | 0.18237662,0 57 | 0.004115492,0 58 | 0.0075672567,0 59 | 0.13051647,0 60 | 0.68169326,0 61 | 0.29688168,0 62 | 0.07665157,0 63 | 0.01908648,0 64 | 0.0053371787,0 65 | 0.10181132,0 66 | 0.0053630173,0 67 | 0.03494236,0 68 | 0.0047740936,0 69 | 0.008783132,0 70 | 0.2604278,0 71 | 0.0038532913,0 72 | 0.006640047,0 73 | 0.005635947,0 74 | 0.0041667223,0 75 | 0.065202355,0 76 | 0.007094145,0 77 | 0.027414858,0 78 | 0.0051695406,0 79 | 0.0065339506,0 80 | 0.0078704655,0 81 | 0.005698353,0 82 | 0.0046871006,0 83 | 0.01564312,0 84 | 0.004480958,0 85 | 0.007024884,0 86 | 0.0055063963,0 87 | 0.0043138564,0 88 | 0.0058146715,0 89 | 0.7413943,0 90 | 0.00460279,0 91 | 0.04624152,0 92 | 0.06246215,0 93 | 0.08727607,0 94 | 0.013625383,0 95 | 0.009331703,0 96 | 0.003403455,0 97 | 0.0032598376,0 98 | 0.10277766,0 99 | 0.0065179765,0 100 | 0.008565396,0 101 | 0.002647847,0 102 | 0.0058535635,0 103 | 0.005086243,0 104 | 0.0069758,0 105 | 0.003844738,0 106 | 0.0034255981,0 107 | 0.00554803,0 108 | 0.007671535,0 109 | 0.0027485788,0 110 | 0.012863338,0 111 | 0.005527675,0 112 | 0.0034887493,0 113 | 0.009845853,0 114 | 0.1265057,0 115 | 0.009393573,0 116 | 0.0039017797,0 117 | 0.0047289133,0 118 | 0.0675312,0 119 | 0.004695624,0 120 | 0.004965514,0 121 | 0.006005645,0 122 | 0.042886466,0 123 | 0.0039806664,0 124 | 0.006958872,0 125 | 0.013924658,0 126 | 0.008535624,0 127 | 0.9980439,1 128 | 0.8535718,1 129 | 0.9979708,1 130 | 0.9980897,1 131 | 0.9980695,1 132 | 0.998104,1 133 | 0.99784887,1 134 | 0.9980674,1 135 | 0.99795777,1 136 | 0.9980892,1 137 | 0.9937072,1 138 | 0.997969,1 139 | 0.9981109,1 140 | 0.9978768,1 141 | 0.9984284,1 142 | 0.9978981,1 143 | 0.9979452,1 
144 | 0.9978037,1 145 | 0.9978943,1 146 | 0.9980183,1 147 | 0.9978893,1 148 | 0.99810255,1 149 | 0.9979495,1 150 | 0.9978956,1 151 | 0.99816394,1 152 | 0.009078771,0 153 | 0.99790674,1 154 | 0.23121944,1 155 | 0.99835527,1 156 | 0.9986328,1 157 | 0.9980736,1 158 | 0.066970855,1 159 | 0.99806273,1 160 | 0.99808514,1 161 | 0.99809027,1 162 | 0.998016,1 163 | 0.99795926,1 164 | 0.94309354,1 165 | 0.99803764,1 166 | 0.99817526,1 167 | 0.9980602,1 168 | 0.9979955,1 169 | 0.998043,1 170 | 0.99812293,1 171 | 0.9979441,1 172 | 0.99813086,1 173 | 0.99797916,1 174 | 0.9981179,1 175 | 0.9980444,1 176 | 0.99816823,1 177 | 0.46536556,1 178 | 0.9990866,1 179 | 0.9979931,1 180 | 0.9981422,1 181 | 0.99817675,1 182 | 0.99784654,1 183 | 0.99816054,1 184 | 0.99805546,1 185 | 0.99859035,1 186 | 0.99789333,1 187 | 0.9982058,1 188 | 0.99541855,1 189 | 0.99812746,1 190 | 0.99816144,1 191 | 0.99841535,1 192 | 0.9983038,1 193 | 0.8951596,1 194 | 0.99804544,1 195 | 0.995895,1 196 | 0.99808174,1 197 | 0.99820507,1 198 | 0.99808925,1 199 | 0.9987207,1 200 | 0.99802995,1 201 | 0.9979378,1 202 | 0.9981011,1 203 | 0.99800956,1 204 | 0.9979639,1 205 | 0.98566014,1 206 | 0.99801314,1 207 | 0.99805367,1 208 | 0.9979315,1 209 | 0.93165886,1 210 | 0.9985891,1 211 | 0.9979688,1 212 | 0.99799764,1 213 | 0.9979563,1 214 | 0.9979487,1 215 | 0.9978605,1 216 | 0.9981915,1 217 | 0.9980314,1 218 | 0.9986634,1 219 | 0.9887543,1 220 | 0.59560245,1 221 | 0.9819951,1 222 | 0.99806356,1 223 | 0.99800265,1 224 | 0.99796116,1 225 | 0.9979758,1 226 | 0.9979739,1 227 | 0.99801266,1 228 | 0.9979751,1 229 | 0.99906445,1 230 | 0.9980028,1 231 | 0.98491085,1 232 | 0.998019,1 233 | 0.99794996,1 234 | 0.99809915,1 235 | 0.9980793,1 236 | 0.99806154,1 237 | 0.9982803,1 238 | 0.99800956,1 239 | 0.9980679,1 240 | 0.9980508,1 241 | 0.99855757,1 242 | 0.9989728,1 243 | 0.99809766,1 244 | 0.9980397,1 245 | 0.99811554,1 246 | 0.9901631,1 247 | 0.99783385,1 248 | 0.9990787,1 249 | 0.9979514,1 250 | 0.99803585,1 251 | 0.9979478,1 
252 | 0.019308805,0 253 | 0.022287726,0 254 | 0.998024,1 255 | 0.99896586,1 256 | 0.9981065,1 257 | 0.011517972,0 258 | 0.06102249,0 259 | 0.0071179867,0 260 | 0.003862381,0 261 | 0.003921777,0 262 | 0.006992817,0 263 | 0.0041930974,0 264 | 0.0030920208,0 265 | 0.005643368,0 266 | 0.004972607,0 267 | 0.005780548,0 268 | 0.011301577,0 269 | 0.006954789,0 270 | 0.07056278,0 271 | 0.005588144,0 272 | 0.0076907873,0 273 | 0.012147576,0 274 | 0.004659176,0 275 | 0.01324299,0 276 | 0.0048742592,0 277 | 0.010090351,0 278 | 0.010801733,0 279 | 0.034189492,0 280 | 0.007693976,0 281 | 0.0055065155,0 282 | 0.010482252,0 283 | 0.0074474514,0 284 | 0.0029467642,0 285 | 0.009468257,0 286 | 0.032076955,0 287 | 0.004322201,0 288 | 0.02005878,0 289 | 0.0055120885,0 290 | 0.14421368,0 291 | 0.0079933405,0 292 | 0.0057908893,0 293 | 0.11508325,0 294 | 0.0054630935,0 295 | 0.049064636,0 296 | 0.009953946,0 297 | 0.007694721,0 298 | 0.546702,0 299 | 0.99823153,0 300 | 0.003641814,0 301 | 0.24975127,0 302 | 0.0052604675,0 303 | 0.007744044,0 304 | 0.27073872,0 305 | 0.006627351,0 306 | 0.11037099,0 307 | 0.074622124,0 308 | 0.04468763,0 309 | 0.007243067,0 310 | 0.005800545,0 311 | 0.010251969,0 312 | 0.006365359,0 313 | 0.014940053,0 314 | 0.017106771,0 315 | 0.99889886,0 316 | 0.0035209954,0 317 | 0.0072847605,0 318 | 0.0064748526,0 319 | 0.10363394,0 320 | 0.9982368,0 321 | 0.01436609,0 322 | 0.029863149,0 323 | 0.01964736,0 324 | 0.0066364408,0 325 | 0.0035355687,0 326 | 0.0048186183,0 327 | 0.014069259,0 328 | 0.28035727,0 329 | 0.008062422,0 330 | 0.008963615,0 331 | 0.046138108,0 332 | 0.998833,1 333 | 0.9981712,1 334 | 0.99853206,1 335 | 0.9981438,1 336 | 0.99876547,1 337 | 0.9980243,1 338 | 0.99807906,1 339 | 0.9980426,1 340 | 0.99817383,1 341 | 0.9981146,1 342 | 0.99811757,1 343 | 0.9990877,1 344 | 0.4462088,1 345 | 0.9981307,1 346 | 0.962801,1 347 | 0.99891996,1 348 | 0.9980511,1 349 | 0.9986024,1 350 | 0.9989511,1 351 | 0.9987233,1 352 | 0.9980527,1 353 | 0.99867725,1 354 | 
0.027750373,1 355 | 0.99809945,1 356 | 0.9986335,1 357 | 0.46349296,1 358 | 0.013264775,1 359 | 0.9989748,1 360 | 0.99909246,1 361 | 0.998026,1 362 | 0.7965491,1 363 | 0.38025326,1 364 | 0.9981642,1 365 | 0.9981139,1 366 | 0.9982325,1 367 | 0.99816895,1 368 | 0.99814487,1 369 | 0.434722,1 370 | 0.9981166,1 371 | 0.99899274,1 372 | 0.99807227,1 373 | 0.99906385,1 374 | 0.9981829,1 375 | 0.9912313,1 376 | 0.99809587,1 377 | 0.9980085,1 378 | 0.99814767,1 379 | 0.9984629,1 380 | 0.9938804,1 381 | 0.99811554,1 382 | 0.99905455,1 383 | 0.99817777,1 384 | 0.9991045,1 385 | 0.5144662,1 386 | 0.99812686,1 387 | 0.8377235,1 388 | 0.9986697,1 389 | 0.998235,1 390 | 0.99908483,1 391 | 0.9982338,1 392 | 0.9990817,1 393 | 0.9982898,1 394 | 0.9981419,1 395 | 0.9981502,1 396 | 0.9983414,1 397 | 0.9982773,1 398 | 0.99826676,1 399 | 0.99875844,1 400 | 0.026038378,1 401 | 0.9851346,1 402 | 0.010273457,1 403 | 0.9981687,1 404 | 0.998108,1 405 | 0.99810505,1 406 | 0.9981212,1 407 | 0.059390634,0 408 | 0.9901328,1 409 | 0.9983573,1 410 | 0.9985495,1 411 | 0.9982264,1 412 | 0.99175453,1 413 | 0.99845123,1 414 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/toxic/xlm_r/final_toxic_toxic_xlm_r_result_combined_11852.csv: -------------------------------------------------------------------------------- 1 | tipe,total_data,foreign_mult,total_foreign_data,max_f1,max_recall,max_precision,max_accuracy 2 | A,11852,0.0,0,0.8685647045717231,0.8549931600547196,0.9044862518089725,0.8694001518602885 3 | B,11852,-1.0,0,0.5209952038369305,0.8577291381668947,0.5865294667913938,0.5854214123006833 4 | C,11852,0.5,5926,0.8695178335535005,0.920656634746922,0.859514687100894,0.8724373576309795 5 | C,11852,1.0,11852,0.8881014455643753,0.8878248974008208,0.9102384291725105,0.8891419893697798 6 | C,11852,1.5,17778,0.8828543174219474,0.8782489740082079,0.9093484419263456,0.8838268792710706 7 | 
C,11852,2.0,23704,0.8915156507413509,0.8741450068399452,0.9274310595065312,0.8921791951404707 8 | C,11852,3.0,35556,0.8984035009874802,0.8796169630642955,0.934593023255814,0.8990129081245254 9 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/toxic/xlm_r/plot-full-toxic-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_full/toxic/xlm_r/plot-full-toxic-xlmr.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/toxic/xlm_r_comparable/Result Toxic Comparable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from sklearn.metrics import classification_report\n", 12 | "from sklearn.metrics import f1_score, recall_score, precision_score\n", 13 | "from sklearn.metrics import accuracy_score\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "from matplotlib.pyplot import figure, xticks\n", 16 | "%matplotlib inline" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "(1317, 2)\n" 29 | ] 30 | }, 31 | { 32 | "data": { 33 | "text/html": [ 34 | "
\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | "
y_predy_true
00.9967801
10.0479710
20.3103690
30.0313040
40.7854271
\n", 84 | "
" 85 | ], 86 | "text/plain": [ 87 | " y_pred y_true\n", 88 | "0 0.996780 1\n", 89 | "1 0.047971 0\n", 90 | "2 0.310369 0\n", 91 | "3 0.031304 0\n", 92 | "4 0.785427 1" 93 | ] 94 | }, 95 | "execution_count": 2, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "data = pd.read_csv('result_Abusive_toxic_toxic_XLM_R_A_11852_0.5_full.csv')\n", 102 | "print(data.shape)\n", 103 | "data.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def evaluate(y_true, y_pred_proba, debug=False):\n", 113 | " max_threshold = -1\n", 114 | " max_f1 = 0\n", 115 | " max_recall = 0\n", 116 | " max_precision = 0\n", 117 | " mac_acc = 0\n", 118 | "\n", 119 | " for THRESHOLD in range(50, 51):\n", 120 | "# for THRESHOLD in range(0, 100):\n", 121 | " THRESHOLD = THRESHOLD/100\n", 122 | " y_pred_thr = [1 if x>=THRESHOLD else 0 for x in y_pred_proba]\n", 123 | " f1 = f1_score(y_true, y_pred_thr, average='macro')\n", 124 | " recall = recall_score(y_true, y_pred_thr)\n", 125 | " precision = precision_score(y_true, y_pred_thr)\n", 126 | " acc = accuracy_score(y_true, y_pred_thr)\n", 127 | " \n", 128 | " if debug:\n", 129 | " print(\"THRESHOLD: {:.3f} \\tF1: {:.8f} \\tRecall: {:.8f} \\tPrecision: {:.8f}\".format(THRESHOLD, \n", 130 | " f1, \n", 131 | " recall, \n", 132 | " precision))\n", 133 | "\n", 134 | " if acc>mac_acc:\n", 135 | " max_f1 = f1\n", 136 | " max_recall = recall\n", 137 | " max_precision = precision\n", 138 | " mac_acc = acc\n", 139 | " max_threshold = THRESHOLD\n", 140 | " \n", 141 | " print(\"##MAX## \\nTHRESHOLD: {:.3f} \\tF1: {:.8f} \\tRecall: {:.8f} \\tPrec: {:.8f} \\tAcc: {:.8f}\".format(max_threshold,\n", 142 | " max_f1, \n", 143 | " max_recall, \n", 144 | " max_precision,\n", 145 | " mac_acc))\n", 146 | " return max_f1, max_recall, max_precision, mac_acc" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | 
"execution_count": 4, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "##MAX## \n", 159 | "THRESHOLD: 0.500 \tF1: 0.93094791 \tRecall: 0.92215569 \tPrec: 0.90766208 \tAcc: 0.93470008\n" 160 | ] 161 | }, 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "(0.9309479100619422,\n", 166 | " 0.9221556886227545,\n", 167 | " 0.9076620825147348,\n", 168 | " 0.9347000759301443)" 169 | ] 170 | }, 171 | "execution_count": 4, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "evaluate(data.y_true, data.y_pred)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 5, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "(1317, 2)\n" 190 | ] 191 | }, 192 | { 193 | "data": { 194 | "text/html": [ 195 | "
\n", 196 | "\n", 209 | "\n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | "
y_predy_true
00.9854291
10.0564910
20.0750790
30.7341751
40.4754770
\n", 245 | "
" 246 | ], 247 | "text/plain": [ 248 | " y_pred y_true\n", 249 | "0 0.985429 1\n", 250 | "1 0.056491 0\n", 251 | "2 0.075079 0\n", 252 | "3 0.734175 1\n", 253 | "4 0.475477 0" 254 | ] 255 | }, 256 | "execution_count": 5, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "data = pd.read_csv('result_HS_toxic_toxic_XLM_R_A_11852_0.5_full.csv')\n", 263 | "print(data.shape)\n", 264 | "data.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 6, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "##MAX## \n", 277 | "THRESHOLD: 0.500 \tF1: 0.85331737 \tRecall: 0.87408759 \tPrec: 0.79833333 \tAcc: 0.85573273\n" 278 | ] 279 | }, 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "(0.8533173733006317,\n", 284 | " 0.8740875912408759,\n", 285 | " 0.7983333333333333,\n", 286 | " 0.8557327258921792)" 287 | ] 288 | }, 289 | "execution_count": 6, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "evaluate(data.y_true, data.y_pred)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 7, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "0.895216405" 307 | ] 308 | }, 309 | "execution_count": 7, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "(0.93470008 + 0.85573273)/2" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 
340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.7.6" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 4 347 | } 348 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/trip_advisor/xlm_r/final_trip_advisor_yelp_xlm_r_result_combined_12389.csv: -------------------------------------------------------------------------------- 1 | tipe,total_data,foreign_mult,total_foreign_data,max_f1,max_recall,max_precision,max_accuracy 2 | A,12389,0.0,0,0.35938710731456847,0.012444444444444444,0.42424242424242425,0.5347879785920132 3 | B,12389,-1.0,0,0.8936976700004071,0.9342222222222222,0.8510121457489879,0.8937834499794154 4 | C,12389,0.5,6194,0.3669406575781877,0.018666666666666668,0.6,0.5397282832441334 5 | C,12389,1.0,12389,0.6510512102877761,0.38133333333333336,0.8863636363636364,0.6908192671881432 6 | C,12389,1.5,18583,0.7368764827772605,0.5644444444444444,0.8455392809587217,0.7505146150679292 7 | C,12389,2.0,24778,0.6412098349978289,0.3582222222222222,0.909706546275395,0.6862906545903664 8 | C,12389,3.0,37167,0.6233246176026559,0.3893333333333333,0.7474402730375427,0.6562371346233018 9 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/trip_advisor/xlm_r/plot-full-trip-advisor-xlmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_full/trip_advisor/xlm_r/plot-full-trip-advisor-xlmr.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/trip_advisor/xlm_r_duplicate/final_trip_advisor_yelp_xlm_r_result_combined_9816.csv: -------------------------------------------------------------------------------- 1 | 
tipe,total_data,foreign_mult,total_foreign_data,max_f1,max_recall,max_precision,max_accuracy 2 | A,9816,0.0,0,0.638970426622639,0.36977777777777776,0.859504132231405,0.6801152737752162 3 | B,9816,-1.0,0,0.8868968475376315,0.9022222222222223,0.8608990670059372,0.8871963771099218 4 | C,9816,0.5,4908,0.744634995729023,0.5351111111111111,0.9190839694656489,0.7628653766982297 5 | C,9816,1.0,9816,0.7619530786292211,0.5928888888888889,0.8799472295514512,0.7739810621655002 6 | C,9816,1.5,14724,0.6795289626229877,0.43466666666666665,0.8763440860215054,0.7097571016879374 7 | C,9816,2.0,19632,0.8666238044928826,0.7697777777777778,0.9382448537378115,0.869905310827501 8 | C,9816,3.0,29448,0.8181090516243108,0.7191111111111111,0.8755411255411255,0.8225607245780157 9 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_full/trip_advisor/xlm_r_duplicate/plot-full-trip-advisor-xlmr-duplicate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_full/trip_advisor/xlm_r_duplicate/plot-full-trip-advisor-xlmr-duplicate.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/average-f1-score-gains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/average-f1-score-gains.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/plot-prosa-mbert-english.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/plot-prosa-mbert-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/plot-prosa-xlmr-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/plot-prosa-xlmr-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/plot-toxic-mbert-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/plot-toxic-mbert-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/plot-toxic-xlmr-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/plot-toxic-xlmr-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/plot-trip-mbert-english.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/plot-trip-mbert-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/compilation/plot-trip-xlmr-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/compilation/plot-trip-xlmr-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/plot_mbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/plot_mbert.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/prosa/mbert/gains.csv: -------------------------------------------------------------------------------- 1 | data,gain 2 | 500,0.1728943185149034 3 | 1000,0.11193071737149163 4 | 2500,0.08278291746194233 5 | 5000,0.04609309869696088 6 | 7500,0.036935865692835135 7 | 10000,0.020514832306416708 8 | -------------------------------------------------------------------------------- 
/notebooks/result_analysis/fine_tune_head/prosa/mbert/plot-prosa-mbert-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/prosa/mbert/plot-prosa-mbert-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/prosa/mbert/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/prosa/mbert/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/prosa/xlm_r/gains.csv: -------------------------------------------------------------------------------- 1 | data,gain 2 | 500,0.05184308856615921 3 | 1000,0.29280261365141225 4 | 2500,0.2020859080277152 5 | 5000,0.17402067620771755 6 | 7500,0.1855049480453861 7 | 10000,0.17312141927330504 8 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/prosa/xlm_r/plot-prosa-xlmr-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/prosa/xlm_r/plot-prosa-xlmr-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/prosa/xlm_r/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/prosa/xlm_r/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/toxic/mbert/gains.csv: -------------------------------------------------------------------------------- 1 | data,gain 2 | 500,0.16056351836372218 3 | 1000,0.17823350011509753 4 | 2500,0.0 5 | 5000,0.0 6 | 7500,0.0 7 | 10000,0.0 8 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/toxic/mbert/plot-toxic-mbert-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/toxic/mbert/plot-toxic-mbert-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/toxic/mbert/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/toxic/mbert/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/toxic/xlm_r/gains.csv: -------------------------------------------------------------------------------- 1 | data,gain 2 | 500,0.2781949074899959 3 | 1000,0.08658539622838057 4 | 2500,0.05071094881969074 5 | 5000,0.0199039776756742 6 | 7500,0.005295925026993165 7 | 10000,0.001343874484408314 8 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/toxic/xlm_r/plot-toxic-xlmr-english.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/toxic/xlm_r/plot-toxic-xlmr-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/toxic/xlm_r/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/toxic/xlm_r/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/trip_advisor/mbert/gains.csv: -------------------------------------------------------------------------------- 1 | data,gain 2 | 500,0.054725011315118754 3 | 1000,0.037481664179051466 4 | 2500,0.07089631895925208 5 | 5000,0.04259760791340961 6 | 7500,0.047192595689128924 7 | 10000,0.04003840467405828 8 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/trip_advisor/mbert/plot-trip-mbert-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/trip_advisor/mbert/plot-trip-mbert-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/trip_advisor/mbert/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/trip_advisor/mbert/plot.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/trip_advisor/xlm_r/gains.csv: -------------------------------------------------------------------------------- 1 | data,gain 2 | 500,0.19862625756036784 3 | 1000,0.11776557626073325 4 | 2500,0.10257103931293043 5 | 5000,0.09341452327867139 6 | 7500,0.06998790845136615 7 | 10000,0.05915904740582867 8 | -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/trip_advisor/xlm_r/plot-trip-xlmr-english.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/trip_advisor/xlm_r/plot-trip-xlmr-english.png -------------------------------------------------------------------------------- /notebooks/result_analysis/fine_tune_head/trip_advisor/xlm_r/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilhamfp/indonesian-text-classification-multilingual/46679e2235cb03b0d85901cc0b471d42b3c592f2/notebooks/result_analysis/fine_tune_head/trip_advisor/xlm_r/plot.png -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # Source Code 2 | The files in this directory was originally a Kaggle utility script. 
Here's the link: 3 | * [load-data.py](https://www.kaggle.com/ilhamfp31/load-data) 4 | * [extract-feature.py](https://www.kaggle.com/ilhamfp31/extract-feature) 5 | * [model-head.py](https://www.kaggle.com/ilhamfp31/model-head) 6 | * [model-full.py](https://www.kaggle.com/ilhamfp31/model-full) -------------------------------------------------------------------------------- /src/extract-feature.py: -------------------------------------------------------------------------------- 1 | # This source code is part of a final year undergraduate project 2 | # on exploring Indonesian hate speech/abusive & sentiment text 3 | # classification using a multilingual language model 4 | # 5 | # Checkout the full github repository: 6 | # https://github.com/ilhamfp/indonesian-text-classification-multilingual 7 | 8 | import torch 9 | from transformers import AutoModel, AutoTokenizer, BertTokenizer 10 | 11 | torch.set_grad_enabled(False) 12 | 13 | class FeatureExtractor(): 14 | 15 | def __init__(self, model_name='xlm-r'): 16 | self.model_name = model_name 17 | self.max_length = 512 18 | 19 | if self.model_name == 'xlm-r': 20 | xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large') 21 | xlmr.eval() 22 | self.model = xlmr 23 | 24 | elif self.model_name == 'mbert': 25 | MODEL_NAME = "bert-base-multilingual-cased" 26 | model = AutoModel.from_pretrained(MODEL_NAME) 27 | tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 28 | self.model = model 29 | self.tokenizer = tokenizer 30 | 31 | def extract_features(self, text): 32 | text = str(text) 33 | if self.model_name == 'xlm-r': 34 | tokens = self.model.encode(text) 35 | 36 | # Truncate 37 | if len(tokens) > self.max_length: 38 | tokens = torch.cat( (tokens[:511], torch.Tensor([2]).long()), 0 ) 39 | 40 | last_layer_features = self.model.extract_features(tokens) 41 | features = last_layer_features[:, 0, :].data.numpy() 42 | 43 | elif self.model_name == 'mbert': 44 | tokens_pt2 = self.tokenizer.encode_plus(text, 45 | return_tensors="pt", 46 | 
pad_to_max_length=True, 47 | max_length=self.max_length) 48 | 49 | outputs2, pooled2 = self.model(**tokens_pt2) 50 | features = pooled2.data.numpy() 51 | 52 | return features -------------------------------------------------------------------------------- /src/model-full.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import random 4 | import numpy as np 5 | import torch 6 | import tensorflow as tf 7 | from tensorflow.keras.layers import Dense, Input, Dropout 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.models import Model 10 | from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, EarlyStopping 11 | import transformers 12 | from transformers import TFAutoModel, AutoTokenizer 13 | from tqdm.notebook import tqdm 14 | from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, SentencePieceBPETokenizer 15 | from tensorflow.keras import backend as K 16 | 17 | def set_seed(seed=1): 18 | random.seed(seed) 19 | torch.manual_seed(seed) 20 | torch.cuda.manual_seed_all(seed) 21 | np.random.seed(seed) 22 | os.environ['PYTHONHASHSEED'] = str(seed) 23 | torch.backends.cudnn.deterministic = True 24 | 25 | def regular_encode(texts, tokenizer, maxlen=512): 26 | enc_di = tokenizer.batch_encode_plus( 27 | texts, 28 | return_attention_masks=False, 29 | return_token_type_ids=False, 30 | pad_to_max_length=True, 31 | max_length=maxlen 32 | ) 33 | 34 | return np.array(enc_di['input_ids']) 35 | 36 | def f1(y_true, y_pred): 37 | def recall(y_true, y_pred): 38 | """Recall metric. 39 | 40 | Only computes a batch-wise average of recall. 41 | 42 | Computes the recall, a metric for multi-label classification of 43 | how many relevant items are selected. 
44 | """ 45 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 46 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 47 | recall = true_positives / (possible_positives + K.epsilon()) 48 | return recall 49 | 50 | def precision(y_true, y_pred): 51 | """Precision metric. 52 | 53 | Only computes a batch-wise average of precision. 54 | 55 | Computes the precision, a metric for multi-label classification of 56 | how many selected items are relevant. 57 | """ 58 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 59 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 60 | precision = true_positives / (predicted_positives + K.epsilon()) 61 | return precision 62 | precision = precision(y_true, y_pred) 63 | recall = recall(y_true, y_pred) 64 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 65 | 66 | def build_model(transformer, learning_rate=1e-5, max_len=512): 67 | input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids") 68 | sequence_output = transformer(input_word_ids)[0] 69 | cls_token = sequence_output[:, 0, :] 70 | cls_token = Dropout(0.2)(cls_token) 71 | out = Dense(1, activation='sigmoid')(cls_token) 72 | 73 | model = Model(inputs=input_word_ids, outputs=out) 74 | model.compile(Adam(lr=learning_rate), loss='binary_crossentropy', metrics=[f1, 'accuracy']) 75 | 76 | return model 77 | 78 | def callback(): 79 | cb = [] 80 | 81 | reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', 82 | factor=0.5, patience=0, 83 | verbose=1, mode='min', 84 | epsilon=0.0001, min_lr=0, 85 | restore_best_weights=True) 86 | cb.append(reduceLROnPlat) 87 | 88 | log = CSVLogger('log.csv') 89 | cb.append(log) 90 | 91 | es = EarlyStopping(monitor='val_loss', patience=4, verbose=0, 92 | mode='min', restore_best_weights=True) 93 | 94 | cb.append(es) 95 | 96 | return cb -------------------------------------------------------------------------------- /src/model-head.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import random 4 | import numpy as np 5 | import torch 6 | from torch.utils.data.sampler import SubsetRandomSampler 7 | from torch.utils.data import TensorDataset, DataLoader 8 | 9 | train_on_gpu = torch.cuda.is_available() 10 | if not train_on_gpu: 11 | print('CUDA is not available. Training on CPU ...') 12 | else: 13 | print('CUDA is available! Training on GPU ...') 14 | 15 | def set_seed(seed=1): 16 | random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed_all(seed) 19 | np.random.seed(seed) 20 | os.environ['PYTHONHASHSEED'] = str(seed) 21 | torch.backends.cudnn.deterministic = True 22 | 23 | import torch.nn as nn 24 | import torch.nn.functional as F 25 | class Net(nn.Module): 26 | def __init__(self, input_dim=1024): 27 | super(Net, self).__init__() 28 | prob_dropout = 0.2 29 | output_size = 1 30 | 31 | self.dropout_1 = nn.Dropout(p=prob_dropout) 32 | self.out_proj = nn.Linear(input_dim, output_size) 33 | self.sig = nn.Sigmoid() 34 | 35 | def forward(self, x, debug=False): 36 | x = x.squeeze() 37 | 38 | if debug: 39 | print("Init ", x.shape) 40 | 41 | x = self.dropout_1(x) 42 | x = self.out_proj(x) 43 | 44 | if debug: 45 | print("out_proj ", x.shape) 46 | 47 | x = self.sig(x) 48 | 49 | if debug: 50 | print("sig ", x.shape) 51 | 52 | return x 53 | 54 | def train(train_loader, valid_loader, input_dim=1024, learning_rate=0.001, debug=False): 55 | model = Net(input_dim=input_dim) 56 | if train_on_gpu: 57 | model.cuda() 58 | 59 | lr = learning_rate 60 | criterion = nn.BCELoss() 61 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 62 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 63 | mode='min', 64 | factor=0.5, 65 | patience=5, 66 | verbose=debug) 67 | 68 | n_epochs = 30000 69 | early_stopping_patience = 12 70 | 71 | valid_loss_min = np.Inf 72 | train_loss_min = np.Inf 73 | train_acc_min = np.Inf 74 
| valid_acc_min = np.Inf 75 | last_best_epoch = 0 76 | 77 | len_train = len(train_loader.sampler) 78 | len_valid = len(valid_loader.sampler) 79 | 80 | for epoch in range(1, n_epochs+1): 81 | train_loss = 0.0 82 | valid_loss = 0.0 83 | 84 | ############### 85 | # train model # 86 | ############### 87 | model.train() 88 | train_correct = 0 89 | for data, target in train_loader: 90 | if train_on_gpu: 91 | data, target = data.cuda(), target.cuda() 92 | 93 | model.zero_grad() 94 | output = model(data) 95 | 96 | loss = criterion(output.squeeze(), target.float()) 97 | loss.backward() 98 | optimizer.step() 99 | train_loss += loss.item()*data.size(0) 100 | 101 | 102 | pred = torch.round(output.squeeze()) 103 | correct_tensor = pred.eq(target.float().view_as(pred)) 104 | correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy()) 105 | train_correct += np.sum(correct) 106 | 107 | ################## 108 | # validate model # 109 | ################## 110 | 111 | val_correct = 0 112 | for data, target in valid_loader: 113 | if train_on_gpu: 114 | data, target = data.cuda(), target.cuda() 115 | 116 | output = model(data) 117 | loss = criterion(output.squeeze(), target.float()) 118 | valid_loss += loss.item()*data.size(0) 119 | 120 | pred = torch.round(output.squeeze()).int() 121 | correct_tensor = pred.eq(target.int().view_as(pred)) 122 | correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy()) 123 | val_correct += np.sum(correct) 124 | 125 | 126 | train_loss = train_loss/len_train 127 | valid_loss = valid_loss/len_valid 128 | train_acc = train_correct/len_train 129 | val_acc = val_correct/len_valid 130 | 131 | if debug: 132 | print('Epoch: {} \tT-Loss: {:.6f} \tT-Acc: {:.6f} \tV-Loss: {:.6f} \tV-Acc: {:.6f}'.format( 133 | epoch, train_loss, train_acc, valid_loss, val_acc)) 134 | 135 | scheduler.step(valid_loss) 136 | 137 | # save model if validation loss has decreased 138 | 
if valid_loss <= valid_loss_min: 139 | if debug: 140 | print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format( 141 | valid_loss_min, 142 | valid_loss)) 143 | 144 | torch.save(model.state_dict(), 'model.pt') 145 | last_best_epoch = epoch 146 | valid_loss_min = valid_loss 147 | train_loss_min = train_loss 148 | train_acc_min = train_acc 149 | valid_acc_min = val_acc 150 | 151 | elif (epoch-last_best_epoch) > early_stopping_patience: 152 | print("EarlyStopping! Epoch {}".format(epoch)) 153 | print('Last: {} \tT-Loss: {:.6f} \tT-Acc: {:.6f} \tV-Loss: {:.6f} \tV-Acc: {:.6f}'.format( 154 | last_best_epoch, train_loss_min, train_acc_min, valid_loss_min, valid_acc_min)) 155 | 156 | break 157 | 158 | def test(test_loader, input_dim=1024): 159 | model = Net(input_dim=input_dim) 160 | criterion = nn.BCELoss() 161 | if train_on_gpu: 162 | model.cuda() 163 | 164 | model.load_state_dict(torch.load('model.pt')) 165 | 166 | test_loss = 0.0 167 | num_correct = 0 168 | y_true = np.array([]) 169 | y_pred = np.array([]) 170 | y_pred_proba = np.array([]) 171 | 172 | model.eval() 173 | for data, target in test_loader: 174 | if train_on_gpu: 175 | data, target = data.cuda(), target.cuda() 176 | 177 | output = model(data) 178 | loss = criterion(output.squeeze(), target.float()) 179 | test_loss += loss.item()*data.size(0) 180 | 181 | pred = torch.round(output.squeeze()).int() 182 | correct_tensor = pred.eq(target.int().view_as(pred)) 183 | correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy()) 184 | num_correct += np.sum(correct) 185 | 186 | y_true = np.concatenate([y_true, target.int().view_as(pred).detach().numpy()]) 187 | y_pred = np.concatenate([y_pred, pred.detach().numpy()]) 188 | y_pred_proba = np.concatenate([y_pred_proba, output.squeeze().detach().numpy()]) 189 | 190 | 191 | test_loss = test_loss/len(test_loader.sampler) 192 | print('Final test Loss: {:.6f}'.format(test_loss)) 193 | 194 | return 
y_true, y_pred_proba 195 | 196 | from sklearn.metrics import classification_report 197 | from sklearn.metrics import f1_score, recall_score, precision_score 198 | def evaluate(y_true, y_pred_proba, threshold=None, debug=False): 199 | max_threshold = -1 200 | max_f1 = 0 201 | max_recall = 0 202 | max_precision = 0 203 | if threshold==None: 204 | print("[Evaluate] No threshold argument. Finding best threshold.") 205 | threshold_list = [x/100 for x in range(0, 100)] 206 | else: 207 | print("[Evaluate] Threshold argument set. Using {} as threshold".format(threshold)) 208 | threshold_list = [threshold] 209 | 210 | for threshold_it in threshold_list: 211 | y_pred_thr = [1 if x>=threshold_it else 0 for x in y_pred_proba] 212 | f1 = f1_score(y_true, y_pred_thr, average='macro') 213 | recall = recall_score(y_true, y_pred_thr) 214 | precision = precision_score(y_true, y_pred_thr) 215 | 216 | if debug: 217 | print("[Evaluate] THRESHOLD: {:.3f} \tF1: {:.8f} \tRecall: {:.8f} \tPrecision: {:.8f}".format(threshold_it, 218 | f1, 219 | recall, 220 | precision)) 221 | 222 | if f1>max_f1: 223 | max_f1 = f1 224 | max_recall = recall 225 | max_precision = precision 226 | max_threshold = threshold_it 227 | 228 | print("[Evaluate] ##MAX## \nTHRESHOLD: {:.3f} \tF1: {:.8f} \tRecall: {:.8f} \tPrec: {:.8f}".format(max_threshold, 229 | max_f1, 230 | max_recall, 231 | max_precision)) 232 | return max_f1, max_recall, max_precision, max_threshold 233 | 234 | if __name__ == "main": 235 | print('ok') --------------------------------------------------------------------------------